diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java
index ad0159b3..4906f824 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java
@@ -10,23 +10,63 @@ import java.util.regex.Pattern;
 
 import org.json.JSONArray;
 import org.json.JSONObject;
-import org.jsoup.Connection.Response;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;
 
-import com.rarchives.ripme.ripper.AlbumRipper;
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import com.rarchives.ripme.utils.Http;
 
-public class PhotobucketRipper extends AlbumRipper {
+public class PhotobucketRipper extends AbstractHTMLRipper {
 
     private static final String DOMAIN = "photobucket.com",
                                 HOST = "photobucket";
+    private static final int ITEMS_PER_PAGE = 24;
+    private static final int WAIT_BEFORE_NEXT_PAGE = 2000;
 
-    private Response pageResponse = null;
+    private final class AlbumMetadata {
+        private final String url;
+        private final String location;
+        private final int sortOrder;
+        private int currPage = 1;
+        private int numPages;
+
+        private AlbumMetadata(JSONObject data) {
+            this.url = data.getString("url");
+            this.location = data.getString("location")
+                                .replace(" ", "_");
+            this.sortOrder = data.getInt("sortOrder");
+        }
+
+        private String getCurrPageURL() {
+            return url + String.format("?sort=%d&page=%d",
+                    sortOrder, currPage);
+        }
+    }
+
+    private final Pattern collDataPattern;
+    private final Pattern pbURLPattern;
+
+    // all albums including sub-albums to rip
+    private List<AlbumMetadata> albums;
+    // the album currently being ripped
+    private AlbumMetadata currAlbum;
+    // a new index per album downloaded
+    private int index = 0;
 
     public PhotobucketRipper(URL url) throws IOException {
         super(url);
+        this.collDataPattern = Pattern.compile(
+                "^.*collectionData: (\\{.*}).*$", Pattern.DOTALL
+        );
+        this.pbURLPattern = Pattern.compile(
+                "^https?://([a-zA-Z0-9]+)\\.photobucket\\.com/user/"
+                + "([a-zA-Z0-9_\\-]+)/library/([^?]*).*$"
+        );
+    }
+
+    @Override
+    protected String getDomain() {
+        return DOMAIN;
     }
 
     @Override
@@ -34,45 +74,35 @@ public class PhotobucketRipper extends AlbumRipper {
         return HOST;
     }
 
+    @Override
     public URL sanitizeURL(URL url) throws MalformedURLException {
         LOGGER.info(url);
         String u = url.toExternalForm();
         if (u.contains("?")) {
+            // strip options from URL
             u = u.substring(0, u.indexOf("?"));
-            return new URL(u);
         }
-        else {
-            return url;
+        if (!u.endsWith("/")) {
+            // append trailing slash
+            u = u + "/";
         }
-    }
-
-    public String getAlbumTitle(URL url) throws MalformedURLException {
-        try {
-            // Attempt to use album title as GID
-            if (pageResponse == null) {
-                pageResponse = Http.url(url).response();
-            }
-            Document albumDoc = pageResponse.parse();
-            Elements els = albumDoc.select("div.libraryTitle > h1");
-            if (els.isEmpty()) {
-                throw new IOException("Could not find libraryTitle at " + url);
-            }
-            return els.get(0).text();
-        } catch (IOException e) {
-            // Fall back to default album naming convention
-        }
-        return super.getAlbumTitle(url);
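+        // e.g. "http://s732.photobucket.com/user/doublesix66/library/WARZONE?sort=3&page=1"
+        // becomes "http://s732.photobucket.com/user/doublesix66/library/WARZONE/"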
Pattern.compile("^https?://[a-zA-Z0-9]+\\.photobucket\\.com/user/([a-zA-Z0-9_\\-]+)/library.*$"); - m = p.matcher(url.toExternalForm()); + m = pbURLPattern.matcher(sanitized.toExternalForm()); if (m.matches()) { - return m.group(1); + // the username is not really a unique GID, because the same user + // can have multiple albums, but on the other hand, using HOST_GID + // as save directory means we can group ripped albums of the same + // user. + return m.group(2); } throw new MalformedURLException( @@ -81,134 +111,177 @@ public class PhotobucketRipper extends AlbumRipper { + " Got: " + url); } + + + // Page iteration + + + @Override - public void rip() throws IOException { - List subalbums = ripAlbumAndGetSubalbums(this.url.toExternalForm()); - - List subsToRip = new ArrayList<>(), - rippedSubs = new ArrayList<>(); - - for (String sub : subalbums) { - subsToRip.add(sub); + protected Document getFirstPage() throws IOException { + if (this.currAlbum == null) { + this.albums = getAlbumMetadata(this.url.toExternalForm()); + LOGGER.info("Detected " + albums.size() + " albums in total"); } - - while (!subsToRip.isEmpty() && !isStopped()) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - break; - } - String nextSub = subsToRip.remove(0); - rippedSubs.add(nextSub); - LOGGER.info("Attempting to rip next subalbum: " + nextSub); - try { - pageResponse = null; - subalbums = ripAlbumAndGetSubalbums(nextSub); - } catch (IOException e) { - LOGGER.error("Error while ripping " + nextSub, e); - break; - } - for (String subalbum : subalbums) { - if (!subsToRip.contains(subalbum) && !rippedSubs.contains(subalbum)) { - subsToRip.add(subalbum); - } - } - } - waitForThreads(); + this.currAlbum = this.albums.remove(0); + // NOTE: Why not just get media count in the metadata json? + // + // Because that data might not reflect what the user sees on the page + // and can lead to iterating more pages than there actually are. 
+        this.currAlbum.numPages = (int) Math.ceil(
+                (double) totalNumItems / (double) ITEMS_PER_PAGE);
+        this.index = 0;
+        return page;
     }
 
-    private List<String> ripAlbumAndGetSubalbums(String theUrl) throws IOException {
-        int filesIndex = 0,
-            filesTotal = 0,
-            pageIndex = 0;
-        String currentAlbumPath = null,
-               url = null;
-
-        while (pageIndex == 0 || filesIndex < filesTotal) {
-            if (isStopped()) {
-                break;
-            }
-            pageIndex++;
-            if (pageIndex > 1 || pageResponse == null) {
-                url = theUrl + String.format("?sort=3&page=%d", pageIndex);
-                LOGGER.info("    Retrieving " + url);
-                pageResponse = Http.url(url).response();
-            }
-            Document albumDoc = pageResponse.parse();
-            // Retrieve JSON from request
-            String jsonString = null;
-            for (Element script : albumDoc.select("script[type=text/javascript]")) {
-                String data = script.data();
-                // Ensure this chunk of javascript contains the album info
-                if (!data.contains("libraryAlbumsPageCollectionData")) {
-                    continue;
-                }
-                // Grab the JSON
-                Pattern p; Matcher m;
-                p = Pattern.compile("^.*collectionData: (\\{.*}).*$", Pattern.DOTALL);
-                m = p.matcher(data);
-                if (m.matches()) {
-                    jsonString = m.group(1);
-                    break;
-                }
-            }
-            if (jsonString == null) {
-                LOGGER.error("Unable to find JSON data at URL: " + url);
-                break;
-            }
-            JSONObject json = new JSONObject(jsonString);
-            JSONObject items = json.getJSONObject("items");
-            JSONArray objects = items.getJSONArray("objects");
-            filesTotal = items.getInt("total");
-            currentAlbumPath = json.getString("currentAlbumPath");
-            for (int i = 0; i < objects.length(); i++) {
-                JSONObject object = objects.getJSONObject(i);
-                String image = object.getString("fullsizeUrl");
-                filesIndex += 1;
-                addURLToDownload(new URL(image),
-                        "",
-                        object.getString("location").replaceAll(" ", "_"),
-                        albumDoc.location(),
-                        pageResponse.cookies());
             }
+
+    @Override
+    public Document getNextPage(Document page) throws IOException {
+        currAlbum.currPage++;
+        boolean endOfAlbum = currAlbum.currPage > currAlbum.numPages;
+        boolean noMoreSubalbums = albums.isEmpty();
+        if (endOfAlbum && noMoreSubalbums) {
+            throw new IOException("No more pages");
         }
-        // Get subalbums
-        if (url != null) {
-            return getSubAlbums(url, currentAlbumPath);
-        } else {
-            return new ArrayList<>();
-        }
-    }
-
-    private List<String> getSubAlbums(String url, String currentAlbumPath) {
-        List<String> result = new ArrayList<>();
-        String subdomain = url.substring(url.indexOf("://")+3);
-        subdomain = subdomain.substring(0, subdomain.indexOf("."));
-        String apiUrl = "http://" + subdomain + ".photobucket.com/component/Albums-SubalbumList"
-                + "?deferCollapsed=true"
-                + "&albumPath=" + currentAlbumPath // %2Falbums%2Fab10%2FSpazzySpizzy"
-                + "&json=1";
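+        // wait a bit before fetching the next page, so we don't hammer
+        // the server with requests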
", "%20"); - result.add(suburl); - } - } catch (IOException e) { - LOGGER.error("Failed to get subalbums from " + apiUrl, e); + Thread.sleep(WAIT_BEFORE_NEXT_PAGE); + } catch (InterruptedException e) { + LOGGER.info("Interrupted while waiting before getting next page"); + } + if (endOfAlbum){ + LOGGER.info("Turning to next album " + albums.get(0).url); + return getFirstPage(); + } else { + LOGGER.info("Turning to page " + currAlbum.currPage + + " of album " + currAlbum.url); + return Http.url(currAlbum.getCurrPageURL()).get(); } - return result; } - public boolean canRip(URL url) { - return url.getHost().endsWith(DOMAIN); + + + // Media parsing + + + + @Override + protected List getURLsFromPage(Document page) { + JSONObject collectionData = getCollectionData(page); + if (collectionData == null) { + LOGGER.error("Unable to find JSON data at URL: " + page.location()); + return null; + } else { + return getImageURLs(collectionData); + } } + private JSONObject getCollectionData(Document page){ + // Retrieve JSON from a script tag in the returned document + for (Element script : page.select("script[type=text/javascript]")) { + String data = script.data(); + // Ensure this chunk of javascript contains the album info + if (data.contains("libraryAlbumsPageCollectionData")) { + Matcher m = collDataPattern.matcher(data); + if (m.matches()) { + // Grab the JSON + return new JSONObject(m.group(1)); + } + } + } + return null; + } + + private List getImageURLs(JSONObject json){ + List results = new ArrayList<>(); + JSONObject items = json.getJSONObject("items"); + JSONArray objects = items.getJSONArray("objects"); + for (int i = 0; i < objects.length(); i++) { + JSONObject object = objects.getJSONObject(i); + String imgURL = object.getString("fullsizeUrl"); + results.add(imgURL); + } + return results; + } + + @Override + protected void downloadURL(URL url, int index) { + addURLToDownload(url, getPrefix(++this.index), currAlbum.location); + } + + + + // helper methods (for album metadata retrieval) + + + + private List getAlbumMetadata(String albumURL) + throws IOException { + JSONObject data = getAlbumMetadataJSON(albumURL); + List metadata = new ArrayList<>(); + metadata.add(new AlbumMetadata(data)); + if (!data.getString("location").equals("")) { + // if the location were to equal "", then we are at the profile + // page of a user. Ripping all sub-albums here would mean ripping + // all albums of a user (Not supported, only rip items in a users + // personal bucket). 
+        addURLToDownload(url, getPrefix(++this.index), currAlbum.location);
+    }
+
+
+
+    // Helper methods (for album metadata retrieval)
+
+
+
+    private List<AlbumMetadata> getAlbumMetadata(String albumURL)
+            throws IOException {
+        JSONObject data = getAlbumMetadataJSON(albumURL);
+        List<AlbumMetadata> metadata = new ArrayList<>();
+        metadata.add(new AlbumMetadata(data));
+        if (!data.getString("location").equals("")) {
+            // if location equals "", we are at a user's profile page.
+            // Ripping all sub-albums there would mean ripping all of the
+            // user's albums (not supported; we only rip items in the
+            // user's personal bucket).
+            for (JSONObject sub : getSubAlbumJSONs(data)) {
+                metadata.add(new AlbumMetadata(sub));
+            }
+        }
+        LOGGER.info("Successfully retrieved and parsed metadata");
+        return metadata;
+    }
+
+    private JSONObject getAlbumMetadataJSON(String albumURL)
+            throws IOException {
+        String subdomain, user, albumTitle;
+        Matcher m = pbURLPattern.matcher(albumURL);
+        if (!m.matches()) {
+            throw new MalformedURLException("invalid URL " + albumURL);
+        }
+        subdomain = m.group(1);
+        user = m.group(2);
+        albumTitle = m.group(3);
+        if (albumTitle.endsWith("/")) {
+            albumTitle = albumTitle.substring(0, albumTitle.length() - 1);
+        }
+        String apiURL = String.format("http://%s.photobucket.com/api/user/"
+                        + "%s/album/%s/get?subAlbums=%d&json=1",
+                subdomain, user, albumTitle, ITEMS_PER_PAGE);
+        LOGGER.info("Loading " + apiURL);
+        JSONObject data = Http.url(apiURL).getJSON().getJSONObject("data");
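+        // the request above returns at most ITEMS_PER_PAGE sub-albums;
+        // if the album has more, repeat the request asking for all of them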
+        if (data.has("subAlbums")) {
+            int count = data.getInt("subAlbumCount");
+            if (count > ITEMS_PER_PAGE) {
+                apiURL = String.format("http://%s.photobucket.com/api/user/"
+                                + "%s/album/%s/get?subAlbums=%d&json=1",
+                        subdomain, user, albumTitle, count);
+                data = Http.url(apiURL).getJSON().getJSONObject("data");
+            }
+        }
+        return data;
+    }
+
+    private List<JSONObject> getSubAlbumJSONs(JSONObject data) {
+        List<JSONObject> subalbumJSONs = new ArrayList<>();
+        if (data.has("subAlbums")) {
+            JSONArray subalbums = data.getJSONArray("subAlbums");
+            for (int idx = 0; idx < subalbums.length(); idx++) {
+                JSONObject subalbumJSON = subalbums.getJSONObject(idx);
+                subalbumJSONs.add(subalbumJSON);
+            }
+        }
+        return subalbumJSONs;
+    }
+
+    // TODO: Probably want to add queue support for cases like this:
+    // http://s732.photobucket.com/user/doublesix66/library/WARZONE?sort=3&page=1
 }
\ No newline at end of file
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java
index dff101a0..30885eaa 100644
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java
@@ -15,7 +15,51 @@ public class PhotobucketRipperTest extends RippersTest {
         deleteDir(ripper.getWorkingDir());
     }
     */
-}
-
+    /*
+    // new test, still commented out because of the issue above,
+    // since this test also involves network IO.
+    public void testGetNextPage() throws IOException {
+        // this album should have more than enough sub-albums and pages
+        // to serve as a pretty good iteration test (barring server or
+        // network errors)
+        String baseURL = "http://s1255.photobucket.com/user/mimajki/library/Movie%20gifs?sort=6&page=1";
+        URL url = new URL(baseURL);
+        PhotobucketRipper ripper = new PhotobucketRipper(url);
+        org.jsoup.nodes.Document page = null;
+        try {
+            // I'm not sure it makes much sense that getFirstPage()
+            // is not public while getNextPage() is.
+            java.lang.reflect.Method method = ripper.getClass()
+                    .getDeclaredMethod("getFirstPage");
+            method.setAccessible(true);
+            page = (org.jsoup.nodes.Document) method.invoke(ripper);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail("Calling getFirstPage() failed");
+        }
+        int numPagesRemaining = 38;
+        for (int idx = 0; idx < numPagesRemaining; idx++) {
+            page = ripper.getNextPage(page);
+            System.out.println("URL: " + page.location());
+        }
+        try {
+            page = ripper.getNextPage(page);
+            fail("getNextPage did not throw an exception on the last page");
+        } catch (IOException e) {
+            assertEquals("No more pages", e.getMessage());
+        }
+    }*/
+
+    public void testGetGID() throws IOException {
+        URL url = new URL("http://s732.photobucket.com/user/doublesix66/library/Army%20Painter%20examples?sort=3&page=1");
+        PhotobucketRipper ripper = new PhotobucketRipper(url);
+        assertEquals("doublesix66", ripper.getGID(url));
+        url = new URL("http://s732.photobucket.com/user/doublesix66/library/Army%20Painter%20examples/Painting%20examples?page=1&sort=3");
+        assertEquals("doublesix66", ripper.getGID(url));
+        url = new URL("http://s844.photobucket.com/user/SpazzySpizzy/library/Album%20Covers");
+        assertEquals("SpazzySpizzy", ripper.getGID(url));
+        url = new URL("http://s844.photobucket.com/user/SpazzySpizzy/library");
+        assertEquals("SpazzySpizzy", ripper.getGID(url));
+    }
+}
\ No newline at end of file