diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/NewsfilterRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/NewsfilterRipper.java
index 6454c508..eee733db 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/NewsfilterRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/NewsfilterRipper.java
@@ -3,9 +3,13 @@ package com.rarchives.ripme.ripper.rippers;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.utils.Http;
 import org.jsoup.Connection;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -14,21 +18,15 @@ import org.jsoup.select.Elements;

 import com.rarchives.ripme.ripper.AlbumRipper;

-public class NewsfilterRipper extends AlbumRipper {
+public class NewsfilterRipper extends AbstractHTMLRipper {
+
+    private static final String HOST = "newsfilter";
+    private static final String DOMAIN = "newsfilter.org";

     public NewsfilterRipper(URL url) throws IOException {
         super(url);
     }

-    @Override
-    public boolean canRip(URL url) {
-        //http://newsfilter.org/gallery/he-doubted-she-would-fuck-on-cam-happy-to-be-proven-wrong-216799
-        Pattern p = Pattern.compile("^https?://([wm]+\\.)?newsfilter\\.org/gallery/.+$");
-        Matcher m = p.matcher(url.toExternalForm());
-        return m.matches();
-    }
-
     @Override
     public URL sanitizeURL(URL url) throws MalformedURLException {
         String u = url.toExternalForm();
@@ -40,27 +38,15 @@ public class NewsfilterRipper extends AlbumRipper {
     }

     @Override
-    public void rip() throws IOException {
-        String gid = getGID(this.url);
-        String theurl = "http://newsfilter.org/gallery/" + gid;
-        LOGGER.info("Loading " + theurl);
-
-        Connection.Response resp = Jsoup.connect(theurl)
-                .timeout(5000)
-                .referrer("")
-                .userAgent(USER_AGENT)
-                .method(Connection.Method.GET)
-                .execute();
-        Document doc = resp.parse();
-
-        Elements thumbnails = doc.select("#galleryImages .inner-block img");
-        for (Element thumb : thumbnails) {
-            String thumbUrl = thumb.attr("src");
-            String picUrl = thumbUrl.replace("thumbs/", "");
-            addURLToDownload(new URL(picUrl));
+    public String getGID(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://([wm]+\\.)?newsfilter\\.org/gallery/([^/]+)$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return m.group(2);
         }
-
-        waitForThreads();
+        throw new MalformedURLException(
+                "Expected newsfilter gallery format: http://newsfilter.org/gallery/galleryid"
+                        + " Got: " + url);
     }

     @Override
@@ -69,14 +55,30 @@ public class NewsfilterRipper extends AlbumRipper {
     }

     @Override
-    public String getGID(URL url) throws MalformedURLException {
-        Pattern p = Pattern.compile("^https?://([wm]+\\.)?newsfilter\\.org/gallery/([^/]+)$");
-        Matcher m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(2);
+    protected String getDomain() {
+        return DOMAIN;
+    }
+
+    @Override
+    protected Document getFirstPage() throws IOException {
+        return Http.url(url).get();
+    }
+
+    @Override
+    protected List<String> getURLsFromPage(Document page) {
+        List<String> imgURLs = new ArrayList<>();
+        Elements thumbnails = page.select("#galleryImages .inner-block img");
+        for (Element thumb : thumbnails) {
+            String thumbUrl = thumb.attr("src");
+            String picUrl = thumbUrl.replace("thumbs/", "");
+            // use HTTP instead of HTTPS (less headaches)
+            imgURLs.add(picUrl.replaceFirst("https://", "http://"));
         }
-        throw new MalformedURLException(
-                "Expected newsfilter gallery format: http://newsfilter.org/gallery/galleryid"
-                        + " Got: " + url);
+        return imgURLs;
+    }
+
+    @Override
+    protected void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index));
+    }
 }
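Reviewer note: every ripper in this patch moves from a hand-rolled rip() loop to the template methods of AbstractHTMLRipper. A minimal sketch of that contract for reference — ExampleRipper, example.com, and the img selector are placeholders (not part of this PR), and a real ripper may need additional overrides beyond what is shown:

```java
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;

// Hypothetical ripper illustrating the callbacks used throughout this PR:
// the base class fetches getFirstPage(), collects URLs per page via
// getURLsFromPage(), keeps calling getNextPage() until it throws, and
// hands each collected URL to downloadURL() with a running index.
public class ExampleRipper extends AbstractHTMLRipper {

    public ExampleRipper(URL url) throws IOException {
        super(url);
    }

    @Override
    public String getHost() {
        return "example";
    }

    @Override
    protected String getDomain() {
        return "example.com";
    }

    @Override
    public String getGID(URL url) throws MalformedURLException {
        return url.getPath().replaceAll("/", "_"); // placeholder GID scheme
    }

    @Override
    protected Document getFirstPage() throws IOException {
        return Http.url(url).get(); // fetch the album's first page
    }

    @Override
    protected List<String> getURLsFromPage(Document page) {
        List<String> urls = new ArrayList<>();
        for (Element img : page.select("img")) { // placeholder selector
            urls.add(img.attr("src"));
        }
        return urls;
    }

    @Override
    protected void downloadURL(URL url, int index) {
        // getPrefix(index) honors the download.save_order config
        addURLToDownload(url, getPrefix(index));
    }
}
```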
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java
index 3585b6bb..b525a39a 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java
@@ -8,6 +8,7 @@ import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
@@ -18,13 +19,22 @@ import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
 import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;

-public class NfsfwRipper extends AlbumRipper {
+public class NfsfwRipper extends AbstractHTMLRipper {

     private static final String DOMAIN = "nfsfw.com",
                                 HOST   = "nfsfw";

-    private Document albumDoc = null;
+    private int index = 0;
+    private String currentDir = "";
+    private List<String> subalbumURLs = new ArrayList<>();
+    private Pattern subalbumURLPattern = Pattern.compile(
+            "https?://[wm.]*nfsfw.com/gallery/v/[^/]+/(.+)$"
+    );
+
+    // cached first page
+    private Document fstPage;
+
+    // thread pool for downloading images from image pages
     private DownloadThreadPool nfsfwThreadPool;

     public NfsfwRipper(URL url) throws IOException {
@@ -32,39 +42,104 @@ public class NfsfwRipper extends AlbumRipper {
         nfsfwThreadPool = new DownloadThreadPool("NFSFW");
     }

+    @Override
+    protected String getDomain() {
+        return DOMAIN;
+    }
+
     @Override
     public String getHost() {
         return HOST;
     }

     @Override
-    public URL sanitizeURL(URL url) throws MalformedURLException {
-        return url;
+    protected Document getFirstPage() throws IOException {
+        // cache the first page
+        this.fstPage = Http.url(url).get();
+        return fstPage;
     }

     @Override
-    public String getAlbumTitle(URL url) throws MalformedURLException {
-        try {
-            // Attempt to use album title as GID
-            if (albumDoc == null) {
-                albumDoc = Http.url(url).get();
+    public Document getNextPage(Document page) throws IOException {
+        String nextURL = null;
+        Elements a = page.select("a.next");
+        if (!a.isEmpty()) {
+            // Get next page of current album
+            nextURL = "http://nfsfw.com" + a.first().attr("href");
+        } else if (!subalbumURLs.isEmpty()) {
+            // Get next sub-album
+            nextURL = subalbumURLs.remove(0);
+            LOGGER.info("Detected subalbum URL at: " + nextURL);
+            Matcher m = subalbumURLPattern.matcher(nextURL);
+            if (m.matches()) {
+                // Set the new save directory and save images with a new index
+                this.currentDir = m.group(1);
+                this.index = 0;
+            } else {
+                LOGGER.error("Invalid sub-album URL: " + nextURL);
+                nextURL = null;
             }
-            String title = albumDoc.select("h2").first().text().trim();
-            return "nfsfw_" + Utils.filesystemSafe(title);
-        } catch (Exception e) {
-            // Fall back to default album naming convention
         }
-        return super.getAlbumTitle(url);
+        // Wait
+        try {
+            Thread.sleep(2000);
+        } catch (InterruptedException e) {
+            LOGGER.error("Interrupted while waiting to load next page", e);
+        }
+        if (nextURL == null) {
+            throw new IOException("No more pages");
+        } else {
+            return Http.url(nextURL).get();
+        }
+    }
+
+    @Override
+    protected List<String> getURLsFromPage(Document page) {
+        List<String> imagePageURLs = getImagePageURLs(page);
+
+        // Check if any sub-albums are present on this page
+        List<String> subalbumURLs = getSubalbumURLs(page);
+        this.subalbumURLs.addAll(subalbumURLs);
+
+        return imagePageURLs;
+    }
+
+    @Override
+    protected void downloadURL(URL url, int index) {
+        // if we are now downloading a sub-album, all images in it
+        // should be indexed starting from 0
+        if (!this.currentDir.equals("")) {
+            index = ++this.index;
+        }
+        NfsfwImageThread t = new NfsfwImageThread(url, currentDir, index);
+        nfsfwThreadPool.addThread(t);
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        // always start on the first page of an album
+        // (strip the options after the '?')
+        String u = url.toExternalForm();
+        if (u.contains("?")) {
+            u = u.substring(0, u.indexOf("?"));
+            return new URL(u);
+        } else {
+            return url;
+        }
     }

     @Override
     public String getGID(URL url) throws MalformedURLException {
         Pattern p;
         Matcher m;

-        p = Pattern.compile("https?://[wm.]*nfsfw.com/gallery/v/([a-zA-Z0-9\\-_]+).*");
+        p = Pattern.compile("https?://[wm.]*nfsfw.com/gallery/v/(.*)$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
-            return m.group(1);
+            String group = m.group(1);
+            if (group.endsWith("/")) {
+                group = group.substring(0, group.length() - 1);
+            }
+            return group.replaceAll("/", "__");
         }

         throw new MalformedURLException(
@@ -74,75 +149,51 @@ public class NfsfwRipper extends AlbumRipper {
     }

     @Override
-    public void rip() throws IOException {
-        List<Pair> subAlbums = new ArrayList<>();
-        int index = 0;
-        subAlbums.add(new Pair(this.url.toExternalForm(), ""));
-        while (!subAlbums.isEmpty()) {
-            if (isStopped()) {
-                break;
-            }
-            Pair nextAlbum = subAlbums.remove(0);
-            String nextURL = nextAlbum.first;
-            String nextSubalbum = nextAlbum.second;
-            sendUpdate(STATUS.LOADING_RESOURCE, nextURL);
-            LOGGER.info("    Retrieving " + nextURL);
-            if (albumDoc == null) {
-                albumDoc = Http.url(nextURL).get();
-            }
-            // Subalbums
-            for (Element suba : albumDoc.select("td.IMG > a")) {
-                if (isStopped() || isThisATest()) {
-                    break;
-                }
-                String subURL = "http://nfsfw.com" + suba.attr("href");
-                String subdir = subURL;
-                while (subdir.endsWith("/")) {
-                    subdir = subdir.substring(0, subdir.length() - 1);
-                }
-                subdir = subdir.substring(subdir.lastIndexOf("/") + 1);
-                subAlbums.add(new Pair(subURL, subdir));
-            }
-            // Images
-            for (Element thumb : albumDoc.select("td.giItemCell > div > a")) {
-                if (isStopped()) {
-                    break;
-                }
-                String imagePage = "http://nfsfw.com" + thumb.attr("href");
-                try {
-                    NfsfwImageThread t = new NfsfwImageThread(new URL(imagePage), nextSubalbum, ++index);
-                    nfsfwThreadPool.addThread(t);
-                    if (isThisATest()) {
-                        break;
-                    }
-                } catch (MalformedURLException mue) {
-                    LOGGER.warn("Invalid URL: " + imagePage);
-                }
-            }
-            if (isThisATest()) {
-                break;
-            }
-            // Get next page
-            for (Element a : albumDoc.select("a.next")) {
-                // Insert next page at the top
-                subAlbums.add(0, new Pair("http://nfsfw.com" + a.attr("href"), ""));
-                break;
-            }
-            albumDoc = null;
-            // Wait
-            try {
-                Thread.sleep(1000);
-            } catch (InterruptedException e) {
-                LOGGER.error("Interrupted while waiting to load next page", e);
-                throw new IOException(e);
-            }
-        }
-        nfsfwThreadPool.waitForThreads();
-        waitForThreads();
+    public DownloadThreadPool getThreadPool() {
+        return nfsfwThreadPool;
     }

-    public boolean canRip(URL url) {
-        return url.getHost().endsWith(DOMAIN);
+    @Override
+    public boolean hasQueueSupport() {
+        return true;
+    }
+
+    @Override
+    public boolean pageContainsAlbums(URL url) {
+        List<String> imageURLs = getImagePageURLs(fstPage);
+        List<String> subalbumURLs = getSubalbumURLs(fstPage);
+        return imageURLs.isEmpty() && !subalbumURLs.isEmpty();
+    }
+
+    @Override
+    public List<String> getAlbumsToQueue(Document doc) {
+        return getSubalbumURLs(doc);
+    }
+
+    // helper methods
+
+    private List<String> getImagePageURLs(Document page) {
+        // get image pages
+        // NOTE: It might be possible to get the (non-thumbnail) image URL
+        // without going to its page first, as there seems to be a pattern
+        // between the thumb and actual image URLs, but that is outside the
+        // scope of the current issue being solved.
+        List<String> imagePageURLs = new ArrayList<>();
+        for (Element thumb : page.select("td.giItemCell > div > a")) {
+            String imagePage = "http://nfsfw.com" + thumb.attr("href");
+            imagePageURLs.add(imagePage);
+        }
+        return imagePageURLs;
+    }
+
+    private List<String> getSubalbumURLs(Document page) {
+        // Check if sub-albums are present on this page
+        List<String> subalbumURLs = new ArrayList<>();
+        for (Element suba : page.select("td.IMG > a")) {
+            String subURL = "http://nfsfw.com" + suba.attr("href");
+            subalbumURLs.add(subURL);
+        }
+        return subalbumURLs;
     }

     /**
@@ -175,23 +226,10 @@ public class NfsfwRipper extends AlbumRipper {
             if (file.startsWith("/")) {
                 file = "http://nfsfw.com" + file;
             }
-            String prefix = "";
-            if (Utils.getConfigBoolean("download.save_order", true)) {
-                prefix = String.format("%03d_", index);
-            }
-            addURLToDownload(new URL(file), prefix, this.subdir);
+            addURLToDownload(new URL(file), getPrefix(index), this.subdir);
         } catch (IOException e) {
             LOGGER.error("[!] Exception while loading/parsing " + this.url, e);
         }
     }
-
-    private class Pair {
-        String first;
-        String second;
-        Pair(String first, String second) {
-            this.first = first;
-            this.second = second;
-        }
-    }
 }
\ No newline at end of file
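Reviewer note: the reworked NfsfwRipper.getGID() flattens nested gallery paths into a single directory-safe ID. A quick scratch harness mirroring the cases covered by the updated NfsfwRipperTest further down (the class name NfsfwGidCheck is just for illustration):

```java
import java.net.URL;

import com.rarchives.ripme.ripper.rippers.NfsfwRipper;

public class NfsfwGidCheck {
    public static void main(String[] args) throws Exception {
        NfsfwRipper ripper = new NfsfwRipper(new URL("http://nfsfw.com/gallery/v/Kitten/"));
        // trailing slash is stripped, remaining path separators become "__"
        System.out.println(ripper.getGID(new URL("http://nfsfw.com/gallery/v/Kitten/")));         // Kitten
        System.out.println(ripper.getGID(new URL("http://nfsfw.com/gallery/v/Kitten/gif_001/"))); // Kitten__gif_001
    }
}
```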
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java
index ad0159b3..d436af1f 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/PhotobucketRipper.java
@@ -10,23 +10,65 @@ import java.util.regex.Pattern;

 import org.json.JSONArray;
 import org.json.JSONObject;
-import org.jsoup.Connection.Response;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;

-import com.rarchives.ripme.ripper.AlbumRipper;
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import com.rarchives.ripme.utils.Http;

-public class PhotobucketRipper extends AlbumRipper {
+// TODO: Probably want to add queue support for cases like this:
+// http://s732.photobucket.com/user/doublesix66/library/WARZONE?sort=3&page=1
+public class PhotobucketRipper extends AbstractHTMLRipper {

     private static final String DOMAIN = "photobucket.com",
                                 HOST   = "photobucket";
+    private static final int ITEMS_PER_PAGE = 24;
+    private static final int WAIT_BEFORE_NEXT_PAGE = 2000;

-    private Response pageResponse = null;
+    private final class AlbumMetadata {
+        private final String url;
+        private final String location;
+        private final int sortOrder;
+        private int currPage = 1;
+        private int numPages;
+
+        private AlbumMetadata(JSONObject data) {
+            this.url = data.getString("url");
+            this.location = data.getString("location")
+                                .replace(" ", "_");
+            this.sortOrder = data.getInt("sortOrder");
+        }
+
+        private String getCurrPageURL() {
+            return url + String.format("?sort=%d&page=%d",
+                    sortOrder, currPage);
+        }
+    }
+
+    private final Pattern collDataPattern;
+    private final Pattern pbURLPattern;
+
+    // all albums including sub-albums to rip
+    private List<AlbumMetadata> albums;
+    // the album currently being ripped
+    private AlbumMetadata currAlbum;
+    // a new index per album downloaded
+    private int index = 0;

     public PhotobucketRipper(URL url) throws IOException {
         super(url);
+        this.collDataPattern = Pattern.compile(
+                "^.*collectionData: (\\{.*}).*$", Pattern.DOTALL
+        );
+        this.pbURLPattern = Pattern.compile(
+                "^https?://([a-zA-Z0-9]+)\\.photobucket\\.com/user/"
+                        + "([a-zA-Z0-9_\\-]+)/library/([^?]*).*$"
+        );
+    }
+
+    @Override
+    protected String getDomain() {
+        return DOMAIN;
     }

     @Override
@@ -34,45 +76,35 @@ public class PhotobucketRipper extends AlbumRipper {
         return HOST;
     }

+    @Override
     public URL sanitizeURL(URL url) throws MalformedURLException {
         LOGGER.info(url);
         String u = url.toExternalForm();
         if (u.contains("?")) {
+            // strip options from URL
             u = u.substring(0, u.indexOf("?"));
-            return new URL(u);
         }
-        else {
-            return url;
+        if (!u.endsWith("/")) {
+            // append trailing slash
+            u = u + "/";
         }
-    }
-
-    public String getAlbumTitle(URL url) throws MalformedURLException {
-        try {
-            // Attempt to use album title as GID
-            if (pageResponse == null) {
-                pageResponse = Http.url(url).response();
-            }
-            Document albumDoc = pageResponse.parse();
-            Elements els = albumDoc.select("div.libraryTitle > h1");
-            if (els.isEmpty()) {
-                throw new IOException("Could not find libraryTitle at " + url);
-            }
-            return els.get(0).text();
-        } catch (IOException e) {
-            // Fall back to default album naming convention
-        }
-        return super.getAlbumTitle(url);
+        return new URL(u);
     }

     @Override
     public String getGID(URL url) throws MalformedURLException {
-        Pattern p; Matcher m;
+        Matcher m;
+
+        URL sanitized = sanitizeURL(url);
         // http://s844.photobucket.com/user/SpazzySpizzy/library/Lady%20Gaga?sort=3&page=1
-        p = Pattern.compile("^https?://[a-zA-Z0-9]+\\.photobucket\\.com/user/([a-zA-Z0-9_\\-]+)/library.*$");
-        m = p.matcher(url.toExternalForm());
+        m = pbURLPattern.matcher(sanitized.toExternalForm());
         if (m.matches()) {
-            return m.group(1);
+            // the username is not really a unique GID, because the same user
+            // can have multiple albums, but on the other hand, using HOST_GID
+            // as the save directory means we can group ripped albums of the
+            // same user.
+            return m.group(2);
         }

         throw new MalformedURLException(
@@ -81,134 +113,174 @@ public class PhotobucketRipper extends AlbumRipper {
                 + " Got: " + url);
     }

+
+
+    // Page iteration
+
+
+
     @Override
-    public void rip() throws IOException {
-        List<String> subalbums = ripAlbumAndGetSubalbums(this.url.toExternalForm());
-
-        List<String> subsToRip = new ArrayList<>(),
-                     rippedSubs = new ArrayList<>();
-
-        for (String sub : subalbums) {
-            subsToRip.add(sub);
+    protected Document getFirstPage() throws IOException {
+        if (this.currAlbum == null) {
+            this.albums = getAlbumMetadata(this.url.toExternalForm());
+            LOGGER.info("Detected " + albums.size() + " albums in total");
         }
-
-        while (!subsToRip.isEmpty() && !isStopped()) {
-            try {
-                Thread.sleep(1000);
-            } catch (InterruptedException e) {
-                break;
-            }
-            String nextSub = subsToRip.remove(0);
-            rippedSubs.add(nextSub);
-            LOGGER.info("Attempting to rip next subalbum: " + nextSub);
-            try {
-                pageResponse = null;
-                subalbums = ripAlbumAndGetSubalbums(nextSub);
-            } catch (IOException e) {
-                LOGGER.error("Error while ripping " + nextSub, e);
-                break;
-            }
-            for (String subalbum : subalbums) {
-                if (!subsToRip.contains(subalbum) && !rippedSubs.contains(subalbum)) {
-                    subsToRip.add(subalbum);
-                }
-            }
-        }
-        waitForThreads();
+        this.currAlbum = this.albums.remove(0);
+        // NOTE: Why not just get the media count from the metadata JSON?
+        //
+        // Because that data might not reflect what the user sees on the page,
+        // and can lead to iterating more pages than there actually are.
+        //
+        // An example:
+        // Metadata JSON -> AlbumStats: 146 images + 0 videos -> 146 items/7 pages
+        // http://s1255.photobucket.com/api/user/mimajki/album/Movie%20gifs/get?subAlbums=48&json=1
+        // Actual item count when looking at the album url: 131 items/6 pages
+        // http://s1255.photobucket.com/user/mimajki/library/Movie%20gifs?sort=6&page=1
+        Document page = Http.url(currAlbum.getCurrPageURL()).get();
+        JSONObject collectionData = getCollectionData(page);
+        int totalNumItems = collectionData.getInt("total");
+        this.currAlbum.numPages = (int) Math.ceil(
+                (double) totalNumItems / (double) ITEMS_PER_PAGE);
+        this.index = 0;
+        return page;
     }

-    private List<String> ripAlbumAndGetSubalbums(String theUrl) throws IOException {
-        int filesIndex = 0,
-            filesTotal = 0,
-            pageIndex = 0;
-        String currentAlbumPath = null,
-               url = null;
-
-        while (pageIndex == 0 || filesIndex < filesTotal) {
-            if (isStopped()) {
-                break;
-            }
-            pageIndex++;
-            if (pageIndex > 1 || pageResponse == null) {
-                url = theUrl + String.format("?sort=3&page=%d", pageIndex);
-                LOGGER.info("    Retrieving " + url);
-                pageResponse = Http.url(url).response();
-            }
-            Document albumDoc = pageResponse.parse();
-            // Retrieve JSON from request
-            String jsonString = null;
-            for (Element script : albumDoc.select("script[type=text/javascript]")) {
-                String data = script.data();
-                // Ensure this chunk of javascript contains the album info
-                if (!data.contains("libraryAlbumsPageCollectionData")) {
-                    continue;
-                }
-                // Grab the JSON
-                Pattern p; Matcher m;
-                p = Pattern.compile("^.*collectionData: (\\{.*}).*$", Pattern.DOTALL);
-                m = p.matcher(data);
-                if (m.matches()) {
-                    jsonString = m.group(1);
-                    break;
-                }
-            }
-            if (jsonString == null) {
-                LOGGER.error("Unable to find JSON data at URL: " + url);
-                break;
-            }
-            JSONObject json = new JSONObject(jsonString);
-            JSONObject items = json.getJSONObject("items");
-            JSONArray objects = items.getJSONArray("objects");
-            filesTotal = items.getInt("total");
-            currentAlbumPath = json.getString("currentAlbumPath");
-            for (int i = 0; i < objects.length(); i++) {
-                JSONObject object = objects.getJSONObject(i);
-                String image = object.getString("fullsizeUrl");
-                filesIndex += 1;
-                addURLToDownload(new URL(image),
-                        "",
-                        object.getString("location").replaceAll(" ", "_"),
-                        albumDoc.location(),
-                        pageResponse.cookies());
-            }
+    @Override
+    public Document getNextPage(Document page) throws IOException {
+        currAlbum.currPage++;
+        boolean endOfAlbum = currAlbum.currPage > currAlbum.numPages;
+        boolean noMoreSubalbums = albums.isEmpty();
+        if (endOfAlbum && noMoreSubalbums) {
+            throw new IOException("No more pages");
         }
-        // Get subalbums
-        if (url != null) {
-            return getSubAlbums(url, currentAlbumPath);
-        } else {
-            return new ArrayList<>();
-        }
-    }
-
-    private List<String> getSubAlbums(String url, String currentAlbumPath) {
-        List<String> result = new ArrayList<>();
-        String subdomain = url.substring(url.indexOf("://") + 3);
-        subdomain = subdomain.substring(0, subdomain.indexOf("."));
-        String apiUrl = "http://" + subdomain + ".photobucket.com/component/Albums-SubalbumList"
-                + "?deferCollapsed=true"
-                + "&albumPath=" + currentAlbumPath // %2Falbums%2Fab10%2FSpazzySpizzy"
-                + "&json=1";
         try {
-            LOGGER.info("Loading " + apiUrl);
-            JSONObject json = Http.url(apiUrl).getJSON();
-            JSONArray subalbums = json.getJSONObject("body").getJSONArray("subAlbums");
-            for (int i = 0; i < subalbums.length(); i++) {
-                String suburl =
-                        "http://"
-                        + subdomain
-                        + ".photobucket.com"
-                        + subalbums.getJSONObject(i).getString("path");
-                suburl = suburl.replace(" ", "%20");
-                result.add(suburl);
-            }
-        } catch (IOException e) {
-            LOGGER.error("Failed to get subalbums from " + apiUrl, e);
+            Thread.sleep(WAIT_BEFORE_NEXT_PAGE);
+        } catch (InterruptedException e) {
+            LOGGER.info("Interrupted while waiting before getting next page");
+        }
+        if (endOfAlbum) {
+            LOGGER.info("Turning to next album " + albums.get(0).url);
+            return getFirstPage();
+        } else {
+            LOGGER.info("Turning to page " + currAlbum.currPage
+                    + " of album " + currAlbum.url);
+            return Http.url(currAlbum.getCurrPageURL()).get();
         }
-        return result;
     }

-    public boolean canRip(URL url) {
-        return url.getHost().endsWith(DOMAIN);
+
+
+    // Media parsing
+
+
+
+    @Override
+    protected List<String> getURLsFromPage(Document page) {
+        JSONObject collectionData = getCollectionData(page);
+        if (collectionData == null) {
+            LOGGER.error("Unable to find JSON data at URL: " + page.location());
+            return null;
+        } else {
+            return getImageURLs(collectionData);
+        }
     }

+    private JSONObject getCollectionData(Document page) {
+        // Retrieve JSON from a script tag in the returned document
+        for (Element script : page.select("script[type=text/javascript]")) {
+            String data = script.data();
+            // Ensure this chunk of javascript contains the album info
+            if (data.contains("libraryAlbumsPageCollectionData")) {
+                Matcher m = collDataPattern.matcher(data);
+                if (m.matches()) {
+                    // Grab the JSON
+                    return new JSONObject(m.group(1));
+                }
+            }
+        }
+        return null;
+    }
+
+    private List<String> getImageURLs(JSONObject json) {
+        List<String> results = new ArrayList<>();
+        JSONObject items = json.getJSONObject("items");
+        JSONArray objects = items.getJSONArray("objects");
+        for (int i = 0; i < objects.length(); i++) {
+            JSONObject object = objects.getJSONObject(i);
+            String imgURL = object.getString("fullsizeUrl");
+            results.add(imgURL);
+        }
+        return results;
+    }
+
+    @Override
+    protected void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(++this.index), currAlbum.location);
+    }
+
+
+
+    // helper methods (for album metadata retrieval)
+
+
+
+    private List<AlbumMetadata> getAlbumMetadata(String albumURL)
+            throws IOException {
+        JSONObject data = getAlbumMetadataJSON(albumURL);
+        List<AlbumMetadata> metadata = new ArrayList<>();
+        metadata.add(new AlbumMetadata(data));
+        if (!data.getString("location").equals("")) {
+            // if the location were to equal "", then we are at the profile
+            // page of a user. Ripping all sub-albums there would mean ripping
+            // all albums of a user (not supported; only rip items in a user's
+            // personal bucket).
+            for (JSONObject sub : getSubAlbumJSONs(data)) {
+                metadata.add(new AlbumMetadata(sub));
+            }
+        }
+        LOGGER.info("Successfully retrieved and parsed metadata");
+        return metadata;
+    }
+
+    private JSONObject getAlbumMetadataJSON(String albumURL)
+            throws IOException {
+        String subdomain, user, albumTitle;
+        Matcher m = pbURLPattern.matcher(albumURL);
+        if (!m.matches()) {
+            throw new MalformedURLException("invalid URL " + albumURL);
+        }
+        subdomain = m.group(1);
+        user = m.group(2);
+        albumTitle = m.group(3);
+        if (albumTitle.endsWith("/")) {
+            albumTitle = albumTitle.substring(0, albumTitle.length() - 1);
+        }
+        String apiURL = String.format("http://%s.photobucket.com/api/user/"
+                        + "%s/album/%s/get?subAlbums=%d&json=1",
+                subdomain, user, albumTitle, ITEMS_PER_PAGE);
+        LOGGER.info("Loading " + apiURL);
+        JSONObject data = Http.url(apiURL).getJSON().getJSONObject("data");
+        if (data.has("subAlbums")) {
+            int count = data.getInt("subAlbumCount");
+            if (count > ITEMS_PER_PAGE) {
+                apiURL = String.format("http://%s.photobucket.com/api/user/"
+                                + "%s/album/%s/get?subAlbums=%d&json=1",
+                        subdomain, user, albumTitle, count);
+                data = Http.url(apiURL).getJSON().getJSONObject("data");
+            }
+        }
+        return data;
+    }
+
+    private List<JSONObject> getSubAlbumJSONs(JSONObject data) {
+        List<JSONObject> subalbumJSONs = new ArrayList<>();
+        if (data.has("subAlbums")) {
+            JSONArray subalbums = data.getJSONArray("subAlbums");
+            for (int idx = 0; idx < subalbums.length(); idx++) {
+                JSONObject subalbumJSON = subalbums.getJSONObject(idx);
+                subalbumJSONs.add(subalbumJSON);
+            }
+        }
+        return subalbumJSONs;
+    }
 }
\ No newline at end of file
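Reviewer note on getFirstPage() above: it deliberately derives the page count from the collectionData total embedded in the page rather than from the metadata API, because the API count can be stale (see the NOTE in the code). A worked check of that arithmetic, using the numbers from that NOTE:

```java
public class PageCountCheck {
    public static void main(String[] args) {
        final int ITEMS_PER_PAGE = 24; // constant from PhotobucketRipper
        int staleApiTotal = 146;       // metadata API's AlbumStats count
        int pageTotal = 131;           // collectionData "total" actually served on the page
        System.out.println((int) Math.ceil((double) staleApiTotal / ITEMS_PER_PAGE)); // 7 pages (one too many)
        System.out.println((int) Math.ceil((double) pageTotal / ITEMS_PER_PAGE));     // 6 pages (correct)
    }
}
```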
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/PornhubRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/PornhubRipper.java
index bffd0f2d..eb7a421b 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/PornhubRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/PornhubRipper.java
@@ -4,9 +4,12 @@ import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
@@ -17,7 +20,7 @@ import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
 import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;

-public class PornhubRipper extends AlbumRipper {
+public class PornhubRipper extends AbstractHTMLRipper {

     // All sleep times are in milliseconds
     private static final int IMAGE_SLEEP_TIME = 1000;
@@ -26,9 +29,6 @@ public class PornhubRipper extends AlbumRipper {
     // Thread pool for finding direct image links from "image" pages (html)
     private DownloadThreadPool pornhubThreadPool = new DownloadThreadPool("pornhub");

-    // Current HTML document
-    private Document albumDoc = null;
-
     public PornhubRipper(URL url) throws IOException {
         super(url);
     }
@@ -38,25 +38,63 @@ public class PornhubRipper extends AlbumRipper {
         return HOST;
     }

-    public URL sanitizeURL(URL url) throws MalformedURLException {
-        return url;
+    @Override
+    protected String getDomain() {
+        return DOMAIN;
     }

-    public String getAlbumTitle(URL url) throws MalformedURLException {
-        try {
-            // Attempt to use album title as GID
-            if (albumDoc == null) {
-                LOGGER.info("    Retrieving " + url.toExternalForm());
-                sendUpdate(STATUS.LOADING_RESOURCE, url.toString());
-                albumDoc = Http.url(url).get();
-            }
-            Elements elems = albumDoc.select(".photoAlbumTitleV2");
-            return HOST + "_" + elems.get(0).text();
-        } catch (Exception e) {
-            // Fall back to default album naming convention
-            LOGGER.warn("Failed to get album title from " + url, e);
+    @Override
+    protected Document getFirstPage() throws IOException {
+        return Http.url(url).referrer(url).get();
+    }
+
+    @Override
+    public Document getNextPage(Document page) throws IOException {
+        Elements nextPageLink = page.select("li.page_next > a");
+        if (nextPageLink.isEmpty()) {
+            throw new IOException("No more pages");
+        } else {
+            URL nextURL = new URL(this.url, nextPageLink.first().attr("href"));
+            return Http.url(nextURL).get();
+        }
+    }
+
+    @Override
+    protected List<String> getURLsFromPage(Document page) {
+        List<String> pageURLs = new ArrayList<>();
+        // Find thumbnails
+        Elements thumbs = page.select(".photoBlockBox li");
+        // Iterate over thumbnail images on page
+        for (Element thumb : thumbs) {
+            String imagePage = thumb.select(".photoAlbumListBlock > a")
+                                    .first().attr("href");
+            String fullURL = "https://pornhub.com" + imagePage;
+            pageURLs.add(fullURL);
+        }
+        return pageURLs;
+    }
+
+    @Override
+    protected void downloadURL(URL url, int index) {
+        PornhubImageThread t = new PornhubImageThread(url, index, this.workingDir);
+        pornhubThreadPool.addThread(t);
+        try {
+            Thread.sleep(IMAGE_SLEEP_TIME);
+        } catch (InterruptedException e) {
+            LOGGER.warn("Interrupted while waiting to load next image", e);
+        }
+    }
+
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        // always start on the first page of an album
+        // (strip the options after the '?')
+        String u = url.toExternalForm();
+        if (u.contains("?")) {
+            u = u.substring(0, u.indexOf("?"));
+            return new URL(u);
+        } else {
+            return url;
         }
-        return super.getAlbumTitle(url);
     }

     @Override
@@ -64,7 +102,7 @@ public class PornhubRipper extends AlbumRipper {
         Pattern p;
         Matcher m;

-        p = Pattern.compile("^.*pornhub\\.com/album/([0-9]+)$");
+        p = Pattern.compile("^.*pornhub\\.com/album/([0-9]+).*$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             return m.group(1);
@@ -77,48 +115,8 @@ public class PornhubRipper extends AlbumRipper {
     }

     @Override
-    public void rip() throws IOException {
-        int index = 0;
-        String nextUrl = this.url.toExternalForm();
-
-        if (albumDoc == null) {
-            LOGGER.info("    Retrieving album page " + nextUrl);
-            sendUpdate(STATUS.LOADING_RESOURCE, nextUrl);
-            albumDoc = Http.url(nextUrl)
-                    .referrer(this.url)
-                    .get();
-        }
-
-        // Find thumbnails
-        Elements thumbs = albumDoc.select(".photoBlockBox li");
-        if (thumbs.isEmpty()) {
-            LOGGER.debug("albumDoc: " + albumDoc);
-            LOGGER.debug("No images found at " + nextUrl);
-            return;
-        }
-
-        // Iterate over images on page
-        for (Element thumb : thumbs) {
-            if (isStopped()) {
-                break;
-            }
-            index++;
-            String imagePageUrl = thumb.select(".photoAlbumListBlock > a").first().attr("href");
-            URL imagePage = new URL(url, imagePageUrl);
-            PornhubImageThread t = new PornhubImageThread(imagePage, index, this.workingDir);
-            pornhubThreadPool.addThread(t);
-            if (isThisATest()) {
-                break;
-            }
-            try {
-                Thread.sleep(IMAGE_SLEEP_TIME);
-            } catch (InterruptedException e) {
-                LOGGER.warn("Interrupted while waiting to load next image", e);
-            }
-        }
-
-        pornhubThreadPool.waitForThreads();
-        waitForThreads();
+    public DownloadThreadPool getThreadPool() {
+        return pornhubThreadPool;
     }

     public boolean canRip(URL url) {
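Reviewer note: PornhubRipper.getNextPage() resolves the pager link against the album URL with java.net.URL's relative-resolution constructor. A quick illustration — the href value here is a plausible placeholder for what `li.page_next > a` carries, matching the `?page=N` assertions in PornhubRipperTest below:

```java
import java.net.URL;

public class NextPageResolution {
    public static void main(String[] args) throws Exception {
        URL album = new URL("https://www.pornhub.com/album/15680522");
        // same resolution getNextPage() performs via new URL(this.url, href)
        URL next = new URL(album, "/album/15680522?page=2");
        System.out.println(next); // https://www.pornhub.com/album/15680522?page=2
    }
}
```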
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/TeenplanetRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/TeenplanetRipper.java
index d25ef345..9791ab90 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/TeenplanetRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/TeenplanetRipper.java
@@ -3,51 +3,66 @@ package com.rarchives.ripme.ripper.rippers;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
-import org.jsoup.select.Elements;

-import com.rarchives.ripme.ripper.AlbumRipper;
-import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
 import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;

-public class TeenplanetRipper extends AlbumRipper {
+public class TeenplanetRipper extends AbstractHTMLRipper {

     private static final String DOMAIN = "teenplanet.org",
                                 HOST   = "teenplanet";

-    private Document albumDoc = null;
-
     public TeenplanetRipper(URL url) throws IOException {
         super(url);
     }

+    @Override
+    protected String getDomain() {
+        return DOMAIN;
+    }
+
     @Override
     public String getHost() {
         return HOST;
     }

-    public URL sanitizeURL(URL url) throws MalformedURLException {
-        return url;
+    @Override
+    protected Document getFirstPage() throws IOException {
+        return Http.url(url).get();
     }

-    public String getAlbumTitle(URL url) throws MalformedURLException {
-        try {
-            // Attempt to use album title as GID
-            if (albumDoc == null) {
-                albumDoc = Http.url(url).get();
+    @Override
+    protected List<String> getURLsFromPage(Document page) {
+        List<String> imageURLs = new ArrayList<>();
+        for (Element thumb : page.select("#galleryImages > a > img")) {
+            if (!thumb.hasAttr("src")) {
+                continue;
             }
-            Elements elems = albumDoc.select("div.header > h2");
-            return HOST + "_" + elems.get(0).text();
-        } catch (Exception e) {
-            // Fall back to default album naming convention
-            e.printStackTrace();
+            String imageURL = thumb.attr("src");
+            imageURL = imageURL.replace(
+                    "/thumbs/",
+                    "/");
+            imageURLs.add(imageURL);
         }
-        return super.getAlbumTitle(url);
+        LOGGER.info("Found " + imageURLs.size() + " image URLs");
+        return imageURLs;
+    }
+
+    @Override
+    protected void downloadURL(URL url, int index) {
+        String prefix = "";
+        if (Utils.getConfigBoolean("download.save_order", true)) {
+            prefix = String.format("%03d_", index);
+        }
+        addURLToDownload(url, prefix);
     }

     @Override
@@ -65,38 +80,4 @@ public class TeenplanetRipper extends AlbumRipper {
                 + "teenplanet.org/galleries/....html"
                 + " Got: " + url);
     }
-
-    @Override
-    public void rip() throws IOException {
-        int index = 0;
-        LOGGER.info("Retrieving " + this.url);
-        sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());
-        if (albumDoc == null) {
-            albumDoc = Http.url(url).get();
-        }
-        for (Element thumb : albumDoc.select("#galleryImages > a > img")) {
-            if (!thumb.hasAttr("src")) {
-                continue;
-            }
-            String image = thumb.attr("src");
-            image = image.replace(
-                    "/thumbs/",
-                    "/");
-            index += 1;
-            String prefix = "";
-            if (Utils.getConfigBoolean("download.save_order", true)) {
-                prefix = String.format("%03d_", index);
-            }
-            addURLToDownload(new URL(image), prefix);
-            if (isThisATest()) {
-                break;
-            }
-        }
-        waitForThreads();
-    }
-
-    public boolean canRip(URL url) {
-        return url.getHost().endsWith(DOMAIN);
-    }
-
 }
\ No newline at end of file
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NewsfilterRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NewsfilterRipperTest.java
index 4a5b55aa..c22ba9c5 100644
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NewsfilterRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NewsfilterRipperTest.java
@@ -6,5 +6,9 @@ import java.net.URL;
 import com.rarchives.ripme.ripper.rippers.NewsfilterRipper;

 public class NewsfilterRipperTest extends RippersTest {
-    // TODO add a test
+
+    public void testNewsfilterRip() throws IOException {
+        NewsfilterRipper ripper = new NewsfilterRipper(new URL("http://newsfilter.org/gallery/he-doubted-she-would-fuck-on-cam-happy-to-be-proven-wrong-216799"));
+        testRipper(ripper);
+    }
 }
\ No newline at end of file
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java
index 3f1ba6cc..7f85fa5f 100644
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java
@@ -6,11 +6,21 @@ import java.net.URL;
 import com.rarchives.ripme.ripper.rippers.NfsfwRipper;

 public class NfsfwRipperTest extends RippersTest {
-    /*
-    // https://github.com/RipMeApp/ripme/issues/291 -- nfsfw "account suspended" error; disabled flaky test in CI
-    public void testNfsfwRip() throws IOException {
+    // https://github.com/RipMeApp/ripme/issues/291 -- nfsfw "account suspended" error; disabled flaky test in CI
+    /*public void testNfsfwRip() throws IOException {
         NfsfwRipper ripper = new NfsfwRipper(new URL("http://nfsfw.com/gallery/v/Kitten/"));
         testRipper(ripper);
+    }*/
+
+    public void testGetGID() throws IOException {
+        URL url = new URL("http://nfsfw.com/gallery/v/Kitten/");
+        NfsfwRipper ripper = new NfsfwRipper(url);
+        assertEquals("Kitten", ripper.getGID(url));
+        url = new URL("http://nfsfw.com/gallery/v/Kitten");
+        assertEquals("Kitten", ripper.getGID(url));
+        url = new URL("http://nfsfw.com/gallery/v/Kitten/gif_001/");
+        assertEquals("Kitten__gif_001", ripper.getGID(url));
+        url = new URL("http://nfsfw.com/gallery/v/Kitten/gif_001");
+        assertEquals("Kitten__gif_001", ripper.getGID(url));
     }
-    */
 }
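Reviewer note: TeenplanetRipper.downloadURL() above keeps the explicit download.save_order prefix logic instead of calling getPrefix() like the other rippers in this patch; both should produce the same zero-padded prefix, so switching it over would be a harmless cleanup. A tiny demonstration of what that prefix does:

```java
public class PrefixDemo {
    public static void main(String[] args) {
        // With download.save_order enabled (the default), index 7 yields "007_",
        // so files sort in album order; with it disabled the prefix stays "".
        System.out.println(String.format("%03d_", 7) + "image.jpg"); // 007_image.jpg
    }
}
```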
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java
index dff101a0..30885eaa 100644
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PhotobucketRipperTest.java
@@ -15,7 +15,51 @@ public class PhotobucketRipperTest extends RippersTest {
         deleteDir(ripper.getWorkingDir());
     }
     */
-}
-
+    /*
+    // new test, still commented out because of the issue above,
+    // since this test also involves network IO.
+    public void testGetNextPage() throws IOException {
+        // this album should have more than enough sub-albums and pages
+        // to serve as a pretty good iteration test (barring server or
+        // network errors)
+        String baseURL = "http://s1255.photobucket.com/user/mimajki/library/Movie%20gifs?sort=6&page=1";
+        URL url = new URL(baseURL);
+        PhotobucketRipper ripper = new PhotobucketRipper(url);
+        org.jsoup.nodes.Document page = null;
+        try {
+            // I'm not sure it makes much sense that getFirstPage()
+            // is not public while getNextPage() is.
+            java.lang.reflect.Method method = ripper.getClass()
+                    .getDeclaredMethod("getFirstPage");
+            method.setAccessible(true);
+            page = (org.jsoup.nodes.Document) method.invoke(ripper);
+        } catch (Exception e) {
+            e.printStackTrace();
+            fail("Calling getFirstPage() failed");
+        }
+        int numPagesRemaining = 38;
+        for (int idx = 0; idx < numPagesRemaining; idx++) {
+            page = ripper.getNextPage(page);
+            System.out.println("URL: " + page.location());
+        }
+        try {
+            page = ripper.getNextPage(page);
+            fail("getNextPage() did not throw an exception on the last page");
+        } catch (IOException e) {
+            assertEquals("No more pages", e.getMessage());
+        }
+    }*/

+    public void testGetGID() throws IOException {
+        URL url = new URL("http://s732.photobucket.com/user/doublesix66/library/Army%20Painter%20examples?sort=3&page=1");
+        PhotobucketRipper ripper = new PhotobucketRipper(url);
+        assertEquals("doublesix66", ripper.getGID(url));
+        url = new URL("http://s732.photobucket.com/user/doublesix66/library/Army%20Painter%20examples/Painting%20examples?page=1&sort=3");
+        assertEquals("doublesix66", ripper.getGID(url));
+        url = new URL("http://s844.photobucket.com/user/SpazzySpizzy/library/Album%20Covers");
+        assertEquals("SpazzySpizzy", ripper.getGID(url));
+        url = new URL("http://s844.photobucket.com/user/SpazzySpizzy/library");
+        assertEquals("SpazzySpizzy", ripper.getGID(url));
+    }
+}
\ No newline at end of file
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PornhubRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PornhubRipperTest.java
index 74bee8d9..278ad97c 100644
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PornhubRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/PornhubRipperTest.java
@@ -4,6 +4,8 @@ import java.io.IOException;
 import java.net.URL;

 import com.rarchives.ripme.ripper.rippers.PornhubRipper;
+import com.rarchives.ripme.utils.Http;
+import org.jsoup.nodes.Document;

 public class PornhubRipperTest extends RippersTest {
     public void testPornhubRip() throws IOException {
@@ -12,8 +14,28 @@ public class PornhubRipperTest extends RippersTest {
     }

     public void testGetGID() throws IOException {
-        URL url = new URL("https://www.pornhub.com/album/15680522");
+        URL url = new URL("https://www.pornhub.com/album/15680522?page=2");
         PornhubRipper ripper = new PornhubRipper(url);
         assertEquals("15680522", ripper.getGID(url));
+        url = new URL("https://www.pornhub.com/album/15680522");
+        assertEquals("15680522", ripper.getGID(url));
+    }
+
+    // alternate album, with only 2 pages: https://www.pornhub.com/album/4771891
+    public void testGetNextPage() throws IOException {
+        String baseURL = "https://www.pornhub.com/album/15680522";
+        PornhubRipper ripper = new PornhubRipper(new URL(baseURL));
+        Document page = Http.url(baseURL).get();
+        int numPagesRemaining = 4;
+        for (int idx = 0; idx < numPagesRemaining; idx++) {
+            page = ripper.getNextPage(page);
+            assertEquals(baseURL + "?page=" + (idx + 2), page.location());
+        }
+        try {
+            page = ripper.getNextPage(page);
+            fail("getNextPage() did not throw an exception on the last page");
+        } catch (IOException e) {
+            assertEquals("No more pages", e.getMessage());
+        }
     }
 }