From 5ae2bb43e80d404d85ea35cabc509a5611d1b7aa Mon Sep 17 00:00:00 2001 From: Peter Szakacs Date: Mon, 29 Oct 2018 17:05:31 +0100 Subject: [PATCH] Make NfsfwRipper inherit from AbstractHTMLRipper Also make it have queue support since there are some galleries that have only subalbum links. In case of galleries with both images and subalbums, such as the one in the unit test, first rip the images in the base album and then the images in the subalbum (and save them to a subdirectory of the current album directory). Note that Nfsfw.com showed some read time-outs when ripping the same albums that it ripped OK before. Adding a delay in getNextPage() seems to help somewhat, but if any issues are still encountered, the simplest fix for now seems to be to wait a while before trying to rip the album again. --- .../ripme/ripper/rippers/NfsfwRipper.java | 232 ++++++++++-------- .../tst/ripper/rippers/NfsfwRipperTest.java | 18 +- 2 files changed, 149 insertions(+), 101 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java index 3585b6bb..b525a39a 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/NfsfwRipper.java @@ -8,6 +8,7 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.rarchives.ripme.ripper.AbstractHTMLRipper; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; @@ -18,13 +19,22 @@ import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Utils; -public class NfsfwRipper extends AlbumRipper { +public class NfsfwRipper extends AbstractHTMLRipper { private static final String DOMAIN = "nfsfw.com", HOST = "nfsfw"; - private Document albumDoc = null; + private int index = 0; + private String currentDir = ""; + private List 
subalbumURLs = new ArrayList<>(); + private Pattern subalbumURLPattern = Pattern.compile( + "https?://[wm.]*nfsfw.com/gallery/v/[^/]+/(.+)$" + ); + + // cached first page + private Document fstPage; + // threads pool for downloading images from image pages private DownloadThreadPool nfsfwThreadPool; public NfsfwRipper(URL url) throws IOException { @@ -32,39 +42,104 @@ public class NfsfwRipper extends AlbumRipper { nfsfwThreadPool = new DownloadThreadPool("NFSFW"); } + @Override + protected String getDomain() { + return DOMAIN; + } + @Override public String getHost() { return HOST; } @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - return url; + protected Document getFirstPage() throws IOException { + // cache the first page + this.fstPage = Http.url(url).get(); + return fstPage; } @Override - public String getAlbumTitle(URL url) throws MalformedURLException { - try { - // Attempt to use album title as GID - if (albumDoc == null) { - albumDoc = Http.url(url).get(); + public Document getNextPage(Document page) throws IOException { + String nextURL = null; + Elements a = page.select("a.next"); + if (!a.isEmpty()){ + // Get next page of current album + nextURL = "http://nfsfw.com" + a.first().attr("href"); + } else if (!subalbumURLs.isEmpty()){ + // Get next sub-album + nextURL = subalbumURLs.remove(0); + LOGGER.info("Detected subalbum URL at:" + nextURL); + Matcher m = subalbumURLPattern.matcher(nextURL); + if (m.matches()) { + // Set the new save directory and save images with a new index + this.currentDir = m.group(1); + this.index = 0; + } else { + LOGGER.error("Invalid sub-album URL: " + nextURL); + nextURL = null; } - String title = albumDoc.select("h2").first().text().trim(); - return "nfsfw_" + Utils.filesystemSafe(title); - } catch (Exception e) { - // Fall back to default album naming convention } - return super.getAlbumTitle(url); + // Wait + try { + Thread.sleep(2000); + } catch (InterruptedException e) { + 
LOGGER.error("Interrupted while waiting to load next page", e); + } + if (nextURL == null){ + throw new IOException("No more pages"); + } else { + return Http.url(nextURL).get(); + } + } + + @Override + protected List getURLsFromPage(Document page) { + List imagePageURLs = getImagePageURLs(page); + + // Check if any sub-albums are present on this page + List subalbumURLs = getSubalbumURLs(page); + this.subalbumURLs.addAll(subalbumURLs); + + return imagePageURLs; + } + + @Override + protected void downloadURL(URL url, int index) { + // if we are now downloading a sub-album, all images in it + // should be indexed starting from 0 + if (!this.currentDir.equals("")){ + index = ++this.index; + } + NfsfwImageThread t = new NfsfwImageThread(url, currentDir, index); + nfsfwThreadPool.addThread(t); + } + + @Override + public URL sanitizeURL(URL url) throws MalformedURLException { + // always start on the first page of an album + // (strip the options after the '?') + String u = url.toExternalForm(); + if (u.contains("?")) { + u = u.substring(0, u.indexOf("?")); + return new URL(u); + } else { + return url; + } } @Override public String getGID(URL url) throws MalformedURLException { Pattern p; Matcher m; - p = Pattern.compile("https?://[wm.]*nfsfw.com/gallery/v/([a-zA-Z0-9\\-_]+).*"); + p = Pattern.compile("https?://[wm.]*nfsfw.com/gallery/v/(.*)$"); m = p.matcher(url.toExternalForm()); if (m.matches()) { - return m.group(1); + String group = m.group(1); + if (group.endsWith("/")) { + group = group.substring(0, group.length() - 1); + } + return group.replaceAll("/", "__"); } throw new MalformedURLException( @@ -74,75 +149,51 @@ public class NfsfwRipper extends AlbumRipper { } @Override - public void rip() throws IOException { - List subAlbums = new ArrayList<>(); - int index = 0; - subAlbums.add(new Pair(this.url.toExternalForm(), "")); - while (!subAlbums.isEmpty()) { - if (isStopped()) { - break; - } - Pair nextAlbum = subAlbums.remove(0); - String nextURL = 
nextAlbum.first; - String nextSubalbum = nextAlbum.second; - sendUpdate(STATUS.LOADING_RESOURCE, nextURL); - LOGGER.info(" Retrieving " + nextURL); - if (albumDoc == null) { - albumDoc = Http.url(nextURL).get(); - } - // Subalbums - for (Element suba : albumDoc.select("td.IMG > a")) { - if (isStopped() || isThisATest()) { - break; - } - String subURL = "http://nfsfw.com" + suba.attr("href"); - String subdir = subURL; - while (subdir.endsWith("/")) { - subdir = subdir.substring(0, subdir.length() - 1); - } - subdir = subdir.substring(subdir.lastIndexOf("/") + 1); - subAlbums.add(new Pair(subURL, subdir)); - } - // Images - for (Element thumb : albumDoc.select("td.giItemCell > div > a")) { - if (isStopped()) { - break; - } - String imagePage = "http://nfsfw.com" + thumb.attr("href"); - try { - NfsfwImageThread t = new NfsfwImageThread(new URL(imagePage), nextSubalbum, ++index); - nfsfwThreadPool.addThread(t); - if (isThisATest()) { - break; - } - } catch (MalformedURLException mue) { - LOGGER.warn("Invalid URL: " + imagePage); - } - } - if (isThisATest()) { - break; - } - // Get next page - for (Element a : albumDoc.select("a.next")) { - subAlbums.add(0, new Pair("http://nfsfw.com" + a.attr("href"), "")); - break; - } - // Insert next page at the top - albumDoc = null; - // Wait - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - LOGGER.error("Interrupted while waiting to load next page", e); - throw new IOException(e); - } - } - nfsfwThreadPool.waitForThreads(); - waitForThreads(); + public DownloadThreadPool getThreadPool() { + return nfsfwThreadPool; } - public boolean canRip(URL url) { - return url.getHost().endsWith(DOMAIN); + @Override + public boolean hasQueueSupport() { + return true; + } + + @Override + public boolean pageContainsAlbums(URL url) { + List imageURLs = getImagePageURLs(fstPage); + List subalbumURLs = getSubalbumURLs(fstPage); + return imageURLs.isEmpty() && !subalbumURLs.isEmpty(); + } + + @Override + public List 
getAlbumsToQueue(Document doc) { + return getSubalbumURLs(doc); + } + + // helper methods + + private List getImagePageURLs(Document page){ + // get image pages + // NOTE: It might be possible to get the (non-thumbnail) image URL + // without going to its page first as there seems to be a pattern + // between the thumb and actual image URLs, but that is outside the + // scope of the current issue being solved. + List imagePageURLs = new ArrayList<>(); + for (Element thumb : page.select("td.giItemCell > div > a")) { + String imagePage = "http://nfsfw.com" + thumb.attr("href"); + imagePageURLs.add(imagePage); + } + return imagePageURLs; + } + + private List getSubalbumURLs(Document page){ + // Check if sub-albums are present on this page + List subalbumURLs = new ArrayList<>(); + for (Element suba : page.select("td.IMG > a")) { + String subURL = "http://nfsfw.com" + suba.attr("href"); + subalbumURLs.add(subURL); + } + return subalbumURLs; } /** @@ -175,23 +226,10 @@ public class NfsfwRipper extends AlbumRipper { if (file.startsWith("/")) { file = "http://nfsfw.com" + file; } - String prefix = ""; - if (Utils.getConfigBoolean("download.save_order", true)) { - prefix = String.format("%03d_", index); - } - addURLToDownload(new URL(file), prefix, this.subdir); + addURLToDownload(new URL(file), getPrefix(index), this.subdir); } catch (IOException e) { LOGGER.error("[!] 
Exception while loading/parsing " + this.url, e); } } } - - private class Pair { - String first; - String second; - Pair(String first, String second) { - this.first = first; - this.second = second; - } - } } \ No newline at end of file diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java index 3f1ba6cc..7f85fa5f 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/NfsfwRipperTest.java @@ -6,11 +6,21 @@ import java.net.URL; import com.rarchives.ripme.ripper.rippers.NfsfwRipper; public class NfsfwRipperTest extends RippersTest { - /* - // https://github.com/RipMeApp/ripme/issues/291 -- nfsfw "account suspended" error; disabled flaky test in CI - public void testNfsfwRip() throws IOException { + // https://github.com/RipMeApp/ripme/issues/291 -- nfsfw "account suspended" error; disabled flaky test in CI + /*public void testNfsfwRip() throws IOException { NfsfwRipper ripper = new NfsfwRipper(new URL("http://nfsfw.com/gallery/v/Kitten/")); testRipper(ripper); + }*/ + + public void testGetGID() throws IOException { + URL url = new URL("http://nfsfw.com/gallery/v/Kitten/"); + NfsfwRipper ripper = new NfsfwRipper(url); + assertEquals("Kitten", ripper.getGID(url)); + url = new URL("http://nfsfw.com/gallery/v/Kitten"); + assertEquals("Kitten", ripper.getGID(url)); + url = new URL("http://nfsfw.com/gallery/v/Kitten/gif_001/"); + assertEquals("Kitten__gif_001", ripper.getGID(url)); + url = new URL("http://nfsfw.com/gallery/v/Kitten/gif_001/"); + assertEquals("Kitten__gif_001", ripper.getGID(url)); } - */ }