Mirror of https://github.com/RipMeApp/ripme.git (synced 2025-09-02 02:12:45 +02:00)
Make PornhubRipper inherit from AbstractHTMLRipper
Also ensure that albums with multiple pages have all of their pages downloaded (tested).
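With this change the hand-rolled rip() loop (removed below) goes away: AbstractHTMLRipper drives the whole rip, and the subclass only supplies page fetching, link extraction, and downloading. A minimal sketch of that template-method loop, assuming the four hooks this commit overrides (illustration only, with a hypothetical class name; the real loop lives in com.rarchives.ripme.ripper.AbstractHTMLRipper):

    import java.io.IOException;
    import java.net.URL;
    import java.util.List;

    import org.jsoup.nodes.Document;

    // Sketch of the base-class contract; not ripme's actual source.
    abstract class HtmlRipperSketch {
        protected abstract Document getFirstPage() throws IOException;
        protected abstract Document getNextPage(Document page) throws IOException;
        protected abstract List<String> getURLsFromPage(Document page);
        protected abstract void downloadURL(URL url, int index);

        public void rip() throws IOException {
            int index = 0;
            Document page = getFirstPage();
            while (page != null) {
                for (String imageURL : getURLsFromPage(page)) {
                    downloadURL(new URL(imageURL), ++index);
                }
                try {
                    // PornhubRipper follows the "li.page_next > a" link here,
                    // which is what makes multi-page albums download fully
                    page = getNextPage(page);
                } catch (IOException e) {
                    // getNextPage throws "No more pages" on the last page
                    page = null;
                }
            }
        }
    }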
PornhubRipper.java
@@ -4,9 +4,12 @@ import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
@@ -17,7 +20,7 @@ import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
 import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;
 
-public class PornhubRipper extends AlbumRipper {
+public class PornhubRipper extends AbstractHTMLRipper {
     // All sleep times are in milliseconds
     private static final int IMAGE_SLEEP_TIME = 1000;
 
@@ -26,9 +29,6 @@ public class PornhubRipper extends AlbumRipper {
     // Thread pool for finding direct image links from "image" pages (html)
     private DownloadThreadPool pornhubThreadPool = new DownloadThreadPool("pornhub");
 
-    // Current HTML document
-    private Document albumDoc = null;
-
     public PornhubRipper(URL url) throws IOException {
         super(url);
     }
@@ -38,25 +38,63 @@ public class PornhubRipper extends AlbumRipper {
         return HOST;
     }
 
-    public URL sanitizeURL(URL url) throws MalformedURLException {
-        return url;
+    @Override
+    protected String getDomain() {
+        return DOMAIN;
     }
 
-    public String getAlbumTitle(URL url) throws MalformedURLException {
-        try {
-            // Attempt to use album title as GID
-            if (albumDoc == null) {
-                LOGGER.info(" Retrieving " + url.toExternalForm());
-                sendUpdate(STATUS.LOADING_RESOURCE, url.toString());
-                albumDoc = Http.url(url).get();
-            }
-            Elements elems = albumDoc.select(".photoAlbumTitleV2");
-            return HOST + "_" + elems.get(0).text();
-        } catch (Exception e) {
-            // Fall back to default album naming convention
-            LOGGER.warn("Failed to get album title from " + url, e);
-        }
-        return super.getAlbumTitle(url);
+    @Override
+    protected Document getFirstPage() throws IOException {
+        return Http.url(url).referrer(url).get();
+    }
+
+    @Override
+    public Document getNextPage(Document page) throws IOException {
+        Elements nextPageLink = page.select("li.page_next > a");
+        if (nextPageLink.isEmpty()){
+            throw new IOException("No more pages");
+        } else {
+            URL nextURL = new URL(this.url, nextPageLink.first().attr("href"));
+            return Http.url(nextURL).get();
+        }
+    }
+
+    @Override
+    protected List<String> getURLsFromPage(Document page) {
+        List<String> pageURLs = new ArrayList<>();
+        // Find thumbnails
+        Elements thumbs = page.select(".photoBlockBox li");
+        // Iterate over thumbnail images on page
+        for (Element thumb : thumbs) {
+            String imagePage = thumb.select(".photoAlbumListBlock > a")
+                    .first().attr("href");
+            String fullURL = "https://pornhub.com" + imagePage;
+            pageURLs.add(fullURL);
+        }
+        return pageURLs;
+    }
+
+    @Override
+    protected void downloadURL(URL url, int index) {
+        PornhubImageThread t = new PornhubImageThread(url, index, this.workingDir);
+        pornhubThreadPool.addThread(t);
+        try {
+            Thread.sleep(IMAGE_SLEEP_TIME);
+        } catch (InterruptedException e) {
+            LOGGER.warn("Interrupted while waiting to load next image", e);
+        }
+    }
+
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        // always start on the first page of an album
+        // (strip the options after the '?')
+        String u = url.toExternalForm();
+        if (u.contains("?")) {
+            u = u.substring(0, u.indexOf("?"));
+            return new URL(u);
+        } else {
+            return url;
+        }
     }
 
     @Override
@@ -64,7 +102,7 @@ public class PornhubRipper extends AlbumRipper {
         Pattern p;
         Matcher m;
 
-        p = Pattern.compile("^.*pornhub\\.com/album/([0-9]+)$");
+        p = Pattern.compile("^.*pornhub\\.com/album/([0-9]+).*$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             return m.group(1);
@@ -77,48 +115,8 @@ public class PornhubRipper extends AlbumRipper {
     }
 
     @Override
-    public void rip() throws IOException {
-        int index = 0;
-        String nextUrl = this.url.toExternalForm();
-
-        if (albumDoc == null) {
-            LOGGER.info(" Retrieving album page " + nextUrl);
-            sendUpdate(STATUS.LOADING_RESOURCE, nextUrl);
-            albumDoc = Http.url(nextUrl)
-                    .referrer(this.url)
-                    .get();
-        }
-
-        // Find thumbnails
-        Elements thumbs = albumDoc.select(".photoBlockBox li");
-        if (thumbs.isEmpty()) {
-            LOGGER.debug("albumDoc: " + albumDoc);
-            LOGGER.debug("No images found at " + nextUrl);
-            return;
-        }
-
-        // Iterate over images on page
-        for (Element thumb : thumbs) {
-            if (isStopped()) {
-                break;
-            }
-            index++;
-            String imagePageUrl = thumb.select(".photoAlbumListBlock > a").first().attr("href");
-            URL imagePage = new URL(url, imagePageUrl);
-            PornhubImageThread t = new PornhubImageThread(imagePage, index, this.workingDir);
-            pornhubThreadPool.addThread(t);
-            if (isThisATest()) {
-                break;
-            }
-            try {
-                Thread.sleep(IMAGE_SLEEP_TIME);
-            } catch (InterruptedException e) {
-                LOGGER.warn("Interrupted while waiting to load next image", e);
-            }
-        }
-
-        pornhubThreadPool.waitForThreads();
-        waitForThreads();
-    }
-
+    public DownloadThreadPool getThreadPool(){
+        return pornhubThreadPool;
+    }
+
     public boolean canRip(URL url) {
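One detail of getNextPage worth noting: it builds the next page's URL with the two-argument java.net.URL constructor, which resolves the (typically relative) href of the "next" link against the current album URL. A standalone illustration (class name and href value are ours, but the ?page=N shape matches what the test below asserts):

    import java.net.MalformedURLException;
    import java.net.URL;

    public class NextPageResolution {
        public static void main(String[] args) throws MalformedURLException {
            URL album = new URL("https://www.pornhub.com/album/15680522");
            // new URL(context, spec) resolves a relative link against the album page
            URL next = new URL(album, "/album/15680522?page=2");
            System.out.println(next); // https://www.pornhub.com/album/15680522?page=2
        }
    }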
PornhubRipperTest.java
@@ -4,6 +4,8 @@ import java.io.IOException;
 import java.net.URL;
 
 import com.rarchives.ripme.ripper.rippers.PornhubRipper;
+import com.rarchives.ripme.utils.Http;
+import org.jsoup.nodes.Document;
 
 public class PornhubRipperTest extends RippersTest {
     public void testPornhubRip() throws IOException {
@@ -12,8 +14,28 @@ public class PornhubRipperTest extends RippersTest {
     }
 
     public void testGetGID() throws IOException {
-        URL url = new URL("https://www.pornhub.com/album/15680522");
+        URL url = new URL("https://www.pornhub.com/album/15680522?page=2");
         PornhubRipper ripper = new PornhubRipper(url);
         assertEquals("15680522", ripper.getGID(url));
+        url = new URL("https://www.pornhub.com/album/15680522");
+        assertEquals("15680522", ripper.getGID(url));
     }
+
+    // alternate album, with only 2 pages: https://www.pornhub.com/album/4771891
+    public void testGetNextPage() throws IOException {
+        String baseURL = "https://www.pornhub.com/album/15680522";
+        PornhubRipper ripper = new PornhubRipper(new URL(baseURL));
+        Document page = Http.url(baseURL).get();
+        int numPagesRemaining = 4;
+        for (int idx = 0; idx < numPagesRemaining; idx++){
+            page = ripper.getNextPage(page);
+            assertEquals(baseURL + "?page=" + (idx + 2), page.location());
+        }
+        try {
+            page = ripper.getNextPage(page);
+            fail("Get next page did not throw an exception on the last page");
+        } catch(IOException e){
+            assertEquals(e.getMessage(), "No more pages");
+        }
+    }
 }
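The regex change to getGID in PornhubRipper.java is exactly what the updated testGetGID exercises: the old pattern was anchored immediately after the album ID, so a URL carrying a query string such as ?page=2 never matched. A quick standalone check (hypothetical class name, not part of the commit):

    import java.util.regex.Pattern;

    public class GidRegexCheck {
        public static void main(String[] args) {
            String url = "https://www.pornhub.com/album/15680522?page=2";
            // old pattern: must end right after the ID, so "?page=2" breaks the match
            System.out.println(Pattern.matches("^.*pornhub\\.com/album/([0-9]+)$", url));   // false
            // new pattern from this commit tolerates trailing text like a query string
            System.out.println(Pattern.matches("^.*pornhub\\.com/album/([0-9]+).*$", url)); // true
        }
    }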