Implement caching of the first page for every ripper

2025-09-02 10:23:47 +02:00 · 2022-08-03 01:49:03 +02:00
parent e581eeee29
commit 98f37208b1
17 changed files with 37 additions and 55 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
@@ -32,6 +32,7 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
    private final Map<URL, File> itemsPending = Collections.synchronizedMap(new HashMap<>());
    private final Map<URL, Path> itemsCompleted = Collections.synchronizedMap(new HashMap<>());
    private final Map<URL, String> itemsErrored = Collections.synchronizedMap(new HashMap<>());
    Document cachedFirstPage;
    protected AbstractHTMLRipper(URL url) throws IOException {
        super(url);
@@ -41,6 +42,14 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
    public abstract String getHost();
    protected abstract Document getFirstPage() throws IOException;
    /**
     * Returns the first page, calling {@link #getFirstPage()} only when no document
     * has been stored in {@code cachedFirstPage} yet.
     *
     * <p>The result of {@link #getFirstPage()} is kept in the {@code cachedFirstPage}
     * field and handed back on later calls. No synchronization is performed here.
     *
     * @return the document held in {@code cachedFirstPage}
     * @throws IOException if {@link #getFirstPage()} throws while filling the cache
     */
    protected Document getCachedFirstPage() throws IOException {
        // Fast path: a previous call already stored the page.
        if (cachedFirstPage != null) {
            return cachedFirstPage;
        }
        // First use: fetch once and remember the result for subsequent callers.
        cachedFirstPage = getFirstPage();
        return cachedFirstPage;
    }
    /**
     * Base implementation of the next-page lookup: ignores {@code doc} and always
     * returns {@code null}.
     *
     * <p>NOTE(review): presumably rippers that support pagination override this and a
     * {@code null} result means "no further page" — confirm against subclasses and callers.
     *
     * @param doc the current page; unused by this implementation
     * @return always {@code null} in this implementation
     * @throws IOException never thrown here; declared in the signature
     */
    public Document getNextPage(Document doc) throws IOException {
        return null;
    }
@@ -98,7 +107,7 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
        int textindex = 0;
        LOGGER.info("Retrieving " + this.url);
        sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());
-        Document doc = getFirstPage();
+        var doc = getCachedFirstPage();
        if (hasQueueSupport() && pageContainsAlbums(this.url)) {
            List<String> urls = getAlbumsToQueue(doc);
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/AerisdiesRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/AerisdiesRipper.java
@@ -49,7 +49,7 @@ public class AerisdiesRipper extends AbstractHTMLRipper {
    @Override
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
-            Element el = getFirstPage().select(".headtext").first();
+            Element el = getCachedFirstPage().select(".headtext").first();
            if (el == null) {
                throw new IOException("Unable to get album title");
            }
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/BatoRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/BatoRipper.java
@@ -73,7 +73,7 @@ public class BatoRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            return getHost() + "_" + getGID(url) + "_" + getFirstPage().select("title").first().text().replaceAll(" ", "_");
+            return getHost() + "_" + getGID(url) + "_" + getCachedFirstPage().select("title").first().text().replaceAll(" ", "_");
        } catch (IOException e) {
            // Fall back to default album naming convention
            LOGGER.info("Unable to find title at " + url);
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
@@ -104,7 +104,7 @@ public class ChanRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            Document doc = getFirstPage();
+            Document doc = getCachedFirstPage();
            try {
                String subject = doc.select(".post.op > .postinfo > .subject").first().text();
                return getHost() + "_" + getGID(url) + "_" + subject;
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/CheveretoRipper.java
@@ -53,7 +53,7 @@ public class CheveretoRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            Element titleElement = getFirstPage().select("meta[property=og:title]").first();
+            Element titleElement = getCachedFirstPage().select("meta[property=og:title]").first();
            String title = titleElement.attr("content");
            title = title.substring(title.lastIndexOf('/') + 1);
            return getHost() + "_" + title.trim();
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/EightmusesRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/EightmusesRipper.java
@@ -23,7 +23,6 @@ import com.rarchives.ripme.utils.Http;
 public class EightmusesRipper extends AbstractHTMLRipper {
    private Document albumDoc = null;
    private Map<String,String> cookies = new HashMap<>();
    // TODO put up a wiki page on using maps to store titles
    // the map for storing the title of each album when downloading sub albums
@@ -64,7 +63,7 @@ public class EightmusesRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            Element titleElement = getFirstPage().select("meta[name=description]").first();
+            Element titleElement = getCachedFirstPage().select("meta[name=description]").first();
            String title = titleElement.attr("content");
            title = title.replace("A huge collection of free porn comics for adults. Read", "");
            title = title.replace("online for free at 8muses.com", "");
@@ -78,12 +77,9 @@ public class EightmusesRipper extends AbstractHTMLRipper {
    @Override
    public Document getFirstPage() throws IOException {
-        if (albumDoc == null) {
+        Response resp = Http.url(url).response();
-            Response resp = Http.url(url).response();
+        cookies.putAll(resp.cookies());
-            cookies.putAll(resp.cookies());
+        return resp.parse();
            albumDoc = resp.parse();
        }
        return albumDoc;
    }
    @Override
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/EroShareRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/EroShareRipper.java
@@ -97,7 +97,7 @@ public class EroShareRipper extends AbstractHTMLRipper {
        if (!is_profile(url)) {
            try {
                // Attempt to use album title as GID
-                Element titleElement = getFirstPage().select("meta[property=og:title]").first();
+                Element titleElement = getCachedFirstPage().select("meta[property=og:title]").first();
                String title = titleElement.attr("content");
                title = title.substring(title.lastIndexOf('/') + 1);
                return getHost() + "_" + getGID(url) + "_" + title.trim();
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/EromeRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/EromeRipper.java
@@ -71,7 +71,7 @@ public class EromeRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            Element titleElement = getFirstPage().select("meta[property=og:title]").first();
+            Element titleElement = getCachedFirstPage().select("meta[property=og:title]").first();
            String title = titleElement.attr("content");
            title = title.substring(title.lastIndexOf('/') + 1);
            return getHost() + "_" + getGID(url) + "_" + title.trim();
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/FlickrRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/FlickrRipper.java
@@ -20,7 +20,6 @@ import org.jsoup.nodes.Element;
 public class FlickrRipper extends AbstractHTMLRipper {
    private Document albumDoc = null;
    private final DownloadThreadPool flickrThreadPool;
    private enum UrlType {
@@ -178,7 +177,7 @@ public class FlickrRipper extends AbstractHTMLRipper {
        }
        try {   
            // Attempt to use album title as GID
-            Document doc = getFirstPage();
+            Document doc = getCachedFirstPage();
            String user = url.toExternalForm();
            user = user.substring(user.indexOf("/photos/") + "/photos/".length());
            user = user.substring(0, user.indexOf("/"));
@@ -230,10 +229,7 @@ public class FlickrRipper extends AbstractHTMLRipper {
    @Override
    public Document getFirstPage() throws IOException {
-        if (albumDoc == null) {
+        return Http.url(url).get();
            albumDoc = Http.url(url).get();
        }
        return albumDoc;
    }
    @Override
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/GirlsOfDesireRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/GirlsOfDesireRipper.java
@@ -16,8 +16,6 @@ import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import com.rarchives.ripme.utils.Http;
 public class GirlsOfDesireRipper extends AbstractHTMLRipper {
    // Current HTML document
    private Document albumDoc = null;
    public GirlsOfDesireRipper(URL url) throws IOException {
        super(url);
@@ -35,7 +33,7 @@ public class GirlsOfDesireRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            Document doc = getFirstPage();
+            Document doc = getCachedFirstPage();
            Elements elems = doc.select(".albumName");
            return getHost() + "_" + elems.first().text();
        } catch (Exception e) {
@@ -64,10 +62,7 @@ public class GirlsOfDesireRipper extends AbstractHTMLRipper {
    @Override
    public Document getFirstPage() throws IOException {
-        if (albumDoc == null) {
+        return Http.url(url).get();
            albumDoc = Http.url(url).get();
        }
        return albumDoc;
    }
    @Override
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/HbrowseRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/HbrowseRipper.java
@@ -51,7 +51,7 @@ public class HbrowseRipper extends AbstractHTMLRipper {
        @Override
        public String getAlbumTitle(URL url) throws MalformedURLException {
            try {
-                Document doc = getFirstPage();
+                Document doc = getCachedFirstPage();
                String title = doc.select("div[id=main] > table.listTable > tbody > tr > td.listLong").first().text();
                return getHost() + "_" + title + "_" + getGID(url);
            } catch (Exception e) {
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoxRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoxRipper.java
@@ -61,7 +61,7 @@ public class HentaifoxRipper extends AbstractHTMLRipper {
    @Override
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
-            Document doc = getFirstPage();
+            Document doc = getCachedFirstPage();
            String title = doc.select("div.info > h1").first().text();
            return getHost() + "_" + title + "_" + getGID(url);
        } catch (Exception e) {
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImagebamRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImagebamRipper.java
@@ -19,9 +19,6 @@ import org.jsoup.select.Elements;
 public class ImagebamRipper extends AbstractHTMLRipper {
    // Current HTML document
    private Document albumDoc = null;
    // Thread pool for finding direct image links from "image" pages (html)
    private DownloadThreadPool imagebamThreadPool = new DownloadThreadPool("imagebam");
    @Override
@@ -61,10 +58,7 @@ public class ImagebamRipper extends AbstractHTMLRipper {
    @Override
    public Document getFirstPage() throws IOException {
-        if (albumDoc == null) {
+        return Http.url(url).get();
            albumDoc = Http.url(url).get();
        }
        return albumDoc;
    }
    @Override
@@ -99,7 +93,7 @@ public class ImagebamRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            Elements elems = getFirstPage().select("[id=gallery-name]");
+            Elements elems = getCachedFirstPage().select("[id=gallery-name]");
            String title = elems.first().text();
            LOGGER.info("Title text: '" + title + "'");
            if (StringUtils.isNotBlank(title)) {
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java
@@ -17,7 +17,6 @@ import com.rarchives.ripme.utils.Http;
 public class ImagefapRipper extends AbstractHTMLRipper {
    private Document albumDoc = null;
    private boolean isNewAlbumType = false;
    private int callsMade = 0;
@@ -109,10 +108,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
    @Override
    public Document getFirstPage() throws IOException {
-        if (albumDoc == null) {
+        return getPageWithRetries(url);
            albumDoc = getPageWithRetries(url);
        }
        return albumDoc;
    }
    @Override
@@ -162,7 +158,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            String title = getFirstPage().title();
+            String title = getCachedFirstPage().title();
            title = title.replace("Porn Pics & Porn GIFs", "");
            title = title.replace(" ", "_");
            String toReturn = getHost() + "_" + title + "_" + getGID(url);
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ViewcomicRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ViewcomicRipper.java
@@ -34,7 +34,7 @@ public class ViewcomicRipper extends AbstractHTMLRipper {
        public String getAlbumTitle(URL url) throws MalformedURLException {
            try {
                // Attempt to use album title as GID
-                String titleText = getFirstPage().select("title").first().text();
+                String titleText = getCachedFirstPage().select("title").first().text();
                String title = titleText.replace("Viewcomic reading comics online for free", "");
                title = title.replace("_", "");
                title = title.replace("|", "");
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/XhamsterRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/XhamsterRipper.java
@@ -220,7 +220,7 @@ public class XhamsterRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title and username as GID
-            Document doc = getFirstPage();
+            Document doc = getCachedFirstPage();
            Element user = doc.select("a.author").first();
            String username = user.text();
            String path = url.getPath();
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ZizkiRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ZizkiRipper.java
@@ -19,7 +19,6 @@ import com.rarchives.ripme.utils.Http;
 public class ZizkiRipper extends AbstractHTMLRipper {
    private Document albumDoc = null;
    private Map<String,String> cookies = new HashMap<>();
    public ZizkiRipper(URL url) throws IOException {
@@ -49,10 +48,10 @@ public class ZizkiRipper extends AbstractHTMLRipper {
    public String getAlbumTitle(URL url) throws MalformedURLException {
        try {
            // Attempt to use album title as GID
-            Element titleElement = getFirstPage().select("h1.title").first();
+            Element titleElement = getCachedFirstPage().select("h1.title").first();
            String title = titleElement.text();
-            Element authorSpan = getFirstPage().select("span[class=creator]").first();
+            Element authorSpan = getCachedFirstPage().select("span[class=creator]").first();
            String author = authorSpan.select("a").first().text();
            LOGGER.debug("Author: " + author);
            return getHost() + "_" + author + "_" + title.trim();
@@ -65,12 +64,9 @@ public class ZizkiRipper extends AbstractHTMLRipper {
    @Override
    public Document getFirstPage() throws IOException {
-        if (albumDoc == null) {
+        Response resp = Http.url(url).response();
-            Response resp = Http.url(url).response();
+        cookies.putAll(resp.cookies());
-            cookies.putAll(resp.cookies());
+        return resp.parse();
            albumDoc = resp.parse();
        }
        return albumDoc;
    }
    @Override