make chan retry

2025-08-25 23:06:22 +02:00 · 2025-01-04 08:32:29 +01:00
parent 461e717a69
commit a1726db88a
3 changed files with 155 additions and 7 deletions
--- a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
+++ b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
@@ -154,11 +154,15 @@ class DownloadFileThread implements Runnable {
                    throw new IOException("Redirect status code " + statusCode + " - redirect to " + location);
                }
                if (statusCode / 100 == 4) { // 4xx errors
-                    logger.error("[!] " + Utils.getLocalizedString("nonretriable.status.code") + " " + statusCode
-                            + " while downloading from " + url);
-                    observer.downloadErrored(url, Utils.getLocalizedString("nonretriable.status.code") + " "
-                            + statusCode + " while downloading " + url.toExternalForm());
-                    return; // Not retriable, drop out.
+                    if (statusCode == 429) {
+                        throw new IOException(Utils.getLocalizedString("retriable.status.code") + " " + statusCode);
+                    } else {
+                        logger.error("[!] " + Utils.getLocalizedString("nonretriable.status.code") + " " + statusCode
+                                + " while downloading from " + url);
+                        observer.downloadErrored(url, Utils.getLocalizedString("nonretriable.status.code") + " "
+                                + statusCode + " while downloading " + url.toExternalForm());
+                        return; // Not retriable, drop out.
+                    }
                }
                if (statusCode / 100 == 5) { // 5xx errors
                    observer.downloadErrored(url, Utils.getLocalizedString("retriable.status.code") + " " + statusCode
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
@@ -2,6 +2,8 @@ package com.rarchives.ripme.ripper.rippers;

 import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
+import com.rarchives.ripme.ui.RipStatusMessage;
+import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;
 import com.rarchives.ripme.utils.RipUtils;
 import java.io.IOException;
@@ -19,6 +21,20 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;

 public class ChanRipper extends AbstractHTMLRipper {
+
+    private int callsMade = 0; // page requests issued so far; read by checkRateLimit() for its debug log
+    private long startTime = System.nanoTime(); // reference point for the elapsed time in checkRateLimit()
+
+    private static final int RETRY_LIMIT = 10; // rate-limit waits allowed per page before giving up
+    private static final int HTTP_RETRY_LIMIT = 3; // HTTP attempts per round before treating it as a throttle
+    private static final int RATE_LIMIT_HOUR = 1000; // hourly request budget the sleep times are derived from
+
+    // All sleep times are in milliseconds
+    private static final int PAGE_SLEEP_TIME = 60 * 60 * 1000 / RATE_LIMIT_HOUR;
+    private static final int IMAGE_SLEEP_TIME = 60 * 60 * 1000 / RATE_LIMIT_HOUR;
+    // A block is assumed to last 1 hour: space RETRY_LIMIT - 1 retries across that hour, plus one retry after it.
+    private static final int IP_BLOCK_SLEEP_TIME = (int) Math.round((double) 60 / (RETRY_LIMIT - 1) * 60 * 1000);
+
    private static List<ChanSite> bakedin_explicit_domains = Arrays.asList(
            new ChanSite("boards.4chan.org",   Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
            new ChanSite("boards.4channel.org",   Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
@@ -196,9 +212,41 @@ public class ChanRipper extends AbstractHTMLRipper {
        return this.url.getHost();
    }

-    public Document getFirstPage() throws IOException, URISyntaxException {
-        return super.getFirstPage();
+    /** Loads the first page of the thread through getPageWithRetries(URL). */
+    @Override
+    public Document getFirstPage() throws IOException {
+        // Send the status before the fetch: it can block for a long time on retries and rate-limit sleeps.
+        sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, "Loading first page...");
+
+        Document firstPage = getPageWithRetries(url);
+        return firstPage;
    }
+
+    /**
+     * Follows the first {@code a.link3} anchor whose text contains "next" and loads that page.
+     * @param doc Page currently being ripped
+     * @return Document of the next page
+     * @throws IOException If no such anchor exists, or if loading the page fails
+     */
+    @Override
+    public Document getNextPage(Document doc) throws IOException, URISyntaxException {
+        String target = null;
+        for (Element anchor : doc.select("a.link3")) {
+            if (anchor.text().contains("next")) {
+                target = this.sanitizeURL(this.url) + anchor.attr("href");
+                break;
+            }
+        }
+        if (target == null) {
+            throw new IOException("No next page found");
+        }
+        sleep(PAGE_SLEEP_TIME); // pace page requests before contacting the server again
+        sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, "Loading next page URL: " + target);
+        LOGGER.info("Attempting to load next page URL: " + target);
+        return getPageWithRetries(new URI(target).toURL());
+    }
+
+
    private boolean isURLBlacklisted(String url) {
        for (String blacklist_item : url_piece_blacklist) {
            if (url.contains(blacklist_item)) {
@@ -277,4 +325,98 @@ public class ChanRipper extends AbstractHTMLRipper {
    public void downloadURL(URL url, int index) {
        addURLToDownload(url, getPrefix(index));
    }
+
+    /**
+     * Loads a page, retrying failed HTTP calls and waiting out a suspected rate limit / IP block.
+     * Each round makes up to HTTP_RETRY_LIMIT attempts; a round that fails every attempt, or returns
+     * the "too many requests" page, sleeps IP_BLOCK_SLEEP_TIME ms and is repeated up to RETRY_LIMIT times.
+     * @param url Page to load
+     * @return Page document
+     * @throws IOException If the error mentions 404, the wait is interrupted, or retries are exhausted
+     */
+    private Document getPageWithRetries(URL url) throws IOException {
+        Document doc = null;
+        int retries = RETRY_LIMIT;
+        while (true) {
+            sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, url.toExternalForm());
+            // For debugging rate limit checker. Useful to track whether the timeout should be altered or not.
+            callsMade++;
+            checkRateLimit();
+
+            LOGGER.info("Retrieving " + url);
+
+            boolean httpCallThrottled = false;
+            int httpAttempts = 0;
+
+            // we attempt the http call, knowing it can fail for network reasons
+            while (true) {
+                httpAttempts++;
+                try {
+                    doc = Http.url(url).get();
+                } catch (IOException e) {
+                    // getMessage() may be null; guard it so the 404 check cannot throw a NullPointerException.
+                    String reason = e.getMessage();
+                    LOGGER.info("Retrieving " + url + " error: " + reason);
+                    if (reason != null && reason.contains("404")) {
+                        throw new IOException("Gallery/Page not found!", e);
+                    }
+                    if (httpAttempts < HTTP_RETRY_LIMIT) {
+                        sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, "HTTP call failed: " + reason + " retrying " + httpAttempts + " / " + HTTP_RETRY_LIMIT);
+                        // short pause before the next attempt
+                        sleep(PAGE_SLEEP_TIME);
+                        continue;
+                    } else {
+                        sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, "HTTP call failed too many times: " + reason + " treating this as a throttle");
+                        httpCallThrottled = true;
+                    }
+                }
+                // success, or attempts exhausted: leave the inner loop
+                break;
+            }
+
+            if (httpCallThrottled || (doc != null && doc.toString().contains("Your IP made too many requests to our servers and we need to check that you are a real human being"))) {
+                if (retries == 0) {
+                    throw new IOException("Hit rate limit and maximum number of retries, giving up");
+                }
+                String message = "Probably hit rate limit while loading " + url + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining";
+                LOGGER.warn(message);
+                sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, message);
+                retries--;
+                try {
+                    Thread.sleep(IP_BLOCK_SLEEP_TIME);
+                } catch (InterruptedException e) {
+                    Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+                    throw new IOException("Interrupted while waiting for rate limit to subside", e);
+                }
+            } else {
+                return doc;
+            }
+        }
+    }
+
+    /**
+     * Debug aid for staying under the rate limit: logs how many calls remain in the current window.
+     * @return Milliseconds elapsed since startTime was taken
+     */
+    private long checkRateLimit() {
+        long elapsedMillis = (System.nanoTime() - startTime) / 1000000;
+        long elapsedSeconds = elapsedMillis / 1000;
+        if (elapsedSeconds >= 3600) {
+            return elapsedMillis; // nothing is logged once the first hour has passed
+        }
+        // Default to the hourly window (one request every 3.6 seconds), then narrow it down.
+        int allowance = RATE_LIMIT_HOUR;
+        String suffix = " calls remaining for first hour mark.";
+        if (elapsedSeconds < 60) {
+            allowance = 100;
+            suffix = " calls remaining for first minute mark.";
+        } else if (elapsedSeconds < 300) {
+            allowance = 200;
+            suffix = " calls remaining for first 5 minute mark.";
+        }
+        LOGGER.debug("Rate limit: " + (allowance - callsMade) + suffix);
+        return elapsedMillis;
+    }
+
+
 }
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
@@ -25,6 +25,8 @@ public class ChanRipperTest extends RippersTest {
    public void testChanURLPasses() throws IOException, URISyntaxException {
        List<URL> passURLs = new ArrayList<>();
        // URLs that should work
+
+        passURLs.add(new URI("https://boards.4chan.org/gif/thread/28319534").toURL());
        passURLs.add(new URI("https://boards.4chan.org/g/thread/103742599").toURL());
        passURLs.add(new URI("https://rbt.asia/g/thread/70643087/").toURL()); //must work with TLDs with len of 4
        for (URL url : passURLs) {