diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
index e9c6f242..40279b4d 100644
--- a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
+++ b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
@@ -154,11 +154,16 @@ class DownloadFileThread implements Runnable {
                 throw new IOException("Redirect status code " + statusCode + " - redirect to " + location);
             }
             if (statusCode / 100 == 4) { // 4xx errors
-                logger.error("[!] " + Utils.getLocalizedString("nonretriable.status.code") + " " + statusCode
-                        + " while downloading from " + url);
-                observer.downloadErrored(url, Utils.getLocalizedString("nonretriable.status.code") + " "
-                        + statusCode + " while downloading " + url.toExternalForm());
-                return; // Not retriable, drop out.
+                if (statusCode == 429) {
+                    // 429 Too Many Requests is retriable; throwing lets the retry logic back off and try again.
+                    throw new IOException(Utils.getLocalizedString("retriable.status.code") + " " + statusCode);
+                } else {
+                    logger.error("[!] " + Utils.getLocalizedString("nonretriable.status.code") + " " + statusCode
+                            + " while downloading from " + url);
+                    observer.downloadErrored(url, Utils.getLocalizedString("nonretriable.status.code") + " "
+                            + statusCode + " while downloading " + url.toExternalForm());
+                    return; // Not retriable, drop out.
+                }
             }
             if (statusCode / 100 == 5) { // 5xx errors
                 observer.downloadErrored(url, Utils.getLocalizedString("retriable.status.code") + " " + statusCode
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
index c985f161..31cedc06 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
@@ -2,6 +2,8 @@ package com.rarchives.ripme.ripper.rippers;
 
 import com.rarchives.ripme.ripper.AbstractHTMLRipper;
 import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
+import com.rarchives.ripme.ui.RipStatusMessage;
+import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;
 import com.rarchives.ripme.utils.RipUtils;
 import java.io.IOException;
@@ -19,6 +21,22 @@ import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 
 public class ChanRipper extends AbstractHTMLRipper {
+
+    private int callsMade = 0;
+    private long startTime = System.nanoTime();
+
+    private static final int RETRY_LIMIT = 10;
+    private static final int HTTP_RETRY_LIMIT = 3;
+    private static final int RATE_LIMIT_HOUR = 1000;
+
+    // All sleep times are in milliseconds.
+    private static final int PAGE_SLEEP_TIME = 60 * 60 * 1000 / RATE_LIMIT_HOUR;
+    private static final int IMAGE_SLEEP_TIME = 60 * 60 * 1000 / RATE_LIMIT_HOUR;
+    // Timeout when blocked = 1 hour. Retry at each interval within the hour mark, plus one more time after the hour mark.
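+    // For reference, the arithmetic: 60 / (RETRY_LIMIT - 1) * 60 * 1000 = 60 / 9 * 60 * 1000
+    // = 400000 ms, i.e. one ~6.7-minute wait between retries, so nine waits span the full hour.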
+    private static final int IP_BLOCK_SLEEP_TIME = (int) Math.round((double) 60 / (RETRY_LIMIT - 1) * 60 * 1000);
+
     private static List<ChanSite> bakedin_explicit_domains = Arrays.asList(
             new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
             new ChanSite("boards.4channel.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
@@ -196,9 +214,41 @@
         return this.url.getHost();
     }
 
-    public Document getFirstPage() throws IOException, URISyntaxException {
-        return super.getFirstPage();
+    @Override
+    public Document getFirstPage() throws IOException {
+
+        sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, "Loading first page...");
+
+        Document firstPage = getPageWithRetries(url);
+
+        return firstPage;
     }
+
+    @Override
+    public Document getNextPage(Document doc) throws IOException, URISyntaxException {
+        String nextURL = null;
+        for (Element a : doc.select("a.link3")) {
+            if (a.text().contains("next")) {
+                nextURL = this.sanitizeURL(this.url) + a.attr("href");
+                break;
+            }
+        }
+        if (nextURL == null) {
+            throw new IOException("No next page found");
+        }
+        // Sleep before fetching the next page.
+        sleep(PAGE_SLEEP_TIME);
+
+        sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, "Loading next page URL: " + nextURL);
+        LOGGER.info("Attempting to load next page URL: " + nextURL);
+
+        // Load the next page.
+        Document nextPage = getPageWithRetries(new URI(nextURL).toURL());
+
+        return nextPage;
+    }
+
+
     private boolean isURLBlacklisted(String url) {
         for (String blacklist_item : url_piece_blacklist) {
             if (url.contains(blacklist_item)) {
@@ -277,4 +327,101 @@
     public void downloadURL(URL url, int index) {
         addURLToDownload(url, getPrefix(index));
     }
+
+    /**
+     * Attempts to fetch the page, checks for an IP ban, and waits before retrying.
+     * @param url The page URL to fetch
+     * @return Page document
+     * @throws IOException If page loading errors, or if retries are exhausted
+     */
+    private Document getPageWithRetries(URL url) throws IOException {
+        Document doc = null;
+        int retries = RETRY_LIMIT;
+        while (true) {
+
+            sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, url.toExternalForm());
+
+            // For debugging the rate limit checker. Useful to track whether the timeout should be altered.
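+            // Note: callsMade counts every page fetch made by this ripper instance and is never
+            // reset, so checkRateLimit()'s remaining-call estimates only cover the first hour;
+            // past the hour mark no rate-limit debug line is logged at all.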
+            callsMade++;
+            checkRateLimit();
+
+            LOGGER.info("Retrieving " + url);
+
+            boolean httpCallThrottled = false;
+            int httpAttempts = 0;
+
+            // We attempt the HTTP call, knowing it can fail for network reasons.
+            while (true) {
+                httpAttempts++;
+                try {
+                    doc = Http.url(url).get();
+                } catch (IOException e) {
+
+                    LOGGER.info("Retrieving " + url + " error: " + e.getMessage());
+
+                    if (e.getMessage() != null && e.getMessage().contains("404"))
+                        throw new IOException("Gallery/Page not found!");
+
+                    if (httpAttempts < HTTP_RETRY_LIMIT) {
+                        sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, "HTTP call failed: " + e.getMessage() + " retrying " + httpAttempts + " / " + HTTP_RETRY_LIMIT);
+
+                        // Sleep for a few seconds before retrying.
+                        sleep(PAGE_SLEEP_TIME);
+                        continue;
+                    } else {
+                        sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, "HTTP call failed too many times: " + e.getMessage() + ", treating this as a throttle");
+                        httpCallThrottled = true;
+                    }
+                }
+                // No errors, we exit.
+                break;
+            }
+
+            if (httpCallThrottled || (doc != null && doc.toString().contains("Your IP made too many requests to our servers and we need to check that you are a real human being"))) {
+                if (retries == 0) {
+                    throw new IOException("Hit rate limit and maximum number of retries, giving up");
+                }
+                String message = "Probably hit rate limit while loading " + url + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining";
+                LOGGER.warn(message);
+                sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, message);
+                retries--;
+                try {
+                    Thread.sleep(IP_BLOCK_SLEEP_TIME);
+                } catch (InterruptedException e) {
+                    throw new IOException("Interrupted while waiting for rate limit to subside");
+                }
+            } else {
+                return doc;
+            }
+        }
+    }
+
+    /**
+     * Used for debugging the rate limit issue.
+     * This is meant to help stay under the limit threshold and avoid hitting the rate limit altogether.
+     * @return Duration in milliseconds since the ripper started
+     */
+    private long checkRateLimit() {
+        long endTime = System.nanoTime();
+        long duration = (endTime - startTime) / 1000000;
+
+        int rateLimitMinute = 100;
+        int rateLimitFiveMinutes = 200;
+        int rateLimitHour = RATE_LIMIT_HOUR; // Request allowed every 3.6 seconds.
+
+        if (duration / 1000 < 60) {
+            LOGGER.debug("Rate limit: " + (rateLimitMinute - callsMade) + " calls remaining for first minute mark.");
+        } else if (duration / 1000 < 300) {
+            LOGGER.debug("Rate limit: " + (rateLimitFiveMinutes - callsMade) + " calls remaining for first 5 minute mark.");
+        } else if (duration / 1000 < 3600) {
+            LOGGER.debug("Rate limit: " + (rateLimitHour - callsMade) + " calls remaining for first hour mark.");
+        }
+
+        return duration;
+    }
+
+
 }
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
index 553515b9..8dff9a4e 100644
--- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ChanRipperTest.java
@@ -25,6 +25,8 @@ public class ChanRipperTest extends RippersTest {
     public void testChanURLPasses() throws IOException, URISyntaxException {
         List<URL> passURLs = new ArrayList<>();
         // URLs that should work
+
+        passURLs.add(new URI("https://boards.4chan.org/gif/thread/28319534").toURL());
         passURLs.add(new URI("https://boards.4chan.org/g/thread/103742599").toURL());
         passURLs.add(new URI("https://rbt.asia/g/thread/70643087/").toURL()); // Must work with TLDs of length 4
         for (URL url : passURLs) {