mirror of
https://github.com/RipMeApp/ripme.git
synced 2025-08-26 15:24:51 +02:00
make chan retry
This commit is contained in:
@@ -154,11 +154,15 @@ class DownloadFileThread implements Runnable {
|
|||||||
throw new IOException("Redirect status code " + statusCode + " - redirect to " + location);
|
throw new IOException("Redirect status code " + statusCode + " - redirect to " + location);
|
||||||
}
|
}
|
||||||
if (statusCode / 100 == 4) { // 4xx errors
|
if (statusCode / 100 == 4) { // 4xx errors
|
||||||
logger.error("[!] " + Utils.getLocalizedString("nonretriable.status.code") + " " + statusCode
|
if (statusCode == 429) {
|
||||||
+ " while downloading from " + url);
|
throw new IOException(Utils.getLocalizedString("retriable.status.code") + " " + statusCode);
|
||||||
observer.downloadErrored(url, Utils.getLocalizedString("nonretriable.status.code") + " "
|
} else {
|
||||||
+ statusCode + " while downloading " + url.toExternalForm());
|
logger.error("[!] " + Utils.getLocalizedString("nonretriable.status.code") + " " + statusCode
|
||||||
return; // Not retriable, drop out.
|
+ " while downloading from " + url);
|
||||||
|
observer.downloadErrored(url, Utils.getLocalizedString("nonretriable.status.code") + " "
|
||||||
|
+ statusCode + " while downloading " + url.toExternalForm());
|
||||||
|
return; // Not retriable, drop out.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (statusCode / 100 == 5) { // 5xx errors
|
if (statusCode / 100 == 5) { // 5xx errors
|
||||||
observer.downloadErrored(url, Utils.getLocalizedString("retriable.status.code") + " " + statusCode
|
observer.downloadErrored(url, Utils.getLocalizedString("retriable.status.code") + " " + statusCode
|
||||||
|
@@ -2,6 +2,8 @@ package com.rarchives.ripme.ripper.rippers;
|
|||||||
|
|
||||||
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
|
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
|
||||||
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
|
import com.rarchives.ripme.ripper.rippers.ripperhelpers.ChanSite;
|
||||||
|
import com.rarchives.ripme.ui.RipStatusMessage;
|
||||||
|
import com.rarchives.ripme.utils.Http;
|
||||||
import com.rarchives.ripme.utils.Utils;
|
import com.rarchives.ripme.utils.Utils;
|
||||||
import com.rarchives.ripme.utils.RipUtils;
|
import com.rarchives.ripme.utils.RipUtils;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@@ -19,6 +21,20 @@ import org.jsoup.nodes.Document;
|
|||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
|
||||||
public class ChanRipper extends AbstractHTMLRipper {
|
public class ChanRipper extends AbstractHTMLRipper {
|
||||||
|
|
||||||
|
private int callsMade = 0;
|
||||||
|
private long startTime = System.nanoTime();
|
||||||
|
|
||||||
|
private static final int RETRY_LIMIT = 10;
|
||||||
|
private static final int HTTP_RETRY_LIMIT = 3;
|
||||||
|
private static final int RATE_LIMIT_HOUR = 1000;
|
||||||
|
|
||||||
|
// All sleep times are in milliseconds
|
||||||
|
private static final int PAGE_SLEEP_TIME = 60 * 60 * 1000 / RATE_LIMIT_HOUR;
|
||||||
|
private static final int IMAGE_SLEEP_TIME = 60 * 60 * 1000 / RATE_LIMIT_HOUR;
|
||||||
|
// Timeout when blocked = 1 hours. Retry every retry within the hour mark + 1 time after the hour mark.
|
||||||
|
private static final int IP_BLOCK_SLEEP_TIME = (int) Math.round((double) 60 / (RETRY_LIMIT - 1) * 60 * 1000);
|
||||||
|
|
||||||
private static List<ChanSite> bakedin_explicit_domains = Arrays.asList(
|
private static List<ChanSite> bakedin_explicit_domains = Arrays.asList(
|
||||||
new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
|
new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
|
||||||
new ChanSite("boards.4channel.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
|
new ChanSite("boards.4channel.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
|
||||||
@@ -196,9 +212,41 @@ public class ChanRipper extends AbstractHTMLRipper {
|
|||||||
return this.url.getHost();
|
return this.url.getHost();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Document getFirstPage() throws IOException, URISyntaxException {
|
@Override
|
||||||
return super.getFirstPage();
|
public Document getFirstPage() throws IOException {
|
||||||
|
|
||||||
|
Document firstPage = getPageWithRetries(url);
|
||||||
|
|
||||||
|
sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, "Loading first page...");
|
||||||
|
|
||||||
|
return firstPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Document getNextPage(Document doc) throws IOException, URISyntaxException {
|
||||||
|
String nextURL = null;
|
||||||
|
for (Element a : doc.select("a.link3")) {
|
||||||
|
if (a.text().contains("next")) {
|
||||||
|
nextURL = this.sanitizeURL(this.url) + a.attr("href");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (nextURL == null) {
|
||||||
|
throw new IOException("No next page found");
|
||||||
|
}
|
||||||
|
// Sleep before fetching next page.
|
||||||
|
sleep(PAGE_SLEEP_TIME);
|
||||||
|
|
||||||
|
sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, "Loading next page URL: " + nextURL);
|
||||||
|
LOGGER.info("Attempting to load next page URL: " + nextURL);
|
||||||
|
|
||||||
|
// Load next page
|
||||||
|
Document nextPage = getPageWithRetries(new URI(nextURL).toURL());
|
||||||
|
|
||||||
|
return nextPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean isURLBlacklisted(String url) {
|
private boolean isURLBlacklisted(String url) {
|
||||||
for (String blacklist_item : url_piece_blacklist) {
|
for (String blacklist_item : url_piece_blacklist) {
|
||||||
if (url.contains(blacklist_item)) {
|
if (url.contains(blacklist_item)) {
|
||||||
@@ -277,4 +325,98 @@ public class ChanRipper extends AbstractHTMLRipper {
|
|||||||
public void downloadURL(URL url, int index) {
|
public void downloadURL(URL url, int index) {
|
||||||
addURLToDownload(url, getPrefix(index));
|
addURLToDownload(url, getPrefix(index));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Attempts to get page, checks for IP ban, waits.
|
||||||
|
* @param url
|
||||||
|
* @return Page document
|
||||||
|
* @throws IOException If page loading errors, or if retries are exhausted
|
||||||
|
*/
|
||||||
|
private Document getPageWithRetries(URL url) throws IOException {
|
||||||
|
Document doc = null;
|
||||||
|
int retries = RETRY_LIMIT;
|
||||||
|
while (true) {
|
||||||
|
|
||||||
|
sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, url.toExternalForm());
|
||||||
|
|
||||||
|
// For debugging rate limit checker. Useful to track wheter the timeout should be altered or not.
|
||||||
|
callsMade++;
|
||||||
|
checkRateLimit();
|
||||||
|
|
||||||
|
LOGGER.info("Retrieving " + url);
|
||||||
|
|
||||||
|
boolean httpCallThrottled = false;
|
||||||
|
int httpAttempts = 0;
|
||||||
|
|
||||||
|
// we attempt the http call, knowing it can fail for network reasons
|
||||||
|
while(true) {
|
||||||
|
httpAttempts++;
|
||||||
|
try {
|
||||||
|
doc = Http.url(url).get();
|
||||||
|
} catch(IOException e) {
|
||||||
|
|
||||||
|
LOGGER.info("Retrieving " + url + " error: " + e.getMessage());
|
||||||
|
|
||||||
|
if(e.getMessage().contains("404"))
|
||||||
|
throw new IOException("Gallery/Page not found!");
|
||||||
|
|
||||||
|
if(httpAttempts < HTTP_RETRY_LIMIT) {
|
||||||
|
sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, "HTTP call failed: " + e.getMessage() + " retrying " + httpAttempts + " / " + HTTP_RETRY_LIMIT);
|
||||||
|
|
||||||
|
// we sleep for a few seconds
|
||||||
|
sleep(PAGE_SLEEP_TIME);
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, "HTTP call failed too many times: " + e.getMessage() + " treating this as a throttle");
|
||||||
|
httpCallThrottled = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// no errors, we exit
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (httpCallThrottled || (doc != null && doc.toString().contains("Your IP made too many requests to our servers and we need to check that you are a real human being"))) {
|
||||||
|
if (retries == 0) {
|
||||||
|
throw new IOException("Hit rate limit and maximum number of retries, giving up");
|
||||||
|
}
|
||||||
|
String message = "Probably hit rate limit while loading " + url + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining";
|
||||||
|
LOGGER.warn(message);
|
||||||
|
sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, message);
|
||||||
|
retries--;
|
||||||
|
try {
|
||||||
|
Thread.sleep(IP_BLOCK_SLEEP_TIME);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new IOException("Interrupted while waiting for rate limit to subside");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used for debugging the rate limit issue.
|
||||||
|
* This in order to prevent hitting the rate limit altoghether by remaining under the limit threshold.
|
||||||
|
* @return Long duration
|
||||||
|
*/
|
||||||
|
private long checkRateLimit() {
|
||||||
|
long endTime = System.nanoTime();
|
||||||
|
long duration = (endTime - startTime) / 1000000;
|
||||||
|
|
||||||
|
int rateLimitMinute = 100;
|
||||||
|
int rateLimitFiveMinutes = 200;
|
||||||
|
int rateLimitHour = RATE_LIMIT_HOUR; // Request allowed every 3.6 seconds.
|
||||||
|
|
||||||
|
if(duration / 1000 < 60){
|
||||||
|
LOGGER.debug("Rate limit: " + (rateLimitMinute - callsMade) + " calls remaining for first minute mark.");
|
||||||
|
} else if(duration / 1000 < 300){
|
||||||
|
LOGGER.debug("Rate limit: " + (rateLimitFiveMinutes - callsMade) + " calls remaining for first 5 minute mark.");
|
||||||
|
} else if(duration / 1000 < 3600){
|
||||||
|
LOGGER.debug("Rate limit: " + (RATE_LIMIT_HOUR - callsMade) + " calls remaining for first hour mark.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return duration;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -25,6 +25,8 @@ public class ChanRipperTest extends RippersTest {
|
|||||||
public void testChanURLPasses() throws IOException, URISyntaxException {
|
public void testChanURLPasses() throws IOException, URISyntaxException {
|
||||||
List<URL> passURLs = new ArrayList<>();
|
List<URL> passURLs = new ArrayList<>();
|
||||||
// URLs that should work
|
// URLs that should work
|
||||||
|
|
||||||
|
passURLs.add(new URI("https://boards.4chan.org/gif/thread/28319534").toURL());
|
||||||
passURLs.add(new URI("https://boards.4chan.org/g/thread/103742599").toURL());
|
passURLs.add(new URI("https://boards.4chan.org/g/thread/103742599").toURL());
|
||||||
passURLs.add(new URI("https://rbt.asia/g/thread/70643087/").toURL()); //must work with TLDs with len of 4
|
passURLs.add(new URI("https://rbt.asia/g/thread/70643087/").toURL()); //must work with TLDs with len of 4
|
||||||
for (URL url : passURLs) {
|
for (URL url : passURLs) {
|
||||||
|
Reference in New Issue
Block a user