Mirror of https://github.com/RipMeApp/ripme.git (synced 2025-08-10 16:04:19 +02:00)
* Additional logging in AbstractHTMLRipper
* Fixed ImagefapRipper.java to handle the new URL schema and a possible IOError
AbstractHTMLRipper.java:

@@ -124,10 +124,16 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
         }
 
         List<String> doclocation = new ArrayList<>();
 
+        LOGGER.info("Got doc location " + doc.location());
+
         while (doc != null) {
+
+            LOGGER.info("Processing a doc...");
+
             // catch if we saw a doc location already, save the ones seen in a list
             if (doclocation.contains(doc.location())) {
+                LOGGER.info("Already processed location " + doc.location() + " breaking");
                 break;
             }
             doclocation.add(doc.location());
@@ -136,6 +142,9 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
                 sendUpdate(STATUS.DOWNLOAD_COMPLETE_HISTORY, "Already seen the last " + alreadyDownloadedUrls + " images ending rip");
                 break;
             }
+
+            LOGGER.info("retrieving urls from doc");
+
             List<String> imageURLs = getURLsFromPage(doc);
             // If hasASAPRipping() returns true then the ripper will handle downloading the files
             // if not it's done in the following block of code
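Aside: the pagination guard the first hunk adds logging around works by remembering every doc.location() it has visited and breaking out of the loop on a repeat, which stops rippers whose "next page" link loops back on itself. A minimal, self-contained sketch of that idea (the page list here is hypothetical, standing in for successive getNextPage() results):

    import java.util.ArrayList;
    import java.util.List;

    public class SeenLocationGuard {

        // Visit pages in order, but stop as soon as a location repeats,
        // mirroring the doclocation check in AbstractHTMLRipper.rip().
        static List<String> visit(List<String> pages) {
            List<String> doclocation = new ArrayList<>();
            for (String location : pages) {
                if (doclocation.contains(location)) {
                    break; // already processed this location
                }
                doclocation.add(location);
            }
            return doclocation;
        }

        public static void main(String[] args) {
            // A pager that loops back to page1 after page3:
            List<String> pages = List.of("page1", "page2", "page3", "page1", "page2");
            System.out.println(visit(pages)); // prints [page1, page2, page3]
        }
    }
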
ImagefapRipper.java:

@@ -17,12 +17,11 @@ import com.rarchives.ripme.utils.Http;
 
 public class ImagefapRipper extends AbstractHTMLRipper {
 
-    private boolean isNewAlbumType = false;
-
     private int callsMade = 0;
     private long startTime = System.nanoTime();
 
     private static final int RETRY_LIMIT = 10;
+    private static final int HTTP_RETRY_LIMIT = 3;
     private static final int RATE_LIMIT_HOUR = 1000;
 
     // All sleep times are in milliseconds
@@ -50,11 +49,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
     @Override
     public URL sanitizeURL(URL url) throws MalformedURLException {
         String gid = getGID(url);
-        String newURL = "https://www.imagefap.com/gallery.php?";
-        if (isNewAlbumType) {
-            newURL += "p";
-        }
-        newURL += "gid=" + gid + "&view=2";
+        String newURL = "https://www.imagefap.com/pictures/" + gid + "/random-string";
         LOGGER.debug("Changed URL from " + url + " to " + newURL);
         return new URL(newURL);
     }
@@ -63,39 +58,29 @@ public class ImagefapRipper extends AbstractHTMLRipper {
     public String getGID(URL url) throws MalformedURLException {
         Pattern p; Matcher m;
 
+        // Old format (I suspect no longer supported)
         p = Pattern.compile("^.*imagefap.com/gallery.php\\?pgid=([a-f0-9]+).*$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
-            isNewAlbumType = true;
             return m.group(1);
         }
 
         p = Pattern.compile("^.*imagefap.com/gallery.php\\?gid=([0-9]+).*$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             return m.group(1);
         }
 
-        p = Pattern.compile("^.*imagefap.com/pictures/([0-9]+).*$");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(1);
-        }
-        p = Pattern.compile("^.*imagefap.com/pictures/([a-f0-9]+).*$");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            isNewAlbumType = true;
-            return m.group(1);
-        }
-
-        p = Pattern.compile("^.*imagefap.com/gallery/([0-9]+).*$");
-        m = p.matcher(url.toExternalForm());
-        if (m.matches()) {
-            return m.group(1);
-        }
         p = Pattern.compile("^.*imagefap.com/gallery/([a-f0-9]+).*$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
-            isNewAlbumType = true;
             return m.group(1);
         }
 
+        // most recent format
+        p = Pattern.compile("^.*imagefap.com/pictures/([a-f0-9]+).*$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return m.group(1);
+        }
 
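Aside: the rewritten getGID() is a regex cascade that now ends with the newer imagefap.com/pictures/<id> form instead of the removed numeric variants. A standalone sketch of the same cascade, runnable as-is (the URLs in main are made-up examples, not real galleries):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class GidExtractor {

        // Mirrors the pattern cascade in getGID above: try each known URL
        // shape in turn and return the first captured gallery ID.
        private static final Pattern[] PATTERNS = {
            Pattern.compile("^.*imagefap.com/gallery.php\\?pgid=([a-f0-9]+).*$"),
            Pattern.compile("^.*imagefap.com/gallery.php\\?gid=([0-9]+).*$"),
            Pattern.compile("^.*imagefap.com/gallery/([a-f0-9]+).*$"),
            Pattern.compile("^.*imagefap.com/pictures/([a-f0-9]+).*$"), // most recent format
        };

        static String getGid(String url) {
            for (Pattern p : PATTERNS) {
                Matcher m = p.matcher(url);
                if (m.matches()) {
                    return m.group(1);
                }
            }
            return null; // the real ripper throws MalformedURLException here
        }

        public static void main(String[] args) {
            System.out.println(getGid("https://www.imagefap.com/pictures/12345678/some-title"));  // 12345678
            System.out.println(getGid("https://www.imagefap.com/gallery.php?gid=987654&view=2")); // 987654
        }
    }
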
@@ -108,7 +93,12 @@ public class ImagefapRipper extends AbstractHTMLRipper {
 
     @Override
     public Document getFirstPage() throws IOException {
-        return getPageWithRetries(url);
+
+        Document firstPage = getPageWithRetries(url);
+
+        sendUpdate(STATUS.LOADING_RESOURCE, "Loading first page...");
+
+        return firstPage;
     }
 
     @Override
@@ -116,7 +106,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
         String nextURL = null;
         for (Element a : doc.select("a.link3")) {
             if (a.text().contains("next")) {
-                nextURL = "https://imagefap.com/gallery.php" + a.attr("href");
+                nextURL = this.sanitizeURL(this.url) + a.attr("href");
                 break;
             }
         }
@@ -125,6 +115,9 @@ public class ImagefapRipper extends AbstractHTMLRipper {
         }
         // Sleep before fetching next page.
         sleep(PAGE_SLEEP_TIME);
+
+        sendUpdate(STATUS.LOADING_RESOURCE, "Loading next page URL: " + nextURL);
+        LOGGER.info("Attempting to load next page URL: " + nextURL);
 
         // Load next page
         Document nextPage = getPageWithRetries(new URL(nextURL));
@@ -134,17 +127,27 @@ public class ImagefapRipper extends AbstractHTMLRipper {
 
     @Override
     public List<String> getURLsFromPage(Document doc) {
 
         List<String> imageURLs = new ArrayList<>();
+
+        LOGGER.debug("Trying to get URLs from document... ");
+
         for (Element thumb : doc.select("#gallery img")) {
             if (!thumb.hasAttr("src") || !thumb.hasAttr("width")) {
                 continue;
             }
             String image = getFullSizedImage("https://www.imagefap.com" + thumb.parent().attr("href"));
+
+            if(image == null)
+                throw new RuntimeException("Unable to extract image URL from single image page! Unable to continue");
+
             imageURLs.add(image);
             if (isThisATest()) {
                 break;
             }
         }
+
+        LOGGER.debug("Adding " + imageURLs.size() + " URLs to download");
+
         return imageURLs;
     }
@@ -176,6 +179,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
             Document doc = getPageWithRetries(new URL(pageURL));
             return doc.select("img#mainPhoto").attr("src");
         } catch (IOException e) {
+            LOGGER.debug("Unable to get full size image URL from page URL " + pageURL + " because: " + e.getMessage());
             return null;
         }
     }
@@ -187,9 +191,10 @@ public class ImagefapRipper extends AbstractHTMLRipper {
      * @throws IOException If page loading errors, or if retries are exhausted
      */
     private Document getPageWithRetries(URL url) throws IOException {
-        Document doc;
+        Document doc = null;
         int retries = RETRY_LIMIT;
         while (true) {
+
             sendUpdate(STATUS.LOADING_RESOURCE, url.toExternalForm());
 
             // For debugging rate limit checker. Useful to track wheter the timeout should be altered or not.
@@ -197,15 +202,42 @@ public class ImagefapRipper extends AbstractHTMLRipper {
             checkRateLimit();
 
             LOGGER.info("Retrieving " + url);
-            doc = Http.url(url)
-                      .get();
 
-            if (doc.toString().contains("Your IP made too many requests to our servers and we need to check that you are a real human being")) {
+            boolean httpCallThrottled = false;
+            int httpAttempts = 0;
+
+            // we attempt the http call, knowing it can fail for network reasons
+            while(true) {
+                httpAttempts++;
+                try {
+                    doc = Http.url(url).get();
+                } catch(IOException e) {
+
+                    LOGGER.info("Retrieving " + url + " error: " + e.getMessage());
+
+                    if(e.getMessage().contains("404"))
+                        throw new IOException("Gallery/Page not found!");
+
+                    if(httpAttempts < HTTP_RETRY_LIMIT) {
+                        sendUpdate(STATUS.DOWNLOAD_WARN, "HTTP call failed: " + e.getMessage() + " retrying " + httpAttempts + " / " + HTTP_RETRY_LIMIT);
+
+                        // we sleep for a few seconds
+                        sleep(PAGE_SLEEP_TIME);
+                        continue;
+                    } else {
+                        sendUpdate(STATUS.DOWNLOAD_WARN, "HTTP call failed too many times: " + e.getMessage() + " treating this as a throttle");
+                        httpCallThrottled = true;
+                    }
+                }
+                // no errors, we exit
+                break;
+            }
+
+            if (httpCallThrottled || (doc != null && doc.toString().contains("Your IP made too many requests to our servers and we need to check that you are a real human being"))) {
                 if (retries == 0) {
                     throw new IOException("Hit rate limit and maximum number of retries, giving up");
                 }
-                String message = "Hit rate limit while loading " + url + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining";
+                String message = "Probably hit rate limit while loading " + url + ", sleeping for " + IP_BLOCK_SLEEP_TIME + "ms, " + retries + " retries remaining";
                 LOGGER.warn(message);
                 sendUpdate(STATUS.DOWNLOAD_WARN, message);
                 retries--;
@@ -214,8 +246,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
                 } catch (InterruptedException e) {
                     throw new IOException("Interrupted while waiting for rate limit to subside");
                 }
-            }
-            else {
+            } else {
                 return doc;
             }
         }
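Aside: the new inner while(true) loop separates transient network failures from throttling: a 404 aborts immediately, up to HTTP_RETRY_LIMIT IOExceptions are retried after a sleep, and exhausting the retries is treated like a rate limit. A self-contained sketch of that bounded-retry shape, with a hypothetical fetch() standing in for Http.url(url).get():

    import java.io.IOException;
    import java.util.concurrent.ThreadLocalRandom;

    public class BoundedRetry {

        private static final int HTTP_RETRY_LIMIT = 3;
        private static final long SLEEP_MS = 1000;

        // Stand-in for Http.url(url).get(): fails randomly to exercise the loop.
        static String fetch() throws IOException {
            if (ThreadLocalRandom.current().nextInt(3) != 0) {
                throw new IOException("connection reset");
            }
            return "<html>ok</html>";
        }

        // Returns the document, or null if every attempt failed; the caller
        // then treats the failure as a throttle, as in getPageWithRetries().
        static String fetchWithRetries() throws IOException, InterruptedException {
            int attempts = 0;
            while (true) {
                attempts++;
                try {
                    return fetch();
                } catch (IOException e) {
                    if (e.getMessage().contains("404")) {
                        throw new IOException("Gallery/Page not found!"); // permanent, do not retry
                    }
                    if (attempts < HTTP_RETRY_LIMIT) {
                        System.out.println("HTTP call failed: " + e.getMessage()
                                + " retrying " + attempts + " / " + HTTP_RETRY_LIMIT);
                        Thread.sleep(SLEEP_MS); // back off before the next attempt
                    } else {
                        return null; // retries exhausted, caller treats this as throttling
                    }
                }
            }
        }

        public static void main(String[] args) throws Exception {
            System.out.println("Result: " + fetchWithRetries());
        }
    }
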
@@ -245,4 +276,5 @@ public class ImagefapRipper extends AbstractHTMLRipper {
         return duration;
     }
 
+
 }