1
0
mirror of https://github.com/RipMeApp/ripme.git synced 2025-09-02 10:23:47 +02:00

Implement caching of the first page for every ripper

This commit is contained in:
Zsombor Gegesy
2022-08-03 01:49:03 +02:00
committed by soloturn
parent e581eeee29
commit 98f37208b1
17 changed files with 37 additions and 55 deletions

View File

@@ -32,6 +32,7 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
private final Map<URL, File> itemsPending = Collections.synchronizedMap(new HashMap<>());
private final Map<URL, Path> itemsCompleted = Collections.synchronizedMap(new HashMap<>());
private final Map<URL, String> itemsErrored = Collections.synchronizedMap(new HashMap<>());
Document cachedFirstPage;
protected AbstractHTMLRipper(URL url) throws IOException {
super(url);
@@ -41,6 +42,14 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
public abstract String getHost();
protected abstract Document getFirstPage() throws IOException;
/**
 * Returns the first page of this rip, fetching it through {@link #getFirstPage()}
 * only when no page has been stored in {@code cachedFirstPage} yet.
 *
 * <p>A page is stored only after {@code getFirstPage()} returns; if that call throws,
 * or returns {@code null}, the next call to this method fetches again.
 *
 * <p>NOTE(review): the check-then-assign below is not synchronized — confirm that
 * callers never invoke this concurrently on the same ripper instance.
 *
 * @return the stored first page, or the freshly fetched one on the first successful call
 * @throws IOException if {@code getFirstPage()} fails while the cache is still empty
 */
protected Document getCachedFirstPage() throws IOException {
if (cachedFirstPage == null) {
// Nothing stored yet: delegate to the subclass implementation and remember the result.
cachedFirstPage = getFirstPage();
}
return cachedFirstPage;
}
/**
 * Default implementation: always returns {@code null} and ignores {@code doc}.
 *
 * <p>NOTE(review): presumably subclasses override this for paginated sites and
 * {@code null} signals that there is no further page — confirm against the caller,
 * which is not visible in this excerpt.
 *
 * @param doc the page that was just processed (unused here)
 * @return always {@code null} in this base implementation
 * @throws IOException never thrown here; declared so overriding methods may throw it
 */
public Document getNextPage(Document doc) throws IOException {
return null;
}
@@ -98,7 +107,7 @@ public abstract class AbstractHTMLRipper extends AbstractRipper {
int textindex = 0;
LOGGER.info("Retrieving " + this.url);
sendUpdate(STATUS.LOADING_RESOURCE, this.url.toExternalForm());
Document doc = getFirstPage();
var doc = getCachedFirstPage();
if (hasQueueSupport() && pageContainsAlbums(this.url)) {
List<String> urls = getAlbumsToQueue(doc);

View File

@@ -49,7 +49,7 @@ public class AerisdiesRipper extends AbstractHTMLRipper {
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
Element el = getFirstPage().select(".headtext").first();
Element el = getCachedFirstPage().select(".headtext").first();
if (el == null) {
throw new IOException("Unable to get album title");
}

View File

@@ -73,7 +73,7 @@ public class BatoRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
return getHost() + "_" + getGID(url) + "_" + getFirstPage().select("title").first().text().replaceAll(" ", "_");
return getHost() + "_" + getGID(url) + "_" + getCachedFirstPage().select("title").first().text().replaceAll(" ", "_");
} catch (IOException e) {
// Fall back to default album naming convention
LOGGER.info("Unable to find title at " + url);

View File

@@ -104,7 +104,7 @@ public class ChanRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
Document doc = getFirstPage();
Document doc = getCachedFirstPage();
try {
String subject = doc.select(".post.op > .postinfo > .subject").first().text();
return getHost() + "_" + getGID(url) + "_" + subject;

View File

@@ -53,7 +53,7 @@ public class CheveretoRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
Element titleElement = getFirstPage().select("meta[property=og:title]").first();
Element titleElement = getCachedFirstPage().select("meta[property=og:title]").first();
String title = titleElement.attr("content");
title = title.substring(title.lastIndexOf('/') + 1);
return getHost() + "_" + title.trim();

View File

@@ -23,7 +23,6 @@ import com.rarchives.ripme.utils.Http;
public class EightmusesRipper extends AbstractHTMLRipper {
private Document albumDoc = null;
private Map<String,String> cookies = new HashMap<>();
// TODO put up a wiki page on using maps to store titles
// the map for storing the title of each album when downloading sub albums
@@ -64,7 +63,7 @@ public class EightmusesRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
Element titleElement = getFirstPage().select("meta[name=description]").first();
Element titleElement = getCachedFirstPage().select("meta[name=description]").first();
String title = titleElement.attr("content");
title = title.replace("A huge collection of free porn comics for adults. Read", "");
title = title.replace("online for free at 8muses.com", "");
@@ -78,12 +77,9 @@ public class EightmusesRipper extends AbstractHTMLRipper {
@Override
public Document getFirstPage() throws IOException {
if (albumDoc == null) {
Response resp = Http.url(url).response();
cookies.putAll(resp.cookies());
albumDoc = resp.parse();
}
return albumDoc;
Response resp = Http.url(url).response();
cookies.putAll(resp.cookies());
return resp.parse();
}
@Override

View File

@@ -97,7 +97,7 @@ public class EroShareRipper extends AbstractHTMLRipper {
if (!is_profile(url)) {
try {
// Attempt to use album title as GID
Element titleElement = getFirstPage().select("meta[property=og:title]").first();
Element titleElement = getCachedFirstPage().select("meta[property=og:title]").first();
String title = titleElement.attr("content");
title = title.substring(title.lastIndexOf('/') + 1);
return getHost() + "_" + getGID(url) + "_" + title.trim();

View File

@@ -71,7 +71,7 @@ public class EromeRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
Element titleElement = getFirstPage().select("meta[property=og:title]").first();
Element titleElement = getCachedFirstPage().select("meta[property=og:title]").first();
String title = titleElement.attr("content");
title = title.substring(title.lastIndexOf('/') + 1);
return getHost() + "_" + getGID(url) + "_" + title.trim();

View File

@@ -20,7 +20,6 @@ import org.jsoup.nodes.Element;
public class FlickrRipper extends AbstractHTMLRipper {
private Document albumDoc = null;
private final DownloadThreadPool flickrThreadPool;
private enum UrlType {
@@ -178,7 +177,7 @@ public class FlickrRipper extends AbstractHTMLRipper {
}
try {
// Attempt to use album title as GID
Document doc = getFirstPage();
Document doc = getCachedFirstPage();
String user = url.toExternalForm();
user = user.substring(user.indexOf("/photos/") + "/photos/".length());
user = user.substring(0, user.indexOf("/"));
@@ -230,10 +229,7 @@ public class FlickrRipper extends AbstractHTMLRipper {
@Override
public Document getFirstPage() throws IOException {
if (albumDoc == null) {
albumDoc = Http.url(url).get();
}
return albumDoc;
return Http.url(url).get();
}
@Override

View File

@@ -16,8 +16,6 @@ import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;
public class GirlsOfDesireRipper extends AbstractHTMLRipper {
// Current HTML document
private Document albumDoc = null;
public GirlsOfDesireRipper(URL url) throws IOException {
super(url);
@@ -35,7 +33,7 @@ public class GirlsOfDesireRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
Document doc = getFirstPage();
Document doc = getCachedFirstPage();
Elements elems = doc.select(".albumName");
return getHost() + "_" + elems.first().text();
} catch (Exception e) {
@@ -64,10 +62,7 @@ public class GirlsOfDesireRipper extends AbstractHTMLRipper {
@Override
public Document getFirstPage() throws IOException {
if (albumDoc == null) {
albumDoc = Http.url(url).get();
}
return albumDoc;
return Http.url(url).get();
}
@Override

View File

@@ -51,7 +51,7 @@ public class HbrowseRipper extends AbstractHTMLRipper {
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
Document doc = getFirstPage();
Document doc = getCachedFirstPage();
String title = doc.select("div[id=main] > table.listTable > tbody > tr > td.listLong").first().text();
return getHost() + "_" + title + "_" + getGID(url);
} catch (Exception e) {

View File

@@ -61,7 +61,7 @@ public class HentaifoxRipper extends AbstractHTMLRipper {
@Override
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
Document doc = getFirstPage();
Document doc = getCachedFirstPage();
String title = doc.select("div.info > h1").first().text();
return getHost() + "_" + title + "_" + getGID(url);
} catch (Exception e) {

View File

@@ -19,9 +19,6 @@ import org.jsoup.select.Elements;
public class ImagebamRipper extends AbstractHTMLRipper {
// Current HTML document
private Document albumDoc = null;
// Thread pool for finding direct image links from "image" pages (html)
private DownloadThreadPool imagebamThreadPool = new DownloadThreadPool("imagebam");
@Override
@@ -61,10 +58,7 @@ public class ImagebamRipper extends AbstractHTMLRipper {
@Override
public Document getFirstPage() throws IOException {
if (albumDoc == null) {
albumDoc = Http.url(url).get();
}
return albumDoc;
return Http.url(url).get();
}
@Override
@@ -99,7 +93,7 @@ public class ImagebamRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
Elements elems = getFirstPage().select("[id=gallery-name]");
Elements elems = getCachedFirstPage().select("[id=gallery-name]");
String title = elems.first().text();
LOGGER.info("Title text: '" + title + "'");
if (StringUtils.isNotBlank(title)) {

View File

@@ -17,7 +17,6 @@ import com.rarchives.ripme.utils.Http;
public class ImagefapRipper extends AbstractHTMLRipper {
private Document albumDoc = null;
private boolean isNewAlbumType = false;
private int callsMade = 0;
@@ -109,10 +108,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
@Override
public Document getFirstPage() throws IOException {
if (albumDoc == null) {
albumDoc = getPageWithRetries(url);
}
return albumDoc;
return getPageWithRetries(url);
}
@Override
@@ -162,7 +158,7 @@ public class ImagefapRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
String title = getFirstPage().title();
String title = getCachedFirstPage().title();
title = title.replace("Porn Pics & Porn GIFs", "");
title = title.replace(" ", "_");
String toReturn = getHost() + "_" + title + "_" + getGID(url);

View File

@@ -34,7 +34,7 @@ public class ViewcomicRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
String titleText = getFirstPage().select("title").first().text();
String titleText = getCachedFirstPage().select("title").first().text();
String title = titleText.replace("Viewcomic reading comics online for free", "");
title = title.replace("_", "");
title = title.replace("|", "");

View File

@@ -220,7 +220,7 @@ public class XhamsterRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title and username as GID
Document doc = getFirstPage();
Document doc = getCachedFirstPage();
Element user = doc.select("a.author").first();
String username = user.text();
String path = url.getPath();

View File

@@ -19,7 +19,6 @@ import com.rarchives.ripme.utils.Http;
public class ZizkiRipper extends AbstractHTMLRipper {
private Document albumDoc = null;
private Map<String,String> cookies = new HashMap<>();
public ZizkiRipper(URL url) throws IOException {
@@ -49,10 +48,10 @@ public class ZizkiRipper extends AbstractHTMLRipper {
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
Element titleElement = getFirstPage().select("h1.title").first();
Element titleElement = getCachedFirstPage().select("h1.title").first();
String title = titleElement.text();
Element authorSpan = getFirstPage().select("span[class=creator]").first();
Element authorSpan = getCachedFirstPage().select("span[class=creator]").first();
String author = authorSpan.select("a").first().text();
LOGGER.debug("Author: " + author);
return getHost() + "_" + author + "_" + title.trim();
@@ -65,12 +64,9 @@ public class ZizkiRipper extends AbstractHTMLRipper {
@Override
public Document getFirstPage() throws IOException {
if (albumDoc == null) {
Response resp = Http.url(url).response();
cookies.putAll(resp.cookies());
albumDoc = resp.parse();
}
return albumDoc;
Response resp = Http.url(url).response();
cookies.putAll(resp.cookies());
return resp.parse();
}
@Override