diff --git a/patch.py b/patch.py
index 5ed45626..aa53755d 100644
--- a/patch.py
+++ b/patch.py
@@ -12,6 +12,9 @@ from hashlib import sha256
 # - commit all changes
 message = input('message: ')

+# Strip any spaces that might've been entered before the message
+message = message.lstrip()
+
 def get_ripme_json():
     with open('ripme.json') as dataFile:
diff --git a/pom.xml b/pom.xml
index 81ad033f..39a5fb29 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
     <groupId>com.rarchives.ripme</groupId>
     <artifactId>ripme</artifactId>
     <packaging>jar</packaging>
-    <version>1.7.77</version>
+    <version>1.7.83</version>
     <name>ripme</name>
     <url>http://rip.rarchives.com</url>
diff --git a/ripme.json b/ripme.json
index 9bf13c14..6f2317bf 100644
--- a/ripme.json
+++ b/ripme.json
@@ -1,6 +1,13 @@
 {
-  "currentHash": "34f326ec23f3c1ce8df1147c1d9660a1dd7b85074e79351c9295bd74ac8f127a",
+  "currentHash": "4994abc3d8102823c3f35159a0759707fa4c1ccea0746081954f6acfdbe63d8f",
+  "latestVersion": "1.7.83",
   "changeList": [
+    "1.7.83: Added a ripper for hentaifox.com; Added ripper for Erofus.com; Fixed fsktr not ripping some images; Added support for Gfycat profiles; Added opt to disable prefix for HentaifoundryRipper",
+    "1.7.82: Hentai foundry now rips oldest first by default; 8muses ripper no longer makes unneeded requests; Added support for i.thechive.com",
+    "1.7.81: Added support for artstn.co; Added new boolean config twitter.rip_retweet; Fixed MulemaxRipper; Fix minor bug that could cause a freeze at pending 1",
+    "1.7.80: Fixed porncomix.one ripper; Fixed instagram ripper; Fixed Fuskator ripper; Fixed handling of urls with spaces in them",
+    "1.7.79: Fixed artstation ripper; Fixed imagefap ripper folder naming; Can now filter reddit posts by votes; Added Ripper for Xlecx; Linux/Mac updater is now pure java",
+    "1.7.78: Fixed gfycat ripper; Fixed E621 ripper; Added support for new xhamster url format; Now supports furaffinty scraps",
     "1.7.77: Reduced log spam; HQporner now supports actress/category/studio/top links; Improved luscious ripper; Fixed Pornhub video ripper; Tumblr ripper now always downloads highest quality available",
     "1.7.76: Fixed remember url history",
     "1.7.75: Fix e-hentai ripper; added comixfap ripper; fixed writting urls to files on windows; Fixed update screen issues; Added support for hentaidude; Fixed erome ripper",
@@ -248,6 +255,5 @@
     "1.0.4: Fixed spaces-in-directory bug",
     "1.0.3: Added VK.com ripper",
     "1.0.1: Added auto-update functionality"
-  ],
-  "latestVersion": "1.7.77"
+  ]
 }
\ No newline at end of file
diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
index b24017f7..e1c7c507 100644
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractHTMLRipper.java
@@ -93,6 +93,7 @@ public abstract class AbstractHTMLRipper extends AlbumRipper {
             // We set doc to null here so the while loop below this doesn't fire
             doc = null;
+            LOGGER.debug("Adding items from " + this.url + " to queue");
         }
         while (doc != null) {
diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
index e708ef68..1220c5f4 100644
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
@@ -218,6 +218,44 @@ public abstract class AbstractRipper
     protected abstract boolean addURLToDownload(URL url, File saveAs, String referrer, Map<String, String> cookies, Boolean getFileExtFromMIME);

+    /**
+     * Queues image to be downloaded and saved.
+     * @param url
+     *            URL of the file
+     * @param options
+     *            A map containing any changes to the default options.
+     *            Options are: getFileExtFromMIME, prefix, subdirectory, referrer, fileName, extension.
+     *            getFileExtFromMIME should be "true" or "false".
+     * @param cookies
+     *            The cookies to send to the server while downloading this file.
+     * @return
+     *            True if downloaded successfully;
+     *            False if failed to download.
+     */
+    protected boolean addURLToDownload(URL url, Map<String, String> options, Map<String, String> cookies) {
+        // A bit of a hack, but this lets us pass a boolean through a string map
+        boolean useMIME = options.getOrDefault("getFileExtFromMIME", "false").toLowerCase().equals("true");
+        return addURLToDownload(url, options.getOrDefault("prefix", ""), options.getOrDefault("subdirectory", ""), options.getOrDefault("referrer", null),
+                cookies, options.getOrDefault("fileName", null), options.getOrDefault("extension", null), useMIME);
+    }
+
+    /**
+     * Queues image to be downloaded and saved.
+     * @param url
+     *            URL of the file
+     * @param options
+     *            A map containing any changes to the default options.
+     *            Options are: getFileExtFromMIME, prefix, subdirectory, referrer, fileName, extension.
+     *            getFileExtFromMIME should be "true" or "false".
+     * @return
+     *            True if downloaded successfully;
+     *            False if failed to download.
+     */
+    protected boolean addURLToDownload(URL url, Map<String, String> options) {
+        return addURLToDownload(url, options, null);
+    }
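A minimal usage sketch of the new map-based overload (hypothetical call site, not part of this patch; the keys are the option names documented above):

    Map<String, String> options = new HashMap<>();
    options.put("prefix", getPrefix(index));
    options.put("subdirectory", "chapter-1");
    options.put("getFileExtFromMIME", "true");
    addURLToDownload(url, options);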
    /**
     * Queues image to be downloaded and saved.
     * @param url
@@ -237,6 +275,22 @@ public abstract class AbstractRipper
     *            False if failed to download
     */
    protected boolean addURLToDownload(URL url, String prefix, String subdirectory, String referrer, Map<String, String> cookies, String fileName, String extension, Boolean getFileExtFromMIME) {
+        // A common bug is rippers adding URLs that are just "http:"; reject such URLs here
+        if (url.toExternalForm().equals("http:") || url.toExternalForm().equals("https:")) {
+            LOGGER.info(url.toExternalForm() + " is an invalid URL and will be skipped");
+            return false;
+        }
+        // Make sure the URL doesn't contain any spaces, as that can cause a 400 error when requesting the file
+        if (url.toExternalForm().contains(" ")) {
+            // If for some reason the URL with all spaces encoded as %20 is malformed, log an error
+            try {
+                url = new URL(url.toExternalForm().replaceAll(" ", "%20"));
+            } catch (MalformedURLException e) {
+                LOGGER.error("Unable to remove spaces from url\nURL: " + url.toExternalForm());
+                e.printStackTrace();
+            }
+        }
        // Don't re-add the url if it was downloaded in a previous rip
        if (Utils.getConfigBoolean("remember.url_history", true) && !isThisATest()) {
            if (hasDownloadedURL(url.toExternalForm())) {
diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
index 6f57ec0c..3b1e7c16 100644
--- a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
+++ b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java
@@ -1,7 +1,6 @@
 package com.rarchives.ripme.ripper;

 import java.io.*;
-import java.lang.reflect.Array;
 import java.net.HttpURLConnection;
 import java.net.SocketTimeoutException;
 import java.net.URL;
@@ -14,13 +13,11 @@ import java.util.ResourceBundle;
 import javax.net.ssl.HttpsURLConnection;

 import com.rarchives.ripme.ui.MainWindow;
-import org.apache.commons.io.IOUtils;
 import org.apache.log4j.Logger;
 import org.jsoup.HttpStatusException;

 import com.rarchives.ripme.ui.RipStatusMessage.STATUS;
 import com.rarchives.ripme.utils.Utils;
-import static java.lang.Math.toIntExact;

 /**
  * Thread for downloading files.
  */
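The hunk below errors out when a resume was requested but the server answered with something other than 206. As background (a sketch under assumptions, not code from this diff), resuming conventionally means asking for the remaining bytes with a Range header:

    // Assumed shape of the resume request issued elsewhere in this class
    huc.setRequestProperty("Range", "bytes=" + saveAs.length() + "-");
    // 206 Partial Content -> the server honored the range;
    // anything else means it would resend the whole file from the start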
@@ -139,6 +136,7 @@ class DownloadFileThread extends Thread {
             int statusCode = huc.getResponseCode();
             logger.debug("Status code: " + statusCode);
+            // If the server doesn't allow resuming downloads, error out
             if (statusCode != 206 && observer.tryResumeDownload() && saveAs.exists()) {
                 // TODO find a better way to handle servers that don't support resuming downloads than just erroring out
                 throw new IOException(rb.getString("server.doesnt.support.resuming.downloads"));
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ArtStationRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ArtStationRipper.java
index 611d9be6..6e1b4820 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ArtStationRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ArtStationRipper.java
@@ -7,12 +7,13 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
+import org.json.JSONObject;
+import org.jsoup.Connection;
+import org.jsoup.Connection.Method;
+import org.jsoup.Connection.Response;
 import com.rarchives.ripme.ripper.AbstractJSONRipper;
 import com.rarchives.ripme.utils.Http;

-import org.json.JSONObject;
-
 public class ArtStationRipper extends AbstractJSONRipper {
     enum URL_TYPE {
         SINGLE_PROJECT, USER_PORTFOLIO, UNKNOWN
@@ -47,7 +48,8 @@ public class ArtStationRipper extends AbstractJSONRipper {
         if (albumURL.getType() == URL_TYPE.SINGLE_PROJECT) {
             // URL points to single project, use project title as GID
             try {
-                groupData = Http.url(albumURL.getLocation()).getJSON();
+                // groupData = Http.url(albumURL.getLocation()).getJSON();
+                groupData = getJson(albumURL.getLocation());
             } catch (IOException e) {
                 throw new MalformedURLException("Couldn't load JSON from " + albumURL.getLocation());
             }
@@ -58,7 +60,8 @@ public class ArtStationRipper extends AbstractJSONRipper {
             // URL points to user portfolio, use user's full name as GID
             String userInfoURL = "https://www.artstation.com/users/" + albumURL.getID() + "/quick.json";
             try {
-                groupData = Http.url(userInfoURL).getJSON();
+                // groupData = Http.url(userInfoURL).getJSON();
+                groupData = getJson(userInfoURL);
             } catch (IOException e) {
                 throw new MalformedURLException("Couldn't load JSON from " + userInfoURL);
             }
@@ -67,26 +70,29 @@ public class ArtStationRipper extends AbstractJSONRipper {
         // No JSON found in the URL entered, can't rip
         throw new MalformedURLException(
-                "Expected URL to an ArtStation project or user profile - got " + url + " instead");
+                "Expected URL to an ArtStation 'project url' or 'user profile url' - got " + url + " instead");
     }

     @Override
     protected JSONObject getFirstPage() throws IOException {
         if (albumURL.getType() == URL_TYPE.SINGLE_PROJECT) {
             // URL points to JSON of a single project, just return it
-            return Http.url(albumURL.getLocation()).getJSON();
+            // return Http.url(albumURL.getLocation()).getJSON();
+            return getJson(albumURL.getLocation());
         }

         if (albumURL.getType() == URL_TYPE.USER_PORTFOLIO) {
             // URL points to JSON of a list of projects, load it to parse individual
             // projects
-            JSONObject albumContent = Http.url(albumURL.getLocation()).getJSON();
+            // JSONObject albumContent = Http.url(albumURL.getLocation()).getJSON();
+            JSONObject albumContent = getJson(albumURL.getLocation());

             if (albumContent.getInt("total_count") > 0) {
                 // Get JSON of the first project and return it
                 JSONObject projectInfo = albumContent.getJSONArray("data").getJSONObject(0);
                 ParsedURL projectURL = parseURL(new URL(projectInfo.getString("permalink")));
-                return
 Http.url(projectURL.getLocation()).getJSON();
+                // return Http.url(projectURL.getLocation()).getJSON();
+                return getJson(projectURL.getLocation());
             }
         }

@@ -112,14 +118,16 @@ public class ArtStationRipper extends AbstractJSONRipper {
         }

         Integer currentProject = ((projectPageNumber - 1) * 50) + (projectIndex + 1);
-        JSONObject albumContent = Http.url(albumURL.getLocation() + "?page=" + projectPageNumber).getJSON();
+        // JSONObject albumContent = Http.url(albumURL.getLocation() + "?page=" + projectPageNumber).getJSON();
+        JSONObject albumContent = getJson(albumURL.getLocation() + "?page=" + projectPageNumber);

         if (albumContent.getInt("total_count") > currentProject) {
             // Get JSON of the next project and return it
             JSONObject projectInfo = albumContent.getJSONArray("data").getJSONObject(projectIndex);
             ParsedURL projectURL = parseURL(new URL(projectInfo.getString("permalink")));
             projectIndex++;
-            return Http.url(projectURL.getLocation()).getJSON();
+            // return Http.url(projectURL.getLocation()).getJSON();
+            return getJson(projectURL.getLocation());
         }

         throw new IOException("No more projects");
@@ -181,9 +189,12 @@ public class ArtStationRipper extends AbstractJSONRipper {
     /**
      * Construct a new ParsedURL object.
      *
-     * @param urlType URL_TYPE enum containing the URL type
-     * @param jsonURL String containing the JSON URL location
-     * @param urlID String containing the ID of this URL
+     * @param urlType
+     *            URL_TYPE enum containing the URL type
+     * @param jsonURL
+     *            String containing the JSON URL location
+     * @param urlID
+     *            String containing the ID of this URL
      *
      */
     ParsedURL(URL_TYPE urlType, String jsonURL, String urlID) {
@@ -226,7 +237,8 @@ public class ArtStationRipper extends AbstractJSONRipper {
     /**
      * Parses an ArtStation URL.
      *
-     * @param url URL to an ArtStation user profile
+     * @param url
+     *            URL to an ArtStation user profile
      *            (https://www.artstation.com/username) or single project
      *            (https://www.artstation.com/artwork/projectid)
      * @return ParsedURL object containing URL type, JSON location and ID (stores
@@ -239,7 +251,30 @@ public class ArtStationRipper extends AbstractJSONRipper {

         // Load HTML Source of the specified URL
         try {
-            htmlSource = Http.url(url).get().html();
+            // htmlSource = Http.url(url).get().html();
+            Connection con = Http.url(url).method(Method.GET).connection();
+            con.ignoreHttpErrors(true);
+            con.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0");
+            con.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+            con.header("Accept-Language", "en-US,en;q=0.5");
+            con.header("Accept-Encoding", "gzip, deflate, br");
+            con.header("Upgrade-Insecure-Requests", "1");
+            Response res = con.execute();
+            int status = res.statusCode();
+
+            if (status / 100 == 2) {
+                htmlSource = res.parse().html();
+            } else if (status == 403 && url.toString().contains("artwork/")) {
+                // Catches cloudflare page. Error 403.
+                // Usually caused by artwork URLs (artstation.com/artwork/someProjectId)
+                String urlId = url.toString().substring(url.toString().lastIndexOf("/") + 1);
+                String jsonURL = "https://www.artstation.com/projects/" + urlId + ".json";
+                parsedURL = new ParsedURL(URL_TYPE.SINGLE_PROJECT, jsonURL, urlId);
+                return parsedURL;
+            } else {
+                LOGGER.error("Couldn't fetch URL: " + url);
+                throw new IOException("Error fetching URL: " + url + " Status Code: " + status);
+            }
         } catch (IOException e) {
             htmlSource = "";
         }
@@ -266,5 +301,28 @@ public class ArtStationRipper extends AbstractJSONRipper {
         parsedURL = new ParsedURL(URL_TYPE.UNKNOWN, null, null);
         return parsedURL;
     }
+
+    // Use this method instead of a direct call to Http.url(url).getJSON() to avoid the cloudflare 403 page.
+    private JSONObject getJson(URL url) throws IOException {
+        Connection con = Http.url(url).method(Method.GET).connection();
+        con.ignoreHttpErrors(true);
+        con.ignoreContentType(true);
+        con.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0");
+        con.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
+        con.header("Accept-Language", "en-US,en;q=0.5");
+        con.header("Accept-Encoding", "gzip, deflate, br");
+        con.header("Upgrade-Insecure-Requests", "1");
+        Response res = con.execute();
+        int status = res.statusCode();
+        if (status / 100 == 2) {
+            String jsonString = res.body();
+            return new JSONObject(jsonString);
+        }
+        throw new IOException("Error fetching json. Status code: " + status);
+    }
+
+    private JSONObject getJson(String url) throws IOException {
+        return getJson(new URL(url));
+    }
 }
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ArtstnRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ArtstnRipper.java
new file mode 100644
index 00000000..82b6e97c
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ArtstnRipper.java
@@ -0,0 +1,58 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+
+import org.jsoup.Connection.Response;
+
+import com.rarchives.ripme.utils.Http;
+
+/*
+ * Ripper for ArtStation's short URL domain.
+ * Example URL: https://artstn.co/p/JlE15Z
+ */
+public class ArtstnRipper extends ArtStationRipper {
+    public URL artStationUrl = null;
+
+    public ArtstnRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    @Override
+    public boolean canRip(URL url) {
+        return url.getHost().endsWith("artstn.co");
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        if (artStationUrl == null) {
+            // Run only once.
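+            // getGID may be called more than once per rip; caching the resolved
+            // URL in artStationUrl avoids re-following the redirect chain.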
+            try {
+                artStationUrl = getFinalUrl(url);
+                if (artStationUrl == null) {
+                    throw new IOException("Null url received.");
+                }
+            } catch (IOException e) {
+                LOGGER.error("Couldn't resolve URL.", e);
+            }
+        }
+        return super.getGID(artStationUrl);
+    }
+
+    public URL getFinalUrl(URL url) throws IOException {
+        if (url.getHost().endsWith("artstation.com")) {
+            return url;
+        }
+
+        LOGGER.info("Checking url: " + url);
+        Response response = Http.url(url).connection().followRedirects(false).execute();
+        if (response.statusCode() / 100 == 3 && response.hasHeader("location")) {
+            return getFinalUrl(new URL(response.header("location")));
+        } else {
+            return null;
+        }
+    }
+}
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
index 1e47b4dd..a55cdf09 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ChanRipper.java
@@ -12,11 +12,13 @@ import java.util.Arrays;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+
+import com.rarchives.ripme.utils.Utils;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;

 public class ChanRipper extends AbstractHTMLRipper {
-    private static List<ChanSite> explicit_domains = Arrays.asList(
+    private static List<ChanSite> bakedin_explicit_domains = Arrays.asList(
         new ChanSite("boards.4chan.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
         new ChanSite("boards.4channel.org", Arrays.asList("4cdn.org", "is.4chan.org", "is2.4chan.org", "is3.4chan.org")),
         new ChanSite("4archive.org", "imgur.com"),
@@ -29,6 +31,34 @@ public class ChanRipper extends AbstractHTMLRipper {
         new ChanSite("desuarchive.org", "desu-usergeneratedcontent.xyz"),
         new ChanSite("8ch.net", "media.8ch.net")
     );
+    private static List<ChanSite> user_give_explicit_domains = getChansFromConfig(Utils.getConfigString("chans.chan_sites", null));
+    private static List<ChanSite> explicit_domains = new ArrayList<>();
+
+    /**
+     * Reads a string in the format site1[cdn|cdn2|cdn3],site2[cdn] (an illustrative example follows below)
+     */
+    public static List<ChanSite> getChansFromConfig(String rawChanString) {
+        List<ChanSite> userChans = new ArrayList<>();
+        if (rawChanString != null) {
+            String[] listOfChans = rawChanString.split(",");
+            for (String chanInfo : listOfChans) {
+                // If this is true we're parsing a chan with cdns
+                if (chanInfo.contains("[")) {
+                    String siteUrl = chanInfo.split("\\[")[0];
+                    String[] cdns = chanInfo.replaceAll(siteUrl + "\\[", "").replaceAll("]", "").split("\\|");
+                    LOGGER.debug("site url: " + siteUrl);
+                    LOGGER.debug("cdn: " + Arrays.toString(cdns));
+                    userChans.add(new ChanSite(siteUrl, Arrays.asList(cdns)));
+                } else {
+                    // We're parsing a site without cdns
+                    LOGGER.debug("site: " + chanInfo);
+                    userChans.add(new ChanSite(chanInfo));
+                }
+            }
+            return userChans;
+        }
+        return null;
+    }
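An illustrative chans.chan_sites entry (hypothetical site names) in the format parsed above:

    chans.chan_sites = boards.examplechan.org[cdn.examplechan.org|cdn2.examplechan.org],otherchan.net[media.otherchan.net]

This yields two ChanSites: the first with two CDN domains, the second with one.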
    private static List<String> url_piece_blacklist = Arrays.asList(
            "=http",
@@ -43,6 +73,7 @@ public ChanRipper(URL url) throws IOException {
         super(url);
         for (ChanSite _chanSite : explicit_domains) {
+            LOGGER.info(_chanSite.domains);
             if (_chanSite.domains.contains(url.getHost())) {
                 chanSite = _chanSite;
                 generalChanSite = false;
@@ -86,6 +117,10 @@
     @Override
     public boolean canRip(URL url) {
+        explicit_domains.addAll(bakedin_explicit_domains);
+        if (user_give_explicit_domains != null) {
+            explicit_domains.addAll(user_give_explicit_domains);
+        }
         for (ChanSite _chanSite : explicit_domains) {
             if (_chanSite.domains.contains(url.getHost())) {
                 return true;
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
new file mode 100644
index 00000000..08b27a76
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
@@ -0,0 +1,173 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.utils.Http;
+
+/**
+ * @author Tushar
+ *
+ */
+public class ComicextraRipper extends AbstractHTMLRipper {
+
+    private static final String FILE_NAME = "page";
+
+    private Pattern p1 =
+            Pattern.compile("https:\\/\\/www.comicextra.com\\/comic\\/([A-Za-z0-9_-]+)");
+    private Pattern p2 = Pattern.compile(
+            "https:\\/\\/www.comicextra.com\\/([A-Za-z0-9_-]+)\\/([A-Za-z0-9_-]+)(?:\\/full)?");
+    private UrlType urlType = UrlType.UNKNOWN;
+    private List<String> chaptersList = null;
+    private int chapterIndex = -1; // Index into chaptersList, used when getting the next page.
+    private int imageIndex = 0; // Image index within the current chapter.
+
+    public ComicextraRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    @Override
+    protected String getDomain() {
+        return "comicextra.com";
+    }
+
+    @Override
+    public String getHost() {
+        return "comicextra";
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Matcher m1 = p1.matcher(url.toExternalForm());
+        if (m1.matches()) {
+            // URL is of a comic (https://www.comicextra.com/comic/the-punisher-frank-castle-max).
+            urlType = UrlType.COMIC;
+            return m1.group(1);
+        }
+
+        Matcher m2 = p2.matcher(url.toExternalForm());
+        if (m2.matches()) {
+            // URL is of a chapter (https://www.comicextra.com/the-punisher-frank-castle-max/chapter-75).
+            urlType = UrlType.CHAPTER;
+            return m2.group(1);
+        }
+
+        throw new MalformedURLException(
+                "Expected comicextra.com url of type: https://www.comicextra.com/comic/some-comic-name\n"
+                        + " or https://www.comicextra.com/some-comic-name/chapter-001 got " + url
+                        + " instead");
+    }
+
+    @Override
+    protected Document getFirstPage() throws IOException {
+        Document doc = null;
+
+        switch (urlType) {
+            case COMIC:
+                // For COMIC type urls we extract the url of each chapter and store them in chaptersList.
+                chaptersList = new ArrayList<>();
+                Document comicPage = Http.url(url).get();
+                Elements elements = comicPage.select("div.episode-list a");
+                for (Element e : elements) {
+                    chaptersList.add(getCompleteChapterUrl(e.attr("abs:href")));
+                }
+
+                // Set the first chapter from the chaptersList as the doc.
+                chapterIndex = 0;
+                doc = Http.url(chaptersList.get(chapterIndex)).get();
+                break;
+            case CHAPTER:
+                doc = Http.url(url).get();
+                break;
+            case UNKNOWN:
+            default:
+                throw new IOException("Unknown url type encountered.");
+        }
+
+        return doc;
+    }
+
+    @Override
+    public Document getNextPage(Document doc) throws IOException {
+        if (urlType == UrlType.COMIC) {
+            ++chapterIndex;
+            imageIndex = 0; // Reset imageIndex so that image prefixes within each chapter start from '001_'.
+ if (chapterIndex < chaptersList.size()) { + return Http.url(chaptersList.get(chapterIndex)).get(); + } + } + + return super.getNextPage(doc); + } + + @Override + protected List getURLsFromPage(Document page) { + List urls = new ArrayList<>(); + + if (urlType == UrlType.COMIC || urlType == UrlType.CHAPTER) { + Elements images = page.select("img.chapter_img"); + for (Element img : images) { + urls.add(img.attr("src")); + } + } + + return urls; + } + + @Override + protected void downloadURL(URL url, int index) { + String subdirectory = getSubDirectoryName(); + String prefix = getPrefix(++imageIndex); + + addURLToDownload(url, prefix, subdirectory, null, null, FILE_NAME, null, Boolean.TRUE); + } + + /* + * This function appends /full at the end of the chapters url to get all the images for the + * chapter in the same Document. + */ + private String getCompleteChapterUrl(String chapterUrl) { + if (!chapterUrl.endsWith("/full")) { + chapterUrl = chapterUrl + "/full"; + } + return chapterUrl; + } + + /* + * This functions returns sub folder name for the current chapter. + */ + private String getSubDirectoryName() { + String subDirectory = ""; + + if (urlType == UrlType.COMIC) { + Matcher m = p2.matcher(chaptersList.get(chapterIndex)); + if (m.matches()) { + subDirectory = m.group(2); + } + } + + if (urlType == UrlType.CHAPTER) { + Matcher m = p2.matcher(url.toExternalForm()); + if (m.matches()) { + subDirectory = m.group(2); + } + } + + return subDirectory; + } + + /* + * Enum to classify different types of urls. + */ + private enum UrlType { + COMIC, CHAPTER, UNKNOWN + } +} diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java index ad7d79fa..8a24e2c9 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/DeviantartRipper.java @@ -1,409 +1,645 @@ package com.rarchives.ripme.ripper.rippers; -import com.rarchives.ripme.ripper.AbstractJSONRipper; -import com.rarchives.ripme.utils.Base64; +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.ripper.DownloadThreadPool; +import com.rarchives.ripme.ui.RipStatusMessage.STATUS; import com.rarchives.ripme.utils.Http; -import com.rarchives.ripme.utils.RipUtils; import com.rarchives.ripme.utils.Utils; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.net.HttpURLConnection; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; +import java.util.Base64; import java.util.HashMap; -import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.json.JSONArray; -import org.json.JSONObject; +import org.jsoup.Connection; +import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; -import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +/** + * + * @author MrPlaygon + * + * NOT using Deviantart API like the old JSON ripper because it is SLOW + * and somehow annoying to use. 
Things to consider: Using the API might + * be less work/maintenance later because APIs do not change as + * frequently as HTML source code does...? + * + * + * + * Tested for: + * + * SFW: + * + * https://www.deviantart.com/apofiss/gallery/41388863/sceneries + * https://www.deviantart.com/kageuri/gallery/ + * https://www.deviantart.com/kageuri/gallery/?catpath=/ + * https://www.deviantart.com/apofiss/favourites/39881418/gifts-and + * https://www.deviantart.com/kageuri/favourites/ + * https://www.deviantart.com/kageuri/favourites/?catpath=/ + * + * NSFW: + * + * https://www.deviantart.com/revpeng/gallery/67734353/Siren-Lee-Agent-of-S-I-R-E-N-S + * + * + * Deactivated account: + * + * https://www.deviantart.com/gingerbreadpony + * + * Banned Account: + * + * https://www.deviantart.com/ghostofflossenburg + * + * + * + * + * Login Data (PLEASE DONT ACTUALLY USE!!!): + * + * email: 5g5_8l4dii5lbbpc@byom.de + * + * username: 5g58l4dii5lbbpc + * + * password: 5g5_8l4dii5lbbpc + * + * + * + */ +public class DeviantartRipper extends AbstractHTMLRipper { -public class DeviantartRipper extends AbstractJSONRipper { - String requestID; - String galleryID; - String username; - String baseApiUrl = "https://www.deviantart.com/dapi/v1/gallery/"; - String csrf; - Map pageCookies = new HashMap<>(); + private final String username = "5g58l4dii5lbbpc"; + private final String password = "5g5_8l4dii5lbbpc"; + private int offset = 0; + private boolean usingCatPath = false; + private int downloadCount = 0; + private Map cookies = new HashMap(); + private DownloadThreadPool deviantartThreadPool = new DownloadThreadPool("deviantart"); + private ArrayList names = new ArrayList(); - private static final int PAGE_SLEEP_TIME = 3000, - IMAGE_SLEEP_TIME = 2000; + List allowedCookies = Arrays.asList("agegate_state", "userinfo", "auth", "auth_secure"); - private Map cookies = new HashMap<>(); - private Set triedURLs = new HashSet<>(); + private Connection conn = null; - public DeviantartRipper(URL url) throws IOException { - super(url); - } + // Constants + private final String referer = "https://www.deviantart.com/"; + private final String userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"; + private final String utilsKey = "DeviantartLogin.cookies"; - String loginCookies = "auth=__0f9158aaec09f417b235%3B%221ff79836392a515d154216d919eae573%22;" + - "auth_secure=__41d14dd0da101f411bb0%3B%2281cf2cf9477776162a1172543aae85ce%22;" + - "userinfo=__bf84ac233bfa8ae642e8%3B%7B%22username%22%3A%22grabpy%22%2C%22uniqueid%22%3A%22a0a876aa37dbd4b30e1c80406ee9c280%22%2C%22vd%22%3A%22BbHUXZ%2CBbHUXZ%2CA%2CU%2CA%2C%2CB%2CA%2CB%2CBbHUXZ%2CBbHUdj%2CL%2CL%2CA%2CBbHUdj%2C13%2CA%2CB%2CA%2C%2CA%2CA%2CB%2CA%2CA%2C%2CA%22%2C%22attr%22%3A56%7D"; + @Override + public DownloadThreadPool getThreadPool() { + return deviantartThreadPool; + } - @Override - public String getHost() { - return "deviantart"; - } + public DeviantartRipper(URL url) throws IOException { + super(url); + } - @Override - public String getDomain() { - return "deviantart.com"; - } + @Override + protected String getDomain() { + return "deviantart.com"; + } - @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - String u = url.toExternalForm(); - if (u.contains("/gallery/")) { - return url; - } else if (u.contains("/favourites")) { - return url; - } else if (u.contains("/favorites")) { - return url; - } + @Override + public String getHost() { + return "deviantart"; + } - if (!u.endsWith("/gallery/") && 
!u.endsWith("/gallery")) { - if (!u.endsWith("/")) { - u += "/gallery/"; - } else { - u += "gallery/"; - } - } + @Override + protected Document getFirstPage() throws IOException { + if (isDeactivated()) { + throw new IOException("Account Deactivated"); + } + login(); + // Saving connection to reuse later for following pages. + this.conn = Http.url(urlWithParams(this.offset)).cookies(getDACookie()).referrer(this.referer) + .userAgent(this.userAgent).connection(); - Pattern p = Pattern.compile("^https?://www\\.deviantart\\.com/([a-zA-Z0-9\\-]+)/favou?rites/([0-9]+)/*?$"); - Matcher m = p.matcher(url.toExternalForm()); - if (!m.matches()) { - String subdir = "/"; - if (u.contains("catpath=scraps")) { - subdir = "scraps"; - } - u = u.replaceAll("\\?.*", "?catpath=" + subdir); - } - return new URL(u); - } + return this.conn.get(); + } - @Override - public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://www\\.deviantart\\.com/([a-zA-Z0-9\\-]+)(/gallery)?/?(\\?.*)?$"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - // Root gallery - if (url.toExternalForm().contains("catpath=scraps")) { - return m.group(1) + "_scraps"; - } - else { - return m.group(1); - } - } - p = Pattern.compile("^https?://www\\.deviantart\\.com/([a-zA-Z0-9\\-]+)/gallery/([0-9]+).*$"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - // Subgallery - return m.group(1) + "_" + m.group(2); - } - p = Pattern.compile("^https?://www\\.deviantart\\.com/([a-zA-Z0-9\\-]+)/favou?rites/([0-9]+)/.*?$"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1) + "_faves_" + m.group(2); - } - p = Pattern.compile("^https?://www\\.deviantart\\.com/([a-zA-Z0-9\\-]+)/favou?rites/?$"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - // Subgallery - return m.group(1) + "_faves"; - } - throw new MalformedURLException("Expected URL format: http://www.deviantart.com/username[/gallery/#####], got: " + url); - } + /** + * Checks if the URL refers to a deactivated account using the HTTP status Codes + * + * @return true when the account is good + * @throws IOException when the account is deactivated + */ + private boolean isDeactivated() throws IOException { + Response res = Http.url(this.url).connection().followRedirects(true).referrer(this.referer) + .userAgent(this.userAgent).execute(); + return res.statusCode() != 200 ? true : false; - private String getUsernameFromURL(String u) { - Pattern p = Pattern.compile("^https?://www\\.deviantart\\.com/([a-zA-Z0-9\\-]+)/gallery/?(\\S+)?"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } - return null; + } - } + /** + * Stores logged in Cookies. Needed for art pieces only visible to logged in + * users. + * + * + * @throws IOException when failed to load webpage or failed to read/write + * cookies in file (used when running multiple instances of + * RipMe) + */ + private void login() throws IOException { - private String getFullsizedNSFWImage(String pageURL) { - try { - Document doc = Http.url(pageURL).cookies(cookies).get(); - String imageToReturn = ""; - String[] d = doc.select("img").attr("srcset").split(","); + String customUsername = Utils.getConfigString("DeviantartCustomLoginUsername", this.username); + String customPassword = Utils.getConfigString("DeviantartCustomLoginPassword", this.password); + try { + String dACookies = Utils.getConfigString(utilsKey, null); + updateCookie(dACookies != null ? 
deserialize(dACookies) : null); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + if (getDACookie() == null || !checkLogin()) { + LOGGER.info("Do Login now"); + // Do login now - String s = d[d.length -1].split(" ")[0]; - LOGGER.info("2:" + s); + // Load login page + Response res = Http.url("https://www.deviantart.com/users/login").connection().method(Method.GET) + .referrer(referer).userAgent(userAgent).execute(); - if (s == null || s.equals("")) { - LOGGER.error("Could not find full sized image at " + pageURL); - } - return s; - } catch (IOException e) { - LOGGER.error("Could not find full sized image at " + pageURL); - return null; - } - } + updateCookie(res.cookies()); - /** - * Gets first page. - * Will determine if login is supplied, - * if there is a login, then login and add that login cookies. - * Otherwise, just bypass the age gate with an anonymous flag. - * @return - * @throws IOException - */ - @Override - public JSONObject getFirstPage() throws IOException { - - // Base64 da login - // username: Z3JhYnB5 - // password: ZmFrZXJz + // Find tokens + Document doc = res.parse(); + Element form = doc.getElementById("login"); + String token = form.select("input[name=\"validate_token\"]").first().attr("value"); + String key = form.select("input[name=\"validate_key\"]").first().attr("value"); + LOGGER.info("Token: " + token + " & Key: " + key); + // Build Login Data + HashMap loginData = new HashMap(); + loginData.put("challenge", ""); + loginData.put("username", customUsername); + loginData.put("password", customPassword); + loginData.put("remember_me", "1"); + loginData.put("validate_token", token); + loginData.put("validate_key", key); + Map cookies = res.cookies(); - cookies = getDACookies(); - if (cookies.isEmpty()) { - LOGGER.warn("Failed to get login cookies"); - cookies.put("agegate_state","1"); // Bypasses the age gate - } - cookies.put("agegate_state", "1"); - - Response res = Http.url(this.url) - .cookies(cookies) - .response(); - Document page = res.parse(); + // Log in using data. 
Handle redirect + res = Http.url("https://www.deviantart.com/users/login").connection().referrer(referer).userAgent(userAgent) + .method(Method.POST).data(loginData).cookies(cookies).followRedirects(false).execute(); + updateCookie(res.cookies()); - JSONObject firstPageJSON = getFirstPageJSON(page); - requestID = firstPageJSON.getJSONObject("dapx").getString("requestid"); - galleryID = getGalleryID(page); - username = getUsernameFromURL(url.toExternalForm()); - csrf = firstPageJSON.getString("csrf"); - pageCookies = res.cookies(); + res = Http.url(res.header("location")).connection().referrer(referer).userAgent(userAgent) + .method(Method.GET).cookies(cookies).followRedirects(false).execute(); - return requestPage(0, galleryID, username, requestID, csrf, pageCookies); - } + // Store cookies + updateCookie(res.cookies()); - private JSONObject requestPage(int offset, String galleryID, String username, String requestID, String csfr, Map c) { - LOGGER.debug("offset: " + Integer.toString(offset)); - LOGGER.debug("galleryID: " + galleryID); - LOGGER.debug("username: " + username); - LOGGER.debug("requestID: " + requestID); - String url = baseApiUrl + galleryID + "?iid=" + requestID; - try { - Document doc = Http.url(url).cookies(c).data("username", username).data("offset", Integer.toString(offset)) - .data("limit", "24").data("_csrf", csfr).data("id", requestID) - .ignoreContentType().post(); - return new JSONObject(doc.body().text()); - } catch (IOException e) { - LOGGER.error("Got error trying to get page: " + e.getMessage()); - e.printStackTrace(); - return null; - } + // Write Cookie to file for other RipMe Instances or later use + Utils.setConfigString(utilsKey, serialize(new HashMap(getDACookie()))); + Utils.saveConfig(); // save now because of other instances that might work simultaneously + } else { + LOGGER.info("No new Login needed"); + } - } + LOGGER.info("DA Cookies: " + getDACookie()); + } - private JSONObject getFirstPageJSON(Document doc) { - for (Element js : doc.select("script")) { - if (js.html().contains("requestid")) { - String json = js.html().replaceAll("window.__initial_body_data=", "").replaceAll("\\);", "") - .replaceAll(";__wake\\(.+", ""); - JSONObject j = new JSONObject(json); - return j; - } - } - return null; - } + /** + * Returns next page Document using offset. 
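+	 * Each page holds 24 thumbnails, so the offset advances in steps of 24; when the response contains a "message" element the gallery is exhausted and an IOException("No more pages") ends the rip.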
+ */ + @Override + public Document getNextPage(Document doc) throws IOException { + this.offset += 24; + this.conn.url(urlWithParams(this.offset)).cookies(getDACookie()); + Response re = this.conn.execute(); +// Response re = Http.url(urlWithParams(this.offset)).cookies(getDACookie()).referrer(referer).userAgent(userAgent) +// .response(); + updateCookie(re.cookies()); + Document docu = re.parse(); + Elements messages = docu.getElementsByClass("message"); + LOGGER.info("Current Offset: " + this.offset); - public String getGalleryID(Document doc) { - // If the url contains catpath we return 0 as the DA api will provide all galery images if you sent the - // gallery id to 0 - if (url.toExternalForm().contains("catpath=")) { - return "0"; - } - Pattern p = Pattern.compile("^https?://www\\.deviantart\\.com/[a-zA-Z0-9\\-]+/gallery/([0-9]+)/?\\S+"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } - for (Element el : doc.select("input[name=set]")) { - try { - String galleryID = el.attr("value"); - return galleryID; - } catch (NullPointerException e) { - continue; - } - } - LOGGER.error("Could not find gallery ID"); - return null; - } + if (messages.size() > 0) { - public String getUsername(Document doc) { - return doc.select("meta[property=og:title]").attr("content") - .replaceAll("'s DeviantArt gallery", "").replaceAll("'s DeviantArt Gallery", ""); - } - + // if message exists -> last page + LOGGER.info("Messages amount: " + messages.size() + " - Next Page does not exists"); + throw new IOException("No more pages"); + } - @Override - public List getURLsFromJSON(JSONObject json) { - List imageURLs = new ArrayList<>(); - JSONArray results = json.getJSONObject("content").getJSONArray("results"); - for (int i = 0; i < results.length(); i++) { - Document doc = Jsoup.parseBodyFragment(results.getJSONObject(i).getString("html")); - if (doc.html().contains("ismature")) { - LOGGER.info("Downloading nsfw image"); - String nsfwImage = getFullsizedNSFWImage(doc.select("span").attr("href")); - if (nsfwImage != null && nsfwImage.startsWith("http")) { - imageURLs.add(nsfwImage); - } - } - try { - String imageURL = doc.select("span").first().attr("data-super-full-img"); - if (!imageURL.isEmpty() && imageURL.startsWith("http")) { - imageURLs.add(imageURL); - } - } catch (NullPointerException e) { - LOGGER.info(i + " does not contain any images"); - } + return Http.url(urlWithParams(this.offset)).referrer(referer).userAgent(userAgent).cookies(getDACookie()).get(); - } - return imageURLs; - } + } + /** + * Returns list of Links to the Image pages. NOT links to fullsize image!!! e.g. + * https://www.deviantart.com/kageuri/art/RUBY-568396655 + */ + @Override + protected List getURLsFromPage(Document page) { - @Override - public JSONObject getNextPage(JSONObject page) throws IOException { - boolean hasMore = page.getJSONObject("content").getBoolean("has_more"); - if (hasMore) { - return requestPage(page.getJSONObject("content").getInt("next_offset"), galleryID, username, requestID, csrf, pageCookies); - } + List result = new ArrayList(); - throw new IOException("No more pages"); - } + Element div; + if (usingCatPath) { + div = page.getElementById("gmi-"); - @Override - public boolean keepSortOrder() { - // Don't keep sort order (do not add prefixes). 
- // Causes file duplication, as outlined in https://github.com/4pr0n/ripme/issues/113 - return false; - } + } else { + div = page.getElementsByClass("folderview-art").first().child(0); - @Override - public void downloadURL(URL url, int index) { - addURLToDownload(url, getPrefix(index), "", this.url.toExternalForm(), cookies); - sleep(IMAGE_SLEEP_TIME); - } + } + Elements links = div.select("a.torpedo-thumb-link"); - /** - * Tries to get full size image from thumbnail URL - * @param thumb Thumbnail URL - * @param throwException Whether or not to throw exception when full size image isn't found - * @return Full-size image URL - * @throws Exception If it can't find the full-size URL - */ - private static String thumbToFull(String thumb, boolean throwException) throws Exception { - thumb = thumb.replace("http://th", "http://fc"); - List fields = new ArrayList<>(Arrays.asList(thumb.split("/"))); - fields.remove(4); - if (!fields.get(4).equals("f") && throwException) { - // Not a full-size image - throw new Exception("Can't get full size image from " + thumb); - } - StringBuilder result = new StringBuilder(); - for (int i = 0; i < fields.size(); i++) { - if (i > 0) { - result.append("/"); - } - result.append(fields.get(i)); - } - return result.toString(); - } + for (Element el : links) { + result.add(el.attr("href")); + } + LOGGER.info("Amount of Images on Page: " + result.size()); + LOGGER.info(page.location()); - /** - * If largest resolution for image at 'thumb' is found, starts downloading - * and returns null. - * If it finds a larger resolution on another page, returns the image URL. - * @param thumb Thumbnail URL - * @param page Page the thumbnail is retrieved from - * @return Highest-resolution version of the image based on thumbnail URL and the page. 
- */ - private String smallToFull(String thumb, String page) { - try { - // Fetch the image page - Response resp = Http.url(page) - .referrer(this.url) - .cookies(cookies) - .response(); - cookies.putAll(resp.cookies()); - Document doc = resp.parse(); - Elements els = doc.select("img.dev-content-full"); - String fsimage = null; - // Get the largest resolution image on the page - if (!els.isEmpty()) { - // Large image - fsimage = els.get(0).attr("src"); - LOGGER.info("Found large-scale: " + fsimage); - if (fsimage.contains("//orig")) { - return fsimage; - } - } - // Try to find the download button - els = doc.select("a.dev-page-download"); - if (!els.isEmpty()) { - // Full-size image - String downloadLink = els.get(0).attr("href"); - LOGGER.info("Found download button link: " + downloadLink); - HttpURLConnection con = (HttpURLConnection) new URL(downloadLink).openConnection(); - con.setRequestProperty("Referer",this.url.toString()); - String cookieString = ""; - for (Map.Entry entry : cookies.entrySet()) { - cookieString = cookieString + entry.getKey() + "=" + entry.getValue() + "; "; - } - cookieString = cookieString.substring(0,cookieString.length() - 1); - con.setRequestProperty("Cookie",cookieString); - con.setRequestProperty("User-Agent", USER_AGENT); - con.setInstanceFollowRedirects(true); - con.connect(); - int code = con.getResponseCode(); - String location = con.getURL().toString(); - con.disconnect(); - if (location.contains("//orig")) { - fsimage = location; - LOGGER.info("Found image download: " + location); - } - } - if (fsimage != null) { - return fsimage; - } - throw new IOException("No download page found"); - } catch (IOException ioe) { - try { - LOGGER.info("Failed to get full size download image at " + page + " : '" + ioe.getMessage() + "'"); - String lessThanFull = thumbToFull(thumb, false); - LOGGER.info("Falling back to less-than-full-size image " + lessThanFull); - return lessThanFull; - } catch (Exception e) { - return null; - } - } - } + return result; + } - /** - * Returns DA cookies. - * @return Map of cookies containing session data. - */ - private Map getDACookies() { - return RipUtils.getCookiesFromString(Utils.getConfigString("deviantart.cookies", loginCookies)); - } + /** + * Starts new Thread to find download link + filename + filetype + */ + @Override + protected void downloadURL(URL url, int index) { + this.downloadCount += 1; + LOGGER.info("Downloading URL Number " + this.downloadCount); + LOGGER.info("Deviant Art URL: " + url.toExternalForm()); + try { + Response re = Http.url(urlWithParams(this.offset)).cookies(getDACookie()).referrer(referer) + .userAgent(userAgent).response(); + updateCookie(re.cookies()); + } catch (IOException e) { + e.printStackTrace(); + } + + // Start Thread and add to pool. + DeviantartImageThread t = new DeviantartImageThread(url); + deviantartThreadPool.addThread(t); + + } + + @Override + public String normalizeUrl(String url) { + return (urlWithParams(this.offset).toExternalForm()); + } + + /** + * Returns name of album. 
Album name consists of 3 words: - Artist (owner of + * gallery) - Type (gallery or favorites folder) - Name of the folder + * + * Returns artist_type_name + */ + @Override + public String getGID(URL url) throws MalformedURLException { + + String s = url.toExternalForm(); + String artist = "unknown"; + String what = "unknown"; + String albumname = "unknown"; + + if (url.toExternalForm().contains("catpath=/")) { + this.usingCatPath = true; + } + + Pattern p = Pattern.compile("^https?://www.deviantart\\.com/([a-zA-Z0-9]+).*$"); + Matcher m = p.matcher(s); + + // Artist + if (m.matches()) { + artist = m.group(1); + } else { + throw new MalformedURLException("Expected deviantart.com URL format: " + + "www.deviantart.com//gallery//\nOR\nwww.deviantart.com//favourites//\\nOr simply the gallery or favorites of some artist - got " + + url + " instead"); + } + + // What is it + if (s.contains("/gallery/")) { + what = "gallery"; + } else if (s.contains("/favourites/")) { + what = "favourites"; + } else { + throw new MalformedURLException("Expected deviantart.com URL format: " + + "www.deviantart.com//gallery//\nOR\nwww.deviantart.com//favourites//\nOr simply the gallery or favorites of some artist - got " + + url + " instead"); + } + + // Album Name + Pattern artistP = Pattern + .compile("^https?://www.deviantart\\.com/[a-zA-Z0-9]+/[a-zA-Z]+/[0-9]+/([a-zA-Z0-9-]+).*$"); + Matcher artistM = artistP.matcher(s); + if (s.endsWith("?catpath=/")) { + albumname = "all"; + } else if (s.endsWith("/favourites/") || s.endsWith("/gallery/")) { + albumname = "featured"; + } else if (artistM.matches()) { + albumname = artistM.group(1); + } + LOGGER.info("Album Name: " + artist + "_" + what + "_" + albumname); + + return artist + "_" + what + "_" + albumname; + + } + + /** + * + * @return Clean URL as String + */ + private String cleanURL() { + return (this.url.toExternalForm().split("\\?"))[0]; + } + + /** + * Return correct url with params (catpath) and current offset + * + * @return URL to page with offset + */ + private URL urlWithParams(int offset) { + try { + String url = cleanURL(); + if (this.usingCatPath) { + return (new URL(url + "?catpath=/&offset=" + offset)); + } else { + return (new URL(url + "?offset=" + offset)); + } + } catch (MalformedURLException e) { + e.printStackTrace(); + } + return null; + } + + /** + * Returns Hashmap usable as Cookie for NSFW Artworks Not really needed but + * maybe useful later. + * + * @return Cookie Hashmap + */ + private Map getDACookie() { + return this.cookies; + } + + /** + * Updates cookies + * + * @param m new Cookies + */ + private void updateCookie(Map m) { + + if (m == null) { + return; + } + + Iterator iter = m.keySet().iterator(); + while (iter.hasNext()) { + String current = iter.next(); + if (!this.allowedCookies.contains(current)) { + // m.remove(current); + iter.remove(); + } + } + + LOGGER.info("Updating Cookies"); + LOGGER.info("Old Cookies: " + getDACookie() + " "); + LOGGER.info("New Cookies: " + m + " "); + this.cookies.putAll(m); + this.cookies.put("agegate_state", "1"); + LOGGER.info("Merged Cookies: " + getDACookie() + " "); + + try { + Utils.setConfigString(utilsKey, serialize(new HashMap(getDACookie()))); + Utils.saveConfig(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + /** + * Serializes an Object and returns a String ready to store Used to store + * cookies in the config file because the deviantart cookies contain all sort of + * special characters like ; , = : and so on. 
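+	 * For instance, a serialized cookie map becomes an opaque token starting with "rO0AB..." (the Base64 form of Java's serialization stream header), which is safe to store as a single config value.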
+ * + * @param o Object to serialize + * @return The serialized base64 encoded object + * @throws IOException + */ + private String serialize(Serializable o) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(baos); + oos.writeObject(o); + oos.close(); + return Base64.getEncoder().encodeToString(baos.toByteArray()); + } + + /** + * Recreates the object from the base64 encoded String. Used for Cookies + * + * @param s the Base64 encoded string + * @return the Cookie Map + * @throws IOException + * @throws ClassNotFoundException + */ + private Map deserialize(String s) throws IOException, ClassNotFoundException { + byte[] data = Base64.getDecoder().decode(s); + ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(data)); + HashMap o = (HashMap) ois.readObject(); // Unchecked cast here but should never + // be something else + ois.close(); + return o; + } + + /** + * Checks if the current cookies are still valid/usable. Also checks if agegate + * is given. + * + * @return True when all is good. + */ + private boolean checkLogin() { + if (!getDACookie().containsKey("agegate_state")) { + LOGGER.info("No agegate key"); + return false; + } else if (!getDACookie().get("agegate_state").equals("1")) { + LOGGER.info("Wrong agegate value"); + return false; + } + + try { + LOGGER.info("Login with Cookies: " + getDACookie()); + Response res = Http.url("https://www.deviantart.com/users/login").connection().followRedirects(true) + .cookies(getDACookie()).referrer(this.referer).userAgent(this.userAgent).execute(); + if (!res.url().toExternalForm().equals("https://www.deviantart.com/users/login")) { + LOGGER.info("Cookies are valid"); + LOGGER.info(res.url()); + return true; + } else { + LOGGER.info("Cookies invalid. Wrong URL: " + res.url()); + LOGGER.info(res.statusCode()); + LOGGER.info(res.parse()); + return false; + } + } catch (IOException e) { + e.printStackTrace(); + return false; + } + } + + /** + * Analyzes an image page like + * https://www.deviantart.com/kageuri/art/RUBY-568396655 . + * + * Looks for download button, follows the authentications and redirects and adds + * the Image URL to the download queue. If no download button is present it will + * use the largest version of the image. + * + * Should work with all filetypes on Deviantart. Tested with .JPG .PNG and .PDF + * + * @author MrPlaygon + * + */ + private class DeviantartImageThread extends Thread { + private URL url; + + public DeviantartImageThread(URL url) { + this.url = url; + } + + @Override + public void run() { + getFullSizeURL(); + } + + /** + * Get URL to Artwork and return fullsize URL with file ending. + * + * @param page Like + * https://www.deviantart.com/apofiss/art/warmest-of-the-days-455668450 + * @return URL like + * https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/intermediary/f/07f7a6bb-2d35-4630-93fc-be249af22b3e/d7jak0y-d20e5932-df72-4d13-b002-5e122037b373.jpg + * + * + */ + private void getFullSizeURL() { + + LOGGER.info("Searching max. Resolution for " + url); + sendUpdate(STATUS.LOADING_RESOURCE, "Searching max. 
resolution for " + url); + try { + Response re = Http.url(url).connection().referrer(referer).userAgent(userAgent).cookies(getDACookie()) + .execute(); + Document doc = re.parse(); + + // Artwork Title + String title = doc.select("a.title").first().html(); + title = title.replaceAll("[^a-zA-Z0-9\\.\\-]", "_").toLowerCase(); + + int counter = 1; + if (names.contains(title)) { + while (names.contains(title + "_" + counter)) { + counter++; + } + title = title + "_" + counter; + } + names.add(title); + + // Check for download button + Element downloadButton = null; + + downloadButton = doc.select("a.dev-page-download").first(); + + // Download Button + if (downloadButton != null) { + LOGGER.info("Download Button found: " + downloadButton.attr("href")); + + Response download = Http.url(downloadButton.attr("href")).connection().cookies(getDACookie()) + .method(Method.GET).referrer(referer).userAgent(userAgent).ignoreContentType(true) + .followRedirects(true).execute(); + URL location = download.url(); + + System.out.println("----------------> " + url); + String[] filetypePart = download.header("Content-Disposition").split("\\."); + + LOGGER.info("Found Image URL"); + LOGGER.info(url); + LOGGER.info(location); + + addURLToDownload(location, "", "", "", getDACookie(), + title + "." + filetypePart[filetypePart.length - 1]); + return; + } + + // No Download Button + Element div = doc.select("div.dev-view-deviation").first(); + + Element image = div.getElementsByTag("img").first(); + + String source = ""; + if (image == null) { + LOGGER.error("ERROR on " + url); + + LOGGER.error("Cookies: " + getDACookie() + " "); + LOGGER.error(div); + sendUpdate(STATUS.DOWNLOAD_ERRORED, "ERROR at\n" + url); + return; + } + + // When it is text art (e.g. story) the only image is the avator (profile + // picture) + if (image.hasClass("avatar")) { + LOGGER.error("No Image found, probably text art"); + LOGGER.error(url); + return; + } + + source = image.attr("src"); + + String[] parts = source.split("/v1/"); + + // Image page uses scaled down version. Split at /v1/ to receive max size. + if (parts.length > 2) { + LOGGER.error("Unexpected URL Format"); + sendUpdate(STATUS.DOWNLOAD_ERRORED, "Unexpected URL Format"); + return; + } + + String[] tmpParts = parts[0].split("\\."); + + LOGGER.info("Found Image URL"); + LOGGER.info(url); + LOGGER.info(parts[0]); + while (Http.url(parts[0]).connection().execute().statusCode() == 404) { + try { + LOGGER.error("404 on " + url); + Thread.sleep(1000); + } catch (Exception e) { + e.printStackTrace(); + } + } + addURLToDownload(new URL(parts[0]), "", "", "", new HashMap(), + title + "." 
+ tmpParts[tmpParts.length - 1]); + return; + + } catch (IOException e) { + e.printStackTrace(); + } + + LOGGER.error("No Full Size URL for: " + url); + sendUpdate(STATUS.DOWNLOAD_ERRORED, "No image found for " + url); + + return; + + } + } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java index 864a730a..534a1d0d 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/E621Ripper.java @@ -6,8 +6,6 @@ import com.rarchives.ripme.utils.Http; import com.rarchives.ripme.utils.Utils; import java.io.IOException; import java.net.MalformedURLException; -import java.net.URI; -import java.net.URISyntaxException; import java.net.URL; import java.util.ArrayList; import java.util.List; @@ -18,136 +16,154 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -public class E621Ripper extends AbstractHTMLRipper{ - private static final Logger logger = Logger.getLogger(E621Ripper.class); +public class E621Ripper extends AbstractHTMLRipper { + private static final Logger logger = Logger.getLogger(E621Ripper.class); - private static Pattern gidPattern=null; - private static Pattern gidPattern2=null; - private static Pattern gidPatternPool=null; + private static Pattern gidPattern = null; + private static Pattern gidPattern2 = null; + private static Pattern gidPatternPool = null; - private DownloadThreadPool e621ThreadPool=new DownloadThreadPool("e621"); + private DownloadThreadPool e621ThreadPool = new DownloadThreadPool("e621"); - public E621Ripper(URL url) throws IOException { - super(url); - } - - @Override - public DownloadThreadPool getThreadPool() { - return e621ThreadPool; - } - - @Override - public String getDomain() { - return "e621.net"; - } - - @Override - public String getHost() { - return "e621"; - } - - @Override - public Document getFirstPage() throws IOException { - if(url.getPath().startsWith("/pool/show/")) - return Http.url("https://e621.net/pool/show/"+getTerm(url)).get(); - else - return Http.url("https://e621.net/post/index/1/"+getTerm(url)).get(); - } - - private String getFullSizedImage(String url) { - try { - Document page = Http.url("https://e621.net" + url).get(); - Elements video = page.select("video > source"); - Elements flash = page.select("embed"); - Elements image = page.select("a#highres"); - if (video.size() > 0) { - return video.attr("src"); - } else if (flash.size() > 0) { - return flash.attr("src"); - } else if (image.size() > 0) { - return image.attr("href"); - } else { - throw new IOException(); - } - } catch (IOException e) { - logger.error("Unable to get full sized image from " + url); - return null; - } + public E621Ripper(URL url) throws IOException { + super(url); } - @Override - public List getURLsFromPage(Document page) { - Elements elements = page.select("div > span.thumb > a"); - List res = new ArrayList<>(); + @Override + public DownloadThreadPool getThreadPool() { + return e621ThreadPool; + } - for(Element e:elements) { - if (!e.attr("href").isEmpty()) { - String fullSizedImage = getFullSizedImage(e.attr("href")); - if (fullSizedImage != null && !fullSizedImage.equals("")) { - res.add(getFullSizedImage(e.attr("href"))); - } + @Override + public String getDomain() { + return "e621.net"; + } + + @Override + public String getHost() { + return "e621"; + } + + @Override + public Document getFirstPage() throws 
IOException { + if (url.getPath().startsWith("/pool/show/")) + return Http.url("https://e621.net/pool/show/" + getTerm(url)).get(); + else + return Http.url("https://e621.net/post/index/1/" + getTerm(url)).get(); + } + + @Override + public List<String> getURLsFromPage(Document page) { + Elements elements = page.select("div > span.thumb > a"); + List<String> res = new ArrayList<>(); + + for (Element e : elements) { + if (!e.attr("href").isEmpty()) { + res.add(e.attr("abs:href")); } - } + } - return res; - } + return res; + } - @Override - public Document getNextPage(Document page) throws IOException { - if (page.select("a.next_page") != null) { - return Http.url("https://e621.net" + page.select("a.next_page").attr("href")).get(); + @Override + public Document getNextPage(Document page) throws IOException { + if (!page.select("a.next_page").isEmpty()) { + return Http.url(page.select("a.next_page").attr("abs:href")).get(); } else { - throw new IOException("No more pages"); + throw new IOException("No more pages."); } } - @Override - public void downloadURL(final URL url, int index) { - addURLToDownload(url, getPrefix(index)); - } + @Override + public void downloadURL(final URL url, int index) { + e621ThreadPool.addThread(new E621FileThread(url, getPrefix(index))); + } - private String getTerm(URL url) throws MalformedURLException{ - if(gidPattern==null) - gidPattern=Pattern.compile("^https?://(www\\.)?e621\\.net/post/index/[^/]+/([a-zA-Z0-9$_.+!*'():,%\\-]+)(/.*)?(#.*)?$"); - if(gidPatternPool==null) - gidPatternPool=Pattern.compile("^https?://(www\\.)?e621\\.net/pool/show/([a-zA-Z0-9$_.+!*'(),%:\\-]+)(\\?.*)?(/.*)?(#.*)?$"); + private String getTerm(URL url) throws MalformedURLException { + if (gidPattern == null) + gidPattern = Pattern.compile( + "^https?://(www\\.)?e621\\.net/post/index/[^/]+/([a-zA-Z0-9$_.+!*'():,%\\-]+)(/.*)?(#.*)?$"); + if (gidPatternPool == null) + gidPatternPool = Pattern.compile( + "^https?://(www\\.)?e621\\.net/pool/show/([a-zA-Z0-9$_.+!*'(),%:\\-]+)(\\?.*)?(/.*)?(#.*)?$"); - Matcher m = gidPattern.matcher(url.toExternalForm()); - if(m.matches()) { + Matcher m = gidPattern.matcher(url.toExternalForm()); + if (m.matches()) { LOGGER.info(m.group(2)); return m.group(2); } - m = gidPatternPool.matcher(url.toExternalForm()); - if(m.matches()) { + m = gidPatternPool.matcher(url.toExternalForm()); + if (m.matches()) { return m.group(2); } - throw new MalformedURLException("Expected e621.net URL format: e621.net/post/index/1/searchterm - got "+url+" instead"); + throw new MalformedURLException( + "Expected e621.net URL format: e621.net/post/index/1/searchterm - got " + url + " instead"); } - @Override - public String getGID(URL url) throws MalformedURLException { + @Override + public String getGID(URL url) throws MalformedURLException { + String prefix = ""; + if (url.getPath().startsWith("/pool/show/")) { + prefix = "pool_"; + } + return Utils.filesystemSafe(prefix + getTerm(url)); + } - String prefix=""; - if (url.getPath().startsWith("/pool/show/")) { - prefix = "pool_"; + @Override + public URL sanitizeURL(URL url) throws MalformedURLException { + if (gidPattern2 == null) + gidPattern2 = Pattern.compile( + "^https?://(www\\.)?e621\\.net/post/search\\?tags=([a-zA-Z0-9$_.+!*'():,%-]+)(/.*)?(#.*)?$"); + + Matcher m = gidPattern2.matcher(url.toExternalForm()); + if (m.matches()) + return new URL("https://e621.net/post/index/1/" + m.group(2).replace("+", "%20")); + + return url; + } + + public class E621FileThread extends Thread { + 
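+ // Each queued post URL is resolved to its full-sized media on a worker thread from the ripper's DownloadThreadPool, so slow per-image page fetches no longer block pagination. Hypothetical usage sketch (the post URL and prefix here are illustrative only): + //   e621ThreadPool.addThread(new E621FileThread(new URL("https://e621.net/post/show/12345"), "001_"));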
+ private URL url; + private String index; + + public E621FileThread(URL url, String index) { + this.url = url; + this.index = index; + } + + @Override + public void run() { + try { + String fullSizedImage = getFullSizedImage(url); + if (fullSizedImage != null && !fullSizedImage.equals("")) { + addURLToDownload(new URL(fullSizedImage), index); + } + } catch (IOException e) { + logger.error("Unable to get full sized image from " + url); + } + } + + private String getFullSizedImage(URL imageURL) throws IOException { + Document page = Http.url(imageURL).retries(3).get(); + Elements video = page.select("video > source"); + Elements flash = page.select("embed"); + Elements image = page.select("a#highres"); + if (video.size() > 0) { + return video.attr("src"); + } else if (flash.size() > 0) { + return flash.attr("src"); + } else if (image.size() > 0) { + return image.attr("href"); + } else { + throw new IOException("Unable to find full-sized media on " + imageURL); } - return Utils.filesystemSafe(prefix+getTerm(url)); - - } - - @Override - public URL sanitizeURL(URL url) throws MalformedURLException { - if(gidPattern2==null) - gidPattern2=Pattern.compile("^https?://(www\\.)?e621\\.net/post/search\\?tags=([a-zA-Z0-9$_.+!*'():,%-]+)(/.*)?(#.*)?$"); - - Matcher m = gidPattern2.matcher(url.toExternalForm()); - if(m.matches()) - return new URL("https://e621.net/post/index/1/"+m.group(2).replace("+","%20")); - - return url; - } + } + } } diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/EightmusesRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/EightmusesRipper.java index 7c4d15c1..22968216 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/EightmusesRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/EightmusesRipper.java @@ -115,8 +115,7 @@ public class EightmusesRipper extends AbstractHTMLRipper { String image = null; if (thumb.hasAttr("data-cfsrc")) { image = thumb.attr("data-cfsrc"); - } - else { + } else { // Deobfuscate the JSON data String rawJson = deobfuscateJSON(page.select("script#ractive-public").html() .replaceAll("&gt;", ">").replaceAll("&lt;", "<").replace("&amp;", "&")); @@ -125,17 +124,16 @@ public class EightmusesRipper extends AbstractHTMLRipper { for (int i = 0; i != json.getJSONArray("pictures").length(); i++) { image = "https://www.8muses.com/image/fl/" + json.getJSONArray("pictures").getJSONObject(i).getString("publicUri"); URL imageUrl = new URL(image); - if (Utils.getConfigBoolean("8muses.use_short_names", false)) { - addURLToDownload(imageUrl, getPrefixShort(x), getSubdir(page.select("title").text()), this.url.toExternalForm(), cookies, "", null, true); - } else { - addURLToDownload(imageUrl, getPrefixLong(x), getSubdir(page.select("title").text()), this.url.toExternalForm(), cookies, "", null, true); - } + addURLToDownload(imageUrl, getPrefixShort(x), getSubdir(page.select("title").text()), this.url.toExternalForm(), cookies, "", null, true); // X is our page index x++; + if (isThisATest()) { + break; + } } - - } catch (IOException e) { - continue; + return imageURLs; + } catch (MalformedURLException e) { + LOGGER.error("\"" + image + "\" is malformed"); } } if (!image.contains("8muses.com")) { diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ErofusRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ErofusRipper.java new file mode 100644 index 00000000..dc535dea --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ErofusRipper.java @@ -0,0 +1,119 @@ +package com.rarchives.ripme.ripper.rippers; + +import 
com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.ui.RipStatusMessage; +import com.rarchives.ripme.utils.Http; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class ErofusRipper extends AbstractHTMLRipper { + + public ErofusRipper(URL url) throws IOException { + super(url); + } + + @Override + public boolean hasASAPRipping() { + return true; + } + + @Override + public String getHost() { + return "erofus"; + } + + @Override + public String getDomain() { + return "erofus.com"; + } + + @Override + public String getGID(URL url) throws MalformedURLException { + Pattern p = Pattern.compile("^https://www.erofus.com/comics/([a-zA-Z0-9\\-_]+).*$"); + Matcher m = p.matcher(url.toExternalForm()); + if (!m.matches()) { + throw new MalformedURLException("Expected URL format: https://www.erofus.com/comics/albumname, got: " + url); + } + return m.group(m.groupCount()); + } + + @Override + public Document getFirstPage() throws IOException { + return Http.url(url).get(); + } + + @Override + public List<String> getURLsFromPage(Document page) { + // Images are queued as they are found (ASAP ripping), so this list stays empty. + List<String> imageURLs = new ArrayList<>(); + if (pageContainsImages(page)) { + LOGGER.info("Page contains images"); + ripAlbum(page); + } else { + // Links to the albums/subalbums listed on this page + Elements pageImages = page.select("a.a-click"); + for (Element pageLink : pageImages) { + if (super.isStopped()) break; + if (pageLink.attr("href").contains("comics")) { + String subUrl = "https://erofus.com" + pageLink.attr("href"); + try { + LOGGER.info("Retrieving " + subUrl); + sendUpdate(RipStatusMessage.STATUS.LOADING_RESOURCE, subUrl); + Document subPage = Http.url(subUrl).get(); + getURLsFromPage(subPage); + } catch (IOException e) { + LOGGER.warn("Error while loading subalbum " + subUrl, e); + } + } + if (isThisATest()) break; + } + } + + return imageURLs; + } + + public void ripAlbum(Document page) { + int x = 1; + Elements thumbs = page.select("a.a-click > div.thumbnail > img"); + for (Element thumb : thumbs) { + String image = "https://www.erofus.com" + thumb.attr("src").replaceAll("thumb", "medium"); + try { + Map<String, String> opts = new HashMap<>(); + opts.put("subdirectory", page.title().replaceAll(" \\| Erofus - Sex and Porn Comics", "").replaceAll(" ", "_")); + opts.put("prefix", getPrefix(x)); + addURLToDownload(new URL(image), opts); + } catch (MalformedURLException e) { + LOGGER.info(e.getMessage()); + } + x++; + } + } + + private boolean pageContainsImages(Document page) { + Elements pageImages = page.select("a.a-click"); + for (Element pageLink : pageImages) { + if (pageLink.attr("href").contains("/pic/")) { + return true; + } + } + return false; + } + + @Override + public void downloadURL(URL url, int index) { + addURLToDownload(url, getPrefix(index)); + } +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/EromeRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/EromeRipper.java index 39098f98..9b586b9a 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/EromeRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/EromeRipper.java @@ -51,7 +51,7 @@ public class EromeRipper extends 
AbstractHTMLRipper { @Override public boolean pageContainsAlbums(URL url) { - Pattern pa = Pattern.compile("https?://www.erome.com/([a-zA-Z0-9_-]*)/?"); + Pattern pa = Pattern.compile("https?://www.erome.com/([a-zA-Z0-9_\\-?=]*)/?"); Matcher ma = pa.matcher(url.toExternalForm()); return ma.matches(); } @@ -111,7 +111,7 @@ public class EromeRipper extends AbstractHTMLRipper { return m.group(1); } - p = Pattern.compile("^https?://www.erome.com/([a-zA-Z0-9_-]+)/?$"); + p = Pattern.compile("^https?://www.erome.com/([a-zA-Z0-9_\\-?=]+)/?$"); m = p.matcher(url.toExternalForm()); if (m.matches()) { diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/FuraffinityRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/FuraffinityRipper.java index 26699c2a..683c791b 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/FuraffinityRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/FuraffinityRipper.java @@ -219,16 +219,21 @@ public class FuraffinityRipper extends AbstractHTMLRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern - .compile("^https?://www\\.furaffinity\\.net/gallery/([-_.0-9a-zA-Z]+).*$"); + // Gallery + Pattern p = Pattern.compile("^https?://www\\.furaffinity\\.net/gallery/([-_.0-9a-zA-Z]+).*$"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { return m.group(1); } - throw new MalformedURLException("Expected furaffinity.net URL format: " - + "www.furaffinity.net/gallery/username - got " + url - + " instead"); + // Scraps + p = Pattern.compile("^https?://www\\.furaffinity\\.net/scraps/([-_.0-9a-zA-Z]+).*$"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + + throw new MalformedURLException("Unable to find images in " + url); } diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/FuskatorRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/FuskatorRipper.java index 45ce2b92..d88b16e8 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/FuskatorRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/FuskatorRipper.java @@ -1,23 +1,30 @@ package com.rarchives.ripme.ripper.rippers; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; -import java.net.URLDecoder; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.json.JSONArray; +import org.json.JSONObject; +import org.jsoup.Connection.Method; +import org.jsoup.Connection.Response; import org.jsoup.nodes.Document; import com.rarchives.ripme.ripper.AbstractHTMLRipper; import com.rarchives.ripme.utils.Http; -import com.rarchives.ripme.utils.Utils; public class FuskatorRipper extends AbstractHTMLRipper { + private String jsonurl = "https://fuskator.com/ajax/gal.aspx"; + private String xAuthUrl = "https://fuskator.com/ajax/auth.aspx"; + private String xAuthToken; + private Map<String, String> cookies; + public FuskatorRipper(URL url) throws IOException { super(url); } @@ -26,6 +33,7 @@ public class FuskatorRipper extends AbstractHTMLRipper { public String getHost() { return "fuskator"; } + @Override public String getDomain() { return "fuskator.com"; @@ -37,45 +45,55 @@ public class FuskatorRipper extends AbstractHTMLRipper { if (u.contains("/thumbs/")) { u = u.replace("/thumbs/", "/full/"); } + if (u.contains("/expanded/")) { + u = u.replaceAll("/expanded/", "/full/"); + } return new URL(u); } 
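+ // Sketch of the token handshake used below (flow inferred from this class): getFirstPage() keeps the session cookies, getXAuthToken() POSTs them to /ajax/auth.aspx to obtain an X-Auth token, and getURLsFromPage() sends the cookies, token, and gallery hash to /ajax/gal.aspx to fetch the image list as JSON.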
@Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^.*fuskator.com/full/([a-zA-Z0-9\\-]+).*$"); + Pattern p = Pattern.compile("^.*fuskator.com/full/([a-zA-Z0-9\\-~]+).*$"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { return m.group(1); } throw new MalformedURLException( - "Expected fuskator.com gallery formats: " - + "fuskator.com/full/id/..." - + " Got: " + url); + "Expected fuskator.com gallery formats: " + "fuskator.com/full/id/..." + " Got: " + url); } @Override public Document getFirstPage() throws IOException { - return Http.url(url).get(); + Response res = Http.url(url).response(); + cookies = res.cookies(); + return res.parse(); } @Override public List<String> getURLsFromPage(Document doc) { List<String> imageURLs = new ArrayList<>(); - String html = doc.html(); - // Get "baseUrl" - String baseUrl = Utils.between(html, "unescape('", "'").get(0); + JSONObject json; + try { - baseUrl = URLDecoder.decode(baseUrl, "UTF-8"); - } catch (UnsupportedEncodingException e) { - LOGGER.warn("Error while decoding " + baseUrl, e); + getXAuthToken(); + if (xAuthToken == null || xAuthToken.isEmpty()) { + throw new IOException("No xAuthToken found."); + } + + // All good. Fetch the JSON data from jsonurl. + json = Http.url(jsonurl).cookies(cookies).data("X-Auth", xAuthToken).data("hash", getGID(url)) + .data("_", Long.toString(System.currentTimeMillis())).getJSON(); + } catch (IOException e) { + LOGGER.error("Couldn't fetch images.", e); + return imageURLs; } - if (baseUrl.startsWith("//")) { - baseUrl = "http:" + baseUrl; - } - // Iterate over images - for (String filename : Utils.between(html, "+'", "'")) { - imageURLs.add(baseUrl + filename); + + JSONArray imageArray = json.getJSONArray("images"); + for (int i = 0; i < imageArray.length(); i++) { + imageURLs.add("https:" + imageArray.getJSONObject(i).getString("imageUrl")); } + return imageURLs; } @@ -83,4 +101,12 @@ public class FuskatorRipper extends AbstractHTMLRipper { public void downloadURL(URL url, int index) { addURLToDownload(url, getPrefix(index)); } + + private void getXAuthToken() throws IOException { + if (cookies == null || cookies.isEmpty()) { + throw new IOException("Null cookies or no cookies found."); + } + Response res = Http.url(xAuthUrl).cookies(cookies).method(Method.POST).response(); + xAuthToken = res.body(); + } } diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java index 9c2db859..52a19b74 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/GfycatRipper.java @@ -9,16 +9,24 @@ import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -import com.rarchives.ripme.ripper.AbstractSingleFileRipper; +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import org.json.JSONArray; +import org.json.JSONObject; import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.rarchives.ripme.utils.Http; -public class GfycatRipper extends AbstractSingleFileRipper { +public class GfycatRipper extends AbstractHTMLRipper { private static final String HOST = "gfycat.com"; + String username = ""; + String cursor = ""; + String count = "30"; + + public GfycatRipper(URL url) throws IOException { super(url); @@ -45,10 +53,20 @@ public class GfycatRipper extends AbstractSingleFileRipper { return url; } 
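+ // URL-shape check, inferred from the patterns below (examples are illustrative): + //   https://gfycat.com/@someuser   -> profile, ripped via the api.gfycat.com v1 endpoint and paged by cursor + //   https://gfycat.com/SomeGfyName -> single video page, scraped from the embedded JSON metadata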
+ public boolean isProfile() { + Pattern p = Pattern.compile("^https?://[wm.]*gfycat\\.com/@([a-zA-Z0-9]+).*$"); + Matcher m = p.matcher(url.toExternalForm()); + return m.matches(); + } @Override public Document getFirstPage() throws IOException { - return Http.url(url).get(); + if (!isProfile()) { + return Http.url(url).get(); + } else { + username = getGID(url); + return Http.url(new URL("https://api.gfycat.com/v1/users/" + username + "/gfycats")).ignoreContentType().get(); + } } @Override @@ -58,7 +76,7 @@ public class GfycatRipper extends AbstractSingleFileRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://[wm.]*gfycat\\.com/([a-zA-Z0-9]+).*$"); + Pattern p = Pattern.compile("^https?://[wm.]*gfycat\\.com/@?([a-zA-Z0-9]+).*$"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { return m.group(1); @@ -70,15 +88,45 @@ public class GfycatRipper extends AbstractSingleFileRipper { + " Got: " + url); } + // Jsoup wraps the raw API JSON in an HTML shell; strip it so the JSON parser sees plain text. + private String stripHTMLTags(String t) { + t = t.replaceAll("<html>\n" + " <head></head>\n" + " <body>", ""); + t = t.replaceAll("</body>\n" + "</html>", ""); + t = t.replaceAll("\n", ""); + t = t.replaceAll("=\"\"", ""); + return t; + } + + @Override + public Document getNextPage(Document doc) throws IOException { + if (cursor.equals("")) { + throw new IOException("No more pages"); + } + return Http.url(new URL("https://api.gfycat.com/v1/users/" + username + "/gfycats?count=" + count + "&cursor=" + cursor)).ignoreContentType().get(); + } + @Override public List<String> getURLsFromPage(Document doc) { List<String> result = new ArrayList<>(); - Elements videos = doc.select("source"); - String vidUrl = videos.first().attr("src"); - if (vidUrl.startsWith("//")) { - vidUrl = "http:" + vidUrl; + if (isProfile()) { + JSONObject page = new JSONObject(stripHTMLTags(doc.html())); + JSONArray content = page.getJSONArray("gfycats"); + for (int i = 0; i < content.length(); i++) { + result.add(content.getJSONObject(i).getString("mp4Url")); + } + cursor = page.getString("cursor"); + } else { + Elements videos = doc.select("script"); + for (Element el : videos) { + String json = el.html(); + if (json.startsWith("{")) { + JSONObject page = new JSONObject(json); + result.add(page.getJSONObject("video").getString("contentUrl")); + } + } } - result.add(vidUrl); return result; } @@ -95,14 +143,14 @@ public class GfycatRipper extends AbstractSingleFileRipper { url = new URL(url.toExternalForm().replace("/gifs/detail", "")); Document doc = Http.url(url).get(); - Elements videos = doc.select("source"); - if (videos.isEmpty()) { - throw new IOException("Could not find source at " + url); + Elements videos = doc.select("script"); + for (Element el : videos) { + String json = el.html(); + if (json.startsWith("{")) { + JSONObject page = new JSONObject(json); + return page.getJSONObject("video").getString("contentUrl"); + } } - String vidUrl = videos.first().attr("src"); - if (vidUrl.startsWith("//")) { - vidUrl = "http:" + vidUrl; - } - return vidUrl; + throw new IOException("Could not find the video URL at " + url); } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoundryRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoundryRipper.java index e9c1a810..d4482b4c 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoundryRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoundryRipper.java @@ -10,6 +10,7 @@ import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; +import 
com.rarchives.ripme.utils.Utils; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.nodes.Document; @@ -53,8 +54,8 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { Response resp; Document doc; - resp = Http.url("http://www.hentai-foundry.com/?enterAgree=1&size=1500") - .referrer("http://www.hentai-foundry.com/") + resp = Http.url("https://www.hentai-foundry.com/?enterAgree=1&size=1500") + .referrer("https://www.hentai-foundry.com/") .cookies(cookies) .response(); // The only cookie that seems to matter in getting around the age wall is the phpsession cookie @@ -86,11 +87,11 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { data.put("rating_incest" , "1"); data.put("rating_rape" , "1"); data.put("filter_media" , "A"); - data.put("filter_order" , "date_new"); + data.put("filter_order" , Utils.getConfigString("hentai-foundry.filter_order","date_old")); data.put("filter_type" , "0"); - resp = Http.url("http://www.hentai-foundry.com/site/filters") - .referrer("http://www.hentai-foundry.com/") + resp = Http.url("https://www.hentai-foundry.com/site/filters") + .referrer("https://www.hentai-foundry.com/") .cookies(cookies) .data(data) .method(Method.POST) @@ -102,7 +103,7 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { } resp = Http.url(url) - .referrer("http://www.hentai-foundry.com/") + .referrer("https://www.hentai-foundry.com/") .cookies(cookies) .response(); cookies.putAll(resp.cookies()); @@ -119,7 +120,7 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { Element first = els.first(); try { String nextURL = first.attr("href"); - nextURL = "http://www.hentai-foundry.com" + nextURL; + nextURL = "https://www.hentai-foundry.com" + nextURL; return Http.url(nextURL) .referrer(url) .cookies(cookies) @@ -135,8 +136,8 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { // this if is for ripping pdf stories if (url.toExternalForm().contains("/stories/")) { for (Element pdflink : doc.select("a.pdfLink")) { - LOGGER.info("grabbing " + "http://www.hentai-foundry.com" + pdflink.attr("href")); - imageURLs.add("http://www.hentai-foundry.com" + pdflink.attr("href")); + LOGGER.info("grabbing " + "https://www.hentai-foundry.com" + pdflink.attr("href")); + imageURLs.add("https://www.hentai-foundry.com" + pdflink.attr("href")); } return imageURLs; } @@ -153,8 +154,8 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { Document imagePage; try { - LOGGER.info("grabbing " + "http://www.hentai-foundry.com" + thumb.attr("href")); - imagePage = Http.url("http://www.hentai-foundry.com" + thumb.attr("href")).cookies(cookies).get(); + LOGGER.info("grabbing " + "https://www.hentai-foundry.com" + thumb.attr("href")); + imagePage = Http.url("https://www.hentai-foundry.com" + thumb.attr("href")).cookies(cookies).get(); } catch (IOException e) { @@ -164,10 +165,10 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { } // This is here for when the image is resized to a thumbnail because ripme doesn't report a screensize if (imagePage.select("div.boxbody > img.center").attr("src").contains("thumbs.")) { - imageURLs.add("http:" + imagePage.select("div.boxbody > img.center").attr("onclick").replace("this.src=", "").replace("'", "").replace("; $(#resize_message).hide();", "")); + imageURLs.add("https:" + imagePage.select("div.boxbody > img.center").attr("onclick").replace("this.src=", "").replace("'", "").replace("; $(#resize_message).hide();", "")); } else { - imageURLs.add("http:" + 
imagePage.select("div.boxbody > img.center").attr("src")); + imageURLs.add("https:" + imagePage.select("div.boxbody > img.center").attr("src")); } } return imageURLs; @@ -179,7 +180,12 @@ public class HentaifoundryRipper extends AbstractHTMLRipper { if (url.toExternalForm().endsWith(".pdf")) { addURLToDownload(url, getPrefix(index), "", this.url.toExternalForm(), cookies); } else { - addURLToDownload(url, getPrefix(index)); + // If hentai-foundry.use_prefix is false, the ripper will not add a numbered prefix to any images + if (Utils.getConfigBoolean("hentai-foundry.use_prefix", true)) { + addURLToDownload(url, getPrefix(index)); + } else { + addURLToDownload(url, ""); + } } } diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoxRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoxRipper.java new file mode 100644 index 00000000..a4e5895d --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/HentaifoxRipper.java @@ -0,0 +1,78 @@ +package com.rarchives.ripme.ripper.rippers; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.utils.Http; + +public class HentaifoxRipper extends AbstractHTMLRipper { + + public HentaifoxRipper(URL url) throws IOException { + super(url); + } + + @Override + public String getHost() { + return "hentaifox"; + } + + @Override + public String getDomain() { + return "hentaifox.com"; + } + + @Override + public String getGID(URL url) throws MalformedURLException { + Pattern p = Pattern.compile("https://hentaifox.com/gallery/([\\d]+)/?"); + Matcher m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + throw new MalformedURLException("Expected hentaifox URL format: " + + "https://hentaifox.com/gallery/ID - got " + url + " instead"); + } + + @Override + public Document getFirstPage() throws IOException { + // "url" is an instance field of the superclass + return Http.url(url).get(); + } + + @Override + public List<String> getURLsFromPage(Document doc) { + List<String> result = new ArrayList<>(); + for (Element el : doc.select("div.preview_thumb > a > img")) { + String imageSource = "https:" + el.attr("data-src").replaceAll("t\\.jpg", ".jpg"); + result.add(imageSource); + } + return result; + } + + @Override + public String getAlbumTitle(URL url) throws MalformedURLException { + try { + Document doc = getFirstPage(); + String title = doc.select("div.info > h1").first().text(); + return getHost() + "_" + title + "_" + getGID(url); + } catch (Exception e) { + // Fall back to default album naming convention + LOGGER.warn("Failed to get album title from " + url, e); + } + return super.getAlbumTitle(url); + } + + @Override + public void downloadURL(URL url, int index) { + addURLToDownload(url, getPrefix(index)); + } +} diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java index 07a6e529..f097e667 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java @@ -145,15 +145,13 @@ public class ImagefapRipper extends AbstractHTMLRipper { try { // Attempt to use album title as GID String title = 
getFirstPage().title(); - Pattern p = Pattern.compile("^Porn pics of (.*) \\(Page 1\\)$"); - Matcher m = p.matcher(title); - if (m.matches()) { - return getHost() + "_" + m.group(1) + "_" + getGID(url); - } + title = title.replace("Porn Pics & Porn GIFs", ""); + title = title.replace(" ", "_"); + String toReturn = getHost() + "_" + title + "_" + getGID(url); + return toReturn.replaceAll("__", "_"); } catch (IOException e) { - // Fall back to default album naming convention + return super.getAlbumTitle(url); } - return super.getAlbumTitle(url); } private String getFullSizedImage(String pageURL) { diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 37e27214..d0f8dd9a 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -39,9 +39,6 @@ public class InstagramRipper extends AbstractJSONRipper { private String userID; private String rhx_gis = null; private String csrftoken; - // Run into a weird issue with Jsoup cutting some json pages in half, this is a work around - // see https://github.com/RipMeApp/ripme/issues/601 - private String workAroundJsonString; @@ -192,6 +189,9 @@ public class InstagramRipper extends AbstractJSONRipper { Document p = resp.parse(); // Get the query hash so we can download the next page qHash = getQHash(p); + if (qHash == null) { + throw new IOException("Unable to extract qhash from page"); + } return getJSONFromPage(p); } @@ -398,7 +398,6 @@ public class InstagramRipper extends AbstractJSONRipper { } private boolean pageHasImages(JSONObject json) { - LOGGER.info(json); int numberOfImages = json.getJSONObject("data").getJSONObject("user") .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length(); if (numberOfImages == 0) { @@ -422,23 +421,36 @@ public class InstagramRipper extends AbstractJSONRipper { } in.close(); - workAroundJsonString = sb.toString(); return new JSONObject(sb.toString()); } catch (MalformedURLException e) { - LOGGER.info("Unable to get query_hash, " + url + " is a malformed URL"); + LOGGER.info("Unable to get page, " + url + " is a malformed URL"); return null; } catch (IOException e) { - LOGGER.info("Unable to get query_hash"); + LOGGER.info("Unable to get page"); LOGGER.info(e.getMessage()); return null; } } + private String getQhashUrl(Document doc) { + for(Element el : doc.select("link[rel=preload]")) { + if (el.attr("href").contains("ProfilePageContainer")) { + return el.attr("href"); + } + } + for(Element el : doc.select("link[rel=preload]")) { + if (el.attr("href").contains("metro")) { + return el.attr("href"); + } + } + return null; + } + private String getQHash(Document doc) { - String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href"); + String jsFileURL = "https://www.instagram.com" + getQhashUrl(doc); StringBuilder sb = new StringBuilder(); - Document jsPage; + LOGGER.info(jsFileURL); try { // We can't use Jsoup here because it won't download a non-html file larger than a MB // even if you set maxBodySize to 0 @@ -454,7 +466,7 @@ public class InstagramRipper extends AbstractJSONRipper { LOGGER.info("Unable to get query_hash, " + jsFileURL + " is a malformed URL"); return null; } catch (IOException e) { - LOGGER.info("Unable to get query_hash"); + LOGGER.info("Unable to get query_hash from " + jsFileURL); LOGGER.info(e.getMessage()); return null; } @@ -468,6 +480,12 @@ 
public class InstagramRipper extends AbstractJSONRipper { m = jsP.matcher(sb.toString()); if (m.find()) { return m.group(1); + } else { + jsP = Pattern.compile(",u=.([a-zA-Z0-9]+)."); + m = jsP.matcher(sb.toString()); + if (m.find()) { + return m.group(1); + } } } @@ -477,6 +495,7 @@ public class InstagramRipper extends AbstractJSONRipper { if (m.find()) { return m.group(1); } + } LOGGER.error("Could not find query_hash on " + jsFileURL); return null; diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/MulemaxRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/MulemaxRipper.java index 6f9d178d..01bf4b1c 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/MulemaxRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/MulemaxRipper.java @@ -74,6 +74,6 @@ public class MulemaxRipper extends AbstractSingleFileRipper { @Override public void downloadURL(URL url, int index) { - addURLToDownload(url, getPrefix(index)); + addURLToDownload(url, getPrefix(index), "", "mulemax.com", null); } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/PorncomixDotOneRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/PorncomixDotOneRipper.java index 558060eb..c1e7fac7 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/PorncomixDotOneRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/PorncomixDotOneRipper.java @@ -51,7 +51,7 @@ public class PorncomixDotOneRipper extends AbstractHTMLRipper { public List<String> getURLsFromPage(Document doc) { List<String> result = new ArrayList<>(); // We have 2 loops here to cover all the different album types - for (Element el : doc.select(".dgwt-jg-gallery > a")) { + for (Element el : doc.select(".dgwt-jg-item > a")) { result.add(el.attr("href")); } for (Element el : doc.select(".unite-gallery > img")) { diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java index f0984d7d..e68e477d 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/RedditRipper.java @@ -118,6 +118,12 @@ public class RedditRipper extends AlbumRipper { return nextURL; } + /** + * Gets a representation of the specified reddit page as a JSONArray using the reddit API + * @param url The URL of the desired page + * @return A JSONArray representation of the desired page + * @throws IOException If no response is received from the URL + */ private JSONArray getJsonArrayFromURL(URL url) throws IOException { // Wait 2 seconds before the next request long timeDiff = System.currentTimeMillis() - lastRequestTime; @@ -149,9 +155,30 @@ public class RedditRipper extends AlbumRipper { return jsonArray; } + /** + * Turns child JSONObjects into usable URLs and hands them off for further processing. + * Performs filtering checks based on the reddit.rip_by_upvote, reddit.min_upvotes, and reddit.max_upvotes config settings. 
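+ * A hypothetical rip.properties snippet enabling the filter (values are examples only): + *   reddit.rip_by_upvote = true + *   reddit.min_upvotes = 100 + *   reddit.max_upvotes = 100000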
+ * Only called from getAndParseAndReturnNext() while parsing the JSONArray returned from reddit's API + * @param child The child to process + */ private void parseJsonChild(JSONObject child) { String kind = child.getString("kind"); JSONObject data = child.getJSONObject("data"); + + // Upvote filtering + if (Utils.getConfigBoolean("reddit.rip_by_upvote", false)) { + int score = data.getInt("score"); + int maxScore = Utils.getConfigInteger("reddit.max_upvotes", Integer.MAX_VALUE); + int minScore = Utils.getConfigInteger("reddit.min_upvotes", Integer.MIN_VALUE); + + if (score > maxScore || score < minScore) { + String message = "Skipping post with score outside specified range of " + minScore + " to " + maxScore; + sendUpdate(RipStatusMessage.STATUS.DOWNLOAD_WARN, message); + return; // Outside the specified range; do not download + } + } + if (kind.equals("t1")) { // Comment handleBody(data.getString("body"), data.getString("id"), ""); diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java index 7d1a38bc..3c9d751d 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java @@ -7,13 +7,31 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.jsoup.Jsoup; +import org.jsoup.Connection.Response; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; public class ThechiveRipper extends AbstractHTMLRipper { + private Pattern p1 = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$"); + private Pattern imagePattern = Pattern.compile(""); + + // i.thechive.com specific variables. + private Pattern p2 = Pattern.compile("^https?://i.thechive.com/([0-9a-zA-Z_]+)"); + private String jsonUrl = "https://i.thechive.com/rest/uploads"; + private Map<String, String> cookies = new HashMap<>(); + private String nextSeed = ""; + private String username = ""; public ThechiveRipper(URL url) throws IOException { super(url); @@ -21,7 +39,12 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public String getHost() { - return "thechive"; + Matcher m1 = p1.matcher(url.toExternalForm()); + if (m1.matches()) { + return "thechive"; + } else { + return "i.thechive"; // gives i.thechive.com rips a distinct album title. 
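+ // The two URL shapes this ripper accepts (examples are illustrative): + //   https://thechive.com/2018/06/01/some-post-title/ -> single-page gallery post + //   https://i.thechive.com/someusername -> user uploads, paged through the JSON API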
+ } } @Override @@ -31,14 +54,20 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - boolean isTag = false; - return m.group(1); + + Matcher m1 = p1.matcher(url.toExternalForm()); + if (m1.matches()) { + return m1.group(1); } + + Matcher m2 = p2.matcher(url.toExternalForm()); + if (m2.matches()) { + username = m2.group(1); + return username; + } + throw new MalformedURLException("Expected thechive.com URL format: " - + "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ - got " + url + " instead"); + + "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ OR i.thechive.com/username, got " + url + " instead."); } @Override @@ -49,27 +78,148 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public List<String> getURLsFromPage(Document doc) { - List<String> result = new ArrayList<>(); - for (Element el : doc.select("img.attachment-gallery-item-full")) { - String imageSource; - if (el.attr("data-gifsrc").isEmpty()) { //If it's not a gif - imageSource = el.attr("src"); - } else { //If it is a gif - imageSource = el.attr("data-gifsrc") //from data-gifsrc attribute - .replaceAll("\\?w=\\d{3}", ""); //remove the width modifier at the end to get highest resolution - //May need to replace the regex's {3} later on if website starts giving higher-res photos by default. - } + List<String> result; + Matcher matcher = p1.matcher(url.toExternalForm()); - // We replace thumbs with resizes so we can the full sized images - imageSource = imageSource.replace("thumbs", "resizes"); - result.add(imageSource); + if (matcher.matches()) { + // for url type: thechive.com/YEAR/MONTH/DAY/POSTTITLE/ + result = getUrlsFromThechive(doc); + } else { + // for url type: i.thechive.com/username + result = getUrlsFromIDotThechive(); } return result; } + @Override + public Document getNextPage(Document doc) throws IOException { + Matcher matcher = p1.matcher(url.toExternalForm()); + + if (matcher.matches()) { + // url type thechive.com/YEAR/MONTH/DAY/POSTTITLE/ has a single page. + return null; + } else { + if (nextSeed == null) { + throw new IOException("No more pages."); + } + } + + // The following try block checks whether the next JSON response actually contains images. + // This avoids the IOException thrown in rip() when getURLsFromPage() returns an empty list. + JSONArray imgList; + try { + Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username") + .data("username", username).ignoreContentType().cookies(cookies).response(); + cookies = response.cookies(); + JSONObject json = new JSONObject(response.body()); + imgList = json.getJSONArray("uploads"); + } catch (Exception e) { + throw new IOException("Error fetching next page.", e); + } + + if (imgList != null && imgList.length() > 0) { + // Pass an empty document; the page content is not used for i.thechive.com/username rips. + return new Document(url.toString()); + } else { + // Return null, as this is the last page. + return null; + } + } + @Override public void downloadURL(URL url, int index) { addURLToDownload(url, getPrefix(index)); } + private List<String> getUrlsFromThechive(Document doc) { + /* + * The image urls are stored in a