From 5f3575ba37e8afed9c2da61369c7e932a23e23f1 Mon Sep 17 00:00:00 2001 From: Tushar Date: Mon, 18 Mar 2019 20:01:58 +0530 Subject: [PATCH] Added support for i.thechive.com --- .../ripme/ripper/rippers/ThechiveRipper.java | 162 +++++++++++++++--- .../ripper/rippers/ThechiveRipperTest.java | 77 +++++---- 2 files changed, 187 insertions(+), 52 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java index 7d1a38bc..e3bdd028 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java @@ -7,13 +7,31 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.jsoup.Jsoup; +import org.jsoup.Connection.Response; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; public class ThechiveRipper extends AbstractHTMLRipper { + private Pattern p1 = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$"); + private Pattern imagePattern = Pattern.compile(""); + + // i.thechive.com specific variables. + private Pattern p2 = Pattern.compile("^https?://i.thechive.com/([0-9a-zA-Z_]+)"); + private String jsonUrl = "https://i.thechive.com/rest/uploads"; + private Map cookies = new HashMap<>(); + private String nextSeed = ""; + private String username = ""; public ThechiveRipper(URL url) throws IOException { super(url); @@ -21,7 +39,12 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public String getHost() { - return "thechive"; + Matcher m1 = p1.matcher(url.toExternalForm()); + if (m1.matches()) { + return "thechive"; + } else { + return "i.thechive"; // for suitable album title. + } } @Override @@ -31,14 +54,20 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - boolean isTag = false; - return m.group(1); + + Matcher m1 = p1.matcher(url.toExternalForm()); + if (m1.matches()) { + return m1.group(1); } + + Matcher m2 = p2.matcher(url.toExternalForm()); + if (m2.matches()) { + username = m2.group(1); + return username; + } + throw new MalformedURLException("Expected thechive.com URL format: " - + "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ - got " + url + " instead"); + + "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ OR i.thechive.com/username, got " + url + " instead."); } @Override @@ -49,27 +78,120 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public List getURLsFromPage(Document doc) { - List result = new ArrayList<>(); - for (Element el : doc.select("img.attachment-gallery-item-full")) { - String imageSource; - if (el.attr("data-gifsrc").isEmpty()) { //If it's not a gif - imageSource = el.attr("src"); - } else { //If it is a gif - imageSource = el.attr("data-gifsrc") //from data-gifsrc attribute - .replaceAll("\\?w=\\d{3}", ""); //remove the width modifier at the end to get highest resolution - //May need to replace the regex's {3} later on if website starts giving higher-res photos by default. - } + List result; + Matcher matcher = p1.matcher(url.toExternalForm()); - // We replace thumbs with resizes so we can the full sized images - imageSource = imageSource.replace("thumbs", "resizes"); - result.add(imageSource); + if (matcher.matches()) { + result = getUrlsFromThechive(doc); + } else { + result = getUrlsFromIDotThechive(); } return result; } + @Override + public Document getNextPage(Document doc) throws IOException { + Matcher matcher = p1.matcher(url.toExternalForm()); + + if (matcher.matches()) { + // for pattern p1. + return null; + } else { + if (nextSeed == null) { + throw new IOException("No more pages."); + } + } + + // check if next json has elements. + JSONArray imgList; + try { + Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username") + .data("username", username).ignoreContentType().cookies(cookies).response(); + cookies = response.cookies(); + JSONObject json = new JSONObject(response.body()); + imgList = json.getJSONArray("uploads"); + } catch (Exception e) { + throw new IOException("Error fetching next page.", e); + } + + if (imgList != null && imgList.length() > 0) { + return new Document(url.toString()); // empty document. + } else { + return null; + } + } + @Override public void downloadURL(URL url, int index) { addURLToDownload(url, getPrefix(index)); } + private List getUrlsFromThechive(Document doc) { + List result = new ArrayList<>(); + Elements scripts = doc.getElementsByTag("script"); + + for (Element script : scripts) { + String data = script.data(); + + if (!data.contains("CHIVE_GALLERY_ITEMS")) { + continue; + } + + /* + * We add all the tags in a single StringBuilder and parse as HTML for + * easy sorting of img/ gifs. + */ + StringBuilder allImgTags = new StringBuilder(); + Matcher matcher = imagePattern.matcher(data); + while (matcher.find()) { + allImgTags.append(matcher.group(0).replaceAll("\\\\", "")); + } + + // Now we parse and sort links. + Document imgDoc = Jsoup.parse(allImgTags.toString()); + Elements imgs = imgDoc.getElementsByTag("img"); + for (Element img : imgs) { + if (img.hasAttr("data-gifsrc")) { + // result.add(img.attr("data-gifsrc")); + result.add(img.attr("data-gifsrc")); + } else { + // result.add(img.attr("src")); + result.add(img.attr("src")); + } + } + } + + // strip all GET parameters from the links( such as quality). + result.replaceAll(s -> s.substring(0, s.indexOf("?"))); + + return result; + } + + private List getUrlsFromIDotThechive() { + // check for pattern p2. + List result = new ArrayList<>(); + try { + Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username") + .data("username", username).ignoreContentType().cookies(cookies).response(); + cookies = response.cookies(); + JSONObject json = new JSONObject(response.body()); + JSONArray imgList = json.getJSONArray("uploads"); + nextSeed = null; // if no more images, nextSeed stays null + for (int i = 0; i < imgList.length(); i++) { + JSONObject img = imgList.getJSONObject(i); + if (img.getString("mediaType").equals("gif")) { + result.add("https:" + img.getString("mediaUrlOverlay")); + } else { + result.add("https:" + img.getString("mediaGifFrameUrl")); + } + nextSeed = img.getString("activityId"); + } + } catch (IOException e) { + LOGGER.error("Unable to fetch JSON data for url: " + url); + } catch (JSONException e) { + LOGGER.error("JSON error while parsing data for url: " + url); + } + return result; + } + } diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ThechiveRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ThechiveRipperTest.java index 89470dce..3e2e3f6c 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ThechiveRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ThechiveRipperTest.java @@ -26,9 +26,9 @@ package com.rarchives.ripme.tst.ripper.rippers; import com.rarchives.ripme.ripper.rippers.ThechiveRipper; import java.io.IOException; import java.net.URL; -import org.jsoup.nodes.Attributes; -import org.jsoup.nodes.Element; -import org.jsoup.parser.Tag; +//import org.jsoup.nodes.Attributes; +//import org.jsoup.nodes.Element; +//import org.jsoup.parser.Tag; /** * @@ -41,40 +41,53 @@ public class ThechiveRipperTest extends RippersTest { * * @throws IOException */ - public void theChiveRip() throws IOException { - ThechiveRipper ripper = new ThechiveRipper(new URL("https://thechive.com/2018/10/03/the-definitive-list-of-the-hottest-horror-movie-babes/")); + public void testTheChiveRip() throws IOException { + ThechiveRipper ripper = new ThechiveRipper(new URL( + "https://thechive.com/2019/03/16/beautiful-badasses-lookin-good-in-and-out-of-uniform-35-photos/")); + testRipper(ripper); + } + + public void testTheChiveGif() throws IOException { + ThechiveRipper ripper = new ThechiveRipper( + new URL("https://thechive.com/2019/03/14/dont-tease-me-just-squeeze-me-20-gifs/")); testRipper(ripper); } /* - - //If anyone figures out how to get JSOUP Elements mocked up, we can use the following methods to test both jpeg + gif ripping. - - public void testGifRip() throws IOException { - String elementInString = "" - - Element el = new Element( - new Tag("img"), - "",//URI - new Attributes()); - String URL = ThechiveRipper.getImageSource(el); - assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif")); + * "i.thechive.com" test. + */ + public void testIDotThechive() throws IOException { + ThechiveRipper ripper = new ThechiveRipper(new URL("https://i.thechive.com/HHHoney")); + testRipper(ripper); } - public void testGifRip() throws IOException { - String elementInString = ""; - Element el = new Element( - new Tag("img"), - "",//URI - new Attributes()); - String URL = ThechiveRipper.getImageSource(el); - assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg")); - } + /* + * + * //If anyone figures out how to get JSOUP Elements mocked up, we can use the + * following methods to test both jpeg + gif ripping. + * + * public void testGifRip() throws IOException { String elementInString = + * "" + * + * Element el = new Element( new Tag("img"), "",//URI new Attributes()); String + * URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals( + * "https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif" + * )); } + * + * public void testGifRip() throws IOException { String elementInString = + * "" + * ; Element el = new Element( new Tag("img"), "",//URI new Attributes()); + * String URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals( + * "https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg" + * )); } */ } \ No newline at end of file