1
0
mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-26 07:14:38 +02:00

Merge pull request #1246 from Tush-r/thechive

Fixed ThechiveRipper (and added support for i.thechive.com)
This commit is contained in:
cyian-1756
2019-03-24 12:03:36 -05:00
committed by GitHub
2 changed files with 215 additions and 52 deletions

View File

@@ -7,13 +7,31 @@ import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ThechiveRipper extends AbstractHTMLRipper { public class ThechiveRipper extends AbstractHTMLRipper {
private Pattern p1 = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$");
private Pattern imagePattern = Pattern.compile("<img\\s(?:.|\\n)+?>");
// i.thechive.com specific variables.
private Pattern p2 = Pattern.compile("^https?://i.thechive.com/([0-9a-zA-Z_]+)");
private String jsonUrl = "https://i.thechive.com/rest/uploads";
private Map<String, String> cookies = new HashMap<>();
private String nextSeed = "";
private String username = "";
public ThechiveRipper(URL url) throws IOException { public ThechiveRipper(URL url) throws IOException {
super(url); super(url);
@@ -21,7 +39,12 @@ public class ThechiveRipper extends AbstractHTMLRipper {
@Override @Override
public String getHost() { public String getHost() {
return "thechive"; Matcher m1 = p1.matcher(url.toExternalForm());
if (m1.matches()) {
return "thechive";
} else {
return "i.thechive"; // for suitable album title.
}
} }
@Override @Override
@@ -31,14 +54,20 @@ public class ThechiveRipper extends AbstractHTMLRipper {
@Override @Override
public String getGID(URL url) throws MalformedURLException { public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$");
Matcher m = p.matcher(url.toExternalForm()); Matcher m1 = p1.matcher(url.toExternalForm());
if (m.matches()) { if (m1.matches()) {
boolean isTag = false; return m1.group(1);
return m.group(1);
} }
Matcher m2 = p2.matcher(url.toExternalForm());
if (m2.matches()) {
username = m2.group(1);
return username;
}
throw new MalformedURLException("Expected thechive.com URL format: " throw new MalformedURLException("Expected thechive.com URL format: "
+ "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ - got " + url + " instead"); + "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ OR i.thechive.com/username, got " + url + " instead.");
} }
@Override @Override
@@ -49,27 +78,148 @@ public class ThechiveRipper extends AbstractHTMLRipper {
@Override @Override
public List<String> getURLsFromPage(Document doc) { public List<String> getURLsFromPage(Document doc) {
List<String> result = new ArrayList<>(); List<String> result;
for (Element el : doc.select("img.attachment-gallery-item-full")) { Matcher matcher = p1.matcher(url.toExternalForm());
String imageSource;
if (el.attr("data-gifsrc").isEmpty()) { //If it's not a gif
imageSource = el.attr("src");
} else { //If it is a gif
imageSource = el.attr("data-gifsrc") //from data-gifsrc attribute
.replaceAll("\\?w=\\d{3}", ""); //remove the width modifier at the end to get highest resolution
//May need to replace the regex's {3} later on if website starts giving higher-res photos by default.
}
// We replace thumbs with resizes so we can the full sized images if (matcher.matches()) {
imageSource = imageSource.replace("thumbs", "resizes"); // for url type: thechive.com/YEAR/MONTH/DAY/POSTTITLE/
result.add(imageSource); result = getUrlsFromThechive(doc);
} else {
// for url type: i.thechive.com/username
result = getUrlsFromIDotThechive();
} }
return result; return result;
} }
/**
 * Decides whether another page of results exists for the current URL.
 *
 * <p>URLs matching {@code p1} (thechive.com/YEAR/MONTH/DAY/POSTTITLE/) are
 * single-page, so {@code null} is returned for them. For every other URL the
 * JSON endpoint is queried with the current {@code nextSeed}; a placeholder
 * document is returned only when the response carries at least one upload.
 *
 * @param doc the current page (not read by this method)
 * @return an empty placeholder {@link Document} when more uploads exist,
 *         otherwise {@code null}
 * @throws IOException if {@code nextSeed} is {@code null}, or if the request
 *                     or the parsing of its response fails
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    // url type thechive.com/YEAR/MONTH/DAY/POSTTITLE/ has a single page.
    if (p1.matcher(url.toExternalForm()).matches()) {
        return null;
    }

    // A null seed means the previous batch was empty: nothing left to request.
    if (nextSeed == null) {
        throw new IOException("No more pages.");
    }

    // Probe the next JSON batch up front. Per the original author's note, this
    // avoids an IOException in rip() when getURLsFromPage() returns an empty list.
    JSONArray uploads;
    try {
        Response probe = Http.url(jsonUrl)
                .data("seed", nextSeed)
                .data("queryType", "by-username")
                .data("username", username)
                .ignoreContentType()
                .cookies(cookies)
                .response();
        // Keep the cookies of the latest response for the following request.
        cookies = probe.cookies();
        uploads = new JSONObject(probe.body()).getJSONArray("uploads");
    } catch (Exception e) {
        throw new IOException("Error fetching next page.", e);
    }

    boolean hasUploads = uploads != null && uploads.length() > 0;
    // The i.thechive.com branch of getURLsFromPage() ignores its document
    // argument, so an empty document is enough to signal "there is another page".
    return hasUploads ? new Document(url.toString()) : null;
}
@Override @Override
public void downloadURL(URL url, int index) { public void downloadURL(URL url, int index) {
addURLToDownload(url, getPrefix(index)); addURLToDownload(url, getPrefix(index));
} }
/**
 * Extracts the image/gif links of a thechive.com gallery post.
 *
 * <p>The links live inside {@code <script>} elements whose data contains
 * {@code CHIVE_GALLERY_ITEMS}. Every {@code <img>} tag found in such a script
 * (via {@code imagePattern}) is unescaped, the tags are concatenated and parsed
 * as HTML, and for each image the {@code data-gifsrc} attribute (gifs) or the
 * {@code src} attribute (other images) is collected.
 *
 * <p>Any query string is removed from each link so that modifiers such as
 * quality, width and height are dropped. Links without a query string are kept
 * unchanged, and images that yield an empty link are skipped.
 *
 * @param doc the gallery page
 * @return the collected links in document order; empty if none were found
 */
private List<String> getUrlsFromThechive(Document doc) {
    List<String> result = new ArrayList<>();
    Elements scripts = doc.getElementsByTag("script");

    for (Element script : scripts) {
        String data = script.data();

        if (!data.contains("CHIVE_GALLERY_ITEMS")) {
            continue;
        }

        /*
         * We add all the <img/> tags in a single StringBuilder and parse as HTML for
         * easy sorting of img/ gifs.
         */
        StringBuilder allImgTags = new StringBuilder();
        Matcher matcher = imagePattern.matcher(data);
        while (matcher.find()) {
            // Drop every backslash from the img tag, which unescapes the img url as well.
            allImgTags.append(matcher.group(0).replace("\\", ""));
        }

        // Now we parse and sort links.
        Document imgDoc = Jsoup.parse(allImgTags.toString());
        for (Element img : imgDoc.getElementsByTag("img")) {
            // Gifs carry their link in data-gifsrc, other images in src.
            String link = img.hasAttr("data-gifsrc") ? img.attr("data-gifsrc") : img.attr("src");
            if (link.isEmpty()) {
                // No usable link on this tag; nothing to download.
                continue;
            }
            result.add(link);
        }
    }

    // Strip all GET parameters from the links (such as quality, width, height) to
    // get the original image. A link without '?' must be left as-is: calling
    // substring(0, -1) on it would throw StringIndexOutOfBoundsException.
    result.replaceAll(link -> {
        int queryStart = link.indexOf('?');
        return queryStart >= 0 ? link.substring(0, queryStart) : link;
    });

    return result;
}
/**
 * Fetches one batch of image links for an i.thechive.com/username URL.
 *
 * <p>The links are obtained through a JSON request to {@code jsonUrl}. Each
 * request sends the cookies of the previous response (per the original
 * author's note, these contain the next CSRF token) and stores the cookies of
 * the new response for the following request.
 *
 * <p>JSON request parameters:
 * <ol>
 *   <li>seed: activityId of the last upload of the previous batch.</li>
 *   <li>queryType: always 'by-username'.</li>
 *   <li>username: username taken from the url itself.</li>
 * </ol>
 *
 * <p>Side effect: {@code nextSeed} is set to the activityId of the last upload
 * processed, or to {@code null} when the response contains no uploads.
 *
 * @return the links gathered from this batch; on a request or parsing failure
 *         the error is logged and the links collected so far are returned
 */
private List<String> getUrlsFromIDotThechive() {
    List<String> result = new ArrayList<>();
    try {
        Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username")
                .data("username", username).ignoreContentType().cookies(cookies).response();
        cookies = response.cookies();
        JSONObject json = new JSONObject(response.body());
        JSONArray imgList = json.getJSONArray("uploads");
        nextSeed = null; // if no more images, nextSeed stays null

        for (int i = 0; i < imgList.length(); i++) {
            JSONObject img = imgList.getJSONObject(i);
            if (img.getString("mediaType").equals("gif")) {
                result.add("https:" + img.getString("mediaUrlOverlay"));
            } else {
                result.add("https:" + img.getString("mediaGifFrameUrl"));
            }
            // Remember the most recent activityId; it seeds the next request.
            nextSeed = img.getString("activityId");
        }
    } catch (IOException e) {
        // Pass the exception along so the cause and stack trace are not lost.
        LOGGER.error("Unable to fetch JSON data for url: " + url, e);
    } catch (JSONException e) {
        LOGGER.error("JSON error while parsing data for url: " + url, e);
    }

    return result;
}
} }

View File

@@ -26,9 +26,9 @@ package com.rarchives.ripme.tst.ripper.rippers;
import com.rarchives.ripme.ripper.rippers.ThechiveRipper; import com.rarchives.ripme.ripper.rippers.ThechiveRipper;
import java.io.IOException; import java.io.IOException;
import java.net.URL; import java.net.URL;
import org.jsoup.nodes.Attributes; //import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element; //import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag; //import org.jsoup.parser.Tag;
/** /**
* *
@@ -41,40 +41,53 @@ public class ThechiveRipperTest extends RippersTest {
* *
* @throws IOException * @throws IOException
*/ */
public void theChiveRip() throws IOException { public void testTheChiveRip() throws IOException {
ThechiveRipper ripper = new ThechiveRipper(new URL("https://thechive.com/2018/10/03/the-definitive-list-of-the-hottest-horror-movie-babes/")); ThechiveRipper ripper = new ThechiveRipper(new URL(
"https://thechive.com/2019/03/16/beautiful-badasses-lookin-good-in-and-out-of-uniform-35-photos/"));
testRipper(ripper);
}
public void testTheChiveGif() throws IOException {
ThechiveRipper ripper = new ThechiveRipper(
new URL("https://thechive.com/2019/03/14/dont-tease-me-just-squeeze-me-20-gifs/"));
testRipper(ripper); testRipper(ripper);
} }
/* /*
* "i.thechive.com" test.
//If anyone figures out how to get JSOUP Elements mocked up, we can use the following methods to test both jpeg + gif ripping. */
public void testIDotThechive() throws IOException {
public void testGifRip() throws IOException { ThechiveRipper ripper = new ThechiveRipper(new URL("https://i.thechive.com/HHHoney"));
String elementInString = "<img width=\"500\" height=\"305\" \n" testRipper(ripper);
+ "src=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-111.jpg?quality=85&amp;strip=info\" \n"
+ "class=\"attachment-gallery-item-full size-gallery-item-full gif-animate\" \n"
+ "alt=\"american mary crimson quill 111 The hottest horror movie villains ever according to science (18 Photos)\" \n"
+ "title=\"\" data-gifsrc=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif?w=500\">"
Element el = new Element(
new Tag("img"),
"",//URI
new Attributes());
String URL = ThechiveRipper.getImageSource(el);
assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif"));
} }
public void testGifRip() throws IOException { /*
String elementInString = "<img width=\"600\" height=\"409\" src=\"https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg?quality=85&amp;strip=info&amp;w=600\" \n" *
+ "class=\"attachment-gallery-item-full size-gallery-item-full\" \n" * //If anyone figures out how to get JSOUP Elements mocked up, we can use the
+ "alt=\"the definitive list of the hottest horror movie babes 11 The hottest horror movie villains ever according to science (18 Photos)\" title=\"\">"; * following methods to test both jpeg + gif ripping.
Element el = new Element( *
new Tag("img"), * public void testGifRip() throws IOException { String elementInString =
"",//URI * "<img width=\"500\" height=\"305\" \n" +
new Attributes()); * "src=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-111.jpg?quality=85&amp;strip=info\" \n"
String URL = ThechiveRipper.getImageSource(el); * +
assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg")); * "class=\"attachment-gallery-item-full size-gallery-item-full gif-animate\" \n"
} * +
* "alt=\"american mary crimson quill 111 The hottest horror movie villains ever according to science (18 Photos)\" \n"
* +
* "title=\"\" data-gifsrc=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif?w=500\">"
*
* Element el = new Element( new Tag("img"), "",//URI new Attributes()); String
* URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals(
* "https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif"
* )); }
*
* public void testGifRip() throws IOException { String elementInString =
* "<img width=\"600\" height=\"409\" src=\"https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg?quality=85&amp;strip=info&amp;w=600\" \n"
* + "class=\"attachment-gallery-item-full size-gallery-item-full\" \n" +
* "alt=\"the definitive list of the hottest horror movie babes 11 The hottest horror movie villains ever according to science (18 Photos)\" title=\"\">"
* ; Element el = new Element( new Tag("img"), "",//URI new Attributes());
* String URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals(
* "https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg"
* )); }
*/ */
} }