mirror of
https://github.com/RipMeApp/ripme.git
synced 2025-08-26 07:14:38 +02:00
Merge pull request #1246 from Tush-r/thechive
Fixed ThechiveRipper( and added support for i.thechive.com)
This commit is contained in:
@@ -7,13 +7,31 @@ import java.io.IOException;
|
|||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONException;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.Connection.Response;
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
import org.jsoup.nodes.Element;
|
import org.jsoup.nodes.Element;
|
||||||
|
import org.jsoup.select.Elements;
|
||||||
|
|
||||||
public class ThechiveRipper extends AbstractHTMLRipper {
|
public class ThechiveRipper extends AbstractHTMLRipper {
|
||||||
|
private Pattern p1 = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$");
|
||||||
|
private Pattern imagePattern = Pattern.compile("<img\\s(?:.|\\n)+?>");
|
||||||
|
|
||||||
|
// i.thechive.com specific variables.
|
||||||
|
private Pattern p2 = Pattern.compile("^https?://i.thechive.com/([0-9a-zA-Z_]+)");
|
||||||
|
private String jsonUrl = "https://i.thechive.com/rest/uploads";
|
||||||
|
private Map<String, String> cookies = new HashMap<>();
|
||||||
|
private String nextSeed = "";
|
||||||
|
private String username = "";
|
||||||
|
|
||||||
public ThechiveRipper(URL url) throws IOException {
|
public ThechiveRipper(URL url) throws IOException {
|
||||||
super(url);
|
super(url);
|
||||||
@@ -21,7 +39,12 @@ public class ThechiveRipper extends AbstractHTMLRipper {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getHost() {
|
public String getHost() {
|
||||||
return "thechive";
|
Matcher m1 = p1.matcher(url.toExternalForm());
|
||||||
|
if (m1.matches()) {
|
||||||
|
return "thechive";
|
||||||
|
} else {
|
||||||
|
return "i.thechive"; // for suitable album title.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -31,14 +54,20 @@ public class ThechiveRipper extends AbstractHTMLRipper {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getGID(URL url) throws MalformedURLException {
|
public String getGID(URL url) throws MalformedURLException {
|
||||||
Pattern p = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$");
|
|
||||||
Matcher m = p.matcher(url.toExternalForm());
|
Matcher m1 = p1.matcher(url.toExternalForm());
|
||||||
if (m.matches()) {
|
if (m1.matches()) {
|
||||||
boolean isTag = false;
|
return m1.group(1);
|
||||||
return m.group(1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Matcher m2 = p2.matcher(url.toExternalForm());
|
||||||
|
if (m2.matches()) {
|
||||||
|
username = m2.group(1);
|
||||||
|
return username;
|
||||||
|
}
|
||||||
|
|
||||||
throw new MalformedURLException("Expected thechive.com URL format: "
|
throw new MalformedURLException("Expected thechive.com URL format: "
|
||||||
+ "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ - got " + url + " instead");
|
+ "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ OR i.thechive.com/username, got " + url + " instead.");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@@ -49,27 +78,148 @@ public class ThechiveRipper extends AbstractHTMLRipper {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<String> getURLsFromPage(Document doc) {
|
public List<String> getURLsFromPage(Document doc) {
|
||||||
List<String> result = new ArrayList<>();
|
List<String> result;
|
||||||
for (Element el : doc.select("img.attachment-gallery-item-full")) {
|
Matcher matcher = p1.matcher(url.toExternalForm());
|
||||||
String imageSource;
|
|
||||||
if (el.attr("data-gifsrc").isEmpty()) { //If it's not a gif
|
|
||||||
imageSource = el.attr("src");
|
|
||||||
} else { //If it is a gif
|
|
||||||
imageSource = el.attr("data-gifsrc") //from data-gifsrc attribute
|
|
||||||
.replaceAll("\\?w=\\d{3}", ""); //remove the width modifier at the end to get highest resolution
|
|
||||||
//May need to replace the regex's {3} later on if website starts giving higher-res photos by default.
|
|
||||||
}
|
|
||||||
|
|
||||||
// We replace thumbs with resizes so we can get the full-sized images
|
if (matcher.matches()) {
|
||||||
imageSource = imageSource.replace("thumbs", "resizes");
|
// for url type: thechive.com/YEAR/MONTH/DAY/POSTTITLE/
|
||||||
result.add(imageSource);
|
result = getUrlsFromThechive(doc);
|
||||||
|
} else {
|
||||||
|
// for url type: i.thechive.com/username
|
||||||
|
result = getUrlsFromIDotThechive();
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Document getNextPage(Document doc) throws IOException {
|
||||||
|
Matcher matcher = p1.matcher(url.toExternalForm());
|
||||||
|
|
||||||
|
if (matcher.matches()) {
|
||||||
|
// url type thechive.com/YEAR/MONTH/DAY/POSTTITLE/ has a single page.
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
if (nextSeed == null) {
|
||||||
|
throw new IOException("No more pages.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Following try block checks if the next JSON object has images or not.
|
||||||
|
// This is done to avoid IOException in rip() method, caused when
|
||||||
|
// getURLsFromPage() returns empty list.
|
||||||
|
JSONArray imgList;
|
||||||
|
try {
|
||||||
|
Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username")
|
||||||
|
.data("username", username).ignoreContentType().cookies(cookies).response();
|
||||||
|
cookies = response.cookies();
|
||||||
|
JSONObject json = new JSONObject(response.body());
|
||||||
|
imgList = json.getJSONArray("uploads");
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new IOException("Error fetching next page.", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (imgList != null && imgList.length() > 0) {
|
||||||
|
// Pass empty document as it is of no use for thechive.com/userName url type.
|
||||||
|
return new Document(url.toString());
|
||||||
|
} else {
|
||||||
|
// Return null as this is last page.
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void downloadURL(URL url, int index) {
|
public void downloadURL(URL url, int index) {
|
||||||
addURLToDownload(url, getPrefix(index));
|
addURLToDownload(url, getPrefix(index));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<String> getUrlsFromThechive(Document doc) {
|
||||||
|
/*
|
||||||
|
* The image urls are stored in a <script> tag of the document. This script
|
||||||
|
* contains a single array var by name CHIVE_GALLERY_ITEMS.
|
||||||
|
*
|
||||||
|
* We grab all the <img> tags from the particular script, combine them in a
|
||||||
|
* string, parse it, and grab all the img/gif urls.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
List<String> result = new ArrayList<>();
|
||||||
|
Elements scripts = doc.getElementsByTag("script");
|
||||||
|
|
||||||
|
for (Element script : scripts) {
|
||||||
|
String data = script.data();
|
||||||
|
|
||||||
|
if (!data.contains("CHIVE_GALLERY_ITEMS")) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We add all the <img/> tags in a single StringBuilder and parse as HTML for
|
||||||
|
* easy sorting of img/ gifs.
|
||||||
|
*/
|
||||||
|
StringBuilder allImgTags = new StringBuilder();
|
||||||
|
Matcher matcher = imagePattern.matcher(data);
|
||||||
|
while (matcher.find()) {
|
||||||
|
// Unescape '\' from the img tags, which also unescape's img url as well.
|
||||||
|
allImgTags.append(matcher.group(0).replaceAll("\\\\", ""));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we parse and sort links.
|
||||||
|
Document imgDoc = Jsoup.parse(allImgTags.toString());
|
||||||
|
Elements imgs = imgDoc.getElementsByTag("img");
|
||||||
|
for (Element img : imgs) {
|
||||||
|
if (img.hasAttr("data-gifsrc")) {
|
||||||
|
// For gifs.
|
||||||
|
result.add(img.attr("data-gifsrc"));
|
||||||
|
} else {
|
||||||
|
// For jpeg images.
|
||||||
|
result.add(img.attr("src"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// strip all GET parameters from the links( such as quality, width, height as to
|
||||||
|
// get the original image.).
|
||||||
|
result.replaceAll(s -> s.substring(0, s.indexOf("?")));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> getUrlsFromIDotThechive() {
|
||||||
|
/*
|
||||||
|
* Image urls for i.thechive.com/someUserName as fetched via JSON request. Each
|
||||||
|
*
|
||||||
|
* JSON request uses the cookies from previous response( which contains the next
|
||||||
|
* CSRF token).
|
||||||
|
*
|
||||||
|
* JSON request parameters:
|
||||||
|
* 1. seed: activityId of the last url.
|
||||||
|
* 2. queryType: 'by-username' always.
|
||||||
|
* 3. username: username from the url itself.
|
||||||
|
*/
|
||||||
|
List<String> result = new ArrayList<>();
|
||||||
|
try {
|
||||||
|
Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username")
|
||||||
|
.data("username", username).ignoreContentType().cookies(cookies).response();
|
||||||
|
cookies = response.cookies();
|
||||||
|
JSONObject json = new JSONObject(response.body());
|
||||||
|
JSONArray imgList = json.getJSONArray("uploads");
|
||||||
|
nextSeed = null; // if no more images, nextSeed stays null
|
||||||
|
|
||||||
|
for (int i = 0; i < imgList.length(); i++) {
|
||||||
|
JSONObject img = imgList.getJSONObject(i);
|
||||||
|
if (img.getString("mediaType").equals("gif")) {
|
||||||
|
result.add("https:" + img.getString("mediaUrlOverlay"));
|
||||||
|
} else {
|
||||||
|
result.add("https:" + img.getString("mediaGifFrameUrl"));
|
||||||
|
}
|
||||||
|
nextSeed = img.getString("activityId");
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOGGER.error("Unable to fetch JSON data for url: " + url);
|
||||||
|
} catch (JSONException e) {
|
||||||
|
LOGGER.error("JSON error while parsing data for url: " + url);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -26,9 +26,9 @@ package com.rarchives.ripme.tst.ripper.rippers;
|
|||||||
import com.rarchives.ripme.ripper.rippers.ThechiveRipper;
|
import com.rarchives.ripme.ripper.rippers.ThechiveRipper;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import org.jsoup.nodes.Attributes;
|
//import org.jsoup.nodes.Attributes;
|
||||||
import org.jsoup.nodes.Element;
|
//import org.jsoup.nodes.Element;
|
||||||
import org.jsoup.parser.Tag;
|
//import org.jsoup.parser.Tag;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
@@ -41,40 +41,53 @@ public class ThechiveRipperTest extends RippersTest {
|
|||||||
*
|
*
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public void theChiveRip() throws IOException {
|
public void testTheChiveRip() throws IOException {
|
||||||
ThechiveRipper ripper = new ThechiveRipper(new URL("https://thechive.com/2018/10/03/the-definitive-list-of-the-hottest-horror-movie-babes/"));
|
ThechiveRipper ripper = new ThechiveRipper(new URL(
|
||||||
|
"https://thechive.com/2019/03/16/beautiful-badasses-lookin-good-in-and-out-of-uniform-35-photos/"));
|
||||||
|
testRipper(ripper);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTheChiveGif() throws IOException {
|
||||||
|
ThechiveRipper ripper = new ThechiveRipper(
|
||||||
|
new URL("https://thechive.com/2019/03/14/dont-tease-me-just-squeeze-me-20-gifs/"));
|
||||||
testRipper(ripper);
|
testRipper(ripper);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
* "i.thechive.com" test.
|
||||||
//If anyone figures out how to get JSOUP Elements mocked up, we can use the following methods to test both jpeg + gif ripping.
|
*/
|
||||||
|
public void testIDotThechive() throws IOException {
|
||||||
public void testGifRip() throws IOException {
|
ThechiveRipper ripper = new ThechiveRipper(new URL("https://i.thechive.com/HHHoney"));
|
||||||
String elementInString = "<img width=\"500\" height=\"305\" \n"
|
testRipper(ripper);
|
||||||
+ "src=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-111.jpg?quality=85&strip=info\" \n"
|
|
||||||
+ "class=\"attachment-gallery-item-full size-gallery-item-full gif-animate\" \n"
|
|
||||||
+ "alt=\"american mary crimson quill 111 The hottest horror movie villains ever according to science (18 Photos)\" \n"
|
|
||||||
+ "title=\"\" data-gifsrc=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif?w=500\">"
|
|
||||||
|
|
||||||
Element el = new Element(
|
|
||||||
new Tag("img"),
|
|
||||||
"",//URI
|
|
||||||
new Attributes());
|
|
||||||
String URL = ThechiveRipper.getImageSource(el);
|
|
||||||
assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif"));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testGifRip() throws IOException {
|
/*
|
||||||
String elementInString = "<img width=\"600\" height=\"409\" src=\"https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg?quality=85&strip=info&w=600\" \n"
|
*
|
||||||
+ "class=\"attachment-gallery-item-full size-gallery-item-full\" \n"
|
* //If anyone figures out how to get JSOUP Elements mocked up, we can use the
|
||||||
+ "alt=\"the definitive list of the hottest horror movie babes 11 The hottest horror movie villains ever according to science (18 Photos)\" title=\"\">";
|
* following methods to test both jpeg + gif ripping.
|
||||||
Element el = new Element(
|
*
|
||||||
new Tag("img"),
|
* public void testGifRip() throws IOException { String elementInString =
|
||||||
"",//URI
|
* "<img width=\"500\" height=\"305\" \n" +
|
||||||
new Attributes());
|
* "src=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-111.jpg?quality=85&strip=info\" \n"
|
||||||
String URL = ThechiveRipper.getImageSource(el);
|
* +
|
||||||
assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg"));
|
* "class=\"attachment-gallery-item-full size-gallery-item-full gif-animate\" \n"
|
||||||
}
|
* +
|
||||||
|
* "alt=\"american mary crimson quill 111 The hottest horror movie villains ever according to science (18 Photos)\" \n"
|
||||||
|
* +
|
||||||
|
* "title=\"\" data-gifsrc=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif?w=500\">"
|
||||||
|
*
|
||||||
|
* Element el = new Element( new Tag("img"), "",//URI new Attributes()); String
|
||||||
|
* URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals(
|
||||||
|
* "https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif"
|
||||||
|
* )); }
|
||||||
|
*
|
||||||
|
* public void testGifRip() throws IOException { String elementInString =
|
||||||
|
* "<img width=\"600\" height=\"409\" src=\"https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg?quality=85&strip=info&w=600\" \n"
|
||||||
|
* + "class=\"attachment-gallery-item-full size-gallery-item-full\" \n" +
|
||||||
|
* "alt=\"the definitive list of the hottest horror movie babes 11 The hottest horror movie villains ever according to science (18 Photos)\" title=\"\">"
|
||||||
|
* ; Element el = new Element( new Tag("img"), "",//URI new Attributes());
|
||||||
|
* String URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals(
|
||||||
|
* "https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg"
|
||||||
|
* )); }
|
||||||
*/
|
*/
|
||||||
}
|
}
|
Reference in New Issue
Block a user