1
0
mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-26 07:14:38 +02:00

Added support for i.thechive.com

This commit is contained in:
Tushar
2019-03-18 20:01:58 +05:30
parent 6590b3b80a
commit 5f3575ba37
2 changed files with 187 additions and 52 deletions

View File

@@ -7,13 +7,31 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ThechiveRipper extends AbstractHTMLRipper {
private Pattern p1 = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$");
private Pattern imagePattern = Pattern.compile("<img\\s(?:.|\\n)+?>");
// i.thechive.com specific variables.
private Pattern p2 = Pattern.compile("^https?://i.thechive.com/([0-9a-zA-Z_]+)");
private String jsonUrl = "https://i.thechive.com/rest/uploads";
private Map<String, String> cookies = new HashMap<>();
private String nextSeed = "";
private String username = "";
public ThechiveRipper(URL url) throws IOException {
super(url);
@@ -21,7 +39,12 @@ public class ThechiveRipper extends AbstractHTMLRipper {
@Override
public String getHost() {
return "thechive";
Matcher m1 = p1.matcher(url.toExternalForm());
if (m1.matches()) {
return "thechive";
} else {
return "i.thechive"; // for suitable album title.
}
}
@Override
@@ -31,14 +54,20 @@ public class ThechiveRipper extends AbstractHTMLRipper {
@Override
public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$");
Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) {
boolean isTag = false;
return m.group(1);
Matcher m1 = p1.matcher(url.toExternalForm());
if (m1.matches()) {
return m1.group(1);
}
Matcher m2 = p2.matcher(url.toExternalForm());
if (m2.matches()) {
username = m2.group(1);
return username;
}
throw new MalformedURLException("Expected thechive.com URL format: "
+ "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ - got " + url + " instead");
+ "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ OR i.thechive.com/username, got " + url + " instead.");
}
@Override
@@ -49,27 +78,120 @@ public class ThechiveRipper extends AbstractHTMLRipper {
@Override
public List<String> getURLsFromPage(Document doc) {
List<String> result = new ArrayList<>();
for (Element el : doc.select("img.attachment-gallery-item-full")) {
String imageSource;
if (el.attr("data-gifsrc").isEmpty()) { //If it's not a gif
imageSource = el.attr("src");
} else { //If it is a gif
imageSource = el.attr("data-gifsrc") //from data-gifsrc attribute
.replaceAll("\\?w=\\d{3}", ""); //remove the width modifier at the end to get highest resolution
//May need to replace the regex's {3} later on if website starts giving higher-res photos by default.
}
List<String> result;
Matcher matcher = p1.matcher(url.toExternalForm());
// We replace thumbs with resizes so we can get the full-sized images
imageSource = imageSource.replace("thumbs", "resizes");
result.add(imageSource);
if (matcher.matches()) {
result = getUrlsFromThechive(doc);
} else {
result = getUrlsFromIDotThechive();
}
return result;
}
/**
 * Determines whether another page of results is available.
 *
 * For URLs matching {@code p1} (thechive.com posts) there is never a next page and
 * {@code null} is returned. For every other URL the uploads endpoint at {@code jsonUrl}
 * is queried with the current {@code nextSeed}; the response cookies replace the stored
 * {@code cookies}.
 *
 * @param doc the current page (not read by this method)
 * @return a new, empty {@link Document} whose base URI is this ripper's URL when the
 *         "uploads" array has at least one entry, otherwise {@code null}
 * @throws IOException if {@code nextSeed} is {@code null}, or if fetching or parsing
 *                     the JSON response fails for any reason
 */
@Override
public Document getNextPage(Document doc) throws IOException {
    // for pattern p1.
    if (p1.matcher(url.toExternalForm()).matches()) {
        return null;
    }
    if (nextSeed == null) {
        throw new IOException("No more pages.");
    }

    // check if next json has elements.
    JSONArray uploads;
    try {
        Response probe = Http.url(jsonUrl)
                .data("seed", nextSeed)
                .data("queryType", "by-username")
                .data("username", username)
                .ignoreContentType()
                .cookies(cookies)
                .response();
        cookies = probe.cookies();
        uploads = new JSONObject(probe.body()).getJSONArray("uploads");
    } catch (Exception e) {
        throw new IOException("Error fetching next page.", e);
    }

    if (uploads == null || uploads.length() == 0) {
        return null;
    }
    // The document carries no content; the page's URLs are fetched separately.
    return new Document(url.toString());
}
/**
 * Queues one URL for download, using the prefix derived from its index.
 *
 * @param url   the URL to download
 * @param index the index passed to {@code getPrefix} to build the prefix
 */
@Override
public void downloadURL(URL url, int index) {
    String prefix = getPrefix(index);
    addURLToDownload(url, prefix);
}
/**
 * Collects image and gif links from a thechive.com post page.
 *
 * Only {@code <script>} elements whose data contains "CHIVE_GALLERY_ITEMS" are
 * inspected. Every {@code <img ...>} tag found in such a script is unescaped
 * (backslashes removed) and re-parsed as HTML so its attributes can be read.
 *
 * @param doc the post page to scan
 * @return the links with any query string removed; "data-gifsrc" is preferred over
 *         "src" when present. Links that are empty after stripping are skipped.
 */
private List<String> getUrlsFromThechive(Document doc) {
    List<String> result = new ArrayList<>();
    for (Element script : doc.getElementsByTag("script")) {
        String data = script.data();
        if (!data.contains("CHIVE_GALLERY_ITEMS")) {
            continue;
        }

        /*
         * We add all the <img/> tags in a single StringBuilder and parse as HTML for
         * easy sorting of img/ gifs.
         */
        StringBuilder allImgTags = new StringBuilder();
        Matcher matcher = imagePattern.matcher(data);
        while (matcher.find()) {
            allImgTags.append(matcher.group(0).replaceAll("\\\\", ""));
        }

        // Now we parse and sort links.
        Document imgDoc = Jsoup.parse(allImgTags.toString());
        for (Element img : imgDoc.getElementsByTag("img")) {
            String link = img.hasAttr("data-gifsrc") ? img.attr("data-gifsrc") : img.attr("src");

            // Strip all GET parameters from the link (such as quality). indexOf returns -1
            // when there is no '?', so guard before calling substring to avoid a
            // StringIndexOutOfBoundsException on links without a query string.
            int queryStart = link.indexOf('?');
            if (queryStart >= 0) {
                link = link.substring(0, queryStart);
            }

            // An <img> without a usable source yields an empty string; nothing to download.
            if (!link.isEmpty()) {
                result.add(link);
            }
        }
    }
    return result;
}
/**
 * Fetches one batch of uploads for an i.thechive.com user and returns their media links.
 *
 * The uploads endpoint at {@code jsonUrl} is queried with the current {@code nextSeed}
 * and {@code username}; the response cookies replace the stored {@code cookies}. After a
 * successful parse, {@code nextSeed} is set to the "activityId" of the last upload in the
 * batch, or to {@code null} when the batch is empty.
 *
 * Fetch and parse failures are logged (with the exception) and not rethrown; the links
 * collected so far are returned.
 *
 * @return the media links of the batch, each prefixed with "https:"; possibly empty
 */
private List<String> getUrlsFromIDotThechive() {
    // check for pattern p2.
    List<String> result = new ArrayList<>();
    try {
        Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username")
                .data("username", username).ignoreContentType().cookies(cookies).response();
        cookies = response.cookies();
        JSONObject json = new JSONObject(response.body());
        JSONArray imgList = json.getJSONArray("uploads");
        nextSeed = null; // if no more images, nextSeed stays null
        for (int i = 0; i < imgList.length(); i++) {
            JSONObject img = imgList.getJSONObject(i);
            // NOTE(review): which JSON field holds the full-size asset for each mediaType
            // is taken as-is from the existing logic; confirm against the live API.
            if (img.getString("mediaType").equals("gif")) {
                result.add("https:" + img.getString("mediaUrlOverlay"));
            } else {
                result.add("https:" + img.getString("mediaGifFrameUrl"));
            }
            nextSeed = img.getString("activityId");
        }
    } catch (IOException e) {
        // Pass the exception so the log keeps the cause and stack trace.
        LOGGER.error("Unable to fetch JSON data for url: " + url, e);
    } catch (JSONException e) {
        LOGGER.error("JSON error while parsing data for url: " + url, e);
    }
    return result;
}
}

View File

@@ -26,9 +26,9 @@ package com.rarchives.ripme.tst.ripper.rippers;
import com.rarchives.ripme.ripper.rippers.ThechiveRipper;
import java.io.IOException;
import java.net.URL;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
//import org.jsoup.nodes.Attributes;
//import org.jsoup.nodes.Element;
//import org.jsoup.parser.Tag;
/**
*
@@ -41,40 +41,53 @@ public class ThechiveRipperTest extends RippersTest {
*
* @throws IOException
*/
public void theChiveRip() throws IOException {
ThechiveRipper ripper = new ThechiveRipper(new URL("https://thechive.com/2018/10/03/the-definitive-list-of-the-hottest-horror-movie-babes/"));
public void testTheChiveRip() throws IOException {
ThechiveRipper ripper = new ThechiveRipper(new URL(
"https://thechive.com/2019/03/16/beautiful-badasses-lookin-good-in-and-out-of-uniform-35-photos/"));
testRipper(ripper);
}
/**
 * Runs the ripper against a thechive.com post whose gallery consists of gifs.
 *
 * @throws IOException if the ripper cannot be constructed or the rip fails
 */
public void testTheChiveGif() throws IOException {
    URL gifPost = new URL("https://thechive.com/2019/03/14/dont-tease-me-just-squeeze-me-20-gifs/");
    testRipper(new ThechiveRipper(gifPost));
}
/*
//If anyone figures out how to get JSOUP Elements mocked up, we can use the following methods to test both jpeg + gif ripping.
public void testGifRip() throws IOException {
String elementInString = "<img width=\"500\" height=\"305\" \n"
+ "src=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-111.jpg?quality=85&amp;strip=info\" \n"
+ "class=\"attachment-gallery-item-full size-gallery-item-full gif-animate\" \n"
+ "alt=\"american mary crimson quill 111 The hottest horror movie villains ever according to science (18 Photos)\" \n"
+ "title=\"\" data-gifsrc=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif?w=500\">"
Element el = new Element(
new Tag("img"),
"",//URI
new Attributes());
String URL = ThechiveRipper.getImageSource(el);
assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif"));
* "i.thechive.com" test.
*/
public void testIDotThechive() throws IOException {
    // User page on i.thechive.com (the URL form matched by the ripper's second pattern).
    URL userPage = new URL("https://i.thechive.com/HHHoney");
    testRipper(new ThechiveRipper(userPage));
}
public void testGifRip() throws IOException {
String elementInString = "<img width=\"600\" height=\"409\" src=\"https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg?quality=85&amp;strip=info&amp;w=600\" \n"
+ "class=\"attachment-gallery-item-full size-gallery-item-full\" \n"
+ "alt=\"the definitive list of the hottest horror movie babes 11 The hottest horror movie villains ever according to science (18 Photos)\" title=\"\">";
Element el = new Element(
new Tag("img"),
"",//URI
new Attributes());
String URL = ThechiveRipper.getImageSource(el);
assertTrue(URL.equals("https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg"));
}
/*
*
* //If anyone figures out how to get JSOUP Elements mocked up, we can use the
* following methods to test both jpeg + gif ripping.
*
* public void testGifRip() throws IOException { String elementInString =
* "<img width=\"500\" height=\"305\" \n" +
* "src=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-111.jpg?quality=85&amp;strip=info\" \n"
* +
* "class=\"attachment-gallery-item-full size-gallery-item-full gif-animate\" \n"
* +
* "alt=\"american mary crimson quill 111 The hottest horror movie villains ever according to science (18 Photos)\" \n"
* +
* "title=\"\" data-gifsrc=\"https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif?w=500\">"
*
* Element el = new Element( new Tag("img"), "",//URI new Attributes()); String
* URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals(
* "https://thechive.files.wordpress.com/2018/10/american_mary_crimson_quill-1.gif"
* )); }
*
* public void testGifRip() throws IOException { String elementInString =
* "<img width=\"600\" height=\"409\" src=\"https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg?quality=85&amp;strip=info&amp;w=600\" \n"
* + "class=\"attachment-gallery-item-full size-gallery-item-full\" \n" +
* "alt=\"the definitive list of the hottest horror movie babes 11 The hottest horror movie villains ever according to science (18 Photos)\" title=\"\">"
* ; Element el = new Element( new Tag("img"), "",//URI new Attributes());
* String URL = ThechiveRipper.getImageSource(el); assertTrue(URL.equals(
* "https://thechive.files.wordpress.com/2018/10/the-definitive-list-of-the-hottest-horror-movie-babes-11.jpg"
* )); }
*/
}