diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java index 7d1a38bc..3c9d751d 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ThechiveRipper.java @@ -7,13 +7,31 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; + +import org.json.JSONArray; +import org.json.JSONException; +import org.json.JSONObject; +import org.jsoup.Jsoup; +import org.jsoup.Connection.Response; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; public class ThechiveRipper extends AbstractHTMLRipper { + private Pattern p1 = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$"); + private Pattern imagePattern = Pattern.compile(""); + + // i.thechive.com specific variables. + private Pattern p2 = Pattern.compile("^https?://i.thechive.com/([0-9a-zA-Z_]+)"); + private String jsonUrl = "https://i.thechive.com/rest/uploads"; + private Map cookies = new HashMap<>(); + private String nextSeed = ""; + private String username = ""; public ThechiveRipper(URL url) throws IOException { super(url); @@ -21,7 +39,12 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public String getHost() { - return "thechive"; + Matcher m1 = p1.matcher(url.toExternalForm()); + if (m1.matches()) { + return "thechive"; + } else { + return "i.thechive"; // for suitable album title. + } } @Override @@ -31,14 +54,20 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://thechive.com/[0-9]*/[0-9]*/[0-9]*/([a-zA-Z0-9_\\-]*)/?$"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - boolean isTag = false; - return m.group(1); + + Matcher m1 = p1.matcher(url.toExternalForm()); + if (m1.matches()) { + return m1.group(1); } + + Matcher m2 = p2.matcher(url.toExternalForm()); + if (m2.matches()) { + username = m2.group(1); + return username; + } + throw new MalformedURLException("Expected thechive.com URL format: " - + "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ - got " + url + " instead"); + + "thechive.com/YEAR/MONTH/DAY/POSTTITLE/ OR i.thechive.com/username, got " + url + " instead."); } @Override @@ -49,27 +78,148 @@ public class ThechiveRipper extends AbstractHTMLRipper { @Override public List getURLsFromPage(Document doc) { - List result = new ArrayList<>(); - for (Element el : doc.select("img.attachment-gallery-item-full")) { - String imageSource; - if (el.attr("data-gifsrc").isEmpty()) { //If it's not a gif - imageSource = el.attr("src"); - } else { //If it is a gif - imageSource = el.attr("data-gifsrc") //from data-gifsrc attribute - .replaceAll("\\?w=\\d{3}", ""); //remove the width modifier at the end to get highest resolution - //May need to replace the regex's {3} later on if website starts giving higher-res photos by default. - } + List result; + Matcher matcher = p1.matcher(url.toExternalForm()); - // We replace thumbs with resizes so we can the full sized images - imageSource = imageSource.replace("thumbs", "resizes"); - result.add(imageSource); + if (matcher.matches()) { + // for url type: thechive.com/YEAR/MONTH/DAY/POSTTITLE/ + result = getUrlsFromThechive(doc); + } else { + // for url type: i.thechive.com/username + result = getUrlsFromIDotThechive(); } return result; } + @Override + public Document getNextPage(Document doc) throws IOException { + Matcher matcher = p1.matcher(url.toExternalForm()); + + if (matcher.matches()) { + // url type thechive.com/YEAR/MONTH/DAY/POSTTITLE/ has a single page. + return null; + } else { + if (nextSeed == null) { + throw new IOException("No more pages."); + } + } + + // Following try block checks if the next JSON object has images or not. + // This is done to avoid IOException in rip() method, caused when + // getURLsFromPage() returns empty list. + JSONArray imgList; + try { + Response response = Http.url(jsonUrl).data("seed", nextSeed).data("queryType", "by-username") + .data("username", username).ignoreContentType().cookies(cookies).response(); + cookies = response.cookies(); + JSONObject json = new JSONObject(response.body()); + imgList = json.getJSONArray("uploads"); + } catch (Exception e) { + throw new IOException("Error fetching next page.", e); + } + + if (imgList != null && imgList.length() > 0) { + // Pass empty document as it is of no use for thechive.com/userName url type. + return new Document(url.toString()); + } else { + // Return null as this is last page. + return null; + } + } + @Override public void downloadURL(URL url, int index) { addURLToDownload(url, getPrefix(index)); } + private List getUrlsFromThechive(Document doc) { + /* + * The image urls are stored in a