From 71652ecb00bb5ae1ffaf6ea76897a9b4f873c0b1 Mon Sep 17 00:00:00 2001
From: Edvin Boul
Date: Mon, 1 Jul 2019 13:19:54 +0300
Subject: [PATCH 1/2] Added Meituri Ripper [NSFW]

---
 .../ripme/ripper/rippers/MeituriRipper.java   | 91 +++++++++++++++++++
 .../tst/ripper/rippers/MeituriRipperTest.java | 19 ++++
 2 files changed, 110 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/MeituriRipperTest.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java
new file mode 100644
index 00000000..8855846a
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java
@@ -0,0 +1,91 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.utils.Http;
+
+public class MeituriRipper extends AbstractHTMLRipper {
+    public MeituriRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    @Override
+    public String getHost() {
+        return "meituri";
+    }
+
+    @Override
+    public String getDomain() {
+        return "meituri.com";
+    }
+
+    // Album ID captured by getGID(); getURLsFromPage() uses it to build image URLs
+    String albumID = "";
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        // without escape
+        // ^https?://[w.]*meituri\.com/a/([0-9]+)/$
+        // https://www.meituri.com/a/14449/
+        // group 1 is 14449
+        Pattern p = Pattern.compile("^https?://[w.]*meituri\\.com/a/([0-9]+)/$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            albumID = m.group(1);
+            return m.group(1);
+        }
+        throw new MalformedURLException(
+                "Expected meituri.com URL format: " + "meituri.com/a/albumid/ - got " + url + " instead");
+    }
+
+    @Override
+    public Document getFirstPage() throws IOException {
+        return Http.url(url).get();
+    }
+
+    @Override
+    public List<String> getURLsFromPage(Document doc) {
+        List<String> imageURLs = new ArrayList<>();
+        // Get number of images from the page
+        // Then generate links according to that
+        String numOfImages = "";
+        // A very ugly way of getting "图片数量: 55P" from paragraphs
+        // 3rd p in div.tuji
+        int n = 0;
+        for (Element para : doc.select("div.tuji > p")) {
+            // 图片数量: 55P
+            if (n == 2) {
+                numOfImages = para.toString();
+            }
+            n++;
+        }
+        // Take the digits in front of "P"; fall back to 0 images when no count is present
+        Matcher countMatcher = Pattern.compile("([0-9]+)P").matcher(numOfImages);
+        // "<p>图片数量: 55P</p>" -> "55" -> 55
+        int actualNumOfImages = countMatcher.find() ? Integer.parseInt(countMatcher.group(1)) : 0;
+
+        // Base URL: http://ii.hywly.com/a/1/albumid/imgnum.jpg
+        String baseURL = "http://ii.hywly.com/a/1/" + albumID + "/";
+
+        // Loop through and add images to the URL list
+        for (int i = 1; i <= actualNumOfImages; i++) {
+            imageURLs.add(baseURL + i + ".jpg");
+        }
+        return imageURLs;
+    }
+
+    @Override
+    public void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index));
+    }
+}
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MeituriRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MeituriRipperTest.java
new file mode 100644
index 00000000..a8505590
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MeituriRipperTest.java
@@ -0,0 +1,19 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+
+import com.rarchives.ripme.ripper.rippers.MeituriRipper;
+
+public class MeituriRipperTest extends RippersTest {
+    public void testMeituriRip() throws IOException {
+        MeituriRipper ripper = new MeituriRipper(new URL("https://www.meituri.com/a/14449/"));
+        testRipper(ripper);
+    }
+
+    public void testGetGID() throws IOException {
+        URL url = new URL("https://www.meituri.com/a/14449/");
+        MeituriRipper ripper = new MeituriRipper(url);
+        assertEquals("14449", ripper.getGID(url));
+    }
+}

From ec22b13cc3c4e6a3cabeba1e4631eca1fc679063 Mon Sep 17 00:00:00 2001
From: Edvin Boul
Date: Mon, 1 Jul 2019 13:28:40 +0300
Subject: [PATCH 2/2] Regex change to also match pagination

---
 .../com/rarchives/ripme/ripper/rippers/MeituriRipper.java    | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java
index 8855846a..4e39a985 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/MeituriRipper.java
@@ -35,10 +35,11 @@ public class MeituriRipper extends AbstractHTMLRipper {
     @Override
     public String getGID(URL url) throws MalformedURLException {
         // without escape
-        // ^https?://[w.]*meituri\.com/a/([0-9]+)/$
+        // ^https?://[w.]*meituri\.com/a/([0-9]+)/(?:[0-9]+\.html)?$
         // https://www.meituri.com/a/14449/
+        // also matches https://www.meituri.com/a/14449/3.html etc. (optional "N.html" page suffix)
         // group 1 is 14449
-        Pattern p = Pattern.compile("^https?://[w.]*meituri\\.com/a/([0-9]+)/$");
+        Pattern p = Pattern.compile("^https?://[w.]*meituri\\.com/a/([0-9]+)/(?:[0-9]+\\.html)?$");
         Matcher m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             albumID = m.group(1);