From 822ff72a58ea674bbb09241f0da29185b6b61e3c Mon Sep 17 00:00:00 2001
From: Tushar
Date: Mon, 15 Apr 2019 18:25:06 +0530
Subject: [PATCH 1/2] Added support for comicextra.

---
 .../ripper/rippers/ComicextraRipper.java      | 189 ++++++++++++++++++
 .../ripper/rippers/ComicextraRipperTest.java  |  21 +++
 2 files changed, 210 insertions(+)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/ComicextraRipperTest.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
new file mode 100644
index 00000000..c8cca3a9
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
@@ -0,0 +1,189 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import com.rarchives.ripme.ripper.AbstractHTMLRipper;
+import com.rarchives.ripme.utils.Http;
+
+/**
+ * Ripper for comicextra.com. Accepts either a comic URL (every chapter linked from the comic
+ * page is ripped) or a single chapter URL.
+ *
+ * @author Tushar
+ */
+public class ComicextraRipper extends AbstractHTMLRipper {
+
+    private static final String FILE_NAME = "page";
+
+    // Compiled once and shared; the dots in the host name are escaped so they match literally.
+    private static final Pattern COMIC_URL_PATTERN =
+            Pattern.compile("https:\\/\\/www\\.comicextra\\.com\\/comic\\/([A-Za-z0-9_-]+)");
+    private static final Pattern CHAPTER_URL_PATTERN = Pattern.compile(
+            "https:\\/\\/www\\.comicextra\\.com\\/([A-Za-z0-9_-]+)\\/([A-Za-z0-9_-]+)(?:\\/full)?");
+    private UrlType urlType = UrlType.UNKNOWN;
+    private List<String> chaptersList = null;
+    private int chapterIndex = -1; // index for the chaptersList, useful in getting the next page.
+    private int imageIndex = 0; // image index for each chapter images.
+
+    public ComicextraRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    @Override
+    protected String getDomain() {
+        return "comicextra.com";
+    }
+
+    @Override
+    public String getHost() {
+        return "comicextra";
+    }
+
+    /**
+     * Returns the comic name taken from the URL and records, in urlType, whether the URL points
+     * to a whole comic or to a single chapter.
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Matcher m1 = COMIC_URL_PATTERN.matcher(url.toExternalForm());
+        if (m1.matches()) {
+            // URL is of comic( https://www.comicextra.com/comic/the-punisher-frank-castle-max).
+            urlType = UrlType.COMIC;
+            return m1.group(1);
+        }
+
+        Matcher m2 = CHAPTER_URL_PATTERN.matcher(url.toExternalForm());
+        if (m2.matches()) {
+            // URL is of chapter( https://www.comicextra.com/the-punisher-frank-castle-max/chapter-75).
+            urlType = UrlType.CHAPTER;
+            return m2.group(1);
+        }
+
+        throw new MalformedURLException(
+                "Expected comicextra.com url of type: https://www.comicextra.com/comic/some-comic-name\n"
+                        + " or https://www.comicextra.com/some-comic-name/chapter-001 got " + url
+                        + " instead");
+    }
+
+    /**
+     * Loads the first chapter page. For a comic URL the chapter list is collected first.
+     *
+     * @throws IOException if a page cannot be fetched, the comic lists no chapters, or the url
+     *         type is unknown
+     */
+    @Override
+    protected Document getFirstPage() throws IOException {
+        Document doc = null;
+
+        switch (urlType) {
+            case COMIC:
+                // For COMIC type url we extract the urls of each chapters and store them in chapters.
+                chaptersList = new ArrayList<>();
+                Document comicPage = Http.url(url).get();
+                Elements elements = comicPage.select("div.episode-list a");
+                for (Element e : elements) {
+                    chaptersList.add(getCompleteChapterUrl(e.attr("abs:href")));
+                }
+                if (chaptersList.isEmpty()) {
+                    // Fail with a clear message instead of IndexOutOfBoundsException below.
+                    throw new IOException("No chapters found at " + url);
+                }
+
+                // Set the first chapter from the chapterList as the doc.
+                chapterIndex = 0;
+                doc = Http.url(chaptersList.get(chapterIndex)).get();
+                break;
+            case CHAPTER:
+                doc = Http.url(url).get();
+                break;
+            case UNKNOWN:
+            default:
+                throw new IOException("Unknown url type encountered.");
+        }
+
+        return doc;
+    }
+
+    @Override
+    public Document getNextPage(Document doc) throws IOException {
+        if (urlType == UrlType.COMIC) {
+            ++chapterIndex;
+            imageIndex = 0; // Resetting the imagesIndex so that images prefix within each chapter starts from '001_'.
+            return Http.url(chaptersList.get(chapterIndex)).get();
+        }
+
+        return super.getNextPage(doc);
+    }
+
+    @Override
+    protected List<String> getURLsFromPage(Document page) {
+        List<String> urls = new ArrayList<>();
+
+        if (urlType == UrlType.COMIC || urlType == UrlType.CHAPTER) {
+            Elements images = page.select("img.chapter_img");
+            for (Element img : images) {
+                // abs: resolves a relative src against the page's base URI.
+                urls.add(img.attr("abs:src"));
+            }
+        }
+
+        return urls;
+    }
+
+    @Override
+    protected void downloadURL(URL url, int index) {
+        String subdirectory = getSubDirectoryName();
+        String prefix = getPrefix(++imageIndex);
+
+        addURLToDownload(url, prefix, subdirectory, null, null, FILE_NAME, null, Boolean.TRUE);
+    }
+
+    /*
+     * This function appends /full at the end of the chapters url to get all the images for the
+     * chapter in the same Document.
+     */
+    private String getCompleteChapterUrl(String chapterUrl) {
+        if (!chapterUrl.endsWith("/full")) {
+            chapterUrl = chapterUrl + "/full";
+        }
+        return chapterUrl;
+    }
+
+    /*
+     * This functions returns sub folder name for the current chapter.
+     */
+    private String getSubDirectoryName() {
+        String subDirectory = "";
+
+        if (urlType == UrlType.COMIC) {
+            Matcher m = CHAPTER_URL_PATTERN.matcher(chaptersList.get(chapterIndex));
+            if (m.matches()) {
+                subDirectory = m.group(2);
+            }
+        }
+
+        if (urlType == UrlType.CHAPTER) {
+            Matcher m = CHAPTER_URL_PATTERN.matcher(url.toExternalForm());
+            if (m.matches()) {
+                subDirectory = m.group(2);
+            }
+        }
+
+        return subDirectory;
+    }
+
+    /*
+     * Enum to classify different types of urls.
+     */
+    private enum UrlType {
+        COMIC, CHAPTER, UNKNOWN
+    }
+}
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ComicextraRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ComicextraRipperTest.java
new file mode 100644
index 00000000..a3d66d50
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ComicextraRipperTest.java
@@ -0,0 +1,21 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import com.rarchives.ripme.ripper.rippers.ComicextraRipper;
+
+public class ComicextraRipperTest extends RippersTest {
+
+    public void testComicUrl() throws IOException {
+        URL url = new URL("https://www.comicextra.com/comic/karma-police");
+        ComicextraRipper ripper = new ComicextraRipper(url);
+        testRipper(ripper);
+    }
+
+    public void testChapterUrl() throws IOException {
+        URL url = new URL("https://www.comicextra.com/v-for-vendetta/chapter-1");
+        ComicextraRipper ripper = new ComicextraRipper(url);
+        testRipper(ripper);
+    }
+
+}

From b36fd1aa6aa4a28e9367137d3be0d0818fa84b85 Mon Sep 17 00:00:00 2001
From: Tushar
Date: Mon, 15 Apr 2019 18:53:42 +0530
Subject: [PATCH 2/2] Fixed an IndexOutOfBoundException while fetching the next page.

---
 .../com/rarchives/ripme/ripper/rippers/ComicextraRipper.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
index c8cca3a9..08b27a76 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ComicextraRipper.java
@@ -117,7 +117,9 @@ public class ComicextraRipper extends AbstractHTMLRipper {
         if (urlType == UrlType.COMIC) {
             ++chapterIndex;
             imageIndex = 0; // Resetting the imagesIndex so that images prefix within each chapter starts from '001_'.
-            return Http.url(chaptersList.get(chapterIndex)).get();
+            if (chapterIndex < chaptersList.size()) {
+                return Http.url(chaptersList.get(chapterIndex)).get();
+            }
         }
 
         return super.getNextPage(doc);