From 77e9a15c31ac7eb1d3366ab03ccc9c50b55f710c Mon Sep 17 00:00:00 2001 From: Tushar Date: Wed, 16 Jan 2019 00:30:41 +0530 Subject: [PATCH 1/2] Improved luscious ripper for faster and complete rips. --- .../ripme/ripper/rippers/LusciousRipper.java | 155 +++++++++++------- .../ripper/rippers/LusciousRipperTest.java | 31 +++- 2 files changed, 119 insertions(+), 67 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java index 2ffddb70..28cd601b 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java @@ -13,79 +13,110 @@ import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import com.rarchives.ripme.ripper.AbstractHTMLRipper; +import com.rarchives.ripme.ripper.DownloadThreadPool; import com.rarchives.ripme.utils.Http; public class LusciousRipper extends AbstractHTMLRipper { + private static final int RETRY_COUNT = 5; // Keeping it high for read timeout exception. - public LusciousRipper(URL url) throws IOException { - super(url); - } + private Pattern p = Pattern.compile("^https?://(?:members.)?luscious\\.net/albums/([-_.0-9a-zA-Z]+).*$"); + private DownloadThreadPool lusciousThreadPool = new DownloadThreadPool("lusciousThreadPool"); - @Override - public String getDomain() { - return "luscious.net"; - } + public LusciousRipper(URL url) throws IOException { + super(url); + } - @Override - public String getHost() { - return "luscious"; - } + @Override + public String getDomain() { + return "luscious.net"; + } - @Override - public Document getFirstPage() throws IOException { - // "url" is an instance field of the superclass - Document page = Http.url(url).get(); - URL firstUrl = new URL("https://luscious.net" + page.select("div > div.item.thumbnail.ic_container > a").first().attr("href")); - LOGGER.info("First page is " + "https://luscious.net" + page.select("div > div.album_cover_item > a").first().attr("href")); - return Http.url(firstUrl).get(); - } + @Override + public String getHost() { + return "luscious"; + } - @Override - public List getURLsFromPage(Document page) { - List urls = new ArrayList<>(); - Elements urlElements = page.select(".icon-download"); - for (Element e : urlElements) { - urls.add(e.attr("href")); - } - - // This is here for pages with mp4s instead of images - String video_image = ""; - video_image = page.select("div > video > source").attr("src"); - if (!video_image.equals("")) { - urls.add(video_image); - } - return urls; - } + @Override + public Document getFirstPage() throws IOException { + // "url" is an instance field of the superclass + Document page = Http.url(url).get(); + LOGGER.info("First page is " + url); + return page; + } - @Override - public Document getNextPage(Document doc) throws IOException { - // Find next page - String nextPageUrl = "https://luscious.net" + doc.select("a.image_link[rel=next]").attr("href"); - // The more_like_this is here so we don't try to download the page that comes after the end of an album - if (nextPageUrl == "https://luscious.net" || - nextPageUrl.contains("more_like_this")) { - throw new IOException("No more pages"); - } + @Override + public List getURLsFromPage(Document page) { + List urls = new ArrayList<>(); + Elements urlElements = page.select("div.item.thumbnail.ic_container > a"); + for (Element e : urlElements) { + urls.add(e.attr("abs:href")); + } - return Http.url(nextPageUrl).get(); - } + return urls; + } - @Override - public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern - .compile("^https?://luscious\\.net/albums/([-_.0-9a-zA-Z]+).*$"); - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } - throw new MalformedURLException("Expected luscious.net URL format: " - + "luscious.net/albums/albumname - got " + url - + " instead"); - } + @Override + public Document getNextPage(Document doc) throws IOException { + // luscious sends xhr requests to nextPageUrl and appends new set of images to the current page while in browser. + // Simply GET the nextPageUrl also works. Therefore, we do this... + Element nextPageElement = doc.select("div#next_page > div > a").first(); + if (nextPageElement == null) { + throw new IOException("No next page found."); + } - @Override - public void downloadURL(URL url, int index) { - addURLToDownload(url, getPrefix(index)); - } + return Http.url(nextPageElement.attr("abs:href")).get(); + } + @Override + public String getGID(URL url) throws MalformedURLException { + Matcher m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + throw new MalformedURLException("Expected luscious.net URL format: " + + "luscious.net/albums/albumname \n members.luscious.net/albums/albumname - got " + url + " instead."); + } + + @Override + public void downloadURL(URL url, int index) { + lusciousThreadPool.addThread(new LusciousDownloadThread(url, index)); + } + + @Override + public DownloadThreadPool getThreadPool() { + return lusciousThreadPool; + } + + public class LusciousDownloadThread extends Thread { + private URL url; + private int index; + + public LusciousDownloadThread(URL url, int index) { + this.url = url; + this.index = index; + } + + @Override + public void run() { + try { + Document page = Http.url(url).retries(RETRY_COUNT).get(); + + String downloadUrl = page.select(".icon-download").attr("abs:href"); + if (downloadUrl.equals("")) { + // This is here for pages with mp4s instead of images. + downloadUrl = page.select("div > video > source").attr("src"); + if (!downloadUrl.equals("")) { + throw new IOException("Could not find download url for image or video."); + } + } + + //If a valid download url was found. + addURLToDownload(new URL(downloadUrl), getPrefix(index)); + + } catch (IOException e) { + LOGGER.error("Error downloadiong url " + url, e); + } + } + + } } diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java index 30526659..4235608a 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java @@ -6,9 +6,30 @@ import java.net.URL; import com.rarchives.ripme.ripper.rippers.LusciousRipper; public class LusciousRipperTest extends RippersTest { - public void testPahealRipper() throws IOException { - // a photo set - LusciousRipper ripper = new LusciousRipper(new URL("https://luscious.net/albums/h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609/")); - testRipper(ripper); - } + public void testPahealRipper() throws IOException { + // a photo set + LusciousRipper ripper = new LusciousRipper( + new URL("https://luscious.net/albums/h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609/")); + testRipper(ripper); + } + + public void testGetGID() throws IOException { + URL url = new URL("https://luscious.net/albums/h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609/"); + LusciousRipper ripper = new LusciousRipper(url); + assertEquals("h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609", ripper.getGID(url)); + } + + public void testGetNextPage() throws IOException { + URL multiPageAlbumUrl = new URL("https://luscious.net/albums/women-of-color_58/"); + LusciousRipper multiPageRipper = new LusciousRipper(multiPageAlbumUrl); + assert (multiPageRipper.getNextPage(multiPageRipper.getFirstPage()) != null); + + URL singlePageAlbumUrl = new URL("https://members.luscious.net/albums/bakaneko-navidarks_332097/"); + LusciousRipper singlePageRipper = new LusciousRipper(singlePageAlbumUrl); + try { + singlePageRipper.getNextPage(singlePageRipper.getFirstPage()); + } catch (IOException e) { + assertEquals("No next page found.", e.getMessage()); + } + } } \ No newline at end of file From 27973663197c3440a0c8f0af8e0ffa77998c3bd1 Mon Sep 17 00:00:00 2001 From: Tushar Date: Mon, 21 Jan 2019 09:56:04 +0530 Subject: [PATCH 2/2] Fixed indentation. --- .../ripme/ripper/rippers/LusciousRipper.java | 166 +++++++++--------- .../ripper/rippers/LusciousRipperTest.java | 46 ++--- 2 files changed, 106 insertions(+), 106 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java index 28cd601b..e56f8dbc 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/LusciousRipper.java @@ -17,106 +17,106 @@ import com.rarchives.ripme.ripper.DownloadThreadPool; import com.rarchives.ripme.utils.Http; public class LusciousRipper extends AbstractHTMLRipper { - private static final int RETRY_COUNT = 5; // Keeping it high for read timeout exception. + private static final int RETRY_COUNT = 5; // Keeping it high for read timeout exception. - private Pattern p = Pattern.compile("^https?://(?:members.)?luscious\\.net/albums/([-_.0-9a-zA-Z]+).*$"); - private DownloadThreadPool lusciousThreadPool = new DownloadThreadPool("lusciousThreadPool"); + private Pattern p = Pattern.compile("^https?://(?:members.)?luscious\\.net/albums/([-_.0-9a-zA-Z]+).*$"); + private DownloadThreadPool lusciousThreadPool = new DownloadThreadPool("lusciousThreadPool"); - public LusciousRipper(URL url) throws IOException { - super(url); - } + public LusciousRipper(URL url) throws IOException { + super(url); + } - @Override - public String getDomain() { - return "luscious.net"; - } + @Override + public String getDomain() { + return "luscious.net"; + } - @Override - public String getHost() { - return "luscious"; - } + @Override + public String getHost() { + return "luscious"; + } - @Override - public Document getFirstPage() throws IOException { - // "url" is an instance field of the superclass - Document page = Http.url(url).get(); - LOGGER.info("First page is " + url); - return page; - } + @Override + public Document getFirstPage() throws IOException { + // "url" is an instance field of the superclass + Document page = Http.url(url).get(); + LOGGER.info("First page is " + url); + return page; + } - @Override - public List getURLsFromPage(Document page) { - List urls = new ArrayList<>(); - Elements urlElements = page.select("div.item.thumbnail.ic_container > a"); - for (Element e : urlElements) { - urls.add(e.attr("abs:href")); - } + @Override + public List getURLsFromPage(Document page) { + List urls = new ArrayList<>(); + Elements urlElements = page.select("div.item.thumbnail.ic_container > a"); + for (Element e : urlElements) { + urls.add(e.attr("abs:href")); + } - return urls; - } + return urls; + } - @Override - public Document getNextPage(Document doc) throws IOException { - // luscious sends xhr requests to nextPageUrl and appends new set of images to the current page while in browser. - // Simply GET the nextPageUrl also works. Therefore, we do this... - Element nextPageElement = doc.select("div#next_page > div > a").first(); - if (nextPageElement == null) { - throw new IOException("No next page found."); - } + @Override + public Document getNextPage(Document doc) throws IOException { + // luscious sends xhr requests to nextPageUrl and appends new set of images to the current page while in browser. + // Simply GET the nextPageUrl also works. Therefore, we do this... + Element nextPageElement = doc.select("div#next_page > div > a").first(); + if (nextPageElement == null) { + throw new IOException("No next page found."); + } - return Http.url(nextPageElement.attr("abs:href")).get(); - } + return Http.url(nextPageElement.attr("abs:href")).get(); + } - @Override - public String getGID(URL url) throws MalformedURLException { - Matcher m = p.matcher(url.toExternalForm()); - if (m.matches()) { - return m.group(1); - } - throw new MalformedURLException("Expected luscious.net URL format: " - + "luscious.net/albums/albumname \n members.luscious.net/albums/albumname - got " + url + " instead."); - } + @Override + public String getGID(URL url) throws MalformedURLException { + Matcher m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); + } + throw new MalformedURLException("Expected luscious.net URL format: " + + "luscious.net/albums/albumname \n members.luscious.net/albums/albumname - got " + url + " instead."); + } - @Override - public void downloadURL(URL url, int index) { - lusciousThreadPool.addThread(new LusciousDownloadThread(url, index)); - } + @Override + public void downloadURL(URL url, int index) { + lusciousThreadPool.addThread(new LusciousDownloadThread(url, index)); + } - @Override - public DownloadThreadPool getThreadPool() { - return lusciousThreadPool; - } + @Override + public DownloadThreadPool getThreadPool() { + return lusciousThreadPool; + } - public class LusciousDownloadThread extends Thread { - private URL url; - private int index; + public class LusciousDownloadThread extends Thread { + private URL url; + private int index; - public LusciousDownloadThread(URL url, int index) { - this.url = url; - this.index = index; - } + public LusciousDownloadThread(URL url, int index) { + this.url = url; + this.index = index; + } - @Override - public void run() { - try { - Document page = Http.url(url).retries(RETRY_COUNT).get(); + @Override + public void run() { + try { + Document page = Http.url(url).retries(RETRY_COUNT).get(); - String downloadUrl = page.select(".icon-download").attr("abs:href"); - if (downloadUrl.equals("")) { - // This is here for pages with mp4s instead of images. - downloadUrl = page.select("div > video > source").attr("src"); - if (!downloadUrl.equals("")) { - throw new IOException("Could not find download url for image or video."); - } - } + String downloadUrl = page.select(".icon-download").attr("abs:href"); + if (downloadUrl.equals("")) { + // This is here for pages with mp4s instead of images. + downloadUrl = page.select("div > video > source").attr("src"); + if (!downloadUrl.equals("")) { + throw new IOException("Could not find download url for image or video."); + } + } - //If a valid download url was found. - addURLToDownload(new URL(downloadUrl), getPrefix(index)); + //If a valid download url was found. + addURLToDownload(new URL(downloadUrl), getPrefix(index)); - } catch (IOException e) { - LOGGER.error("Error downloadiong url " + url, e); - } - } + } catch (IOException e) { + LOGGER.error("Error downloadiong url " + url, e); + } + } - } + } } diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java index 4235608a..f8da140c 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/LusciousRipperTest.java @@ -6,30 +6,30 @@ import java.net.URL; import com.rarchives.ripme.ripper.rippers.LusciousRipper; public class LusciousRipperTest extends RippersTest { - public void testPahealRipper() throws IOException { - // a photo set - LusciousRipper ripper = new LusciousRipper( - new URL("https://luscious.net/albums/h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609/")); - testRipper(ripper); - } + public void testPahealRipper() throws IOException { + // a photo set + LusciousRipper ripper = new LusciousRipper( + new URL("https://luscious.net/albums/h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609/")); + testRipper(ripper); + } - public void testGetGID() throws IOException { - URL url = new URL("https://luscious.net/albums/h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609/"); - LusciousRipper ripper = new LusciousRipper(url); - assertEquals("h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609", ripper.getGID(url)); - } + public void testGetGID() throws IOException { + URL url = new URL("https://luscious.net/albums/h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609/"); + LusciousRipper ripper = new LusciousRipper(url); + assertEquals("h-na-alice-wa-suki-desu-ka-do-you-like-alice-when_321609", ripper.getGID(url)); + } - public void testGetNextPage() throws IOException { - URL multiPageAlbumUrl = new URL("https://luscious.net/albums/women-of-color_58/"); - LusciousRipper multiPageRipper = new LusciousRipper(multiPageAlbumUrl); - assert (multiPageRipper.getNextPage(multiPageRipper.getFirstPage()) != null); + public void testGetNextPage() throws IOException { + URL multiPageAlbumUrl = new URL("https://luscious.net/albums/women-of-color_58/"); + LusciousRipper multiPageRipper = new LusciousRipper(multiPageAlbumUrl); + assert (multiPageRipper.getNextPage(multiPageRipper.getFirstPage()) != null); - URL singlePageAlbumUrl = new URL("https://members.luscious.net/albums/bakaneko-navidarks_332097/"); - LusciousRipper singlePageRipper = new LusciousRipper(singlePageAlbumUrl); - try { - singlePageRipper.getNextPage(singlePageRipper.getFirstPage()); - } catch (IOException e) { - assertEquals("No next page found.", e.getMessage()); - } - } + URL singlePageAlbumUrl = new URL("https://members.luscious.net/albums/bakaneko-navidarks_332097/"); + LusciousRipper singlePageRipper = new LusciousRipper(singlePageAlbumUrl); + try { + singlePageRipper.getNextPage(singlePageRipper.getFirstPage()); + } catch (IOException e) { + assertEquals("No next page found.", e.getMessage()); + } + } } \ No newline at end of file