// Patch overview (unified git diff; the text below is kept verbatim).
//
// What this patch changes:
//  * DownloadThreadPool: adds a constructor taking a thread pool name and moves
//    the setup into a private initialize(String) helper; the no-arg constructor
//    now delegates with the name "Main". The name is only used in the debug log
//    message; the pool size still comes from the "threads.size" config value
//    (default 10).
//  * ImgurRipper: adds a TODO comment about checking a cache URL when an album
//    is empty. No code change.
//  * MotherlessRipper (new file): sanitizes gallery URLs to
//    http://motherless.com/G<id>, fetches the gallery page with jsoup, and for
//    each "div.thumb a.img-container" element queues a MotherlessImageThread on
//    its own DownloadThreadPool. Each thread loads the image page, extracts the
//    value assigned to __fileurl with a regex, and passes it to
//    addURLToDownload with a zero-padded index prefix ("%03d_").
//    rip() waits on its own pool first and then calls waitForThreads().
//  * RippersTest: adds the DOWNLOAD_CONTENT flag (false in this patch).
//  * ImgurRipperTest / InstagramRipperTest / TwitterRipperTest and the new
//    MotherlessRipperTest: return immediately when DOWNLOAD_CONTENT is false.
//
// NOTE(review): MotherlessRipper constructs its pool with the no-arg
//   DownloadThreadPool constructor, so its pool is logged as "Main"; the named
//   constructor added in this same patch is not used here.
// NOTE(review): the tests use the Java `assert` keyword, not a JUnit assertion;
//   it is only evaluated when the JVM runs with assertions enabled (-ea) --
//   confirm how the test runner is configured.
// NOTE(review): DOWNLOAD_CONTENT is a public final instance field initialized to
//   false, so every gated test body is skipped until the source is edited;
//   presumably intentional to avoid network access during builds -- confirm.
// NOTE(review): getGID and MotherlessImageThread.run compile their regex
//   Pattern on every call; hoisting to static final constants looks safe.
// NOTE(review): image page URLs are built as "http://" + DOMAIN + href, which
//   assumes the href attribute is a site-relative path -- verify against the
//   actual page markup.
// NOTE(review): this copy of the patch has lost its original line breaks, so it
//   will not apply as-is; restore the newlines before using git apply.
//
diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java b/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java index e0a93404..56e0281f 100644 --- a/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java +++ b/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java @@ -14,8 +14,16 @@ public class DownloadThreadPool { private ExecutorService threadPool = null; public DownloadThreadPool() { + initialize("Main"); + } + + public DownloadThreadPool(String threadPoolName) { + initialize(threadPoolName); + } + + private void initialize(String threadPoolName) { int threads = Utils.getConfigInteger("threads.size", 10); - logger.debug("Initializing thread pool with " + threads + " threads"); + logger.debug("Initializing " + threadPoolName + " thread pool with " + threads + " threads"); threadPool = Executors.newFixedThreadPool(threads); } diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java index b83f3a75..78a18798 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java @@ -143,6 +143,9 @@ public class ImgurRipper extends AbstractRipper { } } + // TODO If album is empty, use this to check for cached images: + // http://i.rarchives.com/search.cgi?cache=http://imgur.com/a/albumID + // At the least, get the thumbnails. logger.info("[!] 
Falling back to elemental retrieval method"); // Fall back to parsing HTML elements diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/MotherlessRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/MotherlessRipper.java new file mode 100644 index 00000000..5f827106 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/MotherlessRipper.java @@ -0,0 +1,111 @@ +package com.rarchives.ripme.ripper.rippers; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import com.rarchives.ripme.ripper.AbstractRipper; +import com.rarchives.ripme.ripper.DownloadThreadPool; + +public class MotherlessRipper extends AbstractRipper { + + private static final String DOMAIN = "motherless.com", + HOST = "motherless"; + private static final Logger logger = Logger.getLogger(MotherlessRipper.class); + private static final String USER_AGENT = + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:27.0) Gecko/20100101 Firefox/27.0"; + + private DownloadThreadPool motherlessThreadPool; + + public MotherlessRipper(URL url) throws IOException { + super(url); + motherlessThreadPool = new DownloadThreadPool(); + } + + @Override + public boolean canRip(URL url) { + return url.getHost().endsWith(DOMAIN); + } + + @Override + public String getHost() { + return HOST; + } + + @Override + public URL sanitizeURL(URL url) throws MalformedURLException { + String gid = getGID(url); + URL newURL = new URL("http://motherless.com/G" + gid); + logger.debug("Sanitized URL from " + url + " to " + newURL); + return newURL; + } + + @Override + public String getGID(URL url) throws MalformedURLException { + Pattern p = Pattern.compile("^https?://(www\\.)?motherless\\.com/G([A-Z0-9]{6,8}).*$"); + Matcher m = p.matcher(url.toExternalForm()); + if 
(!m.matches()) { + throw new MalformedURLException("Expected URL format: http://motherless.com/GXXXXXXXX"); + } + return m.group(m.groupCount()); + } + + @Override + public void rip() throws IOException { + int index = 0; + logger.info("[ ] Retrieving " + this.url.toExternalForm()); + Document doc = Jsoup.connect(this.url.toExternalForm()) + .userAgent(USER_AGENT) + .get(); + for (Element thumb : doc.select("div.thumb a.img-container")) { + URL url = new URL("http://" + DOMAIN + thumb.attr("href")); + index += 1; + // Create thread for finding image at "url" page + MotherlessImageThread mit = new MotherlessImageThread(url, index); + motherlessThreadPool.addThread(mit); + } + motherlessThreadPool.waitForThreads(); + waitForThreads(); + } + + /** + * Helper class to find and download images found on "image" pages + */ + private class MotherlessImageThread extends Thread { + private URL url; + private int index; + + public MotherlessImageThread(URL url, int index) { + super(); + this.url = url; + this.index = index; + } + + @Override + public void run() { + try { + Document doc = Jsoup.connect(this.url.toExternalForm()) + .userAgent(USER_AGENT) + .get(); + Pattern p = Pattern.compile("^.*__fileurl = '([^']{1,})';.*$", Pattern.DOTALL); + Matcher m = p.matcher(doc.outerHtml()); + if (m.matches()) { + String file = m.group(1); + addURLToDownload(new URL(file), String.format("%03d_", index)); + } else { + logger.warn("[!] could not find '__fileurl' at " + url); + } + } catch (IOException e) { + logger.error("[!] 
Exception while loading/parsing " + this.url, e); + } + } + } + +} diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java index 413f1639..6f2cb276 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java @@ -55,6 +55,9 @@ public class ImgurRipperTest extends RippersTest { } public void testImgurAlbums() throws IOException { + if (!DOWNLOAD_CONTENT) { + return; + } List contentURLs = new ArrayList(); // URLs that should return more than 1 image contentURLs.add(new URL("http://imgur.com/a/hqJIu")); // Vertical layout diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java index e0883851..e1b95436 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java @@ -10,6 +10,9 @@ import com.rarchives.ripme.ripper.rippers.InstagramRipper; public class InstagramRipperTest extends RippersTest { public void testInstagramAlbums() throws IOException { + if (!DOWNLOAD_CONTENT) { + return; + } List contentURLs = new ArrayList(); contentURLs.add(new URL("http://instagram.com/feelgoodincc#")); for (URL url : contentURLs) { diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MotherlessRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MotherlessRipperTest.java new file mode 100644 index 00000000..c4047680 --- /dev/null +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/MotherlessRipperTest.java @@ -0,0 +1,36 @@ +package com.rarchives.ripme.tst.ripper.rippers; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; + +import 
com.rarchives.ripme.ripper.rippers.MotherlessRipper; + +public class MotherlessRipperTest extends RippersTest { + + public void testMotherlessAlbums() throws IOException { + if (!DOWNLOAD_CONTENT) { + return; + } + List contentURLs = new ArrayList(); + + // Image album + contentURLs.add(new URL("http://motherless.com/G4DAA18D")); + // Video album + contentURLs.add(new URL("http://motherless.com/GFD0F537")); + + for (URL url : contentURLs) { + try { + MotherlessRipper ripper = new MotherlessRipper(url); + ripper.rip(); + assert(ripper.getWorkingDir().listFiles().length > 1); + deleteDir(ripper.getWorkingDir()); + } catch (Exception e) { + e.printStackTrace(); + fail("Error while ripping URL " + url + ": " + e.getMessage()); + } + } + } + +} diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java index 8d7eb344..c4ca7bed 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java @@ -6,6 +6,9 @@ import junit.framework.TestCase; public class RippersTest extends TestCase { + // Flag for avoiding downloading content with every unit test + public final boolean DOWNLOAD_CONTENT = false; + public void testNothing() { // Avoid complaints about no test cases in this file. 
assert(true); diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java index 094c3f42..f42c10d1 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/TwitterRipperTest.java @@ -10,6 +10,9 @@ import com.rarchives.ripme.ripper.rippers.TwitterRipper; public class TwitterRipperTest extends RippersTest { public void testTwitterAlbums() throws IOException { + if (!DOWNLOAD_CONTENT) { + return; + } List contentURLs = new ArrayList(); //contentURLs.add(new URL("https://twitter.com/danngamber01/media")); contentURLs.add(new URL("https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd"));