From e2bb412d9f0e542abd8b28b8f723cd260685faab Mon Sep 17 00:00:00 2001
From: 4pr0n
Date: Mon, 3 Mar 2014 00:44:07 -0800
Subject: [PATCH] Added instagram ripper, integration tests. Also fixed parts of the imgur ripper.

---
 .../ripme/ripper/AbstractRipper.java          |  10 +-
 .../ripme/ripper/rippers/ImgurRipper.java     |  25 +++-
 .../ripme/ripper/rippers/InstagramRipper.java | 170 ++++++++++++++++++
 .../rarchives/ripme/{ => tst}/AppTest.java    |   2 +-
 .../tst/ripper/rippers/ImgurRipperTest.java   |  82 ++++++++++
 .../ripper/rippers/InstagramRipperTest.java   |  30 ++++
 .../ripme/tst/ripper/rippers/RippersTest.java |  26 +++
 7 files changed, 336 insertions(+), 9 deletions(-)
 create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
 rename src/test/java/com/rarchives/ripme/{ => tst}/AppTest.java (94%)
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java
 create mode 100644 src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java

diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
index a1786859..79d6cfb4 100644
--- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java
@@ -178,7 +178,6 @@ public abstract class AbstractRipper
     }
 
     private void checkIfComplete() {
-        System.err.println("Pending: " + itemsPending.size() + ", Completed: " + itemsCompleted.size() + ", Errored: " + itemsErrored.size());
         if (!completed && itemsPending.size() == 0) {
             completed = true;
             logger.info("Rip completed!");
@@ -193,6 +192,10 @@ public abstract class AbstractRipper
     public URL getURL() {
         return url;
     }
+
+    public File getWorkingDir() {
+        return workingDir;
+    }
 
     public void setWorkingDir(URL url) throws IOException {
         String path = Utils.getWorkingDirectory().getCanonicalPath();
@@ -224,6 +227,7 @@ public abstract class AbstractRipper
                 return ripper;
             } catch (Exception e) {
                 // Incompatible rippers *will* throw exceptions during instantiation.
+                logger.error("Exception while instantiating: " + constructor.getDeclaringClass().getName(), e);
             }
         }
         throw new Exception("No compatible ripper found");
@@ -245,7 +249,9 @@ public abstract class AbstractRipper
             URL classURL = urls.nextElement();
             for (File f : new File(classURL.toURI()).listFiles()) {
                 String className = f.getName();
-                if (!className.endsWith(".class") || className.contains("$")) {
+                if (!className.endsWith(".class")
+                        || className.contains("$")
+                        || className.endsWith("Test.class")) {
                     // Ignore non-class or nested classes.
                     continue;
                 }
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
index b47b0ff9..b83f3a75 100644
--- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java
@@ -22,9 +22,9 @@ public class ImgurRipper extends AbstractRipper {
     private static final String DOMAIN = "imgur.com",
                                 HOST = "imgur";
     private static final Logger logger = Logger.getLogger(ImgurRipper.class);
-    
+
     private final int SLEEP_BETWEEN_ALBUMS;
-    
+
     static enum ALBUM_TYPE {
         ALBUM,
         USER,
@@ -61,6 +61,8 @@ public class ImgurRipper extends AbstractRipper {
         if (u.indexOf('#') >= 0) {
             u = u.substring(0, u.indexOf('#'));
         }
+        u = u.replaceAll("https?://m\\.imgur\\.com", "http://imgur.com");
+        u = u.replaceAll("https?://i\\.imgur\\.com", "http://imgur.com");
         return new URL(u);
     }
 
@@ -204,14 +206,18 @@ public class ImgurRipper extends AbstractRipper {
             this.url = new URL("http://imgur.com/a/" + gid);
             return gid;
         }
-        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{1,})\\.imgur\\.com/?$");
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/?$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             // Root imgur account
+            String gid = m.group(1);
+            if (gid.equals("i")) {
+                throw new MalformedURLException("Ripping i.imgur.com links not supported");
+            }
             albumType = ALBUM_TYPE.USER;
-            return m.group(1);
+            return gid;
         }
-        p = Pattern.compile("^https?://([a-zA-Z0-9\\-])\\.imgur\\.com/([a-zA-Z0-9])?$");
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]{3,})\\.imgur\\.com/([a-zA-Z0-9])?$");
         m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             // Imgur account album
@@ -223,9 +229,16 @@ public class ImgurRipper extends AbstractRipper {
         if (m.matches()) {
             // Series of imgur images
             albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
-            return m.group(m.groupCount()).replaceAll(",", "-");
+            String gid = m.group(m.groupCount());
+            if (!gid.contains(",")) {
+                throw new MalformedURLException("Imgur image doesn't contain commas");
+            }
+            return gid.replaceAll(",", "-");
         }
         throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
     }
 
+    public ALBUM_TYPE getAlbumType() {
+        return albumType;
+    }
 }
diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
new file mode 100644
index 00000000..02a44fad
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java
@@ -0,0 +1,170 @@
+package com.rarchives.ripme.ripper.rippers;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+
+import com.rarchives.ripme.ripper.AbstractRipper;
+
+/**
+ * Rips the photos and videos of an Instagram user via statigr.am.
+ * Instagram URLs are rewritten to the matching statigr.am user page, whose
+ * JSON media listing is then paged through by {@link #rip()}.
+ */
+public class InstagramRipper extends AbstractRipper {
+
+    private static final String DOMAIN = "instagram.com",
+                                HOST = "instagram";
+    private static final Logger logger = Logger.getLogger(InstagramRipper.class);
+
+    public InstagramRipper(URL url) throws IOException {
+        super(url);
+    }
+
+    @Override
+    public boolean canRip(URL url) {
+        return url.getHost().endsWith(DOMAIN);
+    }
+
+    /**
+     * Converts an Instagram user or photo URL into the statigr.am page of its user.
+     *
+     * @throws MalformedURLException if no username can be derived from the URL
+     */
+    @Override
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://instagram\\.com/p/([a-zA-Z0-9]{1,}).*$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Link to photo, not the user account
+            try {
+                url = getUserPageFromImage(url);
+            } catch (Exception e) {
+                logger.error("[!] Failed to get user page from " + url, e);
+                throw new MalformedURLException("Failed to retrieve user page from " + url);
+            }
+        }
+        p = Pattern.compile("^.*instagram.com/([a-zA-Z0-9]{3,}).*$");
+        m = p.matcher(url.toExternalForm());
+        if (!m.matches()) {
+            throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
+        }
+        return new URL("http://statigr.am/" + m.group(1));
+    }
+
+    /**
+     * Looks up the owner of a single photo page from its og:description meta tag.
+     *
+     * @return the instagram.com page of that user, so sanitizeURL's username pattern matches it
+     * @throws IOException if the page cannot be fetched or names no user
+     */
+    private URL getUserPageFromImage(URL url) throws IOException {
+        Document doc = Jsoup.connect(url.toExternalForm()).get();
+        for (Element element : doc.select("meta[property='og:description']")) {
+            String content = element.attr("content");
+            if (content.endsWith("'s photo on Instagram")) {
+                return new URL("http://instagram.com/" + content.substring(0, content.indexOf("'")));
+            }
+        }
+        throw new MalformedURLException("Expected username in URL (instagram.com/username and not " + url);
+    }
+
+    /**
+     * Reads the user ID from the "user_public" input element of the given page.
+     *
+     * @throws IOException if the page cannot be fetched or has no such element
+     */
+    private String getUserID(URL url) throws IOException {
+        logger.info(" Retrieving " + url);
+        Document doc = Jsoup.connect(url.toExternalForm()).get();
+        for (Element element : doc.select("input[id=user_public]")) {
+            return element.attr("value");
+        }
+        throw new IOException("Unable to find userID at " + url);
+    }
+
+    /**
+     * Pages through the user's statigr.am media listing and queues each video or image found.
+     * Stops on an empty page, when no next max_id can be determined, or when interrupted.
+     */
+    @Override
+    public void rip() throws IOException {
+        int index = 0;
+        String userID = getUserID(this.url);
+        String baseURL = "http://statigr.am/controller_nl.php?action=getPhotoUserPublic&user_id=" + userID;
+        String params = "";
+        while (true) {
+            String url = baseURL + params;
+            logger.info(" Retrieving " + url);
+            String jsonString = Jsoup.connect(url).ignoreContentType(true).execute().body();
+            JSONObject json = new JSONObject(jsonString);
+            JSONArray datas = json.getJSONArray("data");
+            String nextMaxID = "";
+            if (datas.length() == 0) {
+                break;
+            }
+            for (int i = 0; i < datas.length(); i++) {
+                JSONObject data = (JSONObject) datas.get(i);
+                if (data.has("id")) {
+                    nextMaxID = data.getString("id");
+                }
+                if (data.has("videos")) {
+                    index += 1;
+                    String video = data.getJSONObject("videos").getJSONObject("standard_resolution").getString("url");
+                    addURLToDownload(new URL(video), String.format("%03d_", index));
+                } else if (data.has("images")) {
+                    index += 1;
+                    String image = data.getJSONObject("images").getJSONObject("standard_resolution").getString("url");
+                    // addURLToDownload(new URL(image), String.format("%03d_", index));
+                    addURLToDownload(new URL(image));
+                }
+            }
+            JSONObject pagination = json.getJSONObject("pagination");
+            if (nextMaxID.equals("")) {
+                if (!pagination.has("next_max_id")) {
+                    break;
+                } else {
+                    nextMaxID = pagination.getString("next_max_id");
+                }
+            }
+            params = "&max_id=" + nextMaxID;
+            try {
+                Thread.sleep(3000);
+            } catch (InterruptedException e) {
+                logger.error("[!] Interrupted while waiting to load next album:", e);
+                break;
+            }
+        }
+        waitForThreads();
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    /**
+     * Extracts the username from a statigr.am user URL.
+     *
+     * @throws MalformedURLException if the URL does not match the statigr.am user pattern
+     */
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://statigr.am/([a-zA-Z0-9]{3,}).*$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            return m.group(1);
+        }
+        throw new MalformedURLException("Unable to find user in " + url);
+    }
+
+}
diff --git a/src/test/java/com/rarchives/ripme/AppTest.java b/src/test/java/com/rarchives/ripme/tst/AppTest.java
similarity index 94%
rename from src/test/java/com/rarchives/ripme/AppTest.java
rename to src/test/java/com/rarchives/ripme/tst/AppTest.java
index 1be02ea5..630e46d9 100644
--- a/src/test/java/com/rarchives/ripme/AppTest.java
+++ b/src/test/java/com/rarchives/ripme/tst/AppTest.java
@@ -1,4 +1,4 @@
-package com.rarchives.ripme;
+package com.rarchives.ripme.tst;
 
 import junit.framework.Test;
 import junit.framework.TestCase;
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java
new file mode 100644
index 00000000..c4662d5f
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/ImgurRipperTest.java
@@ -0,0 +1,82 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.ImgurRipper;
+
+/** Integration tests for ImgurRipper URL acceptance and album ripping. */
+public class ImgurRipperTest extends RippersTest {
+
+    /** Every URL listed here must be rejected when constructing an ImgurRipper. */
+    public void testImgurURLFailures() throws IOException {
+        List<URL> failURLs = new ArrayList<URL>();
+        // Imgur urls that should not work
+        failURLs.add(new URL("http://imgur.com"));
+        failURLs.add(new URL("http://imgur.com/"));
+        failURLs.add(new URL("http://i.imgur.com"));
+        failURLs.add(new URL("http://i.imgur.com/"));
+        failURLs.add(new URL("http://imgur.com/image"));
+        failURLs.add(new URL("http://imgur.com/image.jpg"));
+        failURLs.add(new URL("http://i.imgur.com/image.jpg"));
+        for (URL url : failURLs) {
+            try {
+                new ImgurRipper(url);
+                fail("Instantiated ripper for URL that should not work: " + url);
+            } catch (Exception e) {
+                // Expected
+                continue;
+            }
+        }
+    }
+
+    /** Every URL listed here must be accepted and reported as rippable. */
+    public void testImgurURLPasses() throws IOException {
+        List<URL> passURLs = new ArrayList<URL>();
+        // Imgur URLs that should work
+        passURLs.add(new URL("http://imgur.com/a/XPd4F"));
+        passURLs.add(new URL("http://imgur.com/a/XPd4F/"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/all"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
+        passURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
+        passURLs.add(new URL("http://imgur.com/YOdjht3,x5VxH9G,5juXjJ2"));
+        passURLs.add(new URL("http://markedone911.imgur.com"));
+        passURLs.add(new URL("http://markedone911.imgur.com/"));
+
+        for (URL url : passURLs) {
+            try {
+                ImgurRipper ripper = new ImgurRipper(url);
+                assertTrue(ripper.canRip(url));
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                fail("Failed to instantiate ripper for " + url);
+            }
+        }
+    }
+
+    /** Rips each album and expects more than one file in its working directory. */
+    public void testImgurAlbums() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        // URLs that should return more than 1 image
+        contentURLs.add(new URL("http://imgur.com/a/hqJIu")); // Vertical layout
+        contentURLs.add(new URL("http://imgur.com/a/dS9OQ#0")); // Horizontal layout
+        contentURLs.add(new URL("http://imgur.com/a/YpsW9#0")); // Grid layout
+        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/vertical#0"));
+        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/horizontal#0"));
+        contentURLs.add(new URL("http://imgur.com/a/WxG6f/layout/grid#0"));
+        for (URL url : contentURLs) {
+            try {
+                ImgurRipper ripper = new ImgurRipper(url);
+                ripper.rip();
+                assertTrue(ripper.getWorkingDir().listFiles().length > 1);
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+
+}
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java
new file mode 100644
index 00000000..7f7b1593
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/InstagramRipperTest.java
@@ -0,0 +1,30 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+
+import com.rarchives.ripme.ripper.rippers.InstagramRipper;
+
+
+/** Integration test for InstagramRipper. */
+public class InstagramRipperTest extends RippersTest {
+
+    /** Rips each listed account and expects more than one file in its working directory. */
+    public void testInstagramAlbums() throws IOException {
+        List<URL> contentURLs = new ArrayList<URL>();
+        contentURLs.add(new URL("http://instagram.com/feelgoodincc#"));
+        for (URL url : contentURLs) {
+            try {
+                InstagramRipper ripper = new InstagramRipper(url);
+                ripper.rip();
+                assertTrue(ripper.getWorkingDir().listFiles().length > 1);
+                deleteDir(ripper.getWorkingDir());
+            } catch (Exception e) {
+                fail("Error while ripping URL " + url + ": " + e.getMessage());
+            }
+        }
+    }
+
+}
diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java
new file mode 100644
index 00000000..f0a7f351
--- /dev/null
+++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/RippersTest.java
@@ -0,0 +1,26 @@
+package com.rarchives.ripme.tst.ripper.rippers;
+
+import java.io.File;
+
+import junit.framework.TestCase;
+
+/** Shared base class for the ripper tests. */
+public class RippersTest extends TestCase {
+
+    /**
+     * Cleanup hook for a ripper's working directory; currently returns at once because the recursive delete is commented out.
+     */
+    protected void deleteDir(File dir) {
+        return;
+        /*
+        for (File f : dir.listFiles()) {
+            if (f.isDirectory()) {
+                deleteDir(f);
+            }
+            f.delete();
+        }
+        dir.delete();
+        //*/
+    }
+
+}