From 46e29484039bb1ed71d7acbc42f6f1a940ae7d24 Mon Sep 17 00:00:00 2001 From: 4pr0n Date: Wed, 26 Feb 2014 19:54:44 -0800 Subject: [PATCH] Added threaded download manager, imgur ripper works --- config/log4j.properties | 17 +++ config/rip.properties | 1 + src/main/java/com/rarchives/ripme/App.java | 23 +-- .../ripme/ripper/AbstractRipper.java | 64 +++++++- .../ripme/ripper/DownloadFileThread.java | 55 +++++++ .../ripme/ripper/DownloadThreadPool.java | 34 +++++ .../ripme/ripper/RipperInterface.java | 7 +- .../ripme/ripper/rippers/ImagefapRipper.java | 76 +++++----- .../ripme/ripper/rippers/ImgurRipper.java | 142 ++++++++++++++++++ .../java/com/rarchives/ripme/utils/Utils.java | 59 +++++--- 10 files changed, 397 insertions(+), 81 deletions(-) create mode 100644 config/log4j.properties create mode 100644 config/rip.properties create mode 100644 src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java create mode 100644 src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java create mode 100644 src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java diff --git a/config/log4j.properties b/config/log4j.properties new file mode 100644 index 00000000..8b6e7433 --- /dev/null +++ b/config/log4j.properties @@ -0,0 +1,17 @@ +# define the file appender +log4j.appender.FILE = org.apache.log4j.RollingFileAppender +log4j.appender.FILE.File = ripme.log +log4j.appender.FILE.ImmediateFlush = true +log4j.appender.FILE.Threshold = debug +log4j.appender.FILE.maxFileSize = 20MB +log4j.appender.FILE.layout = org.apache.log4j.PatternLayout +log4j.appender.FILE.layout.ConversionPattern = %d %-4r [%t] %-5p %c{2} %x - %m%n + +# define the console appender +log4j.appender.stdout = org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target = System.out +log4j.appender.stdout.layout = org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern = %d %-4r [%t] %-5p %c{2} %x - %m%n + +# now map our console appender as a root logger, means all log 
messages will go to this appender +log4j.rootLogger = DEBUG, FILE, stdout \ No newline at end of file diff --git a/config/rip.properties b/config/rip.properties new file mode 100644 index 00000000..fe0aa878 --- /dev/null +++ b/config/rip.properties @@ -0,0 +1 @@ +threads.size = 5 \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/App.java b/src/main/java/com/rarchives/ripme/App.java index b3db5720..a8a3efd7 100644 --- a/src/main/java/com/rarchives/ripme/App.java +++ b/src/main/java/com/rarchives/ripme/App.java @@ -1,26 +1,27 @@ package com.rarchives.ripme; -import java.io.IOException; import java.net.URL; import org.apache.log4j.Logger; -import org.apache.log4j.PropertyConfigurator; -import com.rarchives.ripme.ripper.rippers.ImagefapRipper; +import com.rarchives.ripme.ripper.rippers.ImgurRipper; /** * */ public class App { - public static void main( String[] args ) throws IOException { + public static void main( String[] args ) throws Exception { Logger logger = Logger.getLogger(App.class); - PropertyConfigurator.configure("config/log4j.properties"); - logger.debug("Testing"); - URL url = new URL("http://www.imagefap.com/pictures/4117023/Mirror-flat-stomach-small-firm-tits"); - System.out.println("URL: " + url.toExternalForm()); - ImagefapRipper ir = new ImagefapRipper(url); - System.out.println("Ripping"); - ir.rip(); + logger.debug("Initialized"); + //URL url = new URL("http://www.imagefap.com/pictures/4117023/Mirror-flat-stomach-small-firm-tits"); + URL url = new URL("http://imgur.com/a/Ox6jN"); + try { + ImgurRipper ir = new ImgurRipper(url); + ir.rip(); + } catch (Exception e) { + logger.error("Caught exception:", e); + throw e; + } } public static void initialize() { diff --git a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java index f45e93ce..520f689c 100644 --- a/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java +++ 
b/src/main/java/com/rarchives/ripme/ripper/AbstractRipper.java @@ -5,18 +5,26 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import org.apache.log4j.Logger; + import com.rarchives.ripme.utils.Utils; public abstract class AbstractRipper implements RipperInterface { + private static final Logger logger = Logger.getLogger(AbstractRipper.class); + protected URL url; - protected File workingDir = null; + protected File workingDir; + protected DownloadThreadPool threadPool; public abstract void rip() throws IOException; - public abstract void setWorkingDir() throws IOException; + public abstract String getHost(); + public abstract String getGID(URL url) throws MalformedURLException; /** - * Ensures inheriting ripper can rip this URL. + * Ensures inheriting ripper can rip this URL, raises exception if not. + * Otherwise initializes working directory and thread pool. + * * @param url * URL to rip. * @throws IOException @@ -26,13 +34,57 @@ public abstract class AbstractRipper implements RipperInterface { if (!canRip(url)) { throw new MalformedURLException("Unable to rip url: " + url); } - this.url = url; - setWorkingDir(); - workingDir = Utils.getWorkingDirectory(); + this.url = sanitizeURL(url); + setWorkingDir(url); + this.threadPool = new DownloadThreadPool(); + } + + public void addURLToDownload(URL url) { + addURLToDownload(url, ""); + } + + public void addURLToDownload(URL url, String prefix) { + String saveAs = url.toExternalForm(); + saveAs = saveAs.substring(saveAs.lastIndexOf('/')+1); + if (saveAs.indexOf('?') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('?')); } + if (saveAs.indexOf('#') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('#')); } + if (saveAs.indexOf('&') >= 0) { saveAs = saveAs.substring(0, saveAs.indexOf('&')); } + File saveFileAs; + try { + saveFileAs = new File(workingDir.getCanonicalPath() + File.separator + prefix + saveAs); + } catch (IOException e) { + logger.error("Error creating save 
file path for URL '" + url + "':", e); + return; + } + logger.info("Downloading " + url + " to " + saveFileAs); + addURLToDownload(url, saveFileAs); + } + /** + * Add image to be downloaded and saved. + * @param url + * URL of the file + * @param saveAs + * Path of the local file to save the content to. + */ + public void addURLToDownload(URL url, File saveAs) { + threadPool.addThread(new DownloadFileThread(url, saveAs)); } public URL getURL() { return url; } + public void setWorkingDir(URL url) throws IOException { + String path = Utils.getWorkingDirectory().getCanonicalPath(); + if (!path.endsWith(File.separator)) { + path += File.separator; + } + path += getHost() + "_" + getGID(this.url) + File.separator; + this.workingDir = new File(path); + if (!this.workingDir.exists()) { + logger.info("Creating working directory(s): " + this.workingDir); + this.workingDir.mkdirs(); + } + logger.debug("Set working directory to: " + this.workingDir); + } } \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java new file mode 100644 index 00000000..1cb3c5cd --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/DownloadFileThread.java @@ -0,0 +1,55 @@ +package com.rarchives.ripme.ripper; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.URL; + +import org.apache.log4j.Logger; +import org.jsoup.Connection.Response; +import org.jsoup.Jsoup; + +import com.rarchives.ripme.utils.Utils; + +public class DownloadFileThread extends Thread { + + private static final Logger logger = Logger.getLogger(DownloadFileThread.class); + + private URL url; + private File saveAs; + + public DownloadFileThread(URL url, File saveAs) { + super(); + this.url = url; + this.saveAs = saveAs; + } + + public void run() { + // Check if file already exists + if (saveAs.exists()) { + if (Utils.getConfigBoolean("file.overwrite", false)) 
{
+                logger.info("File already exists and 'file.overwrite' is true, deleting: " + saveAs);
+                saveAs.delete();
+            } else {
+                logger.info("Not downloading " + url + " because file already exists: " + saveAs);
+                return;
+            }
+        }
+
+        logger.debug("Downloading file from: " + url);
+        try {
+            Response response;
+            response = Jsoup.connect(url.toExternalForm())
+                            .ignoreContentType(true)
+                            .execute();
+            FileOutputStream out = (new FileOutputStream(saveAs));
+            out.write(response.bodyAsBytes());
+            out.close();
+        } catch (IOException e) {
+            logger.error("Exception while downloading file: " + url, e);
+            return;
+        }
+        logger.debug("Download completed: " + url);
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java b/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java
new file mode 100644
index 00000000..e0a93404
--- /dev/null
+++ b/src/main/java/com/rarchives/ripme/ripper/DownloadThreadPool.java
@@ -0,0 +1,56 @@
+package com.rarchives.ripme.ripper;
+
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.log4j.Logger;
+
+import com.rarchives.ripme.utils.Utils;
+
+/**
+ * Fixed-size pool of worker threads that runs queued download tasks.
+ */
+public class DownloadThreadPool {
+
+    private static final Logger logger = Logger.getLogger(DownloadThreadPool.class);
+    private ExecutorService threadPool = null;
+
+    /**
+     * Creates a fixed-size pool whose size is read from the 'threads.size'
+     * configuration key, with 10 passed as the default.
+     */
+    public DownloadThreadPool() {
+        int threads = Utils.getConfigInteger("threads.size", 10);
+        logger.debug("Initializing thread pool with " + threads + " threads");
+        threadPool = Executors.newFixedThreadPool(threads);
+    }
+
+    /**
+     * Queues a task for execution by the pool.
+     * @param t
+     *      Task to run. It is handed to the executor as a Runnable, so its
+     *      run() is invoked by a pool worker; this class never calls start().
+     */
+    public void addThread(Thread t) {
+        threadPool.execute(t);
+    }
+
+    /**
+     * Stops accepting new tasks, then blocks for up to 60 seconds while the
+     * tasks that were already queued finish.
+     */
+    public void waitForThreads() {
+        threadPool.shutdown();
+        try {
+            // awaitTermination() returns false when the timeout elapses first
+            if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
+                logger.warn("Timed out after 60 seconds waiting for threads to finish");
+            }
+        } catch (InterruptedException e) {
+            logger.error("Interrupted while waiting for threads to finish: ", e);
+            // Restore the interrupt flag so callers can still observe it
+            Thread.currentThread().interrupt();
+        }
+    }
+}
diff --git a/src/main/java/com/rarchives/ripme/ripper/RipperInterface.java 
b/src/main/java/com/rarchives/ripme/ripper/RipperInterface.java index 9c0a7e76..d6c811e8 100644 --- a/src/main/java/com/rarchives/ripme/ripper/RipperInterface.java +++ b/src/main/java/com/rarchives/ripme/ripper/RipperInterface.java @@ -1,11 +1,14 @@ package com.rarchives.ripme.ripper; import java.io.IOException; +import java.net.MalformedURLException; import java.net.URL; public interface RipperInterface { public void rip() throws IOException; - public void processURL(String url); public boolean canRip(URL url); - public void setWorkingDir() throws IOException; + public URL sanitizeURL(URL url) throws MalformedURLException; + public void setWorkingDir(URL url) throws IOException; + public String getHost(); + public String getGID(URL url) throws MalformedURLException; } diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java index 051e0bfe..d3d360d7 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImagefapRipper.java @@ -1,92 +1,88 @@ package com.rarchives.ripme.ripper.rippers; -import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.log4j.Logger; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import com.rarchives.ripme.ripper.AbstractRipper; -import com.rarchives.ripme.utils.Utils; public class ImagefapRipper extends AbstractRipper { - private static final String HOST = "imagefap.com"; - - private String gid; + private static final String DOMAIN = "imagefap.com", + HOST = "imagefap"; + private static final Logger logger = Logger.getLogger(ImagefapRipper.class); public ImagefapRipper(URL url) throws IOException { super(url); - this.gid = getGID(url); + } + + @Override + public String getHost() { + return HOST; 
} /** * Reformat given URL into the desired format (all images on single page) */ - public void sanitizeURL() throws MalformedURLException { - this.url = new URL("http://www.imagefap.com/gallery.php?gid=" - + this.gid + "&view=2"); + public URL sanitizeURL(URL url) throws MalformedURLException { + String gid = getGID(url); + logger.debug("GID=" + gid); + URL newURL = new URL("http://www.imagefap.com/gallery.php?gid=" + + gid + "&view=2"); + logger.debug("Sanitized URL from " + url + " to " + newURL); + return newURL; } - private static String getGID(URL url) throws MalformedURLException { - String gid = null; - Pattern p = Pattern.compile("^.*imagefap.com/gallery.php?gid=([0-9]{1,}).*$"); + public String getGID(URL url) throws MalformedURLException { + Pattern p = Pattern.compile("^.*imagefap.com/gallery.php\\?gid=([0-9]{1,}).*$"); Matcher m = p.matcher(url.toExternalForm()); if (m.matches()) { - gid = m.group(1); - } else { - p = Pattern.compile("^.*imagefap.com/pictures/([0-9]{1,}).*$"); - m = p.matcher(url.toExternalForm()); - if (m.matches()) { - gid = m.group(1); - } + return m.group(1); } - if (gid == null) { - throw new MalformedURLException( - "Expected imagefap.com gallery formats:" - + "imagefap.com/gallery.php?gid=####... or" - + "imagefap.com/pictures/####..."); + p = Pattern.compile("^.*imagefap.com/pictures/([0-9]{1,}).*$"); + m = p.matcher(url.toExternalForm()); + if (m.matches()) { + return m.group(1); } - return gid; - } - - @Override - public void setWorkingDir() throws IOException { - String path = Utils.getWorkingDirectory().getCanonicalPath(); - path += this.gid + File.separator; - this.workingDir = new File(path); + throw new MalformedURLException( + "Expected imagefap.com gallery formats: " + + "imagefap.com/gallery.php?gid=####... or " + + "imagefap.com/pictures/####..." 
+ + " Got: " + url); } @Override public void rip() throws IOException { - System.err.println("Connecting to " + this.url.toExternalForm()); + logger.debug("Retrieving " + this.url.toExternalForm()); Document doc = Jsoup.connect(this.url.toExternalForm()).get(); for (Element thumb : doc.select("#gallery img")) { if (!thumb.hasAttr("src") || !thumb.hasAttr("width")) { continue; } String image = thumb.attr("src"); - image = image.replaceAll("http://x.*.fap.to/images/thumb/", + image = image.replaceAll( + "http://x.*.fap.to/images/thumb/", "http://fap.to/images/full/"); - processURL(image); - System.err.println(image); + processURL(new URL(image)); } } - public void processURL(String url) { - + public void processURL(URL url) { + logger.info("Found " + url); } public boolean canRip(URL url) { - if (!url.getHost().endsWith(HOST)) { + if (!url.getHost().endsWith(DOMAIN)) { return false; } return true; } -} +} \ No newline at end of file diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java new file mode 100644 index 00000000..62322bd8 --- /dev/null +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/ImgurRipper.java @@ -0,0 +1,142 @@ +package com.rarchives.ripme.ripper.rippers; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; + +import com.rarchives.ripme.ripper.AbstractRipper; + +public class ImgurRipper extends AbstractRipper { + + private static final String DOMAIN = "imgur.com", + HOST = "imgur"; + private static final Logger logger = Logger.getLogger(ImgurRipper.class); + + static enum ALBUM_TYPE { + ALBUM, + USER, + USER_ALBUM, + SERIES_OF_IMAGES + }; + private ALBUM_TYPE albumType; + + public ImgurRipper(URL url) throws IOException { + 
super(url);
+    }
+
+    public void processURL(URL url, String prefix) {
+        logger.info("Found URL: " + url);
+        addURLToDownload(url, prefix);
+    }
+
+    public boolean canRip(URL url) {
+        if (!url.getHost().endsWith(DOMAIN)) {
+            return false;
+        }
+        try {
+            getGID(url);
+        } catch (Exception e) {
+            // Can't get GID, can't rip it.
+            return false;
+        }
+        return true;
+    }
+
+    public URL sanitizeURL(URL url) throws MalformedURLException {
+        String u = url.toExternalForm();
+        if (u.indexOf('#') >= 0) {
+            u = u.substring(0, u.indexOf('#'));
+        }
+        return new URL(u);
+    }
+
+    @Override
+    public void rip() throws IOException {
+        switch (albumType) {
+        case ALBUM:
+            this.url = new URL(this.url.toExternalForm() + "/noscript");
+            // Fall-through
+        case USER_ALBUM:
+            ripAlbum(this.url);
+            break;
+
+        case SERIES_OF_IMAGES:
+            // TODO Get all images
+            break;
+
+        case USER:
+            // TODO Get all albums by user
+            break;
+        }
+        threadPool.waitForThreads();
+    }
+
+    private void ripAlbum(URL url) throws IOException {
+        int index = 0;
+        logger.debug("Retrieving " + url.toExternalForm());
+        Document doc = Jsoup.connect(url.toExternalForm()).get();
+        for (Element thumb : doc.select("div.image")) {
+            String image;
+            if (thumb.select("a.zoom").size() > 0) {
+                // Clickably full-size
+                image = "http:" + thumb.select("a").attr("href");
+            } else if (thumb.select("img").size() > 0) {
+                image = "http:" + thumb.select("img").attr("src");
+            } else {
+                // Unable to find image in this div
+                logger.error("Unable to find image in div: " + thumb.toString());
+                continue;
+            }
+            index += 1;
+            processURL(new URL(image), String.format("%03d_", index));
+        }
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        Pattern p = Pattern.compile("^https?://(m\\.)?imgur\\.com/a/([a-zA-Z0-9]{5,8}).*$");
+        Matcher m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Imgur album: GID is the album id; also records the type and canonical album URL
+            albumType = ALBUM_TYPE.ALBUM;
+            String gid = m.group(m.groupCount());
+            this.url = new URL("http://imgur.com/a/" + gid);
+            return gid;
+        }
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]+)\\.imgur\\.com/?$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Root imgur account: GID is the account subdomain (any length, not one character)
+            albumType = ALBUM_TYPE.USER;
+            return m.group(1);
+        }
+        p = Pattern.compile("^https?://(i\\.)?imgur\\.com/([a-zA-Z0-9,]{5,}).*$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Series of imgur images: GID is the id list; tested before account albums so i.imgur.com stays here
+            albumType = ALBUM_TYPE.SERIES_OF_IMAGES;
+            return m.group(2);
+        }
+        p = Pattern.compile("^https?://([a-zA-Z0-9\\-]+)\\.imgur\\.com/([a-zA-Z0-9]+)$");
+        m = p.matcher(url.toExternalForm());
+        if (m.matches()) {
+            // Imgur account album: GID is "<account>-<album>" rather than the whole URL
+            albumType = ALBUM_TYPE.USER_ALBUM;
+            return m.group(1) + "-" + m.group(2);
+        }
+        throw new MalformedURLException("Unexpected URL format: " + url.toExternalForm());
+    }
+
+}
diff --git a/src/main/java/com/rarchives/ripme/utils/Utils.java b/src/main/java/com/rarchives/ripme/utils/Utils.java
index 79692d9e..9512a995 100644
--- a/src/main/java/com/rarchives/ripme/utils/Utils.java
+++ b/src/main/java/com/rarchives/ripme/utils/Utils.java
@@ -1,18 +1,17 @@
 package com.rarchives.ripme.utils;
 
 import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
 
 import org.apache.commons.configuration.Configuration;
 import org.apache.commons.configuration.ConfigurationException;
 import org.apache.commons.configuration.PropertiesConfiguration;
-import org.jsoup.Connection.Response;
-import org.jsoup.Jsoup;
+import org.apache.log4j.Logger;
 
 public class Utils {
 
     public static final String RIP_DIRECTORY = "rips";
+    private static final Logger logger = Logger.getLogger(Utils.class);
 
     public static File getWorkingDirectory() throws IOException {
         String path = new File(".").getCanonicalPath() + File.separator;
@@ -23,25 +22,41 @@ public class Utils {
         }
         return workingDir;
     }
-
-    public static String getConfigString(String key) {
-        Configuration config = null;
-        try {
-            config = new 
PropertiesConfiguration("rip.properties");
-        } catch (ConfigurationException e) {
-            System.err.println(e);
-            return null;
-        }
-        return config.getString(key);
-    }
-
-    public static void downloadFile(String url, File saveAs) throws IOException {
-        Response response = Jsoup.connect(url)
-                .ignoreContentType(true)
-                .execute();
-        FileOutputStream out = (new FileOutputStream(saveAs));
-        out.write(response.bodyAsBytes());
-        out.close();
+    public static String getConfigString(String key, String defaultValue) { // returns defaultValue if the key is absent or the file cannot be loaded
+        String value = defaultValue;
+        try {
+            Configuration config = new PropertiesConfiguration("config/rip.properties");
+            value = config.getString(key, defaultValue); // pass the default so a missing key does not yield null
+        } catch (ConfigurationException e) {
+            logger.error("Failed to get configuration value for " + key
+                    + ", using default '" + value + "'");
+        }
+        return value;
     }
+
+    public static int getConfigInteger(String key, int defaultValue) {
+        int value = defaultValue;
+        try {
+            Configuration config = new PropertiesConfiguration(new File("./config/rip.properties"));
+            value = config.getInt(key, defaultValue);
+        } catch (Exception e) {
+            logger.error("Failed to get configuration value for " + key
+                    + ", using default '" + value + "'");
+        }
+        return value;
+    }
+
+    public static boolean getConfigBoolean(String key, boolean defaultValue) {
+        boolean value = defaultValue;
+        try {
+            Configuration config = new PropertiesConfiguration(new File("./config/rip.properties"));
+            value = config.getBoolean(key, defaultValue);
+        } catch (Exception e) {
+            logger.error("Failed to get configuration value for " + key
+                    + ", using default '" + value + "'");
+        }
+        return value;
+    }
+
+
 }
\ No newline at end of file