From 227161bb31e2f4e50aa398a72291fbbdd11a4068 Mon Sep 17 00:00:00 2001 From: Philipp Erhardt Date: Sun, 10 Nov 2019 11:22:56 +0100 Subject: [PATCH 1/5] Add cookie support for all rippers --- .../java/com/rarchives/ripme/utils/Http.java | 45 ++++++++++++++++++- .../com/rarchives/ripme/utils/RipUtils.java | 2 +- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/utils/Http.java b/src/main/java/com/rarchives/ripme/utils/Http.java index 885a194d..1776463a 100644 --- a/src/main/java/com/rarchives/ripme/utils/Http.java +++ b/src/main/java/com/rarchives/ripme/utils/Http.java @@ -1,15 +1,19 @@ package com.rarchives.ripme.utils; import java.io.IOException; +import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; import java.util.Map; +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.json.JSONObject; import org.jsoup.Connection; import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; +import org.jsoup.helper.StringUtil; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -22,8 +26,8 @@ import com.rarchives.ripme.ripper.AbstractRipper; */ public class Http { - private static final int TIMEOUT = Utils.getConfigInteger("page.timeout", 5 * 1000); - private static final Logger logger = Logger.getLogger(Http.class); + private static final int TIMEOUT = Utils.getConfigInteger("page.timeout", 5 * 1000); + private static final Logger logger = Logger.getLogger(Http.class); private int retries; private String url; @@ -53,6 +57,43 @@ public class Http { connection.method(Method.GET); connection.timeout(TIMEOUT); connection.maxBodySize(0); + + // Extract cookies from config entry: + // Example config entry: + // cookies.reddit.com = reddit_session=; other_cookie= + connection.cookies(cookiesForURL(this.url)); + } + + private Map cookiesForURL(String u) { + Map cookiesParsed = new HashMap<>(); + + try { + 
URL parsed = new URL(this.url); + String cookieStr = ""; + + String[] parts = parsed.getHost().split("\\."); + + // if url is www.reddit.com, we should also use cookies from reddit.com; + // this rule is applied for all subdomains (for all rippers); e.g. also + // old.reddit.com, new.reddit.com + while (parts.length > 1) { + // Try to get cookies for this host from config + cookieStr = Utils.getConfigString("cookies." + String.join(".", parts), ""); + if (cookieStr != "") { + // we found something, start parsing + break; + } + parts = (String[]) ArrayUtils.remove(parts, 0); + } + + if (cookieStr != "") { + cookiesParsed = RipUtils.getCookiesFromString(cookieStr.trim()); + } + } catch (MalformedURLException e) { + logger.warn("Parsing url while getting cookies" + url, e); + } + + return cookiesParsed; } // Setters diff --git a/src/main/java/com/rarchives/ripme/utils/RipUtils.java b/src/main/java/com/rarchives/ripme/utils/RipUtils.java index 5dea166b..03a480cf 100644 --- a/src/main/java/com/rarchives/ripme/utils/RipUtils.java +++ b/src/main/java/com/rarchives/ripme/utils/RipUtils.java @@ -301,7 +301,7 @@ public class RipUtils { Map cookies = new HashMap<>(); for (String pair : line.split(";")) { String[] kv = pair.split("="); - cookies.put(kv[0], kv[1]); + cookies.put(kv[0].trim(), kv[1]); } return cookies; } From 7fe3ce059b53cc2a41712484287e2ff79bf34a0f Mon Sep 17 00:00:00 2001 From: Philipp Erhardt Date: Mon, 18 Nov 2019 16:56:55 +0100 Subject: [PATCH 2/5] Use the passed url in cookiesForURL function While this doesn't make any difference, not using the argument is kind of bad. One could also remove the argument and use `this.url` directly. 
--- src/main/java/com/rarchives/ripme/utils/Http.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/utils/Http.java b/src/main/java/com/rarchives/ripme/utils/Http.java index 1776463a..0eabaea1 100644 --- a/src/main/java/com/rarchives/ripme/utils/Http.java +++ b/src/main/java/com/rarchives/ripme/utils/Http.java @@ -68,7 +68,7 @@ public class Http { Map cookiesParsed = new HashMap<>(); try { - URL parsed = new URL(this.url); + URL parsed = new URL(u); String cookieStr = ""; String[] parts = parsed.getHost().split("\\."); From 152d6f635fd4764e56a4dee34ca7fc2a9bc290b4 Mon Sep 17 00:00:00 2001 From: Philipp Erhardt Date: Thu, 21 Nov 2019 20:02:02 +0100 Subject: [PATCH 3/5] Warn users about possibly fixable permission error --- .../java/com/rarchives/ripme/utils/Http.java | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/utils/Http.java b/src/main/java/com/rarchives/ripme/utils/Http.java index 0eabaea1..5ce9c48b 100644 --- a/src/main/java/com/rarchives/ripme/utils/Http.java +++ b/src/main/java/com/rarchives/ripme/utils/Http.java @@ -15,6 +15,7 @@ import org.jsoup.Connection.Method; import org.jsoup.Connection.Response; import org.jsoup.helper.StringUtil; import org.jsoup.Jsoup; +import org.jsoup.HttpStatusException; import org.jsoup.nodes.Document; import com.rarchives.ripme.ripper.AbstractRipper; @@ -90,7 +91,7 @@ public class Http { cookiesParsed = RipUtils.getCookiesFromString(cookieStr.trim()); } } catch (MalformedURLException e) { - logger.warn("Parsing url while getting cookies" + url, e); + logger.warn("Parsing url " + u + " while getting cookies", e); } return cookiesParsed; @@ -171,6 +172,20 @@ public class Http { response = connection.execute(); return response; } catch (IOException e) { + // Warn users about possibly fixable permission error + if (e instanceof org.jsoup.HttpStatusException) { + HttpStatusException ex = 
(HttpStatusException)e; + + // These status codes might indicate missing cookies + // 401 Unauthorized + // 403 Forbidden + + int status = ex.getStatusCode(); + if (status == 401 || status == 403) { + throw new IOException("Failed to load " + url + ": Status Code " + Integer.toString(status) + ". You might be able to circumvent this error by setting cookies for this domain" , e); + } + } + logger.warn("Error while loading " + url, e); lastException = e; } From 6c330c6932dd66b7ab25d8f6e36a207e5d827439 Mon Sep 17 00:00:00 2001 From: Philipp Erhardt Date: Sun, 1 Dec 2019 11:33:08 +0100 Subject: [PATCH 4/5] Add a logging statement if cookies were added to a request --- src/main/java/com/rarchives/ripme/utils/Http.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/rarchives/ripme/utils/Http.java b/src/main/java/com/rarchives/ripme/utils/Http.java index 5ce9c48b..d2796b53 100644 --- a/src/main/java/com/rarchives/ripme/utils/Http.java +++ b/src/main/java/com/rarchives/ripme/utils/Http.java @@ -68,6 +68,7 @@ public class Http { private Map cookiesForURL(String u) { Map cookiesParsed = new HashMap<>(); + String cookieDomain = ""; try { URL parsed = new URL(u); String cookieStr = ""; @@ -78,9 +79,11 @@ public class Http { // this rule is applied for all subdomains (for all rippers); e.g. also // old.reddit.com, new.reddit.com while (parts.length > 1) { + String domain = String.join(".", parts); // Try to get cookies for this host from config - cookieStr = Utils.getConfigString("cookies." + String.join(".", parts), ""); + cookieStr = Utils.getConfigString("cookies." 
+ domain, ""); + if (cookieStr != "") { + cookieDomain = domain; // we found something, start parsing break; } @@ -94,6 +97,10 @@ public class Http { logger.warn("Parsing url " + u + " while getting cookies", e); } + if (cookiesParsed.size() > 0) { + logger.info("Cookies for " + cookieDomain + " have been added to this request"); + } + return cookiesParsed; } From 74bfac0e3efc331588b6eebf8ce78ac0733ab066 Mon Sep 17 00:00:00 2001 From: Philipp Erhardt Date: Sun, 1 Dec 2019 11:45:45 +0100 Subject: [PATCH 5/5] String comparisons: don't use "reference equality" See https://stackoverflow.com/q/513832 for more details --- src/main/java/com/rarchives/ripme/utils/Http.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/rarchives/ripme/utils/Http.java b/src/main/java/com/rarchives/ripme/utils/Http.java index d2796b53..1b85005c 100644 --- a/src/main/java/com/rarchives/ripme/utils/Http.java +++ b/src/main/java/com/rarchives/ripme/utils/Http.java @@ -82,7 +82,7 @@ public class Http { String domain = String.join(".", parts); // Try to get cookies for this host from config cookieStr = Utils.getConfigString("cookies." + domain, ""); - if (cookieStr != "") { + if (!cookieStr.equals("")) { cookieDomain = domain; // we found something, start parsing break; @@ -90,7 +90,7 @@ public class Http { parts = (String[]) ArrayUtils.remove(parts, 0); } - if (cookieStr != "") { + if (!cookieStr.equals("")) { cookiesParsed = RipUtils.getCookiesFromString(cookieStr.trim()); } } catch (MalformedURLException e) {