mirror of https://github.com/RipMeApp/ripme.git
Merge pull request #1433 from Isaaku/issues/twitter_ripper
TwitterRipper changes
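The substance of this PR: TwitterRipper moves off the imperative AlbumRipper base, where it drove its own rip() loop, onto AbstractJSONRipper, which owns the download loop and asks subclasses for JSON pages and the URLs inside them. A rough sketch of that hook contract, inferred only from the methods this diff overrides (hypothetical class name and generics; not the actual RipMe base class):

```java
import java.io.IOException;
import java.net.URL;
import java.util.List;

// Hypothetical reduction of AbstractJSONRipper's paging contract, inferred
// from the hooks TwitterRipper overrides in this diff. The real base class
// also handles threading, history, and progress reporting.
abstract class JsonPagingRipper<P> {
    protected abstract P getFirstPage() throws IOException;
    // Returning null signals "no more pages".
    protected abstract P getNextPage(P currentPage) throws IOException;
    protected abstract List<String> getURLsFromJSON(P page);
    protected abstract void downloadURL(URL url, int index);

    public void rip() throws IOException {
        int index = 1;
        for (P page = getFirstPage(); page != null; page = getNextPage(page)) {
            for (String u : getURLsFromJSON(page)) {
                downloadURL(new URL(u), index++);
            }
        }
    }
}
```

Because the base class now owns the loop, the request counter and max_id cursor that rip() used to keep in local variables must survive between hook calls; that is why the diff below promotes lastMaxID and currentRequest to fields.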
TwitterRipper.java
@@ -14,18 +14,17 @@ import org.json.JSONObject;
 import org.json.JSONTokener;
 import org.jsoup.nodes.Document;
 
-import com.rarchives.ripme.ripper.AlbumRipper;
+import com.rarchives.ripme.ripper.AbstractJSONRipper;
 import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;
 
-public class TwitterRipper extends AlbumRipper {
-
-    int downloadUrls = 1;
+public class TwitterRipper extends AbstractJSONRipper {
 
     private static final String DOMAIN = "twitter.com", HOST = "twitter";
 
     private static final int MAX_REQUESTS = Utils.getConfigInteger("twitter.max_requests", 10);
     private static final boolean RIP_RETWEETS = Utils.getConfigBoolean("twitter.rip_retweets", true);
+    private static final int MAX_ITEMS_REQUEST = Utils.getConfigInteger("twitter.max_items_request", 200);
     private static final int WAIT_TIME = 2000;
 
     // Base 64 of consumer key : consumer secret
@@ -38,6 +37,10 @@ public class TwitterRipper extends AlbumRipper {
 
     private ALBUM_TYPE albumType;
     private String searchText, accountName;
+    private Long lastMaxID = 0L;
+    private int currentRequest = 0;
+
+    private boolean hasTweets = true;
 
     public TwitterRipper(URL url) throws IOException {
         super(url);
@@ -47,19 +50,23 @@ public class TwitterRipper extends AlbumRipper {
         }
     }
 
     @Override
     public boolean canRip(URL url) {
         return url.getHost().endsWith(DOMAIN);
     }
 
     @Override
     public URL sanitizeURL(URL url) throws MalformedURLException {
         // https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
-        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%\\-_]+).*$");
+        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?(.*)q=(?<search>[a-zA-Z0-9%\\-_]+).*$");
         Matcher m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             albumType = ALBUM_TYPE.SEARCH;
-            searchText = m.group(2);
+            searchText = m.group("search");
+
+            if (searchText.startsWith("from%3A")) {
+                // from filter not supported
+                searchText = searchText.substring(7);
+            }
+            if (searchText.contains("x")) {
+                // x character not supported
+                searchText = searchText.replace("x", "");
+            }
             return url;
         }
         p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9\\-_]+).*$");
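Two things change in the search pattern: a (.*) lets other query parameters precede q=, and the term is captured by the named group search rather than positional group 2, so the index cannot silently shift if the pattern grows again. A standalone check of the new pattern against the sample URL from the code comment:

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SearchPatternDemo {
    public static void main(String[] args) {
        Pattern p = Pattern.compile(
                "^https?://(m\\.)?twitter\\.com/search\\?(.*)q=(?<search>[a-zA-Z0-9%\\-_]+).*$");
        Matcher m = p.matcher(
                "https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd");
        if (m.matches()) {
            // Prints: from%3Apurrbunny%20filter%3Aimages
            System.out.println(m.group("search"));
        }
    }
}
```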
@@ -114,10 +121,10 @@ public class TwitterRipper extends AlbumRipper {
         case ACCOUNT:
             req.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
                     .append("?screen_name=" + this.accountName).append("&include_entities=true")
-                    .append("&exclude_replies=true").append("&trim_user=true").append("&count=" + 200)
+                    .append("&exclude_replies=true").append("&trim_user=true").append("&count=" + MAX_ITEMS_REQUEST)
                     .append("&tweet_mode=extended");
             break;
-        case SEARCH:
+        case SEARCH:// Only get tweets from last week
             req.append("https://api.twitter.com/1.1/search/tweets.json").append("?q=" + this.searchText)
                     .append("&include_entities=true").append("&result_type=recent").append("&count=100")
                     .append("&tweet_mode=extended");
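The hard-coded count=200 in the ACCOUNT branch becomes MAX_ITEMS_REQUEST, backed by the new twitter.max_items_request config key (default 200, which is also the most the user_timeline endpoint returns per call; search is capped at 100, matching its count=100). With the defaults and a hypothetical screen name, the branch assembles a request like this minimal reproduction:

```java
public class ApiUrlDemo {
    public static void main(String[] args) {
        int maxItemsRequest = 200;        // twitter.max_items_request default
        String accountName = "purrbunny"; // hypothetical screen name
        StringBuilder req = new StringBuilder();
        req.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
                .append("?screen_name=" + accountName).append("&include_entities=true")
                .append("&exclude_replies=true").append("&trim_user=true")
                .append("&count=" + maxItemsRequest).append("&tweet_mode=extended");
        // .../user_timeline.json?screen_name=purrbunny&...&count=200&tweet_mode=extended
        System.out.println(req);
    }
}
```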
@@ -129,8 +136,9 @@ public class TwitterRipper extends AlbumRipper {
         return req.toString();
     }
 
-    private List<JSONObject> getTweets(String url) throws IOException {
-        List<JSONObject> tweets = new ArrayList<>();
+    private JSONObject getTweets() throws IOException {
+        currentRequest++;
+        String url = getApiURL(lastMaxID - 1);
         LOGGER.info(" Retrieving " + url);
         Document doc = Http.url(url).ignoreContentType().header("Authorization", "Bearer " + accessToken)
                 .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
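getTweets() no longer takes the URL as a parameter; it derives it from lastMaxID, the ID of the oldest tweet seen so far, and counts the request against MAX_REQUESTS. Twitter's max_id parameter is inclusive (it returns tweets with id <= max_id), so asking for lastMaxID - 1 keeps the boundary tweet from being fetched twice. A toy illustration of that cursor scheme, with a fake in-memory timeline standing in for the API:

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class MaxIdPagingDemo {
    // Pretend timeline, newest first, the way the API returns it.
    static final long[] IDS = {900, 850, 820, 700, 650, 410, 300};

    // Like Twitter's max_id: up to `count` tweets with id <= maxId (inclusive).
    static List<Long> fetch(long maxId, int count) {
        return Arrays.stream(IDS).boxed()
                .filter(id -> id <= maxId)
                .limit(count)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        long lastMaxID = Long.MAX_VALUE;
        List<Long> page;
        while (!(page = fetch(lastMaxID - 1, 3)).isEmpty()) {
            System.out.println(page); // [900, 850, 820] then [700, 650, 410] then [300]
            lastMaxID = page.get(page.size() - 1);
        }
    }
}
```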
@@ -148,22 +156,112 @@ public class TwitterRipper extends AlbumRipper {
         } else {
             statuses = (JSONArray) jsonObj;
         }
-        for (int i = 0; i < statuses.length(); i++) {
-            tweets.add((JSONObject) statuses.get(i));
-        }
-        return tweets;
+
+        JSONObject r = new JSONObject();
+        r.put("tweets", statuses);
+        return r;
     }
 
+    public String getPrefix(int index) {
+        return Utils.getConfigBoolean("download.save_order", true) ? String.format("%03d_", index) : "";
+    }
+
+    @Override
+    protected JSONObject getFirstPage() throws IOException {
+        getAccessToken();
+
+        switch (albumType) {
+        case ACCOUNT:
+            checkRateLimits("statuses", "/statuses/user_timeline");
+            break;
+        case SEARCH:
+            checkRateLimits("search", "/search/tweets");
+            break;
+        }
+
+        return getTweets();
+    }
+
+    @Override
+    protected JSONObject getNextPage(JSONObject doc) throws IOException {
+        try {
+            Thread.sleep(WAIT_TIME);
+        } catch (InterruptedException e) {
+            LOGGER.error("[!] Interrupted while waiting to load more results", e);
+        }
+        return currentRequest <= MAX_REQUESTS ? getTweets() : null;
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    protected String getDomain() {
+        return DOMAIN;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        switch (albumType) {
+        case ACCOUNT:
+            return "account_" + accountName;
+        case SEARCH:
+            StringBuilder gid = new StringBuilder();
+            for (int i = 0; i < searchText.length(); i++) {
+                char c = searchText.charAt(i);
+                // Ignore URL-encoded chars
+                if (c == '%') {
+                    gid.append('_');
+                    i += 2;
+                    // Ignore non-alphanumeric chars
+                } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
+                    gid.append(c);
+                }
+            }
+            return "search_" + gid.toString();
+        }
+        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
+    }
+
+    @Override
+    public boolean hasASAPRipping() {
+        return hasTweets;
+    }
+
+    @Override
+    protected List<String> getURLsFromJSON(JSONObject json) {
+        List<String> urls = new ArrayList<>();
+        List<JSONObject> tweets = new ArrayList<>();
+        JSONArray statuses = json.getJSONArray("tweets");
+
+        for (int i = 0; i < statuses.length(); i++) {
+            tweets.add((JSONObject) statuses.get(i));
+        }
+
+        if (tweets.isEmpty()) {
+            LOGGER.info(" No more tweets found.");
+            return urls;
+        }
+
-    private int parseTweet(JSONObject tweet) throws MalformedURLException {
-        int parsedCount = 0;
+        LOGGER.debug("Twitter response #" + (currentRequest) + " Tweets:\n" + tweets);
+        if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getString("id_str"))) {
+            LOGGER.info(" No more tweet found.");
+            return urls;
+        }
+
+        for (JSONObject tweet : tweets) {
+            lastMaxID = tweet.getLong("id");
+
             if (!tweet.has("extended_entities")) {
-                LOGGER.error("XXX Tweet doesn't have entitites");
-                return 0;
+                LOGGER.error("XXX Tweet doesn't have entities");
+                continue;
             }
 
             if (!RIP_RETWEETS && tweet.has("retweeted_status")) {
                 LOGGER.info("Skipping a retweet as twitter.rip_retweet is set to false.");
-                return 0;
+                continue;
             }
 
             JSONObject entities = tweet.getJSONObject("extended_entities");
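getGID() arrives here unchanged apart from its new home: each %XX escape collapses to one underscore (append '_', then skip the two hex digits) and any other non-alphanumeric character is dropped. Tracing the sample query through it, after sanitizeURL() has already stripped the from%3A prefix, in a lifted-out copy of the loop:

```java
public class GidDemo {
    // Same sanitization loop as TwitterRipper.getGID(), lifted out for illustration.
    static String toGid(String searchText) {
        StringBuilder gid = new StringBuilder();
        for (int i = 0; i < searchText.length(); i++) {
            char c = searchText.charAt(i);
            if (c == '%') {
                gid.append('_');
                i += 2; // skip the two hex digits of the escape
            } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
                gid.append(c);
            }
        }
        return "search_" + gid;
    }

    public static void main(String[] args) {
        // Prints: search_purrbunny_filter_images
        System.out.println(toGid("purrbunny%20filter%3Aimages"));
    }
}
```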
@@ -196,109 +294,28 @@ public class TwitterRipper extends AlbumRipper {
                     }
                 }
                 if (urlToDownload != null) {
-                    addURLToDownload(new URL(urlToDownload), getPrefix(downloadUrls));
-                    downloadUrls++;
+                    urls.add(urlToDownload);
                 } else {
                     LOGGER.error("URLToDownload was null");
                 }
-                parsedCount++;
             } else if (media.getString("type").equals("photo")) {
                 if (url.contains(".twimg.com/")) {
                     url += ":orig";
-                    addURLToDownload(new URL(url), getPrefix(downloadUrls));
-                    downloadUrls++;
-                    parsedCount++;
+                    urls.add(url);
                 } else {
                     LOGGER.debug("Unexpected media_url: " + url);
                 }
             }
         }
+        }
 
-        return parsedCount;
-    }
-
-    public String getPrefix(int index) {
-        return Utils.getConfigBoolean("download.save_order", true) ? String.format("%03d_", index) : "";
+        return urls;
     }
 
-    @Override
-    public void rip() throws IOException {
-        getAccessToken();
-
-        switch (albumType) {
-        case ACCOUNT:
-            checkRateLimits("statuses", "/statuses/user_timeline");
-            break;
-        case SEARCH:
-            checkRateLimits("search", "/search/tweets");
-            break;
-        }
-
-        Long lastMaxID = 0L;
-        int parsedCount = 0;
-        for (int i = 0; i < MAX_REQUESTS; i++) {
-            List<JSONObject> tweets = getTweets(getApiURL(lastMaxID - 1));
-            if (tweets.isEmpty()) {
-                LOGGER.info(" No more tweets found.");
-                break;
-            }
-            LOGGER.debug("Twitter response #" + (i + 1) + " Tweets:\n" + tweets);
-            if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getString("id_str"))) {
-                LOGGER.info(" No more tweet found.");
-                break;
-            }
-
-            for (JSONObject tweet : tweets) {
-                lastMaxID = tweet.getLong("id");
-                parsedCount += parseTweet(tweet);
-
-                if (isStopped() || (isThisATest() && parsedCount > 0)) {
-                    break;
-                }
-            }
-
-            if (isStopped() || (isThisATest() && parsedCount > 0)) {
-                break;
-            }
-
-            try {
-                Thread.sleep(WAIT_TIME);
-            } catch (InterruptedException e) {
-                LOGGER.error("[!] Interrupted while waiting to load more results", e);
-                break;
-            }
-        }
-
-        waitForThreads();
-    }
-
-    @Override
-    public String getHost() {
-        return HOST;
-    }
-
-    @Override
-    public String getGID(URL url) throws MalformedURLException {
-        switch (albumType) {
-        case ACCOUNT:
-            return "account_" + accountName;
-        case SEARCH:
-            StringBuilder gid = new StringBuilder();
-            for (int i = 0; i < searchText.length(); i++) {
-                char c = searchText.charAt(i);
-                // Ignore URL-encoded chars
-                if (c == '%') {
-                    gid.append('_');
-                    i += 2;
-                    // Ignore non-alphanumeric chars
-                } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
-                    gid.append(c);
-                }
-            }
-            return "search_" + gid.toString();
-        }
-        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
-    }
 
+    @Override
+    protected void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index));
+    }
+
 }
TwitterRipperTest.java
@@ -5,7 +5,6 @@ import java.net.URL;
 
 import com.rarchives.ripme.ripper.rippers.TwitterRipper;
 
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 public class TwitterRipperTest extends RippersTest {
@@ -16,7 +15,6 @@ public class TwitterRipperTest extends RippersTest {
     }
 
     @Test
-    @Disabled("https://github.com/RipMeApp/ripme/issues/251")
     public void testTwitterSearchRip() throws IOException {
         TwitterRipper ripper = new TwitterRipper(
                 new URL("https://twitter.com/search?f=tweets&q=from%3Aalinalixxx%20filter%3Aimages&src=typd"));