mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-02 03:50:12 +02:00

Merge pull request #1433 from Isaaku/issues/twitter_ripper

TwitterRipper changes
cyian-1756 committed 2019-10-22 12:03:04 -05:00 (committed via GitHub)
2 changed files with 134 additions and 119 deletions

File: TwitterRipper.java

@@ -14,18 +14,17 @@ import org.json.JSONObject;
 import org.json.JSONTokener;
 import org.jsoup.nodes.Document;
 
-import com.rarchives.ripme.ripper.AlbumRipper;
+import com.rarchives.ripme.ripper.AbstractJSONRipper;
 import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;
 
-public class TwitterRipper extends AlbumRipper {
-    int downloadUrls = 1;
+public class TwitterRipper extends AbstractJSONRipper {
 
     private static final String DOMAIN = "twitter.com", HOST = "twitter";
     private static final int MAX_REQUESTS = Utils.getConfigInteger("twitter.max_requests", 10);
     private static final boolean RIP_RETWEETS = Utils.getConfigBoolean("twitter.rip_retweets", true);
+    private static final int MAX_ITEMS_REQUEST = Utils.getConfigInteger("twitter.max_items_request", 200);
     private static final int WAIT_TIME = 2000;
 
     // Base 64 of consumer key : consumer secret
@@ -38,6 +37,10 @@ public class TwitterRipper extends AlbumRipper {
     private ALBUM_TYPE albumType;
     private String searchText, accountName;
+    private Long lastMaxID = 0L;
+    private int currentRequest = 0;
+    private boolean hasTweets = true;
 
     public TwitterRipper(URL url) throws IOException {
         super(url);
@@ -47,19 +50,23 @@ public class TwitterRipper extends AlbumRipper {
         }
     }
 
-    @Override
-    public boolean canRip(URL url) {
-        return url.getHost().endsWith(DOMAIN);
-    }
-
     @Override
     public URL sanitizeURL(URL url) throws MalformedURLException {
         // https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
-        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%\\-_]+).*$");
+        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?(.*)q=(?<search>[a-zA-Z0-9%\\-_]+).*$");
         Matcher m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             albumType = ALBUM_TYPE.SEARCH;
-            searchText = m.group(2);
+            searchText = m.group("search");
+            if (searchText.startsWith("from%3A")) {
+                // from filter not supported
+                searchText = searchText.substring(7);
+            }
+            if (searchText.contains("x")) {
+                // x character not supported
+                searchText = searchText.replace("x", "");
+            }
             return url;
         }
         p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9\\-_]+).*$");
@@ -114,10 +121,10 @@ public class TwitterRipper extends AlbumRipper {
         case ACCOUNT:
             req.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
                     .append("?screen_name=" + this.accountName).append("&include_entities=true")
-                    .append("&exclude_replies=true").append("&trim_user=true").append("&count=" + 200)
+                    .append("&exclude_replies=true").append("&trim_user=true").append("&count=" + MAX_ITEMS_REQUEST)
                    .append("&tweet_mode=extended");
             break;
-        case SEARCH:
+        case SEARCH:// Only get tweets from last week
             req.append("https://api.twitter.com/1.1/search/tweets.json").append("?q=" + this.searchText)
                     .append("&include_entities=true").append("&result_type=recent").append("&count=100")
                     .append("&tweet_mode=extended");
@@ -129,8 +136,9 @@ public class TwitterRipper extends AlbumRipper {
         return req.toString();
     }
 
-    private List<JSONObject> getTweets(String url) throws IOException {
-        List<JSONObject> tweets = new ArrayList<>();
+    private JSONObject getTweets() throws IOException {
+        currentRequest++;
+        String url = getApiURL(lastMaxID - 1);
         LOGGER.info(" Retrieving " + url);
         Document doc = Http.url(url).ignoreContentType().header("Authorization", "Bearer " + accessToken)
                 .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
@@ -148,22 +156,112 @@ public class TwitterRipper extends AlbumRipper {
         } else {
             statuses = (JSONArray) jsonObj;
         }
+        JSONObject r = new JSONObject();
+        r.put("tweets", statuses);
+        return r;
+    }
+
+    public String getPrefix(int index) {
+        return Utils.getConfigBoolean("download.save_order", true) ? String.format("%03d_", index) : "";
+    }
+
+    @Override
+    protected JSONObject getFirstPage() throws IOException {
+        getAccessToken();
+        switch (albumType) {
+        case ACCOUNT:
+            checkRateLimits("statuses", "/statuses/user_timeline");
+            break;
+        case SEARCH:
+            checkRateLimits("search", "/search/tweets");
+            break;
+        }
+        return getTweets();
+    }
+
+    @Override
+    protected JSONObject getNextPage(JSONObject doc) throws IOException {
+        try {
+            Thread.sleep(WAIT_TIME);
+        } catch (InterruptedException e) {
+            LOGGER.error("[!] Interrupted while waiting to load more results", e);
+        }
+        return currentRequest <= MAX_REQUESTS ? getTweets() : null;
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    protected String getDomain() {
+        return DOMAIN;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        switch (albumType) {
+        case ACCOUNT:
+            return "account_" + accountName;
+        case SEARCH:
+            StringBuilder gid = new StringBuilder();
+            for (int i = 0; i < searchText.length(); i++) {
+                char c = searchText.charAt(i);
+                // Ignore URL-encoded chars
+                if (c == '%') {
+                    gid.append('_');
+                    i += 2;
+                    // Ignore non-alphanumeric chars
+                } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
+                    gid.append(c);
+                }
+            }
+            return "search_" + gid.toString();
+        }
+        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
+    }
+
+    @Override
+    public boolean hasASAPRipping() {
+        return hasTweets;
+    }
+
+    @Override
+    protected List<String> getURLsFromJSON(JSONObject json) {
+        List<String> urls = new ArrayList<>();
+        List<JSONObject> tweets = new ArrayList<>();
+        JSONArray statuses = json.getJSONArray("tweets");
         for (int i = 0; i < statuses.length(); i++) {
             tweets.add((JSONObject) statuses.get(i));
         }
-        return tweets;
-    }
-
-    private int parseTweet(JSONObject tweet) throws MalformedURLException {
-        int parsedCount = 0;
+        if (tweets.isEmpty()) {
+            LOGGER.info(" No more tweets found.");
+            return urls;
+        }
+        LOGGER.debug("Twitter response #" + (currentRequest) + " Tweets:\n" + tweets);
+        if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getString("id_str"))) {
+            LOGGER.info(" No more tweet found.");
+            return urls;
+        }
+        for (JSONObject tweet : tweets) {
+            lastMaxID = tweet.getLong("id");
             if (!tweet.has("extended_entities")) {
-                LOGGER.error("XXX Tweet doesn't have entitites");
-                return 0;
+                LOGGER.error("XXX Tweet doesn't have entities");
+                continue;
             }
             if (!RIP_RETWEETS && tweet.has("retweeted_status")) {
                 LOGGER.info("Skipping a retweet as twitter.rip_retweet is set to false.");
-                return 0;
+                continue;
             }
             JSONObject entities = tweet.getJSONObject("extended_entities");
@@ -196,109 +294,28 @@ public class TwitterRipper extends AlbumRipper {
                     }
                 }
                 if (urlToDownload != null) {
-                    addURLToDownload(new URL(urlToDownload), getPrefix(downloadUrls));
-                    downloadUrls++;
+                    urls.add(urlToDownload);
                 } else {
                     LOGGER.error("URLToDownload was null");
                 }
-                parsedCount++;
             } else if (media.getString("type").equals("photo")) {
                 if (url.contains(".twimg.com/")) {
                     url += ":orig";
-                    addURLToDownload(new URL(url), getPrefix(downloadUrls));
-                    downloadUrls++;
-                    parsedCount++;
+                    urls.add(url);
                 } else {
                     LOGGER.debug("Unexpected media_url: " + url);
                 }
             }
         }
     }
-        return parsedCount;
-    }
-
-    public String getPrefix(int index) {
-        return Utils.getConfigBoolean("download.save_order", true) ? String.format("%03d_", index) : "";
-    }
+        return urls;
+    }
 
     @Override
-    public void rip() throws IOException {
-        getAccessToken();
-        switch (albumType) {
-        case ACCOUNT:
-            checkRateLimits("statuses", "/statuses/user_timeline");
-            break;
-        case SEARCH:
-            checkRateLimits("search", "/search/tweets");
-            break;
-        }
-        Long lastMaxID = 0L;
-        int parsedCount = 0;
-        for (int i = 0; i < MAX_REQUESTS; i++) {
-            List<JSONObject> tweets = getTweets(getApiURL(lastMaxID - 1));
-            if (tweets.isEmpty()) {
-                LOGGER.info(" No more tweets found.");
-                break;
-            }
-            LOGGER.debug("Twitter response #" + (i + 1) + " Tweets:\n" + tweets);
-            if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getString("id_str"))) {
-                LOGGER.info(" No more tweet found.");
-                break;
-            }
-            for (JSONObject tweet : tweets) {
-                lastMaxID = tweet.getLong("id");
-                parsedCount += parseTweet(tweet);
-                if (isStopped() || (isThisATest() && parsedCount > 0)) {
-                    break;
-                }
-            }
-            if (isStopped() || (isThisATest() && parsedCount > 0)) {
-                break;
-            }
-            try {
-                Thread.sleep(WAIT_TIME);
-            } catch (InterruptedException e) {
-                LOGGER.error("[!] Interrupted while waiting to load more results", e);
-                break;
-            }
-        }
-        waitForThreads();
-    }
-
-    @Override
-    public String getHost() {
-        return HOST;
-    }
-
-    @Override
-    public String getGID(URL url) throws MalformedURLException {
-        switch (albumType) {
-        case ACCOUNT:
-            return "account_" + accountName;
-        case SEARCH:
-            StringBuilder gid = new StringBuilder();
-            for (int i = 0; i < searchText.length(); i++) {
-                char c = searchText.charAt(i);
-                // Ignore URL-encoded chars
-                if (c == '%') {
-                    gid.append('_');
-                    i += 2;
-                    // Ignore non-alphanumeric chars
-                } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
-                    gid.append(c);
-                }
-            }
-            return "search_" + gid.toString();
-        }
-        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
-    }
+    protected void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index));
+    }
 }
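Note on the refactor: extending AbstractJSONRipper instead of AlbumRipper means TwitterRipper no longer drives its own download loop. The base class fetches the first JSON page, extracts URLs from each page, downloads them, and asks for the next page until null comes back. The sketch below illustrates that template-method flow under stated assumptions; it is not RipMe's actual AbstractJSONRipper source (which this diff does not show), though the abstract method names mirror the overrides above.

import java.io.IOException;
import java.net.URL;
import java.util.List;

import org.json.JSONObject;

// Hypothetical simplification of the AbstractJSONRipper contract; the real
// base class also handles threading, progress, and stop/test checks.
abstract class PaginatedJsonRipper {

    protected abstract JSONObject getFirstPage() throws IOException;

    protected abstract JSONObject getNextPage(JSONObject doc) throws IOException;

    protected abstract List<String> getURLsFromJSON(JSONObject json);

    protected abstract void downloadURL(URL url, int index);

    public void rip() throws IOException {
        int index = 0;
        JSONObject page = getFirstPage();
        while (page != null) {
            // An empty URL list ends the rip early (TwitterRipper returns an
            // empty list once no more tweets come back).
            List<String> urls = getURLsFromJSON(page);
            if (urls.isEmpty()) {
                break;
            }
            for (String url : urls) {
                downloadURL(new URL(url), ++index);
            }
            // TwitterRipper's getNextPage() sleeps WAIT_TIME and returns null
            // once currentRequest exceeds MAX_REQUESTS, ending the loop.
            page = getNextPage(page);
        }
    }
}

Under this flow, the currentRequest <= MAX_REQUESTS check inside getNextPage() replaces the old for (int i = 0; i < MAX_REQUESTS; i++) loop in rip(), and downloadURL() replaces the direct addURLToDownload() calls scattered through parseTweet().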

File: TwitterRipperTest.java

@@ -5,7 +5,6 @@ import java.net.URL;
 import com.rarchives.ripme.ripper.rippers.TwitterRipper;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 public class TwitterRipperTest extends RippersTest {
@@ -16,7 +15,6 @@ public class TwitterRipperTest extends RippersTest {
     }
 
     @Test
-    @Disabled("https://github.com/RipMeApp/ripme/issues/251")
     public void testTwitterSearchRip() throws IOException {
         TwitterRipper ripper = new TwitterRipper(
                 new URL("https://twitter.com/search?f=tweets&q=from%3Aalinalixxx%20filter%3Aimages&src=typd"));