
Replace AlbumRipper with AbstractJSONRipper

Change the regex to parse search links more reliably (see the sketch below)
Remove the "from" filter from searches since it's not supported
Remove the character "x" from searches since queries containing it return no results
Add a twitter.max_items_request config option to set the maximum number of tweets per request
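
For illustration, a self-contained sketch of the revised URL parsing. The two patterns and the clean-up steps are copied from sanitizeURL() in the diff below; the class name and printed values are ours:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SearchUrlDemo {
    public static void main(String[] args) {
        // Old pattern: required q= to be the first parameter after "search?"
        Pattern oldP = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%\\-_]+).*$");
        // New pattern: tolerates leading parameters (e.g. f=tweets&) and
        // captures the query text in a named group
        Pattern newP = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?(.*)q=(?<search>[a-zA-Z0-9%\\-_]+).*$");

        String url = "https://twitter.com/search?f=tweets&q=from%3Apurrbunny%20filter%3Aimages&src=typd";
        System.out.println(oldP.matcher(url).matches()); // false: q= is not the first parameter
        Matcher m = newP.matcher(url);
        if (m.matches()) {
            String searchText = m.group("search"); // from%3Apurrbunny%20filter%3Aimages
            if (searchText.startsWith("from%3A")) {
                searchText = searchText.substring(7); // the "from" filter is not supported
            }
            searchText = searchText.replace("x", ""); // searches containing "x" return nothing
            System.out.println(searchText); // purrbunny%20filter%3Aimages
        }
    }
}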
Isaaku 2019-09-19 17:52:57 -05:00
parent bb287b7331
commit 9f106cd0a3
2 changed files with 134 additions and 119 deletions

TwitterRipper.java

@@ -14,18 +14,17 @@ import org.json.JSONObject;
import org.json.JSONTokener;
import org.jsoup.nodes.Document;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractJSONRipper;
import com.rarchives.ripme.utils.Http;
import com.rarchives.ripme.utils.Utils;
public class TwitterRipper extends AlbumRipper {
int downloadUrls = 1;
public class TwitterRipper extends AbstractJSONRipper {
private static final String DOMAIN = "twitter.com", HOST = "twitter";
private static final int MAX_REQUESTS = Utils.getConfigInteger("twitter.max_requests", 10);
private static final boolean RIP_RETWEETS = Utils.getConfigBoolean("twitter.rip_retweets", true);
private static final int MAX_ITEMS_REQUEST = Utils.getConfigInteger("twitter.max_items_request", 200);
private static final int WAIT_TIME = 2000;
// Base 64 of consumer key : consumer secret
@@ -38,6 +37,10 @@ public class TwitterRipper extends AlbumRipper {
private ALBUM_TYPE albumType;
private String searchText, accountName;
private Long lastMaxID = 0L;
private int currentRequest = 0;
private boolean hasTweets = true;
public TwitterRipper(URL url) throws IOException {
super(url);
@@ -47,19 +50,23 @@ public class TwitterRipper extends AlbumRipper {
}
}
@Override
public boolean canRip(URL url) {
return url.getHost().endsWith(DOMAIN);
}
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
// https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%\\-_]+).*$");
Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?(.*)q=(?<search>[a-zA-Z0-9%\\-_]+).*$");
Matcher m = p.matcher(url.toExternalForm());
if (m.matches()) {
albumType = ALBUM_TYPE.SEARCH;
searchText = m.group(2);
searchText = m.group("search");
if (searchText.startsWith("from%3A")) {
// from filter not supported
searchText = searchText.substring(7);
}
if (searchText.contains("x")) {
// x character not supported
searchText = searchText.replace("x", "");
}
return url;
}
p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9\\-_]+).*$");
@@ -114,10 +121,10 @@ public class TwitterRipper extends AlbumRipper {
case ACCOUNT:
req.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
.append("?screen_name=" + this.accountName).append("&include_entities=true")
.append("&exclude_replies=true").append("&trim_user=true").append("&count=" + 200)
.append("&exclude_replies=true").append("&trim_user=true").append("&count=" + MAX_ITEMS_REQUEST)
.append("&tweet_mode=extended");
break;
case SEARCH:
case SEARCH: // Only get tweets from the last week
req.append("https://api.twitter.com/1.1/search/tweets.json").append("?q=" + this.searchText)
.append("&include_entities=true").append("&result_type=recent").append("&count=100")
.append("&tweet_mode=extended");
@@ -129,8 +136,9 @@ public class TwitterRipper extends AlbumRipper {
return req.toString();
}
private List<JSONObject> getTweets(String url) throws IOException {
List<JSONObject> tweets = new ArrayList<>();
private JSONObject getTweets() throws IOException {
currentRequest++;
String url = getApiURL(lastMaxID - 1);
LOGGER.info(" Retrieving " + url);
Document doc = Http.url(url).ignoreContentType().header("Authorization", "Bearer " + accessToken)
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
@@ -148,74 +156,10 @@ public class TwitterRipper extends AlbumRipper {
} else {
statuses = (JSONArray) jsonObj;
}
for (int i = 0; i < statuses.length(); i++) {
tweets.add((JSONObject) statuses.get(i));
}
return tweets;
}
private int parseTweet(JSONObject tweet) throws MalformedURLException {
int parsedCount = 0;
if (!tweet.has("extended_entities")) {
LOGGER.error("XXX Tweet doesn't have entitites");
return 0;
}
if (!RIP_RETWEETS && tweet.has("retweeted_status")) {
LOGGER.info("Skipping a retweet as twitter.rip_retweet is set to false.");
return 0;
}
JSONObject entities = tweet.getJSONObject("extended_entities");
if (entities.has("media")) {
JSONArray medias = entities.getJSONArray("media");
String url;
JSONObject media;
for (int i = 0; i < medias.length(); i++) {
media = (JSONObject) medias.get(i);
url = media.getString("media_url");
if (media.getString("type").equals("video") || media.getString("type").equals("animated_gif")) {
JSONArray variants = media.getJSONObject("video_info").getJSONArray("variants");
int largestBitrate = 0;
String urlToDownload = null;
// Loop over all the video options and find the biggest video
for (int j = 0; j < variants.length(); j++) {
JSONObject variant = (JSONObject) variants.get(j);
LOGGER.info(variant);
// If the video doesn't have a bitrate it's a m3u8 file we can't download
if (variant.has("bitrate")) {
if (variant.getInt("bitrate") > largestBitrate) {
largestBitrate = variant.getInt("bitrate");
urlToDownload = variant.getString("url");
} else if (media.getString("type").equals("animated_gif")) {
// If the type if animated_gif the bitrate doesn't matter
urlToDownload = variant.getString("url");
}
}
}
if (urlToDownload != null) {
addURLToDownload(new URL(urlToDownload), getPrefix(downloadUrls));
downloadUrls++;
} else {
LOGGER.error("URLToDownload was null");
}
parsedCount++;
} else if (media.getString("type").equals("photo")) {
if (url.contains(".twimg.com/")) {
url += ":orig";
addURLToDownload(new URL(url), getPrefix(downloadUrls));
downloadUrls++;
parsedCount++;
} else {
LOGGER.debug("Unexpected media_url: " + url);
}
}
}
}
return parsedCount;
JSONObject r = new JSONObject();
r.put("tweets", statuses);
return r;
}
public String getPrefix(int index) {
@@ -223,7 +167,7 @@ public class TwitterRipper extends AlbumRipper {
}
@Override
public void rip() throws IOException {
protected JSONObject getFirstPage() throws IOException {
getAccessToken();
switch (albumType) {
@@ -235,42 +179,17 @@ public class TwitterRipper extends AlbumRipper {
break;
}
Long lastMaxID = 0L;
int parsedCount = 0;
for (int i = 0; i < MAX_REQUESTS; i++) {
List<JSONObject> tweets = getTweets(getApiURL(lastMaxID - 1));
if (tweets.isEmpty()) {
LOGGER.info(" No more tweets found.");
break;
}
LOGGER.debug("Twitter response #" + (i + 1) + " Tweets:\n" + tweets);
if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getString("id_str"))) {
LOGGER.info(" No more tweet found.");
break;
}
return getTweets();
}
for (JSONObject tweet : tweets) {
lastMaxID = tweet.getLong("id");
parsedCount += parseTweet(tweet);
if (isStopped() || (isThisATest() && parsedCount > 0)) {
break;
}
}
if (isStopped() || (isThisATest() && parsedCount > 0)) {
break;
}
try {
Thread.sleep(WAIT_TIME);
} catch (InterruptedException e) {
LOGGER.error("[!] Interrupted while waiting to load more results", e);
break;
}
@Override
protected JSONObject getNextPage(JSONObject doc) throws IOException {
try {
Thread.sleep(WAIT_TIME);
} catch (InterruptedException e) {
LOGGER.error("[!] Interrupted while waiting to load more results", e);
}
waitForThreads();
return currentRequest <= MAX_REQUESTS ? getTweets() : null;
}
@Override
@@ -278,6 +197,11 @@ public class TwitterRipper extends AlbumRipper {
return HOST;
}
@Override
protected String getDomain() {
return DOMAIN;
}
@Override
public String getGID(URL url) throws MalformedURLException {
switch (albumType) {
@@ -301,4 +225,97 @@ public class TwitterRipper extends AlbumRipper {
throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
}
@Override
public boolean hasASAPRipping() {
return hasTweets;
}
@Override
protected List<String> getURLsFromJSON(JSONObject json) {
List<String> urls = new ArrayList<>();
List<JSONObject> tweets = new ArrayList<>();
JSONArray statuses = json.getJSONArray("tweets");
for (int i = 0; i < statuses.length(); i++) {
tweets.add((JSONObject) statuses.get(i));
}
if (tweets.isEmpty()) {
LOGGER.info(" No more tweets found.");
return urls;
}
LOGGER.debug("Twitter response #" + (currentRequest) + " Tweets:\n" + tweets);
if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getLong("id"))) {
LOGGER.info(" No more tweets found.");
return urls;
}
for (JSONObject tweet : tweets) {
lastMaxID = tweet.getLong("id");
if (!tweet.has("extended_entities")) {
LOGGER.error("XXX Tweet doesn't have entities");
continue;
}
if (!RIP_RETWEETS && tweet.has("retweeted_status")) {
LOGGER.info("Skipping a retweet as twitter.rip_retweet is set to false.");
continue;
}
JSONObject entities = tweet.getJSONObject("extended_entities");
if (entities.has("media")) {
JSONArray medias = entities.getJSONArray("media");
String url;
JSONObject media;
for (int i = 0; i < medias.length(); i++) {
media = (JSONObject) medias.get(i);
url = media.getString("media_url");
if (media.getString("type").equals("video") || media.getString("type").equals("animated_gif")) {
JSONArray variants = media.getJSONObject("video_info").getJSONArray("variants");
int largestBitrate = 0;
String urlToDownload = null;
// Loop over all the video options and find the biggest video
for (int j = 0; j < variants.length(); j++) {
JSONObject variant = (JSONObject) variants.get(j);
LOGGER.info(variant);
// If the video doesn't have a bitrate it's an m3u8 file we can't download
if (variant.has("bitrate")) {
if (variant.getInt("bitrate") > largestBitrate) {
largestBitrate = variant.getInt("bitrate");
urlToDownload = variant.getString("url");
} else if (media.getString("type").equals("animated_gif")) {
// If the type is animated_gif the bitrate doesn't matter
urlToDownload = variant.getString("url");
}
}
}
if (urlToDownload != null) {
urls.add(urlToDownload);
} else {
LOGGER.error("URLToDownload was null");
}
} else if (media.getString("type").equals("photo")) {
if (url.contains(".twimg.com/")) {
url += ":orig";
urls.add(url);
} else {
LOGGER.debug("Unexpected media_url: " + url);
}
}
}
}
}
return urls;
}
@Override
protected void downloadURL(URL url, int index) {
addURLToDownload(url, getPrefix(index));
}
}
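
Taken together, these overrides replace the old hand-rolled rip() loop with AbstractJSONRipper's page-driven flow. A minimal sketch of that flow, assuming a driver loop like the one below (our illustration; the real base class in com.rarchives.ripme.ripper may differ in stop conditions and error handling):

import java.io.IOException;
import java.net.URL;
import java.util.List;
import org.json.JSONObject;

abstract class JsonPagingSketch {
    // The four hooks TwitterRipper now implements, as seen in the diff above
    protected abstract JSONObject getFirstPage() throws IOException;
    protected abstract JSONObject getNextPage(JSONObject doc) throws IOException;
    protected abstract List<String> getURLsFromJSON(JSONObject json);
    protected abstract void downloadURL(URL url, int index);

    public void rip() throws IOException {
        int index = 0;
        JSONObject page = getFirstPage();           // TwitterRipper: fetch token, first getTweets()
        while (page != null) {
            for (String url : getURLsFromJSON(page)) {
                downloadURL(new URL(url), ++index); // queued with getPrefix(index)
            }
            // TwitterRipper: sleeps WAIT_TIME between requests and returns
            // null once currentRequest exceeds MAX_REQUESTS
            page = getNextPage(page);
        }
    }
}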

TwitterRipperTest.java

@@ -5,7 +5,6 @@ import java.net.URL;
import com.rarchives.ripme.ripper.rippers.TwitterRipper;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
public class TwitterRipperTest extends RippersTest {
@@ -16,7 +15,6 @@ public class TwitterRipperTest extends RippersTest {
}
@Test
@Disabled("https://github.com/RipMeApp/ripme/issues/251")
public void testTwitterSearchRip() throws IOException {
TwitterRipper ripper = new TwitterRipper(
new URL("https://twitter.com/search?f=tweets&q=from%3Aalinalixxx%20filter%3Aimages&src=typd"));