mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-02 03:50:12 +02:00

Merge pull request #1433 from Isaaku/issues/twitter_ripper

TwitterRipper changes
cyian-1756 committed 2019-10-22 12:03:04 -05:00 (committed via GitHub)
2 changed files with 134 additions and 119 deletions

File: TwitterRipper.java

@@ -14,18 +14,17 @@ import org.json.JSONObject;
 import org.json.JSONTokener;
 import org.jsoup.nodes.Document;
 
-import com.rarchives.ripme.ripper.AlbumRipper;
+import com.rarchives.ripme.ripper.AbstractJSONRipper;
 import com.rarchives.ripme.utils.Http;
 import com.rarchives.ripme.utils.Utils;
 
-public class TwitterRipper extends AlbumRipper {
-    int downloadUrls = 1;
+public class TwitterRipper extends AbstractJSONRipper {
 
     private static final String DOMAIN = "twitter.com", HOST = "twitter";
     private static final int MAX_REQUESTS = Utils.getConfigInteger("twitter.max_requests", 10);
     private static final boolean RIP_RETWEETS = Utils.getConfigBoolean("twitter.rip_retweets", true);
+    private static final int MAX_ITEMS_REQUEST = Utils.getConfigInteger("twitter.max_items_request", 200);
     private static final int WAIT_TIME = 2000;
 
     // Base 64 of consumer key : consumer secret
@@ -38,6 +37,10 @@ public class TwitterRipper extends AlbumRipper {
     private ALBUM_TYPE albumType;
     private String searchText, accountName;
+    private Long lastMaxID = 0L;
+    private int currentRequest = 0;
+    private boolean hasTweets = true;
 
     public TwitterRipper(URL url) throws IOException {
         super(url);
@@ -47,19 +50,23 @@ public class TwitterRipper extends AlbumRipper {
         }
     }
 
-    @Override
-    public boolean canRip(URL url) {
-        return url.getHost().endsWith(DOMAIN);
-    }
-
     @Override
     public URL sanitizeURL(URL url) throws MalformedURLException {
         // https://twitter.com/search?q=from%3Apurrbunny%20filter%3Aimages&src=typd
-        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?q=([a-zA-Z0-9%\\-_]+).*$");
+        Pattern p = Pattern.compile("^https?://(m\\.)?twitter\\.com/search\\?(.*)q=(?<search>[a-zA-Z0-9%\\-_]+).*$");
         Matcher m = p.matcher(url.toExternalForm());
         if (m.matches()) {
             albumType = ALBUM_TYPE.SEARCH;
-            searchText = m.group(2);
+            searchText = m.group("search");
+            if (searchText.startsWith("from%3A")) {
+                // from filter not supported
+                searchText = searchText.substring(7);
+            }
+            if (searchText.contains("x")) {
+                // x character not supported
+                searchText = searchText.replace("x", "");
+            }
             return url;
         }
         p = Pattern.compile("^https?://(m\\.)?twitter\\.com/([a-zA-Z0-9\\-_]+).*$");
@@ -114,10 +121,10 @@ public class TwitterRipper extends AlbumRipper {
         case ACCOUNT:
             req.append("https://api.twitter.com/1.1/statuses/user_timeline.json")
                     .append("?screen_name=" + this.accountName).append("&include_entities=true")
-                    .append("&exclude_replies=true").append("&trim_user=true").append("&count=" + 200)
+                    .append("&exclude_replies=true").append("&trim_user=true").append("&count=" + MAX_ITEMS_REQUEST)
                    .append("&tweet_mode=extended");
             break;
-        case SEARCH:
+        case SEARCH:// Only get tweets from last week
             req.append("https://api.twitter.com/1.1/search/tweets.json").append("?q=" + this.searchText)
                     .append("&include_entities=true").append("&result_type=recent").append("&count=100")
                     .append("&tweet_mode=extended");
@@ -129,8 +136,9 @@ public class TwitterRipper extends AlbumRipper {
         return req.toString();
     }
 
-    private List<JSONObject> getTweets(String url) throws IOException {
-        List<JSONObject> tweets = new ArrayList<>();
+    private JSONObject getTweets() throws IOException {
+        currentRequest++;
+        String url = getApiURL(lastMaxID - 1);
         LOGGER.info(" Retrieving " + url);
         Document doc = Http.url(url).ignoreContentType().header("Authorization", "Bearer " + accessToken)
                 .header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
@@ -148,22 +156,112 @@ public class TwitterRipper extends AlbumRipper {
         } else {
             statuses = (JSONArray) jsonObj;
         }
+        JSONObject r = new JSONObject();
+        r.put("tweets", statuses);
+        return r;
+    }
+
+    public String getPrefix(int index) {
+        return Utils.getConfigBoolean("download.save_order", true) ? String.format("%03d_", index) : "";
+    }
+
+    @Override
+    protected JSONObject getFirstPage() throws IOException {
+        getAccessToken();
+        switch (albumType) {
+        case ACCOUNT:
+            checkRateLimits("statuses", "/statuses/user_timeline");
+            break;
+        case SEARCH:
+            checkRateLimits("search", "/search/tweets");
+            break;
+        }
+        return getTweets();
+    }
+
+    @Override
+    protected JSONObject getNextPage(JSONObject doc) throws IOException {
+        try {
+            Thread.sleep(WAIT_TIME);
+        } catch (InterruptedException e) {
+            LOGGER.error("[!] Interrupted while waiting to load more results", e);
+        }
+        return currentRequest <= MAX_REQUESTS ? getTweets() : null;
+    }
+
+    @Override
+    public String getHost() {
+        return HOST;
+    }
+
+    @Override
+    protected String getDomain() {
+        return DOMAIN;
+    }
+
+    @Override
+    public String getGID(URL url) throws MalformedURLException {
+        switch (albumType) {
+        case ACCOUNT:
+            return "account_" + accountName;
+        case SEARCH:
+            StringBuilder gid = new StringBuilder();
+            for (int i = 0; i < searchText.length(); i++) {
+                char c = searchText.charAt(i);
+                // Ignore URL-encoded chars
+                if (c == '%') {
+                    gid.append('_');
+                    i += 2;
+                    // Ignore non-alphanumeric chars
+                } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
+                    gid.append(c);
+                }
+            }
+            return "search_" + gid.toString();
+        }
+        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
+    }
+
+    @Override
+    public boolean hasASAPRipping() {
+        return hasTweets;
+    }
+
+    @Override
+    protected List<String> getURLsFromJSON(JSONObject json) {
+        List<String> urls = new ArrayList<>();
+        List<JSONObject> tweets = new ArrayList<>();
+        JSONArray statuses = json.getJSONArray("tweets");
         for (int i = 0; i < statuses.length(); i++) {
             tweets.add((JSONObject) statuses.get(i));
         }
-        return tweets;
-    }
-
-    private int parseTweet(JSONObject tweet) throws MalformedURLException {
-        int parsedCount = 0;
+        if (tweets.isEmpty()) {
+            LOGGER.info(" No more tweets found.");
+            return urls;
+        }
+        LOGGER.debug("Twitter response #" + (currentRequest) + " Tweets:\n" + tweets);
+        if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getString("id_str"))) {
+            LOGGER.info(" No more tweet found.");
+            return urls;
+        }
+        for (JSONObject tweet : tweets) {
+            lastMaxID = tweet.getLong("id");
             if (!tweet.has("extended_entities")) {
-                LOGGER.error("XXX Tweet doesn't have entitites");
-                return 0;
+                LOGGER.error("XXX Tweet doesn't have entities");
+                continue;
             }
             if (!RIP_RETWEETS && tweet.has("retweeted_status")) {
                 LOGGER.info("Skipping a retweet as twitter.rip_retweet is set to false.");
-                return 0;
+                continue;
             }
             JSONObject entities = tweet.getJSONObject("extended_entities");
@@ -196,109 +294,28 @@ public class TwitterRipper extends AlbumRipper {
                     }
                 }
                 if (urlToDownload != null) {
-                    addURLToDownload(new URL(urlToDownload), getPrefix(downloadUrls));
-                    downloadUrls++;
+                    urls.add(urlToDownload);
                 } else {
                     LOGGER.error("URLToDownload was null");
                 }
-                parsedCount++;
             } else if (media.getString("type").equals("photo")) {
                 if (url.contains(".twimg.com/")) {
                     url += ":orig";
-                    addURLToDownload(new URL(url), getPrefix(downloadUrls));
-                    downloadUrls++;
-                    parsedCount++;
+                    urls.add(url);
                 } else {
                     LOGGER.debug("Unexpected media_url: " + url);
                 }
             }
         }
     }
-        return parsedCount;
-    }
-
-    public String getPrefix(int index) {
-        return Utils.getConfigBoolean("download.save_order", true) ? String.format("%03d_", index) : "";
-    }
+        return urls;
+    }
 
     @Override
-    public void rip() throws IOException {
-        getAccessToken();
-        switch (albumType) {
-        case ACCOUNT:
-            checkRateLimits("statuses", "/statuses/user_timeline");
-            break;
-        case SEARCH:
-            checkRateLimits("search", "/search/tweets");
-            break;
-        }
-        Long lastMaxID = 0L;
-        int parsedCount = 0;
-        for (int i = 0; i < MAX_REQUESTS; i++) {
-            List<JSONObject> tweets = getTweets(getApiURL(lastMaxID - 1));
-            if (tweets.isEmpty()) {
-                LOGGER.info(" No more tweets found.");
-                break;
-            }
-            LOGGER.debug("Twitter response #" + (i + 1) + " Tweets:\n" + tweets);
-            if (tweets.size() == 1 && lastMaxID.equals(tweets.get(0).getString("id_str"))) {
-                LOGGER.info(" No more tweet found.");
-                break;
-            }
-            for (JSONObject tweet : tweets) {
-                lastMaxID = tweet.getLong("id");
-                parsedCount += parseTweet(tweet);
-                if (isStopped() || (isThisATest() && parsedCount > 0)) {
-                    break;
-                }
-            }
-            if (isStopped() || (isThisATest() && parsedCount > 0)) {
-                break;
-            }
-            try {
-                Thread.sleep(WAIT_TIME);
-            } catch (InterruptedException e) {
-                LOGGER.error("[!] Interrupted while waiting to load more results", e);
-                break;
-            }
-        }
-        waitForThreads();
-    }
-
-    @Override
-    public String getHost() {
-        return HOST;
-    }
-
-    @Override
-    public String getGID(URL url) throws MalformedURLException {
-        switch (albumType) {
-        case ACCOUNT:
-            return "account_" + accountName;
-        case SEARCH:
-            StringBuilder gid = new StringBuilder();
-            for (int i = 0; i < searchText.length(); i++) {
-                char c = searchText.charAt(i);
-                // Ignore URL-encoded chars
-                if (c == '%') {
-                    gid.append('_');
-                    i += 2;
-                    // Ignore non-alphanumeric chars
-                } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
-                    gid.append(c);
-                }
-            }
-            return "search_" + gid.toString();
-        }
-        throw new MalformedURLException("Could not decide type of URL (search/account): " + url);
-    }
+    protected void downloadURL(URL url, int index) {
+        addURLToDownload(url, getPrefix(index));
+    }
 }
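Note on the refactor: extending AbstractJSONRipper instead of AlbumRipper means TwitterRipper no longer drives its own download loop. The base class fetches the first JSON page, extracts URLs from each page, downloads them, and asks for the next page until null comes back. The sketch below illustrates that template-method flow under stated assumptions; it is not RipMe's actual AbstractJSONRipper source (which this diff does not show), though the abstract method names mirror the overrides above.

import java.io.IOException;
import java.net.URL;
import java.util.List;

import org.json.JSONObject;

// Hypothetical simplification of the AbstractJSONRipper contract; the real
// base class also handles threading, progress, and stop/test checks.
abstract class PaginatedJsonRipper {

    protected abstract JSONObject getFirstPage() throws IOException;

    protected abstract JSONObject getNextPage(JSONObject doc) throws IOException;

    protected abstract List<String> getURLsFromJSON(JSONObject json);

    protected abstract void downloadURL(URL url, int index);

    public void rip() throws IOException {
        int index = 0;
        JSONObject page = getFirstPage();
        while (page != null) {
            // An empty URL list ends the rip early (TwitterRipper returns an
            // empty list once no more tweets come back).
            List<String> urls = getURLsFromJSON(page);
            if (urls.isEmpty()) {
                break;
            }
            for (String url : urls) {
                downloadURL(new URL(url), ++index);
            }
            // TwitterRipper's getNextPage() sleeps WAIT_TIME and returns null
            // once currentRequest exceeds MAX_REQUESTS, ending the loop.
            page = getNextPage(page);
        }
    }
}

Under this flow, the currentRequest <= MAX_REQUESTS check inside getNextPage() replaces the old for (int i = 0; i < MAX_REQUESTS; i++) loop in rip(), and downloadURL() replaces the direct addURLToDownload() calls scattered through parseTweet().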

File: TwitterRipperTest.java

@@ -5,7 +5,6 @@ import java.net.URL;
 import com.rarchives.ripme.ripper.rippers.TwitterRipper;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 public class TwitterRipperTest extends RippersTest {
@@ -16,7 +15,6 @@ public class TwitterRipperTest extends RippersTest {
     }
 
     @Test
-    @Disabled("https://github.com/RipMeApp/ripme/issues/251")
     public void testTwitterSearchRip() throws IOException {
         TwitterRipper ripper = new TwitterRipper(
                 new URL("https://twitter.com/search?f=tweets&q=from%3Aalinalixxx%20filter%3Aimages&src=typd"));