mirror of
https://github.com/RipMeApp/ripme.git
synced 2025-08-22 21:43:06 +02:00
Merge pull request #1499 from taurhel/master
InstagramRipper fixed by moving qHash extraction from regex to js parsing
This commit is contained in:
@@ -3,6 +3,16 @@ package com.rarchives.ripme.ripper.rippers;
|
|||||||
import com.rarchives.ripme.ripper.AbstractJSONRipper;
|
import com.rarchives.ripme.ripper.AbstractJSONRipper;
|
||||||
import com.rarchives.ripme.utils.Http;
|
import com.rarchives.ripme.utils.Http;
|
||||||
import com.rarchives.ripme.utils.Utils;
|
import com.rarchives.ripme.utils.Utils;
|
||||||
|
import jdk.nashorn.internal.ir.Block;
|
||||||
|
import jdk.nashorn.internal.ir.CallNode;
|
||||||
|
import jdk.nashorn.internal.ir.ExpressionStatement;
|
||||||
|
import jdk.nashorn.internal.ir.FunctionNode;
|
||||||
|
import jdk.nashorn.internal.ir.Statement;
|
||||||
|
import jdk.nashorn.internal.parser.Parser;
|
||||||
|
import jdk.nashorn.internal.runtime.Context;
|
||||||
|
import jdk.nashorn.internal.runtime.ErrorManager;
|
||||||
|
import jdk.nashorn.internal.runtime.Source;
|
||||||
|
import jdk.nashorn.internal.runtime.options.Options;
|
||||||
import org.json.JSONArray;
|
import org.json.JSONArray;
|
||||||
import org.json.JSONObject;
|
import org.json.JSONObject;
|
||||||
import org.jsoup.Connection;
|
import org.jsoup.Connection;
|
||||||
@@ -24,6 +34,8 @@ import java.util.Objects;
|
|||||||
import java.util.Spliterators;
|
import java.util.Spliterators;
|
||||||
import java.util.function.BiFunction;
|
import java.util.function.BiFunction;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
|
import java.util.function.Function;
|
||||||
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@@ -141,7 +153,7 @@ public class InstagramRipper extends AbstractJSONRipper {
|
|||||||
qHash = getQhash(document);
|
qHash = getQhash(document);
|
||||||
JSONObject jsonObject = getJsonObjectFromDoc(document);
|
JSONObject jsonObject = getJsonObjectFromDoc(document);
|
||||||
String hashtagNamePath = "entry_data.TagPage[0].graphql.hashtag.name";
|
String hashtagNamePath = "entry_data.TagPage[0].graphql.hashtag.name";
|
||||||
String singlePostIdPath = "entry_data.PostPage[0].graphql.shortcode_media.shortcode";
|
String singlePostIdPath = "graphql.shortcode_media.shortcode";
|
||||||
String profileIdPath = "entry_data.ProfilePage[0].graphql.user.id";
|
String profileIdPath = "entry_data.ProfilePage[0].graphql.user.id";
|
||||||
String storiesPath = "entry_data.StoriesPage[0].user.id";
|
String storiesPath = "entry_data.StoriesPage[0].user.id";
|
||||||
String idPath = hashtagRip ? hashtagNamePath : storiesRip ? storiesPath : postRip ? singlePostIdPath : profileIdPath;
|
String idPath = hashtagRip ? hashtagNamePath : storiesRip ? storiesPath : postRip ? singlePostIdPath : profileIdPath;
|
||||||
@@ -164,37 +176,53 @@ public class InstagramRipper extends AbstractJSONRipper {
|
|||||||
if (postRip) {
|
if (postRip) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
String pinnedRegex = "=\"(?<hash>[0-9a-f]+)\"[^;]+[.]generatePaginationActionCreators";
|
Predicate<String> hrefFilter = (storiesRip || pinnedReelRip) ? href -> href.contains("Consumer.js") :
|
||||||
String storiesRegex = "=50,h=\"(?<hash>[0-9a-f]+)\"";
|
href -> href.contains("ProfilePageContainer.js") || href.contains("TagPageContainer.js");
|
||||||
String hashRegex = "%s[^;]+pagination}?,queryId:\"(?<hash>[0-9a-f]+)\"";
|
|
||||||
String forHashtag = "tagMedia[.]byTagName";
|
String href = doc.select("link[rel=preload]").stream()
|
||||||
String forTagged = "taggedPosts[.]byUserId";
|
.map(link -> link.attr("href"))
|
||||||
String forUser = "profilePosts[.]byUserId";
|
.filter(hrefFilter)
|
||||||
String href = "";
|
.findFirst().orElse("");
|
||||||
Pattern pattern = Pattern.compile(format(hashRegex, forUser));
|
String body = Http.url("https://www.instagram.com" + href).cookies(cookies).response().body();
|
||||||
for (Element el : doc.select("link[rel=preload]")) {
|
|
||||||
href = el.attr("href");
|
Function<String, String> hashExtractor =
|
||||||
if ((storiesRip || pinnedReelRip) && href.contains("Consumer.js")) {
|
storiesRip || pinnedReelRip ? this::getStoriesHash :
|
||||||
pattern = Pattern.compile(storiesRegex);
|
pinnedRip ? this::getPinnedHash : hashtagRip ? this::getTagHash :
|
||||||
break;
|
taggedRip ? this::getUserTagHash : this::getProfileHash;
|
||||||
} else if (href.contains("ProfilePageContainer.js") || href.contains("TagPageContainer.js")) {
|
|
||||||
pattern = Pattern.compile(pinnedRip ? pinnedRegex :
|
return hashExtractor.apply(body);
|
||||||
format(hashRegex, hashtagRip ? forHashtag : taggedRip ? forTagged : forUser));
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getStoriesHash(String jsData) {
|
||||||
|
return getHashValue(jsData, "loadStoryViewers", -5);
|
||||||
}
|
}
|
||||||
Matcher matcher = pattern.matcher(Http.url("https://www.instagram.com" + href).response().body());
|
|
||||||
return matcher.find() ? matcher.group("hash") : null;
|
private String getProfileHash(String jsData) {
|
||||||
|
return getHashValue(jsData, "loadProfilePageExtras", -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getPinnedHash(String jsData) {
|
||||||
|
return getHashValue(jsData, "loadProfilePageExtras", -2);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getTagHash(String jsData) {
|
||||||
|
return getHashValue(jsData, "requestNextTagMedia", -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getUserTagHash(String jsData) {
|
||||||
|
return getHashValue(jsData, "requestNextTaggedPosts", -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
private JSONObject getJsonObjectFromDoc(Document document) {
|
private JSONObject getJsonObjectFromDoc(Document document) {
|
||||||
for (Element script : document.select("script[type=text/javascript]")) {
|
for (Element script : document.select("script[type=text/javascript]")) {
|
||||||
String scriptText = script.data();
|
String scriptText = script.data();
|
||||||
if (scriptText.startsWith("window._sharedData")) {
|
if (scriptText.startsWith("window._sharedData") || scriptText.startsWith("window.__additionalDataLoaded")) {
|
||||||
String jsonText = scriptText.replaceAll("[^{]*([{].*})[^}]*", "$1");
|
String jsonText = scriptText.replaceAll("[^{]*([{].*})[^}]*", "$1");
|
||||||
|
if (jsonText.contains("graphql") || jsonText.contains("StoriesPage")) {
|
||||||
return new JSONObject(jsonText);
|
return new JSONObject(jsonText);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -383,6 +411,44 @@ public class InstagramRipper extends AbstractJSONRipper {
|
|||||||
addURLToDownload(url, itemPrefixes.get(index - 1), "", null, cookies);
|
addURLToDownload(url, itemPrefixes.get(index - 1), "", null, cookies);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Javascript parsing
|
||||||
|
/* ------------------------------------------------------------------------------------------------------- */
|
||||||
|
private String getHashValue(String javaScriptData, String keyword, int offset) {
|
||||||
|
List<Statement> statements = getJsBodyBlock(javaScriptData).getStatements();
|
||||||
|
return statements.stream()
|
||||||
|
.flatMap(statement -> filterItems(statement, ExpressionStatement.class))
|
||||||
|
.map(ExpressionStatement::getExpression)
|
||||||
|
.flatMap(expression -> filterItems(expression, CallNode.class))
|
||||||
|
.map(CallNode::getArgs)
|
||||||
|
.map(expressions -> expressions.get(0))
|
||||||
|
.flatMap(expression -> filterItems(expression, FunctionNode.class))
|
||||||
|
.map(FunctionNode::getBody)
|
||||||
|
.map(Block::getStatements)
|
||||||
|
.map(statementList -> lookForHash(statementList, keyword, offset))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.findFirst().orElse(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String lookForHash(List<Statement> list, String keyword, int offset) {
|
||||||
|
for (int i = 0; i < list.size(); i++) {
|
||||||
|
Statement st = list.get(i);
|
||||||
|
if (st.toString().contains(keyword)) {
|
||||||
|
return list.get(i + offset).toString().replaceAll(".*\"([0-9a-f]*)\".*", "$1");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private <T> Stream<T> filterItems(Object obj, Class<T> aClass) {
|
||||||
|
return Stream.of(obj).filter(aClass::isInstance).map(aClass::cast);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Block getJsBodyBlock(String javaScriptData) {
|
||||||
|
ErrorManager errors = new ErrorManager();
|
||||||
|
Context context = new Context(new Options("nashorn"), errors, Thread.currentThread().getContextClassLoader());
|
||||||
|
return new Parser(context.getEnv(), Source.sourceFor("name", javaScriptData), errors).parse().getBody();
|
||||||
|
}
|
||||||
|
|
||||||
// Some JSON helper methods below
|
// Some JSON helper methods below
|
||||||
/* ------------------------------------------------------------------------------------------------------- */
|
/* ------------------------------------------------------------------------------------------------------- */
|
||||||
private JSONObject getJsonObjectByPath(JSONObject object, String key) {
|
private JSONObject getJsonObjectByPath(JSONObject object, String key) {
|
||||||
|
@@ -1,5 +1,8 @@
|
|||||||
package com.rarchives.ripme.tst.ripper.rippers;
|
package com.rarchives.ripme.tst.ripper.rippers;
|
||||||
|
|
||||||
|
import com.rarchives.ripme.ripper.rippers.InstagramRipper;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@@ -7,10 +10,6 @@ import java.util.HashMap;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.rarchives.ripme.ripper.rippers.InstagramRipper;
|
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
public class InstagramRipperTest extends RippersTest {
|
public class InstagramRipperTest extends RippersTest {
|
||||||
@Test
|
@Test
|
||||||
public void testInstagramGID() throws IOException {
|
public void testInstagramGID() throws IOException {
|
||||||
@@ -33,7 +32,8 @@ public class InstagramRipperTest extends RippersTest {
|
|||||||
deleteDir(ripper.getWorkingDir());
|
deleteDir(ripper.getWorkingDir());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@Test @Disabled("Fails to rip")
|
|
||||||
|
@Test
|
||||||
public void testInstagramAlbums() throws IOException {
|
public void testInstagramAlbums() throws IOException {
|
||||||
List<URL> contentURLs = new ArrayList<>();
|
List<URL> contentURLs = new ArrayList<>();
|
||||||
// This unit test is a bit flaky
|
// This unit test is a bit flaky
|
||||||
|
Reference in New Issue
Block a user