1
0
mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-30 09:10:44 +02:00

Merge pull request #1229 from cyian-1756/ig-fix

Fixed instagram ripper
This commit is contained in:
cyian-1756
2019-03-02 17:05:56 -05:00
committed by GitHub

View File

@@ -39,9 +39,6 @@ public class InstagramRipper extends AbstractJSONRipper {
private String userID;
private String rhx_gis = null;
private String csrftoken;
// Run into a weird issue with Jsoup cutting some json pages in half, this is a work around
// see https://github.com/RipMeApp/ripme/issues/601
private String workAroundJsonString;
@@ -192,6 +189,9 @@ public class InstagramRipper extends AbstractJSONRipper {
Document p = resp.parse();
// Get the query hash so we can download the next page
qHash = getQHash(p);
if (qHash == null) {
throw new IOException("Unable to extract qhash from page");
}
return getJSONFromPage(p);
}
@@ -398,7 +398,6 @@ public class InstagramRipper extends AbstractJSONRipper {
}
private boolean pageHasImages(JSONObject json) {
LOGGER.info(json);
int numberOfImages = json.getJSONObject("data").getJSONObject("user")
.getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length();
if (numberOfImages == 0) {
@@ -422,23 +421,36 @@ public class InstagramRipper extends AbstractJSONRipper {
}
in.close();
workAroundJsonString = sb.toString();
return new JSONObject(sb.toString());
} catch (MalformedURLException e) {
LOGGER.info("Unable to get query_hash, " + url + " is a malformed URL");
LOGGER.info("Unable to get page, " + url + " is a malformed URL");
return null;
} catch (IOException e) {
LOGGER.info("Unable to get query_hash");
LOGGER.info("Unable to get page");
LOGGER.info(e.getMessage());
return null;
}
}
private String getQhashUrl(Document doc) {
for(Element el : doc.select("link[rel=preload]")) {
if (el.attr("href").contains("ProfilePageContainer")) {
return el.attr("href");
}
}
for(Element el : doc.select("link[rel=preload]")) {
if (el.attr("href").contains("metro")) {
return el.attr("href");
}
}
return null;
}
private String getQHash(Document doc) {
String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href");
String jsFileURL = "https://www.instagram.com" + getQhashUrl(doc);
StringBuilder sb = new StringBuilder();
Document jsPage;
LOGGER.info(jsFileURL);
try {
// We can't use Jsoup here because it won't download a non-html file larger than a MB
// even if you set maxBodySize to 0
@@ -454,7 +466,7 @@ public class InstagramRipper extends AbstractJSONRipper {
LOGGER.info("Unable to get query_hash, " + jsFileURL + " is a malformed URL");
return null;
} catch (IOException e) {
LOGGER.info("Unable to get query_hash");
LOGGER.info("Unable to get query_hash from " + jsFileURL);
LOGGER.info(e.getMessage());
return null;
}
@@ -468,6 +480,12 @@ public class InstagramRipper extends AbstractJSONRipper {
m = jsP.matcher(sb.toString());
if (m.find()) {
return m.group(1);
} else {
jsP = Pattern.compile(",u=.([a-zA-Z0-9]+).");
m = jsP.matcher(sb.toString());
if (m.find()) {
return m.group(1);
}
}
}
@@ -477,6 +495,7 @@ public class InstagramRipper extends AbstractJSONRipper {
if (m.find()) {
return m.group(1);
}
}
LOGGER.error("Could not find query_hash on " + jsFileURL);
return null;