diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java index 37e27214..d0f8dd9a 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/InstagramRipper.java @@ -39,9 +39,6 @@ public class InstagramRipper extends AbstractJSONRipper { private String userID; private String rhx_gis = null; private String csrftoken; - // Run into a weird issue with Jsoup cutting some json pages in half, this is a work around - // see https://github.com/RipMeApp/ripme/issues/601 - private String workAroundJsonString; @@ -192,6 +189,9 @@ public class InstagramRipper extends AbstractJSONRipper { Document p = resp.parse(); // Get the query hash so we can download the next page qHash = getQHash(p); + if (qHash == null) { + throw new IOException("Unable to extract qhash from page"); + } return getJSONFromPage(p); } @@ -398,7 +398,6 @@ public class InstagramRipper extends AbstractJSONRipper { } private boolean pageHasImages(JSONObject json) { - LOGGER.info(json); int numberOfImages = json.getJSONObject("data").getJSONObject("user") .getJSONObject("edge_owner_to_timeline_media").getJSONArray("edges").length(); if (numberOfImages == 0) { @@ -422,23 +421,36 @@ public class InstagramRipper extends AbstractJSONRipper { } in.close(); - workAroundJsonString = sb.toString(); return new JSONObject(sb.toString()); } catch (MalformedURLException e) { - LOGGER.info("Unable to get query_hash, " + url + " is a malformed URL"); + LOGGER.info("Unable to get page, " + url + " is a malformed URL"); return null; } catch (IOException e) { - LOGGER.info("Unable to get query_hash"); + LOGGER.info("Unable to get page"); LOGGER.info(e.getMessage()); return null; } } + private String getQhashUrl(Document doc) { + for(Element el : doc.select("link[rel=preload]")) { + if (el.attr("href").contains("ProfilePageContainer")) { + return el.attr("href"); + } + } + for(Element el : doc.select("link[rel=preload]")) { + if (el.attr("href").contains("metro")) { + return el.attr("href"); + } + } + return null; + } + private String getQHash(Document doc) { - String jsFileURL = "https://www.instagram.com" + doc.select("link[rel=preload]").attr("href"); + String jsFileURL = "https://www.instagram.com" + getQhashUrl(doc); StringBuilder sb = new StringBuilder(); - Document jsPage; + LOGGER.info(jsFileURL); try { // We can't use Jsoup here because it won't download a non-html file larger than a MB // even if you set maxBodySize to 0 @@ -454,7 +466,7 @@ public class InstagramRipper extends AbstractJSONRipper { LOGGER.info("Unable to get query_hash, " + jsFileURL + " is a malformed URL"); return null; } catch (IOException e) { - LOGGER.info("Unable to get query_hash"); + LOGGER.info("Unable to get query_hash from " + jsFileURL); LOGGER.info(e.getMessage()); return null; } @@ -468,6 +480,12 @@ public class InstagramRipper extends AbstractJSONRipper { m = jsP.matcher(sb.toString()); if (m.find()) { return m.group(1); + } else { + jsP = Pattern.compile(",u=.([a-zA-Z0-9]+)."); + m = jsP.matcher(sb.toString()); + if (m.find()) { + return m.group(1); + } } } @@ -477,6 +495,7 @@ public class InstagramRipper extends AbstractJSONRipper { if (m.find()) { return m.group(1); } + } LOGGER.error("Could not find query_hash on " + jsFileURL); return null;