diff --git a/pom.xml b/pom.xml index 9f86678f..2d6f9d62 100644 --- a/pom.xml +++ b/pom.xml @@ -46,7 +46,7 @@ org.json json - 20140107 + 20190722 commons-configuration diff --git a/src/main/java/com/rarchives/ripme/ripper/rippers/VkRipper.java b/src/main/java/com/rarchives/ripme/ripper/rippers/VkRipper.java index 99310dc4..b364a5ae 100644 --- a/src/main/java/com/rarchives/ripme/ripper/rippers/VkRipper.java +++ b/src/main/java/com/rarchives/ripme/ripper/rippers/VkRipper.java @@ -6,10 +6,12 @@ import java.net.URL; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; - +import org.apache.commons.lang.StringEscapeUtils; import com.rarchives.ripme.ripper.AbstractJSONRipper; import org.json.JSONArray; import org.json.JSONObject; +import org.jsoup.Connection.Method; +import org.jsoup.Connection.Response; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -26,6 +28,7 @@ public class VkRipper extends AbstractJSONRipper { private RipType RIP_TYPE; private String oid; + private int offset = 0; public VkRipper(URL url) throws IOException { super(url); @@ -59,68 +62,18 @@ public class VkRipper extends AbstractJSONRipper { String[] jsonStrings = doc.toString().split(""); return new JSONObject(jsonStrings[jsonStrings.length - 1]); } else { - Map photoIDsToURLs = new HashMap<>(); - int offset = 0; - while (true) { - LOGGER.info(" Retrieving " + this.url); - Map postData = new HashMap<>(); - postData.put("al", "1"); - postData.put("offset", Integer.toString(offset)); - postData.put("part", "1"); - Document doc = Http.url(this.url) - .referrer(this.url) - .ignoreContentType() - .data(postData) - .post(); - - String body = doc.toString(); - if (!body.contains(" elements = doc.select("a"); - Set photoIDsToGet = new HashSet<>(); - for (Element a : elements) { - if (!a.attr("onclick").contains("showPhoto('")) { - LOGGER.error("a: " + a); - continue; - } - String photoID = a.attr("onclick"); - photoID = photoID.substring(photoID.indexOf("showPhoto('") + "showPhoto('".length()); - photoID = photoID.substring(0, photoID.indexOf("'")); - if (!photoIDsToGet.contains(photoID)) { - photoIDsToGet.add(photoID); - } - } - for (String photoID : photoIDsToGet) { - if (!photoIDsToURLs.containsKey(photoID)) { - try { - photoIDsToURLs.putAll(getPhotoIDsToURLs(photoID)); - } catch (IOException e) { - LOGGER.error("Exception while retrieving photo id " + photoID, e); - continue; - } - } - if (!photoIDsToURLs.containsKey(photoID)) { - LOGGER.error("Could not find URL for photo ID: " + photoID); - continue; - } - if (isStopped() || isThisATest()) { - break; - } - } - - if (elements.size() < 40 || isStopped() || isThisATest()) { - break; - } - offset += elements.size(); - } - // Slight hack to make this into effectively a JSON ripper - return new JSONObject(photoIDsToURLs); + return getPage(); } } + @Override + protected JSONObject getNextPage(JSONObject doc) throws IOException { + if (isStopped() || isThisATest()) { + return null; + } + return getPage(); + } + @Override protected List getURLsFromJSON(JSONObject page) { List pageURLs = new ArrayList<>(); @@ -142,9 +95,9 @@ public class VkRipper extends AbstractJSONRipper { pageURLs.add(videoURL); } } else { - Iterator keys = page.keys(); + Iterator keys = page.keys(); while (keys.hasNext()) { - pageURLs.add(page.getString((String) keys.next())); + pageURLs.add(page.getString(keys.next())); } } return pageURLs; @@ -197,6 +150,7 @@ public class VkRipper extends AbstractJSONRipper { else { RIP_TYPE = RipType.IMAGE; } + super.rip(); } private Map getPhotoIDsToURLs(String photoID) throws IOException { @@ -208,40 +162,182 @@ public class VkRipper extends AbstractJSONRipper { postData.put("al", "1"); postData.put("module", "photos"); postData.put("photo", photoID); - Document doc = Jsoup - .connect("https://vk.com/al_photos.php") + Response res = Jsoup.connect("https://vk.com/al_photos.php") .header("Referer", this.url.toExternalForm()) + .header("Accept", "*/*") + .header("Accept-Language", "en-US,en;q=0.5") + .header("Content-Type", "application/x-www-form-urlencoded") + .header("X-Requested-With", "XMLHttpRequest") .ignoreContentType(true) .userAgent(USER_AGENT) .timeout(5000) .data(postData) - .post(); - String jsonString = doc.toString(); - jsonString = jsonString.substring(jsonString.indexOf("") + "".length()); - jsonString = jsonString.substring(0, jsonString.indexOf("")); - JSONArray json = new JSONArray(jsonString); - for (int i = 0; i < json.length(); i++) { - JSONObject jsonImage = json.getJSONObject(i); - for (String key : new String[] {"z_src", "y_src", "x_src"}) { - if (!jsonImage.has(key)) { - continue; - } - photoIDsToURLs.put(jsonImage.getString("id"), jsonImage.getString(key)); - break; - } + .method(Method.POST) + .execute(); + String jsonString = res.body(); + JSONObject json = new JSONObject(jsonString); + JSONObject photoObject = findJSONObjectContainingPhotoId(photoID, json); + String bestSourceUrl = getBestSourceUrl(photoObject); + + if (bestSourceUrl != null) { + photoIDsToURLs.put(photoID, bestSourceUrl); + } else { + LOGGER.error("Could not find image source for " + photoID); } + return photoIDsToURLs; } @Override public String getGID(URL url) throws MalformedURLException { - Pattern p = Pattern.compile("^https?://(www\\.)?vk\\.com/(photos|album|videos)-?([a-zA-Z0-9_]+).*$"); + Pattern p = Pattern.compile("^https?:\\/\\/(?:www\\.)?vk\\.com\\/((?:photos|album|videos)-?(?:[a-zA-Z0-9_]+).*$)"); Matcher m = p.matcher(url.toExternalForm()); if (!m.matches()) { throw new MalformedURLException("Expected format: http://vk.com/album#### or vk.com/photos####"); } - int count = m.groupCount(); - return m.group(count - 1) + m.group(count); + return m.group(1); } + + /** + * Finds the nested JSON object with entry "id": "photoID" recursively. + * @param photoID The photoId string to be found with "id" as the key. + * @param json Object of type JSONObject or JSONArray. + * @return JSONObject with id as the photoID or null. + */ + public JSONObject findJSONObjectContainingPhotoId(String photoID, Object json) { + // Termination condition + if (json instanceof JSONObject && ((JSONObject) json).has("id") + && ((JSONObject) json).optString("id").equals(photoID)) { + return ((JSONObject) json); + } + + if (json instanceof JSONObject) { + // Iterate through every key:value pair in the json. + Iterator iterator = ((JSONObject) json).keys(); + while (iterator.hasNext()) { + Object o = ((JSONObject) json).get(iterator.next()); + JSONObject responseJson = findJSONObjectContainingPhotoId(photoID, o); + if (responseJson != null) { + return responseJson; + } + } + + } + + if (json instanceof JSONArray) { + // Iterate through every array value in the json + for (Object o : (JSONArray) json) { + if (o instanceof JSONObject || o instanceof JSONArray) { + JSONObject responseJson = findJSONObjectContainingPhotoId(photoID, o); + if (responseJson != null) { + return responseJson; + } + } + } + } + + return null; + } + + /** + * Find the best source url( with highest resolution). + * @param json JSONObject containing src urls. + * @return Url string for the image src or null. + */ + public String getBestSourceUrl(JSONObject json) { + String bestSourceKey = null; + int bestSourceResolution = 0; + Iterator iterator = json.keys(); + + while (iterator.hasNext()) { + String key = iterator.next(); + Object o = json.get(key); + // JSON contains source urls in the below format. Check VkRipperTest.java for sample json. + // {..., + // "x_src":"src-url", + // "x_": ["incomplete-url", width, height], + // ...} + if (o instanceof JSONArray && ((JSONArray) o).length() == 3 + && !((JSONArray) o).optString(0).equals("") && ((JSONArray) o).optInt(1) != 0 + && ((JSONArray) o).optInt(2) != 0 && json.has(key + "src")) { + if (((JSONArray) o).optInt(1) * ((JSONArray) o).optInt(2) >= bestSourceResolution) { + bestSourceResolution = ((JSONArray) o).optInt(1) * ((JSONArray) o).optInt(2); + bestSourceKey = key; + } + } + } + + // In case no suitable source has been found, we fall back to the older way. + if(bestSourceKey == null) { + for (String key : new String[] {"z_src", "y_src", "x_src", "w_src"}) { + if(!json.has(key)) { + continue; + } + return json.getString(key); + } + }else { + return json.getString(bestSourceKey + "src"); + } + + return null; + } + + /** + * Common function to get the next page( containing next batch of images). + * @return JSONObject containing entries of "imgId": "src" + * @throws IOException + */ + private JSONObject getPage() throws IOException { + Map photoIDsToURLs = new HashMap<>(); + Map postData = new HashMap<>(); + + LOGGER.info("Retrieving " + this.url + " from offset " + offset); + postData.put("al", "1"); + postData.put("offset", Integer.toString(offset)); + postData.put("part", "1"); + Document doc = + Http.url(this.url).referrer(this.url).ignoreContentType().data(postData).post(); + String body = doc.toString(); + if (!body.contains(" elements = doc.select("a"); + Set photoIDsToGet = new HashSet<>(); + for (Element a : elements) { + if (!a.attr("onclick").contains("showPhoto('")) { + continue; + } + String photoID = a.attr("onclick"); + photoID = photoID.substring(photoID.indexOf("showPhoto('") + "showPhoto('".length()); + photoID = photoID.substring(0, photoID.indexOf("'")); + if (!photoIDsToGet.contains(photoID)) { + photoIDsToGet.add(photoID); + } + } + for (String photoID : photoIDsToGet) { + if (!photoIDsToURLs.containsKey(photoID)) { + try { + photoIDsToURLs.putAll(getPhotoIDsToURLs(photoID)); + } catch (IOException e) { + LOGGER.error("Exception while retrieving photo id " + photoID, e); + continue; + } + } + if (!photoIDsToURLs.containsKey(photoID)) { + LOGGER.error("Could not find URL for photo ID: " + photoID); + continue; + } + if (isStopped() || isThisATest()) { + break; + } + } + + offset += elements.size(); + // Slight hack to make this into effectively a JSON ripper + return new JSONObject(photoIDsToURLs); + } } diff --git a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VkRipperTest.java b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VkRipperTest.java index 22ccb641..327698bd 100644 --- a/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VkRipperTest.java +++ b/src/test/java/com/rarchives/ripme/tst/ripper/rippers/VkRipperTest.java @@ -4,6 +4,7 @@ import java.io.IOException; import java.net.URL; import com.rarchives.ripme.ripper.rippers.VkRipper; +import org.json.JSONObject; import org.junit.jupiter.api.Test; public class VkRipperTest extends RippersTest { @@ -17,11 +18,6 @@ public class VkRipperTest extends RippersTest { // EXAMPLE: https://vk.com/album45506334_101886701 (a single album - custom) @Test public void testVkAlbumHttpRip() throws IOException { - VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0")); - testRipper(ripper); - } - @Test - public void testVkAlbumHttpsRip() throws IOException { VkRipper ripper = new VkRipper(new URL("https://vk.com/album45506334_0")); testRipper(ripper); } @@ -30,4 +26,26 @@ public class VkRipperTest extends RippersTest { VkRipper ripper = new VkRipper(new URL("https://vk.com/photos45506334")); testRipper(ripper); } + + @Test + public void testFindJSONObjectContainingPhotoID() throws IOException { + VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0")); + String json = + "{\"payload\":[0,[\"album-45984105_268691406\",18,14,[{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"tagged\":[],\"likes\":0,\"shares\":0,\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960],\"w_src\":\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU.jpg\",\"w_\":[\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU\",1405,1054]}]]],\"langVersion\":\"4298\"}"; + String responseJson = + "{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"tagged\":[],\"likes\":0,\"shares\":0,\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960],\"w_src\":\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU.jpg\",\"w_\":[\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU\",1405,1054]}"; + + assertTrue( + ripper.findJSONObjectContainingPhotoId("-45984105_457345201", new JSONObject(json)) + .similar(new JSONObject(responseJson))); + } + + @Test + public void testGetBestSourceUrl() throws IOException { + VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0")); + String json = + "{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"commcount\":0,\"date\":\"3 Dec at 1:14 am\",\"tagged\":[],\"attached_tags\":{\"max_tags_per_object\":5},\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"y_src\":\"https://sun9-9.userapi.com/c857520/v857520962/10e249/dUDeuY10s0A.jpg\",\"y_\":[\"https://sun9-9.userapi.com/c857520/v857520962/10e249/dUDeuY10s0A\",807,605],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960]}"; + assertEquals("https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg", + ripper.getBestSourceUrl(new JSONObject(json))); + } }