1
0
mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-14 09:54:40 +02:00

Merge pull request #1521 from Tush-r/vkfix

Fixed VkRipper and bumped JSON library.
This commit is contained in:
cyian-1756
2019-12-14 13:40:06 -05:00
committed by GitHub
3 changed files with 201 additions and 87 deletions

View File

@@ -46,7 +46,7 @@
<dependency> <dependency>
<groupId>org.json</groupId> <groupId>org.json</groupId>
<artifactId>json</artifactId> <artifactId>json</artifactId>
<version>20140107</version> <version>20190722</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>commons-configuration</groupId> <groupId>commons-configuration</groupId>

View File

@@ -6,10 +6,12 @@ import java.net.URL;
import java.util.*; import java.util.*;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import com.rarchives.ripme.ripper.AbstractJSONRipper; import com.rarchives.ripme.ripper.AbstractJSONRipper;
import org.json.JSONArray; import org.json.JSONArray;
import org.json.JSONObject; import org.json.JSONObject;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
@@ -26,6 +28,7 @@ public class VkRipper extends AbstractJSONRipper {
private RipType RIP_TYPE; private RipType RIP_TYPE;
private String oid; private String oid;
private int offset = 0;
public VkRipper(URL url) throws IOException { public VkRipper(URL url) throws IOException {
super(url); super(url);
@@ -59,68 +62,18 @@ public class VkRipper extends AbstractJSONRipper {
String[] jsonStrings = doc.toString().split("<!>"); String[] jsonStrings = doc.toString().split("<!>");
return new JSONObject(jsonStrings[jsonStrings.length - 1]); return new JSONObject(jsonStrings[jsonStrings.length - 1]);
} else { } else {
Map<String,String> photoIDsToURLs = new HashMap<>(); return getPage();
int offset = 0;
while (true) {
LOGGER.info(" Retrieving " + this.url);
Map<String,String> postData = new HashMap<>();
postData.put("al", "1");
postData.put("offset", Integer.toString(offset));
postData.put("part", "1");
Document doc = Http.url(this.url)
.referrer(this.url)
.ignoreContentType()
.data(postData)
.post();
String body = doc.toString();
if (!body.contains("<div")) {
break;
}
body = body.substring(body.indexOf("<div"));
doc = Jsoup.parseBodyFragment(body);
List<Element> elements = doc.select("a");
Set<String> photoIDsToGet = new HashSet<>();
for (Element a : elements) {
if (!a.attr("onclick").contains("showPhoto('")) {
LOGGER.error("a: " + a);
continue;
}
String photoID = a.attr("onclick");
photoID = photoID.substring(photoID.indexOf("showPhoto('") + "showPhoto('".length());
photoID = photoID.substring(0, photoID.indexOf("'"));
if (!photoIDsToGet.contains(photoID)) {
photoIDsToGet.add(photoID);
}
}
for (String photoID : photoIDsToGet) {
if (!photoIDsToURLs.containsKey(photoID)) {
try {
photoIDsToURLs.putAll(getPhotoIDsToURLs(photoID));
} catch (IOException e) {
LOGGER.error("Exception while retrieving photo id " + photoID, e);
continue;
}
}
if (!photoIDsToURLs.containsKey(photoID)) {
LOGGER.error("Could not find URL for photo ID: " + photoID);
continue;
}
if (isStopped() || isThisATest()) {
break;
}
}
if (elements.size() < 40 || isStopped() || isThisATest()) {
break;
}
offset += elements.size();
}
// Slight hack to make this into effectively a JSON ripper
return new JSONObject(photoIDsToURLs);
} }
} }
/**
 * Fetches the batch of photos that follows the one already processed.
 *
 * @param doc the previously returned page; not read by this implementation.
 * @return the next page as produced by {@code getPage()}, or null when the rip
 *         has been stopped or is running as a test.
 * @throws IOException propagated from {@code getPage()}.
 */
@Override
protected JSONObject getNextPage(JSONObject doc) throws IOException {
    // A stopped rip and a test run both end after the page already in hand.
    boolean finished = isStopped() || isThisATest();
    return finished ? null : getPage();
}
@Override @Override
protected List<String> getURLsFromJSON(JSONObject page) { protected List<String> getURLsFromJSON(JSONObject page) {
List<String> pageURLs = new ArrayList<>(); List<String> pageURLs = new ArrayList<>();
@@ -142,9 +95,9 @@ public class VkRipper extends AbstractJSONRipper {
pageURLs.add(videoURL); pageURLs.add(videoURL);
} }
} else { } else {
Iterator keys = page.keys(); Iterator<String> keys = page.keys();
while (keys.hasNext()) { while (keys.hasNext()) {
pageURLs.add(page.getString((String) keys.next())); pageURLs.add(page.getString(keys.next()));
} }
} }
return pageURLs; return pageURLs;
@@ -197,6 +150,7 @@ public class VkRipper extends AbstractJSONRipper {
else { else {
RIP_TYPE = RipType.IMAGE; RIP_TYPE = RipType.IMAGE;
} }
super.rip();
} }
private Map<String,String> getPhotoIDsToURLs(String photoID) throws IOException { private Map<String,String> getPhotoIDsToURLs(String photoID) throws IOException {
@@ -208,40 +162,182 @@ public class VkRipper extends AbstractJSONRipper {
postData.put("al", "1"); postData.put("al", "1");
postData.put("module", "photos"); postData.put("module", "photos");
postData.put("photo", photoID); postData.put("photo", photoID);
Document doc = Jsoup Response res = Jsoup.connect("https://vk.com/al_photos.php")
.connect("https://vk.com/al_photos.php")
.header("Referer", this.url.toExternalForm()) .header("Referer", this.url.toExternalForm())
.header("Accept", "*/*")
.header("Accept-Language", "en-US,en;q=0.5")
.header("Content-Type", "application/x-www-form-urlencoded")
.header("X-Requested-With", "XMLHttpRequest")
.ignoreContentType(true) .ignoreContentType(true)
.userAgent(USER_AGENT) .userAgent(USER_AGENT)
.timeout(5000) .timeout(5000)
.data(postData) .data(postData)
.post(); .method(Method.POST)
String jsonString = doc.toString(); .execute();
jsonString = jsonString.substring(jsonString.indexOf("<!json>") + "<!json>".length()); String jsonString = res.body();
jsonString = jsonString.substring(0, jsonString.indexOf("<!>")); JSONObject json = new JSONObject(jsonString);
JSONArray json = new JSONArray(jsonString); JSONObject photoObject = findJSONObjectContainingPhotoId(photoID, json);
for (int i = 0; i < json.length(); i++) { String bestSourceUrl = getBestSourceUrl(photoObject);
JSONObject jsonImage = json.getJSONObject(i);
for (String key : new String[] {"z_src", "y_src", "x_src"}) { if (bestSourceUrl != null) {
if (!jsonImage.has(key)) { photoIDsToURLs.put(photoID, bestSourceUrl);
continue; } else {
} LOGGER.error("Could not find image source for " + photoID);
photoIDsToURLs.put(jsonImage.getString("id"), jsonImage.getString(key));
break;
}
} }
return photoIDsToURLs; return photoIDsToURLs;
} }
@Override @Override
public String getGID(URL url) throws MalformedURLException { public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://(www\\.)?vk\\.com/(photos|album|videos)-?([a-zA-Z0-9_]+).*$"); Pattern p = Pattern.compile("^https?:\\/\\/(?:www\\.)?vk\\.com\\/((?:photos|album|videos)-?(?:[a-zA-Z0-9_]+).*$)");
Matcher m = p.matcher(url.toExternalForm()); Matcher m = p.matcher(url.toExternalForm());
if (!m.matches()) { if (!m.matches()) {
throw new MalformedURLException("Expected format: http://vk.com/album#### or vk.com/photos####"); throw new MalformedURLException("Expected format: http://vk.com/album#### or vk.com/photos####");
} }
int count = m.groupCount(); return m.group(1);
return m.group(count - 1) + m.group(count);
} }
/**
 * Searches a JSON tree depth-first for the object whose "id" entry equals the given photo id.
 *
 * @param photoID The photo id expected as the value of the "id" key.
 * @param json    Root of the search: a JSONObject or JSONArray. Any other value yields null.
 * @return The first JSONObject found with "id" equal to photoID, or null if there is none.
 */
public JSONObject findJSONObjectContainingPhotoId(String photoID, Object json) {
    if (json instanceof JSONObject) {
        JSONObject node = (JSONObject) json;
        // The node itself is the match: stop descending.
        if (node.has("id") && node.optString("id").equals(photoID)) {
            return node;
        }
        // Otherwise look into every value of the object; scalars simply yield null.
        Iterator<String> keys = node.keys();
        while (keys.hasNext()) {
            JSONObject match = findJSONObjectContainingPhotoId(photoID, node.get(keys.next()));
            if (match != null) {
                return match;
            }
        }
        return null;
    }
    if (json instanceof JSONArray) {
        for (Object element : (JSONArray) json) {
            // Only containers can hold the object we are after.
            if (!(element instanceof JSONObject) && !(element instanceof JSONArray)) {
                continue;
            }
            JSONObject match = findJSONObjectContainingPhotoId(photoID, element);
            if (match != null) {
                return match;
            }
        }
    }
    return null;
}
/**
 * Finds the best source url (the one with the highest resolution).
 *
 * <p>A size is described by two entries: {@code "<size>_src"} holding the complete url and
 * {@code "<size>_"} holding {@code [incomplete-url, width, height]}. The size with the largest
 * width * height that also has a {@code _src} entry wins. When no such pair exists, the
 * well-known {@code *_src} keys are tried from largest to smallest.
 *
 * @param json JSONObject containing src urls; may be null (e.g. when the photo was not found).
 * @return Url string for the image src, or null if json is null or holds no known source.
 */
public String getBestSourceUrl(JSONObject json) {
    // The lookup that produces this object returns null when the photo id is absent;
    // report "no source" instead of failing with a NullPointerException.
    if (json == null) {
        return null;
    }

    String bestSourceKey = null;
    // long so that width * height cannot overflow on unexpected values.
    long bestSourceResolution = 0;
    Iterator<String> iterator = json.keys();
    while (iterator.hasNext()) {
        String key = iterator.next();
        Object o = json.get(key);
        // JSON contains source urls in the below format. Check VkRipperTest.java for sample json.
        // {...,
        // "x_src":"src-url",
        // "x_": ["incomplete-url", width, height],
        // ...}
        if (!(o instanceof JSONArray)) {
            continue;
        }
        JSONArray size = (JSONArray) o;
        int width = size.optInt(1);
        int height = size.optInt(2);
        boolean describesSize = size.length() == 3 && !size.optString(0).equals("")
                && width != 0 && height != 0 && json.has(key + "src");
        if (!describesSize) {
            continue;
        }
        long resolution = (long) width * height;
        if (resolution >= bestSourceResolution) {
            bestSourceResolution = resolution;
            bestSourceKey = key;
        }
    }
    if (bestSourceKey != null) {
        return json.getString(bestSourceKey + "src");
    }

    // In case no suitable source has been found, we fall back to the older way.
    // "w" is the largest size (in the sample json w_ is 1405x1054 while z_ is 1280x960),
    // so it is tried first.
    for (String key : new String[] {"w_src", "z_src", "y_src", "x_src"}) {
        if (json.has(key)) {
            return json.getString(key);
        }
    }
    return null;
}
/**
 * Requests the batch of images located at the current {@code offset} and resolves every
 * photo id found in it to its source url. Advances {@code offset} by the number of anchors
 * in the batch.
 *
 * @return JSONObject containing entries of "imgId": "src", or null when the response
 *         contains no {@code <div} markup.
 * @throws IOException propagated from the page request.
 */
private JSONObject getPage() throws IOException {
    LOGGER.info("Retrieving " + this.url + " from offset " + offset);

    Map<String, String> requestParams = new HashMap<>();
    requestParams.put("al", "1");
    requestParams.put("offset", Integer.toString(offset));
    requestParams.put("part", "1");
    Document response = Http.url(this.url)
            .referrer(this.url)
            .ignoreContentType()
            .data(requestParams)
            .post();

    // Everything before the first <div is discarded; no <div at all ends paging.
    String html = response.toString();
    int firstDiv = html.indexOf("<div");
    if (firstDiv < 0) {
        return null;
    }
    html = StringEscapeUtils.unescapeJavaScript(html.substring(firstDiv));
    List<Element> anchors = Jsoup.parseBodyFragment(html).select("a");

    // Collect the distinct photo ids referenced by the anchors' onclick handlers.
    final String marker = "showPhoto('";
    Set<String> photoIDsToGet = new HashSet<>();
    for (Element anchor : anchors) {
        String onclick = anchor.attr("onclick");
        int markerAt = onclick.indexOf(marker);
        if (markerAt < 0) {
            continue;
        }
        String tail = onclick.substring(markerAt + marker.length());
        photoIDsToGet.add(tail.substring(0, tail.indexOf("'")));
    }

    Map<String, String> photoIDsToURLs = new HashMap<>();
    for (String photoID : photoIDsToGet) {
        if (!photoIDsToURLs.containsKey(photoID)) {
            try {
                photoIDsToURLs.putAll(getPhotoIDsToURLs(photoID));
            } catch (IOException e) {
                LOGGER.error("Exception while retrieving photo id " + photoID, e);
                continue;
            }
        }
        if (photoIDsToURLs.containsKey(photoID)) {
            // One resolved photo is enough for a test run; a stopped rip needs no more.
            if (isStopped() || isThisATest()) {
                break;
            }
        } else {
            LOGGER.error("Could not find URL for photo ID: " + photoID);
        }
    }

    offset += anchors.size();
    // Slight hack to make this into effectively a JSON ripper
    return new JSONObject(photoIDsToURLs);
}
} }

View File

@@ -4,6 +4,7 @@ import java.io.IOException;
import java.net.URL; import java.net.URL;
import com.rarchives.ripme.ripper.rippers.VkRipper; import com.rarchives.ripme.ripper.rippers.VkRipper;
import org.json.JSONObject;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
public class VkRipperTest extends RippersTest { public class VkRipperTest extends RippersTest {
@@ -17,11 +18,6 @@ public class VkRipperTest extends RippersTest {
// EXAMPLE: https://vk.com/album45506334_101886701 (a single album - custom) // EXAMPLE: https://vk.com/album45506334_101886701 (a single album - custom)
@Test @Test
public void testVkAlbumHttpRip() throws IOException { public void testVkAlbumHttpRip() throws IOException {
VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0"));
testRipper(ripper);
}
@Test
public void testVkAlbumHttpsRip() throws IOException {
VkRipper ripper = new VkRipper(new URL("https://vk.com/album45506334_0")); VkRipper ripper = new VkRipper(new URL("https://vk.com/album45506334_0"));
testRipper(ripper); testRipper(ripper);
} }
@@ -30,4 +26,26 @@ public class VkRipperTest extends RippersTest {
VkRipper ripper = new VkRipper(new URL("https://vk.com/photos45506334")); VkRipper ripper = new VkRipper(new URL("https://vk.com/photos45506334"));
testRipper(ripper); testRipper(ripper);
} }
/**
 * Checks that the photo object nested inside a sample "payload" response is located by its id.
 */
@Test
public void testFindJSONObjectContainingPhotoID() throws IOException {
    VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0"));
    final String photoId = "-45984105_457345201";
    // Full response: the photo object sits several array levels below "payload".
    String payloadJson =
            "{\"payload\":[0,[\"album-45984105_268691406\",18,14,[{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"tagged\":[],\"likes\":0,\"shares\":0,\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960],\"w_src\":\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU.jpg\",\"w_\":[\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU\",1405,1054]}]]],\"langVersion\":\"4298\"}";
    // The nested photo object on its own.
    String expectedJson =
            "{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"tagged\":[],\"likes\":0,\"shares\":0,\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960],\"w_src\":\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU.jpg\",\"w_\":[\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU\",1405,1054]}";
    JSONObject found =
            ripper.findJSONObjectContainingPhotoId(photoId, new JSONObject(payloadJson));
    assertTrue(found.similar(new JSONObject(expectedJson)));
}
/**
 * Checks that, among the o/y/z sizes of a sample photo object, the url of the largest
 * one (z, 1280x960) is returned.
 */
@Test
public void testGetBestSourceUrl() throws IOException {
    VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0"));
    String photoJson =
            "{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"commcount\":0,\"date\":\"<span class=\\\"rel_date\\\">3 Dec at 1:14 am</span>\",\"tagged\":[],\"attached_tags\":{\"max_tags_per_object\":5},\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"y_src\":\"https://sun9-9.userapi.com/c857520/v857520962/10e249/dUDeuY10s0A.jpg\",\"y_\":[\"https://sun9-9.userapi.com/c857520/v857520962/10e249/dUDeuY10s0A\",807,605],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960]}";
    String expectedUrl = "https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg";
    String actualUrl = ripper.getBestSourceUrl(new JSONObject(photoJson));
    assertEquals(expectedUrl, actualUrl);
}
} }