1
0
mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-14 01:44:12 +02:00

Fixed VkRipper not ripping images and bumped json library.

This commit is contained in:
Tushar
2019-12-10 01:04:44 +05:30
parent f9f0372173
commit b1e9adfbf3
3 changed files with 201 additions and 87 deletions

View File

@@ -46,7 +46,7 @@
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20140107</version>
<version>20190722</version>
</dependency>
<dependency>
<groupId>commons-configuration</groupId>

View File

@@ -6,10 +6,12 @@ import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import com.rarchives.ripme.ripper.AbstractJSONRipper;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Connection.Method;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -26,6 +28,7 @@ public class VkRipper extends AbstractJSONRipper {
private RipType RIP_TYPE;
private String oid;
private int offset = 0;
public VkRipper(URL url) throws IOException {
super(url);
@@ -59,68 +62,18 @@ public class VkRipper extends AbstractJSONRipper {
String[] jsonStrings = doc.toString().split("<!>");
return new JSONObject(jsonStrings[jsonStrings.length - 1]);
} else {
Map<String,String> photoIDsToURLs = new HashMap<>();
int offset = 0;
while (true) {
LOGGER.info(" Retrieving " + this.url);
Map<String,String> postData = new HashMap<>();
postData.put("al", "1");
postData.put("offset", Integer.toString(offset));
postData.put("part", "1");
Document doc = Http.url(this.url)
.referrer(this.url)
.ignoreContentType()
.data(postData)
.post();
String body = doc.toString();
if (!body.contains("<div")) {
break;
}
body = body.substring(body.indexOf("<div"));
doc = Jsoup.parseBodyFragment(body);
List<Element> elements = doc.select("a");
Set<String> photoIDsToGet = new HashSet<>();
for (Element a : elements) {
if (!a.attr("onclick").contains("showPhoto('")) {
LOGGER.error("a: " + a);
continue;
}
String photoID = a.attr("onclick");
photoID = photoID.substring(photoID.indexOf("showPhoto('") + "showPhoto('".length());
photoID = photoID.substring(0, photoID.indexOf("'"));
if (!photoIDsToGet.contains(photoID)) {
photoIDsToGet.add(photoID);
}
}
for (String photoID : photoIDsToGet) {
if (!photoIDsToURLs.containsKey(photoID)) {
try {
photoIDsToURLs.putAll(getPhotoIDsToURLs(photoID));
} catch (IOException e) {
LOGGER.error("Exception while retrieving photo id " + photoID, e);
continue;
}
}
if (!photoIDsToURLs.containsKey(photoID)) {
LOGGER.error("Could not find URL for photo ID: " + photoID);
continue;
}
if (isStopped() || isThisATest()) {
break;
}
}
if (elements.size() < 40 || isStopped() || isThisATest()) {
break;
}
offset += elements.size();
}
// Slight hack to make this into effectively a JSON ripper
return new JSONObject(photoIDsToURLs);
return getPage();
}
}
/**
 * Produces the next batch of photo URLs for the JSON ripper loop.
 *
 * @param doc the page that was processed last; it is not read here because the
 *            position in the album is tracked through the {@code offset} field
 * @return whatever {@link #getPage()} returns, or null when the rip has been
 *         stopped or this is a test run
 * @throws IOException if {@link #getPage()} fails to retrieve the page
 */
@Override
protected JSONObject getNextPage(JSONObject doc) throws IOException {
    // Same short-circuit order as "stopped OR test": isThisATest() is only
    // consulted when the rip has not been stopped.
    boolean mayContinue = !isStopped() && !isThisATest();
    return mayContinue ? getPage() : null;
}
@Override
protected List<String> getURLsFromJSON(JSONObject page) {
List<String> pageURLs = new ArrayList<>();
@@ -142,9 +95,9 @@ public class VkRipper extends AbstractJSONRipper {
pageURLs.add(videoURL);
}
} else {
Iterator keys = page.keys();
Iterator<String> keys = page.keys();
while (keys.hasNext()) {
pageURLs.add(page.getString((String) keys.next()));
pageURLs.add(page.getString(keys.next()));
}
}
return pageURLs;
@@ -197,6 +150,7 @@ public class VkRipper extends AbstractJSONRipper {
else {
RIP_TYPE = RipType.IMAGE;
}
super.rip();
}
private Map<String,String> getPhotoIDsToURLs(String photoID) throws IOException {
@@ -208,40 +162,182 @@ public class VkRipper extends AbstractJSONRipper {
postData.put("al", "1");
postData.put("module", "photos");
postData.put("photo", photoID);
Document doc = Jsoup
.connect("https://vk.com/al_photos.php")
Response res = Jsoup.connect("https://vk.com/al_photos.php")
.header("Referer", this.url.toExternalForm())
.header("Accept", "*/*")
.header("Accept-Language", "en-US,en;q=0.5")
.header("Content-Type", "application/x-www-form-urlencoded")
.header("X-Requested-With", "XMLHttpRequest")
.ignoreContentType(true)
.userAgent(USER_AGENT)
.timeout(5000)
.data(postData)
.post();
String jsonString = doc.toString();
jsonString = jsonString.substring(jsonString.indexOf("<!json>") + "<!json>".length());
jsonString = jsonString.substring(0, jsonString.indexOf("<!>"));
JSONArray json = new JSONArray(jsonString);
for (int i = 0; i < json.length(); i++) {
JSONObject jsonImage = json.getJSONObject(i);
for (String key : new String[] {"z_src", "y_src", "x_src"}) {
if (!jsonImage.has(key)) {
continue;
}
photoIDsToURLs.put(jsonImage.getString("id"), jsonImage.getString(key));
break;
}
.method(Method.POST)
.execute();
String jsonString = res.body();
JSONObject json = new JSONObject(jsonString);
JSONObject photoObject = findJSONObjectContainingPhotoId(photoID, json);
String bestSourceUrl = getBestSourceUrl(photoObject);
if (bestSourceUrl != null) {
photoIDsToURLs.put(photoID, bestSourceUrl);
} else {
LOGGER.error("Could not find image source for " + photoID);
}
return photoIDsToURLs;
}
@Override
public String getGID(URL url) throws MalformedURLException {
Pattern p = Pattern.compile("^https?://(www\\.)?vk\\.com/(photos|album|videos)-?([a-zA-Z0-9_]+).*$");
Pattern p = Pattern.compile("^https?:\\/\\/(?:www\\.)?vk\\.com\\/((?:photos|album|videos)-?(?:[a-zA-Z0-9_]+).*$)");
Matcher m = p.matcher(url.toExternalForm());
if (!m.matches()) {
throw new MalformedURLException("Expected format: http://vk.com/album#### or vk.com/photos####");
}
int count = m.groupCount();
return m.group(count - 1) + m.group(count);
return m.group(1);
}
/**
 * Depth-first search for the nested JSON object whose "id" entry equals the given photo id.
 *
 * @param photoID The photo id to look for under the "id" key.
 * @param json The node to search: a JSONObject or a JSONArray. Any other value yields null.
 * @return The first JSONObject found whose "id" equals photoID, or null if there is none.
 */
public JSONObject findJSONObjectContainingPhotoId(String photoID, Object json) {
    if (json instanceof JSONObject) {
        JSONObject node = (JSONObject) json;

        // Base case: this object is the photo we are looking for.
        if (node.has("id") && node.optString("id").equals(photoID)) {
            return node;
        }

        // Otherwise descend into every value of the object.
        Iterator<String> keys = node.keys();
        while (keys.hasNext()) {
            Object child = node.get(keys.next());
            JSONObject match = findJSONObjectContainingPhotoId(photoID, child);
            if (match != null) {
                return match;
            }
        }
        return null;
    }

    if (json instanceof JSONArray) {
        // Only containers can hold the photo object, so scalars are skipped.
        for (Object element : (JSONArray) json) {
            if (!(element instanceof JSONObject) && !(element instanceof JSONArray)) {
                continue;
            }
            JSONObject match = findJSONObjectContainingPhotoId(photoID, element);
            if (match != null) {
                return match;
            }
        }
    }

    return null;
}
/**
 * Find the best source url (the one with the highest resolution).
 *
 * <p>Every entry shaped like {@code "x_": ["incomplete-url", width, height]} that has a
 * matching {@code "x_src"} entry is a candidate; the candidate with the largest
 * width * height wins. If the JSON carries no such size arrays, the well-known
 * {@code *_src} keys are tried from the largest VK size letter to the smallest.
 *
 * @param json JSONObject containing src urls; may be null (e.g. when the photo
 *             object could not be located in the response).
 * @return Url string for the image src, or null if json is null or holds no usable source.
 */
public String getBestSourceUrl(JSONObject json) {
    // The lookup that feeds this method can return null; treat that as "no source".
    if (json == null) {
        return null;
    }

    String bestSourceKey = null;
    // long, so that width * height cannot overflow for very large dimensions.
    long bestSourceResolution = 0;

    Iterator<String> iterator = json.keys();
    while (iterator.hasNext()) {
        String key = iterator.next();
        Object o = json.get(key);
        // JSON contains source urls in the below format. Check VkRipperTest.java for sample json.
        // {...,
        // "x_src":"src-url",
        // "x_": ["incomplete-url", width, height],
        // ...}
        if (!(o instanceof JSONArray)) {
            continue;
        }
        JSONArray sizeInfo = (JSONArray) o;
        int width = sizeInfo.optInt(1);
        int height = sizeInfo.optInt(2);
        boolean usable = sizeInfo.length() == 3
                && !sizeInfo.optString(0).equals("")
                && width != 0
                && height != 0
                && json.has(key + "src");
        if (!usable) {
            continue;
        }
        long resolution = (long) width * height;
        if (resolution >= bestSourceResolution) {
            bestSourceResolution = resolution;
            bestSourceKey = key;
        }
    }

    if (bestSourceKey != null) {
        return json.getString(bestSourceKey + "src");
    }

    // In case no suitable source has been found, we fall back to the older way.
    // "w" is the largest VK size letter, followed by "z", "y" and "x", so it is tried first.
    for (String key : new String[] {"w_src", "z_src", "y_src", "x_src"}) {
        if (json.has(key)) {
            return json.getString(key);
        }
    }
    return null;
}
/**
 * Common function to get the next page (containing the next batch of images).
 *
 * <p>Posts the current {@code offset} to the album url, collects the photo ids referenced
 * by {@code showPhoto('...')} onclick handlers in the returned markup, resolves each id to
 * an image url through {@link #getPhotoIDsToURLs(String)}, and finally advances
 * {@code offset} by the number of anchors found on the page.
 *
 * @return JSONObject containing entries of "imgId": "src", or null when the response
 *         contains no "&lt;div" markup at all.
 * @throws IOException if the request for the album page fails.
 */
private JSONObject getPage() throws IOException {
    final String marker = "showPhoto('";
    Map<String, String> photoIDsToURLs = new HashMap<>();
    Map<String, String> postData = new HashMap<>();

    LOGGER.info("Retrieving " + this.url + " from offset " + offset);
    postData.put("al", "1");
    postData.put("offset", Integer.toString(offset));
    postData.put("part", "1");
    Document doc =
            Http.url(this.url).referrer(this.url).ignoreContentType().data(postData).post();

    String body = doc.toString();
    int firstDiv = body.indexOf("<div");
    if (firstDiv < 0) {
        return null;
    }
    // Drop everything before the first <div, then undo the JavaScript escaping
    // so that the fragment can be parsed as HTML.
    body = StringEscapeUtils.unescapeJavaScript(body.substring(firstDiv));
    doc = Jsoup.parseBodyFragment(body);

    List<Element> elements = doc.select("a");
    Set<String> photoIDsToGet = new HashSet<>();
    for (Element a : elements) {
        String onclick = a.attr("onclick");
        int start = onclick.indexOf(marker);
        if (start < 0) {
            continue;
        }
        start += marker.length();
        int end = onclick.indexOf('\'', start);
        if (end < 0) {
            // Unterminated argument: skip this anchor instead of failing the whole page
            // with a StringIndexOutOfBoundsException.
            LOGGER.error("Could not parse photo ID from onclick: " + onclick);
            continue;
        }
        // A Set ignores duplicates by itself, no contains() check needed.
        photoIDsToGet.add(onclick.substring(start, end));
    }

    for (String photoID : photoIDsToGet) {
        if (!photoIDsToURLs.containsKey(photoID)) {
            try {
                photoIDsToURLs.putAll(getPhotoIDsToURLs(photoID));
            } catch (IOException e) {
                LOGGER.error("Exception while retrieving photo id " + photoID, e);
                continue;
            }
        }
        if (!photoIDsToURLs.containsKey(photoID)) {
            LOGGER.error("Could not find URL for photo ID: " + photoID);
            continue;
        }
        // Only reached after a photo has been resolved, so a test run stops
        // after its first successfully resolved photo.
        if (isStopped() || isThisATest()) {
            break;
        }
    }

    // NOTE(review): this counts every anchor on the page, not only the showPhoto ones.
    offset += elements.size();
    // Slight hack to make this into effectively a JSON ripper
    return new JSONObject(photoIDsToURLs);
}
}

View File

@@ -4,6 +4,7 @@ import java.io.IOException;
import java.net.URL;
import com.rarchives.ripme.ripper.rippers.VkRipper;
import org.json.JSONObject;
import org.junit.jupiter.api.Test;
public class VkRipperTest extends RippersTest {
@@ -17,11 +18,6 @@ public class VkRipperTest extends RippersTest {
// EXAMPLE: https://vk.com/album45506334_101886701 (a single album - custom)
@Test
public void testVkAlbumHttpRip() throws IOException {
VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0"));
testRipper(ripper);
}
@Test
public void testVkAlbumHttpsRip() throws IOException {
VkRipper ripper = new VkRipper(new URL("https://vk.com/album45506334_0"));
testRipper(ripper);
}
@@ -30,4 +26,26 @@ public class VkRipperTest extends RippersTest {
VkRipper ripper = new VkRipper(new URL("https://vk.com/photos45506334"));
testRipper(ripper);
}
@Test
public void testFindJSONObjectContainingPhotoID() throws IOException {
    VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0"));
    // Sample response in which the photo object is nested inside "payload" arrays.
    final String payload =
            "{\"payload\":[0,[\"album-45984105_268691406\",18,14,[{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"tagged\":[],\"likes\":0,\"shares\":0,\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960],\"w_src\":\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU.jpg\",\"w_\":[\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU\",1405,1054]}]]],\"langVersion\":\"4298\"}";
    // The nested object, on its own, that the search is expected to return.
    final String expectedPhoto =
            "{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"tagged\":[],\"likes\":0,\"shares\":0,\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960],\"w_src\":\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU.jpg\",\"w_\":[\"https://sun9-60.userapi.com/c857520/v857520962/10e24b/6ETsA15rAdU\",1405,1054]}";
    JSONObject found =
            ripper.findJSONObjectContainingPhotoId("-45984105_457345201", new JSONObject(payload));
    assertTrue(found.similar(new JSONObject(expectedPhoto)));
}
@Test
public void testGetBestSourceUrl() throws IOException {
    VkRipper ripper = new VkRipper(new URL("http://vk.com/album45506334_0"));
    // Size entries in this sample: o_ is 130x98, y_ is 807x605, z_ is 1280x960,
    // so the z_src url is the highest-resolution source.
    final String photo =
            "{\"id\":\"-45984105_457345201\",\"base\":\"https://sun9-37.userapi.com/\",\"commcount\":0,\"date\":\"<span class=\\\"rel_date\\\">3 Dec at 1:14 am</span>\",\"tagged\":[],\"attached_tags\":{\"max_tags_per_object\":5},\"o_src\":\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E.jpg\",\"o_\":[\"https://sun9-65.userapi.com/c857520/v857520962/10e24c/DPxygc3XW5E\",130,98],\"y_src\":\"https://sun9-9.userapi.com/c857520/v857520962/10e249/dUDeuY10s0A.jpg\",\"y_\":[\"https://sun9-9.userapi.com/c857520/v857520962/10e249/dUDeuY10s0A\",807,605],\"z_src\":\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg\",\"z_\":[\"https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI\",1280,960]}";
    final String expectedUrl =
            "https://sun9-41.userapi.com/c857520/v857520962/10e24a/EsDDQA36qKI.jpg";
    String actualUrl = ripper.getBestSourceUrl(new JSONObject(photo));
    assertEquals(expectedUrl, actualUrl);
}
}