1
0
mirror of https://github.com/RipMeApp/ripme.git synced 2025-08-31 01:30:00 +02:00

Make PhotobucketRipper inherit AbstractHTMLRipper

Also change the API call used to check if an album has subalbums.
The new API call returns a JSON with more metadata.

The style is deliberately similar to NSFWRipper after refactoring
(commit: 5ae2bb43e8). Also, change
the waiting time between fetching pages from 1 second to 2 seconds,
as Photobucket seems to have fewer read timeouts with this value
(might just be a fluke of accessing it on mobile networks though).
This commit is contained in:
Peter Szakacs
2018-10-31 13:30:46 +01:00
parent da509663d1
commit 3a8b87578f
2 changed files with 268 additions and 151 deletions

View File

@@ -10,23 +10,63 @@ import java.util.regex.Pattern;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Connection.Response;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.rarchives.ripme.ripper.AlbumRipper;
import com.rarchives.ripme.ripper.AbstractHTMLRipper;
import com.rarchives.ripme.utils.Http;
public class PhotobucketRipper extends AbstractHTMLRipper {

    private static final String DOMAIN = "photobucket.com",
                                HOST   = "photobucket";

    // Photobucket serves at most this many items per library page.
    private static final int ITEMS_PER_PAGE = 24;
    // Pause between page fetches; 2s shows fewer read timeouts than 1s.
    private static final int WAIT_BEFORE_NEXT_PAGE = 2000;

    /**
     * Per-album state needed while ripping: the album's library URL, its
     * sanitized name (used as subdirectory), the sort order to request,
     * and the page-iteration counters.
     */
    private final class AlbumMetadata {
        private final String url;
        private final String location;
        private final int sortOrder;
        // page currently being ripped (1-based)
        private int currPage = 1;
        // total page count; filled in after the first page is fetched
        private int numPages;

        private AlbumMetadata(JSONObject data) {
            this.url = data.getString("url");
            // spaces replaced so location is usable as a directory name
            this.location = data.getString("location").replace(" ", "_");
            this.sortOrder = data.getInt("sortOrder");
        }

        private String getCurrPageURL() {
            return url + String.format("?sort=%d&page=%d", sortOrder, currPage);
        }
    }
private final Pattern collDataPattern;
private final Pattern pbURLPattern;
// all albums including sub-albums to rip
private List<AlbumMetadata> albums;
// the album currently being ripped
private AlbumMetadata currAlbum;
// a new index per album downloaded
private int index = 0;
public PhotobucketRipper(URL url) throws IOException {
super(url);
this.collDataPattern = Pattern.compile(
"^.*collectionData: (\\{.*}).*$", Pattern.DOTALL
);
this.pbURLPattern = Pattern.compile(
"^https?://([a-zA-Z0-9]+)\\.photobucket\\.com/user/" +
"([a-zA-Z0-9_\\-]+)/library/([^?]*).*$"
);
}
@Override
protected String getDomain() {
return DOMAIN;
}
@Override
@@ -34,45 +74,35 @@ public class PhotobucketRipper extends AlbumRipper {
return HOST;
}
@Override
public URL sanitizeURL(URL url) throws MalformedURLException {
LOGGER.info(url);
String u = url.toExternalForm();
if (u.contains("?")) {
// strip options from URL
u = u.substring(0, u.indexOf("?"));
return new URL(u);
}
else {
return url;
if (!u.endsWith("/")) {
// append trailing slash
u = u + "/";
}
}
public String getAlbumTitle(URL url) throws MalformedURLException {
try {
// Attempt to use album title as GID
if (pageResponse == null) {
pageResponse = Http.url(url).response();
}
Document albumDoc = pageResponse.parse();
Elements els = albumDoc.select("div.libraryTitle > h1");
if (els.isEmpty()) {
throw new IOException("Could not find libraryTitle at " + url);
}
return els.get(0).text();
} catch (IOException e) {
// Fall back to default album naming convention
}
return super.getAlbumTitle(url);
return new URL(u);
}
@Override
public String getGID(URL url) throws MalformedURLException {
Pattern p; Matcher m;
Matcher m;
URL sanitized = sanitizeURL(url);
// http://s844.photobucket.com/user/SpazzySpizzy/library/Lady%20Gaga?sort=3&page=1
p = Pattern.compile("^https?://[a-zA-Z0-9]+\\.photobucket\\.com/user/([a-zA-Z0-9_\\-]+)/library.*$");
m = p.matcher(url.toExternalForm());
m = pbURLPattern.matcher(sanitized.toExternalForm());
if (m.matches()) {
return m.group(1);
// the username is not really a unique GID, because the same user
// can have multiple albums, but on the other hand, using HOST_GID
// as save directory means we can group ripped albums of the same
// user.
return m.group(2);
}
throw new MalformedURLException(
@@ -81,134 +111,177 @@ public class PhotobucketRipper extends AlbumRipper {
+ " Got: " + url);
}
// Page iteration
@Override
public void rip() throws IOException {
List<String> subalbums = ripAlbumAndGetSubalbums(this.url.toExternalForm());
List<String> subsToRip = new ArrayList<>(),
rippedSubs = new ArrayList<>();
for (String sub : subalbums) {
subsToRip.add(sub);
protected Document getFirstPage() throws IOException {
if (this.currAlbum == null) {
this.albums = getAlbumMetadata(this.url.toExternalForm());
LOGGER.info("Detected " + albums.size() + " albums in total");
}
while (!subsToRip.isEmpty() && !isStopped()) {
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
break;
}
String nextSub = subsToRip.remove(0);
rippedSubs.add(nextSub);
LOGGER.info("Attempting to rip next subalbum: " + nextSub);
try {
pageResponse = null;
subalbums = ripAlbumAndGetSubalbums(nextSub);
} catch (IOException e) {
LOGGER.error("Error while ripping " + nextSub, e);
break;
}
for (String subalbum : subalbums) {
if (!subsToRip.contains(subalbum) && !rippedSubs.contains(subalbum)) {
subsToRip.add(subalbum);
}
}
}
waitForThreads();
this.currAlbum = this.albums.remove(0);
// NOTE: Why not just get media count in the metadata json?
//
// Because that data might not reflect what the user sees on the page
// and can lead to iterating more pages than there actually are.
//
// An example:
// Metadata JSON -> AlbumStats: 146 images + 0 videos -> 146 items/7 pages
// http://s1255.photobucket.com/api/user/mimajki/album/Movie%20gifs/get?subAlbums=48&json=1
// Actual item count when looking at the album url: 131 items/6 pages
// http://s1255.photobucket.com/user/mimajki/library/Movie%20gifs?sort=6&page=1
Document page = Http.url(currAlbum.getCurrPageURL()).get();
JSONObject collectionData = getCollectionData(page);
int totalNumItems = collectionData.getInt("total");
this.currAlbum.numPages = (int) Math.ceil(
(double)totalNumItems / (double) ITEMS_PER_PAGE);
this.index = 0;
return page;
}
private List<String> ripAlbumAndGetSubalbums(String theUrl) throws IOException {
int filesIndex = 0,
filesTotal = 0,
pageIndex = 0;
String currentAlbumPath = null,
url = null;
while (pageIndex == 0 || filesIndex < filesTotal) {
if (isStopped()) {
break;
}
pageIndex++;
if (pageIndex > 1 || pageResponse == null) {
url = theUrl + String.format("?sort=3&page=%d", pageIndex);
LOGGER.info(" Retrieving " + url);
pageResponse = Http.url(url).response();
}
Document albumDoc = pageResponse.parse();
// Retrieve JSON from request
String jsonString = null;
for (Element script : albumDoc.select("script[type=text/javascript]")) {
String data = script.data();
// Ensure this chunk of javascript contains the album info
if (!data.contains("libraryAlbumsPageCollectionData")) {
continue;
}
// Grab the JSON
Pattern p; Matcher m;
p = Pattern.compile("^.*collectionData: (\\{.*}).*$", Pattern.DOTALL);
m = p.matcher(data);
if (m.matches()) {
jsonString = m.group(1);
break;
}
}
if (jsonString == null) {
LOGGER.error("Unable to find JSON data at URL: " + url);
break;
}
JSONObject json = new JSONObject(jsonString);
JSONObject items = json.getJSONObject("items");
JSONArray objects = items.getJSONArray("objects");
filesTotal = items.getInt("total");
currentAlbumPath = json.getString("currentAlbumPath");
for (int i = 0; i < objects.length(); i++) {
JSONObject object = objects.getJSONObject(i);
String image = object.getString("fullsizeUrl");
filesIndex += 1;
addURLToDownload(new URL(image),
"",
object.getString("location").replaceAll(" ", "_"),
albumDoc.location(),
pageResponse.cookies());
}
@Override
public Document getNextPage(Document page) throws IOException {
currAlbum.currPage++;
boolean endOfAlbum = currAlbum.currPage > currAlbum.numPages;
boolean noMoreSubalbums = albums.isEmpty();
if (endOfAlbum && noMoreSubalbums){
throw new IOException("No more pages");
}
// Get subalbums
if (url != null) {
return getSubAlbums(url, currentAlbumPath);
} else {
return new ArrayList<>();
}
}
private List<String> getSubAlbums(String url, String currentAlbumPath) {
List<String> result = new ArrayList<>();
String subdomain = url.substring(url.indexOf("://")+3);
subdomain = subdomain.substring(0, subdomain.indexOf("."));
String apiUrl = "http://" + subdomain + ".photobucket.com/component/Albums-SubalbumList"
+ "?deferCollapsed=true"
+ "&albumPath=" + currentAlbumPath // %2Falbums%2Fab10%2FSpazzySpizzy"
+ "&json=1";
try {
LOGGER.info("Loading " + apiUrl);
JSONObject json = Http.url(apiUrl).getJSON();
JSONArray subalbums = json.getJSONObject("body").getJSONArray("subAlbums");
for (int i = 0; i < subalbums.length(); i++) {
String suburl =
"http://"
+ subdomain
+ ".photobucket.com"
+ subalbums.getJSONObject(i).getString("path");
suburl = suburl.replace(" ", "%20");
result.add(suburl);
}
} catch (IOException e) {
LOGGER.error("Failed to get subalbums from " + apiUrl, e);
Thread.sleep(WAIT_BEFORE_NEXT_PAGE);
} catch (InterruptedException e) {
LOGGER.info("Interrupted while waiting before getting next page");
}
if (endOfAlbum){
LOGGER.info("Turning to next album " + albums.get(0).url);
return getFirstPage();
} else {
LOGGER.info("Turning to page " + currAlbum.currPage +
" of album " + currAlbum.url);
return Http.url(currAlbum.getCurrPageURL()).get();
}
return result;
}
public boolean canRip(URL url) {
return url.getHost().endsWith(DOMAIN);
// Media parsing
@Override
protected List<String> getURLsFromPage(Document page) {
JSONObject collectionData = getCollectionData(page);
if (collectionData == null) {
LOGGER.error("Unable to find JSON data at URL: " + page.location());
return null;
} else {
return getImageURLs(collectionData);
}
}
private JSONObject getCollectionData(Document page){
// Retrieve JSON from a script tag in the returned document
for (Element script : page.select("script[type=text/javascript]")) {
String data = script.data();
// Ensure this chunk of javascript contains the album info
if (data.contains("libraryAlbumsPageCollectionData")) {
Matcher m = collDataPattern.matcher(data);
if (m.matches()) {
// Grab the JSON
return new JSONObject(m.group(1));
}
}
}
return null;
}
private List<String> getImageURLs(JSONObject json){
List<String> results = new ArrayList<>();
JSONObject items = json.getJSONObject("items");
JSONArray objects = items.getJSONArray("objects");
for (int i = 0; i < objects.length(); i++) {
JSONObject object = objects.getJSONObject(i);
String imgURL = object.getString("fullsizeUrl");
results.add(imgURL);
}
return results;
}
@Override
// Ignores the passed-in index and uses the ripper's own counter (reset to 0
// in getFirstPage()), so file numbering restarts for every (sub)album;
// currAlbum.location is the per-album save subdirectory.
protected void downloadURL(URL url, int index) {
addURLToDownload(url, getPrefix(++this.index), currAlbum.location);
}
// helper methods (for album metadata retrieval)
private List<AlbumMetadata> getAlbumMetadata(String albumURL)
throws IOException {
JSONObject data = getAlbumMetadataJSON(albumURL);
List<AlbumMetadata> metadata = new ArrayList<>();
metadata.add(new AlbumMetadata(data));
if (!data.getString("location").equals("")) {
// if the location were to equal "", then we are at the profile
// page of a user. Ripping all sub-albums here would mean ripping
// all albums of a user (Not supported, only rip items in a users
// personal bucket).
for (JSONObject sub : getSubAlbumJSONs(data)){
metadata.add(new AlbumMetadata(sub));
}
}
LOGGER.info("Succesfully retrieved and parsed metadata");
return metadata;
}
private JSONObject getAlbumMetadataJSON(String albumURL)
throws IOException {
String subdomain, user, albumTitle;
Matcher m = pbURLPattern.matcher(albumURL);
if (!m.matches()){
throw new MalformedURLException("invalid URL " + albumURL);
}
subdomain = m.group(1);
user = m.group(2);
albumTitle = m.group(3);
if (albumTitle.endsWith("/")){
albumTitle = albumTitle.substring(0, albumTitle.length() - 1);
}
String apiURL = String.format("http://%s.photobucket.com/api/user/" +
"%s/album/%s/get?subAlbums=%d&json=1",
subdomain, user, albumTitle, ITEMS_PER_PAGE);
LOGGER.info("Loading " + apiURL);
JSONObject data = Http.url(apiURL).getJSON().getJSONObject("data");
if (data.has("subAlbums")) {
int count = data.getInt("subAlbumCount");
if (count > ITEMS_PER_PAGE) {
apiURL = String.format("http://%s.photobucket.com/api/user/" +
"%s/album/%s/get?subAlbums=%d&json=1",
subdomain, user, albumTitle, count);
data = Http.url(apiURL).getJSON().getJSONObject("data");
}
}
return data;
}
private List<JSONObject> getSubAlbumJSONs(JSONObject data) {
List<JSONObject> subalbumJSONs = new ArrayList<>();
if (data.has("subAlbums")) {
JSONArray subalbums = data.getJSONArray("subAlbums");
for (int idx = 0; idx < subalbums.length(); idx++) {
JSONObject subalbumJSON = subalbums.getJSONObject(idx);
subalbumJSONs.add(subalbumJSON);
}
}
return subalbumJSONs;
}
// TODO: Probably want to add queue support for cases like this:
// http://s732.photobucket.com/user/doublesix66/library/WARZONE?sort=3&page=1
}

View File

@@ -15,7 +15,51 @@ public class PhotobucketRipperTest extends RippersTest {
deleteDir(ripper.getWorkingDir());
}
*/
}
/*
// new test, still commented out because of the issue above,
// since this test also involves network IO.
public void testGetNextPage() throws IOException {
// this album should have more than enough sub-albums and pages
// to serve as a pretty good iteration test (barring server or
// network errors)
String baseURL = "http://s1255.photobucket.com/user/mimajki/library/Movie%20gifs?sort=6&page=1";
URL url = new URL(baseURL);
PhotobucketRipper ripper = new PhotobucketRipper(url);
org.jsoup.nodes.Document page = null;
try {
// I'm not sure it makes much sense that getFirstPage()
// is not public while getNextPage() is.
java.lang.reflect.Method method = ripper.getClass()
.getDeclaredMethod("getFirstPage");
method.setAccessible(true);
page = (org.jsoup.nodes.Document) method.invoke(ripper);
} catch (Exception e){
e.printStackTrace();
fail("Calling getFirstPage() failed");
}
int numPagesRemaining = 38;
for (int idx = 0; idx < numPagesRemaining; idx++){
page = ripper.getNextPage(page);
System.out.println("URL: " + page.location());
}
try {
page = ripper.getNextPage(page);
fail("Get next page did not throw an exception on the last page");
} catch(IOException e){
assertEquals(e.getMessage(), "No more pages");
}
}*/
public void testGetGID() throws IOException {
URL url = new URL("http://s732.photobucket.com/user/doublesix66/library/Army%20Painter%20examples?sort=3&page=1");
PhotobucketRipper ripper = new PhotobucketRipper(url);
assertEquals("doublesix66", ripper.getGID(url));
url = new URL("http://s732.photobucket.com/user/doublesix66/library/Army%20Painter%20examples/Painting%20examples?page=1&sort=3");
assertEquals("doublesix66", ripper.getGID(url));
url = new URL("http://s844.photobucket.com/user/SpazzySpizzy/library/Album%20Covers");
assertEquals("SpazzySpizzy", ripper.getGID(url));
url = new URL("http://s844.photobucket.com/user/SpazzySpizzy/library");
assertEquals("SpazzySpizzy", ripper.getGID(url));
}
}