#!/usr/bin/env python
# -*- coding: utf-8 -*-

# A script to download full soundtracks from KHInsider.

# __future__ import for forwards compatibility with Python 3
from __future__ import print_function
from __future__ import unicode_literals

import os
import re
import sys
from functools import wraps

try:
    from urllib.parse import unquote, urljoin, urlsplit
except ImportError:  # Python 2
    from urlparse import unquote, urljoin, urlsplit

try:  # Python 2
    from os import getcwdu as getcwd
except ImportError:
    from os import getcwd


class Silence(object):
    """Context manager that discards everything written to stdout/stderr.

    While the `with` body runs, `sys.stdout` and `sys.stderr` point at
    `os.devnull`. On exit the previous streams are put back and the
    devnull handle is closed, so repeated use does not leak file
    descriptors. Exceptions raised in the body are not suppressed.
    """

    def __enter__(self):
        self._stdout = sys.stdout
        self._stderr = sys.stderr
        # One shared sink is enough for both streams.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        sys.stderr = self._devnull
        return self

    def __exit__(self, *_):
        # Restore first, so the real streams are back even if close() fails.
        sys.stdout = self._stdout
        sys.stderr = self._stderr
        self._devnull.close()


# --- Install prerequisites ---

# (This section in `if __name__ == '__main__':` is entirely unrelated to the
# rest of the module, and doesn't even run if the module isn't run by itself.)

if __name__ == '__main__':
    # To check for the existence of modules without importing them.
    # Apparently imp and importlib are a forest of deprecation!
    # The API was changed once in 3.3 (deprecating imp),
    # and then again in 3.4 (deprecating the 3.3 API).
    # So.... we have to do this dance to avoid deprecation warnings.
    try:
        try:
            from importlib.util import find_spec as find_module  # Python 3.4+
        except ImportError:
            from importlib import find_loader as find_module  # Python 3.3
    except ImportError:
        from imp import find_module  # Python 2

    # User-friendly name, import name, pip specification.
if __name__ == '__main__':
    # Each entry: [display name, importable module name, pip requirement].
    requiredModules = [
        ['requests', 'requests', 'requests >= 2.0.0, < 3.0.0'],
        ['Beautiful Soup 4', 'bs4', 'beautifulsoup4 >= 4.4.0, < 5.0.0']
    ]

    def moduleExists(name):
        """Return True if the module `name` can be found, without importing it."""
        try:
            result = find_module(name)
        except ImportError:
            return False
        else:
            return result is not None

    def neededInstalls(requiredModules=requiredModules):
        """Return the entries of `requiredModules` that are not installed."""
        return [module for module in requiredModules
                if not moduleExists(module[1])]

    def install(package):
        """Install `package` (a pip requirement string) using pip, silently.

        Raise OSError if pip exits with a non-zero status.
        """
        # The context manager makes sure the devnull handle is closed again.
        with open(os.devnull, 'w') as nowhere:
            exitStatus = subprocess.call(
                [sys.executable, '-m', 'pip', 'install', package],
                stdout=nowhere, stderr=nowhere)
        if exitStatus != 0:
            raise OSError("Failed to install package.")

    def installModules(modules, verbose=True):
        """Install every entry of `modules`, in order.

        Print progress and, on failure, a hint to stderr if `verbose` is
        set. The OSError from a failed install is re-raised.
        """
        for module in modules:
            if verbose:
                print("Installing {}...".format(module[0]))

            try:
                install(module[2])
            except OSError:
                if verbose:
                    print("Failed to install {}. "
                          "You may need to run the script as an administrator "
                          "or superuser.".format(module[0]),
                          file=sys.stderr)
                    print("You can also try to install the package manually "
                          "(pip install \"{}\")".format(module[2]),
                          file=sys.stderr)
                raise  # Bare raise keeps the original traceback.

    def installRequiredModules(needed=None, verbose=True):
        """Install the modules in `needed`.

        If `needed` is None, the list of missing modules is computed with
        neededInstalls().
        """
        needed = neededInstalls() if needed is None else needed
        # Install what the caller asked for rather than recomputing the list.
        installModules(needed, verbose)

    needed = neededInstalls()
    if needed:
        if moduleExists('pip'):
            # Needed to call pip the official way.
            import subprocess
        else:
            print("You don't seem to have pip installed!", file=sys.stderr)
            print("Get it from https://pip.readthedocs.org/en/latest/installing.html", file=sys.stderr)
            sys.exit(1)

        try:
            installRequiredModules(needed)
        except OSError:
            sys.exit(1)

# ------

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://downloads.khinsider.com/'

# Although some of these are valid on Linux, keeping this the same
# across systems is nice for consistency AND it works on WSL.
FILENAME_INVALID_RE = re.compile(r'[<>:"/\\|?*]')

# Names that are unusable as-is: empty/relative names, '~', and the Windows
# device names. Compared case-insensitively, since Windows treats "con" and
# "CON" as the same device.
RESERVED_FILENAMES = frozenset(
    ['', '.', '..', '~', 'CON', 'PRN', 'AUX', 'NUL'] +
    ['COM{}'.format(n) for n in range(1, 10)] +
    ['LPT{}'.format(n) for n in range(1, 10)]
)


def to_valid_filename(s):
    """Return `s` turned into a filename that is safe on every platform.

    Trailing spaces and dots are stripped, reserved names get a '_'
    appended, and characters matched by FILENAME_INVALID_RE are replaced
    with '-'.
    """
    # Windows's Explorer doesn't handle filenames that end in ' ' or '.'.
    s = s.rstrip(' .')
    if s.upper() in RESERVED_FILENAMES:
        return s + '_'
    return FILENAME_INVALID_RE.sub('-', s)


# Different printin' for different Pythons.
def unicodePrint(*args, **kwargs):
    """print() that replaces characters stdout's encoding can't represent.

    Text arguments are round-tripped through the encoding of `sys.stdout`
    (UTF-8 if it is unknown) with unencodable characters replaced; other
    arguments and all keyword arguments are passed to print() unchanged.
    """
    unicodeType = str if sys.version_info[0] > 2 else unicode  # noqa: F821
    encoding = sys.stdout.encoding or 'utf-8'
    args = [
        arg.encode(encoding, 'replace').decode(encoding)
        if isinstance(arg, unicodeType) else arg
        for arg in args
    ]
    print(*args, **kwargs)


def lazyProperty(func):
    """Decorator: a read-only property computed once per instance.

    The result of `func(self)` is stored on the instance under
    '_lazy_<name>' and returned from there on later accesses.
    """
    attrName = '_lazy_' + func.__name__

    @property
    @wraps(func)
    def lazyVersion(self):
        if not hasattr(self, attrName):
            setattr(self, attrName, func(self))
        return getattr(self, attrName)
    return lazyVersion


def getSoup(*args, **kwargs):
    """requests.get() the given arguments and parse the response as HTML."""
    r = requests.get(*args, **kwargs)
    return toSoup(r)


REMOVE_RE = re.compile(br"^\s*$", re.MULTILINE)
BAD_AMPERSAND_RE = re.compile(br"&#([^0-9x]|x[^0-9A-Fa-f])")


def fixBrokenHtml(content):
    """Return the bytes `content` with known markup errors repaired.

    Spans matched by REMOVE_RE are dropped, and an '&#' that does not
    start a valid numeric character reference gets its ampersand escaped
    so that it is treated as literal text.
    """
    content = REMOVE_RE.sub(b'', content)
    # Must be a raw literal: in a plain b'...' literal, '\1' is the byte
    # 0x01 instead of a backreference to the captured character.
    return BAD_AMPERSAND_RE.sub(br'&amp;#\1', content)


def toSoup(r):
    """Return a BeautifulSoup document for the response `r`."""
    # Fix errors in khinsider's HTML.
    content = fixBrokenHtml(r.content)
    # BS4 outputs unsuppressable error messages when it can't
    # decode the input bytes properly. This... suppresses them.
    with Silence():
        return BeautifulSoup(content, 'html.parser')


def getAppropriateFile(song, formatOrder):
    """Return the file of `song` that best matches `formatOrder`.

    `formatOrder` is a list of lowercase extensions without the dot, most
    preferred first. If it is None, or none of the extensions is
    available, the song's first file is returned.
    """
    if formatOrder is None:
        return song.files[0]

    for extension in formatOrder:
        for file in song.files:
            if os.path.splitext(file.filename)[1][1:].lower() == extension:
                return file

    return song.files[0]


def friendlyDownloadFile(file, path, index, total, verbose=False):
    """Download `file` into the directory `path`, reporting progress.

    `index` and `total` are only used for the "3/12"-style progress text.
    `file` may be None for a song whose page does not exist. A file that
    already exists on disk is not downloaded again. A download is
    attempted up to three times on connection errors and timeouts.

    Return True if the file is on disk afterwards, False if it was
    skipped because it is nonexistent or could not be downloaded.
    """
    numberStr = "{}/{}".format(
        str(index).zfill(len(str(total))),
        str(total)
    )

    # A missing file is a failure whether or not we are asked to report it.
    if file is None:
        if verbose:
            print("Song {} is nonexistent (404: Not Found). Skipping over.".format(numberStr), file=sys.stderr)
        return False

    encoding = sys.getfilesystemencoding()
    # Fun(?) fact: on Python 2, sys.getfilesystemencoding returns 'mbcs' even
    # on Windows NT (1993!) and later where filenames are natively Unicode.
    encoding = 'utf-8' if encoding == 'mbcs' else encoding
    filename = file.filename.encode(encoding, 'replace').decode(encoding)
    byTheWay = ""
    if filename != file.filename:
        byTheWay = " (replaced characters not in the filesystem's \"{}\" encoding)".format(encoding)
    filename = to_valid_filename(filename)
    path = os.path.join(path, filename)

    if os.path.exists(path):
        if verbose:
            unicodePrint("Skipping over {}: {}{}. Already exists.".format(numberStr, filename, byTheWay))
        return True

    if verbose:
        unicodePrint("Downloading {}: {}{}...".format(numberStr, filename, byTheWay))
    for triesElapsed in range(3):
        if verbose and triesElapsed:
            unicodePrint("Couldn't download {}. Trying again...".format(filename), file=sys.stderr)
        try:
            file.download(path)
        except (requests.ConnectionError, requests.Timeout):
            pass
        else:
            break
    else:
        # Every attempt failed.
        if verbose:
            unicodePrint("Couldn't download {}. Skipping over.".format(filename), file=sys.stderr)
        return False

    return True


class KhinsiderError(Exception):
    """Base class for the errors specific to this module."""
    pass


class NonexistentSongError(KhinsiderError):
    """Raised when a song's page does not exist."""
    pass


class SoundtrackError(KhinsiderError):
    """Base class for errors about one soundtrack (`self.soundtrack`)."""

    def __init__(self, soundtrack):
        self.soundtrack = soundtrack


class NonexistentSoundtrackError(SoundtrackError, ValueError):
    """Raised when the requested soundtrack does not exist."""

    def __str__(self):
        # Overly long IDs are left out of the message.
        ost = '"{}" '.format(self.soundtrack.id) if len(self.soundtrack.id) <= 80 else ""
        s = "The soundtrack {}does not exist.".format(ost)
        return s


class NonexistentFormatsError(SoundtrackError, ValueError):
    """Raised when a soundtrack offers none of the requested formats."""

    def __init__(self, soundtrack, requestedFormats):
        super(NonexistentFormatsError, self).__init__(soundtrack)
        self.requestedFormats = requestedFormats

    def __str__(self):
        # Overly long IDs are left out of the message.
        ost = '"{}" '.format(self.soundtrack.id) if len(self.soundtrack.id) <= 80 else ""
        s = "The soundtrack {}is not available in the requested formats ({}).".format(
            ost,
            ", ".join('"{}"'.format(extension) for extension in self.requestedFormats))
        return s


class Soundtrack(object):
    """A KHInsider soundtrack. Initialize with a soundtrack ID.

    Properties:
    * id:     The soundtrack's unique ID, used at the end of its URL.
    * url:    The full URL of the soundtrack.
    * name:   The textual title of the soundtrack.
    * availableFormats: A list of the formats the soundtrack is available in.
    * songs:  A list of Song objects representing the songs in the soundtrack.
    * images: A list of File objects representing the images in the soundtrack.
    """

    def __init__(self, soundtrackId):
        self.id = soundtrackId
        self.url = urljoin(BASE_URL, 'game-soundtracks/album/' + self.id)

    def __repr__(self):
        return "<{}: {}>".format(self.__class__.__name__, self.id)

    def _isLoaded(self, property):
        """Return True if the lazy property named `property` is cached."""
        return hasattr(self, '_lazy_' + property)

    @lazyProperty
    def _contentSoup(self):
        """The parsed 'pageContent' element of the soundtrack's page.

        Raise NonexistentSoundtrackError if the page says "No such album".
        """
        soup = getSoup(self.url)
        contentSoup = soup.find(id='pageContent')
        if contentSoup.find('p').string == "No such album":
            # The pageContent and p exist even if the soundtrack doesn't, so no
            # need for error handling here.
            raise NonexistentSoundtrackError(self)
        return contentSoup

    @lazyProperty
    def name(self):
        return self._contentSoup.find('h2').get_text(strip=True)

    @lazyProperty
    def availableFormats(self):
        table = self._contentSoup.find('table', id='songlist')
        header = table.find('tr')
        headings = [td.get_text(strip=True) for td in header(['th', 'td'])]
        # Every heading that isn't one of the fixed columns names a format.
        formats = [s.lower() for s in headings
                   if s not in {"", "Track", "Song Name", "Download", "Size"}]
        formats = formats or ['mp3']
        return formats

    @lazyProperty
    def songs(self):
        table = self._contentSoup.find('table', id='songlist')
        anchors = [tr.find('a') for tr in table('tr') if not tr.find('th')]
        # Rows without a link have no song page to point at; skip them.
        urls = [a['href'] for a in anchors if a is not None]
        songs = [Song(urljoin(self.url, url)) for url in urls]
        return songs

    @lazyProperty
    def images(self):
        anchors = [a for a in self._contentSoup('p')[1]('a') if a.find('img')]
        urls = [a['href'] for a in anchors]
        images = [File(urljoin(self.url, url)) for url in urls]
        return images

    def download(self, path='', makeDirs=True, formatOrder=None, verbose=False):
        """Download the soundtrack to the directory specified by `path`!

        Create any directories that are missing if `makeDirs` is set to True.

        Set `formatOrder` to a list of file extensions to specify the order
        in which to prefer file formats. If set to ['flac', 'ogg', 'mp3'], for
        example, FLAC files will be downloaded if available - if not, Ogg
        files, and if those aren't available, MP3 files.

        Print progress along the way if `verbose` is set to True.

        Return True if all files were downloaded successfully, False if not.
        Raise NonexistentFormatsError if none of the formats in `formatOrder`
        is available for this soundtrack.
        """
        path = os.path.join(getcwd(), path)
        path = os.path.abspath(os.path.realpath(path))
        if formatOrder:
            formatOrder = [extension.lower() for extension in formatOrder]
            if not set(self.availableFormats) & set(formatOrder):
                raise NonexistentFormatsError(self, formatOrder)

        if verbose and not self._isLoaded('songs'):
            print("Getting song list...")
        files = []
        for song in self.songs:
            try:
                files.append(getAppropriateFile(song, formatOrder))
            except NonexistentSongError:
                # Keep a placeholder so the numbering stays correct.
                files.append(None)
        files.extend(self.images)
        totalFiles = len(files)

        if makeDirs and not os.path.isdir(path):
            os.makedirs(path)

        success = True
        for fileNumber, file in enumerate(files, 1):
            if not friendlyDownloadFile(file, path, fileNumber, totalFiles, verbose):
                success = False

        return success


class Song(object):
    """A song on KHInsider.

    Properties:
    * url:   The full URL of the song page.
    * name:  The name of the song.
    * files: A list of the song's files - there may be several if the song
             is available in more than one format.
    """

    def __init__(self, url):
        self.url = url

    def __repr__(self):
        return "<{}: {}>".format(self.__class__.__name__, self.url)

    @lazyProperty
    def _soup(self):
        """The parsed song page.

        Raise NonexistentSongError if the request ends up on the 404 page.
        """
        r = requests.get(self.url, timeout=10)
        if r.url.rsplit('/', 1)[-1] == '404':
            raise NonexistentSongError("Nonexistent song page (404).")
        # Parse the response we already have instead of fetching it again.
        return toSoup(r)

    @lazyProperty
    def name(self):
        return self._soup('p')[2]('b')[1].get_text()

    @lazyProperty
    def files(self):
        # The path used to be /ost/..., and was changed to
        # /soundtracks/... - but who knows? It might change back!
        anchors = self._soup('a', href=re.compile(r'^https?://[^/]+/(?:soundtracks|ost)/.+$'))
        return [File(urljoin(self.url, a['href'])) for a in anchors]


class File(object):
    """A file belonging to a soundtrack on KHInsider.

    Properties:
    * url:      The full URL of the file.
    * filename: The file's... filename. You got it.
    """

    def __init__(self, url):
        self.url = url

        try:
            url = str(url)
        except UnicodeError:
            # Python 2's quote and unquote work with bytestrings.
            url = url.encode('utf-8')
        # str('/') makes sure the string doesn't get
        # converted to a Unicode string on Python 2.
        self.filename = unquote(url.rsplit(str('/'), 1)[-1])
        try:
            # In Python 2, unquote doesn't handle escaped UTF-8 characters
            # automatically, so we gotta decode them manually from bytes.
            self.filename = self.filename.decode('utf-8')
        except AttributeError:
            pass

    def __repr__(self):
        return "<{}: {}>".format(self.__class__.__name__, self.url)

    def download(self, path):
        """Download the file to `path`."""
        response = requests.get(self.url, timeout=10)
        with open(path, 'wb') as outFile:
            outFile.write(response.content)


def download(soundtrackId, path='', makeDirs=True, formatOrder=None, verbose=False):
    """Download the soundtrack with the ID `soundtrackId`.

    If `path` is None, a directory named after the soundtrack is used.
    See Soundtrack.download for more information.
    """
    soundtrack = Soundtrack(soundtrackId)
    soundtrack.name  # To consistently always load the content in advance.
    path = to_valid_filename(soundtrack.name) if path is None else path
    if verbose:
        unicodePrint("Downloading to \"{}\".".format(path))
    return soundtrack.download(path, makeDirs, formatOrder, verbose)


class SearchError(KhinsiderError):
    """Raised when a search could not be performed."""
    pass


def search(term):
    """Return a list of Soundtrack objects for the search term `term`.

    Raise SearchError, with the site's message, if the results page does
    not contain a result list.
    """
    r = requests.get(urljoin(BASE_URL, 'search'), params={'search': term})
    path = urlsplit(r.url).path
    # A request that ended up on a soundtrack page has exactly one result.
    if path.split('/', 2)[1] == 'game-soundtracks':
        return [Soundtrack(path.rsplit('/', 1)[-1])]

    soup = toSoup(r)
    try:
        anchors = soup('p')[1]('a')
    except IndexError:
        raise SearchError(soup.find('p').get_text(strip=True))

    soundtracks = []
    for a in anchors:
        curSoundtrack = Soundtrack(a['href'].split('/')[-1])
        # Pre-fill the cache of the lazy `name` property with the title from
        # the result list, so listing results needs no request per result.
        curSoundtrack._lazy_name = a.get_text(strip=True)
        soundtracks.append(curSoundtrack)

    return soundtracks


def printSearchResults(searchResults, file=sys.stdout):
    """Print one "<id> ...... <name>" line per soundtrack to `file`.

    The IDs are padded with dots so that the names line up. Nothing is
    printed for an empty list.
    """
    if not searchResults:
        return
    padLen = max(len(x.id) for x in searchResults)
    s = "".join(
        "{} {}. {}\n".format(soundtrack.id,
                             '.' * (padLen - len(soundtrack.id)),
                             soundtrack.name)
        for soundtrack in searchResults
    )
    unicodePrint(s, end="", file=file)


# --- And now for the execution. ---

if __name__ == '__main__':
    import argparse

    SCRIPT_NAME = os.path.split(sys.argv[0])[-1]

    # Tiny details!
    class KindArgumentParser(argparse.ArgumentParser):
        """ArgumentParser that answers any usage error with a friendly hint."""

        def error(self, message):
            print("No soundtrack specified! As the first parameter, use the name the soundtrack uses in its URL.", file=sys.stderr)
            print("If you want to, you can also specify an output directory as the second parameter.", file=sys.stderr)
            print("You can also search for soundtracks by using your search term as parameter - as long as it's not an existing soundtrack.", file=sys.stderr)
            print(file=sys.stderr)
            print("For detailed help and more options, run \"{} --help\".".format(SCRIPT_NAME), file=sys.stderr)
            sys.exit(1)

    # More tiny details!
# Command-line entry point: everything below only runs when the file is
# executed as a script.
if __name__ == '__main__':
    class ProperHelpFormatter(argparse.RawTextHelpFormatter):
        """Help formatter that capitalizes the "Usage: " prefix."""

        def add_usage(self, usage, actions, groups, prefix=None):
            if prefix is None:
                prefix = 'Usage: '
            return super(ProperHelpFormatter, self).add_usage(usage, actions, groups, prefix)

    def doIt():  # Only in a function to be able to stop after errors, really.
        """Parse the command line, then search or download accordingly.

        Return the exit status for the process: 0 on success, 1 if
        something could not be downloaded or found. Unexpected exceptions
        are re-raised after a request to report them has been printed.
        """
        parser = KindArgumentParser(description="Download entire soundtracks from KHInsider.\n\n"
                                    "Examples:\n"
                                    "%(prog)s jumping-flash\n"
                                    "%(prog)s katamari-forever \"music{}Katamari Forever OST\"\n"
                                    "%(prog)s --search persona\n"
                                    "%(prog)s --format flac mother-3".format(os.sep),
                                    epilog="Hope you enjoy the script!",
                                    formatter_class=ProperHelpFormatter,
                                    add_help=False)

        try:  # Even more tiny details!
            parser._positionals.title = "Positional arguments"
            parser._optionals.title = "Optional arguments"
        except AttributeError:
            pass

        parser.add_argument('soundtrack',
                            help="The ID of the soundtrack, used at the end of its URL (e.g. \"jumping-flash\").\n"
                            "May also simply be the URL of the soundtrack.\n"
                            "If it doesn't exist (or --search is specified, orrrr too many arguments are supplied),\n"
                            "all the positional arguments together are used as a search term.")
        parser.add_argument('outPath', metavar='download directory', nargs='?',
                            help="The directory to download the soundtrack to.\n"
                            "Defaults to creating a new directory with the soundtrack ID as its name.")
        parser.add_argument('trailingArguments', nargs=argparse.REMAINDER, help=argparse.SUPPRESS)

        parser.add_argument('-h', '--help', action='help', default=argparse.SUPPRESS,
                            help="Show this help and exit.")
        parser.add_argument('-f', '--format', default=None, metavar="...",
                            help="The file format in which to download the soundtrack (e.g. \"flac\").\n"
                            "You can also specify a comma-separated list of which formats to try\n"
                            "(for example, \"flac,mp3\": download FLAC if available, otherwise MP3).")
        parser.add_argument('-s', '--search', action='store_true',
                            help="Always search, regardless of whether the specified soundtrack ID exists or not.")

        arguments = parser.parse_args()

        try:
            soundtrack = arguments.soundtrack.decode(sys.getfilesystemencoding())
        except AttributeError:  # Python 3's argv is in Unicode
            soundtrack = arguments.soundtrack

        # Accept a full soundtrack URL in place of the bare ID. The host is
        # escaped so that its dots only match literal dots.
        urlRe = re.compile(r"^https?://" + re.escape(urlsplit(BASE_URL).netloc) +
                           r"/game-soundtracks/album/(?P<soundtrack>[^/]+)$",
                           re.IGNORECASE)
        m = urlRe.match(soundtrack)
        soundtrack = m.group('soundtrack') if m is not None else soundtrack

        outPath = arguments.outPath  # Can be None; handled in download().

        # I think this makes the most sense for people who aren't used to the
        # command line - this'll yield useful results even if you just type
        # in an entire soundtrack name as arguments without quotation marks.
        onlySearch = arguments.search or len(arguments.trailingArguments) > 1
        searchTerm = [soundtrack] + ([outPath] if arguments.outPath is not None else [])
        searchTerm += arguments.trailingArguments
        try:
            searchTerm = ' '.join(arg.decode(sys.getfilesystemencoding()) for arg in searchTerm)
        except AttributeError:  # Python 3, again
            searchTerm = ' '.join(searchTerm)
        searchTerm = searchTerm.replace('-', ' ')

        formatOrder = arguments.format
        if formatOrder:
            formatOrder = re.split(r',\s*', formatOrder)
            formatOrder = [extension.lstrip('.').lower() for extension in formatOrder]

        try:
            if onlySearch:
                try:
                    searchResults = search(searchTerm)
                except SearchError as e:
                    print("Couldn't search. {}".format(e.args[0]), file=sys.stderr)
                else:
                    if searchResults:
                        print("Soundtracks found (to download, "
                              "run \"{} soundtrack-name\"):".format(SCRIPT_NAME))
                        printSearchResults(searchResults)
                    else:
                        print("No soundtracks found.")
            else:
                try:
                    success = download(soundtrack, outPath, formatOrder=formatOrder, verbose=True)
                    if not success:
                        print("\nNot all files could be downloaded.", file=sys.stderr)
                        return 1
                except NonexistentSoundtrackError:
                    # Fall back to a search, to suggest soundtracks that exist.
                    try:
                        searchResults = search(searchTerm)
                    except SearchError:
                        searchResults = None
                    print("The soundtrack \"{}\" does not seem to exist.".format(soundtrack), file=sys.stderr)

                    if searchResults:  # aww yeah we gon' do some searchin'
                        print("\nThese exist, though:", file=sys.stderr)
                        printSearchResults(searchResults, file=sys.stderr)
                    elif searchResults is None:
                        print("A search for \"{}\" could not be performed either. "
                              "It may be too short.".format(searchTerm), file=sys.stderr)

                    return 1
                except NonexistentFormatsError as e:
                    s = ("Format{} not available. "
                         "The soundtrack \"{}\" is only available in the ").format(
                        "" if len(formatOrder) == 1 else "s", soundtrack)

                    formats = e.soundtrack.availableFormats
                    if len(formats) == 1:
                        s += "\"{}\" format.".format(formats[0])
                    else:
                        # "a", "b", and "c" - with the comma before "and"
                        # only when there are more than two formats.
                        s += "{}{} and \"{}\" formats.".format(
                            ", ".join('"{}"'.format(extension) for extension in formats[:-1]),
                            "," if len(formats) > 2 else "",
                            formats[-1])

                    print(s, file=sys.stderr)
                    return 1
                except KeyboardInterrupt:
                    print("Stopped download.", file=sys.stderr)
                    return 1
        except (requests.ConnectionError, requests.Timeout):
            print("Could not connect to KHInsider.", file=sys.stderr)
            print("Make sure you have a working internet connection.", file=sys.stderr)
            return 1
        except Exception:
            print(file=sys.stderr)
            print("An unexpected error occurred! "
                  "If it isn't too much to ask, please report to "
                  "https://github.com/obskyr/khinsider/issues.",
                  file=sys.stderr)
            print("Attach the following error message:", file=sys.stderr)
            print(file=sys.stderr)
            raise

        return 0

    sys.exit(doIt())