mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-27 00:05:27 +02:00
fix fetch page title default
This commit is contained in:
@@ -212,15 +212,19 @@ def download_url(url):
     return source_path
 
 
-def fetch_page_title(url, default=None):
+def fetch_page_title(url, default=True):
     """Attempt to guess a page's title by downloading the html"""
+    if default is True:
+        default = url
 
     try:
         html_content = urllib.request.urlopen(url).read().decode('utf-8')
 
         match = re.search('<title>(.*?)</title>', html_content)
-        return match.group(1) if match else default
+        return match.group(1) if match else default or None
     except Exception:
+        if default is False:
+            raise
         return default
 
 
Reference in New Issue
Block a user