1
0
mirror of https://github.com/pirate/ArchiveBox.git synced 2025-08-13 18:14:24 +02:00

prefer dom dump to singlefile for generating readability output

This commit is contained in:
Nick Sweeting
2024-01-03 20:11:06 -08:00
parent 78d942ac22
commit db2984e47b

View File

@@ -66,7 +66,9 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
"""
canonical = link.canonical_outputs()
abs_path = path.absolute()
-sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+# prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
+sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
document = None
for source in sources:
try: