mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-09-03 03:13:12 +02:00
Fix JSON parser by not always mangling the input
Rather than by assuming the JSON file we are parsing has junk at the beginning (which maybe only used to happen?), try parsing it as-is first, and then fall back to trying again after skipping the first line Fixes #1347
This commit is contained in:
@@ -18,9 +18,16 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||
|
||||
json_file.seek(0)
|
||||
|
||||
# sometimes the first line is a comment or filepath, so we get everything after the first {
|
||||
json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
|
||||
links = json.loads(json_file_json_str)
|
||||
try:
|
||||
links = json.load(json_file)
|
||||
except json.decoder.JSONDecodeError:
|
||||
# sometimes the first line is a comment or other junk, so try without
|
||||
json_file.seek(0)
|
||||
first_line = json_file.readline()
|
||||
#print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
|
||||
links = json.load(json_file)
|
||||
# we may fail again, which means we really don't know what to do
|
||||
|
||||
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
for link in links:
|
||||
@@ -59,11 +66,15 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
|
||||
elif link.get('name'):
|
||||
title = link['name'].strip()
|
||||
|
||||
tags = ''
|
||||
if link.get('tags'):
|
||||
tags = link.get('tags').replace(' ',',')
|
||||
|
||||
yield Link(
|
||||
url=htmldecode(url),
|
||||
timestamp=ts_str,
|
||||
title=htmldecode(title) or None,
|
||||
tags=htmldecode(link.get('tags')) or '',
|
||||
tags=htmldecode(tags),
|
||||
sources=[json_file.name],
|
||||
)
|
||||
|
||||
|
Reference in New Issue
Block a user