Mirror of https://github.com/pirate/ArchiveBox.git (synced 2025-08-21 13:52:30 +02:00)
Fix JSON parser by not always mangling the input
Rather than assuming the JSON file we are parsing has junk at the beginning (which maybe only used to happen?), try parsing it as-is first, and then fall back to trying again after skipping the first line. Fixes #1347
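For context, a minimal sketch of the strategy the commit message describes, assuming hypothetical names; this is an illustration of the approach, not the actual ArchiveBox parser code:

    import json

    def parse_json_export(text):
        # Hypothetical sketch of the commit's strategy, not ArchiveBox code.
        try:
            # First attempt: parse the input exactly as given.
            return json.loads(text)
        except json.JSONDecodeError:
            # Fallback: tolerate a single junk line at the start (which the old
            # behaviour assumed was always present) by skipping up to the first
            # newline and parsing the rest.
            first_newline = text.find('\n')
            if first_newline == -1:
                raise
            return json.loads(text[first_newline + 1:])

The two tests added in the diff below cover both paths: example.json should parse on the first attempt, while example.json.bad (with leading garbage) exercises the fallback.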
@@ -91,3 +91,53 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
     assert (archived_item_path / "warc").exists()
     assert not (archived_item_path / "singlefile.html").exists()
 
+def test_json(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=json"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.example.com/should-not-exist" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+
+def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=json"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.example.com/should-not-exist" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+