1
0
mirror of https://github.com/pirate/ArchiveBox.git synced 2025-08-21 05:41:54 +02:00

Add htmltotext extractor

Saves HTML text nodes and selected element attributes in
`htmltotext.txt` for each Snapshot. Intended primarily for
search indexing.
This commit is contained in:
Ross Williams
2023-10-23 21:42:25 -04:00
parent 6555719489
commit 310b4d1242
9 changed files with 203 additions and 104 deletions

View File

@@ -17,6 +17,7 @@ def disable_extractors_dict():
"USE_SINGLEFILE": "false",
"USE_READABILITY": "false",
"USE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",