Add htmltotext extractor

Saves HTML text nodes and selected element attributes to `htmltotext.txt` for each Snapshot. Intended primarily for search indexing.
2025-08-21 05:41:54 +02:00 · 2023-10-23 21:42:25 -04:00
parent 6555719489
commit 310b4d1242
9 changed files with 203 additions and 104 deletions
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -17,6 +17,7 @@ def disable_extractors_dict():
        "USE_SINGLEFILE": "false",
        "USE_READABILITY": "false",
        "USE_MERCURY": "false",
+        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",