1
0
mirror of https://github.com/pirate/ArchiveBox.git synced 2025-08-29 17:19:53 +02:00

add rudimentary method to parse back html index into urls

This commit is contained in:
Nick Sweeting
2019-04-24 11:37:51 -04:00
parent 3825ddc095
commit e91cdfbc88

View File

@@ -3,7 +3,7 @@ __package__ = 'archivebox.legacy.storage'
import os import os
from datetime import datetime from datetime import datetime
from typing import List, Optional from typing import List, Optional, Iterator
from ..schema import Link from ..schema import Link
from ..config import ( from ..config import (
@@ -39,6 +39,18 @@ TITLE_LOADING_MSG = 'Not yet archived...'
### Main Links Index ### Main Links Index
@enforce_types
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
    """Yield the urls recorded in the main html index found under out_dir.

    Every line of the index file that contains the marker
    ``class="link-url"`` contributes one item: the text between the first
    pair of double quotes on that line.  Nothing is yielded when the index
    file does not exist.
    """
    html_index = os.path.join(out_dir, HTML_INDEX_FILENAME)
    if not os.path.exists(html_index):
        return ()

    url_marker = 'class="link-url"'
    with open(html_index, 'r', encoding='utf-8') as index_file:
        for row in index_file:
            if url_marker not in row:
                continue
            # the first double-quoted value on the line is taken as the url
            yield row.split('"', 2)[1]
    return ()
@enforce_types @enforce_types
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None: def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""write the html link index to a given path""" """write the html link index to a given path"""