mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-29 17:19:53 +02:00
add rudimentary method to parse back html index into urls
This commit is contained in:
@@ -3,7 +3,7 @@ __package__ = 'archivebox.legacy.storage'
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import List, Optional
|
from typing import List, Optional, Iterator
|
||||||
|
|
||||||
from ..schema import Link
|
from ..schema import Link
|
||||||
from ..config import (
|
from ..config import (
|
||||||
@@ -39,6 +39,18 @@ TITLE_LOADING_MSG = 'Not yet archived...'
|
|||||||
|
|
||||||
### Main Links Index
|
### Main Links Index
|
||||||
|
|
||||||
|
@enforce_types
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
    """Lazily yield the urls recorded in an archive's main html index.

    The file HTML_INDEX_FILENAME inside ``out_dir`` is read line by line.
    For every line containing the marker ``class="link-url"``, the first
    double-quoted value on that line is yielded (presumably the link's
    href -- confirm against the index row template). Nothing is yielded
    when the index file does not exist.
    """
    html_index = os.path.join(out_dir, HTML_INDEX_FILENAME)

    # Guard clause: without an index on disk there is nothing to parse.
    if not os.path.exists(html_index):
        return ()

    url_marker = 'class="link-url"'
    with open(html_index, 'r', encoding='utf-8') as index_file:
        # Stream matching rows one at a time; the file stays open only
        # while the caller is still consuming the generator.
        yield from (
            row.split('"')[1]
            for row in index_file
            if url_marker in row
        )
    return ()
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
|
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
|
||||||
"""write the html link index to a given path"""
|
"""write the html link index to a given path"""
|
||||||
|
Reference in New Issue
Block a user