1
0
mirror of https://github.com/pirate/ArchiveBox.git synced 2025-08-11 17:14:38 +02:00

Only add URL-list lines that are real URLs

This commit is contained in:
Nick Sweeting
2021-04-01 03:31:55 -04:00
parent d73f7d7d96
commit f59b6d4189

View File

@@ -1,12 +1,15 @@
__package__ = 'archivebox.parsers'
__description__ = 'URL list'
import re
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
enforce_types
enforce_types,
URL_REGEX,
)
@@ -17,7 +20,7 @@ def parse_url_list(text_file: IO[str], **_kwargs) -> Iterable[Link]:
text_file.seek(0)
for line in text_file.readlines():
url = line.strip()
if not url:
if (not url) or not re.findall(URL_REGEX, url):
continue
yield Link(