mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-21 21:57:26 +02:00
support finding multiple urls as substrings in text
This commit is contained in:
@@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
 
 
 URL_REGEX = re.compile(
+r'(?=('
 r'http[s]?://' # start matching from allowed schemes
 r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
 r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols
 r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
-r'[^\]\[\(\)<>"\'\s]+', # stop parsing at these symbols
+r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
+r'))',
 re.IGNORECASE,
 )
 
 
Reference in New Issue
Block a user