mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-26 15:54:36 +02:00
better title regex to match titles surrounded by newlines
This commit is contained in:
@@ -66,9 +66,9 @@ URL_REGEX = re.compile(
     re.IGNORECASE,
 )
 HTML_TITLE_REGEX = re.compile(
-    r'<title>' # start matching text after <title> tag
+    r'<title.*?>' # start matching text after <title> tag
     r'(.[^<>]+)', # get everything up to these symbols
-    re.IGNORECASE,
+    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
 )
 
 ### Checks & Tests
|
Reference in New Issue
Block a user