Mirror of https://github.com/pirate/ArchiveBox.git (synced 2025-08-29 09:10:13 +02:00).

Commit: dedupe urls using exact url instead of fuzzy url
This commit is contained in:
@@ -149,11 +149,10 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
 
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in sorted_links:
-        fuzzy = fuzzy_url(link.url)
-        if fuzzy in unique_urls:
+        if link.base_url in unique_urls:
             # merge with any other links that share the same url
-            link = merge_links(unique_urls[fuzzy], link)
+            link = merge_links(unique_urls[link.base_url], link)
-        unique_urls[fuzzy] = link
+        unique_urls[link.base_url] = link
 
     unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
Reference in New Issue
Block a user