mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-29 01:00:11 +02:00
dedupe urls using exact url instead of fuzzy url
This commit is contained in:
@@ -149,11 +149,10 @@ def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     unique_urls: OrderedDict[str, Link] = OrderedDict()
 
     for link in sorted_links:
-        fuzzy = fuzzy_url(link.url)
-        if fuzzy in unique_urls:
+        if link.base_url in unique_urls:
             # merge with any other links that share the same url
-            link = merge_links(unique_urls[fuzzy], link)
-        unique_urls[fuzzy] = link
+            link = merge_links(unique_urls[link.base_url], link)
+        unique_urls[link.base_url] = link
 
     unique_timestamps: OrderedDict[str, Link] = OrderedDict()
     for link in unique_urls.values():
Reference in New Issue
Block a user