From 24d4c446247aafbef9787cfb9fd9a78675437b52 Mon Sep 17 00:00:00 2001 From: jdcaballerov Date: Sat, 12 Dec 2020 07:36:31 -0500 Subject: [PATCH 1/4] Add ripgrep configs --- archivebox/config.py | 11 +++++++++++ archivebox/search/backends/ripgrep.py | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index a3444f07..d3e34151 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -161,6 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'USE_CHROME': {'type': bool, 'default': True}, 'USE_NODE': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, + 'USE_RIPGREP': {'type': bool, 'default': True}, 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, @@ -170,6 +171,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'NODE_BINARY': {'type': str, 'default': 'node'}, + 'RIPGREP_BINARY': {'type': str, 'default': 'rg'}, 'CHROME_BINARY': {'type': str, 'default': None}, 'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, @@ -312,6 +314,8 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'USE_RIPGREP': {'default': lambda c: c['USE_RIPGREP']}, + 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, @@ -827,6 +831,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_CHROME'], 'is_valid': bool(config['CHROME_VERSION']), }, + 'RIPGREP_BINARY': { + 'path': bin_path(config['RIPGREP_BINARY']), + 'version': config['RIPGREP_VERSION'], + 'hash': bin_hash(config['RIPGREP_BINARY']), + 'enabled': config['USE_RIPGREP'], + 'is_valid': bool(config['RIPGREP_VERSION']), + }, } def get_chrome_info(config: ConfigDict) -> ConfigValue: diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index e2e03c9b..b37eca20 100644 --- a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -2,7 +2,7 @@ import re from subprocess import run, PIPE, DEVNULL from typing import List, Generator -from archivebox.config import ARCHIVE_DIR +from archivebox.config import ARCHIVE_DIR, RIPGREP_BINARY from archivebox.util import enforce_types RG_IGNORE_EXTENSIONS = ('css','js','orig','svg') @@ -26,7 +26,7 @@ def flush(snapshot_ids: Generator[str, None, None]): @enforce_types def search(text: str) -> List[str]: - is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL) + is_rg_installed = run(['which', RIPGREP_BINARY], stdout=DEVNULL, stderr=DEVNULL) if is_rg_installed.returncode: raise Exception("ripgrep binary not found, install ripgrep to use this search backend") From 50df10886346f12d16124fd8cf5a09a41ff9ee3c Mon Sep 17 00:00:00 2001 From: jdcaballerov <743513+jdcaballerov@users.noreply.github.com> Date: Sat, 12 Dec 2020 08:34:00 -0500 Subject: [PATCH 2/4] Update archivebox/config.py Co-authored-by: Nick Sweeting --- archivebox/config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/archivebox/config.py b/archivebox/config.py index d3e34151..6c42eef5 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -314,7 +314,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, - 'USE_RIPGREP': {'default': lambda c: c['USE_RIPGREP']}, 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, From aa53f4f088bd5eca63db394d71597c32cdcb9d6c Mon Sep 17 00:00:00 2001 From: jdcaballerov <743513+jdcaballerov@users.noreply.github.com> Date: Sat, 12 Dec 2020 08:36:01 -0500 Subject: [PATCH 3/4] Update archivebox/search/backends/ripgrep.py Co-authored-by: Nick Sweeting --- archivebox/search/backends/ripgrep.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index b37eca20..b6532bfd 100644 --- a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -26,8 +26,7 @@ def flush(snapshot_ids: Generator[str, None, None]): @enforce_types def search(text: str) -> List[str]: - is_rg_installed = run(['which', RIPGREP_BINARY], stdout=DEVNULL, stderr=DEVNULL) - if is_rg_installed.returncode: + if not RIPGREP_VERSION: raise Exception("ripgrep binary not found, install ripgrep to use this search backend") from core.models import Snapshot @@ -44,4 +43,3 @@ def search(text: str) -> List[str]: snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] return snap_ids - From 9b6afa36a386c9e8f7c8d09c8f7a80ec70a285db Mon Sep 17 00:00:00 2001 From: jdcaballerov <743513+jdcaballerov@users.noreply.github.com> Date: Sat, 12 Dec 2020 08:36:08 -0500 Subject: [PATCH 4/4] Update archivebox/search/backends/ripgrep.py Co-authored-by: Nick Sweeting --- archivebox/search/backends/ripgrep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/search/backends/ripgrep.py b/archivebox/search/backends/ripgrep.py index b6532bfd..887a66d6 100644 --- a/archivebox/search/backends/ripgrep.py +++ b/archivebox/search/backends/ripgrep.py @@ -2,7 +2,7 @@ import re from subprocess import run, PIPE, DEVNULL from typing import List, Generator -from archivebox.config import ARCHIVE_DIR, RIPGREP_BINARY +from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION from archivebox.util import enforce_types RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')