From 4a19051f4a31cdd15b73e5782c5562780245903a Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Tue, 1 Oct 2024 21:45:18 -0700
Subject: [PATCH] change BaseExtractor to use new extract hookspec

---
 archivebox/abx/archivebox/base_extractor.py | 105 ++++++++++++++++----
 1 file changed, 84 insertions(+), 21 deletions(-)

diff --git a/archivebox/abx/archivebox/base_extractor.py b/archivebox/abx/archivebox/base_extractor.py
index 4ba47cdd..e3202d96 100644
--- a/archivebox/abx/archivebox/base_extractor.py
+++ b/archivebox/abx/archivebox/base_extractor.py
@@ -1,16 +1,19 @@
 __package__ = 'abx.archivebox'
 
+import json
+import socket
 from typing import Optional, List, Literal, Annotated, Dict, Any
 from typing_extensions import Self
-
 from pathlib import Path
 
 from pydantic import model_validator, AfterValidator
 from pydantic_pkgr import BinName
+from django.utils.functional import cached_property
 
 import abx
 
 from .base_hook import BaseHook, HookType
+from .base_binary import BaseBinary
 
 
 def no_empty_args(args: List[str]) -> List[str]:
@@ -49,37 +52,97 @@ class BaseExtractor(BaseHook):
         return Path(self.id.lower())
 
     def should_extract(self, snapshot) -> bool:
-        output_dir = self.get_output_path(snapshot)
-        if output_dir.glob('*.*'):
+        try:
+            assert self.BIN.version
+        except Exception:
+            # could not load binary
             return False
+
+        # output_dir = self.get_output_path(snapshot)
+        # if output_dir.glob('*.*'):
+        #     return False
         return True
 
-    # TODO: move this to a hookimpl
-    def extract(self, url: str, **kwargs) -> Dict[str, Any]:
-        output_dir = self.get_output_path(url, **kwargs)
+    @abx.hookimpl
+    def extract(self, snapshot_id: str) -> Dict[str, Any]:
+        from core.models import Snapshot
+        snapshot = Snapshot.objects.get(id=snapshot_id)
+
+        if not self.should_extract(snapshot):
+            return {}
+
+        from archivebox import CONSTANTS
+        # output_dir = self.get_output_path(snapshot) or CONSTANTS.TMP_DIR
+        output_dir = CONSTANTS.TMP_DIR / 'test'
+        output_dir.mkdir(parents=True, exist_ok=True)
 
-        cmd = [url, *self.args] if self.args is not None else [url, *self.default_args, *self.extra_args]
-        proc = self.exec(cmd, pwd=output_dir)
+        cmd = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args]
+        proc = self.exec(cmd, cwd=output_dir)
+
+        stdout = proc.stdout.strip()
+        stderr = proc.stderr.strip()
+        output_json = None
+        output_text = stdout
+        try:
+            output_json = json.loads(stdout.strip())
+            output_text = None
+        except json.JSONDecodeError:
+            pass
+
+        errors = []
+        if proc.returncode != 0:
+            errors.append(f'{self.BIN.name} returned non-zero exit code: {proc.returncode}')
+
+        # pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7
+        binary_str = f'{self.BIN.abspath}@{self.BIN.binprovider.name}:{self.BIN.binprovider.get_packages(self.BIN.name)}=={self.BIN.version}'
 
         return {
-            'status': 'succeeded' if proc.returncode == 0 else 'failed',
-            'output': proc.stdout.decode().strip().split('\n')[-1],
-            'output_files': list(output_dir.glob('*.*')),
-
-            'stdout': proc.stdout.decode().strip(),
-            'stderr': proc.stderr.decode().strip(),
+            'extractor': self.name,
+
+            'snapshot_id': snapshot.id,
+            'snapshot_abid': snapshot.abid,
+            'snapshot_url': snapshot.url,
+            'snapshot_created_by_id': snapshot.created_by_id,
+
+            'hostname': socket.gethostname(),
+
+            'binary': binary_str,
+            'binary_name': self.BIN.name,
+            'binary_provider': self.BIN.binprovider.name,
+            'binary_version': self.BIN.version,
+            'binary_abspath': self.BIN.abspath,
+
+            'cmd': cmd,
+            'stdout': stdout,
+            'stderr': stderr,
             'returncode': proc.returncode,
+
+            'status': 'succeeded' if proc.returncode == 0 else 'failed',
+            'errors': errors,
+            'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)),
+            'output_files': list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*')),
+            'output_json': output_json or {},
+            'output_text': output_text or '',
         }
 
     # TODO: move this to a hookimpl
-    def exec(self, args: CmdArgsList, pwd: Optional[Path]=None, settings=None):
-        pwd = pwd or Path('.')
-        if settings is None:
-            from django.conf import settings as django_settings
-            settings = django_settings
+    def exec(self, args: CmdArgsList, cwd: Optional[Path]=None, binary=None):
+        cwd = cwd or Path('.')
+        binary = (binary or self.BINARY).load()
 
-        binary = settings.BINARIES[self.binary]
-        return binary.exec(args, pwd=pwd)
+        return binary.exec(cmd=args, cwd=cwd)
+
+    @cached_property
+    def BINARY(self) -> BaseBinary:
+        from django.conf import settings
+        for binary in settings.BINARIES.values():
+            if binary.name == self.binary:
+                return binary
+        raise ValueError(f'Binary {self.binary} not found')
+
+    @cached_property
+    def BIN(self) -> BaseBinary:
+        return self.BINARY.load()
 
     @abx.hookimpl
     def get_EXTRACTORS(self):