1
0
mirror of https://github.com/pirate/ArchiveBox.git synced 2025-08-30 09:39:52 +02:00

change BaseExtractor to use new extract hookspec

This commit is contained in:
Nick Sweeting
2024-10-01 21:45:18 -07:00
parent 276a505cae
commit 4a19051f4a

View File

@@ -1,16 +1,19 @@
__package__ = 'abx.archivebox' __package__ = 'abx.archivebox'
import json
import socket
from typing import Optional, List, Literal, Annotated, Dict, Any from typing import Optional, List, Literal, Annotated, Dict, Any
from typing_extensions import Self from typing_extensions import Self
from pathlib import Path from pathlib import Path
from pydantic import model_validator, AfterValidator from pydantic import model_validator, AfterValidator
from pydantic_pkgr import BinName from pydantic_pkgr import BinName
from django.utils.functional import cached_property
import abx import abx
from .base_hook import BaseHook, HookType from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary
def no_empty_args(args: List[str]) -> List[str]: def no_empty_args(args: List[str]) -> List[str]:
@@ -49,37 +52,97 @@ class BaseExtractor(BaseHook):
return Path(self.id.lower()) return Path(self.id.lower())
def should_extract(self, snapshot) -> bool:
    """Return True when this extractor's binary loads and reports a version.

    Args:
        snapshot: The snapshot that would be extracted. It is currently not
            consulted; it is kept so the call signature stays stable.

    Returns:
        False if accessing ``self.BIN`` raises or its ``version`` is falsy,
        True otherwise.
    """
    try:
        # Explicit truthiness check instead of `assert`: assert statements are
        # removed under `python -O`, which would make this always return True.
        return bool(self.BIN.version)
    except Exception:
        # could not load binary (deliberate best-effort: any failure => skip)
        return False
    # TODO: re-introduce an "output already exists" check based on
    # self.get_output_path(snapshot); the previous glob-based check is disabled.
@abx.hookimpl
def extract(self, snapshot_id: str) -> Dict[str, Any]:
    """Run this extractor's binary against a snapshot and describe the result.

    Args:
        snapshot_id: Primary key used to look up the ``Snapshot`` row.

    Returns:
        An empty dict when ``should_extract`` declines the snapshot; otherwise
        a dict describing the snapshot, the binary that ran, the command,
        its stdout/stderr/returncode, and the files found in the output dir.
        ``output_json`` holds stdout parsed as JSON (``{}`` when stdout is not
        JSON or is ``null``); ``output_text`` holds stdout only when it was
        not valid JSON.
    """
    # Imported lazily, as in the original, rather than at module import time.
    from core.models import Snapshot
    snapshot = Snapshot.objects.get(id=snapshot_id)

    if not self.should_extract(snapshot):
        return {}

    from archivebox import CONSTANTS
    # TODO: hard-coded scratch directory; the intended value appears to be
    # `self.get_output_path(snapshot) or CONSTANTS.TMP_DIR` -- confirm before changing.
    output_dir = CONSTANTS.TMP_DIR / 'test'
    output_dir.mkdir(parents=True, exist_ok=True)

    # Explicit `args` replace the default + extra args entirely.
    if self.args is not None:
        cmd = [snapshot.url, *self.args]
    else:
        cmd = [snapshot.url, *self.default_args, *self.extra_args]

    proc = self.exec(cmd, cwd=output_dir)
    stdout = proc.stdout.strip()
    stderr = proc.stderr.strip()

    try:
        parsed = json.loads(stdout)
    except json.JSONDecodeError:
        # Not JSON: report the raw text instead.
        output_json, output_text = {}, stdout
    else:
        # Keep valid-but-falsy JSON (0, false, [], "") rather than discarding
        # it; only JSON `null` collapses to the empty-dict placeholder.
        output_json = {} if parsed is None else parsed
        output_text = ''

    # Look the loaded binary up once instead of on every field below.
    loaded_bin = self.BIN

    errors = []
    if proc.returncode != 0:
        errors.append(f'{loaded_bin.name} returned non-zero exit code: {proc.returncode}')

    # e.g. pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7
    binary_str = f'{loaded_bin.abspath}@{loaded_bin.binprovider.name}:{loaded_bin.binprovider.get_packages(loaded_bin.name)}=={loaded_bin.version}'

    return {
        'extractor': self.name,

        'snapshot_id': snapshot.id,
        'snapshot_abid': snapshot.abid,
        'snapshot_url': snapshot.url,
        'snapshot_created_by_id': snapshot.created_by_id,

        'hostname': socket.gethostname(),

        'binary': binary_str,
        'binary_name': loaded_bin.name,
        'binary_provider': loaded_bin.binprovider.name,
        'binary_version': loaded_bin.version,
        'binary_abspath': loaded_bin.abspath,

        'cmd': cmd,
        'stdout': stdout,
        'stderr': stderr,
        'returncode': proc.returncode,

        'status': 'succeeded' if proc.returncode == 0 else 'failed',
        'errors': errors,
        'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)),
        'output_files': [str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*')],
        'output_json': output_json,
        'output_text': output_text,
    }
# TODO: move this to a hookimpl
def exec(self, args: CmdArgsList, cwd: Optional[Path]=None, binary=None):
    """Load a binary and run it with ``args`` in the given working directory.

    Args:
        args: Command-line arguments forwarded as ``cmd=``.
        cwd: Working directory; falls back to ``Path('.')`` when falsy.
        binary: Binary to use; falls back to ``self.BINARY`` when falsy.

    Returns:
        Whatever the loaded binary's ``exec`` call returns.
    """
    workdir = cwd if cwd else Path('.')
    chosen = binary if binary else self.BINARY
    loaded = chosen.load()
    return loaded.exec(cmd=args, cwd=workdir)
@cached_property
def BINARY(self) -> BaseBinary:
    """Find the configured binary whose name equals ``self.binary``.

    Searches the values of ``settings.BINARIES`` and returns the first match.

    Raises:
        ValueError: If no configured binary has that name.
    """
    from django.conf import settings

    found = next(
        (candidate for candidate in settings.BINARIES.values() if candidate.name == self.binary),
        None,
    )
    if found is None:
        raise ValueError(f'Binary {self.binary} not found')
    return found
@cached_property
def BIN(self) -> BaseBinary:
    """Return the result of calling ``load()`` on ``self.BINARY``, cached per instance."""
    loaded = self.BINARY.load()
    return loaded
@abx.hookimpl @abx.hookimpl
def get_EXTRACTORS(self): def get_EXTRACTORS(self):