mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-30 09:39:52 +02:00
change BaseExtractor to use new extract hookspec
This commit is contained in:
@@ -1,16 +1,19 @@
|
||||
__package__ = 'abx.archivebox'
|
||||
|
||||
import json
|
||||
import socket
|
||||
from typing import Optional, List, Literal, Annotated, Dict, Any
|
||||
from typing_extensions import Self
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import model_validator, AfterValidator
|
||||
from pydantic_pkgr import BinName
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
import abx
|
||||
|
||||
from .base_hook import BaseHook, HookType
|
||||
from .base_binary import BaseBinary
|
||||
|
||||
|
||||
def no_empty_args(args: List[str]) -> List[str]:
|
||||
@@ -49,37 +52,97 @@ class BaseExtractor(BaseHook):
|
||||
return Path(self.id.lower())
|
||||
|
||||
def should_extract(self, snapshot) -> bool:
    """Decide whether this extractor should run for ``snapshot``.

    Currently the only condition is that the extractor's binary can be
    loaded and reports a version. ``snapshot`` is accepted but not yet
    consulted.

    Returns:
        False when loading the binary raises or it has no version,
        True otherwise.
    """
    try:
        # Explicit check instead of ``assert``: assert statements are removed
        # under ``python -O``, which would make this method always return True.
        if not self.BIN.version:
            return False
    except Exception:
        # could not load binary
        return False

    # TODO: the "skip when output already exists" check is disabled for now.
    # NOTE(review): when re-enabling it, wrap the glob in any(...) --
    # ``output_dir.glob('*.*')`` is a generator and therefore always truthy:
    #     output_dir = self.get_output_path(snapshot)
    #     if any(output_dir.glob('*.*')):
    #         return False
    return True
|
||||
|
||||
# TODO: move this to a hookimpl
|
||||
@abx.hookimpl
def extract(self, snapshot_id: str) -> Dict[str, Any]:
    """Run this extractor's binary against one snapshot and describe the result.

    Looks up the Snapshot with id ``snapshot_id``, returns early when
    ``should_extract()`` declines it, otherwise runs the binary with the
    snapshot URL followed by this extractor's arguments, using the output
    directory as the working directory.

    Returns:
        ``{}`` when extraction is skipped. Otherwise a dict describing the
        extractor, the snapshot, the host, the binary, the command that was
        run, its captured output / return code, and the files found under
        the output directory.

    Raises:
        Whatever ``Snapshot.objects.get()`` raises when the lookup fails
        (presumably ``Snapshot.DoesNotExist`` -- confirm against the model).
    """
    from core.models import Snapshot
    snapshot = Snapshot.objects.get(id=snapshot_id)

    if not self.should_extract(snapshot):
        return {}

    from archivebox import CONSTANTS
    # NOTE(review): hard-coded scratch directory; the intended value appears to be
    #     self.get_output_path(snapshot) or CONSTANTS.TMP_DIR
    output_dir = CONSTANTS.TMP_DIR / 'test'
    output_dir.mkdir(parents=True, exist_ok=True)

    # An explicit ``self.args`` replaces the default + extra args entirely.
    if self.args is not None:
        cmd = [snapshot.url, *self.args]
    else:
        cmd = [snapshot.url, *self.default_args, *self.extra_args]
    proc = self.exec(cmd, cwd=output_dir)

    stdout = proc.stdout.strip()
    stderr = proc.stderr.strip()

    try:
        output_json = json.loads(stdout)
    except json.JSONDecodeError:
        output_json = None
    # Report the raw text unless stdout parsed to a non-empty JSON value.
    # Previously a falsy JSON value (0, [], null, ...) cleared the text *and*
    # was itself replaced by {}, so the output disappeared entirely.
    output_text = '' if output_json else stdout

    binary = self.BIN

    errors = []
    if proc.returncode != 0:
        errors.append(f'{binary.name} returned non-zero exit code: {proc.returncode}')

    # pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7
    binary_str = f'{binary.abspath}@{binary.binprovider.name}:{binary.binprovider.get_packages(binary.name)}=={binary.version}'

    try:
        output_dir_str = str(output_dir.relative_to(CONSTANTS.DATA_DIR))
    except ValueError:
        # output_dir is not inside DATA_DIR: report the full path rather than
        # losing the result of a command that has already run.
        output_dir_str = str(output_dir)

    return {
        'extractor': self.name,

        'snapshot_id': snapshot.id,
        'snapshot_abid': snapshot.abid,
        'snapshot_url': snapshot.url,
        'snapshot_created_by_id': snapshot.created_by_id,

        'hostname': socket.gethostname(),

        'binary': binary_str,
        'binary_name': binary.name,
        'binary_provider': binary.binprovider.name,
        'binary_version': binary.version,
        'binary_abspath': binary.abspath,

        'cmd': cmd,
        'stdout': stdout,
        'stderr': stderr,
        'returncode': proc.returncode,

        'status': 'succeeded' if proc.returncode == 0 else 'failed',
        'errors': errors,
        'output_dir': output_dir_str,
        'output_files': [str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*')],
        'output_json': output_json or {},
        'output_text': output_text,
    }
|
||||
|
||||
# TODO: move this to a hookimpl
|
||||
def exec(self, args: CmdArgsList, cwd: Optional[Path]=None, binary=None):
    """Run ``args`` with this extractor's binary and return the result.

    Args:
        args: command-line arguments handed to the binary.
        cwd: working directory for the command; defaults to ``Path('.')``.
        binary: optional binary to use instead of ``self.BINARY``.

    Returns:
        Whatever the loaded binary's ``exec()`` returns (``extract()`` reads
        ``stdout``, ``stderr`` and ``returncode`` from it).
    """
    cwd = cwd or Path('.')
    # ``.load()`` is called on every invocation, on the override as well.
    binary = (binary or self.BINARY).load()

    return binary.exec(cmd=args, cwd=cwd)
|
||||
|
||||
@cached_property
def BINARY(self) -> BaseBinary:
    """Return the registered binary whose name equals ``self.binary``.

    Scans ``settings.BINARIES`` and returns the first match; the result is
    cached on the instance by ``cached_property``.

    Raises:
        ValueError: if no registered binary has that name.
    """
    # Imported lazily inside the method, as in the rest of this class.
    from django.conf import settings
    for binary in settings.BINARIES.values():
        if binary.name == self.binary:
            return binary
    raise ValueError(f'Binary {self.binary} not found')
|
||||
|
||||
@cached_property
def BIN(self) -> BaseBinary:
    """Return ``self.BINARY.load()``, cached on the instance after first access."""
    return self.BINARY.load()
|
||||
|
||||
@abx.hookimpl
|
||||
def get_EXTRACTORS(self):
|
||||
|
Reference in New Issue
Block a user