mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-30 09:39:52 +02:00
change BaseExtractor to use new extract hookspec
This commit is contained in:
@@ -1,16 +1,19 @@
|
|||||||
__package__ = 'abx.archivebox'
|
__package__ = 'abx.archivebox'
|
||||||
|
|
||||||
|
import json
|
||||||
|
import socket
|
||||||
from typing import Optional, List, Literal, Annotated, Dict, Any
|
from typing import Optional, List, Literal, Annotated, Dict, Any
|
||||||
from typing_extensions import Self
|
from typing_extensions import Self
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from pydantic import model_validator, AfterValidator
|
from pydantic import model_validator, AfterValidator
|
||||||
from pydantic_pkgr import BinName
|
from pydantic_pkgr import BinName
|
||||||
|
from django.utils.functional import cached_property
|
||||||
|
|
||||||
import abx
|
import abx
|
||||||
|
|
||||||
from .base_hook import BaseHook, HookType
|
from .base_hook import BaseHook, HookType
|
||||||
|
from .base_binary import BaseBinary
|
||||||
|
|
||||||
|
|
||||||
def no_empty_args(args: List[str]) -> List[str]:
|
def no_empty_args(args: List[str]) -> List[str]:
|
||||||
@@ -49,37 +52,97 @@ class BaseExtractor(BaseHook):
|
|||||||
return Path(self.id.lower())
|
return Path(self.id.lower())
|
||||||
|
|
||||||
def should_extract(self, snapshot) -> bool:
|
def should_extract(self, snapshot) -> bool:
|
||||||
output_dir = self.get_output_path(snapshot)
|
try:
|
||||||
if output_dir.glob('*.*'):
|
assert self.BIN.version
|
||||||
|
except Exception:
|
||||||
|
# could not load binary
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# output_dir = self.get_output_path(snapshot)
|
||||||
|
# if output_dir.glob('*.*'):
|
||||||
|
# return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# TODO: move this to a hookimpl
|
@abx.hookimpl
|
||||||
def extract(self, url: str, **kwargs) -> Dict[str, Any]:
|
def extract(self, snapshot_id: str) -> Dict[str, Any]:
|
||||||
output_dir = self.get_output_path(url, **kwargs)
|
from core.models import Snapshot
|
||||||
|
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||||
|
|
||||||
|
if not self.should_extract(snapshot):
|
||||||
|
return {}
|
||||||
|
|
||||||
|
from archivebox import CONSTANTS
|
||||||
|
# output_dir = self.get_output_path(snapshot) or CONSTANTS.TMP_DIR
|
||||||
|
output_dir = CONSTANTS.TMP_DIR / 'test'
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
cmd = [url, *self.args] if self.args is not None else [url, *self.default_args, *self.extra_args]
|
cmd = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args]
|
||||||
proc = self.exec(cmd, pwd=output_dir)
|
proc = self.exec(cmd, cwd=output_dir)
|
||||||
|
|
||||||
|
stdout = proc.stdout.strip()
|
||||||
|
stderr = proc.stderr.strip()
|
||||||
|
output_json = None
|
||||||
|
output_text = stdout
|
||||||
|
try:
|
||||||
|
output_json = json.loads(stdout.strip())
|
||||||
|
output_text = None
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
errors = []
|
||||||
|
if proc.returncode != 0:
|
||||||
|
errors.append(f'{self.BIN.name} returned non-zero exit code: {proc.returncode}')
|
||||||
|
|
||||||
|
# pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7
|
||||||
|
binary_str = f'{self.BIN.abspath}@{self.BIN.binprovider.name}:{self.BIN.binprovider.get_packages(self.BIN.name)}=={self.BIN.version}'
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'status': 'succeeded' if proc.returncode == 0 else 'failed',
|
'extractor': self.name,
|
||||||
'output': proc.stdout.decode().strip().split('\n')[-1],
|
|
||||||
'output_files': list(output_dir.glob('*.*')),
|
'snapshot_id': snapshot.id,
|
||||||
|
'snapshot_abid': snapshot.abid,
|
||||||
'stdout': proc.stdout.decode().strip(),
|
'snapshot_url': snapshot.url,
|
||||||
'stderr': proc.stderr.decode().strip(),
|
'snapshot_created_by_id': snapshot.created_by_id,
|
||||||
|
|
||||||
|
'hostname': socket.gethostname(),
|
||||||
|
|
||||||
|
'binary': binary_str,
|
||||||
|
'binary_name': self.BIN.name,
|
||||||
|
'binary_provider': self.BIN.binprovider.name,
|
||||||
|
'binary_version': self.BIN.version,
|
||||||
|
'binary_abspath': self.BIN.abspath,
|
||||||
|
|
||||||
|
'cmd': cmd,
|
||||||
|
'stdout': stdout,
|
||||||
|
'stderr': stderr,
|
||||||
'returncode': proc.returncode,
|
'returncode': proc.returncode,
|
||||||
|
|
||||||
|
'status': 'succeeded' if proc.returncode == 0 else 'failed',
|
||||||
|
'errors': errors,
|
||||||
|
'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)),
|
||||||
|
'output_files': list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*')),
|
||||||
|
'output_json': output_json or {},
|
||||||
|
'output_text': output_text or '',
|
||||||
}
|
}
|
||||||
|
|
||||||
# TODO: move this to a hookimpl
|
# TODO: move this to a hookimpl
|
||||||
def exec(self, args: CmdArgsList, pwd: Optional[Path]=None, settings=None):
|
def exec(self, args: CmdArgsList, cwd: Optional[Path]=None, binary=None):
|
||||||
pwd = pwd or Path('.')
|
cwd = cwd or Path('.')
|
||||||
if settings is None:
|
binary = (binary or self.BINARY).load()
|
||||||
from django.conf import settings as django_settings
|
|
||||||
settings = django_settings
|
|
||||||
|
|
||||||
binary = settings.BINARIES[self.binary]
|
return binary.exec(cmd=args, cwd=cwd)
|
||||||
return binary.exec(args, pwd=pwd)
|
|
||||||
|
@cached_property
|
||||||
|
def BINARY(self) -> BaseBinary:
|
||||||
|
from django.conf import settings
|
||||||
|
for binary in settings.BINARIES.values():
|
||||||
|
if binary.name == self.binary:
|
||||||
|
return binary
|
||||||
|
raise ValueError(f'Binary {self.binary} not found')
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def BIN(self) -> BaseBinary:
|
||||||
|
return self.BINARY.load()
|
||||||
|
|
||||||
@abx.hookimpl
|
@abx.hookimpl
|
||||||
def get_EXTRACTORS(self):
|
def get_EXTRACTORS(self):
|
||||||
|
Reference in New Issue
Block a user