mirror of https://github.com/pirate/ArchiveBox.git synced 2025-08-10 08:34:08 +02:00

add new crawl model

Author: Nick Sweeting
Date: 2024-10-01 21:46:59 -07:00
parent f46d62a114
commit 295c5c46e0
2 changed files with 182 additions and 2 deletions


@@ -0,0 +1,101 @@
# Generated by Django 5.1.1 on 2024-10-01 02:10

import abid_utils.models
import charidfield.fields
import django.core.validators
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ("core", "0074_alter_snapshot_downloaded_at"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name="Crawl",
            fields=[
                (
                    "id",
                    models.UUIDField(
                        default=None,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        unique=True,
                        verbose_name="ID",
                    ),
                ),
                (
                    "abid",
                    charidfield.fields.CharIDField(
                        blank=True,
                        db_index=True,
                        default=None,
                        help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
                        max_length=30,
                        null=True,
                        prefix="crl_",
                        unique=True,
                    ),
                ),
                (
                    "created_at",
                    abid_utils.models.AutoDateTimeField(db_index=True, default=None),
                ),
                ("modified_at", models.DateTimeField(auto_now=True)),
                ("urls", models.TextField()),
                (
                    "depth",
                    models.PositiveSmallIntegerField(
                        default=1,
                        validators=[
                            django.core.validators.MinValueValidator(0),
                            django.core.validators.MaxValueValidator(2),
                        ],
                    ),
                ),
                (
                    "parser",
                    models.CharField(
                        choices=[
                            ("auto", "auto"),
                            ("pocket_api", "Pocket API"),
                            ("readwise_reader_api", "Readwise Reader API"),
                            ("wallabag_atom", "Wallabag Atom"),
                            ("pocket_html", "Pocket HTML"),
                            ("pinboard_rss", "Pinboard RSS"),
                            ("shaarli_rss", "Shaarli RSS"),
                            ("medium_rss", "Medium RSS"),
                            ("netscape_html", "Netscape HTML"),
                            ("rss", "Generic RSS"),
                            ("json", "Generic JSON"),
                            ("jsonl", "Generic JSONL"),
                            ("html", "Generic HTML"),
                            ("txt", "Generic TXT"),
                            ("url_list", "URL List"),
                        ],
                        default="auto",
                        max_length=32,
                    ),
                ),
                (
                    "created_by",
                    models.ForeignKey(
                        default=None,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="crawl_set",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
            options={
                "verbose_name": "Crawl",
                "verbose_name_plural": "Crawls",
            },
        ),
    ]
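
A generated migration like this only takes effect once it is applied to the database. A minimal sketch of applying it programmatically through Django's standard management API (assuming Django settings are already configured for ArchiveBox; equivalent to running manage.py migrate core):

# Minimal sketch: apply pending migrations for the core app programmatically.
# Assumes DJANGO_SETTINGS_MODULE already points at the ArchiveBox settings module.
import django
from django.core.management import call_command

django.setup()
call_command("migrate", "core")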


@@ -14,6 +14,7 @@ from django.utils.text import slugify
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.db.models import Case, When, Value, IntegerField
from django.core.validators import MaxValueValidator, MinValueValidator
from django.contrib import admin
from django.conf import settings
@@ -27,7 +28,7 @@ from archivebox.misc.util import parse_date, base_url
from ..index.schema import Link
from ..index.html import snapshot_icons
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
from ..parsers import PARSERS


# class BaseModel(models.Model):
@@ -42,7 +43,6 @@ from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
class Tag(ABIDModel):
    """
    Based on django-taggit model + ABID base.
@@ -66,6 +66,7 @@ class Tag(ABIDModel):
    # slug is autoset on save from name, never set it manually
    snapshot_set: models.Manager['Snapshot']
    crawl_set: models.Manager['Crawl']

    class Meta(TypedModelMeta):
        verbose_name = "Tag"
@@ -122,6 +123,84 @@ class SnapshotTag(models.Model):
        unique_together = [('snapshot', 'tag')]


# class CrawlTag(models.Model):
#     id = models.AutoField(primary_key=True)
#     crawl = models.ForeignKey('Crawl', db_column='crawl_id', on_delete=models.CASCADE, to_field='id')
#     tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
#     class Meta:
#         db_table = 'core_crawl_tags'
#         unique_together = [('crawl', 'tag')]


class Crawl(ABIDModel):
    abid_prefix = 'crl_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.urls'
    abid_subtype_src = 'self.crawler'
    abid_rand_src = 'self.id'
    abid_drift_allowed = True

    # CRAWLER_CHOICES = (
    #     ('breadth_first', 'Breadth-First'),
    #     ('depth_first', 'Depth-First'),
    # )
    PARSER_CHOICES = (
        ('auto', 'auto'),
        *((parser_key, value[0]) for parser_key, value in PARSERS.items()),
    )

    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    urls = models.TextField(blank=False, null=False)
    depth = models.PositiveSmallIntegerField(default=1, validators=[MinValueValidator(0), MaxValueValidator(2)])
    parser = models.CharField(choices=PARSER_CHOICES, default='auto', max_length=32)
    # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
    # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
    # schedule = models.JSONField()
    # config = models.JSONField()

    class Meta(TypedModelMeta):
        verbose_name = 'Crawl'
        verbose_name_plural = 'Crawls'

    def __str__(self):
        return self.parser

    @cached_property
    def crawl_dir(self):
        return Path()

    @property
    def api_url(self) -> str:
        # /api/v1/core/crawl/{uulid}
        return reverse_lazy('api-1:get_crawl', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'

    @property
    def api_docs_url(self) -> str:
        return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'

    # def get_absolute_url(self):
    #     return f'/crawls/{self.abid}'

    def crawl(self):
        # write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
        # run parse_links(sources/crawl__<user>__YYYYMMDDHHMMSS.txt, parser=self.parser) and for each resulting link:
        #     create a Snapshot
        #     enqueue task bg_archive_snapshot(snapshot)
        pass


class SnapshotManager(models.Manager):
    def get_queryset(self):
        return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
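
The crawl() method added by this commit is only a stub; its comments describe the intended flow (write the URLs to a sources/ file, parse it, create Snapshots, enqueue archiving). Below is a minimal, hypothetical sketch of that flow for illustration; parse_links, bg_archive_snapshot, and the Snapshot field mapping are taken from the comments or assumed here, and their real signatures may differ.

# Hypothetical sketch of Crawl.crawl(), following the steps described in the stub's comments.
# parse_links(), bg_archive_snapshot(), and the Snapshot fields used here are assumptions.
from datetime import datetime, timezone
from pathlib import Path

def crawl(self):
    # 1. Write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
    ts = datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')
    source_file = Path('sources') / f'crawl__{self.created_by.username}__{ts}.txt'
    source_file.parent.mkdir(parents=True, exist_ok=True)
    source_file.write_text(self.urls)

    # 2. Parse the saved source file with the parser configured on this Crawl
    links, _parser_used = parse_links(str(source_file), parser=self.parser)  # assumed signature

    # 3. Create a Snapshot per parsed link and enqueue background archiving for each one
    for link in links:
        snapshot, _created = Snapshot.objects.get_or_create(url=link.url)  # assumed field mapping
        bg_archive_snapshot(snapshot)  # assumed background task helper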