Mirror of https://github.com/pirate/ArchiveBox.git (synced 2025-08-10 08:34:08 +02:00)
add new crawl model
archivebox/core/migrations/0075_crawl.py (new file, 101 lines added)
@@ -0,0 +1,101 @@
# Generated by Django 5.1.1 on 2024-10-01 02:10

import abid_utils.models
import charidfield.fields
import django.core.validators
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ("core", "0074_alter_snapshot_downloaded_at"),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        migrations.CreateModel(
            name="Crawl",
            fields=[
                (
                    "id",
                    models.UUIDField(
                        default=None,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        unique=True,
                        verbose_name="ID",
                    ),
                ),
                (
                    "abid",
                    charidfield.fields.CharIDField(
                        blank=True,
                        db_index=True,
                        default=None,
                        help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
                        max_length=30,
                        null=True,
                        prefix="crl_",
                        unique=True,
                    ),
                ),
                (
                    "created_at",
                    abid_utils.models.AutoDateTimeField(db_index=True, default=None),
                ),
                ("modified_at", models.DateTimeField(auto_now=True)),
                ("urls", models.TextField()),
                (
                    "depth",
                    models.PositiveSmallIntegerField(
                        default=1,
                        validators=[
                            django.core.validators.MinValueValidator(0),
                            django.core.validators.MaxValueValidator(2),
                        ],
                    ),
                ),
                (
                    "parser",
                    models.CharField(
                        choices=[
                            ("auto", "auto"),
                            ("pocket_api", "Pocket API"),
                            ("readwise_reader_api", "Readwise Reader API"),
                            ("wallabag_atom", "Wallabag Atom"),
                            ("pocket_html", "Pocket HTML"),
                            ("pinboard_rss", "Pinboard RSS"),
                            ("shaarli_rss", "Shaarli RSS"),
                            ("medium_rss", "Medium RSS"),
                            ("netscape_html", "Netscape HTML"),
                            ("rss", "Generic RSS"),
                            ("json", "Generic JSON"),
                            ("jsonl", "Generic JSONL"),
                            ("html", "Generic HTML"),
                            ("txt", "Generic TXT"),
                            ("url_list", "URL List"),
                        ],
                        default="auto",
                        max_length=32,
                    ),
                ),
                (
                    "created_by",
                    models.ForeignKey(
                        default=None,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="crawl_set",
                        to=settings.AUTH_USER_MODEL,
                    ),
                ),
            ],
            options={
                "verbose_name": "Crawl",
                "verbose_name_plural": "Crawls",
            },
        ),
    ]
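This migration creates the new core_crawl table and must be applied before the Crawl model added below can be used (ArchiveBox normally applies pending migrations as part of archivebox init). A minimal sketch of applying just this migration programmatically, assuming DJANGO_SETTINGS_MODULE already points at the ArchiveBox Django settings:

    # Minimal sketch, not part of the commit: apply migrations up to and including 0075_crawl.
    import django
    from django.core.management import call_command

    django.setup()                                  # load installed apps from the configured settings module
    call_command("migrate", "core", "0075_crawl")   # run the core app's migrations through 0075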
@@ -14,6 +14,7 @@ from django.utils.text import slugify
 from django.core.cache import cache
 from django.urls import reverse, reverse_lazy
 from django.db.models import Case, When, Value, IntegerField
+from django.core.validators import MaxValueValidator, MinValueValidator
 from django.contrib import admin
 from django.conf import settings
@@ -27,7 +28,7 @@ from archivebox.misc.util import parse_date, base_url
 from ..index.schema import Link
 from ..index.html import snapshot_icons
 from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
+from ..parsers import PARSERS


 # class BaseModel(models.Model):
@@ -42,7 +43,6 @@ from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS



 class Tag(ABIDModel):
     """
     Based on django-taggit model + ABID base.
@@ -66,6 +66,7 @@ class Tag(ABIDModel):
     # slug is autoset on save from name, never set it manually

     snapshot_set: models.Manager['Snapshot']
+    crawl_set: models.Manager['Crawl']

     class Meta(TypedModelMeta):
         verbose_name = "Tag"
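The only change to Tag here is the crawl_set manager annotation. It is a type hint for the reverse relation that the Crawl.tags many-to-many (still commented out in the Crawl model added below) would create. A hypothetical usage sketch, valid only once that relation is actually enabled:

    # Hypothetical: depends on the commented-out Crawl.tags M2M (related_name='crawl_set')
    # being enabled in a later commit; in this commit the attribute is only a type hint.
    tag = Tag.objects.get(name='news')
    for crawl in tag.crawl_set.all():
        print(crawl.abid, crawl.depth, crawl.parser)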
@@ -122,6 +123,84 @@ class SnapshotTag(models.Model):
         unique_together = [('snapshot', 'tag')]


+
+# class CrawlTag(models.Model):
+#     id = models.AutoField(primary_key=True)
+
+#     crawl = models.ForeignKey('Crawl', db_column='crawl_id', on_delete=models.CASCADE, to_field='id')
+#     tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
+
+#     class Meta:
+#         db_table = 'core_crawl_tags'
+#         unique_together = [('crawl', 'tag')]
+
+
+class Crawl(ABIDModel):
+    abid_prefix = 'crl_'
+    abid_ts_src = 'self.created_at'
+    abid_uri_src = 'self.urls'
+    abid_subtype_src = 'self.crawler'
+    abid_rand_src = 'self.id'
+    abid_drift_allowed = True
+
+    # CRAWLER_CHOICES = (
+    #     ('breadth_first', 'Breadth-First'),
+    #     ('depth_first', 'Depth-First'),
+    # )
+    PARSER_CHOICES = (
+        ('auto', 'auto'),
+        *((parser_key, value[0]) for parser_key, value in PARSERS.items()),
+    )
+
+    id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
+    abid = ABIDField(prefix=abid_prefix)
+
+    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
+    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
+    modified_at = models.DateTimeField(auto_now=True)
+
+    urls = models.TextField(blank=False, null=False)
+    depth = models.PositiveSmallIntegerField(default=1, validators=[MinValueValidator(0), MaxValueValidator(2)])
+    parser = models.CharField(choices=PARSER_CHOICES, default='auto', max_length=32)
+
+    # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
+    # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
+    # schedule = models.JSONField()
+    # config = models.JSONField()
+
+
+    class Meta(TypedModelMeta):
+        verbose_name = 'Crawl'
+        verbose_name_plural = 'Crawls'
+
+    def __str__(self):
+        return self.parser
+
+    @cached_property
+    def crawl_dir(self):
+        return Path()
+
+    @property
+    def api_url(self) -> str:
+        # /api/v1/core/crawl/{uulid}
+        return reverse_lazy('api-1:get_crawl', args=[self.abid])  # + f'?api_key={get_or_create_api_token(request.user)}'
+
+    @property
+    def api_docs_url(self) -> str:
+        return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
+
+    # def get_absolute_url(self):
+    #     return f'/crawls/{self.abid}'
+
+    def crawl(self):
+        # write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
+        # run parse_links(sources/crawl__<user>__YYYYMMDDHHMMSS.txt, parser=self.parser) and for each resulting link:
+        #     create a Snapshot
+        #     enqueue task bg_archive_snapshot(snapshot)
+        pass
+
+
 class SnapshotManager(models.Manager):
     def get_queryset(self):
         return super().get_queryset().prefetch_related('tags', 'archiveresult_set')  # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
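The new Crawl.crawl() method is left as a stub; its comments describe the intended flow: dump self.urls to a sources/ file, parse that file with the selected parser, create a Snapshot per resulting link, and queue each one for background archiving. A rough sketch of that flow is below. parse_links() and bg_archive_snapshot() are only referenced by the stub's comments, so their exact signatures (and the sources/ path layout) are assumptions here, not something this commit defines:

    # Hypothetical sketch of the flow described in the crawl() stub's comments.
    # Every helper call is an assumption; only the overall steps come from the commit.
    from datetime import datetime, timezone
    from pathlib import Path

    def crawl(self):
        # 1. write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
        ts = datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')
        source_file = Path('sources') / f'crawl__{self.created_by.username}__{ts}.txt'
        source_file.parent.mkdir(parents=True, exist_ok=True)
        source_file.write_text(self.urls)

        # 2. parse the file with the selected parser (assumed to return a list of parsed links)
        links = parse_links(str(source_file), parser=self.parser)

        # 3. create a Snapshot per link and enqueue background archiving for it
        snapshots = []
        for link in links:
            snapshot, _created = Snapshot.objects.get_or_create(url=link.url)  # assumed minimal fields
            snapshots.append(snapshot)
            bg_archive_snapshot(snapshot)  # assumed background task helper
        return snapshots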