mirror of
https://github.com/pirate/ArchiveBox.git
synced 2025-08-15 11:04:17 +02:00
Created ArchiveBox Architecture Diagrams (markdown)
198
ArchiveBox-Architecture-Diagrams.md
Normal file
198
ArchiveBox-Architecture-Diagrams.md
Normal file
@@ -0,0 +1,198 @@
|
|||||||
|
# ArchiveBox Architecture Diagrams
|
||||||
|
|
||||||
|
## High-Level System Execution Flow
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
stateDiagram-v2
|
||||||
|
cli.main(sys.argv)
|
||||||
|
state Supervisord {
|
||||||
|
Scheduler
|
||||||
|
state Orchestrator {
|
||||||
|
[*] --> TICK
|
||||||
|
TICK --> SPAWN_ACTORS: queued > 0
|
||||||
|
SPAWN_ACTORS --> TICK
|
||||||
|
TICK --> IDLE: queued == 0
|
||||||
|
IDLE --> TICK: 1s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
note left of cli.main(sys.argv)
|
||||||
|
archivebox entrypoint
|
||||||
|
end note
|
||||||
|
|
||||||
|
state "archivebox.cli.SUBCOMMAND" as MAIN_THREAD
|
||||||
|
|
||||||
|
cli.main(sys.argv) --> run_subcommand(sys.argv)
|
||||||
|
run_subcommand(sys.argv) --> setup_django()
|
||||||
|
setup_django() --> Supervisord: spawns in background
|
||||||
|
setup_django() --> MAIN_THREAD: runs in foreground
|
||||||
|
|
||||||
|
MAIN_THREAD --> archivebox.main.SUBCOMMAND
|
||||||
|
archivebox.main.SUBCOMMAND --> Storage: add_to_queue()
|
||||||
|
|
||||||
|
state Actors {
|
||||||
|
CrawlActor --> Crawl: tick()
|
||||||
|
SnapshotActor --> Snapshot: tick()
|
||||||
|
ArchiveResultActors --> ArchiveResult: tick()
|
||||||
|
}
|
||||||
|
|
||||||
|
state "State Machines" as JOBS {
|
||||||
|
|
||||||
|
state Crawl {
|
||||||
|
state "QUEUED" as CRAWL_QUEUED
|
||||||
|
state "STARTED" as CRAWL_STARTED
|
||||||
|
state "SEALED" as CRAWL_SEALED
|
||||||
|
CRAWL_QUEUED --> CRAWL_STARTED: create_root_snapshot()
|
||||||
|
CRAWL_STARTED --> CRAWL_SEALED: is_finished
|
||||||
|
}
|
||||||
|
|
||||||
|
state Snapshot {
|
||||||
|
state "QUEUED" as SNAP_QUEUED
|
||||||
|
state "STARTED" as SNAP_STARTED
|
||||||
|
state "SEALED" as SNAP_SEALED
|
||||||
|
SNAP_QUEUED --> SNAP_STARTED: create_pending_archiveresults()
|
||||||
|
SNAP_STARTED --> SNAP_SEALED: is_finished
|
||||||
|
}
|
||||||
|
|
||||||
|
state ArchiveResult {
|
||||||
|
QUEUED --> STARTED: run_extractor()
|
||||||
|
STARTED --> BACKOFF: is_temp_error
|
||||||
|
BACKOFF --> STARTED: is_retry_past
|
||||||
|
STARTED --> FAILED: is_fatal_error
|
||||||
|
STARTED --> SUCCEEDED: is_succeded
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
note right of ArchiveResult
|
||||||
|
exec_crome()
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of ArchiveResult
|
||||||
|
exec_wget()
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of ArchiveResult
|
||||||
|
exec_curl()
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of ArchiveResult
|
||||||
|
... other extractor subprocesses ...
|
||||||
|
end note
|
||||||
|
}
|
||||||
|
|
||||||
|
state Storage {
|
||||||
|
state "DB" as SQLITE_DB
|
||||||
|
sources/
|
||||||
|
archive/
|
||||||
|
state "index.json" as INDEX_JSONS
|
||||||
|
}
|
||||||
|
|
||||||
|
Storage: Storage
|
||||||
|
|
||||||
|
Orchestrator --> Actors: spawns subprocesses
|
||||||
|
|
||||||
|
Crawl --> Snapshot: create_root_snapshot()
|
||||||
|
Snapshot --> ArchiveResult: create_pending_archiveresults()
|
||||||
|
|
||||||
|
Crawl --> Storage: .save()
|
||||||
|
Snapshot --> Storage: .save()
|
||||||
|
ArchiveResult --> Storage: .save()
|
||||||
|
|
||||||
|
Storage --> Actors: get_queue()
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## State Diagrams for Main Models
|
||||||
|
|
||||||
|
|
||||||
|
### `Crawl`
|
||||||
|
|
||||||
|
- `crawls/models.py`: `Crawl`
|
||||||
|
- `crawls/statemachines.py`: `CrawlMachine`
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
stateDiagram-v2
|
||||||
|
QUEUED --> QUEUED: tick [!can_start]
|
||||||
|
QUEUED --> STARTED: tick [can_start]
|
||||||
|
STARTED --> STARTED: tick [!is_finished]
|
||||||
|
STARTED --> SEALED: tick [is_finished]
|
||||||
|
|
||||||
|
note left of QUEUED
|
||||||
|
Crawl created
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of STARTED
|
||||||
|
create_root_snapshot()
|
||||||
|
crawl.retry_at = now + 5s
|
||||||
|
end note
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## `Snapshot`
|
||||||
|
|
||||||
|
- `core/models.py`: `Snapshot`
|
||||||
|
- `core/statemachines.py`: `SnapshotMachine`
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
stateDiagram-v2
|
||||||
|
QUEUED --> QUEUED: tick [!can_start]
|
||||||
|
QUEUED --> STARTED: tick [can_start]
|
||||||
|
STARTED --> STARTED: tick [!is_finished]
|
||||||
|
STARTED --> SEALED: tick [is_finished]
|
||||||
|
|
||||||
|
note left of QUEUED
|
||||||
|
Snapshot created
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of STARTED
|
||||||
|
create_pending_archiveresults(extractors)
|
||||||
|
snapshot.retry_at = now + 60s
|
||||||
|
end note
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### `ArchiveResult`
|
||||||
|
|
||||||
|
- `core/models.py`: `ArchiveResult`
|
||||||
|
- `core/statemachines.py`: `ArchiveResultMachine`
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
stateDiagram-v2
|
||||||
|
QUEUED --> QUEUED: tick [!can_start]
|
||||||
|
QUEUED --> STARTED: tick [can_start]
|
||||||
|
STARTED --> STARTED: tick [!is_finished]
|
||||||
|
STARTED --> BACKOFF: tick [is_backoff]
|
||||||
|
STARTED --> FAILED: tick [is_failed]
|
||||||
|
STARTED --> SUCCEEDED: tick [is_succeeded]
|
||||||
|
BACKOFF --> BACKOFF: tick [!can_start]
|
||||||
|
BACKOFF --> STARTED: tick [can_start]
|
||||||
|
|
||||||
|
note left of QUEUED
|
||||||
|
ArchiveResult created
|
||||||
|
end note
|
||||||
|
|
||||||
|
note left of STARTED
|
||||||
|
start_ts = now
|
||||||
|
retry_at = now + 60s
|
||||||
|
create_output_dir()
|
||||||
|
run_extractor()
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of BACKOFF
|
||||||
|
retry_at = now + 60s
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of SUCCEEDED
|
||||||
|
end_ts = now
|
||||||
|
retry_at = None
|
||||||
|
end note
|
||||||
|
|
||||||
|
note right of FAILED
|
||||||
|
end_ts = now
|
||||||
|
retry_at = None
|
||||||
|
end note
|
||||||
|
```
|
||||||
|
|
Reference in New Issue
Block a user