From 30c88ee12ce3216c4d768f73c84f6c1efd7fc482 Mon Sep 17 00:00:00 2001 From: Lazaro V Date: Tue, 3 Nov 2020 11:05:59 -0300 Subject: [PATCH 01/75] Fixed docker-compose.yml url #Summary This PR fixes the URL for the `docker-compose.yml` configuration file. Previous link downloaded the entire github editor webpage, throwing syntax errors when trying to install via Docker. #Related Issues #523 #Changes - Documentation (README.md) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6080fec5..467d5075 100644 --- a/README.md +++ b/README.md @@ -196,7 +196,7 @@ a headless browser runtime, a full webserver, and CLI interface. # docker-compose run archivebox [args] mkdir archivebox && cd archivebox -wget 'https://github.com/pirate/ArchiveBox/blob/master/docker-compose.yml' +wget 'https://raw.githubusercontent.com/pirate/ArchiveBox/master/docker-compose.yml' docker-compose run archivebox init docker-compose run archivebox add 'https://example.com' docker-compose run archivebox manage createsuperuser From 018aecfa9397ec7bad16b104e52d349c7820f66b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 9 Nov 2020 15:09:05 -0500 Subject: [PATCH 02/75] favor chromium in install script over chromium-browser --- bin/setup.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/bin/setup.sh b/bin/setup.sh index 0eb45451..fed1bd5a 100755 --- a/bin/setup.sh +++ b/bin/setup.sh @@ -31,14 +31,17 @@ if which apt-get > /dev/null; then apt install git python3 python3-pip python3-distutils wget curl youtube-dl if which google-chrome; then - echo "[i] You already have google-chrome installed, if you would like to download chromium-browser instead (they work pretty much the same), follow the Manual Setup instructions" + echo "[i] You already have google-chrome installed, if you would like to download chromium instead (they work pretty much the same), follow the Manual Setup instructions" google-chrome 
--version elif which chromium-browser; then echo "[i] chromium-browser already installed, using existing installation." chromium-browser --version + elif which chromium; then + echo "[i] chromium already installed, using existing installation." + chromium --version else - echo "[+] Installing chromium-browser..." - apt install chromium-browser + echo "[+] Installing chromium..." + apt install chromium fi # On Mac: @@ -63,8 +66,10 @@ elif which brew > /dev/null; then # 🐍 eye of newt echo "[√] Using existing /Applications/Chromium.app" elif which chromium-browser; then echo "[√] Using existing $(which chromium-browser)" + elif which chromium; then + echo "[√] Using existing $(which chromium)" else - echo "[+] Installing chromium-browser..." + echo "[+] Installing chromium..." brew cask install chromium fi else @@ -78,7 +83,7 @@ else exit 1 fi -pip3 install --upgrade archivebox +python3 -m pip install --upgrade archivebox # Check: echo "" From 5225c17073528df3c3709db607ddd035b75ed840 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 9 Nov 2020 22:55:55 -0500 Subject: [PATCH 03/75] disable docker layer caching for now --- .github/workflows/test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2d641a47..f7233a7d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -113,7 +113,8 @@ jobs: with: fetch-depth: 1 - - uses: satackey/action-docker-layer-caching@v0.0.8 + # TODO: as of 2020-11 this helper layer broke, upgrade and re-enable this once it's usable again + # - uses: satackey/action-docker-layer-caching@v0.0.8 - name: Build image run: | From fbd9a7caa6c227a59c16028cd00b059d60cba0a7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 10 Nov 2020 01:07:56 -0500 Subject: [PATCH 04/75] add explicit error when FSYNC is not supported on filesystem --- archivebox/system.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git 
a/archivebox/system.py b/archivebox/system.py index 1adcffc7..2caf58e1 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -39,11 +39,16 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over mode = 'wb+' if isinstance(contents, bytes) else 'w' # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}') - with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f: - if isinstance(contents, dict): - dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) - elif isinstance(contents, (bytes, str)): - f.write(contents) + try: + with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f: + if isinstance(contents, dict): + dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder) + elif isinstance(contents, (bytes, str)): + f.write(contents) + except OSError as e: + print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. ({e})") + print(" For data integrity, ArchiveBox requires a filesystem that supports atomic writes.") + print(" Some filesystems and network drives don't implement FSYNC, and require workarounds.") os.chmod(path, int(OUTPUT_PERMISSIONS, base=8)) @enforce_types From 3f160eab8e6ed93fb14189c80d2a7d3901c88f73 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 12 Nov 2020 14:28:43 -0500 Subject: [PATCH 05/75] correctly handle WGET_AUTO_COMPRESSION failing when wget is missing --- archivebox/config.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/archivebox/config.py b/archivebox/config.py index d29cecd4..38ed5019 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -643,12 +643,15 @@ def find_chrome_data_dir() -> Optional[str]: return None def wget_supports_compression(config): - cmd = [ - config['WGET_BINARY'], - "--compression=auto", - "--help", - ] - return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode + try: + cmd = [ + config['WGET_BINARY'], + "--compression=auto", + "--help", + ] + return 
not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode + except (FileNotFoundError, OSError): + return False def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: return { From 16ad02d574a52e67dfc2decbd41272739ed8e5fb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 12 Nov 2020 14:29:06 -0500 Subject: [PATCH 06/75] add latest package build artifacts --- archivebox.egg-info/PKG-INFO | 2 +- archivebox.egg-info/requires.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index adc65f60..a74994ea 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -211,7 +211,7 @@ Description:
# docker-compose run archivebox [args] mkdir archivebox && cd archivebox - wget 'https://github.com/pirate/ArchiveBox/blob/master/docker-compose.yml' + wget 'https://raw.githubusercontent.com/pirate/ArchiveBox/master/docker-compose.yml' docker-compose run archivebox init docker-compose run archivebox add 'https://example.com' docker-compose run archivebox manage createsuperuser diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index 71dc253d..eb8d2f35 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -13,7 +13,6 @@ w3lib==1.22.0 [dev] setuptools -wheel twine flake8 ipdb From 4372cb6eecd32778830c8c531fb7873ac786c0b8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 12 Nov 2020 14:55:21 -0500 Subject: [PATCH 07/75] stop execution entirely when atomic_write is unsupported --- archivebox/system.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/archivebox/system.py b/archivebox/system.py index 2caf58e1..b27c5e46 100644 --- a/archivebox/system.py +++ b/archivebox/system.py @@ -48,7 +48,8 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over except OSError as e: print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. 
({e})") print(" For data integrity, ArchiveBox requires a filesystem that supports atomic writes.") - print(" Some filesystems and network drives don't implement FSYNC, and require workarounds.") + print(" Filesystems and network drives that don't implement FSYNC are incompatible and require workarounds.") + raise SystemExit(1) os.chmod(path, int(OUTPUT_PERMISSIONS, base=8)) @enforce_types From 44eede96e5b75d32758c68a11370ff2a0857b103 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 13 Nov 2020 09:24:34 -0500 Subject: [PATCH 08/75] feat: Add extract flag to add command --- archivebox/cli/archivebox_add.py | 9 ++++++++- archivebox/main.py | 14 ++++++++++---- tests/test_add.py | 11 ++++++++++- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 5c370fa5..8d2d2af2 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -62,10 +62,16 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional help="Re-archive URLs from scratch, overwriting any existing files" ) parser.add_argument( - '--init', #'-i', + "--init", #'-i', action='store_true', help="Init/upgrade the curent data directory before adding", ) + parser.add_argument( + "--extract", + nargs="+", + help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. 
\ + This does not take precedence over the configuration" + ) command = parser.parse_args(args or ()) urls = command.urls stdin_urls = accept_stdin(stdin) @@ -83,6 +89,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional overwrite=command.overwrite, init=command.init, out_dir=pwd or OUTPUT_DIR, + extractors = command.extract or [], ) diff --git a/archivebox/main.py b/archivebox/main.py index 44ee6b14..208f7661 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -525,7 +525,8 @@ def add(urls: Union[str, List[str]], index_only: bool=False, overwrite: bool=False, init: bool=False, - out_dir: Path=OUTPUT_DIR) -> List[Link]: + out_dir: Path=OUTPUT_DIR, + extractors: list=[]) -> List[Link]: """Add a new URL or list of URLs to your archive""" assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' @@ -567,12 +568,17 @@ def add(urls: Union[str, List[str]], return all_links # Run the archive methods for each link + archive_kwargs = { + "out_dir": out_dir, + } + if extractors: + archive_kwargs["methods"] = extractors if update_all: - archive_links(all_links, overwrite=overwrite, out_dir=out_dir) + archive_links(all_links, overwrite=overwrite, **archive_kwargs) elif overwrite: - archive_links(imported_links, overwrite=True, out_dir=out_dir) + archive_links(imported_links, overwrite=True, **archive_kwargs) elif new_links: - archive_links(new_links, overwrite=False, out_dir=out_dir) + archive_links(new_links, overwrite=False, **archive_kwargs) return all_links diff --git a/tests/test_add.py b/tests/test_add.py index 5e672e8d..bb15e51b 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -81,4 +81,13 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di with open(archived_item_path / "index.json", "r") as f: output_json = json.load(f) - assert output_json["history"] != {} \ No newline at end of file + assert output_json["history"] != {} + +def 
test_extract_input_uses_only_passed_extractors(tmp_path, process): + subprocess.run(["archivebox", "add", "http://127.0.0.1:8080/static/example.com.html", "--extract", "wget"], + capture_output=True) + + archived_item_path = list(tmp_path.glob('archive/**/*'))[0] + + assert (archived_item_path / "warc").exists() + assert not (archived_item_path / "singlefile.html").exists() \ No newline at end of file From db523c9d828598e0d768a451e4add32c58fdc58f Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 13 Nov 2020 11:41:50 -0500 Subject: [PATCH 09/75] fix: Avoid mutable default input argument --- archivebox/main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/archivebox/main.py b/archivebox/main.py index 208f7661..ed615daa 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -526,11 +526,14 @@ def add(urls: Union[str, List[str]], overwrite: bool=False, init: bool=False, out_dir: Path=OUTPUT_DIR, - extractors: list=[]) -> List[Link]: + extractors: list=None) -> List[Link]: """Add a new URL or list of URLs to your archive""" assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' + if extractors is None: + extractors = [] + if init: run_subcommand('init', stdin=None, pwd=out_dir) From cbb3d04c12996365146feff99331aa47fe67492c Mon Sep 17 00:00:00 2001 From: JDC Date: Fri, 13 Nov 2020 12:06:12 -0500 Subject: [PATCH 10/75] Allow list filtering by tag name --- archivebox/cli/archivebox_list.py | 2 +- archivebox/index/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/archivebox/cli/archivebox_list.py b/archivebox/cli/archivebox_list.py index 529dad80..140810a6 100644 --- a/archivebox/cli/archivebox_list.py +++ b/archivebox/cli/archivebox_list.py @@ -98,7 +98,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional parser.add_argument( '--filter-type', type=str, - choices=('exact', 'substring', 'domain', 'regex'), + choices=('exact', 'substring', 'domain', 
'regex','tag'), default='exact', help='Type of pattern matching to use when filtering URLs', ) diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index ee4bf411..890777c8 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -361,6 +361,7 @@ LINK_FILTERS = { 'substring': lambda pattern: Q(url__icontains=pattern), 'regex': lambda pattern: Q(url__iregex=pattern), 'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"), + 'tag': lambda pattern: Q(tags__name=pattern), } @enforce_types From 1ec82765144fe25f847957a5d37238a0b9cbb8b7 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 13 Nov 2020 13:01:11 -0500 Subject: [PATCH 11/75] fix: Use a comma separated input instead of nargs for the extract flag --- archivebox/cli/archivebox_add.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 8d2d2af2..d3c89256 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -68,13 +68,15 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional ) parser.add_argument( "--extract", - nargs="+", + type=str, help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. 
\ - This does not take precedence over the configuration" + This does not take precedence over the configuration", + default="" ) command = parser.parse_args(args or ()) urls = command.urls stdin_urls = accept_stdin(stdin) + extractors = command.extract.split(",") if command.extract else None if (stdin_urls and urls) or (not stdin and not urls): stderr( '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', @@ -89,7 +91,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional overwrite=command.overwrite, init=command.init, out_dir=pwd or OUTPUT_DIR, - extractors = command.extract or [], + extractors = extractors, ) From 54df0a035b5fbfea25002b21d6a95de2db1f6fc1 Mon Sep 17 00:00:00 2001 From: Cristian Date: Fri, 13 Nov 2020 13:10:17 -0500 Subject: [PATCH 12/75] fix: Move csv split to the add function to avoid optional nullable argument --- archivebox/cli/archivebox_add.py | 3 +-- archivebox/main.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index d3c89256..ca469fa8 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -76,7 +76,6 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional command = parser.parse_args(args or ()) urls = command.urls stdin_urls = accept_stdin(stdin) - extractors = command.extract.split(",") if command.extract else None if (stdin_urls and urls) or (not stdin and not urls): stderr( '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n', @@ -91,7 +90,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional overwrite=command.overwrite, init=command.init, out_dir=pwd or OUTPUT_DIR, - extractors = extractors, + extractors = command.extract, ) diff --git a/archivebox/main.py b/archivebox/main.py index ed615daa..e27dff96 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -526,13 +526,12 @@ def 
add(urls: Union[str, List[str]], overwrite: bool=False, init: bool=False, out_dir: Path=OUTPUT_DIR, - extractors: list=None) -> List[Link]: + extractors: str="") -> List[Link]: """Add a new URL or list of URLs to your archive""" assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' - if extractors is None: - extractors = [] + extractors = extractors.split(",") if extractors else [] if init: run_subcommand('init', stdin=None, pwd=out_dir) From d54c3eec9d65ac87dd70cbd0f85c5cab995e9c1a Mon Sep 17 00:00:00 2001 From: JDC Date: Fri, 13 Nov 2020 14:16:48 -0500 Subject: [PATCH 13/75] Add tag filter argument to remove command --- archivebox/cli/archivebox_remove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py index 8fe717fb..cb073e95 100644 --- a/archivebox/cli/archivebox_remove.py +++ b/archivebox/cli/archivebox_remove.py @@ -50,7 +50,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional parser.add_argument( '--filter-type', type=str, - choices=('exact', 'substring', 'domain', 'regex'), + choices=('exact', 'substring', 'domain', 'regex','tag'), default='exact', help='Type of pattern matching to use when filtering URLs', ) From b1dbfcb73f22ebbb68e511650ede01db9cd87809 Mon Sep 17 00:00:00 2001 From: JDC Date: Fri, 13 Nov 2020 14:17:12 -0500 Subject: [PATCH 14/75] Add test remove tag filter --- tests/test_remove.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/test_remove.py b/tests/test_remove.py index 0fb16e2a..c9c63385 100644 --- a/tests/test_remove.py +++ b/tests/test_remove.py @@ -70,6 +70,29 @@ def test_remove_domain(tmp_path, process, disable_extractors_dict): assert count == 0 + +def test_remove_tag(tmp_path, process, disable_extractors_dict): + subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) + 
subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict) + assert list((tmp_path / "archive").iterdir()) != [] + + conn = sqlite3.connect("index.sqlite3") + c = conn.cursor() + c.execute("INSERT INTO core_tag (id, name, slug) VALUES (2, 'test-tag', 'test-tag')") + snapshot_ids = c.execute("SELECT id from core_snapshot") + c.executemany('INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, 2)', list(snapshot_ids)) + conn.commit() + + remove_process = subprocess.run(['archivebox', 'remove', '--filter-type=tag', 'test-tag', '--yes', '--delete'], capture_output=True) + + assert len(list((tmp_path / "archive").iterdir())) == 0 + + count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0] + conn.commit() + conn.close() + + assert count == 0 + def test_remove_before(tmp_path, process, disable_extractors_dict): subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict) subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/iana.org.html'], capture_output=True, env=disable_extractors_dict) From 257d3f2a98b3d36f96f82a9434263b37d99253fb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 13 Nov 2020 14:52:21 -0500 Subject: [PATCH 15/75] Update archivebox/cli/archivebox_add.py --- archivebox/cli/archivebox_add.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index ca469fa8..b4e65231 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -90,7 +90,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional overwrite=command.overwrite, init=command.init, out_dir=pwd or OUTPUT_DIR, - extractors = command.extract, + extractors=command.extract, ) From a05485f85c4435ba9c4e1edebfbd671b0032c9f3 Mon Sep 17 00:00:00 2001 From: TrAyZeN 
<1810leo@gmail.com> Date: Sat, 14 Nov 2020 17:44:06 +0100 Subject: [PATCH 16/75] Fix file icons order --- archivebox/core/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py index 0bb8fceb..75c9c4e7 100644 --- a/archivebox/core/utils.py +++ b/archivebox/core/utils.py @@ -29,9 +29,9 @@ def get_icons(snapshot: Snapshot) -> str: '', *link_tuple(link, 'singlefile_path'), *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')), + *link_tuple(link, 'dom_path'), *link_tuple(link, 'pdf_path'), *link_tuple(link, 'screenshot_path'), - *link_tuple(link, 'dom_path'), *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')), *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')), *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')), From 88cc75a0457859a63b06854e353b053c730b3752 Mon Sep 17 00:00:00 2001 From: TrAyZeN <1810leo@gmail.com> Date: Sat, 14 Nov 2020 17:48:29 +0100 Subject: [PATCH 17/75] Change opacity of inexisting archive type on public view --- archivebox/themes/default/base.html | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index ed7d1be9..61418673 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -223,6 +223,14 @@ .title-col a { color: black; } + + tr td a.exists-True { + opacity: 1; + } + tr td a.exists-False { + opacity: 0.1; + filter: grayscale(100%); + } From 8b0250caebb4ddaf46d4ad3721e403e7652cae43 Mon Sep 17 00:00:00 2001 From: JDC Date: Tue, 17 Nov 2020 08:36:03 -0500 Subject: [PATCH 18/75] Fixes 500 error on search The class SnapshotAdmin search_fields includes the tags ManyToMany field causing a django.core.exceptions.FieldError: Related Field got invalid lookup: icontains error. A related search field tags__name should be used. 
--- archivebox/core/admin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 20def353..b15507a4 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -86,7 +86,7 @@ class SnapshotAdmin(admin.ModelAdmin): list_display = ('added', 'title_str', 'url_str', 'files', 'size') sort_fields = ('title_str', 'url_str', 'added') readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated') - search_fields = ('url', 'timestamp', 'title', 'tags') + search_fields = ['url', 'timestamp', 'title', 'tags__name'] fields = (*readonly_fields, 'title', 'tags') list_filter = ('added', 'updated', 'tags') ordering = ['-added'] From c0b4198eaf6288e8c0eb95fd75579867746d3695 Mon Sep 17 00:00:00 2001 From: Chris Meller Date: Tue, 17 Nov 2020 17:57:03 +0200 Subject: [PATCH 19/75] Create Docker Workflow config --- .github/workflows/docker.yml | 64 ++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 .github/workflows/docker.yml diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 00000000..44fdb741 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,64 @@ +name: Docker Buildx and Push + +on: + schedule: + # 00:00 UTC every day + - cron: '0 0 * * *' + workflow_dispatch: + push: + +jobs: + buildx: + runs-on: ubuntu-latest + steps: + - + name: Docker Login + uses: docker/login-action@v1 + with: + username: ${{ secrets.DockerUsername }} + password: ${{ secrets.DockerPassword }} + - + name: Checkout + uses: actions/checkout@v2 + - + name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - + name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + with: + version: latest + install: true + - + name: Builder instance name + run: echo ${{ steps.buildx.outputs.name }} + - + name: Available platforms + run: echo ${{ steps.buildx.outputs.platforms }} + - + 
name: Cache Docker layers + uses: actions/cache@v2 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx- + - + name: Build and push + id: docker_build + uses: docker/build-push-action@v2 + with: + context: ./ + file: ./Dockerfile + builder: ${{ steps.buildx.outputs.name }} + push: true + tags: | + ${{ secrets.DockerUsername }}/${{ secrets.DockerRepository }}:latest + ${{ secrets.DockerUsername }}/${{ secrets.DockerRepository }}:${{ github.sha }} + cache-from: type=local,src=/tmp/.buildx-cache + cache-to: type=local,dest=/tmp/.buildx-cache + platforms: linux/arm64,linux/arm/v7 + - + name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} From 244e296652fbcac0431be7d5005e889fa12d6f8b Mon Sep 17 00:00:00 2001 From: Chris Meller Date: Tue, 17 Nov 2020 18:03:30 +0200 Subject: [PATCH 20/75] Remove Docker Workflow's scheduled trigger. --- .github/workflows/docker.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 44fdb741..0fd94764 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,9 +1,6 @@ name: Docker Buildx and Push on: - schedule: - # 00:00 UTC every day - - cron: '0 0 * * *' workflow_dispatch: push: From b11d5624452b3c047c6e342d31bcaad576b30696 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Nov 2020 12:33:15 -0500 Subject: [PATCH 21/75] fix splitting on multiple equals in val --- archivebox/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/main.py b/archivebox/main.py index e27dff96..4696b619 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -865,7 +865,7 @@ def config(config_options_str: Optional[str]=None, stderr(f' {line}') raise SystemExit(2) - raw_key, val = line.split('=') + raw_key, val = line.split('=', 1) raw_key = raw_key.upper().strip() key = get_real_name(raw_key) if key != raw_key: From 
4c3c6154f6def5347c432eee81a5c01b40377a6a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Nov 2020 18:01:39 -0500 Subject: [PATCH 22/75] update secrets names --- .github/workflows/docker.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 0fd94764..c737fca6 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -12,8 +12,8 @@ jobs: name: Docker Login uses: docker/login-action@v1 with: - username: ${{ secrets.DockerUsername }} - password: ${{ secrets.DockerPassword }} + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} - name: Checkout uses: actions/checkout@v2 @@ -51,8 +51,8 @@ jobs: builder: ${{ steps.buildx.outputs.name }} push: true tags: | - ${{ secrets.DockerUsername }}/${{ secrets.DockerRepository }}:latest - ${{ secrets.DockerUsername }}/${{ secrets.DockerRepository }}:${{ github.sha }} + ${{ secrets.DOCKER_USERNAME }}/archivebox:latest + ${{ secrets.DOCKER_USERNAME }}/archivebox:${{ github.sha }} cache-from: type=local,src=/tmp/.buildx-cache cache-to: type=local,dest=/tmp/.buildx-cache platforms: linux/arm64,linux/arm/v7 From bb21198239d0b7556fd05f32b82c354a02ebf903 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Nov 2020 18:38:38 -0500 Subject: [PATCH 23/75] rename workflow --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f7233a7d..8b26eca6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: Test workflow +name: 'Lint, Test, and Build' on: [push] env: From e4fa56e55a849715394ea02b2c0a95b572b509be Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Nov 2020 18:41:22 -0500 Subject: [PATCH 24/75] fix indentation and workflow name --- .github/workflows/docker.yml | 33 ++++++++++++--------------------- 1 file changed, 12 
insertions(+), 21 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index c737fca6..67976b93 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,4 +1,4 @@ -name: Docker Buildx and Push +name: Docker Push on: workflow_dispatch: @@ -8,41 +8,33 @@ jobs: buildx: runs-on: ubuntu-latest steps: - - - name: Docker Login - uses: docker/login-action@v1 - with: + - name: Docker Login + uses: docker/login-action@v1 + with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - - name: Checkout + - name: Checkout uses: actions/checkout@v2 - - - name: Set up QEMU + - name: Set up QEMU uses: docker/setup-qemu-action@v1 - - - name: Set up Docker Buildx + - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v1 with: version: latest install: true - - - name: Builder instance name + - name: Builder instance name run: echo ${{ steps.buildx.outputs.name }} - - - name: Available platforms + - name: Available platforms run: echo ${{ steps.buildx.outputs.platforms }} - - - name: Cache Docker layers + - name: Cache Docker layers uses: actions/cache@v2 with: path: /tmp/.buildx-cache key: ${{ runner.os }}-buildx-${{ github.sha }} restore-keys: | ${{ runner.os }}-buildx- - - - name: Build and push + - name: Build and push id: docker_build uses: docker/build-push-action@v2 with: @@ -56,6 +48,5 @@ jobs: cache-from: type=local,src=/tmp/.buildx-cache cache-to: type=local,dest=/tmp/.buildx-cache platforms: linux/arm64,linux/arm/v7 - - - name: Image digest + - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} From 43bc59b3d52a4db4451ea41a00e9ca76afa835af Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Nov 2020 18:52:13 -0500 Subject: [PATCH 25/75] add plain non-user-scoped tag to docker image --- .github/workflows/docker.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 
67976b93..9ecd31a2 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -43,6 +43,7 @@ jobs: builder: ${{ steps.buildx.outputs.name }} push: true tags: | + archivebox:latest ${{ secrets.DOCKER_USERNAME }}/archivebox:latest ${{ secrets.DOCKER_USERNAME }}/archivebox:${{ github.sha }} cache-from: type=local,src=/tmp/.buildx-cache From 75d4125e3e48fed04a0075b46ea7c7ab9b2f92ff Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 22 Nov 2020 19:23:42 -0500 Subject: [PATCH 26/75] wishful thinking, not allowed to push userless tags --- .github/workflows/docker.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9ecd31a2..67976b93 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -43,7 +43,6 @@ jobs: builder: ${{ steps.buildx.outputs.name }} push: true tags: | - archivebox:latest ${{ secrets.DOCKER_USERNAME }}/archivebox:latest ${{ secrets.DOCKER_USERNAME }}/archivebox:${{ github.sha }} cache-from: type=local,src=/tmp/.buildx-cache From 5b15c8e29e8a43b578c55756f7e53d1851b96b85 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 00:59:50 -0500 Subject: [PATCH 27/75] silence usermod on start --- bin/docker_entrypoint.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh index c70d7f27..29fcb646 100755 --- a/bin/docker_entrypoint.sh +++ b/bin/docker_entrypoint.sh @@ -9,8 +9,8 @@ GRID=$(stat --format="%g" "$DATA_DIR") # If user is not root, modify the archivebox user+files to have the same uid,gid if [[ "$USID" != 0 && "$GRID" != 0 ]]; then - usermod -u "$USID" "$ARCHIVEBOX_USER" - groupmod -g "$GRID" "$ARCHIVEBOX_USER" + usermod -u "$USID" "$ARCHIVEBOX_USER" > /dev/null 2>&1 + groupmod -g "$GRID" "$ARCHIVEBOX_USER" > /dev/null 2>&1 chown -R "$USID":"$GRID" "/home/$ARCHIVEBOX_USER" chown "$USID":"$GRID" "$DATA_DIR" chown "$USID":"$GRID" "$DATA_DIR/*" > /dev/null 2>&1 
|| true From d97fc6b16c70682b5536022fcfd5cbee606c681c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 02:00:02 -0500 Subject: [PATCH 28/75] push builds to new docker org --- .github/workflows/docker.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 67976b93..2a4ebf26 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -45,6 +45,8 @@ jobs: tags: | ${{ secrets.DOCKER_USERNAME }}/archivebox:latest ${{ secrets.DOCKER_USERNAME }}/archivebox:${{ github.sha }} + archivebox/archivebox:latest + archivebox/archivebox:${{ github.sha }} cache-from: type=local,src=/tmp/.buildx-cache cache-to: type=local,dest=/tmp/.buildx-cache platforms: linux/arm64,linux/arm/v7 From 0e2ccbc10d3ec8819a949f334f443f1d120f0e88 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 02:04:39 -0500 Subject: [PATCH 29/75] update urls to new repo path --- Dockerfile | 4 +- README.md | 116 +++++++++---------- archivebox.egg-info/PKG-INFO | 132 +++++++++++----------- archivebox/config.py | 26 ++--- archivebox/core/urls.py | 2 +- archivebox/index/json.py | 6 +- archivebox/logging_util.py | 2 +- archivebox/main.py | 4 +- archivebox/themes/default/base.html | 6 +- archivebox/themes/default/main_index.html | 6 +- archivebox/themes/legacy/main_index.html | 8 +- bin/setup.sh | 6 +- docker-compose.yml | 2 +- etc/ArchiveBox.conf.default | 2 +- setup.py | 2 +- tests/test_title.py | 2 +- 16 files changed, 163 insertions(+), 163 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5f16e658..a9b3c639 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,8 +12,8 @@ FROM python:3.8-slim-buster LABEL name="archivebox" \ maintainer="Nick Sweeting " \ description="All-in-one personal internet archiving container" \ - homepage="https://github.com/pirate/ArchiveBox" \ - documentation="https://github.com/pirate/ArchiveBox/wiki/Docker#docker" + homepage="https://github.com/ArchiveBox/ArchiveBox" \ 
+ documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker" # System-level base config ENV TZ=UTC \ diff --git a/README.md b/README.md index 467d5075..230a25e9 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,13 @@

ArchiveBox
The open-source self-hosted web archive.

-▶️ Quickstart | +▶️ Quickstart | Demo | -Github | -Documentation | +Github | +Documentation | Info & Motivation | -Community | -Roadmap +Community | +Roadmap
 "Your own personal internet archive" (网站存档 / 爬虫)
@@ -16,11 +16,11 @@
 
 
 
-
-
-
+
+
+
 
-
+
 
 
 
@@ -48,7 +48,7 @@ open http://127.0.0.1:8000/admin/login/ # then click "Add" in the navbar
[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io) -For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs. +For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs.
--- @@ -76,7 +76,7 @@ At the end of the day, the goal is to sleep soundly knowing that the part of the Desktop index screenshot Desktop details page Screenshot Desktop details page Screenshot
-Demo | Usage | Screenshots +Demo | Usage | Screenshots
. . . . . . . . . . . . . . . . . . . . . . . . . . . .
@@ -84,16 +84,16 @@ At the end of the day, the goal is to sleep soundly knowing that the part of the ## Key Features -- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally -- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) -- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) -- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources** +- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally +- [**Few dependencies**](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) +- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) +- Easy to set up **[scheduled importing](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources** - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC -- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes) +- ~~**Suitable for paywalled / [authenticated 
content](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes) - **Doesn't require a constantly-running daemon**, proxy, or native app - Provides a CLI, Python API, self-hosted web UI, and REST API (WIP) -- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc. -- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy +- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/ArchiveBox/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/ArchiveBox/ArchiveBox/issues/80), [close modals](https://github.com/ArchiveBox/ArchiveBox/issues/175), expand comment threads, etc. +- Can also [**mirror content to 3rd-party archiving services**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy ## Input formats @@ -112,7 +112,7 @@ archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12' - RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format - Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more -See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. +See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. 
It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly. @@ -137,15 +137,15 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links -- _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._ +- _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._ -It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file. +It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file. ## Dependencies -You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/pirate/ArchiveBox/wiki/Docker) with everything preinstalled. +You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) with everything preinstalled. 
-If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/pirate/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/pirate/ArchiveBox/wiki/Install). +If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install). ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), and `nodejs` (for readability and singlefile), and more. @@ -163,7 +163,7 @@ archivebox config --set SAVE_FAVICON=False # optional: only the domain is leake archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google ``` -Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. +Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. 
```bash # visiting an archived page with malicious JS: https://127.0.0.1:8000/archive/1602401954/example.com/index.html @@ -174,7 +174,7 @@ https://127.0.0.1:8000/archive/* # then example.com/index.js can send it off to some evil server ``` -Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash: +Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash: ```bash archivebox add 'https://example.com#2020-10-24' ... @@ -196,7 +196,7 @@ a headless browser runtime, a full webserver, and CLI interface. # docker-compose run archivebox [args] mkdir archivebox && cd archivebox -wget 'https://raw.githubusercontent.com/pirate/ArchiveBox/master/docker-compose.yml' +wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml' docker-compose run archivebox init docker-compose run archivebox add 'https://example.com' docker-compose run archivebox manage createsuperuser @@ -250,7 +250,7 @@ python3 -m venv .venv && source .venv/bin/activate pip install --upgrade archivebox # Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer) -npm install --prefix . 'git+https://github.com/pirate/ArchiveBox.git' +npm install --prefix . 
'git+https://github.com/ArchiveBox/ArchiveBox.git' ``` Initialize your archive and add some links: @@ -314,13 +314,13 @@ All the archived links are stored by date bookmarked in `./archive/`, ## Comparison to Other Projects -▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** +▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. #### User Interface & Intended Purpose -ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend. +ArchiveBox differentiates itself from [similar projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend. 
#### Private Local Archives vs Centralized Public Archives @@ -336,16 +336,16 @@ Whether you want to learn which organizations are the big players in the web arc -- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) - - [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists) +- [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) + - [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists) _Community-maintained indexes of archiving tools and institutions._ - - [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) + - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) _Open source tools and projects in the internet archiving space._ - - [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List) + - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Reading-List) _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._ - - [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities) + - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Communities) _A collection of the most active internet archiving communities and initiatives._ -- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog) +- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" 
blog post. - Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. @@ -355,51 +355,51 @@ Whether you want to learn which organizations are the big players in the web arc -We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation. +We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation. -You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder. +You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder. ## Getting Started -- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart) -- [Install](https://github.com/pirate/ArchiveBox/wiki/Install) -- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) +- [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) +- [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install) +- [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) ## Reference -- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage) -- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) -- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) -- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site) -- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) -- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive) -- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium) -- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview) -- 
[Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting) +- [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage) +- [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) +- [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) +- [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site) +- [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) +- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) +- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium) +- [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview) +- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) - [Python API](https://docs.archivebox.io/en/latest/modules.html) - REST API (coming soon...) ## More Info -- [Tickets](https://github.com/pirate/ArchiveBox/issues) -- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) -- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog) -- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations) -- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation) -- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) +- [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues) +- [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) +- [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) +- [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) +- [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation) +- [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) --- # ArchiveBox Development -All contributions to ArchiveBox are welcomed! 
Check our [issues](https://github.com/pirate/ArchiveBox/issues) and [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap. +All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap. ### Setup the dev environment First, install the system dependencies from the "Bare Metal" section above. Then you can clone the ArchiveBox repo and install ```python3 -git clone https://github.com/pirate/ArchiveBox +git clone https://github.com/ArchiveBox/ArchiveBox cd ArchiveBox git checkout master # or the branch you want to test git pull @@ -480,7 +480,7 @@ You can also run all these in Docker. For more examples see the Github Actions C
- +

diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index a74994ea..c114eea4 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -2,28 +2,28 @@ Metadata-Version: 2.1 Name: archivebox Version: 0.4.21 Summary: The self-hosted internet archive. -Home-page: https://github.com/pirate/ArchiveBox +Home-page: https://github.com/ArchiveBox/ArchiveBox Author: Nick Sweeting Author-email: git@nicksweeting.com License: MIT -Project-URL: Source, https://github.com/pirate/ArchiveBox -Project-URL: Documentation, https://github.com/pirate/ArchiveBox/wiki -Project-URL: Bug Tracker, https://github.com/pirate/ArchiveBox/issues -Project-URL: Changelog, https://github.com/pirate/ArchiveBox/wiki/Changelog -Project-URL: Roadmap, https://github.com/pirate/ArchiveBox/wiki/Roadmap -Project-URL: Community, https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community -Project-URL: Donate, https://github.com/pirate/ArchiveBox/wiki/Donations +Project-URL: Source, https://github.com/ArchiveBox/ArchiveBox +Project-URL: Documentation, https://github.com/ArchiveBox/ArchiveBox/wiki +Project-URL: Bug Tracker, https://github.com/ArchiveBox/ArchiveBox/issues +Project-URL: Changelog, https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog +Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap +Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community +Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations Description:

ArchiveBox
The open-source self-hosted web archive.

- ▶️ Quickstart | + ▶️ Quickstart | Demo | - Github | - Documentation | + Github | + Documentation | Info & Motivation | - Community | - Roadmap + Community | + Roadmap
         "Your own personal internet archive" (网站存档 / 爬虫)
@@ -31,11 +31,11 @@ Description: 
- - - + + + - +
@@ -63,7 +63,7 @@ Description:

[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io) - For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs. + For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs.
--- @@ -91,7 +91,7 @@ Description:
Desktop index screenshot Desktop details page Screenshot Desktop details page Screenshot
- Demo | Usage | Screenshots + Demo | Usage | Screenshots
. . . . . . . . . . . . . . . . . . . . . . . . . . . .

@@ -99,16 +99,16 @@ Description:
## Key Features - - [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally - - [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) - - [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) - - Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources** + - [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally + - [**Few dependencies**](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) + - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) + - Easy to set up **[scheduled importing](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources** - Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC - - ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes) + - ~~**Suitable for paywalled / [authenticated content](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ 
(do not do this until v0.5 is released with some security fixes) - **Doesn't require a constantly-running daemon**, proxy, or native app - Provides a CLI, Python API, self-hosted web UI, and REST API (WIP) - - Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc. - - Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy + - Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/ArchiveBox/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/ArchiveBox/ArchiveBox/issues/80), [close modals](https://github.com/ArchiveBox/ArchiveBox/issues/175), expand comment threads, etc. + - Can also [**mirror content to 3rd-party archiving services**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy ## Input formats @@ -127,7 +127,7 @@ Description:
- RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format - Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more - See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. + See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly. @@ -152,15 +152,15 @@ Description:
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links - - _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._ + - _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._ - It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file. + It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file. ## Dependencies - You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/pirate/ArchiveBox/wiki/Docker) with everything preinstalled. + You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) with everything preinstalled. - If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/pirate/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/pirate/ArchiveBox/wiki/Install). 
+ If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install). ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), and `nodejs` (for readability and singlefile), and more. @@ -178,7 +178,7 @@ Description:
archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google ``` - Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. + Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. ```bash # visiting an archived page with malicious JS: https://127.0.0.1:8000/archive/1602401954/example.com/index.html @@ -189,7 +189,7 @@ Description:
# then example.com/index.js can send it off to some evil server ``` - Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash: + Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash: ```bash archivebox add 'https://example.com#2020-10-24' ... @@ -211,7 +211,7 @@ Description:
# docker-compose run archivebox [args] mkdir archivebox && cd archivebox - wget 'https://raw.githubusercontent.com/pirate/ArchiveBox/master/docker-compose.yml' + wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml' docker-compose run archivebox init docker-compose run archivebox add 'https://example.com' docker-compose run archivebox manage createsuperuser @@ -265,7 +265,7 @@ Description:
pip install --upgrade archivebox # Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer) - npm install --prefix . 'git+https://github.com/pirate/ArchiveBox.git' + npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git' ``` Initialize your archive and add some links: @@ -329,13 +329,13 @@ Description:
## Comparison to Other Projects - ▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** + ▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. #### User Interface & Intended Purpose - ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend. + ArchiveBox differentiates itself from [similar projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend. #### Private Local Archives vs Centralized Public Archives @@ -351,16 +351,16 @@ Description:
- - [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) - - [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists) + - [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) + - [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists) _Community-maintained indexes of archiving tools and institutions._ - - [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) + - [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) _Open source tools and projects in the internet archiving space._ - - [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List) + - [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Reading-List) _Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._ - - [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities) + - [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Communities) _A collection of the most active internet archiving communities and initiatives._ - - Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog) + - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. - Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. @@ -370,51 +370,51 @@ Description:
- We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation. + We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation. - You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder. + You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder. ## Getting Started - - [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart) - - [Install](https://github.com/pirate/ArchiveBox/wiki/Install) - - [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) + - [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) + - [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install) + - [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) ## Reference - - [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage) - - [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) - - [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) - - [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site) - - [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) - - [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive) - - [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium) - - [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview) - - [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting) + - [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage) + - [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) + - 
[Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) + - [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site) + - [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) + - [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) + - [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium) + - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview) + - [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) - [Python API](https://docs.archivebox.io/en/latest/modules.html) - REST API (coming soon...) ## More Info - - [Tickets](https://github.com/pirate/ArchiveBox/issues) - - [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) - - [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog) - - [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations) - - [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation) - - [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) + - [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues) + - [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) + - [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) + - [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations) + - [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation) + - [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) --- # ArchiveBox Development - All contributions to ArchiveBox are welcomed! 
Check our [issues](https://github.com/pirate/ArchiveBox/issues) and [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap. + All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap. ### Setup the dev environment First, install the system dependencies from the "Bare Metal" section above. Then you can clone the ArchiveBox repo and install ```python3 - git clone https://github.com/pirate/ArchiveBox + git clone https://github.com/ArchiveBox/ArchiveBox cd ArchiveBox git checkout master # or the branch you want to test git pull @@ -495,7 +495,7 @@ Description:

- +

diff --git a/archivebox/config.py b/archivebox/config.py index 38ed5019..d321dd72 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -36,7 +36,7 @@ from .config_stubs import ( # # ****************************************************************************** -# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration +# Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration # Use the 'env' command to pass config options to ArchiveBox. e.g.: # env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html # ****************************************************************************** @@ -98,8 +98,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'}, 'CHECK_SSL_VALIDITY': {'type': bool, 'default': True}, - 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'}, - 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'}, + 'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'}, + 'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'}, 'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}, 'COOKIES_FILE': {'type': str, 'default': None}, @@ -248,7 +248,7 @@ CONFIG_HEADER = ( # archivebox init # # A list of all possible config with documentation and examples can be found here: -# https://github.com/pirate/ArchiveBox/wiki/Configuration +# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration """) @@ -505,7 +505,7 @@ def load_config(defaults: ConfigDefaultDict, stderr(' Check your config for mistakes and try again (your archive 
data is unaffected).') stderr() stderr(' For config documentation and examples see:') - stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration') stderr() raise raise SystemExit(2) @@ -565,7 +565,7 @@ def bin_version(binary: Optional[str]) -> Optional[str]: # stderr(f' {binary} --version') # stderr() # stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:') - # stderr(' https://github.com/pirate/ArchiveBox/wiki/Install') + # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install') return None def bin_path(binary: Optional[str]) -> Optional[str]: @@ -831,13 +831,13 @@ def check_system_config(config: ConfigDict=CONFIG) -> None: if config['USER'] == 'root': stderr('[!] ArchiveBox should never be run as root!', color='red') stderr(' For more information, see the security overview documentation:') - stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root') raise SystemExit(2) ### Check Python environment if sys.version_info[:3] < (3, 6, 0): stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red') - stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') + stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.') raise SystemExit(2) if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'): @@ -857,7 +857,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None: stderr(f' {config["CHROME_USER_DATA_DIR"]}') stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') stderr(' For more info see:') - stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') 
+ stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') if '/Default' in str(config['CHROME_USER_DATA_DIR']): stderr() stderr(' Try removing /Default from the end e.g.:') @@ -881,7 +881,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: ) ) if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'): - hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"', + hint(('npm install --prefix . "git+https://github.com/ArchiveBox/ArchiveBox.git"', f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning', ''), prefix=' ') stderr('') @@ -892,7 +892,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)') stderr() stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') - stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') stderr() elif config['USE_CHROME'] and config['TIMEOUT'] < 15: @@ -901,7 +901,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)') stderr() stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') - stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') stderr() if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20: @@ -910,7 +910,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None: stderr(' (Setting it somewhere over 60 seconds is recommended)') stderr() stderr(' If you want to disable media archiving entirely, 
set SAVE_MEDIA=False instead:') - stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media') + stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') stderr() def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None: diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index e11653fd..b8e4bafb 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -14,7 +14,7 @@ urlpatterns = [ path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}), path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}), - path('docs/', RedirectView.as_view(url='https://github.com/pirate/ArchiveBox/wiki'), name='Docs'), + path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'), path('archive/', RedirectView.as_view(url='/')), path('archive/', LinkDetails.as_view(), name='LinkAssets'), diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 36c5ccdb..1c3ce6e8 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -32,9 +32,9 @@ MAIN_INDEX_HEADER = { 'version': VERSION, 'git_sha': GIT_SHA, 'website': 'https://ArchiveBox.io', - 'docs': 'https://github.com/pirate/ArchiveBox/wiki', - 'source': 'https://github.com/pirate/ArchiveBox', - 'issues': 'https://github.com/pirate/ArchiveBox/issues', + 'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki', + 'source': 'https://github.com/ArchiveBox/ArchiveBox', + 'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues', 'dependencies': DEPENDENCIES, }, } diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index 1c92eba2..aa4659f0 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -447,7 +447,7 @@ def log_shell_welcome_msg(): print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI)) print() print('[i] Welcome to 
the ArchiveBox Shell!') - print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage') + print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage') print() print(' {lightred}Hint:{reset} Example use:'.format(**ANSI)) print(' print(Snapshot.objects.filter(is_archived=True).count())') diff --git a/archivebox/main.py b/archivebox/main.py index 4696b619..26f05427 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -178,7 +178,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None: archivebox update --resume=15109948213.123 {lightred}Documentation:{reset} - https://github.com/pirate/ArchiveBox/wiki + https://github.com/ArchiveBox/ArchiveBox/wiki '''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI)) else: @@ -197,7 +197,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None: print(' 2. archivebox init') print() print('For more information, see the documentation here:') - print(' https://github.com/pirate/ArchiveBox/wiki') + print(' https://github.com/ArchiveBox/ArchiveBox/wiki') @enforce_types diff --git a/archivebox/themes/default/base.html b/archivebox/themes/default/base.html index 61418673..4a5a76c6 100644 --- a/archivebox/themes/default/base.html +++ b/archivebox/themes/default/base.html @@ -268,7 +268,7 @@
Add Links   |   Admin   |   - Docs + Docs
@@ -280,7 +280,7 @@
- Archive created using ArchiveBox   | + Archive created using ArchiveBox   |   Download index as JSON

@@ -291,4 +291,4 @@ - \ No newline at end of file + diff --git a/archivebox/themes/default/main_index.html b/archivebox/themes/default/main_index.html index e587ff75..d5135688 100644 --- a/archivebox/themes/default/main_index.html +++ b/archivebox/themes/default/main_index.html @@ -223,7 +223,7 @@
Add Links   |   Admin   |   - Docs + Docs
@@ -266,8 +266,8 @@
- Archive created using ArchiveBox - version v{{VERSION}}   |   + Archive created using ArchiveBox + version v{{VERSION}}   |   Download index as JSON

{{FOOTER_INFO}} diff --git a/archivebox/themes/legacy/main_index.html b/archivebox/themes/legacy/main_index.html index e246b0d9..2ed6dfaa 100644 --- a/archivebox/themes/legacy/main_index.html +++ b/archivebox/themes/legacy/main_index.html @@ -187,8 +187,8 @@
- Documentation   |   - Source   |   + Documentation   |   + Source   |   Website
@@ -209,8 +209,8 @@
- Archive created using ArchiveBox - version v$version   |   + Archive created using ArchiveBox + version v$version   |   Download index as JSON

$footer_info diff --git a/bin/setup.sh b/bin/setup.sh index fed1bd5a..e87c9571 100755 --- a/bin/setup.sh +++ b/bin/setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # ArchiveBox Setup Script # Nick Sweeting 2017 | MIT License -# https://github.com/pirate/ArchiveBox +# https://github.com/ArchiveBox/ArchiveBox echo "[i] ArchiveBox Setup Script 📦" echo "" @@ -16,7 +16,7 @@ echo " - youtube-dl" echo " - chromium-browser (skip this if Chrome/Chromium is already installed)" echo "" echo " If you'd rather install these manually, you can find documentation here:" -echo " https://github.com/pirate/ArchiveBox/wiki/Install" +echo " https://github.com/ArchiveBox/ArchiveBox/wiki/Install" echo "" echo "Press enter to continue with the automatic install, or Ctrl+C to cancel..." read @@ -112,5 +112,5 @@ echo "---------------------------------------------------" echo "[X] Failed to install some dependencies! ‼️" echo " - Try the Manual Setup instructions in the README.md" echo " - Try the Troubleshooting: Dependencies instructions in the README.md" -echo " - Open an issue on github to get help: https://github.com/pirate/ArchiveBox/issues" +echo " - Open an issue on github to get help: https://github.com/ArchiveBox/ArchiveBox/issues" exit 1 diff --git a/docker-compose.yml b/docker-compose.yml index f9a75748..4eb45384 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -5,7 +5,7 @@ # docker-compose run archivebox add --depth=1 https://example.com/some/feed.rss # docker-compose run archivebox config --set PUBLIC_INDEX=True # Documentation: -# https://github.com/pirate/ArchiveBox/wiki/Docker#docker-compose +# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose version: '3.7' diff --git a/etc/ArchiveBox.conf.default b/etc/ArchiveBox.conf.default index 9b014083..fe3bcdde 100644 --- a/etc/ArchiveBox.conf.default +++ b/etc/ArchiveBox.conf.default @@ -4,7 +4,7 @@ # DO NOT EDIT THIS FILE DIRECTLY! # # See the list of all the possible options. 
documentation, and examples here: -# https://github.com/pirate/ArchiveBox/wiki/Configuration +# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration [GENERAL_CONFIG] # OUTPUT_PERMISSIONS = 755 diff --git a/setup.py b/setup.py index cdec8133..af643c9e 100755 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ DESCRIPTION = "The self-hosted internet archive." LICENSE = "MIT" AUTHOR = "Nick Sweeting" AUTHOR_EMAIL="git@nicksweeting.com" -REPO_URL = "https://github.com/pirate/ArchiveBox" +REPO_URL = "https://github.com/ArchiveBox/ArchiveBox" PROJECT_URLS = { "Source": f"{REPO_URL}", "Documentation": f"{REPO_URL}/wiki", diff --git a/tests/test_title.py b/tests/test_title.py index 334fb9c3..89904e89 100644 --- a/tests/test_title.py +++ b/tests/test_title.py @@ -5,7 +5,7 @@ from .fixtures import * def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict): """ - https://github.com/pirate/ArchiveBox/issues/330 + https://github.com/ArchiveBox/ArchiveBox/issues/330 Unencoded content should not be rendered as it facilitates xss injections and breaks the layout. 
""" From ebcb05957ed4d95b73b93b0d32d2b35e6be3de79 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 02:06:43 -0500 Subject: [PATCH 30/75] bump docs --- docs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs b/docs index c90af04d..6c54fca6 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit c90af04d27c1d4b77a97f700beb7676ef3703ef0 +Subproject commit 6c54fca67e07ea25682df5a4a3c500a1a96e1332 From cc3579f70fd9baa92542fe0df6d7caeb0d88600e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 02:20:01 -0500 Subject: [PATCH 31/75] update docker org urls in docs and scripts --- README.md | 22 +++++++++++----------- archivebox.egg-info/PKG-INFO | 22 +++++++++++----------- bin/build_docker.sh | 2 ++ bin/release.sh | 1 + docker-compose.yml | 4 ++-- 5 files changed, 27 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 230a25e9..6e590e07 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ - +
@@ -36,9 +36,9 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot #### Quickstart ```bash -docker run -d -it -v ~/archivebox:/data -p 8000:8000 nikisweeting/archivebox server --init 0.0.0.0:8000 -docker run -v ~/archivebox:/data -it nikisweeting/archivebox manage createsuperuser -docker run -v ~/archivebox:/data -it nikisweeting/archivebox add 'https://example.com' +docker run -d -it -v ~/archivebox:/data -p 8000:8000 archivebox/archivebox server --init 0.0.0.0:8000 +docker run -v ~/archivebox:/data -it archivebox/archivebox manage createsuperuser +docker run -v ~/archivebox:/data -it archivebox/archivebox add 'https://example.com' open http://127.0.0.1:8000/admin/login/ # then click "Add" in the navbar ``` @@ -207,20 +207,20 @@ open http://127.0.0.1:8000 ## Docker ```bash -# docker run -v $PWD:/data -it nikisweeting/archivebox [args] +# docker run -v $PWD:/data -it archivebox/archivebox [args] mkdir archivebox && cd archivebox -docker run -v $PWD:/data -it nikisweeting/archivebox init -docker run -v $PWD:/data -it nikisweeting/archivebox add 'https://example.com' -docker run -v $PWD:/data -it nikisweeting/archivebox manage createsuperuser +docker run -v $PWD:/data -it archivebox/archivebox init +docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' +docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # run the webserver to access the web UI -docker run -v $PWD:/data -it -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000 +docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 # or export a static version of the index if you dont want to run a server -docker run -v $PWD:/data -it nikisweeting/archivebox list --html --with-headers > index.html -docker run -v $PWD:/data -it nikisweeting/archivebox list --json --with-headers > index.json +docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html 
+docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json open ./index.html ``` diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index c114eea4..eb80f48d 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -36,7 +36,7 @@ Description:
- +
@@ -51,9 +51,9 @@ Description:
#### Quickstart ```bash - docker run -d -it -v ~/archivebox:/data -p 8000:8000 nikisweeting/archivebox server --init 0.0.0.0:8000 - docker run -v ~/archivebox:/data -it nikisweeting/archivebox manage createsuperuser - docker run -v ~/archivebox:/data -it nikisweeting/archivebox add 'https://example.com' + docker run -d -it -v ~/archivebox:/data -p 8000:8000 archivebox/archivebox server --init 0.0.0.0:8000 + docker run -v ~/archivebox:/data -it archivebox/archivebox manage createsuperuser + docker run -v ~/archivebox:/data -it archivebox/archivebox add 'https://example.com' open http://127.0.0.1:8000/admin/login/ # then click "Add" in the navbar ``` @@ -222,20 +222,20 @@ Description:
## Docker ```bash - # docker run -v $PWD:/data -it nikisweeting/archivebox [args] + # docker run -v $PWD:/data -it archivebox/archivebox [args] mkdir archivebox && cd archivebox - docker run -v $PWD:/data -it nikisweeting/archivebox init - docker run -v $PWD:/data -it nikisweeting/archivebox add 'https://example.com' - docker run -v $PWD:/data -it nikisweeting/archivebox manage createsuperuser + docker run -v $PWD:/data -it archivebox/archivebox init + docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com' + docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # run the webserver to access the web UI - docker run -v $PWD:/data -it -p 8000:8000 nikisweeting/archivebox server 0.0.0.0:8000 + docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 # or export a static version of the index if you dont want to run a server - docker run -v $PWD:/data -it nikisweeting/archivebox list --html --with-headers > index.html - docker run -v $PWD:/data -it nikisweeting/archivebox list --json --with-headers > index.json + docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html + docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json open ./index.html ``` diff --git a/bin/build_docker.sh b/bin/build_docker.sh index 8e4394c8..025fe350 100755 --- a/bin/build_docker.sh +++ b/bin/build_docker.sh @@ -21,5 +21,7 @@ docker build . 
-t archivebox \ -t archivebox:$VERSION \ -t docker.io/nikisweeting/archivebox:latest \ -t docker.io/nikisweeting/archivebox:$VERSION \ + -t docker.io/archivebox/archivebox:latest \ + -t docker.io/archivebox/archivebox:$VERSION \ -t docker.pkg.github.com/pirate/archivebox/archivebox:latest \ -t docker.pkg.github.com/pirate/archivebox/archivebox:$VERSION diff --git a/bin/release.sh b/bin/release.sh index 10d51424..f01eb1d3 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -68,6 +68,7 @@ echo "[^] Uploading docker image" # docker login --username=nikisweeting # docker login docker.pkg.github.com --username=pirate docker push docker.io/nikisweeting/archivebox +docker push docker.io/archivebox/archivebox docker push docker.pkg.github.com/pirate/archivebox/archivebox echo "[√] Done. Published version v$NEW_VERSION" diff --git a/docker-compose.yml b/docker-compose.yml index 4eb45384..5fe91026 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ version: '3.7' services: archivebox: # build: . 
- image: ${DOCKER_IMAGE:-nikisweeting/archivebox:latest} + image: ${DOCKER_IMAGE:-archivebox/archivebox:latest} command: server 0.0.0.0:8000 stdin_open: true tty: true @@ -30,7 +30,7 @@ services: # Example: Run scheduled imports in a docker instead of using cron on the # host machine, add tasks and see more info with archivebox schedule --help # scheduler: - # image: nikisweeting/archivebox:latest + # image: archivebox/archivebox:latest # command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all' # environment: # - USE_COLOR=True From e2d5b09a1a92d30121b124870254e0f6db971e32 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 02:20:41 -0500 Subject: [PATCH 32/75] update org urls in docs --- docs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs b/docs index 6c54fca6..798e00a3 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit 6c54fca67e07ea25682df5a4a3c500a1a96e1332 +Subproject commit 798e00a3a8f6a1633ca64cb0de530c5785dc2ccd From a00b64ad5e19e4ad462d5dd27810d645fb5ad94f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 02:57:38 -0500 Subject: [PATCH 33/75] update twitter urls --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e590e07..ad6839e0 100644 --- a/README.md +++ b/README.md @@ -347,7 +347,7 @@ Whether you want to learn which organizations are the big players in the web arc _A collection of the most active internet archiving communities and initiatives._ - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. 
-- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. +- Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. --- @@ -479,7 +479,7 @@ You can also run all these in Docker. For more examples see the Github Actions C
- +

From a74216291a77dafba3a74bc0e1cec90dce0fba85 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 03:16:38 -0500 Subject: [PATCH 34/75] update icon --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ad6839e0..ecc921c8 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- +

ArchiveBox
The open-source self-hosted web archive.

▶️ Quickstart | From ef2b5d4e5a42606434d739d5020cafd83357adba Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 03:42:11 -0500 Subject: [PATCH 35/75] update quickstart --- README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ecc921c8..1bd03307 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) or [`pip3`](https://wiki.python.org/moin/BeginnersGuide/Download). +ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. @@ -36,11 +36,19 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot #### Quickstart ```bash -docker run -d -it -v ~/archivebox:/data -p 8000:8000 archivebox/archivebox server --init 0.0.0.0:8000 -docker run -v ~/archivebox:/data -it archivebox/archivebox manage createsuperuser -docker run -v ~/archivebox:/data -it archivebox/archivebox add 'https://example.com' +# 1. Create a folder to hold your ArchiveBox data +mkdir ~/archivebox && cd ~/archivebox +docker run -v $PWD:/data -it archivebox/archivebox init -open http://127.0.0.1:8000/admin/login/ # then click "Add" in the navbar +# 2. Archive some URLs to get started +docker run -v $PWD:/data -t archivebox/archivebox add https://example.com +docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://archivebox.io + +# 3. 
Then browse the Web UI or filesystem to see snapshots of the URLs you added +docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser +docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the server +open http://127.0.0.1:8000/ # open the interactive web UI +ls ./archive/*/index.html # or open the static indexes ```
From 696f32867b15abe775b7afc6205a71ccb42d4695 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 03:45:50 -0500 Subject: [PATCH 36/75] add supported OSs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1bd03307..5cbf733b 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). +ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. From 5af95c5aa6b127befd92af1403904550a8ce9061 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 03:54:25 -0500 Subject: [PATCH 37/75] better quickstart example url --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5cbf733b..daf1c0ce 100644 --- a/README.md +++ b/README.md @@ -36,19 +36,19 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot #### Quickstart ```bash -# 1. Create a folder to hold your ArchiveBox data +# 1. Create a folder somewhere to hold your ArchiveBox data mkdir ~/archivebox && cd ~/archivebox docker run -v $PWD:/data -it archivebox/archivebox init # 2. 
Archive some URLs to get started -docker run -v $PWD:/data -t archivebox/archivebox add https://example.com -docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://archivebox.io +docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox +docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com # 3. Then browse the Web UI or filesystem to see snapshots of the URLs you added -docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser -docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the server -open http://127.0.0.1:8000/ # open the interactive web UI -ls ./archive/*/index.html # or open the static indexes +docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # create an admin acct +docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the web server +open http://127.0.0.1:8000/ # open the interactive web UI +ls archive/*/index.html # or browse snapshots on disk ```
From 7c1beb70d7857c08c3bddc62d25f8c4e5bae9a31 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 03:57:52 -0500 Subject: [PATCH 38/75] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index daf1c0ce..780c025a 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ docker run -v $PWD:/data -it archivebox/archivebox init docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com -# 3. Then browse the Web UI or filesystem to see snapshots of the URLs you added +# 3. Then view the snapshots of the URLs you added via the self-hosted web UI docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # create an admin acct docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the web server -open http://127.0.0.1:8000/ # open the interactive web UI -ls archive/*/index.html # or browse snapshots on disk +open http://127.0.0.1:8000/ # open the interactive admin panel +ls archive/*/index.html # or just browse snapshots on disk ```
From a8c4df43221d5d87be1a8cb59957eb62102b8887 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 03:59:45 -0500 Subject: [PATCH 39/75] add link to desktop project --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 780c025a..3545ec85 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ archivebox server 0.0.0.0:8000 open http://127.0.0.1:8000 ``` -The CLI is considered "stable", and the ArchiveBox Python API and REST APIs are in "beta". +The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage. At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots. 
From 127efbabbbaeab12a89e56828ffaf65aa38e272d Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:08:15 -0500 Subject: [PATCH 40/75] Set theme jekyll-theme-tactile --- _config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_config.yml b/_config.yml index 2f7efbea..259a24e4 100644 --- a/_config.yml +++ b/_config.yml @@ -1 +1 @@ -theme: jekyll-theme-minimal \ No newline at end of file +theme: jekyll-theme-tactile \ No newline at end of file From dce47ea9660370fc5129a034c81e59265caedab7 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:09:35 -0500 Subject: [PATCH 41/75] Set theme jekyll-theme-merlot --- _config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_config.yml b/_config.yml index 259a24e4..c50ff38d 100644 --- a/_config.yml +++ b/_config.yml @@ -1 +1 @@ -theme: jekyll-theme-tactile \ No newline at end of file +theme: jekyll-theme-merlot \ No newline at end of file From 52fda6a728b5476bbd1d190078c0b96d0f4f5edf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:19:34 -0500 Subject: [PATCH 42/75] add css override --- assets/style.scss | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 assets/style.scss diff --git a/assets/style.scss b/assets/style.scss new file mode 100644 index 00000000..7c1714ec --- /dev/null +++ b/assets/style.scss @@ -0,0 +1,5 @@ +@import "{{ site.theme }}"; + +.shell { + width: 80%; +} From 7dbef2822ffa6840231034a1e58fdf25b7d1870a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:21:35 -0500 Subject: [PATCH 43/75] fix markdown link embed --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3545ec85..454b33a6 100644 --- a/README.md +++ b/README.md @@ -55,8 +55,8 @@ ls archive/*/index.html # or just browse
-[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io) -For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs. +DEMO: archivebox.zervice.io/ +For more information, see the full Quickstart guide, Usage, and Configuration docs.
--- From 3fd04e4cc94d20dee204e1c7cd034706d97fe632 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:24:42 -0500 Subject: [PATCH 44/75] move style file down a level --- assets/{ => css}/style.scss | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename assets/{ => css}/style.scss (100%) diff --git a/assets/style.scss b/assets/css/style.scss similarity index 100% rename from assets/style.scss rename to assets/css/style.scss From 2431202dff3e77cac87a986138c656a1c5c02ec4 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:26:04 -0500 Subject: [PATCH 45/75] specify width more directly --- assets/css/style.scss | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/css/style.scss b/assets/css/style.scss index 7c1714ec..5fea8f40 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -1,5 +1,5 @@ @import "{{ site.theme }}"; -.shell { +div.shell { width: 80%; } From 882d317ee8130f978a0586f24987d2206710bfa8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:29:57 -0500 Subject: [PATCH 46/75] fix scss file --- assets/css/style.scss | 3 +++ 1 file changed, 3 insertions(+) diff --git a/assets/css/style.scss b/assets/css/style.scss index 5fea8f40..1e552009 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -1,3 +1,6 @@ +--- +--- + @import "{{ site.theme }}"; div.shell { From 81a958f1e760ad04964f5be0a94fb7a10b8130e8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:37:51 -0500 Subject: [PATCH 47/75] css tweaks --- assets/css/style.scss | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/assets/css/style.scss b/assets/css/style.scss index 1e552009..2e5a27fa 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -5,4 +5,26 @@ div.shell { width: 80%; + max-width: 1300px; +} + +span.banner-fix { + width: 80%; + max-width: 1300px; +} + +header h1 { + background-color: #aa1f55; +} +header h2 { + background-color: #aa1f55; + 
font-family: 'Open Sans' Helvetica sans-serif; +} + +#main_content div[align=center] h1 { + display: none; +} + +#forkme_banner { + opacity: 0.1; } From dc50a4c0431ea408b9e25e641ca384d24b403320 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:41:39 -0500 Subject: [PATCH 48/75] padding and img tweaks --- assets/css/style.scss | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/assets/css/style.scss b/assets/css/style.scss index 2e5a27fa..e0332f13 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -15,15 +15,25 @@ span.banner-fix { header h1 { background-color: #aa1f55; + padding-bottom: 12px; } header h2 { background-color: #aa1f55; - font-family: 'Open Sans' Helvetica sans-serif; + font-family: 'Open Sans'; } #main_content div[align=center] h1 { display: none; } +#main_content div[align=center] img { + display: block; + margin-top: -83px; + border: 0px; + padding: 0px; + box-shadow: 4px 4px 4px rgba(0,0,0,0.1); + border-radius: 8px; + margin-bottom: 20px; +} #forkme_banner { opacity: 0.1; From 4f42dfc3dce176627bf87c94e97e726e80ee2415 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:45:06 -0500 Subject: [PATCH 49/75] style tweaks --- README.md | 2 +- assets/css/style.scss | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 454b33a6..03e98f51 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ ls archive/*/index.html # or just browse ```
- +logo
DEMO: archivebox.zervice.io/ diff --git a/assets/css/style.scss b/assets/css/style.scss index e0332f13..cc40e5fb 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -15,7 +15,8 @@ span.banner-fix { header h1 { background-color: #aa1f55; - padding-bottom: 12px; + padding-bottom: 15px; + font-weight: 200px; } header h2 { background-color: #aa1f55; @@ -25,7 +26,7 @@ header h2 { #main_content div[align=center] h1 { display: none; } -#main_content div[align=center] img { +#main_content div[align=center] img[alt=logo] { display: block; margin-top: -83px; border: 0px; From a307e42896189e7660d54b9b3d45520d44450cd8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:48:00 -0500 Subject: [PATCH 50/75] use em tag as selector for img --- README.md | 2 +- assets/css/style.scss | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 03e98f51..9b753ab8 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ ls archive/*/index.html # or just browse ```
-logo +logo
DEMO: archivebox.zervice.io/ diff --git a/assets/css/style.scss b/assets/css/style.scss index cc40e5fb..f3ce0ab9 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -26,13 +26,15 @@ header h2 { #main_content div[align=center] h1 { display: none; } -#main_content div[align=center] img[alt=logo] { +#main_content img { + box-shadow: 4px 4px 4px rgba(0,0,0,0.1); + border-radius: 8px; +} +#main_content em img { display: block; margin-top: -83px; border: 0px; padding: 0px; - box-shadow: 4px 4px 4px rgba(0,0,0,0.1); - border-radius: 8px; margin-bottom: 20px; } From 991bad0d9f1c6561bdaa6996770bce98452de375 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:48:31 -0500 Subject: [PATCH 51/75] remove image borders --- assets/css/style.scss | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/css/style.scss b/assets/css/style.scss index f3ce0ab9..9d3728d4 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -29,11 +29,11 @@ header h2 { #main_content img { box-shadow: 4px 4px 4px rgba(0,0,0,0.1); border-radius: 8px; + border: 0px; } #main_content em img { display: block; margin-top: -83px; - border: 0px; padding: 0px; margin-bottom: 20px; } From b47c6e6131eaada6b7514e3f5cbafd2ca3f55592 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:51:05 -0500 Subject: [PATCH 52/75] fix markdown misplaced --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9b753ab8..d6987626 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- +

ArchiveBox
The open-source self-hosted web archive.

▶️ Quickstart | @@ -52,7 +52,7 @@ ls archive/*/index.html # or just browse ```
-logo +
DEMO: archivebox.zervice.io/ From 640c9d96f68f9d3773400d5211f457ba75c07db8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:51:26 -0500 Subject: [PATCH 53/75] fix image vertical column alignment --- assets/css/style.scss | 1 + 1 file changed, 1 insertion(+) diff --git a/assets/css/style.scss b/assets/css/style.scss index 9d3728d4..554b24c4 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -30,6 +30,7 @@ header h2 { box-shadow: 4px 4px 4px rgba(0,0,0,0.1); border-radius: 8px; border: 0px; + vertical-align: top; } #main_content em img { display: block; From 07649c3aa6b2a7006db995fc0e6bfafa90dc284c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 04:52:54 -0500 Subject: [PATCH 54/75] more margin on comparison img --- README.md | 2 +- assets/css/style.scss | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d6987626..790d8a06 100644 --- a/README.md +++ b/README.md @@ -324,7 +324,7 @@ All the archived links are stored by date bookmarked in `./archive/`, ▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** - The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. +comparison The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. 
#### User Interface & Intended Purpose diff --git a/assets/css/style.scss b/assets/css/style.scss index 554b24c4..a4bd9890 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -39,6 +39,10 @@ header h2 { margin-bottom: 20px; } +#main_content img[alt=comparison] { + margin: 25px; +} + #forkme_banner { opacity: 0.1; } From f05ae7043fe137c47201eacd84e9bd57eada29dd Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 05:01:23 -0500 Subject: [PATCH 55/75] add social link to funding --- .github/FUNDING.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 766165b2..ff0edb0f 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,3 +1,3 @@ github: pirate patreon: theSquashSH -custom: ["https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"] +custom: ["https://twitter.com/ArchiveBoxApp", "https://paypal.me/NicholasSweeting", "https://www.blockchain.com/eth/address/0x5D4c34D4a121Fe08d1dDB7969F07550f2dB9f471", "https://www.blockchain.com/btc/address/1HuxXriPE2Bbnag3jJrqa3bkNHrs297dYH"] From 83693a5c039ca0077e5a145ec9d9bdebf9c7367e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 16:52:15 -0500 Subject: [PATCH 56/75] add packaging setup with stdeb for debian and apt vendor the base32_crockford lib add build script for debain packages --- .gitignore | 14 ++- archivebox.egg-info/PKG-INFO | 32 +++--- archivebox.egg-info/SOURCES.txt | 1 + archivebox/base32_crockford.py | 172 ++++++++++++++++++++++++++++++++ archivebox/util.py | 2 +- assets/css/style.scss | 2 + bin/build.sh | 1 + bin/build_deb.sh | 42 ++++++++ bin/release.sh | 6 +- icon.png | Bin 0 -> 10717 bytes package.json | 2 +- setup.py | 2 +- stdeb.cfg | 6 ++ 13 files changed, 263 insertions(+), 19 deletions(-) create mode 100644 archivebox/base32_crockford.py create 
mode 100755 bin/build_deb.sh create mode 100644 icon.png create mode 100644 stdeb.cfg diff --git a/.gitignore b/.gitignore index 884e1da4..68717afb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,13 +4,21 @@ __pycache__/ .mypy_cache/ +# Python and Node dependencies venv/ .venv/ .docker-venv/ - -build/ -dist/ node_modules/ +# Packaging artifacts +archivebox-*.tar.gz +build/ +deb_dist/ +dist/ + +# Data folders data/ +data1/ +data2/ +data3/ output/ diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index eb80f48d..3940b731 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -14,7 +14,7 @@ Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations Description:
- +

ArchiveBox
The open-source self-hosted web archive.

▶️ Quickstart | @@ -41,7 +41,7 @@ Description:

- ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) or [`pip3`](https://wiki.python.org/moin/BeginnersGuide/Download). + ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. @@ -51,19 +51,27 @@ Description:
#### Quickstart ```bash - docker run -d -it -v ~/archivebox:/data -p 8000:8000 archivebox/archivebox server --init 0.0.0.0:8000 - docker run -v ~/archivebox:/data -it archivebox/archivebox manage createsuperuser - docker run -v ~/archivebox:/data -it archivebox/archivebox add 'https://example.com' + # 1. Create a folder somewhere to hold your ArchiveBox data + mkdir ~/archivebox && cd ~/archivebox + docker run -v $PWD:/data -it archivebox/archivebox init - open http://127.0.0.1:8000/admin/login/ # then click "Add" in the navbar + # 2. Archive some URLs to get started + docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox + docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com + + # 3. Then view the snapshots of the URLs you added via the self-hosted web UI + docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # create an admin acct + docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the web server + open http://127.0.0.1:8000/ # open the interactive admin panel + ls archive/*/index.html # or just browse snapshots on disk ```

- [DEMO: archivebox.zervice.io/](https://archivebox.zervice.io) - For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs. + DEMO: archivebox.zervice.io/ + For more information, see the full Quickstart guide, Usage, and Configuration docs.
--- @@ -82,7 +90,7 @@ Description:
open http://127.0.0.1:8000 ``` - The CLI is considered "stable", and the ArchiveBox Python API and REST APIs are in "beta". + The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage. At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots. @@ -331,7 +339,7 @@ Description:
▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.** - The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. + comparison The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations. #### User Interface & Intended Purpose @@ -362,7 +370,7 @@ Description:
_A collection of the most active internet archiving communities and initiatives._ - Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog) - Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post. - - Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. + - Or reach out to me for questions and comments via [@ArchiveBoxApp](https://twitter.com/ArchiveBoxApp) or [@theSquashSH](https://twitter.com/thesquashSH) on Twitter. --- @@ -494,7 +502,7 @@ Description:

- +

diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt index eee55cc5..8f0d5d48 100644 --- a/archivebox.egg-info/SOURCES.txt +++ b/archivebox.egg-info/SOURCES.txt @@ -6,6 +6,7 @@ archivebox/LICENSE archivebox/README.md archivebox/__init__.py archivebox/__main__.py +archivebox/base32_crockford.py archivebox/config.py archivebox/config_stubs.py archivebox/logging_util.py diff --git a/archivebox/base32_crockford.py b/archivebox/base32_crockford.py new file mode 100644 index 00000000..bafb69b4 --- /dev/null +++ b/archivebox/base32_crockford.py @@ -0,0 +1,172 @@ +""" +base32-crockford +================ + +A Python module implementing the alternate base32 encoding as described +by Douglas Crockford at: http://www.crockford.com/wrmg/base32.html. + +He designed the encoding to: + + * Be human and machine readable + * Be compact + * Be error resistant + * Be pronounceable + +It uses a symbol set of 10 digits and 22 letters, excluding I, L O and +U. Decoding is not case sensitive, and 'i' and 'l' are converted to '1' +and 'o' is converted to '0'. Encoding uses only upper-case characters. + +Hyphens may be present in symbol strings to improve readability, and +are removed when decoding. + +A check symbol can be appended to a symbol string to detect errors +within the string. 
+ +""" + +import re +import sys + +PY3 = sys.version_info[0] == 3 + +if not PY3: + import string as str + + +__all__ = ["encode", "decode", "normalize"] + + +if PY3: + string_types = str, +else: + string_types = basestring, + +# The encoded symbol space does not include I, L, O or U +symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ' +# These five symbols are exclusively for checksum values +check_symbols = '*~$=U' + +encode_symbols = dict((i, ch) for (i, ch) in enumerate(symbols + check_symbols)) +decode_symbols = dict((ch, i) for (i, ch) in enumerate(symbols + check_symbols)) +normalize_symbols = str.maketrans('IiLlOo', '111100') +valid_symbols = re.compile('^[%s]+[%s]?$' % (symbols, + re.escape(check_symbols))) + +base = len(symbols) +check_base = len(symbols + check_symbols) + + +def encode(number, checksum=False, split=0): + """Encode an integer into a symbol string. + + A ValueError is raised on invalid input. + + If checksum is set to True, a check symbol will be + calculated and appended to the string. + + If split is specified, the string will be divided into + clusters of that size separated by hyphens. + + The encoded string is returned. 
+ """ + number = int(number) + if number < 0: + raise ValueError("number '%d' is not a positive integer" % number) + + split = int(split) + if split < 0: + raise ValueError("split '%d' is not a positive integer" % split) + + check_symbol = '' + if checksum: + check_symbol = encode_symbols[number % check_base] + + if number == 0: + return '0' + check_symbol + + symbol_string = '' + while number > 0: + remainder = number % base + number //= base + symbol_string = encode_symbols[remainder] + symbol_string + symbol_string = symbol_string + check_symbol + + if split: + chunks = [] + for pos in range(0, len(symbol_string), split): + chunks.append(symbol_string[pos:pos + split]) + symbol_string = '-'.join(chunks) + + return symbol_string + + +def decode(symbol_string, checksum=False, strict=False): + """Decode an encoded symbol string. + + If checksum is set to True, the string is assumed to have a + trailing check symbol which will be validated. If the + checksum validation fails, a ValueError is raised. + + If strict is set to True, a ValueError is raised if the + normalization step requires changes to the string. + + The decoded string is returned. + """ + symbol_string = normalize(symbol_string, strict=strict) + if checksum: + symbol_string, check_symbol = symbol_string[:-1], symbol_string[-1] + + number = 0 + for symbol in symbol_string: + number = number * base + decode_symbols[symbol] + + if checksum: + check_value = decode_symbols[check_symbol] + modulo = number % check_base + if check_value != modulo: + raise ValueError("invalid check symbol '%s' for string '%s'" % + (check_symbol, symbol_string)) + + return number + + +def normalize(symbol_string, strict=False): + """Normalize an encoded symbol string. + + Normalization provides error correction and prepares the + string for decoding. These transformations are applied: + + 1. Hyphens are removed + 2. 'I', 'i', 'L' or 'l' are converted to '1' + 3. 'O' or 'o' are converted to '0' + 4. 
All characters are converted to uppercase + + A TypeError is raised if an invalid string type is provided. + + A ValueError is raised if the normalized string contains + invalid characters. + + If the strict parameter is set to True, a ValueError is raised + if any of the above transformations are applied. + + The normalized string is returned. + """ + if isinstance(symbol_string, string_types): + if not PY3: + try: + symbol_string = symbol_string.encode('ascii') + except UnicodeEncodeError: + raise ValueError("string should only contain ASCII characters") + else: + raise TypeError("string is of invalid type %s" % + symbol_string.__class__.__name__) + + norm_string = symbol_string.replace('-', '').translate(normalize_symbols).upper() + + if not valid_symbols.match(norm_string): + raise ValueError("string '%s' contains invalid characters" % norm_string) + + if strict and norm_string != symbol_string: + raise ValueError("string '%s' requires normalization" % symbol_string) + + return norm_string diff --git a/archivebox/util.py b/archivebox/util.py index ae827899..4e55e30d 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -16,7 +16,7 @@ from dateparser import parse as dateparser import requests from requests.exceptions import RequestException, ReadTimeout -from base32_crockford import encode as base32_encode # type: ignore +from .base32_crockford import encode as base32_encode # type: ignore from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding try: diff --git a/assets/css/style.scss b/assets/css/style.scss index a4bd9890..44657267 100644 --- a/assets/css/style.scss +++ b/assets/css/style.scss @@ -6,11 +6,13 @@ div.shell { width: 80%; max-width: 1300px; + min-width: 300px; } span.banner-fix { width: 80%; max-width: 1300px; + min-width: 300px; } header h1 { diff --git a/bin/build.sh b/bin/build.sh index 7b1c3232..693c2bbe 100755 --- a/bin/build.sh +++ b/bin/build.sh @@ -16,6 +16,7 @@ cd "$REPO_DIR" ./bin/build_docs.sh 
./bin/build_pip.sh +./bin/build_deb.sh ./bin/build_docker.sh echo "[√] Done. Install the built package by running:" diff --git a/bin/build_deb.sh b/bin/build_deb.sh new file mode 100755 index 00000000..6f5e418c --- /dev/null +++ b/bin/build_deb.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +### Bash Environment Setup +# http://redsymbol.net/articles/unofficial-bash-strict-mode/ +# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html +# set -o xtrace +set -o errexit +set -o errtrace +set -o nounset +set -o pipefail +IFS=$'\n' + +REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" + +source "$REPO_DIR/.venv/bin/activate" +cd "$REPO_DIR" + +VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" +DEBIAN_VERSION="1" +PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988" +# make sure you have this in ~/.dput.cf: +# [archivebox-ppa] +# fqdn: ppa.launchpad.net +# method: ftp +# incoming: ~archivebox/ubuntu/archivebox/ +# login: anonymous +# allow_unsigned_uploads: 0 + + +# cleanup build artifacts +rm -Rf build deb_dist dist archivebox-*.tar.gz + +# build source and binary packages +python3 setup.py --command-packages=stdeb.command \ + sdist_dsc --debian-version=$DEBIAN_VERSION \ + bdist_deb + +# sign the build with your PGP key ID +debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" + +# push the build to launchpad ppa +# dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" diff --git a/bin/release.sh b/bin/release.sh index f01eb1d3..d9c9b52d 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -42,6 +42,7 @@ echo "${contents}" > package.json # Build docs, python package, and docker image ./bin/build_docs.sh ./bin/build_pip.sh +./bin/build_deb.sh ./bin/build_docker.sh @@ -64,11 +65,14 @@ python3 -m twine upload --repository testpypi dist/* echo "[^] Uploading to pypi.org" python3 -m twine upload --repository pypi dist/* +echo "[^] Uploading to 
launchpad.net" +python3 -m dput archivebox "deb_dist/archivebox_${NEW_VERSION}-1_source.changes" + echo "[^] Uploading docker image" # docker login --username=nikisweeting # docker login docker.pkg.github.com --username=pirate docker push docker.io/nikisweeting/archivebox docker push docker.io/archivebox/archivebox -docker push docker.pkg.github.com/pirate/archivebox/archivebox +docker push docker.pkg.github.com/archivebox/archivebox/archivebox echo "[√] Done. Published version v$NEW_VERSION" diff --git a/icon.png b/icon.png new file mode 100644 index 0000000000000000000000000000000000000000..04462b21cc1c72de4fba6a6534103e0805c2879c GIT binary patch literal 10717 zcmZ{J1z23mvhHHA!QEYgy95cAAi*7i1$TFs!QCOa1b0htcMSvx5`w!!upk3@Wbb{? zIq$st)_gr(Q&oR;S9SO5>Yhkt#kXk4#K-^upvg!}s6N*{e+n4k`CE0_77+kYjjhDR zm1V@mDV3ca%&lzA06;o2^&O&?+B*^v57VSN5LN=#4Ys5?Ktxpq264fwAjwb~4WwhL zFH4sPxkWQ5W=BX?;w1=T)=w6+R2T9qmwz+HBVhVH3RFK1Uj|IMoO)g6IP7KnOd&$_ z=?&<{-(JHoB;b$XP>XBh%*<#Py@epl2Z|U1N?l@9TLZ(xfOPQX+QGFOq->Yjo=Nw& z-tSsd23$rsfRYH)0tqh&6rl_B$iZ=WQ2_%SBh5Mb3~EN415(a}lmnkM_}>PnRXTn^ zaCkRP`@)gR9t4r!y4(=}_M2XdR7xknp6rVRExu<;k%sxo9mFq9xr;)j800DOx>p@9 z7o#30CGB+wiLv`U!AqAXb5WZSk-R+a5MHnHNK!hLk&8R--%~SC2d0sC==YA0V7d=) zUzA;h^LAwL@#=(HVg8c-WW1%`7Vk@UGOS|aLPqWTNkMLCjFFu(5Pj-}htopYl>s&5t- zdq!bnfEvpt+ERc9i=u33Ix--|np8CKrV6D#k(!-ThbH(tZggvXFqgAg5R?C_i0{-^ zre#AS9}-#PrN54iBjX0&8Jig+v1>+GA`i-_WZoP4!1=mC=`i^sin6y~zoH%tN$=GQ zDX4Q)IMr&G&aC=E&`!^cs_^yjX6$0C@q%AFg`#}(hnQ^%gY37%O%vpe-EWJ-CZ?f3 zWit7HPUt-?dMXHcVnx8{VIpXWA-%;MX5Ce443}07MdkNhv}0I+OWQ72cA_ zd6D^qP?$t8L)g`uxk_qfxI2pQmdsL;5zSisa^8y`lL!7J-oTq@D_tr;G%FpPmTk^}OEgSk@F%U7r zayzNgDr0q1UmdM&3#9ZC@Ggw0EHg5g5xxl7ylO0|iLUvTfTtnZT@_C^U-hxVIb5^r zX6mtE@(usn0MK6K;KdYnb=WAHp!Ytg9+sU1S5vGGy(O1d)~q4P!A)@ZQ%vopgB z_v#v|8~S^zCr_{25s)Rhku?xwUTM5kV;jXHXy89*rZoOp6qHD#MSe(eS#c~8eQ741t)`?rtekl=nB6#=Oe2u3BBYmVn9l))BbBy(F0M>}+wKP9=FfFB( zG+k1#G}a3)672|eO6J7t5zQj}I~akOY_UmdvJuv8k(pR%XvbU5lrZ9t_+_wy$YzD4 
zJj4ob=sybV^L!>`l}szputaMkuoF|t-RsKC$PWDE$k!5Hcy6i(QR?5@lsmt^weNs&=&xP>^-~{; z6R5~R-+~QA66%9{gKi}FF-BQtQHEcFA6`j~V1qYN+7z2Cvg@3u1j{AzQ-oXuN#sKW z>j3(uxG_bf1lNn!B#?{~tw?!-x|Gh_B{@OGHu*#9&KE}s)g$EFU}r&XSvIP7Nxu@} z6WNpV5*`@uaebmhU&K)tO%$;d+LrRDv8lE(JgUQ1(C{W?N_;71%hyynDlaYbC}VeX zb+vVcx(*#lyY3#T9~>_T6uKDfk$p+neX}*gWhuL)y2Pe~a)N*IB1h39Km6<2#_qjF79=Z07_Q9uSD|_C!!al-rtEQdCowA+5-UQd)SB&@Q_j-O^ zPkN8mkNx+2R}ZL!2!&{ysK}@v2#-1WZO*?XnC=@A+B5L5@GhHs&YV-hnk;g)pu(6? znY`MDO*_lx;V~1=o#kJ_)}#|k#5M<@FM+g!fXO|;%MSm zKTDWXSY#M0xu)pxMI9eqzrq`N3A!D+2l<+e4nDJtu&mvT8V)<2PMff-*7R7uEc>R> z;}q>F$HEPme)#_Ja0(9l!VSe$w=p!rvE; zKQTSkee|x2e55=lFKsV;H+BwRTr6$7&1P)~Pd@0LX`aQ4bcs0UdFR<1yjqiAL+j#O zlsfjj!M-694KgTO6Y8e%KZE)~Q64dFe-o|aClJ)puOAQ&a~5#^7#2ePjI@ntk9Le| z3~z>DhjfVAguaWo04EBrN5cv`M)m4*+TC`(an22I3m+552<_UqciD5)?W?OBsk^AV zaqK_eKEK}hMQuy%Onpf0P18wZA=xB>RZvi5qkpEgR_`*@Bsp7vQ@HR} zG{Pw|ahNAwd^nb@*?Kug>p{!P)`T=-Yp^=1o#c#Al=Z3f=fKaOTb0A2ii9~FuQpt% zX`Ey?a|ae=KFW+I+0(yz!JMqCcBRUtYOS_f6jod#f0E5%-ItBDH#;#yYo%~dbd_?I zbKr;5PhXRy{VV#iBnS zR;3T~u1@n@o{(CP^%x&MN;4|g6w$nP!aoxy{S#BEE-6b5?l0+2#?DGSD$*BFC=p zJO<6k+{Iq_t0=3uP5(GxoW+>rNT3iG;#IAy{sMI5uAM-cV4L-}c%akNY_u}893@MRRdz2;1WI?2)yB2L<@G(* z0k&(U18hL6{jF=!26SD{l7K^~ImHo4dE)uQG&d+`sQw<8^k>kbV zwY6aVdY|InvCUMBcd@#2b(xKAzSot!YV+CER~5@0AKElqn?5m>R9NM#wCx;;R1ws6 zYm(`7It1T^4U(+d;@Wp~Fq*18nn!=Z%UW|tv~6wWb+!?Y6<64Kx!F`;7vEfJE7}>{ zS?FH+eKIPC!b8Tx?No7bMe|5kw|&@mszLCxqV_^3pUGMAS$~&9g-7LDTYPJ+vFKv$ z^6v+X3{gE%OuzhV$_K;8_#07s{sDgVRS3_7?Yn_Vg}#Rq!x#h7)k+P3JSergjtF zOF@$RO8EEBsRU8>E5-d2Z>j*!^M$0e5_XTy3-7rlf;Epbx89>Cz8wdJwNw9Le~){= z!w34uSxY?rs4;zl@we~WQ!-`y7kh0R$$B!>0n@Yon@X_3bLYiS= z*;xTfUf6RL5st6jPKN6=$13%@z-Rf{t4t>9zE#8MiUs&)O@&O1E@}med&xM1DKWf2 zf?Mw=0$jdGdc)9?=;uSn2Qw`ha|H!}>6r!t@G#f_>@x-P`~hHy0r&!8Ay)O+AaWq|Kd-9kx`{R2LO(hs{n+`Im~TjWD&Af-bW zFB=aV2ek+?B_*Yhv#GhDs)W=(@aJE`)RwNUj)Ls$9v&WS9^7mW&KB&P0s;c;99-;N zT&&L;tS(;mu121$_AWGk8~GnQ5@s$Y&Q^}DRu1-*f9x6=JGi+DQ&azG=wHv@{WSBm z`gcqAF8{do>>&Fe2|Fhn2mAj9bG0)6|G@r8{)YYK*Wb+v{V^t};bP`2?qFwUX74KU 
z?}!Wi)zbeZ{C7Y91}a;5n%QbeSUn?Mo_!MG;pP1Y_1}{J6{-CnNS>GfiTn@Ae<1(3 zAgJVQ_1sURKQR>H6k`8>c>k@hYUbi#>-I-o!`{kOgzF#3f3g2T39}Zee3z#R z%E2VX)8CS>OjDDYNABkl--r7d0~;;Ap|j;YNVA5kh15>ydM zUccpeBk{ehgE3RicD+GOk?ulee$Ux$Cz6hzJx$)n47%aVwN)&K!(BcqX%%|4>D*af z#ptnGy_hztaZvSn^Y$oNIC_a%P@H78^^nnEI?%OCMxda&wX5VH&HE*rY`6W2M)XVs z`s~xkkB7-!U9|jd7D~w%_NTqIpGV}BgUYP*8+8=wk~?U}-Gj{9Ssr`$+IXAuZb~s` zH?UQ^K3|$FP~-{z4vn0F2JczWUlK$K^+dw`8u(o#Q!^>sTU;3iM$aJqnHYFc0=cA0 ztoY%CcijtlIM^*jyxM^x4l=-MPWycSCX@a$gt)Y7S_j8WlV5A2uS8=uZ&+y7 zF#AUcFr=cUeQ^ji_Zlx|O>+?3ezW;79*(82Kp{70HDMF;aA!LqPHNxE6?^w2n0mxb zGD|_e%-QJjxSmG=9dr?WU1BhV6&^ykH+6gRG3)8LSkA%Et44L~M4M0k2MX-ByaTG- zjbhJX=#H*l*MRdtRcGjqciE1QpI4wxF3uZ-p=z_hA!Oz_#D$9}nV(Fb->Uf(cSz~N zF$o1IrtR*ZWXN634E(TNLH_LVIA`E>(YZkW8n3%5i~JEOPb=UAXG)>#Y25*RRjk=z ziL;q;AY_T4(`{2Ya&j!sjhwa>j%i*a@^G-FI@cy%Pe0m|>!5iq>=_%NW`jnO8# zi-Q=M1hvqFL40}oG!)O4b6Y6db$_!Njv!T0Zpx**8WYS<7OdC>~=9`2n?JnGVJYIJzWJ5^_ z1x6Y$uw*X{4!HZ9h0X`Qzw5g8>3?AIuwQ+@(dd=QoLLUq)iI47uTX=ldfSu1oI_?T zm_h6BsditBLXWd1;z4`8eqNpTxZ5uJ;A2jmvmHO9)_w8at@gd(!{96L+L9j#9&CDap#9q^vSZJrG9VSFQ%vCj~U1W=xiHe~(9f|H<*l@R{+>CPlH zP(8P4_rZ&uQVF@APVGSr4kQ8CQ2TfEaplUtP`&C~%-VJ8R)IwCgbp+*hv+Y=^dZgSDC#-8vy!+DEGs2JdK0@i0X~>h&13J5OCKo+z17T2$sh82sBUuusIy&WTuV)u91P-8N~Nt_SH<6VwbQBwxhD*3D98%O?S67F?I9Q!kH z$bvh64Q}`yzVT=)1B5NmSO7ZqSp^WWQNw+Wr2=Oh3QWTFZ7^(T0ap!ulK4?5PO%#A zrb!nyEI2`2^G1o5xR7|XO`L|Hlcj8rk<#VO3GuDNPH%sO)x&*ljcN^z3i`XKINnJi z7%m${DpV&SJ$SI@eWfppOIrfXl5&11SDuv0&;fG{Q=H9EeKeSag1j$=!xwHMV3@B( zjuZ=Bjwob3oA&a6)-$i9fFK!?!jsO0V-n1mJ zQpUx3tNi4oHIuE*hmpG62F?_%Is3dj@PdNIFl9F7jqs>PpW-sjmFNvFV%B6mTdGpH@m4Toa41Hx!_Gp4RorY^T{)db!d+w`I*7&IZ8>_3 zHca_(a+PY8^CcwC(Q`}N5BM#2ObQwn2TC5NrAIyp3;v38&Xz_D*Oq zC6}P6ZC$M-vm^8dHxW6(IXaCY>_zAh=j|QI%-iiI3pt@kbdJIGmK?)K)W2Fq+y!%R zsc|AYO@k?Vc;DxGSP#Ox#kknhD zu*veiYNS{;=a`!xUr?sFX%Kw72a8iPBlrQUr4?=p%BZaq8J1opCK3Z`87@Mcfgv8$ z@oQ4&IVqhW-Q*zD3SV+8!aP5MU6VE49+bD}9q+z;gX#cuJPF2T^ThlRw4R)DX4Meq zlmcvw+%JVxG;jG`w%%|YnqQ7ry3T4SKN>y8(F+~o2JT>{?+tHjxNyoT&uA$HUS179 
z((#XYeeaH}rhjb`1<jerrF70&}_Hvk-@GxjG32;fnZO$IS%niQ&sM*<%2U%I!@^p zlP4wN1(OWtwQANYp&oEE%r#hf6XBMCDGn)1c7j)jbD<)Z7COx3tdpgf^rkV3SW=J= zsx@v_07dTRkGpS-pmC~osAi1Z9ZR3H+k6s5l=>!oB#*wgaw(WcGXLd@#U1w15^S=L z_pCLUG!HW7=Uy0+3V@8Tnvf?^uH?%hJuR%Eem_iAZ#gl)5lVs!E2Mw%ur2xBi7_Q& zqS^F=lt(D(2s^yDs;@N0hjFTtkdd^~OCL zvnkx1Nc3Qb(8-IV6VJk`06$T?z(E(1D`>e}ZPFW(N z`jVsHoqlF|Hb;OR?gI6Dsr*uUrNLHhrUplXAHEo-O+s>f4$F$K!kbis){tr@*+V>- zBoaK0N`^^9NRqVMJRZKd{%RsY>b_&ES5?1d<%C|%?FU+1@=qcy@+e*S46J3{!soPj ze(>K@%PWarbt{F9lBbV-k|j)RdlLLI1HFsgNKJ7lwS8q4g22e^Z{l6qQp9&3dm`;` z(k6O}VsetVoAIzeUFf?1m5N-f0G1>m(w>BL!^7Ascdr47w0o#8Rut7cRmoraoJhqJ zJ9(G>=VKNE!&KJ#=0FT1GcDY~I5xwGi2M&_5R2bS`b%GBO!`(ONwd2xdce?J3ucJG z+>(RW5O=-i(PNrI=qjN*J&aXi{cbPM1J_WT9jzF}RRM!t_j?9=%d#0dc@$w^A6DjINrxKW>sNu6qn^YJg!uJ^@#dB1^sx7H@SAs_t z4O@>|;8H#BKCO45oW>CC@%L*NHYy@4zKL-DCJRT5tO3&_ql%!!(c=>3Vhf%lms04j&zOjq3koJ-K z4aj;mc?=XrjdSu$1BVg-Vu;Lok)$Bfc6kmgl?` z!^^)+w~oeoBZf!$Cb+oBogC%?Pb?44&Hr1YB*rssIQ=f1{DfE;ZaxN52?&0vA(FA9 zOae#1E2BkhpEFL|Jn#EGyX;5vV}~;Ur+@mlIy*gC($F zG$0IdnKR3raOyCE2*odPCE;Q~=S)8~g~f19-E$;Xw2=!NNn0GL5I?@_mFlK%Phr~lZ!BxCZG~H-c#81L~;hneCc{!oXC0S&D_&z*uJi?V}pcugS^h zPUqw2e`>L|!Ft`J0*Ug+!cY9E%JnJ+a%~app1y|18;%oSc+hqJat$fEvrxuv>{q@8E>H;eQ~kyvP%%$H;n$jnsPO&VTdRed z_p3Y9mNM_rU}SFLxy$dN0y7m+~nlsNv`s&l9adDX>-@r zM};QCag81{O0Z{6QEfgD`?&NKM-xU)K|$Ggp$qW!uU|N)JxMkYQzc-B-+3Fn;xZ+i zGQP{Okjy>ym32}qSg6+-|Lsb_z?1)bc!o-NE#s>D`+G}M(~t|Vu8oG!A5C^yqrEdU zSzj_8`9d(VjD(gCeFP6+KeHNvdQyKg9bBD#wBqKIE+5^6t1QoLyIKOJ>+$FYKSbN&`- zu>XPuacW7dbf%8$c`3Wj=z#c^F%o1f1VTPHRR_kNo{P8ZG&j&p0IdI)T$!(S=gv%!t2pzInJg;WkAwX!>f2 zLRJ9W#OT9v`Fq)nAk#7hfdbA}J9pU=;@;S*SLsB&mnvaafsA01(tT`!6~m>T@P9Rz z2KMLk0{eh;8Qm`=(;M*(i&5Pge7`MB#Cj09VrRK8arHU0B9y^vnbWol_^^rVm^Rqu zz8?fhAjrSqF$ULN{jhzGc9ps+P-?I(A-_%|Rl@)oG}SH7!rv+`s)lq~TItn$Jh5YkaZ20K9LMSYLos-| zMf=7tu20;bu5P#DXFrOE&&QSRupYPmr1So;=0wRw~hWy_#0~Yx-TGF5Q5Unkl^58*I_ab zIQGz`V2$Th3CChyx%GX5lB#;d5NUH$;vjgJ4_VX{7amL;tHA?;tH&59#JjMGn4bfJN zSObK1dB3W_6>xYxGw~tCYbYMLv3PvD2~==3Ep3|nf*_Ujo~jS*>C`LP?(gH?DTt-8 
z^i*V^HW}kmqT&ZKG?PD%vknU75r;-#S%p~8T2tS?_NQW7Z{aauF@CsZRR!o4iEV&O#!u3((rt*c zFr^9M9sR#p#!9n-q;Y$cN7w_kit8xp%pOS_z-kJPSUuZjsbaiD8EtDI&er)dCa_FB z(w7(6pYDE!edy@HDy695CE{jI&Ww$H+?*QcAy9Z=0n|cCU>%Gf(TZ~c7D{VvPHntxGr z-lj*@>>!iErO#aZ!^`QLWADD1Idgja9&%!$S_vkZolxvAU0!iF_Fb9o_DgRL#wBG_ zzHj*Ri@-IPjK+!^lf9`CAMaCX!x273~aqSydtEo;qHtnr8BC0t1A7 zw185s17h+nn6RA7^}xn(s0Q5p-2%63!}?3H?CMui!S7An zlvVQLrF_n}BU^_vG`tjjY+%S!AcT$d3FbPOWGLBQK_ZO{G3vd{b&**-{eb^@m=rgBu*?9|EK)CZK4Y zdeDu)slBdysA$0N(zLa^?KAt%;x-4v2k5;#MxIR8w#%%W3#Kq<1pVOY#FT}z26pmb z>saE?*;e|=&ZL$?j~|~5+Af>@w{j^S=Q>6@nm1BI4xfS#3<9oy1Uzg$l!|aCFnxW# z`Q5MZ6}>KYsQ~>X^L3Y4b+", "license": "MIT", diff --git a/setup.py b/setup.py index af643c9e..6b40b803 100755 --- a/setup.py +++ b/setup.py @@ -51,7 +51,6 @@ setuptools.setup( "requests==2.24.0", "atomicwrites==1.4.0", "mypy-extensions==0.4.3", - "base32-crockford==0.3.0", "django==3.0.8", "django-extensions==3.0.3", @@ -80,6 +79,7 @@ setuptools.setup( "recommonmark", "pytest", "bottle", + "stdeb", ], # 'redis': ['redis', 'django-redis'], # 'pywb': ['pywb', 'redis'], diff --git a/stdeb.cfg b/stdeb.cfg new file mode 100644 index 00000000..6eaa8f2d --- /dev/null +++ b/stdeb.cfg @@ -0,0 +1,6 @@ +[DEFAULT] +Package: archivebox +Suite: focal +Build-Depends: dh-python +Depends: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib +XS-Python-Version: >= 3.7 From 02551c0152f3957debd079a6764d62e79c829d8c Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 17:28:45 -0500 Subject: [PATCH 57/75] minor packaging fixes and bump to 0.4.21 --- archivebox.egg-info/PKG-INFO | 2 +- archivebox.egg-info/requires.txt | 30 +++++++++++++++--------------- archivebox/base32_crockford.py | 4 ++-- bin/build.sh | 2 ++ bin/release.sh | 5 ++--- docs | 2 +- 6 files changed, 23 insertions(+), 22 deletions(-) diff 
--git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index 3940b731..4c2de1ad 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: archivebox -Version: 0.4.21 +Version: 0.4.22 Summary: The self-hosted internet archive. Home-page: https://github.com/ArchiveBox/ArchiveBox Author: Nick Sweeting diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index eb8d2f35..083c3cdd 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -1,25 +1,25 @@ -requests==2.24.0 atomicwrites==1.4.0 -mypy-extensions==0.4.3 -base32-crockford==0.3.0 -django==3.0.8 -django-extensions==3.0.3 -dateparser -ipython -youtube-dl -python-crontab==2.5.1 croniter==0.3.34 +dateparser +django-extensions==3.0.3 +django==3.0.8 +ipython +mypy-extensions==0.4.3 +python-crontab==2.5.1 +requests==2.24.0 w3lib==1.22.0 +youtube-dl [dev] -setuptools -twine +bottle +django-stubs flake8 ipdb mypy -django-stubs +pytest +recommonmark +setuptools sphinx sphinx-rtd-theme -recommonmark -pytest -bottle +stdeb +twine diff --git a/archivebox/base32_crockford.py b/archivebox/base32_crockford.py index bafb69b4..07dac08c 100644 --- a/archivebox/base32_crockford.py +++ b/archivebox/base32_crockford.py @@ -37,9 +37,9 @@ __all__ = ["encode", "decode", "normalize"] if PY3: - string_types = str, + string_types = (str,) else: - string_types = basestring, + string_types = (basestring,) # noqa # The encoded symbol space does not include I, L, O or U symbols = '0123456789ABCDEFGHJKMNPQRSTVWXYZ' diff --git a/bin/build.sh b/bin/build.sh index 693c2bbe..988fce21 100755 --- a/bin/build.sh +++ b/bin/build.sh @@ -14,6 +14,8 @@ REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. 
&& p cd "$REPO_DIR" +# pipenv install --dev + ./bin/build_docs.sh ./bin/build_pip.sh ./bin/build_deb.sh diff --git a/bin/release.sh b/bin/release.sh index d9c9b52d..96dd8a51 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -12,9 +12,8 @@ IFS=$'\n' REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" -source "$REPO_DIR/.venv/bin/activate" cd "$REPO_DIR" - +source "./.venv/bin/activate" # Make sure git is clean @@ -66,7 +65,7 @@ echo "[^] Uploading to pypi.org" python3 -m twine upload --repository pypi dist/* echo "[^] Uploading to launchpad.net" -python3 -m dput archivebox "deb_dist/archivebox_${NEW_VERSION}-1_source.changes" +dput archivebox "deb_dist/archivebox_${NEW_VERSION}-1_source.changes" echo "[^] Uploading docker image" # docker login --username=nikisweeting diff --git a/docs b/docs index 798e00a3..d5071d92 160000 --- a/docs +++ b/docs @@ -1 +1 @@ -Subproject commit 798e00a3a8f6a1633ca64cb0de530c5785dc2ccd +Subproject commit d5071d92367a91bb585abb5da7c65ebc61d0d7b0 From ff6a28a27f31830b7eb676ab9058d91b22eeb988 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 17:41:22 -0500 Subject: [PATCH 58/75] add python build tools to debian build-depends --- archivebox.egg-info/PKG-INFO | 2 +- package.json | 2 +- stdeb.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index 4c2de1ad..4eea75b2 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: archivebox -Version: 0.4.22 +Version: 0.4.23 Summary: The self-hosted internet archive. 
Home-page: https://github.com/ArchiveBox/ArchiveBox Author: Nick Sweeting diff --git a/package.json b/package.json index 70f58f61..243f68e6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.22", + "version": "0.4.23", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", diff --git a/stdeb.cfg b/stdeb.cfg index 6eaa8f2d..80d24a29 100644 --- a/stdeb.cfg +++ b/stdeb.cfg @@ -1,6 +1,6 @@ [DEFAULT] Package: archivebox Suite: focal -Build-Depends: dh-python +Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb Depends: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib XS-Python-Version: >= 3.7 From be7a7f8548d76babb5ed0b4f8bae0109d94d3f76 Mon Sep 17 00:00:00 2001 From: mAAdhaTTah Date: Mon, 23 Nov 2020 18:34:07 -0500 Subject: [PATCH 59/75] Fix string checks in schedule `s` comes through as a `PosixPath`, so both the `' ' in s` & return value, later used by `join`, complain. 
--- archivebox/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/main.py b/archivebox/main.py index 26f05427..3d577a42 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -938,7 +938,7 @@ def schedule(add: bool=False, if every or add: every = every or 'day' - quoted = lambda s: f'"{s}"' if s and ' ' in s else s + quoted = lambda s: f'"{s}"' if s and ' ' in str(s) else str(s) cmd = [ 'cd', quoted(out_dir), From e85b8836966eadbf6a87440e0e97d5ed9196acd5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 18:19:27 -0500 Subject: [PATCH 60/75] force binary name to be just archivebox --- stdeb.cfg | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/stdeb.cfg b/stdeb.cfg index 80d24a29..d1184f0e 100644 --- a/stdeb.cfg +++ b/stdeb.cfg @@ -1,6 +1,9 @@ [DEFAULT] +Source: archivebox Package: archivebox +Package3: archivebox Suite: focal +Suite3: focal Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb -Depends: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib +Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib XS-Python-Version: >= 3.7 From 5e7c2d0ab8db13bcbd437f34d3111579fcb51b40 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 20:24:37 -0500 Subject: [PATCH 61/75] show archivebox and node versions in version cmd output --- archivebox.egg-info/PKG-INFO | 2 +- archivebox.egg-info/requires.txt | 30 +++++++++++++++--------------- archivebox/config.py | 23 +++++++++++++++++++++++ package.json | 2 +- 4 files changed, 40 insertions(+), 17 deletions(-) diff --git 
a/archivebox.egg-info/PKG-INFO b/archivebox.egg-info/PKG-INFO index 4eea75b2..1d528824 100644 --- a/archivebox.egg-info/PKG-INFO +++ b/archivebox.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: archivebox -Version: 0.4.23 +Version: 0.4.24 Summary: The self-hosted internet archive. Home-page: https://github.com/ArchiveBox/ArchiveBox Author: Nick Sweeting diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index 083c3cdd..457f64e5 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -1,25 +1,25 @@ -atomicwrites==1.4.0 -croniter==0.3.34 -dateparser -django-extensions==3.0.3 -django==3.0.8 -ipython -mypy-extensions==0.4.3 -python-crontab==2.5.1 requests==2.24.0 -w3lib==1.22.0 +atomicwrites==1.4.0 +mypy-extensions==0.4.3 +django==3.0.8 +django-extensions==3.0.3 +dateparser +ipython youtube-dl +python-crontab==2.5.1 +croniter==0.3.34 +w3lib==1.22.0 [dev] -bottle -django-stubs +setuptools +twine flake8 ipdb mypy -pytest -recommonmark -setuptools +django-stubs sphinx sphinx-rtd-theme +recommonmark +pytest +bottle stdeb -twine diff --git a/archivebox/config.py b/archivebox/config.py index d321dd72..dc50679d 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -157,6 +157,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'READABILITY_BINARY': {'type': str, 'default': 'readability-extractor'}, 'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, + 'NODE_BINARY': {'type': str, 'default': 'node'}, 'CHROME_BINARY': {'type': str, 'default': None}, }, } @@ -296,6 +297,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, + 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if 
c['USE_SINGLEFILE'] else None}, @@ -318,6 +320,8 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()}, 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None}, 'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])}, + 'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None}, + 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']}, 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, 'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']}, @@ -665,6 +669,11 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict: 'enabled': True, 'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(), }, + # 'NODE_MODULES_DIR': { + # 'path': , + # 'enabled': , + # 'is_valid': (...).exists(), + # }, } def get_external_locations(config: ConfigDict) -> ConfigValue: @@ -718,6 +727,13 @@ def get_data_locations(config: ConfigDict) -> ConfigValue: def get_dependency_info(config: ConfigDict) -> ConfigValue: return { + 'ARCHIVEBOX_BINARY': { + 'path': bin_path(config['ARCHIVEBOX_BINARY']), + 'version': config['VERSION'], + 'hash': bin_hash(config['ARCHIVEBOX_BINARY']), + 'enabled': True, + 'is_valid': True, + }, 'PYTHON_BINARY': { 'path': bin_path(config['PYTHON_BINARY']), 'version': config['PYTHON_VERSION'], @@ -746,6 +762,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_WGET'], 'is_valid': bool(config['WGET_VERSION']), }, + 'NODE_BINARY': { + 'path': bin_path(config['NODE_BINARY']), + 'version': config['NODE_VERSION'], + 'hash': bin_hash(config['NODE_BINARY']), + 'enabled': config['USE_NODE'], + 'is_valid': bool(config['NODE_VERSION']), + }, 'SINGLEFILE_BINARY': { 'path': bin_path(config['SINGLEFILE_BINARY']), 'version':
config['SINGLEFILE_VERSION'], diff --git a/package.json b/package.json index 243f68e6..c7a61c1e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "archivebox", - "version": "0.4.23", + "version": "0.4.24", "description": "ArchiveBox: The self-hosted internet archive", "author": "Nick Sweeting ", "license": "MIT", From e0b08114628e1576e51d410dce4e6f16cf61ffcc Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 20:33:35 -0500 Subject: [PATCH 62/75] add instructions for apt, brew, docker, pip --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 790d8a06..ed13e23c 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,27 @@
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended) or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). +ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [apt](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [brew](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API.
+**Get it via your method of choice:** +```bash +sudo add-apt-repository ppa:archivebox/archivebox +apt update +apt install archivebox +``` +```bash +brew install archivebox/archivebox/archivebox +``` +```bash +docker pull archivebox/archivebox +``` +```bash +pip3 install archivebox # you must install some system dependencies manually when using pip +``` #### Quickstart From a1cd8a5217ec4205e86a1bc7b8e51b4bd5b0c938 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 23:12:27 -0500 Subject: [PATCH 63/75] fix missing amd64 docker builds --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 2a4ebf26..a609e55f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -49,6 +49,6 @@ jobs: archivebox/archivebox:${{ github.sha }} cache-from: type=local,src=/tmp/.buildx-cache cache-to: type=local,dest=/tmp/.buildx-cache - platforms: linux/arm64,linux/arm/v7 + platforms: linux/amd64,linux/arm64,linux/arm/v7 - name: Image digest run: echo ${{ steps.docker_build.outputs.digest }} From af09730a86db0bad90030f6f3c5cb1f5b4a439d5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 23:29:19 -0500 Subject: [PATCH 64/75] clearer quickstart with new install methods --- README.md | 96 ++++++++++++++++++++++++------------------------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index ed13e23c..36fae694 100644 --- a/README.md +++ b/README.md @@ -32,40 +32,48 @@ Once installed, URLs can be added via the command line `archivebox add` or the b The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive//`, with an easy-to-read `index.html` and `index.json` within. 
For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and managable offline through the filesystem, the built-in webserver, or the Python API. -**Get it via your method of choice:** -```bash -sudo add-apt-repository ppa:archivebox/archivebox -apt update -apt install archivebox -``` -```bash -brew install archivebox/archivebox/archivebox -``` -```bash -docker pull archivebox/archivebox -``` -```bash -pip3 install archivebox # you must install some system dependencies manually when using pip -``` - #### Quickstart +**First, get ArchiveBox using your system package manager, Docker, or pip:** +```bash +# To use with Docker (recommended) +docker pull archivebox/archivebox + +# for Ubuntu/Debian +sudo add-apt-repository -u ppa:archivebox/archivebox +apt install archivebox + +# for macOS +brew install archivebox/archivebox/archivebox + +# for Python version only, without wget/git/chrome/etc. included +pip3 install archivebox +``` + +**Then create a collection and add some URLs to archive:** ```bash # 1. Create a folder somewhere to hold your ArchiveBox data mkdir ~/archivebox && cd ~/archivebox -docker run -v $PWD:/data -it archivebox/archivebox init +archivebox init +archivebox version # 2. Archive some URLs to get started -docker run -v $PWD:/data -t archivebox/archivebox add https://github.com/ArchiveBox/ArchiveBox -docker run -v $PWD:/data -t archivebox/archivebox add --depth=1 https://example.com +archivebox add https://github.com/ArchiveBox/ArchiveBox +archivebox add --depth=1 https://example.com # 3.
Then view the snapshots of the URLs you added via the self-hosted web UI -docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser # create an admin acct -docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox # start the web server -open http://127.0.0.1:8000/ # open the interactive admin panel -ls archive/*/index.html # or just browse snapshots on disk +archivebox manage createsuperuser # create an admin acct +archivebox server 0.0.0.0:8000 # start the web server +open http://127.0.0.1:8000/ # open the interactive admin panel +ls ~/archivebox/archive/*/index.html # or just browse snapshots on disk ``` +If you're using docker, run the `archivebox [subcommand] [...args]` commands above like this: +`docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]` +or with docker compose: +`docker-compose run archivebox [subcommand] [...args]` + +

@@ -79,16 +87,9 @@ For more information, see the [args] + +# on Debian/Ubuntu +sudo add-apt-repository -u ppa:archivebox/archivebox +apt install archivebox + +# on macOS +brew install archivebox/archivebox/archivebox ``` -First install the system, pip, and npm dependencies: +Initialize your archive in a directory somewhere and add some links: ```bash -# Install main dependendencies using apt on Ubuntu/Debian, brew on mac, or pkg on BSD -apt install python3 python3-pip python3-dev git curl wget chromium-browser youtube-dl - -# Install Node runtime (used for headless browser scripts like Readability, Singlefile, Mercury, etc.) -curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \ - && echo 'deb https://deb.nodesource.com/node_14.x $(lsb_release -cs) main' >> /etc/apt/sources.list \ - && apt-get update \ - && apt-get install --no-install-recommends nodejs - -# Make a directory to hold your collection -mkdir archivebox && cd archivebox # (can be anywhere, doesn't have to be called archivebox) - -# Install the archivebox python package in ./.venv -python3 -m venv .venv && source .venv/bin/activate -pip install --upgrade archivebox - -# Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer) +mkdir ~/archivebox && cd ~/archivebox npm install --prefix . 
'git+https://github.com/ArchiveBox/ArchiveBox.git' -``` - -Initialize your archive and add some links: -```bash archivebox init archivebox add 'https://example.com' # add URLs as args pipe them in via stdin archivebox add --depth=1 https://example.com/table-of-contents.html From b05a7b781bb0949f857bfe6f83a56cbe6fd7e10e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 23:37:16 -0500 Subject: [PATCH 65/75] Update README.md --- README.md | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 36fae694..a17f4bc2 100644 --- a/README.md +++ b/README.md @@ -36,42 +36,56 @@ The main index is a self-contained `data/index.sqlite3` file, and each snapshot **First, get ArchiveBox using your system package manager, Docker, or pip:** ```bash -# To use with Docker (recommended) +# You can run it with Docker or Docker Compose (recommended) docker pull archivebox/archivebox +# https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml -# for Ubuntu/Debian +# or Ubuntu/Debian sudo add-apt-repository -u ppa:archivebox/archivebox apt install archivebox -# for macOS +# or macOS brew install archivebox/archivebox/archivebox -# for Python version only, without wget/git/chrome/etc. included +# or for the Python version only, without wget/git/chrome/etc. included pip3 install archivebox ``` -**Then create a collection and add some URLs to archive:** +Check that everything installed correctly with: ```bash -# 1. Create a folder somewhere to hold your ArchiveBox data -mkdir ~/archivebox && cd ~/archivebox -archivebox init -archivebox version +archivebox --version +``` -# 2. 
Archive some URLs to get started +**To start using archivebox, you have to create a data folder and `cd` into it:** + +```bash +mkdir ~/archivebox && cd ~/archivebox # pick somewhere to put your data folder +``` + +If you're using an apt/brew/pip install you can run archivebox commands like this: +`archivebox [subcommand] [...args]` +If you're using Docker it's equivalent to run the commands like this: +`docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]` +And the same in Docker Compose: +`docker-compose run archivebox [subcommand] [...args]` + +**Then Add some URLs to your archive collection:** +```bash archivebox add https://github.com/ArchiveBox/ArchiveBox archivebox/archivebox add --depth=1 https://example.com +``` -# 3. Then view the snapshots of the URLs you added via the self-hosted web UI +View the snapshots of the URLs you added via the self-hosted web UI: +```bash archivebox manage createsuperuser # create an admin acct archivebox server 0.0.0.0:8000 # start the web server open http://127.0.0.1:8000/ # open the interactive admin panel -ls ~/archivebox/archive/*/index.html # or just browse snapshots on disk ``` -If you're using docker, run the `archivebox [subcommand] [...args]` commands above like this: -`docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]` -or with docker compose: -`docker-compose run archivebox [subcommand] [...args]` +Or just browse the snapshots statically on disk: +```bash +ls ~/archivebox/archive/*/index.html +```
From ade783c5998cc65c42e341ce545fb7a7cb7a7188 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 23:39:08 -0500 Subject: [PATCH 66/75] Update README.md --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a17f4bc2..9cf2e0c3 100644 --- a/README.md +++ b/README.md @@ -59,16 +59,21 @@ archivebox --version **To start using archivebox, you have to create a data folder and `cd` into it:** ```bash -mkdir ~/archivebox && cd ~/archivebox # pick somewhere to put your data folder +mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere +archivebox init ``` + + If you're using an apt/brew/pip install you can run archivebox commands like this: `archivebox [subcommand] [...args]` -If you're using Docker it's equivalent to run the commands like this: +If you're using Docker it's equivalent to run the commands like this: `docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]` -And the same in Docker Compose: +And the same in Docker Compose: `docker-compose run archivebox [subcommand] [...args]` + + **Then Add some URLs to your archive collection:** ```bash archivebox add https://github.com/ArchiveBox/ArchiveBox From 6722ad249e6058451f5f0d4511c3e75261e977b5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 23:39:34 -0500 Subject: [PATCH 67/75] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9cf2e0c3..48546e98 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ archivebox init If you're using an apt/brew/pip install you can run archivebox commands like this: `archivebox [subcommand] [...args]` If you're using Docker it's equivalent to run the commands like this: -`docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]` +`docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]` And the same in Docker Compose: 
`docker-compose run archivebox [subcommand] [...args]` From 5fd6fcd09ef2442479f2bb1bdbacb65c4ca79bcf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 23:41:59 -0500 Subject: [PATCH 68/75] Update README.md --- README.md | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 48546e98..baa56826 100644 --- a/README.md +++ b/README.md @@ -61,19 +61,15 @@ archivebox --version ```bash mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere archivebox init + +# If you're using an apt/brew/pip install you can run archivebox commands normally +# archivebox [subcommand] [...args] +# If you're using Docker you'll have to run the commands like this +# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args] +# And the equivalent in Docker Compose: +# docker-compose run archivebox [subcommand] [...args] ``` - - -If you're using an apt/brew/pip install you can run archivebox commands like this: -`archivebox [subcommand] [...args]` -If you're using Docker it's equivalent to run the commands like this: -`docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]` -And the same in Docker Compose: -`docker-compose run archivebox [subcommand] [...args]` - - - **Then Add some URLs to your archive collection:** ```bash archivebox add https://github.com/ArchiveBox/ArchiveBox From 1e04fa8108551a699f79706953af2ef581e0676b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Nov 2020 23:44:18 -0500 Subject: [PATCH 69/75] Update README.md --- README.md | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index baa56826..0f2fc5f2 100644 --- a/README.md +++ b/README.md @@ -49,18 +49,6 @@ brew install archivebox/archivebox/archivebox # or for the Python version only, without wget/git/chrome/etc. 
included pip3 install archivebox -``` - -Check that everything installed correctly with: -```bash -archivebox --version -``` - -**To start using archivebox, you have to create a data folder and `cd` into it:** - -```bash -mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere -archivebox init # If you're using an apt/brew/pip install you can run archivebox commands normally # archivebox [subcommand] [...args] @@ -70,22 +58,27 @@ archivebox init # docker-compose run archivebox [subcommand] [...args] ``` +Check that everything installed correctly with `archivebox --version` + +**To start using archivebox, you have to create a data folder and `cd` into it:** + +```bash +mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere +archivebox init +``` + **Then Add some URLs to your archive collection:** ```bash archivebox add https://github.com/ArchiveBox/ArchiveBox archivebox/archivebox add --depth=1 https://example.com ``` -View the snapshots of the URLs you added via the self-hosted web UI: +**View the snapshots of the URLs you added via the self-hosted web UI:** ```bash archivebox manage createsuperuser # create an admin acct archivebox server 0.0.0.0:8000 # start the web server open http://127.0.0.1:8000/ # open the interactive admin panel -``` - -Or just browse the snapshots statically on disk: -```bash -ls ~/archivebox/archive/*/index.html +ls ~/archivebox/archive/*/index.html # or browse the snapshots on disk ``` From b82737cc4dafca49d42edd3cdfd46cf7d5b7c6c1 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 24 Nov 2020 18:49:01 -0500 Subject: [PATCH 70/75] Fix formatting --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0f2fc5f2..394a76cb 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@
-ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a varitety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [apt](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [brew](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). +ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. @@ -97,7 +97,7 @@ For more information, see the
Date: Wed, 25 Nov 2020 12:27:13 -0500 Subject: [PATCH 71/75] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 394a76cb..7825694f 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ archivebox init **Then Add some URLs to your archive collection:** ```bash archivebox add https://github.com/ArchiveBox/ArchiveBox -archivebox/archivebox add --depth=1 https://example.com +archivebox add --depth=1 https://example.com ``` **View the snapshots of the URLs you added via the self-hosted web UI:** From 38b2dec12e3160b2210bc7b5f7e01ddb9d66e82a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 26 Nov 2020 23:36:37 -0500 Subject: [PATCH 72/75] fix broken readme link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7825694f..56afe775 100644 --- a/README.md +++ b/README.md @@ -392,7 +392,7 @@ You can also access the docs locally by looking in the [`ArchiveBox/docs/`](http - [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site) - [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) - [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive) -- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium) +- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install) - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview) - [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting) - [Python API](https://docs.archivebox.io/en/latest/modules.html) From d9ef3d0bf82854b83a94e5c044b28ecb30152e1b Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 27 Nov 2020 19:39:19 -0500 Subject: [PATCH 73/75] ignore lost+found dir in data folder --- archivebox/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/main.py 
b/archivebox/main.py index 3d577a42..aaaaa40f 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -111,6 +111,7 @@ from .logging_util import ( ALLOWED_IN_OUTPUT_DIR = { + 'lost+found', '.DS_Store', '.venv', 'venv', From 07a56f9d463b5cde4f65ab5d823f7d2cbda5f0d5 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 27 Nov 2020 22:59:18 -0500 Subject: [PATCH 74/75] also print platform and CPU info in version output --- archivebox/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/archivebox/main.py b/archivebox/main.py index aaaaa40f..9f15d783 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -3,6 +3,7 @@ __package__ = 'archivebox' import os import sys import shutil +import platform from pathlib import Path from datetime import date @@ -210,6 +211,8 @@ def version(quiet: bool=False, print(VERSION) else: print('ArchiveBox v{}'.format(VERSION)) + p = platform.uname() + print(p.system, p.release, p.machine) print() print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) From e4d2ac432db2627cec7cfa5fb02e8b83bb4269bb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 27 Nov 2020 23:08:23 -0500 Subject: [PATCH 75/75] improve OS kernel output in archivebox version --- archivebox/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/main.py b/archivebox/main.py index 9f15d783..66b9248f 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -212,7 +212,7 @@ def version(quiet: bool=False, else: print('ArchiveBox v{}'.format(VERSION)) p = platform.uname() - print(p.system, p.release, p.machine) + print(p.system, platform.platform(), p.machine) print() print('{white}[i] Dependency versions:{reset}'.format(**ANSI))