python-internetarchive-5.7.2/.github/dependabot.yml

# Keep GitHub Actions up to date with GitHub's Dependabot...
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
version: 2
updates:
  - package-ecosystem: github-actions
    directory: /
    groups:
      github-actions:
        patterns:
          - "*"  # Group all Actions updates into a single larger pull request
    schedule:
      interval: weekly

python-internetarchive-5.7.2/.github/workflows/lint_python.yml

name: lint_python
on: [pull_request, push]
jobs:
  lint_python:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          cache: pip
          python-version: 3.x
      - run: pip install --upgrade pip setuptools wheel
      - run: pip install .[all]
      - run: black --check --skip-string-normalization . || true
      - run: ruff check --output-format=github  # See pyproject.toml for configuration
      - run: pip install -r pex-requirements.txt -r tests/requirements.txt
      - run: mypy .  # See setup.cfg for configuration
      - run: safety check || true  # Temporary fix for https://pyup.io/v/51457/f17

python-internetarchive-5.7.2/.github/workflows/pre-commit.yml

# https://pre-commit.com
# This GitHub Action assumes that the repo contains a valid .pre-commit-config.yaml file.
name: pre-commit
on:
  pull_request:
  push:
    branches: [master]
jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: 3.x
          cache: pip
      - run: pip install pre-commit
      - run: pre-commit --version
      - run: pre-commit install
      - run: pre-commit run --all-files

python-internetarchive-5.7.2/.github/workflows/test_install.yml

name: test_install
on:
  pull_request:
  push:
    branches: [master]
jobs:
  install_internetarchive:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        setuptools-version: ["45.2.0", "58.1.0", "62.4.0"]
    steps:
      - uses: actions/checkout@v6
      - run: pip install setuptools=="${{ matrix.setuptools-version }}"
      - run: pip install .
python-internetarchive-5.7.2/.github/workflows/tox.yml

name: tox
on: [push, pull_request]
jobs:
  tox:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      max-parallel: 1  # Avoid timeout errors
      matrix:
        python: ['3.9', '3.10', '3.11', '3.12', '3.13', '3.14', 'pypy-3.11']
    steps:
      - uses: actions/checkout@v6
      - uses: actions/setup-python@v6
        with:
          python-version: ${{ matrix.python }}
          cache: pip
      - run: pip install tox
      - run: tox -e py

python-internetarchive-5.7.2/.gitignore

.cache
.coverage
.DS_Store
.envrc
.idea
.pytest_cache/
.python-version
.tox
.vagrant
.venv*
.vscode
*.csv
*.egg-info
*.log
*.pex
*.pyc
*gz
build
dist
htmlcov
itemlist.txt
pex/
stairs
TAGS
trash/
v3.10/
v3.7/
v3.8/
v3.9/
wheelhouse
ia.dist/
ia.bin
scripts/
foo.txt

python-internetarchive-5.7.2/.pre-commit-config.yaml

# To enable these pre-commit hooks run:
# `python3 -m pip install pre-commit` or `brew install pre-commit`
# Then in the project root directory run `pre-commit install`
# Learn more about this config here: https://pre-commit.com/

# default_language_version:
#   python: python3.10

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
      - id: check-builtin-literals
      - id: check-executables-have-shebangs
      - id: check-shebang-scripts-are-executable
      - id: check-yaml
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: mixed-line-ending
      - id: requirements-txt-fixer
      - id: trailing-whitespace
  - repo: https://github.com/charliermarsh/ruff-pre-commit
    rev: v0.14.1
    hooks:
      - id: ruff-check
        # args: [ --fix, --exit-non-zero-on-fix ]
  - repo: https://github.com/psf/black
    rev: 25.9.0
    hooks:
      - id: black
        language_version: python3
        args:
          - --diff
          - --skip-string-normalization
  - repo: https://github.com/codespell-project/codespell
    rev: v2.4.1
    hooks:
      - id: codespell  # See setup.cfg for args
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.18.2
    hooks:
      - id: mypy
        additional_dependencies:
          - tqdm-stubs
          - types-jsonpatch
          - types-requests
          - types-setuptools
          - types-ujson
          - types-urllib3
  - repo: https://github.com/asottile/setup-cfg-fmt
    rev: v2.7.0  # Use v2.7.0 until dropping Python 3.9 support
    hooks:
      - id: setup-cfg-fmt

python-internetarchive-5.7.2/AUTHORS.rst

Authors
=======

The Internet Archive Python library and command-line tool is written and maintained by Jake Johnson and various contributors:

Development Lead
----------------

- Jake Johnson

Contributors
------------

- Bryce Drennan
- Max Zettlmeißl
- Ian Tait

Patches and Suggestions
-----------------------

- VM Brasseur

python-internetarchive-5.7.2/CLAUDE.md

# CLAUDE.md

This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.

## Project Overview

Python library and `ia` CLI for interacting with archive.org. Used for uploading, downloading, searching, and managing items and their metadata. Also provides catalog task management and account administration utilities. Items are identified by a unique identifier and contain files and metadata.

## Common Commands

```bash
# Install for development
pip install -e .
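
# Optionally mirror the CI lint job's install step (a sketch based on
# .github/workflows/lint_python.yml above; assumes its "[all]" extra and
# tests/requirements.txt are present in this checkout)
pip install ".[all]" -r tests/requirements.txt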

# Run tests
pytest

# Run tests with linting
ruff check && pytest

# Run a single test file
pytest tests/test_api.py

# Run a specific test
pytest tests/test_api.py::test_get_item

# Multi-version testing (requires Python 3.9-3.14 installed)
tox

# Lint only
ruff check

# Build docs
pip install -r docs/requirements.txt
cd docs && make html
```

## Architecture

The library has a three-layer architecture:

**Layer 1 - Public API (`internetarchive/api.py`)**
Convenience functions that wrap the core classes: `get_item()`, `search_items()`, `upload()`, `download()`, `modify_metadata()`, `delete()`, `configure()`, `get_session()`.

**Layer 2 - Core Classes**
- `ArchiveSession` (`session.py`) - Extends `requests.Session`. Manages config, credentials, HTTP headers, connection pooling.
- `Item` (`item.py`) - Represents an Archive.org item. Contains files, metadata, and methods for download/upload/modify.
- `File` (`files.py`) - Represents a single file within an item. Handles download, delete, checksum verification.
- `Search` (`search.py`) - Query interface with pagination and field selection.

**Layer 3 - Supporting Modules**
- `config.py` - INI-based configuration (credentials at `~/.config/internetarchive/ia.ini` or `~/.ia`)
- `iarequest.py` - HTTP request builders (`MetadataRequest`, `S3Request`)
- `auth.py` - S3 authentication handlers
- `catalog.py` - Catalog task management

**CLI (`internetarchive/cli/`)**
- Entry point: `ia.py:main()` → registered as `ia` console script
- Subcommands: `ia_download.py`, `ia_upload.py`, `ia_metadata.py`, `ia_search.py`, `ia_list.py`, `ia_delete.py`, `ia_copy.py`, `ia_move.py`, `ia_tasks.py`, `ia_configure.py`, etc.

## Code Style

- Line length: 90 characters
- Linter: ruff (configured in `pyproject.toml`)
- Formatter: black
- Type checking: mypy (type stubs in `options.extras_require` under `types`)

## Key Dependencies

- `requests` - HTTP client
- `jsonpatch` - JSON patching for metadata updates
- `tqdm` - Progress bars
- `responses` - HTTP mocking for tests

## Contributing Notes

- All new features should be developed on a feature branch, not directly on master
- PRs require tests and must pass ruff linting
- Avoid introducing new dependencies
- Support Python 3.9+

## Releasing

To release a new version (must be on master with clean working directory):

```bash
# 1. Prepare release (updates __version__.py and HISTORY.rst date)
make prepare-release RELEASE=X.Y.Z

# 2. Review and commit version changes
git diff
git add -A && git commit -m "Bump version to X.Y.Z"

# 3. Publish to PyPI + archive.org + GitHub
make publish-all
```

Individual release targets:

- `make publish` - PyPI + GitHub release (no binary)
- `make publish-all` - PyPI + pex binary + GitHub release
- `make publish-binary` - pex binary only (after PyPI release)

The release process will:

- Run tests and linting
- Build the package
- Build and test the pex binary
- Create and push a git tag
- Upload to PyPI
- Upload binary to archive.org
- Create a GitHub release with changelog from HISTORY.rst
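
## API Usage Sketch

A minimal sketch of the three-layer design described under "Architecture" above: Layer 1 convenience functions return Layer 2 objects. The identifier, query, and format values below are placeholders, not project defaults.

```python
# Layer 1 convenience functions from internetarchive/api.py
from internetarchive import get_item, search_items

# get_item() returns a Layer 2 Item backed by an ArchiveSession
item = get_item("nasa")  # placeholder identifier
print(item.metadata.get("title"))

# Item.get_files() yields Layer 2 File objects
for f in item.get_files(formats="Metadata"):  # filter by files.xml format
    print(f.name, f.size)

# search_items() returns an iterable Search; each result is a dict
for result in search_items("collection:nasa", fields=["identifier"]):
    print(result["identifier"])
    break
```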

python-internetarchive-5.7.2/CONTRIBUTING.rst

How to Contribute
=================

Thank you for considering contributing. All contributions are welcome and appreciated!

Support Questions
-----------------

Please don't use the Github issue tracker for asking support questions. All support questions should be emailed to info@archive.org.

Bug Reports
-----------

`Github issues <https://github.com/jjjake/internetarchive/issues>`_ is used for tracking bugs. Please consider the following when opening an issue:

- Avoid opening duplicate issues by taking a look at the current open issues.
- Provide details on the version, operating system and Python version you are running.
- Include complete tracebacks and error messages.

Pull Requests
-------------

All pull requests and patches are welcome, but please consider the following:

- Include tests.
- Include documentation for new features.
- If your patch is supposed to fix a bug, please describe in as much detail as possible the circumstances in which the bug happens.
- Please follow PEP8, with the exception of what is ignored in ``setup.cfg``. PEP8 compliance is checked when tests run. Tests will fail if your patch is not PEP8 compliant.
- Add yourself to AUTHORS.rst.
- Avoid introducing new dependencies.
- Open an issue if a relevant one is not already open, so others have visibility into what you're working on and efforts aren't duplicated.
- Clarity is preferred over brevity.

Running Tests
-------------

The minimal requirements for running tests are ``pytest``, ``pytest-pep8`` and ``responses``:

.. code:: bash

    $ pip install pytest pytest-pep8 responses

Clone the internetarchive lib:

.. code:: bash

    $ git clone https://github.com/jjjake/internetarchive

Install the internetarchive lib as an editable package:

.. code:: bash

    $ cd internetarchive
    $ pip install -e .

Run the tests:

.. code:: bash

    $ py.test --pep8

Note that this will only test against the Python version you are currently using, however ``internetarchive`` tests against multiple Python versions defined in ``tox.ini``. Tests must pass on all versions defined in ``tox.ini`` for all pull requests.

To test against all supported Python versions, first make sure you have all of the required versions of Python installed. Then simply install and execute tox from the root directory of the repo:

.. code:: bash

    $ pip install tox
    $ tox

Even easier is simply creating a pull request. GitHub Actions are used for continuous integration, and are set up to run the full test suite whenever a pull request is submitted or updated.

python-internetarchive-5.7.2/HISTORY.rst

.. :changelog:

Release History
---------------

5.7.2 (2026-01-29)
++++++++++++++++++

**Features and Improvements**

- Added support for appending a custom suffix to the User-Agent string. The default User-Agent (including access key) is always sent to ensure proper request tracking. Example: with ``user_agent_suffix = MyApp/1.0``, the full User-Agent becomes ``internetarchive/5.7.2 (Darwin x86_64; N; en; ACCESS_KEY) Python/3.9.0 MyApp/1.0``.

  - CLI: ``ia --user-agent-suffix "MyApp/1.0"``
  - Config file: ``user_agent_suffix = MyApp/1.0`` in ``[general]`` section
  - Python API: ``get_session(config={'general': {'user_agent_suffix': 'MyApp/1.0'}})``

**Bugfixes**

- Fixed bug where metadata append was not working correctly when the source metadata field was a list, and was failing with obscure messages in some cases.
- Fixed inverted logic for the ``--download-history`` flag in ``ia download``, where passing the flag would ignore history files instead of downloading them (#735).

5.7.1 (2025-10-29)
++++++++++++++++++

**Bugfixes**

- Explicitly set the Content-Type header in account API requests. This header wasn't being set in some cases and caused some requests to fail.

5.7.0 (2025-10-16)
++++++++++++++++++

**Bugfixes**

- Fixed critical bug in ``ia delete --glob`` where all files were being deleted regardless of the glob pattern.
  This bug was introduced in version v5.4.1 (2025-07-16).
- More metadata insert bugfixes and refactoring.

5.6.1 (2025-10-14)
++++++++++++++++++

**Bugfixes**

- Fixed bug where metadata insert was clobbering and indexed modify writes were inserting.

5.6.0 (2025-10-10)
++++++++++++++++++

**Bugfixes**

- Fixed bug in ``ia download`` where the identifier was needlessly being validated, blocking some items from being downloaded.
- Fixed regression in ``ia download --stdout`` where directories were being created.

**Features and Improvements**

- Added support for the Retry-After HTTP header to improve handling of rate-limited API responses.
- Added support for configuring IA-S3 keys via IA_ACCESS_KEY_ID and IA_SECRET_ACCESS_KEY environment variables.
- Added ``headers`` parameter to ``internetarchive.files.File.download()`` for adding custom headers to download requests.
- Improved Windows filename sanitization.

5.5.1 (2025-09-05)
++++++++++++++++++

**Security**

- **Fixed a critical directory traversal vulnerability in** ``File.download()``. All users are urged to upgrade immediately. This prevents malicious filenames from writing files outside the target directory, a risk especially critical for Windows users.
- Added automatic filename sanitization with platform-specific rules.
- Added path resolution checks to block directory traversal attacks.
- Introduced warnings when filenames are sanitized to maintain user awareness.

**Bugfixes**

- Fixed bug in JSON parsing for ``ia upload --file-metadata ...``.

5.5.0 (2025-07-17)
++++++++++++++++++

**Features and Improvements**

- Added ``--parameters`` option to ``ia metadata``.

5.4.1 (2025-07-16)
++++++++++++++++++

**Features and Improvements**

- Stop setting scanner on upload per policy change.

**Bugfixes**

- Fixed bug where REMOVE_TAG was not working with indexed keys.
- Fixed argument validation and option parsing in ``ia download``.

5.4.0 (2025-04-29)
++++++++++++++++++

**Features and Improvements**

- Added ``--print-auth-header`` option to ``ia configure``.

**Bugfixes**

- Corrected behavior of ``ia_copy`` to avoid dropping path prefixes, fixing ``ia_move`` to properly delete moved files in subdirectories (via PR #693).
- Fixed bug where a hardcoded test comment was being sent with every request.
- Fixed issue where ``ia reviews --index/--noindex`` only worked for the configured user.

5.3.1 (2025-03-26)
++++++++++++++++++

**Bugfixes**

- Fixed bug where ``ia reviews --index/--noindex`` was only working for the configured user.

5.3.0 (2025-03-26)
++++++++++++++++++

**Features and Improvements**

- Added ``ia configure --show`` to print config to stdout.
- Added ``ia configure --check`` for validating credentials.
- Added ``ia configure --whoami`` for retrieving info about the configured user.
- Added ``ia simplelists`` command for managing simplelists.
- Added ``ia flag`` command for managing flags.

**Bugfixes**

- Fixed bugs in ``ia copy`` and ``ia move`` where an ``AttributeError`` was being raised.
- Exit with 0 rather than 1 with ``ia upload --checksum`` if the file already exists.

5.2.1 (2025-02-12)
++++++++++++++++++

**Bugfixes**

- Fixed TypeError bug in ``ia delete`` that was causing all ``ia delete`` commands to fail.
- Fixed bug in ``ia metadata`` where IDs were being validated needlessly, making it impossible to modify some items.
- Fixed bug where bulk download was failing with TypeError.

5.2.0 (2025-01-10)
++++++++++++++++++

**Bugfixes**

- Fixed bug where failed requests to the IA-S3 check_limit API would be treated as a 503 slowdown error.
5.1.0 (2025-01-07)
++++++++++++++++++

**Features and Improvements**

- Added ``--reduced-priority`` option to ``ia metadata``.

**Bugfixes**

- Fixed bugs for URL parameter options in the CLI.
- Fixed various bugs and simplified CLI options with KEY:VALUE values.
- Fixed bug in ``ia --host`` where the host was not being set correctly.
- Removed identifier validation from ``ia reviews``.

5.0.4 (2024-12-10)
++++++++++++++++++

**Bugfixes**

- Fixed bug where ``ia delete --no-backup`` was not turning off backups.
- Fixed bug where ``ia delete`` required you to specify a file.
- Fixed bug where ``ia delete`` did not work correctly with multiple ``--format`` args.

5.0.4 (2024-11-15)
++++++++++++++++++

**Bugfixes**

- Fixed bug where some error messages mentioned the wrong arg in the message.
- Fixed bug where the Scrape API was being used for num-found, even if Advanced Search was triggered via page/rows params.

5.0.3 (2024-11-12)
++++++++++++++++++

**Bugfixes**

- Fixed bug in the CLI where some multi-arguments were being treated as single arguments.
- Fixed bug where InvalidHeader was being raised when a custom scanner was provided in some cases.

5.0.2 (2024-11-11)
++++++++++++++++++

**Bugfixes**

- Fixed bug where ``ia metadata --spreadsheet`` would fail and return ``ia metadata: error: the following arguments are required: identifier``.

5.0.1 (2024-11-08)
++++++++++++++++++

**Bugfixes**

- Fixed bug where the use of signal.SIGPIPE caused the CLI to crash on Windows (SIGPIPE is not available on Windows).

5.0.0 (2024-11-07)
++++++++++++++++++

**Features and Improvements**

- Updated the CLI's command-line argument parsing by replacing the obsolete ``docopt`` with the native ``argparse`` library, ensuring continued functionality and future compatibility.

  **Note:** While the CLI functionality hasn't changed, some commands may need to be formatted slightly differently. If you encounter any issues, refer to ``ia --help`` and ``ia {command} --help``.

4.1.0 (2024-05-07)
++++++++++++++++++

**Bugfixes**

- Use mtime from files.xml if no Last-Modified header is available (e.g. VTT files).

4.0.1 (2024-04-15)
++++++++++++++++++

**Features and Improvements**

- Partially downloaded files will now automatically resume where they left off when retried.
- Use the ``Last-Modified`` header to set all mtimes (this includes files.xml now).

3.7.0 (2024-03-19)
++++++++++++++++++

**Features and Improvements**

- Added support for JSON Patch test operations, via the ``expect`` parameter.
- Added support for moving values via ``--append-list`` (now, rather than ignoring any requests where the value is already present, ``--append-list`` will move the value to the end of the list).
- Switched to importlib-metadata to drop deprecated pkg_resources.

**Bugfixes**

- Fixed automatic size hint on uploads.
- Fixed bug where auth wasn't being sent for searches with user_aggs params.

3.6.0 (2023-12-27)
++++++++++++++++++

**Features and Improvements**

- Added ``set_scanner`` and ``--no-scanner`` options to upload to stop ia's default behavior of setting the scanner field in meta.xml on initial upload.
- ``0`` is now returned instead of an exception when search fails to retrieve the total number of hits for a query.

3.5.0 (2023-05-09)
++++++++++++++++++

**Bugfixes**

- Fixed bug in ``ia metadata --insert`` where duplicate values were being added in some cases.

**Features and Improvements**

- Added timeout option for metadata writes. Set default to 60 seconds.
3.4.0 (2023-04-05)
++++++++++++++++++

**Features and Improvements**

- Added parameters for filtering files based on their source value in files.xml.
- Added support for downloading multiple files to stdout.
- Added timeout parameter to download.

3.3.0 (2023-01-06)
++++++++++++++++++

**Features and Improvements**

- Added support for inserting metadata into an existing multi-value metadata field. It differs from ``ia metadata --modify collection[0]:foo`` in that it does not clobber. For example, ``ia metadata --insert collection[0]:foo`` will insert ``foo`` as the first collection; it will not clobber.

**Bugfixes**

- Fixed bug in search where timeouts would always be returned on queries submitted to the files index where more than 10,000 results would be returned.

3.2.0 (2023-01-06)
++++++++++++++++++

**Features and Improvements**

- Added support for admins to delete reviews via itemname.

3.1.0 (2023-01-06)
++++++++++++++++++

**Bugfixes**

- Fixed bug in ``ia search --fts`` where ``--itemlist`` was printing empty lines.
- Fixed bug in ``ia search --fts`` where ``-p scope:all`` was not working.
- Fixed directory creation race conditions in download.
- Fixed bug in ``ia download --stdout`` where nothing would be printed to stdout if the specified file existed on disk.
- Fixed bug that made it impossible to upload to user items.
- Fixed memoryview error when running ``Item.upload`` with ``StringIO`` input and ``verbose=True``.
- Fixed bug in upload where a period was not being expanded properly to the contents of the current directory.

**Features and Improvements**

- Added support for admins to delete other users' reviews.
- Added support for excluding files in ``ia download`` via the ``--exclude`` parameter.
- Various refactoring and code simplifications.

3.0.2 (2022-06-15)
++++++++++++++++++

**Bugfixes**

- Fixed bug where installation would fail in some cases if ``requests``, ``tqdm``, or ``jsonpatch`` were not already installed.

3.0.1 (2022-06-02)
++++++++++++++++++

**Features and Improvements**

- Cut down on the number of HTTP requests made by search.
- Added Python type hints, and other Python 3 improvements.

3.0.0 (2022-03-17)
++++++++++++++++++

**Breaking changes**

- Removed Python 2.7, 3.5, and 3.6 support.
- ``ia download`` no longer has a ``--verbose`` option, and ``--silent`` has been renamed to ``--quiet``.
- ``internetarchive.download``, ``Item.download`` and ``File.download`` no longer have a ``silent`` keyword argument. They are silent by default now unless ``verbose`` is set to ``True``.

**Features and Improvements**

- ``page`` parameter is no longer required if ``rows`` parameter is specified in search requests.
- advancedsearch.php endpoint now supports IAS3 authorization.
- ``ia upload`` now has a ``--keep-directories`` option to use the full local file paths as the remote name.
- Added progress bars to ``ia download``.

**Bugfixes**

- Fixed treatment of list-like file metadata in ``ia list`` under Python 3.
- Fixed ``ia upload --debug`` only displaying the first request.
- Fixed uploading from stdin crashing with UnicodeDecodeError or TypeError exception.
- Fixed ``ia upload`` silently ignoring exceptions.
- Fixed uploading from a spreadsheet with a BOM (UTF-8 byte-order mark) raising a KeyError.
- Fixed uploading from a spreadsheet not reusing the ``identifier`` column.
- Fixed uploading from a spreadsheet not correctly dropping the ``item`` column from metadata.
- Fixed uploading from a spreadsheet with ``--checksum`` crashing on skipped files.
- Fixed minor bug in S3 overload check on upload error retries.
- Fixed various messages being printed to stdout instead of stderr.
- Fixed format selection for on-the-fly files.

2.3.0 (2022-01-20)
++++++++++++++++++

**Features and Improvements**

- Added support for the ``IA_CONFIG_FILE`` environment variable to specify the configuration file path.
- Added ``--no-derive`` option to ``ia copy`` and ``ia move``.
- Added ``--no-backup`` option to ``ia copy``, ``ia move``, ``ia upload``, and ``ia delete``.

**Bugfixes**

- Fixed bug where queries to the Scrape API (e.g. most search requests made by ``internetarchive``) would fail to return all docs without any error reporting, if the Scrape API times out. All queries to the Scrape API are now tested to assert the number of docs returned matches the hit count returned by the Scrape API. If these numbers don't match, an exception is thrown in the Python API and the CLI exits with a non-zero exit code and error message.
- Use .archive.org as the default cookie domain. This fixes a bug where an AttributeError exception would be raised if a cookie wasn't set in a config file.

2.2.0 (2021-11-23)
++++++++++++++++++

**Features and Improvements**

- Added ``ia reviews --delete``.
- Added ability to fetch a user's reviews from an item via ``ia reviews``.

**Bugfixes**

- Fixed bug in the ``ArchiveSession`` object where domains weren't getting set properly for cookies. This caused archive.org cookies to be sent to other domains.
- Fixed bug in URL param parser for the CLI.
- Fixed Python 2 bug in ``ia upload --spreadsheet``.

2.1.0 (2021-08-25)
++++++++++++++++++

**Features and Improvements**

- Better error messages in ``ia upload --spreadsheet``.
- Added support for REMOTE_NAME in ``ia upload --spreadsheet`` via a ``REMOTE_NAME`` column.
- Implemented XDG Base Directory specification.

**Bugfixes**

- Fixed bug in FTS where searches would crash with a TypeError exception.
- Improved Python 2 compatibility.

2.0.3 (2021-05-03)
++++++++++++++++++

**Bugfixes**

- Fixed bug where some "falsey"/empty values were being dropped when modifying metadata.

2.0.2 (2021-04-06)
++++++++++++++++++

- Fixing pypi issues...

2.0.1 (2021-04-06)
++++++++++++++++++

**Bugfixes**

- Exit with 0 in ``ia tasks --cmd ...`` if a task is already queued or running.

2.0.0 (2021-04-05)
++++++++++++++++++

**Features and Improvements**

- Automatic paging scrolling added to ``ia search --fts``.
- Default support for lucene queries in ``ia search --fts``.
- Added support for getting rate-limit information from the Tasks API (i.e. ``ia tasks --get-rate-limit --cmd derive.php``).
- Added ability to set a remote-filename in a spreadsheet when uploading via ``ia upload --spreadsheet ...``.

**Bugfixes**

- Fixed bug in ``ia metadata --remove ...`` where multiple collections would be removed if the specified collection was a substring of any of the existing collections.
- Fixed bug in ``ia metadata --remove ...`` where removing multiple collections was sometimes not supported.

1.9.9 (2021-01-27)
++++++++++++++++++

**Features and Improvements**

- Added support for the FTS API.
- Validate identifiers in a spreadsheet before uploading files with ``ia upload --spreadsheet``.
- Added ``ia configure --print-cookies``. This is helpful for using your archive.org cookies in other programs like ``curl``, e.g. ``curl -b $(ia configure --print-cookies) ...``.

1.9.6 (2020-11-10)
++++++++++++++++++

**Features and Improvements**

- Added ability to submit tasks with a reduced priority.
- Added ability to add headers to modify_metadata requests.

**Bugfixes**

- Bumped version requirements for ``six``. This addresses the "No module named collections_abc" error.

1.9.5 (2020-09-18)
++++++++++++++++++

**Features and Improvements**

- Increased chunk size in download and added other download optimizations.
- Added support for submitting reviews via ``Item.review()`` and ``ia review``.
- Improved exception/error messages in cases where s3.us.archive.org returns invalid XML during uploads.
- Minor updates and improvements to continuous integration.

1.9.4 (2020-06-24)
++++++++++++++++++

**Features and Improvements**

- Added support for adding file-level metadata at time of upload.
- Added ``--no-backup`` to ``ia upload`` to turn off backups.

**Bugfixes**

- Fixed bug in ``internetarchive.get_tasks`` where no tasks were returned unless ``catalog`` or ``history`` params were provided.
- Fixed bug in upload where headers were being reused in certain cases. This led to issues such as queue-derive being turned off in some cases.
- Fixed crash in ``ia tasks`` when a task log contains an invalid UTF-8 character.
- Fixed bug in upload where requests were not being closed.

1.9.3 (2020-04-07)
++++++++++++++++++

**Features and Improvements**

- Added support for removing items from simplelists as if they were collections.
- Added ``Item.derive()`` method for deriving items.
- Added ``Item.fixer()`` method for submitting fixer tasks.
- Added ``--task-args`` to ``ia tasks`` for submitting task args to the Tasks API.

**Bugfixes**

- Minor bug fix in ``ia tasks`` to fix support for tasks that do not require a ``--comment`` option.

1.9.2 (2020-03-15)
++++++++++++++++++

**Features and Improvements**

- Switched to ``tqdm`` for progress bar (``clint`` is no longer maintained).
- Added ``Item.identifier_available()`` method for calling check_identifier.php.
- Added support for opening the details page in the default browser after upload.
- Added support for using ``item`` or ``identifier`` as column header in spreadsheet mode.
- Added ``ArchiveSession.get_my_catalog()`` method for retrieving running/queued tasks.
- Removed backports.csv requirement for newer Python releases.
- Authorization header is now used for metadata reads, to support privileged access to /metadata.
- ``ia download`` no longer downloads the history dir by default.
- Added ``ignore_history_dir`` to ``Item.download()``. The default is False.

**Bugfixes**

- Fixed bug in ``ia copy`` and ``ia move`` where filenames weren't being encoded/quoted correctly.
- Fixed bug in ``Item.get_all_item_tasks()`` where all calls would fail unless a dict was provided to ``params``.
- Read from ~/.config/ia.ini with fallback to ~/.ia regardless of the existence of ~/.config.
- Fixed S3 overload message always mentioning the total maximum number of retries, not the remaining ones.
- Fixed bug where a KeyError exception would be raised on most calls to dark items.
- Fixed bug where md5 was being calculated for every upload.

1.9.0 (2019-12-05)
++++++++++++++++++

**Features and Improvements**

- Implemented new archive.org Tasks API.
- Added support for darking and undarking items via the Tasks API.
- Added support for submitting arbitrary tasks (only darking/undarking currently supported, see Tasks API documentation).

**Bugfixes**

- ``ia download`` now displays ``download failed`` instead of ``success`` when a download fails.
- Fixed bug where ``Item.get_file`` would not work on unicode names in Python 2.
1.8.5 (2019-06-07)
++++++++++++++++++

**Features and Improvements**

- Improved timeout logging and exceptions.
- Added support for arbitrary targets to metadata write.
- IA-S3 keys now supported for auth in download.
- Authorization (i.e. ``ia configure``) now uses the archive.org xauthn endpoint.

**Bugfixes**

- Fixed encoding error in ``--get-task-log``.
- Fixed bug in upload where connections were not being closed.

1.8.4 (2019-04-11)
++++++++++++++++++

**Features and Improvements**

- It's now possible to retrieve task logs, given a task id, without first retrieving the item's task history.
- Added examples to ``ia tasks`` help.

1.8.3 (2019-03-29)
++++++++++++++++++

**Features and Improvements**

- Increased search timeout from 24 to 300 seconds.

**Bugfixes**

- Fixed bug in setup.py where backports.csv wasn't being installed when installing from pypi.

1.8.2 (2019-03-21)
++++++++++++++++++

**Features and Improvements**

- Documentation updates.
- Added support for write-many to modify_metadata.

**Bugfixes**

- Fixed bug in ``ia tasks --task-id`` where no task was being returned.
- Fixed bug in ``internetarchive.get_tasks()`` where it was not possible to query by ``task_id``.
- Fixed TypeError bug in upload when uploading with checksum=True.

1.8.1 (2018-06-28)
++++++++++++++++++

**Bugfixes**

- Fixed bug in ``ia tasks --get-task-log`` that was returning an unable to parse JSON error.

1.8.0 (2018-06-28)
++++++++++++++++++

**Features and Improvements**

- Only use backports.csv for Python 2 in support of the FreeBSD port.
- Added a nicer error message to ``ia search`` for authentication errors.
- Added support for using netrc files in ``ia configure``.
- Added ``--remove`` option to ``ia metadata`` for removing values from single or multi-field metadata elements.
- Added support for appending a metadata value to an existing metadata element (as a new entry, not simply appending to a string).
- Added ``--no-change-timestamp`` flag to ``ia download``. Downloaded files retain the timestamp of "now", not of the source material, when this option is used.

**Bugfixes**

- Fixed bug in upload where StringIO objects were not uploadable.
- Fixed encoding issues that were causing some ``ia tasks`` commands to fail.
- Fixed bug where keep-old-version wasn't working in ``ia move``.
- Fixed bug in ``internetarchive.api.modify_metadata`` where debug and other args were not honoured.

1.7.7 (2018-03-05)
++++++++++++++++++

**Features and Improvements**

- Added support for downloading on-the-fly archive_marc.xml files.

**Bugfixes**

- Improved syntax checking in ``ia move`` and ``ia copy``.
- Added ``Connection:close`` header to all requests to force close connections after each request. This is a workaround for dealing with a bug on archive.org servers where the server hangs up before sending the complete response.

1.7.6 (2018-01-05)
++++++++++++++++++

**Features and Improvements**

- Added ability to set the remote-name for a directory in ``ia upload`` (previously you could only do this for single files).

**Bugfixes**

- Fixed bug in ``ia delete`` where all requests were failing due to a typo in a function arg.

1.7.5 (2017-12-07)
++++++++++++++++++

**Features and Improvements**

- Turned on the ``x-archive-keep-old-version`` S3 header by default for all ``ia upload``, ``ia delete``, ``ia copy``, and ``ia move`` commands. This means that any ``ia`` command that clobbers or deletes a file will save a version of the file in ``/history/files/$key.~N~``. This is only on by default in the CLI, and not in the Python lib.
  It can be turned off by adding ``-H x-archive-keep-old-version:0`` to any ``ia upload``, ``ia delete``, ``ia copy``, or ``ia move`` command.

1.7.4 (2017-11-06)
++++++++++++++++++

**Features and Improvements**

- Increased timeout in search from 12 seconds to 24.
- Added ability to set ``max_retries`` in :func:`internetarchive.search_items`.
- Made :meth:`internetarchive.ArchiveSession.mount_http_adapter` a public method for supporting complex custom retry logic.
- Added ``--timeout`` option to ``ia search`` for setting a custom timeout.
- Loosened requirements for the schema library to ``schema>=0.4.0``.

**Bugfixes**

- The scraping API has reverted to using the ``items`` key rather than the ``docs`` key. v1.7.3 will still work, but this change keeps ia consistent with the API.

1.7.3 (2017-09-20)
++++++++++++++++++

**Bugfixes**

- Fixed bug in search where search requests were failing with ``KeyError: 'items'``.

1.7.2 (2017-09-11)
++++++++++++++++++

**Features and Improvements**

- Added support for adding custom headers to ``ia search``.

**Bugfixes**

- ``internetarchive.utils.get_s3_xml_text()`` is used to parse errors returned by S3 in XML. Sometimes there is no XML in the response. Most of the time this is due to 5xx errors. Either way, we want to always return the HTTPError, even if the XML parsing fails.
- Fixed a regression where ``:`` was being stripped from filenames in upload.
- Do not create a directory in ``download()`` when ``return_responses`` is ``True``.
- Fixed bug in upload where file-like objects were failing with a TypeError exception.

1.7.1 (2017-07-25)
++++++++++++++++++

**Bugfixes**

- Fixed bug in ``Item.upload_file()`` where ``checksum`` was being set to ``True`` if it was set to ``None``.

1.7.1 (2017-07-25)
++++++++++++++++++

**Bugfixes**

- Fixed bug in ``ia upload`` where all commands would fail if multiple collections were specified (e.g. -m collection:foo -m collection:bar).

1.7.0 (2017-07-25)
++++++++++++++++++

**Features and Improvements**

- Loosened up ``jsonpatch`` requirements, as the metadata API now supports more recent versions of the JSON Patch standard.
- Added support for building "snap" packages (https://snapcraft.io/).

**Bugfixes**

- Fixed bug in upload where users were unable to add their own timeout via ``request_kwargs``.
- Fixed bug where files with non-ascii filenames failed to upload on some platforms.
- Fixed bug in upload where metadata keys with an index (e.g. ``subject[0]``) would make the request fail if the key was the only indexed key provided.
- Added a default timeout to ``ArchiveSession.s3_is_overloaded()``. If it times out now, it returns ``True`` (as in, yes, S3 is overloaded).

1.6.0 (2017-06-27)
++++++++++++++++++

**Features and Improvements**

- Added 60 second timeout to all upload requests.
- Added support for uploading empty files.
- Refactored ``Item.get_files()`` to be faster, especially for items with many files.
- Updated search to use IA-S3 keys for auth instead of cookies.

**Bugfixes**

- Fixed bug in upload where derives weren't being queued in some cases where checksum=True was set.
- Fixed bug where ``ia tasks`` and other ``Catalog`` functions were always using HTTP even when they should have been using HTTPS.
- ``ia metadata`` was exiting with a non-zero status for "no changes to xml" errors. This now exits with 0, as nearly every time this happens it should not be considered an "error".
- Added unicode support to ``ia upload --spreadsheet`` and ``ia metadata --spreadsheet`` using the ``backports.csv`` module.
- Fixed bug in ``ia upload --spreadsheet`` where some metadata was accidentally being copied from previous rows (e.g. when multiple subjects were used).
- Submitter wasn't being added to ``ia tasks --json`` output, it now is.
- ``row_type`` in ``ia tasks --json`` was returning integer for row-type rather than name (e.g. 'red').

1.5.0 (2017-02-17)
++++++++++++++++++

**Features and Improvements**

- Added option to download() for returning a list of response objects rather than writing files to disk.

1.4.0 (2017-01-26)
++++++++++++++++++

**Bugfixes**

- Another bugfix for setting mtime correctly after ``fileobj`` functionality was added to ``ia download``.

1.3.0 (2017-01-26)
++++++++++++++++++

**Bugfixes**

- Fixed bug where download was trying to set mtime, even when ``fileobj`` was set to ``True`` (e.g. ``ia download --stdout``).

1.2.0 (2017-01-26)
++++++++++++++++++

**Features and Improvements**

- Added ``ia copy`` and ``ia move`` for copying and moving files in archive.org items.
- Added support for outputting JSON in ``ia tasks``.
- Added support to ``ia download`` to write to stdout instead of file.

**Bugfixes**

- Fixed bug in upload where AttributeError was raised when trying to upload file-like objects without a name attribute.
- Removed identifier validation from ``ia delete``. If an identifier already exists, we don't need to validate it. This only makes things annoying if an identifier exists but fails ``internetarchive`` id validation.
- Fixed bug where error message isn't returned in ``ia upload`` if the response body is not XML. Ideally IA-S3 would always return XML, but that's not the case as of now. Try to dump the HTML in the S3 response if unable to parse XML.
- Fixed bug where ArchiveSession headers weren't being sent in prepared requests.
- Fixed bug in ``ia upload --size-hint`` where value was an integer, but requests requires it to be a string.
- Added support for downloading files to stdout in ``ia download`` and ``File.download``.

1.1.0 (2016-11-18)
++++++++++++++++++

**Features and Improvements**

- Make sure collection exists when creating new item via ``ia upload``. If it doesn't, upload will fail.
- Refactored tests.

**Bugfixes**

- Fixed bug where the full filepath was being set as the remote filename in Windows.
- Convert all metadata header values to strings for compatibility with ``requests>=2.11.0``.

1.0.10 (2016-09-20)
+++++++++++++++++++

**Bugfixes**

- Convert x-archive-cascade-delete headers to strings for compatibility with ``requests>=2.11.0``.

1.0.9 (2016-08-16)
++++++++++++++++++

**Features and Improvements**

- Added support to the CLI for providing username and password as options on the command-line.

1.0.8 (2016-08-10)
++++++++++++++++++

**Features and Improvements**

- Increased maximum identifier length from 80 to 100 characters in ``ia upload``.

**Bugfixes**

- As of version 2.11.0 of the requests library, all header values must be strings (i.e. not integers). ``internetarchive`` now converts all header values to strings.

1.0.7 (2016-08-02)
++++++++++++++++++

**Features and Improvements**

- Added ``internetarchive.api.get_user_info()``.

1.0.6 (2016-07-14)
++++++++++++++++++

**Bugfixes**

- Fixed bug where upload was failing on file-like objects (e.g. StringIO objects).

1.0.5 (2016-07-07)
++++++++++++++++++

**Features and Improvements**

- All metadata writes are now submitted at -5 priority by default. This is friendlier to the archive.org catalog, and should only be changed for one-off metadata writes.
- Expanded scope of valid identifiers in ``utils.validate_ia_identifier`` (i.e. ``ia upload``). Periods are now allowed. Periods, underscores, and dashes are not allowed as the first character.

1.0.4 (2016-06-28)
++++++++++++++++++

**Features and Improvements**

- Search now uses the v1 scraping API endpoint.
- Moved ``internetarchive.item.Item.upload.iter_directory()`` to ``internetarchive.utils``.
- Added support for downloading "on-the-fly" files (e.g. EPUB, MOBI, and DAISY) via ``ia download --on-the-fly`` or ``item.download(on_the_fly=True)``.

**Bugfixes**

- ``s3_is_overloaded()`` now returns ``True`` if the call is unsuccessful.
- Fixed bug in upload where a derive task wasn't being queued when a directory is uploaded.

1.0.3 (2016-05-16)
++++++++++++++++++

**Features and Improvements**

- Use scrape API for getting total number of results rather than the advanced search API.
- Improved error messages for IA-S3 (upload) related errors.
- Added retry support to delete.
- ``ia delete`` no longer exits if a single request fails when deleting multiple files, but continues onto the next file. If any file fails, the command will exit with a non-zero status code.
- All search requests now require authentication via IA-S3 keys. You can run ``ia configure`` to generate a config file that will be used to authenticate all search requests automatically. For more details refer to the following links:

  - http://internetarchive.readthedocs.io/en/latest/quickstart.html?highlight=configure#configuring
  - http://internetarchive.readthedocs.io/en/latest/api.html#configuration

- Added ability to specify your own filepath in ``ia configure`` and ``internetarchive.configure()``.

**Bugfixes**

- Updated ``requests`` lib version requirements. This resolves issues with sending binary strings as bodies in Python 3.
- Improved support for Windows, see https://github.com/jjjake/internetarchive/issues/126 for more details.
- Previously all requests were made in HTTP for Python versions < 2.7.9 due to the issues described at https://urllib3.readthedocs.org/en/latest/security.html. In favor of security over convenience, all requests are now made via HTTPS regardless of Python version. Refer to http://internetarchive.readthedocs.org/en/latest/troubleshooting.html#https-issues if you are experiencing issues.
- Fixed bug in ``ia`` CLI where ``--insecure`` was still making HTTPS requests when it should have been making HTTP requests.
- Fixed bug in ``ia delete`` where ``--all`` option wasn't working because it was using ``item.iter_files`` instead of ``item.get_files``.
- Fixed bug in ``ia upload`` where uploading files with unicode file names were failing.
- Fixed bug in upload where filenames with ``;`` characters were being truncated.
- Fixed bug in ``internetarchive.catalog`` where TypeError was being raised in Python 3 due to mixing bytes with strings.

1.0.2 (2016-03-07)
++++++++++++++++++

**Bugfixes**

- Fixed OverflowError bug in uploads on 32-bit systems when uploading files larger than ~2GB.
- Fixed unicode bug in upload where ``urllib.parse.quote`` is unable to parse non-encoded strings.

**Features and Improvements**

- Only generate MD5s in upload if they are used (i.e. verify, delete, or checksum is True).
- verify is off by default in ``ia upload``, it can be turned on with ``ia upload --verify``.

1.0.1 (2016-03-04)
++++++++++++++++++

**Bugfixes**

- Fixed memory leak in ``ia upload --spreadsheet=metadata.csv``.
- Fixed arg parsing bug in ``ia`` CLI.
1.0.0 (2016-03-01)
++++++++++++++++++

**Features and Improvements**

- Renamed ``internetarchive.iacli`` to ``internetarchive.cli``.
- Moved ``File`` object to ``internetarchive.files``.
- Converted config format from YAML to INI to avoid PyYAML requirement.
- Use HTTPS by default for Python versions > 2.7.9.
- Added ``get_username`` function to API.
- Improved Python 3 support. ``internetarchive`` is now being tested against Python versions 2.6, 2.7, 3.4, and 3.5.
- Improved plugin support.
- Added retry support to download and metadata retrieval.
- Added ``Collection`` object.
- Made ``Item`` objects hashable and orderable.

**Bugfixes**

- IA's Advanced Search API no longer supports deep-paging of large result sets. All search functions have been refactored to use the new Scrape API (http://archive.org/help/aboutsearch.htm). Search functions in previous versions are effectively broken, upgrade to >=1.0.0.

0.9.8 (2015-11-09)
++++++++++++++++++

**Bugfixes**

- Fixed ``ia help`` bug.
- Fixed bug in ``File.download()`` where connection errors weren't being caught/retried correctly.

0.9.7 (2015-11-05)
++++++++++++++++++

**Bugfixes**

- Cleanup partially downloaded files when ``download()`` fails.

**Features and Improvements**

- Added ``--format`` option to ``ia delete``.
- Refactored ``download()`` and ``ia download`` to behave more like rsync. Files are now clobbered by default, ``ignore_existing`` and ``--ignore-existing`` now skip over files already downloaded without making a request.
- Added retry support to ``download()`` and ``ia download``.
- Added ``files`` kwarg to ``Item.download()`` for downloading specific files.
- Added ``ignore_errors`` option to ``File.download()`` for ignoring (but logging) exceptions.
- Added default timeouts to metadata and download requests.
- Less verbose output in ``ia download`` by default, use ``ia download --verbose`` for old style output.

0.9.6 (2015-10-12)
++++++++++++++++++

**Bugfixes**

- Removed sync-db features for now, as lazytable is not playing nicely with setup.py right now.

0.9.5 (2015-10-12)
++++++++++++++++++

**Features and Improvements**

- Added skip based on mtime and length if no other clobber/skip options specified in ``download()`` and ``ia download``.

0.9.4 (2015-10-01)
++++++++++++++++++

**Features and Improvements**

- Added ``internetarchive.api.get_username()`` for retrieving a username with an S3 key-pair.
- Added ability to sync downloads via an sqlite database.

0.9.3 (2015-09-28)
++++++++++++++++++

**Features and Improvements**

- Added ability to download items from an itemlist or search query in ``ia download``.
- Made ``ia configure`` Python 3 compatible.

**Bugfixes**

- Fixed bug in ``ia upload`` where uploading an item with more than one collection specified caused the collection check to fail.

0.9.2 (2015-08-17)
++++++++++++++++++

**Bugfixes**

- Added error message for failed ``ia configure`` calls due to invalid creds.

0.9.1 (2015-08-13)
++++++++++++++++++

**Bugfixes**

- Updated docopt to v0.6.2 and PyYAML to v3.11.
- Updated setup.py to automatically pull version from ``__init__``.

0.8.5 (2015-07-13)
++++++++++++++++++

**Bugfixes**

- Fixed UnicodeEncodeError in ``ia metadata --append``.

**Features and Improvements**

- Added configuration documentation to readme.
- Updated requests to v2.7.0.

0.8.4 (2015-06-18)
++++++++++++++++++

**Features and Improvements**

- Added check to ``ia upload`` to see if the collection being uploaded to exists. Also added an option to override this check.
0.8.3 (2015-05-18)
++++++++++++++++++

**Features and Improvements**

- Fixed append to work like a standard metadata update if the metadata field does not yet exist for the given item.

0.8.0 (2015-03-09)
++++++++++++++++++

**Bugfixes**

- Encode filenames in upload URLs.

0.7.9 (2015-01-26)
++++++++++++++++++

**Bugfixes**

- Fixed bug in ``internetarchive.config.get_auth_config`` (i.e. ``ia configure``) where the logged-in cookies returned expired within hours. Cookies should now be valid for about one year.

0.7.8 (2014-12-23)
++++++++++++++++++

- Output error message when downloading non-existing files in ``ia download`` rather than raising Python exception.
- Fixed IOError in ``ia search`` when using ``head``, ``tail``, etc.
- Simplified ``ia search`` to output only JSON, rather than doing any special formatting.
- Added experimental support for creating pex binaries of ia in ``Makefile``.

0.7.7 (2014-12-17)
++++++++++++++++++

- Simplified ``ia configure``. It now only asks for Archive.org email/password and automatically adds S3 keys and Archive.org cookies to config. See ``internetarchive.config.get_auth_config()``.

0.7.6 (2014-12-17)
++++++++++++++++++

- Write metadata to stdout rather than stderr in ``ia mine``.
- Added options to search archive.org/v2.
- Added destdir option to download files/itemdirs to a given destination dir.

0.7.5 (2014-10-08)
++++++++++++++++++

- Fixed typo.

0.7.4 (2014-10-08)
++++++++++++++++++

- Fixed missing "import" typo in ``internetarchive.iacli.ia_upload``.

0.7.3 (2014-10-08)
++++++++++++++++++

- Added progress bar to ``ia mine``.
- Fixed unicode metadata support for ``upload()``.

0.7.2 (2014-09-16)
++++++++++++++++++

- Suppress ``KeyboardInterrupt`` exceptions and exit with status code 130.
- Added ability to skip downloading files based on checksum in ``ia download``, ``Item.download()``, and ``File.download()``.
- ``ia download`` is now verbose by default. Output can be suppressed with the ``--quiet`` flag.
- Added an option to not download into item directories, but rather the current working directory (i.e. ``ia download --no-directories``).
- Added/fixed support for modifying different metadata targets (i.e. files/logo.jpg).

0.7.1 (2014-08-25)
++++++++++++++++++

- Added ``Item.s3_is_overloaded()`` method for S3 status check. This method is now used on retries in the upload method as well. This will avoid uploading any data if a 503 is expected. If a 503 is still returned, retries are attempted.
- Added ``--status-check`` option to ``ia upload`` for S3 status check.
- Added ``--source`` parameter to ``ia list`` for returning files matching IA source (i.e. original, derivative, metadata, etc.).
- Added support to ``ia upload`` for setting remote-name if only a single file is being uploaded.
- Derive tasks are now only queued after the last file has been uploaded.
- File URLs are now quoted in ``File`` objects, for downloading files with special characters in their filenames.

0.7.0 (2014-07-23)
++++++++++++++++++

- Added support for retry on S3 503 SlowDown errors.

0.6.9 (2014-07-15)
++++++++++++++++++

- Added support for \n and \r characters in upload headers.
- Added support for reading filenames from stdin when using the ``ia delete`` command.

0.6.8 (2014-07-11)
++++++++++++++++++

- The delete ``ia`` subcommand is now verbose by default.
- Added glob support to the delete ``ia`` subcommand (i.e. ``ia delete --glob='*jpg'``).
- Changed indexed metadata elements to clobber values instead of insert.
- AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are now deprecated. IAS3_ACCESS_KEY and IAS3_SECRET_KEY must be used if setting IAS3 keys via environment variables.

python-internetarchive-5.7.2/LICENSE

GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007

Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.

Preamble

The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software.

The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users.

When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things.

Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software.

A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public.

The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version.

An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license.

The precise terms and conditions for copying, distribution and modification follow.

TERMS AND CONDITIONS

0. Definitions.

"This License" refers to version 3 of the GNU Affero General Public License.

"Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks.

"The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. 
The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. 
This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. 
A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. 
When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. 
Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. 
Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. 
For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see . 
python-internetarchive-5.7.2/MANIFEST.in000066400000000000000000000001051513674652200200030ustar00rootroot00000000000000include LICENSE AUTHORS.rst HISTORY.rst recursive-include tests *.py python-internetarchive-5.7.2/Makefile000066400000000000000000000102151513674652200177100ustar00rootroot00000000000000.PHONY: docs clean clean-dist test binary test-binary check-release check-version \ build tag push-tag upload-pypi publish-binary-upload github-release \ publish publish-all publish-binary docs-init init pep8-test prepare-release VERSION=$(shell grep -m1 __version__ internetarchive/__version__.py | cut -d\' -f2) # ============ Development ============ init: pip install responses==0.5.0 pytest-cov pytest-pep8 pip install -e . clean: find . -type f -name '*\.pyc' -delete find . -type d -name '__pycache__' -delete clean-dist: rm -rf dist/ build/ *.egg-info pep8-test: py.test --pep8 -m pep8 --cov-report term-missing --cov internetarchive test: ruff check pytest # ============ Documentation ============ docs-init: pip install -r docs/requirements.txt docs: cd docs && make html @echo "\033[95m\n\nBuild successful! View the docs homepage at docs/build/html/index.html.\n\033[0m" # ============ Binary Building ============ binary: pex . --python-shebang='/usr/bin/env python3' --python python3 -e internetarchive.cli.ia:main -o ia-$(VERSION)-py3-none-any.pex -r pex-requirements.txt --use-pep517 test-binary: binary @echo "Testing pex binary..." ./ia-$(VERSION)-py3-none-any.pex --version ./ia-$(VERSION)-py3-none-any.pex --help > /dev/null ./ia-$(VERSION)-py3-none-any.pex metadata --help > /dev/null @echo "Pex binary tests passed!" # ============ Release Preparation ============ # Usage: make prepare-release RELEASE=5.7.2 prepare-release: ifndef RELEASE $(error RELEASE is required. Usage: make prepare-release RELEASE=5.7.2) endif @if echo "$(RELEASE)" | grep -q 'dev'; then \ echo "Error: RELEASE should not contain 'dev'"; exit 1; \ fi sed -i '' "s/__version__ = '.*'/__version__ = '$(RELEASE)'/" internetarchive/__version__.py sed -i '' "s/^$(RELEASE) (?)$$/$(RELEASE) ($$(date +%Y-%m-%d))/" HISTORY.rst @echo "Updated to version $(RELEASE) with date $$(date +%Y-%m-%d)" @echo "Review changes and commit when ready" # ============ Release Validation ============ check-release: @if [ "$$(git rev-parse --abbrev-ref HEAD)" != "master" ]; then \ echo "Error: Must be on master branch to release"; exit 1; \ fi @if [ -n "$$(git status --porcelain)" ]; then \ echo "Error: Working directory is not clean"; exit 1; \ fi @if git rev-parse v$(VERSION) >/dev/null 2>&1; then \ echo "Error: Tag v$(VERSION) already exists"; exit 1; \ fi @echo "Release checks passed!" check-version: @if echo "$(VERSION)" | grep -q 'dev'; then \ echo "Error: Cannot release dev version $(VERSION)"; exit 1; \ fi @echo "Version $(VERSION) is valid for release" # ============ Release Building ============ build: clean-dist python -m build # ============ Release Publishing ============ tag: git tag -a v$(VERSION) -m 'version $(VERSION)' push-tag: git push --tags origin master upload-pypi: twine upload --repository pypi ./dist/* publish-binary-upload: ./ia-$(VERSION)-py3-none-any.pex upload ia-pex ia-$(VERSION)-py3-none-any.pex --no-derive ./ia-$(VERSION)-py3-none-any.pex upload ia-pex ia-$(VERSION)-py3-none-any.pex --remote-name=ia --no-derive # Extract changelog and create GitHub release github-release: @echo "Extracting changelog for v$(VERSION)..." 
@awk '/^$(VERSION) /{found=1; next} found && /^[0-9]+\.[0-9]+\.[0-9]+ /{exit} found' HISTORY.rst > /tmp/ia-release-notes-$(VERSION).md gh release create v$(VERSION) \ --title "v$(VERSION)" \ --notes-file /tmp/ia-release-notes-$(VERSION).md @rm -f /tmp/ia-release-notes-$(VERSION).md @echo "GitHub release created!" # ============ Main Release Targets ============ # PyPI-only release (no binary) publish: check-version check-release test build tag push-tag upload-pypi github-release @echo "\n\033[92mRelease v$(VERSION) published to PyPI and GitHub!\033[0m" # Full release including pex binary publish-all: check-version check-release test build binary test-binary tag push-tag upload-pypi publish-binary-upload github-release @echo "\n\033[92mRelease v$(VERSION) published everywhere!\033[0m" # Binary-only release (for publishing binary after PyPI release) publish-binary: binary test-binary publish-binary-upload @echo "\n\033[92mBinary v$(VERSION) published to archive.org!\033[0m" python-internetarchive-5.7.2/README.rst000066400000000000000000000053301513674652200177410ustar00rootroot00000000000000A Python and Command-Line Interface to Archive.org ================================================== |tox| |versions| |downloads| |contributors| .. |tox| image:: https://github.com/jjjake/internetarchive/actions/workflows/tox.yml/badge.svg :target: https://github.com/jjjake/internetarchive/actions/workflows/tox.yml .. |versions| image:: https://img.shields.io/pypi/pyversions/internetarchive.svg :target: https://pypi.org/project/internetarchive .. |downloads| image:: https://static.pepy.tech/badge/internetarchive/month :target: https://pepy.tech/project/internetarchive .. |contributors| image:: https://img.shields.io/github/contributors/jjjake/internetarchive.svg :target: https://github.com/jjjake/internetarchive/graphs/contributors This package installs a command-line tool named ``ia`` for using Archive.org from the command-line. It also installs the ``internetarchive`` Python module for programmatic access to archive.org. Please report all bugs and issues on `Github `__. SECURITY NOTICE _______________ **Please upgrade to v5.4.2+ immediately.** Versions <=5.4.1 contain a critical directory traversal vulnerability in the ``File.download()`` method. `See the changelog for details `_. Thank you to Pengo Wray for their contributions in identifying and resolving this issue. Installation ------------ You can install this module via `pipx `_: .. code:: bash $ pipx install internetarchive Binaries of the command-line tool are also available: .. code:: bash $ curl -LO https://archive.org/download/ia-pex/ia $ chmod +x ia $ ./ia --help Unsupported Installation Methods ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **This library must only be installed via** `one of the supported methods `_ **(i.e.** ``pip``, ``pipx``, **or from source).** Installation via third-party package managers like Homebrew, MacPorts, or Linux system packages (apt, yum, etc.) is **not supported**. These versions are often severely outdated, incompatible, and broken. If you have installed this software via Homebrew, please uninstall it (`brew uninstall internetarchive`) and use a supported method. Documentation ------------- Documentation is available at `https://archive.org/services/docs/api/internetarchive `_. Contributing ------------ All contributions are welcome and appreciated. Please see `https://archive.org/services/docs/api/internetarchive/contributing.html `_ for more details. 
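# A minimal sketch (not an actual target) of how a maintainer might chain the
# targets above for a typical release, assuming a clean master checkout and
# valid PyPI and archive.org credentials:
#
#   make prepare-release RELEASE=5.7.2   # bump version and HISTORY.rst date, then review
#   git commit -am "Release 5.7.2"       # hypothetical commit message
#   make publish                         # PyPI + GitHub release only
#   # or: make publish-all               # also builds and uploads the pex binary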
python-internetarchive-5.7.2/docs/000077500000000000000000000000001513674652200172015ustar00rootroot00000000000000python-internetarchive-5.7.2/docs/Makefile000066400000000000000000000152271513674652200206500ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." 
qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/internetarchive.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/internetarchive.qhc" devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/internetarchive" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/internetarchive" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
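# A minimal sketch of a typical local docs build with the targets above,
# assuming Sphinx and the packages in docs/requirements.txt are installed
# (e.g. via `make docs-init` from the repository root):
#
#   make html
#   # then open build/html/index.html in a browser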
python-internetarchive-5.7.2/docs/make.bat000066400000000000000000000151101513674652200206040ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source set I18NSPHINXOPTS=%SPHINXOPTS% source if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. 
echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\internetarchive.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\internetarchive.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %BUILDDIR%/.. echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
goto end ) :end python-internetarchive-5.7.2/docs/requirements.txt000066400000000000000000000001171513674652200224640ustar00rootroot00000000000000alabaster==0.7.12 docutils<0.18 Sphinx==4.5.0 sphinx-autodoc-typehints==1.18.1 python-internetarchive-5.7.2/docs/source/000077500000000000000000000000001513674652200205015ustar00rootroot00000000000000python-internetarchive-5.7.2/docs/source/_static/000077500000000000000000000000001513674652200221275ustar00rootroot00000000000000python-internetarchive-5.7.2/docs/source/_static/ia.png000066400000000000000000000111021513674652200232210ustar00rootroot00000000000000[binary PNG image data omitted]python-internetarchive-5.7.2/docs/source/_templates/000077500000000000000000000000001513674652200226365ustar00rootroot00000000000000python-internetarchive-5.7.2/docs/source/_templates/about.html000066400000000000000000000001331513674652200246330ustar00rootroot00000000000000

The internetarchive library is a Python & command-line interface to archive.org

python-internetarchive-5.7.2/docs/source/_templates/sidebarlogo.html000066400000000000000000000002411513674652200260130ustar00rootroot00000000000000 python-internetarchive-5.7.2/docs/source/_templates/usefullinks.html000066400000000000000000000011611513674652200260670ustar00rootroot00000000000000

Useful links

python-internetarchive-5.7.2/docs/source/authors.rst000066400000000000000000000000371513674652200227200ustar00rootroot00000000000000.. include:: ../../AUTHORS.rst python-internetarchive-5.7.2/docs/source/cli.rst000066400000000000000000000361451513674652200220130ustar00rootroot00000000000000.. _cli: Command-Line Interface ====================== The ``ia`` command-line tool is installed with the ``internetarchive`` Python module, or :ref:`available as a binary `. ``ia`` allows you to interact with various archive.org services from the command-line. Once you have :ref:`installed ia ` or :ref:`downloaded a binary ` and :ref:`configured it `, you can start exploring the commands documented below. Quick Start ----------- If you're not sure where to start, most users start with these commands: - ``ia download `` - :ref:`Download ` files or items - ``ia search ''`` - :ref:`Search ` items on archive.org - ``ia metadata `` - :ref:`Read Metadata ` from an item - ``ia upload -m 'collection:test_collection'`` - :ref:`Upload ` files to archive.org Check out the help menu to see all available commands: .. code:: console $ ia --help usage: ia [-h] [-v] [-c FILE] [-l] [-d] [-i] [-H HOST] {command} ... A command line interface to Archive.org. optional arguments: -h, --help show this help message and exit -v, --version show program's version number and exit -c FILE, --config-file FILE path to configuration file -l, --log enable logging -d, --debug enable debugging -i, --insecure allow insecure connections -H HOST, --host HOST host to connect to (doesn't work for requests made to s3.us.archive.org) commands: {command} account (ac) Manage an archive.org account. Note: requires admin privileges configure (co) configure 'ia' with your archive.org credentials copy (cp) Copy files from archive.org items delete (rm) Delete files from archive.org items download (do) Download files from archive.org flag (fl) Manage flags list (ls) list files from archive.org items metadata (md) Retrieve and modify archive.org item metadata move (mv) Move and rename files in archive.org items reviews (re) submit and modify reviews for archive.org items search (se) Search items on archive.org simplelists (sl) Manage simplelists tasks (ta) Retrieve information about your archive.org catalog tasks upload (up) Upload files to archive.org Documentation for 'ia' is available at: https://archive.org/developers/internetarchive/cli.html See 'ia {command} --help' for help on a specific command. .. _cli-metadata: Metadata -------- Reading Metadata ^^^^^^^^^^^^^^^^ You can use ``ia`` to read and write metadata from archive.org. To retrieve all of an item's metadata in JSON, simply: .. code:: console $ ia metadata TripDown1905 A particularly useful tool to use alongside ``ia`` is `jq `_. ``jq`` is a command-line tool for parsing JSON. For example: .. code:: console $ ia metadata TripDown1905 | jq '.metadata.date' "1906" Modifying Metadata ^^^^^^^^^^^^^^^^^^ Once ``ia`` has been `configured `_, you can modify `metadata `_: .. code:: console $ ia metadata --modify="foo:bar" --modify="baz:foooo" You can remove a metadata field by setting the value of the given field to ``REMOVE_TAG``. For example, to remove the metadata field ``foo`` from the item ````: .. code:: console $ ia metadata --modify="foo:REMOVE_TAG" Note that some metadata fields (e.g. ``mediatype``) cannot be modified, and must instead be set initially on upload. The default target to write to is ``metadata``. 
If you would like to write to another target, such as ``files``, you can specify so using the ``--target`` parameter. For example, if we had an item whose identifier was ``my_identifier`` and we wanted to add a metadata field to a file within the item called ``foo.txt``: .. code:: console $ ia metadata my_identifier --target="files/foo.txt" --modify="title:My File" You can also create new targets if they don't exist: .. code:: console $ ia metadata --target="extra_metadata" --modify="foo:bar" There is also an ``--append`` option which allows you to append a string to an existing metadata strings (Note: use ``--append-list`` for appending elements to a list). For example, if your item's title was ``Foo`` and you wanted it to be ``Foo Bar``, you could simply do: .. code:: console $ ia metadata --append="title: Bar" If you would like to add a new value to an existing field that is an array (like ``subject`` or ``collection``), you can use the ``--append-list`` option: .. code:: console $ ia metadata --append-list="subject:another subject" This command would append ``another subject`` to the items list of subjects, if it doesn't already exist (i.e. no duplicate elements are added). Metadata fields or elements can be removed with the ``--remove`` option: .. code:: console $ ia metadata --remove="subject:another subject" This would remove ``another subject`` from the items subject field, regardless of whether or not the field is a single or multi-value field. Refer to `Internet Archive Metadata `_ for more specific details regarding metadata and archive.org. Modifying Metadata in Bulk ^^^^^^^^^^^^^^^^^^^^^^^^^^ If you have a lot of metadata changes to submit, you can use a CSV spreadsheet to submit many changes with a single command. Your CSV must contain an ``identifier`` column, with one item per row. Any other column added will be treated as a metadata field to modify. If no value is provided in a given row for a column, no changes will be submitted. If you would like to specify multiple values for certain fields, an index can be provided: ``subject[0]``, ``subject[1]``. Your CSV file should be UTF-8 encoded. See `metadata.csv `_ for an example CSV file. Once you're ready to submit your changes, you can submit them like so: .. code:: console $ ia metadata --spreadsheet=metadata.csv See ``ia help metadata`` for more details. .. _cli-upload: Upload ------ ``ia`` can also be used to upload items to archive.org. After `configuring ia `__, you can upload files like so: .. code:: console $ ia upload file1 file2 --metadata="mediatype:texts" --metadata="blah:arg" .. warning:: Please note that, unless specified otherwise, items will be uploaded with a ``data`` mediatype. **This cannot be changed afterwards.** Therefore, you should specify a mediatype when uploading, eg. ``--metadata="mediatype:movies"``. Similarly, if you want your upload to end up somewhere else than the default collection (currently `community texts `_), you should also specify a collection with ``--metadata="collection:foo"``. See `metadata documentation `_ for more information. You can upload files from ``stdin``: .. code:: console $ curl http://dumps.wikimedia.org/kywiki/20130927/kywiki-20130927-pages-logging.xml.gz \ | ia upload - --remote-name=kywiki-20130927-pages-logging.xml.gz --metadata="title:Uploaded from stdin." You can use the ``--retries`` parameter to retry on errors (i.e. if IA-S3 is overloaded): .. code:: console $ ia upload file1 --retries 10 Note that ``ia upload`` makes a backup of any files that are clobbered. 
They are saved to a directory in the item named ``history/files/``. The files are named in the format ``$key.~N~``. These files can be deleted like normal files. You can also prevent the backup from happening on clobbers by adding ``-H x-archive-keep-old-version:0`` to your command. Refer to `archive.org Identifiers `_ for more information on creating valid archive.org identifiers. Please also read the `Internet Archive Items `_ page before getting started. Bulk Uploading ^^^^^^^^^^^^^^ Uploading in bulk can be done similarly to `Modifying Metadata in Bulk`_. The only difference is that you must provide a ``file`` column which contains a relative or absolute path to your file. Please see `uploading.csv `_ for an example. Once you are ready to start your upload, simply run: .. code:: console $ ia upload --spreadsheet=uploading.csv Bulk Uploading Special Columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can set a remote filename that differs from your local filename by specifying a remote filename in a column named ``REMOTE_NAME`` (Added to ``ia`` in ``v2.0.0``). See ``ia help upload`` for more details. Setting File-Level Metadata on Upload ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can set file-level metadata at time of upload via a JSON/JSONL file. The JSON or JSONL must have a dict for each file, with the local path to the file stored under the key, ``name``. For example, you could upload two files named ``foo.txt`` and ``bar.txt`` with a file-level ``title`` with the following JSONL file (named ``file_md.jsonl``): .. code:: json {"name": "foo.txt", "title": "my foo file"} {"name": "bar.txt", "title": "my foo file"} And the following command: .. code:: console $ ia upload --file-metadata file_md.jsonl .. _cli-download: Download -------- Download an entire item: .. code:: console $ ia download TripDown1905 Download specific files from an item: .. code:: console $ ia download TripDown1905 TripDown1905_512kb.mp4 TripDown1905.ogv Download specific files matching a glob pattern: .. code:: console $ ia download TripDown1905 --glob="*.mp4" Note that you may have to escape the ``*`` differently depending on your shell (e.g. ``\*.mp4``, ``'*.mp4'``, etc.). Download specific files matching a glob pattern, but excluding files matching a different glob pattern: .. code:: console $ ia download TripDown1905 --glob="*.mp4" --exclude "*512kb*" Note that ``--exclude`` can only be used in conjunction with ``--glob``. Download files matching multiple glob and exclude patterns: .. code:: console $ ia download TripDown1905 --glob="*.mp4|*.xml" --exclude "*512kb*|*_reviews.xml" Download only files of a specific format: .. code:: console $ ia download TripDown1905 --format='512Kb MPEG4' Note that ``--format`` cannot be used with ``--glob`` or ``--exclude``. You can get a list of the formats of a given item like so: .. code:: console $ ia metadata --formats TripDown1905 Download an entire collection: .. code:: console $ ia download --search 'collection:glasgowschoolofart' Download from an itemlist: .. code:: console $ ia download --itemlist itemlist.txt See ``ia help download`` for more details. Downloading On-The-Fly Files ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Some files on archive.org are generated on-the-fly as requested. This currently includes non-original files of the formats EPUB, MOBI, DAISY, and archive.org's own MARCXML. These files can be downloaded using the ``--on-the-fly`` parameter: .. 
See ``ia help download`` for more details.

Downloading On-The-Fly Files
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Some files on archive.org are generated on-the-fly as requested. This currently includes non-original files of the formats EPUB, MOBI, DAISY, and archive.org's own MARCXML. These files can be downloaded using the ``--on-the-fly`` parameter:

.. code:: console

    $ ia download goodytwoshoes00newyiala --on-the-fly

Delete
------

You can use ``ia`` to delete files from archive.org items:

.. code:: console

    $ ia delete <identifier> <file>

Delete all files associated with the specified file, including upstream derivatives and the original:

.. code:: console

    $ ia delete <identifier> <file> --cascade

Delete all files in an item:

.. code:: console

    $ ia delete <identifier> --all

Note that ``ia delete`` makes a backup of any files that are deleted. They are saved to a directory in the item named ``history/files/``. The files are named in the format ``$key.~N~``. These files can be deleted like normal files. You can also prevent the backup from happening on deletes by adding ``-H x-archive-keep-old-version:0`` to your command.

See ``ia help delete`` for more details.

.. _cli-search:

Search
------

``ia`` can also be used for retrieving archive.org search results in JSON:

.. code:: console

    $ ia search 'subject:"market street" collection:prelinger'

By default, ``ia search`` attempts to return all items meeting the search criteria, and the results are sorted by item identifier. If you want to just select the top ``n`` items, you can specify a ``page`` and ``rows`` parameter. For example, to get the top 20 items matching the search 'dogs':

.. code:: console

    $ ia search --parameters="page=1&rows=20" "dogs"

You can use ``ia search`` to create an itemlist:

.. code:: console

    $ ia search 'collection:glasgowschoolofart' --itemlist > itemlist.txt

You can pipe your itemlist into a GNU Parallel command to download items concurrently:

.. code:: console

    $ ia search 'collection:glasgowschoolofart' --itemlist | parallel 'ia download {}'

See ``ia help search`` for more details.

Tasks
-----

You can also use ``ia`` to retrieve information about your catalog tasks, after `configuring ia `__.

To retrieve the task history for an item, simply run:

.. code:: console

    $ ia tasks <identifier>

View all of your queued and running archive.org tasks:

.. code:: console

    $ ia tasks

See ``ia help tasks`` for more details.

List
----

You can list files in an item like so:

.. code:: console

    $ ia list goodytwoshoes00newyiala

See ``ia help list`` for more details.

Copy
----

You can copy files in archive.org items like so:

.. code:: console

    $ ia copy <src-identifier>/<src-file> <dest-identifier>/<dest-file>

If you're copying your file to a new item, you can provide metadata as well:

.. code:: console

    $ ia copy <src-identifier>/<src-file> <dest-identifier>/<dest-file> --metadata 'title:My New Item' --metadata collection:test_collection

Note that ``ia copy`` makes a backup of any files that are clobbered. They are saved to a directory in the item named ``history/files/``. The files are named in the format ``$key.~N~``. These files can be deleted like normal files. You can also prevent the backup from happening on clobbers by adding ``-H x-archive-keep-old-version:0`` to your command.

Move
----

``ia move`` works just like ``ia copy``, except the source file is deleted after the file has been successfully copied. Note that ``ia move`` makes a backup of any files that are clobbered or deleted. They are saved to a directory in the item named ``history/files/``. The files are named in the format ``$key.~N~``. These files can be deleted like normal files. You can also prevent the backup from happening on clobbers or deletes by adding ``-H x-archive-keep-old-version:0`` to your command.

Performance Tips
----------------

For downloading or processing many items, see :ref:`using GNU Parallel <parallel>` for concurrent operations.

Getting Help
------------

If you encounter issues, check :ref:`troubleshooting` for common problems and solutions.
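When asking for help or reporting a problem, include the exact command you ran, any error output, and the version of ``ia`` you are using, which you can print like so:

.. code:: console

    $ ia --version

If you are not on the latest release, see :ref:`updating` before reporting an issue.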
python-internetarchive-5.7.2/docs/source/conf.py000066400000000000000000000054421513674652200220050ustar00rootroot00000000000000""" Internet Archive Python Library documentation configuration. """ import os import sys import alabaster import internetarchive from internetarchive import __version__ # Add the project root to Python's module search path sys.path.insert(0, os.path.abspath('../../')) # -- Project information ---------------------------------------------------- project = 'internetarchive' copyright = '2015, Internet Archive' # The short X.Y version version = __version__ # The full version, including alpha/beta/rc tags release = version # -- General configuration --------------------------------------------------- extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.todo', 'sphinx.ext.coverage', 'sphinx.ext.viewcode', 'sphinx.ext.intersphinx', 'alabaster', 'sphinx_autodoc_typehints', ] # Paths containing templates templates_path = ['_templates'] # File extensions of source files source_suffix = '.rst' # Master document (starting point) master_doc = 'index' # Files to exclude exclude_patterns: list[str] = [] # Don't prepend module names to titles add_module_names = False # Syntax highlighting style pygments_style = 'sphinx' # -- Intersphinx configuration ---------------------------------------------- intersphinx_mapping = { 'python': ('https://docs.python.org/3', None), 'requests': ('https://docs.python-requests.org/en/latest/', None), } # -- Autodoc configuration -------------------------------------------------- autodoc_member_order = 'bysource' # -- HTML output configuration ---------------------------------------------- html_theme_path = [alabaster.get_path()] html_theme = 'alabaster' html_theme_options = { 'github_user': 'jjjake', 'github_repo': 'internetarchive', 'github_button': True, 'show_powered_by': False, 'sidebar_width': '200px', } html_sidebars = { '**': [ 'sidebarlogo.html', 'about.html', 'navigation.html', 'usefullinks.html', 'searchbox.html', ] } # Static files html_static_path = ['_static'] # HTML help builder output name htmlhelp_basename = 'internetarchivedoc' # -- LaTeX output configuration --------------------------------------------- latex_elements: dict[str, str] = {} latex_documents = [ ('index', 'internetarchive.tex', 'internetarchive Documentation', 'Jacob M. Johnson', 'manual'), ] # -- Manual page output configuration --------------------------------------- man_pages = [ ('index', 'internetarchive', 'internetarchive Documentation', ['Jacob M. Johnson'], 1) ] # -- Texinfo output configuration ------------------------------------------- texinfo_documents = [ ('index', 'internetarchive', 'internetarchive Documentation', 'Jacob M. Johnson', 'internetarchive', 'One line description of project.', 'Miscellaneous'), ] python-internetarchive-5.7.2/docs/source/configuration.rst000066400000000000000000000061331513674652200241050ustar00rootroot00000000000000.. _configuration: Configuration ============= Certain functionality of the internetarchive Python library requires your archive.org credentials. Your `IA-S3 keys `_ are required for uploading, searching, and modifying metadata, and your archive.org logged-in cookies are required for downloading access-restricted content and viewing your task history. Your keys can be saved to a config file or set as environment variables. Config File ----------- To automatically create a config file with your archive.org credentials, you can use the ``ia`` command-line tool: .. 
code-block:: console $ ia configure Enter your archive.org credentials below to configure 'ia'. Email address: user@example.com Password: Config saved to: /home/user/.config/ia.ini Your config file will be saved to ``$HOME/.config/ia.ini``, or ``$HOME/.ia`` if you do not have a ``.config`` directory in ``$HOME``. Alternatively, you can specify your own path to save the config to via ``ia --config-file '~/.ia-custom-config' configure``. If you have a netrc file with your archive.org credentials in it, you can simply run ``ia configure --netrc``. ``ia configure`` can be rerun at any time to update your credentials. Custom configuration options manually added to the config file will be preserved when using ``ia configure``. *Note: Python's netrc library does not currently support passphrases, or passwords with spaces in them, and therefore are not currently supported here.* Config File Format ~~~~~~~~~~~~~~~~~~ Below is an example of a config file with the required sections and keys, as well as optional keys for advanced configuration. You should generally only configure with ``ia configure``, but you can manually edit the config file if needed. .. code-block:: ini [s3] access = secret = [cookies] logged-in-user = logged-in-sig = [general] screenname = custom-var = foo [custom] foo = bar The config above would generate the following configuration dictionary when loaded via the ``get_session`` function: .. code-block:: python >>> from internetarchive import get_session >>> s = get_session(config_file='/tmp/ia.ini') >>> print(s.config) {'s3': { 'access': '', 'secret': '' }, 'cookies': { 'logged-in-user': '', 'logged-in-sig': ''}, 'general': { 'screenname': '', 'custom-var': 'foo' }, 'custom': { 'foo': 'bar' } } Environment Variables --------------------- Alternatively, you can set the following environment variables with your S3 credentials: - ``IA_ACCESS_KEY_ID``: Your IA-S3 access key - ``IA_SECRET_ACCESS_KEY``: Your IA-S3 secret key *Note: Both environment variables must be set together. If only one is set, a* :class:`ValueError` *will be raised. If both are set, they will take precedence over the config file.* python-internetarchive-5.7.2/docs/source/contributing.rst000066400000000000000000000000441513674652200237400ustar00rootroot00000000000000.. include:: ../../CONTRIBUTING.rst python-internetarchive-5.7.2/docs/source/index.rst000066400000000000000000000065311513674652200223470ustar00rootroot00000000000000The Internet Archive Python Library =================================== Release v\ |version|. (:ref:`Installation `) |tox| |versions| |downloads| |contributors| .. |tox| image:: https://github.com/jjjake/internetarchive/actions/workflows/tox.yml/badge.svg :target: https://github.com/jjjake/internetarchive/actions/workflows/tox.yml .. |versions| image:: https://img.shields.io/pypi/pyversions/internetarchive.svg :target: https://pypi.org/project/internetarchive .. |downloads| image:: https://static.pepy.tech/badge/internetarchive/month :target: https://pepy.tech/project/internetarchive .. |contributors| image:: https://img.shields.io/github/contributors/jjjake/internetarchive.svg :target: https://github.com/jjjake/internetarchive/graphs/contributors Welcome to the documentation for the ``internetarchive`` Python library. This tool provides both a **command-line interface (CLI)** and a **Python API** for interacting with **archive.org**, allowing you to search, download, upload and interact with archive.org services from your terminal or in Python. 
These docs guide you through installation, usage, and examples, whether you’re new to Python, just want to try the CLI, or are building applications that work with the Internet Archive. Please report any issues or contribute on `GitHub `_. Quick start =========== If you're new to Python or the command line interface (CLI), the easiest way to get started is to follow these three steps: 1. :ref:`Download a binary ` of the ``ia`` command-line tool 2. :ref:`Configure your environment ` with your Archive.org credentials 3. :ref:`Visit the CLI documentation ` to start exploring how to use the tool Documentation ============= For more detailed information, including installing the command-line tool and Python library, please refer to the following sections: Setup & Configuration _____________________ Get the tools running on your system: .. toctree:: :maxdepth: 2 installation configuration User Interfaces --------------- These are the main ways to use the Internet Archive Python Library and CLI: .. toctree:: :maxdepth: 2 cli python-lib Performance & Scaling --------------------- Optimize your workflows: .. toctree:: :maxdepth: 2 parallel Development & Community ----------------------- Contribute and stay updated: .. toctree:: :maxdepth: 2 contributing updates Help & Support -------------- Get help when you need it: .. toctree:: :maxdepth: 2 troubleshooting - **Documentation**: Check this troubleshooting guide first - **Community**: Search existing `GitHub Issues `_ - **Report**: If you can't find a solution, `open a new issue `_ When reporting an issue, please include: - The exact command or code that caused the problem - Any error messages you received - Your operating system and Python version Before reporting, make sure you're using the latest version of the library and :ref:`updating` if necessary. Reference --------- Complete reference documentation for all modules: .. toctree:: :maxdepth: 2 modules Authors ------- .. toctree:: :maxdepth: 2 authors Indices and tables ================== * :ref:`genindex` * :ref:`search` python-internetarchive-5.7.2/docs/source/installation.rst000066400000000000000000000070351513674652200237410ustar00rootroot00000000000000.. _installation: Installation ============ Recommended: Installing the ``ia`` CLI with ``pipx`` ---------------------------------------------------- If your primary goal is to use the **``ia`` command-line tool**, the recommended approach is to install it with ``pipx``. This keeps the CLI isolated from your system Python while making the ``ia`` command available globally. Using ``pipx`` ensures the CLI is isolated, easy to upgrade, and globally accessible. *If you just want to try out the ``ia`` CLI without installing anything, you can use the prebuilt binary instead. See the :ref:`binaries` section below for details.* **Prerequisite:** Make sure you have ``pipx`` installed. For installation instructions, see the `pipx installation guide `_. 1. **Install ``internetarchive`` using ``pipx``**: .. code-block:: console pipx install internetarchive 2. **Verify the installation**: .. code-block:: console ia --version This should display the installed version of the ``ia`` CLI. Troubleshooting ``pipx`` Installation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - **Permission errors**: Avoid using ``sudo`` with ``pipx``. It is designed to work without elevated permissions. - **Command not found**: If ``ia`` is not recognized, restart your terminal or run: .. 
code-block:: console pipx ensurepath - **Python version issues**: Ensure you are using Python 3.9 or later. For more details, refer to the `pipx documentation `_. Installing for Python Scripts (using a virtual environment) ----------------------------------------------------------- If you want to import ``internetarchive`` in your Python scripts (for programmatic access), the recommended approach is to use a virtual environment: .. code-block:: console python -m venv venv source venv/bin/activate pip install --upgrade pip pip install internetarchive After this, you can use the library in Python: .. code-block:: python from internetarchive import get_item item = get_item("nasa") print(item.metadata) .. _binaries: Using ``ia`` Binaries --------------------- The easiest way to start using ``ia`` is downloading a binary. The only requirements of the binary are a Unix-like environment with Python installed. To download the latest binary, and make it executable simply run the following commands: .. code-block:: console curl -LOs https://archive.org/download/ia-pex/ia chmod +x ia Binaries are generated with `PEX `_. The only requirement for using the binaries is that you have a `supported version of Python `_ installed on a Unix-like operating system. For more details on the command-line interface please refer to the `README `_, or run ``ia help``. .. _updating: Updating -------- The method for updating depends on how you originally installed: **If you installed** ``ia`` **with pipx** (CLI): .. code-block:: console pipx upgrade internetarchive **If you installed** ``internetarchive`` **in a virtual environment (Python library)**: Activate your virtual environment, then: .. code-block:: console pip install --upgrade internetarchive **If you are using the binary**: Simply download the latest binary again with the same steps as above: .. code-block:: console curl -LOs https://archive.org/download/ia-pex/ia chmod +x ia For more information about recent changes, see :ref:`updates`. python-internetarchive-5.7.2/docs/source/internetarchive.rst000066400000000000000000000016601513674652200244300ustar00rootroot00000000000000:orphan: .. _internetarchive: Internetarchive: A Python Interface to archive.org ================================================== .. automodule:: internetarchive :class:`internetarchive.Item` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: Item :members: :show-inheritance: :class:`internetarchive.File` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: File :members: :show-inheritance: :class:`internetarchive.Search` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: Search :members: :show-inheritance: :class:`internetarchive.Catalog` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: Catalog :members: :show-inheritance: :class:`internetarchive.ArchiveSession` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: ArchiveSession :members: :show-inheritance: :mod:`internetarchive.api` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. automodule:: internetarchive.api :members: :show-inheritance: python-internetarchive-5.7.2/docs/source/modules.rst000066400000000000000000000073451513674652200227140ustar00rootroot00000000000000.. _modules: Module Documentation ==================== This section contains complete reference documentation for all modules, classes, and methods in the ``internetarchive`` package. For a gentler introduction with examples, see :ref:`python-lib`. Core Modules ------------ These modules provide the main functionality for interacting with archive.org. .. 
_api-module: internetarchive.api module ~~~~~~~~~~~~~~~~~~~~~~~~~~ The convenience module providing simple functions for common tasks. .. automodule:: internetarchive.api :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: .. _session-module: internetarchive.session module ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The session management module for persisting configuration and connections. .. automodule:: internetarchive.session :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: .. _item-module: internetarchive.item module ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Modules for working with archive.org items and collections. .. automodule:: internetarchive.item :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: .. _account-module: internetarchive.account module ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Module for managing an archive.org account (requires admin privileges). .. automodule:: internetarchive.account :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: .. _search-module: internetarchive.search module ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Modules for searching and retrieving items from archive.org. .. automodule:: internetarchive.search :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: File Operations --------------- Modules for working with files and specific file operations. .. automodule:: internetarchive.files :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: Request Handling ---------------- Modules for making HTTP requests to archive.org services. .. automodule:: internetarchive.iarequest :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: Task and Catalog Management --------------------------- Modules for working with archive.org tasks and the catalog system. .. automodule:: internetarchive.catalog :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: Authentication and Configuration -------------------------------- Modules for authentication, configuration, and utility functions. .. automodule:: internetarchive.auth :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: .. automodule:: internetarchive.config :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: Utility and Supporting Modules ------------------------------ Internal utilities and supporting modules. .. automodule:: internetarchive.utils :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: .. automodule:: internetarchive.exceptions :members: :undoc-members: :show-inheritance: :noindex: CLI Modules (Internal) ---------------------- .. note:: These modules are primarily used by the command-line interface and are considered internal to the package. For using the CLI, see :ref:`cli`. .. automodule:: internetarchive.cli :members: :undoc-members: :show-inheritance: :special-members: __init__ :noindex: Complete Package Reference -------------------------- For a complete listing of all modules and classes: .. automodule:: internetarchive :members: :undoc-members: :show-inheritance: :noindex: python-internetarchive-5.7.2/docs/source/parallel.rst000066400000000000000000000053601513674652200230330ustar00rootroot00000000000000.. _parallel: Using GNU Parallel with ia ========================== `GNU Parallel `_ is a shell tool for executing jobs in parallel. It is a very useful tool to use with ``ia`` for bulk jobs. 
It can be installed via many OS package managers. For example, it can be installed via `homebrew `_ on Mac OS:: brew install parallel Refer to the `GNU Parallel homepage `_ for more details on available packages, source code, installation, and other documentation and tutorials. Basic Usage ----------- You can use ``parallel`` to retrieve metadata from archive.org items concurrently: .. code:: bash $ cat itemlist.txt jj-test-2020-09-17-1 jj-test-2020-09-17-2 jj-test-2020-09-17-3 $ cat itemlist.txt | parallel 'ia metadata {}' | jq .metadata.date "1999" "1999" "1999" You can run ``parallel`` with ``--dry-run`` to check your commands before running them: .. code:: bash $ cat itemlist.txt | parallel --dry-run 'ia metadata {}' ia metadata jj-test-2020-09-17-2 ia metadata jj-test-2020-09-17-1 ia metadata jj-test-2020-09-17-3 Logging and retrying with Parallel ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Parallel also offers an easy way to log and retry failed commands. Here's an example of a job that is retrieving metadata for all of the items in the file named ``itemlist.txt``, and outputting the metadata to a file named ``output.jsonl``. It uses the ``--joblog`` option to log all commands and their exit value to ``/tmp/my_ia_job.log``: .. code:: bash $ cat itemlist.txt | parallel --joblog /tmp/my_ia_job.log 'ia metadata {}' > output.jsonl You can now retry any commands that failed by using the ``--retry-failed`` option (don't forget to switch ``>`` to ``>>`` in this example, so you don't overwrite ``output.jsonl``! ``>>`` means to append to the output file, rather than clobber it): .. code:: bash $ parallel --retry-failed --joblog /tmp/my_ia_job.log 'ia metadata {}' >> output.jsonl If there were no failed commands, nothing will be rerun. You can rerun this command until it exits with ``0``. You can check the exit code by running ``echo $?`` directly after the ``parallel`` command finishes. Resources _________ - Intro videos: `https://www.youtube.com/playlist?list=PL284C9FF2488BC6D1 `_ - Cheat sheet: `https://www.gnu.org/software/parallel/parallel_cheat.pdf `_ - Examples from the man page: `https://www.gnu.org/software/parallel/man.html#EXAMPLE:-Working-as-xargs--n1.-Argument-appending `_ python-internetarchive-5.7.2/docs/source/python-lib.rst000066400000000000000000000135531513674652200233270ustar00rootroot00000000000000.. _python-lib: Python Library Usage ==================== The ``internetarchive`` Python library provides two main ways to interact with archive.org: 1. **Simple functional interface** via :mod:`internetarchive.api` - Easy to use for common tasks 2. **Flexible object-oriented interface** via :class:`~internetarchive.session.ArchiveSession` - More control for complex applications Quick Start ----------- The easiest way to get started is with the :mod:`internetarchive.api` module, which provides simple functions for common operations: .. code-block:: python from internetarchive import download, upload, search_items, get_item # Download files from an item download('TripDown1905', glob_pattern='*.mp4') # Search for items search = search_items('collection:opensource') for result in search: print(result['identifier']) # Get an item and work with it item = get_item('TripDown1905') print(item.metadata['title']) For more control and to persist configuration across operations, use a :class:`~internetarchive.session.ArchiveSession`: .. 
code-block:: python from internetarchive import get_session # Create a session with your configuration session = get_session(config_file='~/.config/ia.ini') # Use the session for all operations item = session.get_item('TripDown1905') item.download() search = session.search_items('subject:science') Simple Functional Interface --------------------------- The :mod:`internetarchive.api` module provides these convenient functions for common tasks: .. automodule:: internetarchive.api :members: :exclude-members: get_username, get_user_info, configure :noindex: These functions are great for scripts and simple applications. They automatically create a session in the background for you. For complete documentation including all parameters, see :ref:`api-module` in the reference. Using Sessions -------------- For more complex applications or when you need to perform multiple operations, use the :class:`~internetarchive.session.ArchiveSession` class: .. autoclass:: internetarchive.session.ArchiveSession :members: :exclude-members: set_file_logger, set_stream_logger, rebuild_auth, mount_http_adapter, send, _get_user_agent_string, s3_is_overloaded, get_tasks_api_rate_limit :noindex: Creating a session: .. code-block:: python from internetarchive import get_session # From config file session = get_session(config_file='~/.config/ia.ini') # From dictionary config = { 's3': { 'access': 'your_access_key', 'secret': 'your_secret_key' } } session = get_session(config=config) For complete session documentation, see :ref:`session-module`. Working with Items ------------------ Once you have an item (from :func:`get_item` or :meth:`~internetarchive.session.ArchiveSession.get_item`), you can: .. code-block:: python item = get_item('TripDown1905') # Access metadata print(item.metadata['title']) print(item.metadata['creator']) # Download files item.download(glob_pattern='*.mp4') # Upload new files item.upload(['file1.txt', 'file2.jpg'], metadata={'title': 'My New Files'}) # Modify metadata item.modify_metadata({'subject': ['history', 'film']}) # List files for file in item.files: print(file.name, file.format) For complete item documentation, see :ref:`item-module`. Searching for Items ------------------- .. code-block:: python from internetarchive import search_items # Basic search search = search_items('collection:opensource movies') # Iterate through results for result in search: print(f"{result['identifier']}: {result.get('title', 'No title')}") # Get specific fields search = search_items('subject:science', fields=['identifier', 'title', 'date']) for result in search: print(result) For complete search documentation, see :ref:`search-module`. Common Patterns --------------- **Download all files from multiple items:** .. code-block:: python from internetarchive import get_item identifiers = ['TripDown1905', 'goodytwoshoes00newyiala'] for identifier in identifiers: item = get_item(identifier) item.download() **Upload with custom metadata:** .. code-block:: python from internetarchive import upload upload( 'my-new-item-001', files=['document.pdf', 'cover.jpg'], metadata={ 'title': 'My Document', 'mediatype': 'texts', 'collection': 'opensource', 'subject': ['documentation', 'tutorial'] } ) **Search and process results:** .. 
code-block:: python from internetarchive import search_items # Search with pagination search = search_items( 'collection:prelinger', params={'rows': 50, 'page': 1} ) # Collect identifiers identifiers = [result['identifier'] for result in search] # Process in batches for identifier in identifiers[:10]: # First 10 items print(f"Processing {identifier}") Configuration ------------- The library needs your archive.org credentials for certain operations (uploading, modifying metadata, etc.). You can configure it in several ways: 1. **Config file** (recommended): Use ``ia configure`` from the CLI or :func:`~internetarchive.api.configure` from Python 2. **Environment variables**: Set ``IA_ACCESS_KEY_ID`` and ``IA_SECRET_ACCESS_KEY`` 3. **Python dictionary**: Pass credentials directly when creating a session See :ref:`configuration` for complete configuration details. Next Steps ---------- For complete documentation of all modules, classes, and methods, see :ref:`modules`. For troubleshooting and advanced usage, check the examples in the `GitHub repository `_. python-internetarchive-5.7.2/docs/source/troubleshooting.rst000066400000000000000000000036121513674652200244640ustar00rootroot00000000000000.. _troubleshooting: Troubleshooting =============== HTTPS Issues ------------ The ``internetarchive`` library uses the HTTPS protocol for making secure requests by default. If you run into problems with this, you can use HTTP to make insecure requests in one of the following ways: + Adding the following lines to your ``ia.ini`` config file (usually located at ``~/.config/ia.ini`` or ``~/.ia.ini``): .. code:: bash [general] secure = false + In the Python interface, using a config dict: .. code:: python >>> from internetarchive import get_item >>> config = {'general': {'secure': False}} >>> item = get_item('', config=config) + In the command-line interface, use the ``--insecure`` option: .. code:: bash $ ia --insecure download OverflowError ------------- On some 32-bit systems you may run into issues uploading files larger than 2 GB. You may see an error that looks something like ``OverflowError: long int too large to convert to int``. You can get around this by upgrading ``requests``:: pip install --upgrade requests You can find more details about this issue at the following links: https://github.com/sigmavirus24/requests-toolbelt/issues/80 https://github.com/kennethreitz/requests/issues/2691 Getting Further Help -------------------- If your issue isn't covered here, please: 1. Search the `existing GitHub Issues `_ to see if it's already been reported 2. If not, `open a new issue `_ with: - A clear description of the problem - Steps to reproduce - Any error messages - Your environment (OS, Python version, internetarchive version) Before reporting, make sure you're using the latest version of the library and :ref:`updating` if necessary. python-internetarchive-5.7.2/docs/source/updates.rst000066400000000000000000000000761513674652200227030ustar00rootroot00000000000000.. _updates: Updates ======= .. include:: ../../HISTORY.rst python-internetarchive-5.7.2/internetarchive/000077500000000000000000000000001513674652200214435ustar00rootroot00000000000000python-internetarchive-5.7.2/internetarchive/__init__.py000066400000000000000000000042731513674652200235620ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. 
# # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ Internetarchive Library ~~~~~~~~~~~~~~~~~~~~~~~ Internetarchive is a python interface to archive.org. Usage:: >>> from internetarchive import get_item >>> item = get_item('govlawgacode20071') >>> item.exists True :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ __title__ = 'internetarchive' __author__ = 'Jacob M. Johnson' __license__ = 'AGPL 3' __copyright__ = 'Copyright (C) 2012-2024 Internet Archive' from .__version__ import __version__ # isort:skip from internetarchive.api import ( configure, delete, download, get_files, get_item, get_session, get_tasks, get_user_info, get_username, modify_metadata, search_items, upload, ) from internetarchive.catalog import Catalog from internetarchive.files import File from internetarchive.item import Item from internetarchive.search import Search from internetarchive.session import ArchiveSession __all__ = [ # Classes. 'ArchiveSession', 'Catalog', 'File', 'Item', 'Search', '__version__', 'configure', 'delete', 'download', 'get_files', # API. 'get_item', 'get_session', 'get_tasks', 'get_username', 'modify_metadata', 'search_items', 'upload', ] # Set default logging handler to avoid "No handler found" warnings. import logging log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) python-internetarchive-5.7.2/internetarchive/__version__.py000066400000000000000000000000261513674652200242740ustar00rootroot00000000000000__version__ = '5.7.2' python-internetarchive-5.7.2/internetarchive/account.py000066400000000000000000000214151513674652200234540ustar00rootroot00000000000000from dataclasses import dataclass, field from typing import ClassVar, Dict, List, Optional import requests from internetarchive import get_session from internetarchive.exceptions import AccountAPIError from internetarchive.session import ArchiveSession """ internetarchive.account ~~~~~~~~~~~~~~~~~~~~~~~ :copyright: (C) 2012-2025 by Internet Archive. :license: AGPL 3, see LICENSE for more details. This module provides the `Account` class for interacting with user accounts on the Internet Archive. It requires administrative privileges. """ @dataclass class Account: """ A class for interacting with user accounts on the Internet Archive. Note: This class requires administrative privileges. This class provides methods to: - Fetch account details using various identifiers (e.g., email, screenname, itemname). - Lock and unlock accounts. - Convert account data to a dictionary for serialization. 
Example Usage: >>> from internetarchive.account import Account >>> account = Account.from_account_lookup('email', 'foo@example.com') >>> account.lock(comment="Locked spam account") >>> print(account.to_dict()) """ locked: bool verified: bool email: str canonical_email: str itemname: str screenname: str notifications: List[str] has_disability_access: bool lastlogin: str createdate: str session: ArchiveSession = field(default_factory=get_session) API_BASE_URL: str = '/services/xauthn/' API_INFO_PARAMS: ClassVar[Dict[str, str]] = {'op': 'info'} API_LOCK_UNLOCK_PARAMS: ClassVar[Dict[str, str]] = {'op': 'lock_unlock'} def _get_api_base_url(self) -> str: """Dynamically construct the API base URL using the session's host.""" return f'https://{self.session.host}{self.API_BASE_URL}' # type: ignore[attr-defined] def _post_api_request( self, endpoint: str, params: Dict[str, str], data: Dict[str, str], session: Optional[ArchiveSession] = None ) -> requests.Response: """ Helper method to make API requests. Args: endpoint: The API endpoint to call. params: Query parameters for the request. data: Data to send in the request body. session: Optional session to use for the request. Defaults to self.session. Returns: The response from the API. Raises: requests.exceptions.RequestException: If the API request fails. """ session = session or self.session url = f'https://{session.host}{endpoint}' # type: ignore[attr-defined] response = session.post(url, params=params, data=data) response.raise_for_status() return response @classmethod def from_account_lookup( cls, identifier_type: str, identifier: str, session: Optional[ArchiveSession] = None ) -> "Account": """ Factory method to initialize an Account using an identifier type and value. Args: identifier_type: The type of identifier (e.g., 'email', 'screenname'). identifier: The value of the identifier (e.g., 'foo@example.com'). session: Optional session to use for the request. Returns: An instance of Account. """ json_data = cls._fetch_account_data_from_api(identifier_type, identifier, session) return cls.from_json(json_data, session) @classmethod def _fetch_account_data_from_api( cls, identifier_type: str, identifier: str, session: Optional[ArchiveSession] = None ) -> Dict: """ Fetches account data from the API using an identifier type and value. Args: identifier_type: The type of identifier (e.g., 'email', 'screenname'). identifier: The value of the identifier (e.g., 'foo@example.com'). session: Optional session to use for the request. Returns: A dictionary containing the account data. Raises: requests.exceptions.RequestException: If the API request fails. ValueError: If the API response is invalid or missing required data. """ data = {identifier_type: identifier} session = session or get_session() try: response = session.post( f'https://{session.host}{cls.API_BASE_URL}', # type: ignore[attr-defined] params=cls.API_INFO_PARAMS, data=data, headers={'Content-Type': 'application/x-www-form-urlencoded'} ) response.raise_for_status() j = response.json() if j.get("error") or not j.get("values"): raise AccountAPIError(j.get("error", "Unknown error"), error_data=j) return j["values"] except requests.exceptions.RequestException as e: raise AccountAPIError(f"Failed to fetch account data: {e}") @classmethod def from_json( cls, json_data: Dict, session: Optional[ArchiveSession] = None ) -> "Account": """ Factory method to initialize an Account using JSON data. Args: json_data: A dictionary containing account data. session: Optional session to use for the request. 
Returns: An instance of Account. Raises: ValueError: If required fields are missing in the JSON data. """ required_fields = [ "canonical_email", "email", "has_disability_access", "itemname", "locked", "notifications", "screenname", "verified", "lastlogin", "createdate", ] for requried_field in required_fields: if requried_field not in json_data: raise ValueError(f"Missing required requried_field in JSON data: {requried_field}") # Ensure session is of type ArchiveSession if session is None: session = get_session() # Default to ArchiveSession elif not isinstance(session, ArchiveSession): raise TypeError(f"Expected session to be of type ArchiveSession, got {type(session)}") return cls( locked=json_data["locked"], verified=json_data["verified"], email=json_data["email"], canonical_email=json_data["canonical_email"], itemname=json_data["itemname"], screenname=json_data["screenname"], notifications=json_data["notifications"], has_disability_access=json_data["has_disability_access"], lastlogin=json_data["lastlogin"], createdate=json_data["createdate"], session=session ) def lock(self, comment: Optional[str] = None, session: Optional[ArchiveSession] = None) -> requests.Response: """ Lock the account. Args: comment: An optional comment for the lock operation. session: Optional session to use for the request. Returns: The response from the API. """ data = {'itemname': self.itemname, 'is_lock': '1'} if comment: data['comments'] = comment return self._post_api_request( self.API_BASE_URL, params=self.API_LOCK_UNLOCK_PARAMS, data=data, session=session ) def unlock(self, comment: Optional[str] = None, session: Optional[ArchiveSession] = None) -> requests.Response: """ Unlock the account. Args: comment: An optional comment for the unlock operation. session: Optional session to use for the request. Returns: The response from the API. """ data = {'itemname': self.itemname, 'is_lock': '0'} if comment: data['comments'] = comment return self._post_api_request( self.API_BASE_URL, params=self.API_LOCK_UNLOCK_PARAMS, data=data, session=session ) def to_dict(self) -> Dict: """ Converts the Account instance to a dictionary. Returns: A dictionary representation of the Account instance. """ return { "locked": self.locked, "verified": self.verified, "email": self.email, "canonical_email": self.canonical_email, "itemname": self.itemname, "screenname": self.screenname, "notifications": self.notifications, "has_disability_access": self.has_disability_access, "lastlogin": self.lastlogin, "createdate": self.createdate, } python-internetarchive-5.7.2/internetarchive/api.py000066400000000000000000000510211513674652200225650ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.api ~~~~~~~~~~~~~~~~~~~ This module implements the Internetarchive API. :copyright: (C) 2012-2024 by Internet Archive. 
:license: AGPL 3, see LICENSE for more details. """ from __future__ import annotations from getpass import getpass from typing import Iterable, Mapping, MutableMapping import requests from urllib3 import Retry from internetarchive import auth, catalog, files, item, search, session from internetarchive import config as config_module from internetarchive.exceptions import AuthenticationError def get_session( config: Mapping | None = None, config_file: str | None = None, debug: bool = False, http_adapter_kwargs: MutableMapping | None = None, ) -> session.ArchiveSession: """Return a new :class:`ArchiveSession` object. The :class:`ArchiveSession` object is the main interface to the ``internetarchive`` lib. It allows you to persist certain parameters across tasks. :param config: A dictionary used to configure your session. Supports the following keys in the ``general`` section: - ``user_agent_suffix``: Custom string to append to the default User-Agent. The default (including access key) is always sent. - ``secure``: Use HTTPS (default: True). - ``host``: Host to connect to (default: archive.org). :param config_file: A path to a config file used to configure your session. :param debug: To be passed on to this session's method calls. :param http_adapter_kwargs: Keyword arguments that :py:class:`requests.adapters.HTTPAdapter` takes. :returns: To persist certain parameters across tasks. Usage: >>> from internetarchive import get_session >>> config = {'s3': {'access': 'foo', 'secret': 'bar'}} >>> s = get_session(config) >>> s.access_key 'foo' Append a custom User-Agent suffix: >>> config = {'general': {'user_agent_suffix': 'MyApp/1.0'}} >>> s = get_session(config) >>> s.headers['User-Agent'] 'internetarchive/5.7.2 (Darwin x86_64; N; en; ACCESS_KEY) Python/3.9.0 MyApp/1.0' From the session object, you can access all of the functionality of the ``internetarchive`` lib: >>> item = s.get_item('nasa') >>> item.download() nasa: ddddddd - success >>> s.get_tasks(task_ids=31643513)[0].server 'ia311234' """ return session.ArchiveSession(config, config_file or "", debug, http_adapter_kwargs) def get_item( identifier: str, config: Mapping | None = None, config_file: str | None = None, archive_session: session.ArchiveSession | None = None, debug: bool = False, http_adapter_kwargs: MutableMapping | None = None, request_kwargs: MutableMapping | None = None, ) -> item.Item: """Get an :class:`Item` object. :param identifier: The globally unique Archive.org item identifier. :param config: A dictionary used to configure your session. :param config_file: A path to a config file used to configure your session. :param archive_session: An :class:`ArchiveSession` object can be provided via the ``archive_session`` parameter. :param debug: To be passed on to get_session(). :param http_adapter_kwargs: Keyword arguments that :py:class:`requests.adapters.HTTPAdapter` takes. :param request_kwargs: Keyword arguments that :py:class:`requests.Request` takes. :returns: The Item that fits the criteria. 
Usage: >>> from internetarchive import get_item >>> item = get_item('nasa') >>> item.item_size 121084 """ if not archive_session: archive_session = get_session(config, config_file, debug, http_adapter_kwargs) return archive_session.get_item(identifier, request_kwargs=request_kwargs) def get_files( identifier: str, files: files.File | list[files.File] | None = None, formats: str | list[str] | None = None, glob_pattern: str | None = None, exclude_pattern: str | None = None, on_the_fly: bool = False, **get_item_kwargs, ) -> list[files.File]: r"""Get :class:`File` objects from an item. :param identifier: The globally unique Archive.org identifier for a given item. :param files: Only return files matching the given filenames. :param formats: Only return files matching the given formats. :param glob_pattern: Only return files matching the given glob pattern. :param exclude_pattern: Exclude files matching the given glob pattern. :param on_the_fly: Include on-the-fly files (i.e. derivative EPUB, MOBI, DAISY files). :param \*\*get_item_kwargs: Arguments that ``get_item()`` takes. :returns: Files from an item. Usage: >>> from internetarchive import get_files >>> fnames = [f.name for f in get_files('nasa', glob_pattern='*xml')] >>> print(fnames) ['nasa_reviews.xml', 'nasa_meta.xml', 'nasa_files.xml'] """ item = get_item(identifier, **get_item_kwargs) return item.get_files(files, formats, glob_pattern, exclude_pattern, on_the_fly) def modify_metadata( identifier: str, metadata: Mapping, target: str | None = None, append: bool = False, append_list: bool = False, priority: int = 0, access_key: str | None = None, secret_key: str | None = None, debug: bool = False, request_kwargs: Mapping | None = None, **get_item_kwargs, ) -> requests.Request | requests.Response: r"""Modify the metadata of an existing item on Archive.org. :param identifier: The globally unique Archive.org identifier for a given item. :param metadata: Metadata used to update the item. :param target: The metadata target to update. Defaults to `metadata`. :param append: set to True to append metadata values to current values rather than replacing. Defaults to ``False``. :param append_list: Append values to an existing multi-value metadata field. No duplicate values will be added. :param priority: Set task priority. :param access_key: IA-S3 access_key to use when making the given request. :param secret_key: IA-S3 secret_key to use when making the given request. :param debug: set to True to return a :class:`requests.Request ` object instead of sending request. Defaults to ``False``. :param \*\*get_item_kwargs: Arguments that ``get_item`` takes. :returns: A Request if debug else a Response. """ item = get_item(identifier, **get_item_kwargs) return item.modify_metadata( metadata, target=target, append=append, append_list=append_list, priority=priority, access_key=access_key, secret_key=secret_key, debug=debug, request_kwargs=request_kwargs, refresh=False ) def upload( identifier: str, files, metadata: Mapping | None = None, headers: dict | None = None, access_key: str | None = None, secret_key: str | None = None, queue_derive=None, verbose: bool = False, verify: bool = False, checksum: bool = False, delete: bool = False, retries: int | None = None, retries_sleep: int | None = None, debug: bool = False, validate_identifier: bool = False, request_kwargs: dict | None = None, **get_item_kwargs, ) -> list[requests.Request | requests.Response]: r"""Upload files to an item. The item will be created if it does not exist. 
:param identifier: The globally unique Archive.org identifier for a given item. :param files: The filepaths or file-like objects to upload. This value can be an iterable or a single file-like object or string. :param metadata: Metadata used to create a new item. If the item already exists, the metadata will not be updated -- use ``modify_metadata``. :param headers: Add additional HTTP headers to the request. :param access_key: IA-S3 access_key to use when making the given request. :param secret_key: IA-S3 secret_key to use when making the given request. :param queue_derive: Set to False to prevent an item from being derived after upload. :param verbose: Display upload progress. :param verify: Verify local MD5 checksum matches the MD5 checksum of the file received by IAS3. :param checksum: Skip uploading files based on checksum. :param delete: Delete local file after the upload has been successfully verified. :param retries: Number of times to retry the given request if S3 returns a 503 SlowDown error. :param retries_sleep: Amount of time to sleep between ``retries``. :param debug: Set to True to print headers to stdout, and exit without sending the upload request. :param validate_identifier: Set to True to validate the identifier before uploading the file. :param \*\*kwargs: Optional arguments that ``get_item`` takes. :returns: A list Requests if debug else a list of Responses. """ item = get_item(identifier, **get_item_kwargs) return item.upload( files, metadata=metadata, headers=headers, access_key=access_key, secret_key=secret_key, queue_derive=queue_derive, verbose=verbose, verify=verify, checksum=checksum, delete=delete, retries=retries, retries_sleep=retries_sleep, debug=debug, validate_identifier=validate_identifier, request_kwargs=request_kwargs, ) def download( identifier: str, files: files.File | list[files.File] | None = None, formats: str | list[str] | None = None, glob_pattern: str | None = None, dry_run: bool = False, verbose: bool = False, ignore_existing: bool = False, checksum: bool = False, checksum_archive: bool = False, destdir: str | None = None, no_directory: bool = False, retries: int | None = None, item_index: int | None = None, ignore_errors: bool = False, on_the_fly: bool = False, return_responses: bool = False, no_change_timestamp: bool = False, timeout: float | tuple[int, float] | None = None, **get_item_kwargs, ) -> list[requests.Request | requests.Response]: r"""Download files from an item. :param identifier: The globally unique Archive.org identifier for a given item. :param files: Only return files matching the given file names. :param formats: Only return files matching the given formats. :param glob_pattern: Only return files matching the given glob pattern. :param dry_run: Print URLs to files to stdout rather than downloading them. :param verbose: Turn on verbose output. :param ignore_existing: Skip files that already exist locally. :param checksum: Skip downloading file based on checksum. :param checksum_archive: Skip downloading file based on checksum, and skip checksum validation if it already succeeded (will create and use _checksum_archive.txt). :param destdir: The directory to download files to. :param no_directory: Download files to current working directory rather than creating an item directory. :param retries: The number of times to retry on failed requests. :param item_index: The index of the item for displaying progress in bulk downloads. :param ignore_errors: Don't fail if a single file fails to download, continue to download other files. 
:param on_the_fly: Download on-the-fly files (i.e. derivative EPUB, MOBI, DAISY files). :param return_responses: Rather than downloading files to disk, return a list of response objects. :param \*\*kwargs: Optional arguments that ``get_item`` takes. :returns: A list Requests if debug else a list of Responses. """ item = get_item(identifier, **get_item_kwargs) r = item.download( files=files, formats=formats, glob_pattern=glob_pattern, dry_run=dry_run, verbose=verbose, ignore_existing=ignore_existing, checksum=checksum, checksum_archive=checksum_archive, destdir=destdir, no_directory=no_directory, retries=retries, item_index=item_index, ignore_errors=ignore_errors, on_the_fly=on_the_fly, return_responses=return_responses, no_change_timestamp=no_change_timestamp, timeout=timeout, ) return r def delete( identifier: str, files: files.File | list[files.File] | None = None, formats: str | list[str] | None = None, glob_pattern: str | None = None, cascade_delete: bool = False, access_key: str | None = None, secret_key: str | None = None, verbose: bool = False, debug: bool = False, **kwargs, ) -> list[requests.Request | requests.Response]: """Delete files from an item. Note: Some system files, such as _meta.xml, cannot be deleted. :param identifier: The globally unique Archive.org identifier for a given item. :param files: Only return files matching the given filenames. :param formats: Only return files matching the given formats. :param glob_pattern: Only return files matching the given glob pattern. :param cascade_delete: Delete all files associated with the specified file, including upstream derivatives and the original. :param access_key: IA-S3 access_key to use when making the given request. :param secret_key: IA-S3 secret_key to use when making the given request. :param verbose: Print actions to stdout. :param debug: Set to True to print headers to stdout and exit exit without sending the delete request. :returns: A list Requests if debug else a list of Responses """ _files = get_files(identifier, files, formats, glob_pattern, **kwargs) responses = [] for f in _files: r = f.delete( cascade_delete=cascade_delete, access_key=access_key, secret_key=secret_key, verbose=verbose, debug=debug, ) responses.append(r) return responses def get_tasks( identifier: str = "", params: dict | None = None, config: Mapping | None = None, config_file: str | None = None, archive_session: session.ArchiveSession | None = None, http_adapter_kwargs: MutableMapping | None = None, request_kwargs: MutableMapping | None = None, ) -> set[catalog.CatalogTask]: """Get tasks from the Archive.org catalog. :param identifier: The Archive.org identifier for which to retrieve tasks for. :param params: The URL parameters to send with each request sent to the Archive.org catalog API. :returns: A set of :class:`CatalogTask` objects. """ if not archive_session: archive_session = get_session(config, config_file, False, http_adapter_kwargs) return archive_session.get_tasks( identifier=identifier, params=params, request_kwargs=request_kwargs ) def search_items( query: str, fields: Iterable | None = None, sorts=None, params: Mapping | None = None, full_text_search: bool = False, dsl_fts: bool = False, archive_session: session.ArchiveSession | None = None, config: Mapping | None = None, config_file: str | None = None, http_adapter_kwargs: MutableMapping | None = None, request_kwargs: Mapping | None = None, max_retries: int | Retry | None = None, ) -> search.Search: """Search for items on Archive.org. 
:param query: The Archive.org search query to yield results for. Refer to https://archive.org/advancedsearch.php#raw for help formatting your query. :param fields: The metadata fields to return in the search results. :param params: The URL parameters to send with each request sent to the Archive.org Advancedsearch Api. :param full_text_search: Beta support for querying the archive.org Full Text Search API [default: False]. :param dsl_fts: Beta support for querying the archive.org Full Text Search API in dsl (i.e. do not prepend ``!L `` to the ``full_text_search`` query [default: False]. :param secure: Configuration options for session. :param config_file: A path to a config file used to configure your session. :param http_adapter_kwargs: Keyword arguments that :py:class:`requests.adapters.HTTPAdapter` takes. :param request_kwargs: Keyword arguments that :py:class:`requests.Request` takes. :param max_retries: The number of times to retry a failed request. This can also be an `urllib3.Retry` object. If you need more control (e.g. `status_forcelist`), use a `ArchiveSession` object, and mount your own adapter after the session object has been initialized. For example:: >>> s = get_session() >>> s.mount_http_adapter() >>> search_results = s.search_items('nasa') See :meth:`ArchiveSession.mount_http_adapter` for more details. :returns: A :class:`Search` object, yielding search results. """ if not archive_session: archive_session = get_session(config, config_file, False, http_adapter_kwargs) return archive_session.search_items( query, fields=fields, sorts=sorts, params=params, full_text_search=full_text_search, dsl_fts=dsl_fts, request_kwargs=request_kwargs, max_retries=max_retries, ) def configure( # nosec: hardcoded_password_default username: str = "", password: str = "", config_file: str = "", host: str = "archive.org", ) -> str: """Configure internetarchive with your Archive.org credentials. :param username: The email address associated with your Archive.org account. :param password: Your Archive.org password. :returns: The config file path. Usage: >>> from internetarchive import configure >>> configure('user@example.com', 'password') """ auth_config = config_module.get_auth_config( username or input("Email address: "), password or getpass("Password: "), host, ) config_file_path = config_module.write_config_file(auth_config, config_file) return config_file_path def get_username(access_key: str, secret_key: str) -> str: """Returns an Archive.org username given an IA-S3 key pair. :param access_key: IA-S3 access_key to use when making the given request. :param secret_key: IA-S3 secret_key to use when making the given request. :returns: The username. """ j = get_user_info(access_key, secret_key) return j.get("username", "") def get_user_info(access_key: str, secret_key: str) -> dict[str, str]: """Returns details about an Archive.org user given an IA-S3 key pair. :param access_key: IA-S3 access_key to use when making the given request. :param secret_key: IA-S3 secret_key to use when making the given request. :returns: Archive.org use info. """ u = "https://s3.us.archive.org" p = {"check_auth": 1} r = requests.get(u, params=p, auth=auth.S3Auth(access_key, secret_key), timeout=10) r.raise_for_status() j = r.json() if j.get("error"): raise AuthenticationError(j.get("error")) else: return j python-internetarchive-5.7.2/internetarchive/auth.py000066400000000000000000000051211513674652200227550ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. 
# # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.auth ~~~~~~~~~~~~~~~~~~~~ This module contains the Archive.org authentication handlers for Requests. :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ from __future__ import annotations from requests.auth import AuthBase from internetarchive.exceptions import AuthenticationError class S3Auth(AuthBase): """Attaches S3 Basic Authentication to the given Request object.""" def __init__(self, access_key: str | None = None, secret_key: str | None = None): self.access_key = access_key self.secret_key = secret_key def __call__(self, r): if not self.access_key: if self.secret_key: raise AuthenticationError('No access_key set!' ' Have you run `ia configure`?') if not self.secret_key: if self.access_key: raise AuthenticationError('No secret_key set!' ' Have you run `ia configure`?') else: raise AuthenticationError('No access_key or secret_key set!' ' Have you run `ia configure`?') auth_str = f'LOW {self.access_key}:{self.secret_key}' r.headers['Authorization'] = auth_str return r class S3PostAuth(AuthBase): """Attaches S3 Basic Authentication to the given Request object.""" def __init__(self, access_key: str | None = None, secret_key: str | None = None): self.access_key = access_key self.secret_key = secret_key def __call__(self, r): auth_str = f'&access={self.access_key}&secret={self.secret_key}' if not r.body: r.body = '' r.body += auth_str r.headers['content-type'] = 'application/x-www-form-urlencoded' return r python-internetarchive-5.7.2/internetarchive/catalog.py000066400000000000000000000272241513674652200234360ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.catalog ~~~~~~~~~~~~~~~~~~~~~~~ This module contains objects for interacting with the Archive.org catalog. :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. 
""" from __future__ import annotations from datetime import datetime from logging import getLogger from typing import Iterable, Mapping, MutableMapping from requests import Response from requests.exceptions import HTTPError from internetarchive import auth from internetarchive import session as ia_session from internetarchive.utils import json log = getLogger(__name__) def sort_by_date(task_dict: CatalogTask) -> datetime: if task_dict.category == 'summary': # type: ignore return datetime.now() try: return datetime.strptime(task_dict['submittime'], '%Y-%m-%d %H:%M:%S.%f') except Exception: return datetime.strptime(task_dict['submittime'], '%Y-%m-%d %H:%M:%S') class Catalog: """This class represents the Archive.org catalog. You can use this class to access and submit tasks from the catalog. This is a low-level interface, and in most cases the functions in :mod:`internetarchive.api` and methods in :class:`ArchiveSession ` should be used. It uses the archive.org `Tasks API `_ Usage:: >>> from internetarchive import get_session, Catalog >>> s = get_session() >>> c = Catalog(s) >>> tasks = c.get_tasks('nasa') >>> tasks[-1].task_id 31643502 """ def __init__( self, archive_session: ia_session.ArchiveSession, request_kwargs: Mapping | None = None, ): """ Initialize :class:`Catalog ` object. :param archive_session: An :class:`ArchiveSession ` object. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.get` and :meth:`requests.sessions.Session.post` requests. """ self.session = archive_session self.auth = auth.S3Auth(self.session.access_key, self.session.secret_key) self.request_kwargs = request_kwargs or {} self.url = f'{self.session.protocol}//{self.session.host}/services/tasks.php' def get_summary(self, identifier: str = "", params: dict | None = None) -> dict: """Get the total counts of catalog tasks meeting all criteria, organized by run status (queued, running, error, and paused). :param identifier: Item identifier. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :returns: the total counts of catalog tasks meeting all criteria """ params = params or {} if identifier: params['identifier'] = identifier params.update({'summary': 1, 'history': 0, 'catalog': 0}) r = self.make_tasks_request(params) j = r.json() if j.get('success') is True: return j['value']['summary'] else: return j def make_tasks_request(self, params: Mapping | None) -> Response: """Make a GET request to the `Tasks API `_ :param params: Query parameters, refer to `Tasks API `_ for available parameters. :returns: :class:`requests.Response` """ r = self.session.get(self.url, params=params, auth=self.auth, **self.request_kwargs) try: r.raise_for_status() except HTTPError as exc: j = r.json() error = j['error'] raise HTTPError(error, response=r) return r def iter_tasks(self, params: MutableMapping | None = None) -> Iterable[CatalogTask]: """A generator that can make arbitrary requests to the Tasks API. It handles paging (via cursor) automatically. :param params: Query parameters, refer to `Tasks API `_ for available parameters. 
:returns: collections.Iterable[CatalogTask] """ params = params or {} while True: r = self.make_tasks_request(params) j = r.json() for row in j.get('value', {}).get('catalog', []): yield CatalogTask(row, self) for row in j.get('value', {}).get('history', []): yield CatalogTask(row, self) if not j.get('value', {}).get('cursor'): break params['cursor'] = j['value']['cursor'] def get_rate_limit(self, cmd: str = 'derive.php'): params = {'rate_limits': 1, 'cmd': cmd} r = self.make_tasks_request(params) line = '' tasks = [] for c in r.iter_content(): c = c.decode('utf-8') if c == '\n': j = json.loads(line) task = CatalogTask(j, self) tasks.append(task) line = '' line += c j = json.loads(line) return j def get_tasks(self, identifier: str = "", params: dict | None = None) -> list[CatalogTask]: """Get a list of all tasks meeting all criteria. The list is ordered by submission time. :param identifier: The item identifier, if provided will return tasks for only this item filtered by other criteria provided in params. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :returns: A list of all tasks meeting all criteria. """ params = params or {} if identifier: params.update({'identifier': identifier}) params.update({'limit': 0}) if not params.get('summary'): params['summary'] = 0 r = self.make_tasks_request(params) line = '' tasks = [] for c in r.iter_content(): c = c.decode('utf-8') if c == '\n': j = json.loads(line) task = CatalogTask(j, self) tasks.append(task) line = '' line += c if line.strip(): j = json.loads(line) task = CatalogTask(j, self) tasks.append(task) all_tasks = sorted(tasks, key=sort_by_date, reverse=True) return all_tasks def submit_task(self, identifier: str, cmd: str, comment: str | None = None, priority: int = 0, data: dict | None = None, headers: dict | None = None) -> Response: """Submit an archive.org task. :param identifier: Item identifier. :param cmd: Task command to submit, see `supported task commands `_. :param comment: A reasonable explanation for why the task is being submitted. :param priority: Task priority from 10 to -10 (default: 0). :param data: Extra POST data to submit with the request. Refer to `Tasks API Request Entity `_. :param headers: Add additional headers to request. :returns: :class:`requests.Response` """ data = data or {} data.update({'cmd': cmd, 'identifier': identifier}) if comment: if 'args' in data: data['args']['comment'] = comment else: data['args'] = {'comment': comment} if priority: data['priority'] = priority r = self.session.post(self.url, json=data, auth=self.auth, headers=headers, **self.request_kwargs) return r class CatalogTask: """This class represents an Archive.org catalog task. It is primarily used by :class:`Catalog`, and should not be used directly. """ def __init__(self, task_dict: Mapping, catalog_obj: Catalog): self.session = catalog_obj.session self.request_kwargs = catalog_obj.request_kwargs self.color = None self.task_dict = task_dict for key, value in task_dict.items(): setattr(self, key, value) # Confuses mypy ;-) def __repr__(self): color = self.task_dict.get('color', 'done') return ('CatalogTask(identifier={identifier},' ' task_id={task_id!r}, server={server!r},' ' cmd={cmd!r},' ' submitter={submitter!r},' ' color={task_color!r})'.format(task_color=color, **self.task_dict)) def __getitem__(self, key: str): """Dict-like access provided as backward compatibility.""" return self.task_dict[key] def json(self): return json.dumps(self.task_dict) def task_log(self) -> str: """Get task log. 
:returns: The task log as a string. """ task_id = self.task_id # type: ignore if task_id is None: raise ValueError('task_id is None') return self.get_task_log(task_id, self.session, self.request_kwargs) @staticmethod def get_task_log( task_id: int | str | None, session: ia_session.ArchiveSession, request_kwargs: Mapping | None = None ) -> str: """Static method for getting a task log, given a task_id. This method exists so a task log can be retrieved without retrieving the items task history first. :param task_id: The task id for the task log you'd like to fetch. :param archive_session: :class:`ArchiveSession ` :param request_kwargs: Keyword arguments that :py:class:`requests.Request` takes. :returns: The task log as a string. """ request_kwargs = request_kwargs or {} _auth = auth.S3Auth(session.access_key, session.secret_key) if session.host == 'archive.org': host = 'catalogd.archive.org' else: host = session.host url = f'{session.protocol}//{host}/services/tasks.php' params = {'task_log': task_id} r = session.get(url, params=params, auth=_auth, **request_kwargs) r.raise_for_status() return r.content.decode('utf-8', errors='surrogateescape') python-internetarchive-5.7.2/internetarchive/cli/000077500000000000000000000000001513674652200222125ustar00rootroot00000000000000python-internetarchive-5.7.2/internetarchive/cli/__init__.py000066400000000000000000000026011513674652200243220ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.cli ~~~~~~~~~~~~~~~~~~~ :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ from internetarchive.cli import ( cli_utils, ia, ia_account, ia_configure, ia_copy, ia_delete, ia_download, ia_list, ia_metadata, ia_move, ia_reviews, ia_search, ia_tasks, ia_upload, ) __all__ = [ "cli_utils", "ia", "ia_account", "ia_configure", "ia_copy", "ia_delete", "ia_download", "ia_list", "ia_metadata", "ia_move", "ia_reviews", "ia_search", "ia_tasks", "ia_upload", ] python-internetarchive-5.7.2/internetarchive/cli/cli_utils.py000066400000000000000000000163251513674652200245620ustar00rootroot00000000000000""" interneratchive.cli.cli_utils """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
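# The helpers below convert repeated ``KEY:VALUE`` (or ``key=value`` query-string
# style) command-line arguments into plain dicts, collapsing single-item lists to
# scalars.  A minimal illustrative sketch of the expected behaviour (kept in
# comments so nothing runs on import; exact results depend on the
# ``get_args_dict`` implementation below):
#
#     from internetarchive.cli.cli_utils import get_args_dict
#
#     md = get_args_dict(["title:My Item", "subject:foo", "subject:bar"])
#     # -> {"title": "My Item", "subject": ["foo", "bar"]}
#
#     params = get_args_dict(["cnt=0&rows=50"], query_string=True)
#     # -> {"cnt": "0", "rows": "50"}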
from __future__ import annotations import argparse import json import os import signal import sys from collections import defaultdict from collections.abc import Iterable from typing import Mapping from urllib.parse import parse_qsl from internetarchive.utils import InvalidIdentifierException, validate_s3_identifier def get_args_dict(args: list[str], query_string: bool = False, header: bool = False) -> dict: args = args or [] if not isinstance(args, list): args = [args] metadata: dict[str, list | str] = defaultdict(list) for md in args: if query_string: if (":" in md) and ("=" not in md): md = md.replace(":", "=").replace(";", "&") for key, value in parse_qsl(md): assert value metadata[key] = value else: key, value = md.split(":", 1) assert value if value not in metadata[key]: metadata[key].append(value) # type: ignore for key in metadata: # noqa: PLC0206 # Flatten single item lists. if len(metadata[key]) <= 1: metadata[key] = metadata[key][0] return metadata def convert_str_list_to_unicode(str_list: list[bytes]): encoding = sys.getfilesystemencoding() return [b.decode(encoding) for b in str_list] def validate_identifier(identifier): try: validate_s3_identifier(identifier) except InvalidIdentifierException as e: raise argparse.ArgumentTypeError(str(e)) return identifier def flatten_list(lst): """Flatten a list if it contains lists.""" result = [] for item in lst: if isinstance(item, Iterable) and not isinstance(item, str): result.extend(flatten_list(item)) # Recursively flatten else: result.append(item) # Just append the item if it's not a list return result class FlattenListAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): # Flatten the list of values (if nested) flattened = flatten_list(values) # Initialize the attribute if it doesn't exist yet if getattr(namespace, self.dest, None) is None: setattr(namespace, self.dest, []) # Append the flattened list to the existing attribute getattr(namespace, self.dest).extend(flattened) class PostDataAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): current_value = getattr(namespace, self.dest, None) # Split values into individual JSON objects (if needed) and parse them all_values = [] for value in values: try: obj = json.loads(value) all_values.append(obj) except json.JSONDecodeError as e: parser.error(f"Invalid JSON format for post data: {value}") # If there is no current value (first argument), initialize it as an object or list if current_value is None: # If there's only one value, don't wrap it in a list if len(all_values) == 1: post_data = all_values[0] else: post_data = all_values elif isinstance(current_value, list): # If it's already a list, append the new values to it post_data = current_value + all_values else: # If it's a single object (first argument), convert it into a list and append new data post_data = [current_value] + all_values # Set the final value back to the namespace setattr(namespace, self.dest, post_data) class QueryStringAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): # Initialize the destination as an empty dictionary if it doesn't exist if getattr(namespace, self.dest, None) is None: setattr(namespace, self.dest, {}) for sublist in values: if "=" not in sublist and ":" in sublist: sublist = sublist.replace(":", "=", 1) key_value_pairs = parse_qsl(sublist) if sublist and not key_value_pairs: parser.error(f"{option_string} must be formatted as 'key=value' " "or 'key:value'") for key, value in 
key_value_pairs: current_dict = getattr(namespace, self.dest) if key in current_dict: current_dict[key].append(value) else: current_dict[key] = [value] current_dict = getattr(namespace, self.dest) for key, value in current_dict.items(): if len(value) == 1: current_dict[key] = value[0] class MetadataAction(argparse.Action): def __call__(self, parser, namespace, values, option_string=None): # Initialize the destination as an empty dictionary if it doesn't exist if getattr(namespace, self.dest, None) is None: setattr(namespace, self.dest, {}) for sublist in values: if ":" not in sublist and "=" in sublist: sublist = sublist.replace("=", ":", 1) try: key, value = sublist.split(":", 1) except ValueError: parser.error(f"{option_string} must be formatted as 'KEY:VALUE'") current_dict = getattr(namespace, self.dest) if key in current_dict: if not isinstance(current_dict[key], list): current_dict[key] = [current_dict[key]] current_dict[key].append(value) else: current_dict[key] = value def validate_dir_path(path): """ Check if the given path is a directory that exists. Args: path (str): The path to check. Returns: str: The validated directory path. Raises: argparse.ArgumentTypeError: If the path is not a valid directory. """ if os.path.isdir(path): return path else: raise argparse.ArgumentTypeError(f"'{path}' is not a valid directory") def exit_on_signal(sig, frame): """ Exit the program cleanly upon receiving a specified signal. This function is designed to be used as a signal handler. When a signal (such as SIGINT or SIGPIPE) is received, it exits the program with an exit code of 128 plus the signal number. This convention helps to distinguish between regular exit codes and those caused by signals. """ exit_code = 128 + sig sys.exit(exit_code) python-internetarchive-5.7.2/internetarchive/cli/ia.py000077500000000000000000000122301513674652200231560ustar00rootroot00000000000000#!/usr/bin/env python """ ia.py The internetarchive module is a Python/CLI interface to Archive.org. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import argparse import signal import sys from internetarchive import __version__, get_session from internetarchive.cli import ( ia_account, ia_configure, ia_copy, ia_delete, ia_download, ia_flag, ia_list, ia_metadata, ia_move, ia_reviews, ia_search, ia_simplelists, ia_tasks, ia_upload, ) from internetarchive.cli.cli_utils import exit_on_signal # Handle broken pipe try: signal.signal(signal.SIGPIPE, signal.SIG_DFL) except AttributeError: # Non-unix support pass # Handle signal.signal(signal.SIGINT, exit_on_signal) def validate_config_path(path): """ Validate the path to the configuration file. Returns: str: Validated path to the configuration file. """ if "configure" not in sys.argv: # Support for adding config to specific file file_check = argparse.FileType("r") file_check(path) return path def main(): """ Main entry point for the CLI. 
""" parser = argparse.ArgumentParser( description="A command line interface to Archive.org.", epilog=("Documentation for 'ia' is available at:\n\n\t" "https://archive.org/developers/internetarchive/cli.html\n\n" "See 'ia {command} --help' for help on a specific command."), formatter_class=argparse.RawTextHelpFormatter) # support for \n in epilog parser.add_argument("-v", "--version", action="version", version=__version__) parser.add_argument("-c", "--config-file", action="store", type=validate_config_path, metavar="FILE", help="path to configuration file") parser.add_argument("-l", "--log", action="store_true", default=False, help="enable logging") parser.add_argument("-d", "--debug", action="store_true", help="enable debugging") parser.add_argument("-i", "--insecure", action="store_true", help="allow insecure connections") parser.add_argument("-H", "--host", action="store", help=("host to connect to " "(doesn't work for requests made to s3.us.archive.org)")) parser.add_argument("--user-agent-suffix", action="store", metavar="STRING", help="custom string to append to the default User-Agent " "(default with access key is always included)") subparsers = parser.add_subparsers(title="commands", dest="command", metavar="{command}") # Add subcommand parsers ia_account.setup(subparsers) ia_configure.setup(subparsers) ia_copy.setup(subparsers) ia_delete.setup(subparsers) ia_download.setup(subparsers) ia_flag.setup(subparsers) ia_list.setup(subparsers) ia_metadata.setup(subparsers) ia_move.setup(subparsers) ia_reviews.setup(subparsers) ia_search.setup(subparsers) ia_simplelists.setup(subparsers) ia_tasks.setup(subparsers) ia_upload.setup(subparsers) # Suppress help for alias subcommands args = parser.parse_args() config: dict[str, dict] = {} if args.log: config["logging"] = {"level": "INFO"} elif args.debug: config["logging"] = {"level": "DEBUG"} if args.insecure: config["general"] = {"secure": False} if args.host: if config.get("general"): config["general"]["host"] = args.host else: config["general"] = {"host": args.host} if args.user_agent_suffix: if config.get("general"): config["general"]["user_agent_suffix"] = args.user_agent_suffix else: config["general"] = {"user_agent_suffix": args.user_agent_suffix} args.session = get_session(config_file=args.config_file, config=config, debug=args.debug) # Check if any arguments were provided if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) args.func(args) if __name__ == "__main__": main() python-internetarchive-5.7.2/internetarchive/cli/ia_account.py000066400000000000000000000100541513674652200246710ustar00rootroot00000000000000""" ia_account.py 'ia' subcommand for configuring 'ia' with your archive.org credentials. """ # Copyright (C) 2012-2025 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
import argparse import json import sys from internetarchive import configure from internetarchive.account import Account from internetarchive.exceptions import AccountAPIError from internetarchive.utils import is_valid_email def setup(subparsers): """ Setup args for configure command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("account", aliases=["ac"], description=( "Manage an archive.org account.\n\n" "Note: This command requires administrative " "privileges. " ), help=("Manage an archive.org account. " "Note: requires admin privileges")) group = parser.add_mutually_exclusive_group() parser.add_argument("user", help="Email address, screenname, or itemname " "for an archive.org account") group.add_argument("-g", "--get-email", action="store_true", help="Print the email address associated with the user and exit") group.add_argument("-s", "--get-screenname", action="store_true", help="Print the screenname associated with the user and exit") group.add_argument("-i", "--get-itemname", action="store_true", help="Print the itemname associated with the user and exit") group.add_argument("-l", "--is-locked", action="store_true", help="Check if an account is locked") group.add_argument("-L", "--lock", action="store_true", help="Lock an account") group.add_argument("-u", "--unlock", action="store_true", help="Unlock an account") parser.add_argument("-c", "--comment", type=str, help="Comment to include with lock/unlock action") parser.set_defaults(func=main) def main(args: argparse.Namespace) -> None: """ Main entrypoint for 'ia account'. """ try: if args.user.startswith('@'): account = Account.from_account_lookup('itemname', args.user) elif not is_valid_email(args.user): account = Account.from_account_lookup('screenname', args.user) else: account = Account.from_account_lookup('email', args.user) except AccountAPIError as exc: print(json.dumps(exc.error_data)) sys.exit(1) if args.get_email: print(account.canonical_email) elif args.get_screenname: print(account.screenname) elif args.get_itemname: print(account.itemname) elif args.is_locked: print(account.locked) elif args.lock: r = account.lock(args.comment, session=args.session) print(r.text) elif args.unlock: r = account.unlock(args.comment, session=args.session) print(r.text) else: account_data = account.to_dict() print(json.dumps(account_data)) python-internetarchive-5.7.2/internetarchive/cli/ia_configure.py000066400000000000000000000164521513674652200252260ustar00rootroot00000000000000""" ia_configure.py 'ia' subcommand for configuring 'ia' with your archive.org credentials. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse import json import netrc import sys from internetarchive import configure from internetarchive.exceptions import AuthenticationError def setup(subparsers): """ Setup args for configure command. 
Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("configure", aliases=["co"], help=("configure 'ia' with your " "archive.org credentials")) config_action_group = parser.add_mutually_exclusive_group() parser.add_argument("--username", "-u", help=("provide username as an option rather than " "providing it interactively")) parser.add_argument("--password", "-p", help=("provide password as an option rather than " "providing it interactively")) parser.add_argument("--netrc", "-n", action="store_true", help="use netrc file for login") config_action_group.add_argument("--show", "-s", action="store_true", help=("print the current configuration in JSON format, " "redacting secrets and cookies")) config_action_group.add_argument("--check", "-C", action="store_true", help="validate IA-S3 keys (exits 0 if valid, 1 otherwise)") config_action_group.add_argument("--whoami", "-w", action="store_true", help=("uses your IA-S3 keys to retrieve account " "information from archive.org " "about the associated account")) parser.add_argument("--print-cookies", "-c", action="store_true", help="print archive.org logged-in-* cookies") parser.add_argument("--print-auth-header", "-a", action="store_true", help="print an Authorization header with your IA-S3 keys") parser.set_defaults(func=main) def main(args: argparse.Namespace) -> None: """ Main entrypoint for 'ia configure'. """ if args.print_auth_header: secret = args.session.config.get("s3", {}).get("secret") access = args.session.config.get("s3", {}).get("access") if not secret or not access: print('hi') if not access: print("error: 'access' key not found in config file, try reconfiguring.", file=sys.stderr) elif not secret: print("error: 'secret' key not found in config file, try reconfiguring.", file=sys.stderr) sys.exit(1) print(f"Authorization: LOW {access}:{secret}") sys.exit() if args.print_cookies: user = args.session.config.get("cookies", {}).get("logged-in-user") sig = args.session.config.get("cookies", {}).get("logged-in-sig") if not user or not sig: if not user and not sig: print("error: 'logged-in-user' and 'logged-in-sig' cookies " "not found in config file, try reconfiguring.", file=sys.stderr) elif not user: print("error: 'logged-in-user' cookie not found in config file, " "try reconfiguring.", file=sys.stderr) elif not sig: print("error: 'logged-in-sig' cookie not found in config file, " "try reconfiguring.", file=sys.stderr) sys.exit(1) print(f"logged-in-user={user}; logged-in-sig={sig}") sys.exit() if args.show: config = args.session.config.copy() # Redact S3 secret if 's3' in config: s3_config = config['s3'].copy() if 'secret' in s3_config: s3_config['secret'] = 'REDACTED' config['s3'] = s3_config # Redact logged-in-secret cookie if 'cookies' in config: cookies = config['cookies'].copy() if 'logged-in-sig' in cookies: cookies['logged-in-sig'] = 'REDACTED' config['cookies'] = cookies print(json.dumps(config)) sys.exit() if args.whoami: whoami_info = args.session.whoami() print(json.dumps(whoami_info)) sys.exit() if args.check: whoami_info = args.session.whoami() if whoami_info.get('success') is True: user = whoami_info['value']['username'] print(f'The credentials for "{user}" are valid') sys.exit(0) else: print('Your credentials are invalid, check your configuration and try again') sys.exit(1) try: # Netrc if args.netrc: print("Configuring 'ia' with netrc file...", file=sys.stderr) try: n = netrc.netrc() except netrc.NetrcParseError: print("error: netrc.netrc() cannot parse your .netrc file.", 
file=sys.stderr) sys.exit(1) except FileNotFoundError: print("error: .netrc file not found.", file=sys.stderr) sys.exit(1) username, _, password = n.hosts["archive.org"] config_file_path = configure(username, password or "", config_file=args.session.config_file, host=args.session.host) print(f"Config saved to: {config_file_path}", file=sys.stderr) # Interactive input. else: if not (args.username and args.password): print("Enter your Archive.org credentials below to configure 'ia'.\n") config_file_path = configure(args.username, args.password, config_file=args.session.config_file, host=args.session.host) saved_msg = f"Config saved to: {config_file_path}" if not all([args.username, args.password]): saved_msg = f"\n{saved_msg}" print(saved_msg) except AuthenticationError as exc: print(f"\nerror: {exc}", file=sys.stderr) sys.exit(1) python-internetarchive-5.7.2/internetarchive/cli/ia_copy.py000066400000000000000000000144721513674652200242170ustar00rootroot00000000000000""" ia_copy.py 'ia' subcommand for copying files on archive.org """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse import sys from typing import Optional from urllib.parse import quote from requests import Response import internetarchive as ia from internetarchive.cli.cli_utils import MetadataAction, QueryStringAction from internetarchive.utils import get_s3_xml_text, merge_dictionaries def setup(subparsers): """ Setup args for copy command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("copy", aliases=["cp"], help="Copy files from archive.org items") # Positional arguments parser.add_argument("source", metavar="SOURCE", help="Source file formatted as: identifier/file") parser.add_argument("destination", metavar="DESTINATION", help="Destination file formatted as: identifier/file") # Options parser.add_argument("-m", "--metadata", metavar="KEY:VALUE", nargs="+", default={}, action=MetadataAction, help=("Metadata to add to your new item, if you are moving the " "file to a new item")) parser.add_argument("--replace-metadata", action="store_true", help=("Only use metadata specified as argument, do not copy any " "from the source item")) parser.add_argument("-H", "--header", metavar="KEY:VALUE", nargs="+", default={}, action=QueryStringAction, help="S3 HTTP headers to send with your request") parser.add_argument("--ignore-file-metadata", action="store_true", help="Do not copy file metadata") parser.add_argument("-n", "--no-derive", action="store_true", help="Do not derive uploaded files") parser.add_argument("--no-backup", action="store_true", help=("Turn off archive.org backups, " "clobbered files will not be saved to " "'history/files/$key.~N~'")) parser.set_defaults(func=lambda args: main(args, "copy", parser)) def assert_src_file_exists(src_location: str) -> bool: """ Assert that the source file exists on archive.org. 
""" assert SRC_ITEM.exists # type: ignore global SRC_FILE src_filename = src_location.split("/", 1)[-1] SRC_FILE = SRC_ITEM.get_file(src_filename) # type: ignore assert SRC_FILE.exists # type: ignore return True def main(args: argparse.Namespace, cmd: str, parser: argparse.ArgumentParser) -> tuple[Response, ia.files.File | None]: """ Main entry point for 'ia copy'. """ SRC_FILE = None if args.source == args.destination: parser.error("error: The source and destination files cannot be the same!") global SRC_ITEM SRC_ITEM = args.session.get_item(args.source.split("/")[0]) # type: ignore SRC_FILE = SRC_ITEM.get_file(args.source.split("/",1)[-1]) # type: ignore try: assert_src_file_exists(args.source) except AssertionError: parser.error(f"error: https://{args.session.host}/download/{args.source} " "does not exist. Please check the " "identifier and filepath and retry.") args.header["x-amz-copy-source"] = f"/{quote(args.source)}" # Copy the old metadata verbatim if no additional metadata is supplied, # else combine the old and the new metadata in a sensible manner. if args.metadata or args.replace_metadata: args.header["x-amz-metadata-directive"] = "REPLACE" else: args.header["x-amz-metadata-directive"] = "COPY" # New metadata takes precedence over old metadata. if not args.replace_metadata: args.metadata = merge_dictionaries(SRC_ITEM.metadata, # type: ignore args.metadata) # File metadata is copied by default but can be dropped. file_metadata = None if args.ignore_file_metadata else SRC_FILE.metadata # type: ignore # Add keep-old-version by default. if not args.header.get("x-archive-keep-old-version") and not args.no_backup: args.header["x-archive-keep-old-version"] = "1" url = f"{args.session.protocol}//s3.us.archive.org/{quote(args.destination)}" queue_derive = not args.no_derive req = ia.iarequest.S3Request(url=url, method="PUT", metadata=args.metadata, file_metadata=file_metadata, headers=args.header, queue_derive=queue_derive, access_key=args.session.access_key, secret_key=args.session.secret_key) p = req.prepare() r = args.session.send(p) if r.status_code != 200: try: msg = get_s3_xml_text(r.text) except Exception as e: msg = r.text print(f"error: failed to {cmd} '{args.source}' to '{args.destination}' - {msg}", file=sys.stderr) sys.exit(1) elif cmd == "copy": print(f"success: copied '{args.source}' to '{args.destination}'.", file=sys.stderr) return (r, SRC_FILE) python-internetarchive-5.7.2/internetarchive/cli/ia_delete.py000066400000000000000000000153401513674652200245020ustar00rootroot00000000000000""" ia_delete.py 'ia' subcommand for deleting files from archive.org items. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
import argparse import sys import requests.exceptions from requests.exceptions import HTTPError from internetarchive.cli.cli_utils import ( FlattenListAction, MetadataAction, QueryStringAction, validate_identifier, ) from internetarchive.utils import get_s3_xml_text def setup(subparsers): """ Setup args for delete command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("delete", aliases=["rm"], help="Delete files from archive.org items") # Positional arguments parser.add_argument("identifier", type=validate_identifier, help="Identifier for the item from which files are to be deleted.") parser.add_argument("file", type=str, nargs="*", help="Specific file(s) to delete.") # Optional arguments parser.add_argument("-q", "--quiet", action="store_true", help="Print status to stdout.") parser.add_argument("-c", "--cascade", action="store_true", help="Delete all associated files including derivatives and the original.") parser.add_argument("-H", "--header", nargs="+", action=QueryStringAction, default={}, metavar="KEY:VALUE", help="S3 HTTP headers to send with your request.") parser.add_argument("-a", "--all", action="store_true", help="Delete all files in the given item. Some files cannot be deleted.") parser.add_argument("-d", "--dry-run", action="store_true", help=("Output files to be deleted to stdout, " "but don't actually delete them.")) parser.add_argument("-g", "--glob", type=str, help="Only delete files matching the given pattern.") parser.add_argument("-f", "--format", type=str, nargs="+", action=FlattenListAction, help="Only delete files matching the specified formats.") parser.add_argument("-R", "--retries", type=int, default=2, help="Number of retries on S3 503 SlowDown error.") parser.add_argument("--no-backup", action="store_true", help="Turn off archive.org backups. Clobbered files will not be saved.") parser.set_defaults(func=lambda args: main(args, parser)) def get_files_to_delete(args: argparse.Namespace, item) -> list: """Get files to delete based on command-line arguments.""" if args.all or args.file == []: files = list(item.get_files()) args.cascade = True if args.glob: files = list(item.get_files(glob_pattern=args.glob)) elif args.format: files = list(item.get_files(formats=args.format)) else: fnames = [f.strip() for f in (sys.stdin if args.file == ["-"] else args.file)] files = list(item.get_files(fnames)) return files def delete_files(files, args, item, verbose): """ Deletes files from an item. Args: files (list): A list of files to delete. args (argparse.Namespace): Parsed command-line arguments. item: The item from which files are being deleted. verbose (bool): If True, verbose output is enabled. Returns: bool: True if errors occurred during deletion, False otherwise. 
""" errors = False for f in files: if not f: if verbose: print(f" error: '{f.name}' does not exist", file=sys.stderr) errors = True continue if args.dry_run: if args.cascade: print(f" will delete: {item.identifier}/{f.name} and all derivatives", file=sys.stderr) else: print(f" will delete: {item.identifier}/{f.name}", file=sys.stderr) continue try: resp = f.delete(verbose=verbose, cascade_delete=args.cascade, headers=args.header, retries=args.retries) except requests.exceptions.RetryError: print(f" error: max retries exceeded for {f.name}", file=sys.stderr) errors = True continue except HTTPError as exc: errors = True msg = get_s3_xml_text(exc.response.content) if not msg or msg == str(exc.response.content): msg = str(exc) print(f" error: {msg} ({exc.response.status_code})", file=sys.stderr) continue if resp.status_code != 204: errors = True msg = get_s3_xml_text(resp.content) print(f" error: {msg} ({resp.status_code})", file=sys.stderr) continue return errors def main(args: argparse.Namespace, parser: argparse.ArgumentParser): """ Main entry point for 'ia delete'. """ verbose = not args.quiet item = args.session.get_item(args.identifier) if not item.exists: print(f"{item.identifier}: skipping, item doesn't exist.", file=sys.stderr) return # Add keep-old-version by default. if "x-archive-keep-old-version" not in args.header and not args.no_backup: args.header["x-archive-keep-old-version"] = "1" if verbose: print(f"Deleting files from {item.identifier}", file=sys.stderr) files = get_files_to_delete(args, item) if not files: print(" warning: no files found, nothing deleted.", file=sys.stderr) sys.exit(1) errors = delete_files(files, args, item, verbose) if errors: sys.exit(1) python-internetarchive-5.7.2/internetarchive/cli/ia_download.py000066400000000000000000000251231513674652200250470ustar00rootroot00000000000000""" ia_download.py 'ia' subcommand for downloading files from archive.org. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse import sys from typing import TextIO from internetarchive.cli.cli_utils import ( QueryStringAction, validate_dir_path, ) from internetarchive.files import File from internetarchive.search import Search def setup(subparsers): """ Setup args for download command. 
Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("download", aliases=["do"], help="Download files from archive.org",) # Main options parser.add_argument("identifier", nargs="?", type=str, help="Identifier for the upload") parser.add_argument("file", nargs="*", help="Files to download (only allowed with identifier)") # Additional options parser.add_argument("-q", "--quiet", action="store_true", help="Turn off ia's output") parser.add_argument("-d", "--dry-run", action="store_true", help="Print URLs to stdout and exit") parser.add_argument("-i", "--ignore-existing", action="store_true", help="Clobber files already downloaded") parser.add_argument("-C", "--checksum", action="store_true", help="Skip files based on checksum") parser.add_argument("--checksum-archive", action="store_true", help="Skip files based on _checksum_archive.txt file") parser.add_argument("-R", "--retries", type=int, default=5, help="Set number of retries to (default: 5)") parser.add_argument("-I", "--itemlist", type=argparse.FileType("r"), help=("Download items from a specified file. " "Itemlists should be a plain text file with one " "identifier per line")) parser.add_argument("-S", "--search", help="Download items returned from a specified search query") parser.add_argument("-P", "--search-parameters", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="Parameters to send with your --search query") parser.add_argument("-g", "--glob", help=("Only download files whose filename matches " "the given glob pattern. You can provide multiple " "patterns separated by a pipe symbol `|`")) parser.add_argument("-e", "--exclude", help=("Exclude files whose filename matches " "the given glob pattern. You can provide multiple " "patterns separated by a pipe symbol `|`. You can only " "use this option in conjunction with --glob")) parser.add_argument("-f", "--format", nargs="+", action="extend", help=("Only download files of the specified format. " "Use this option multiple times to download " "multiple formats. You can use the following command to " "retrieve a list of file formats contained within a " "given item: ia metadata --formats ")) parser.add_argument("--on-the-fly", action="store_true", help=("Download on-the-fly files, as well as other " "matching files. on-the-fly files include derivative " "EPUB, MOBI and DAISY files [default: False]")) parser.add_argument("--no-directories", action="store_true", help=("Download files into working directory. " "Do not create item directories")) parser.add_argument("--destdir", type=validate_dir_path, help=("The destination directory to download files " "and item directories to")) parser.add_argument("-s", "--stdout", action="store_true", help="Write file contents to stdout") parser.add_argument("--no-change-timestamp", action="store_true", help=("Don't change the timestamp of downloaded files to reflect " "the source material")) parser.add_argument("-p", "--parameters", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="Parameters to send with your download request (e.g. `cnt=0`)") parser.add_argument("-a", "--download-history", action="store_true", help="Also download files from the history directory") parser.add_argument("--source", nargs="+", action="extend", help=("Filter files based on their source value in files.xml " "(i.e. `original`, `derivative`, `metadata`)")) parser.add_argument("--exclude-source", nargs="+", action="extend", help=("Filter files based on their source value in files.xml " "(i.e. 
`original`, `derivative`, `metadata`)")) parser.add_argument("-t", "--timeout", type=float, help=("Set a timeout for download requests. " "This sets both connect and read timeout")) parser.set_defaults(func=lambda args: main(args, parser)) def validate_args(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: if args.itemlist and args.search: parser.error("--itemlist and --search cannot be used together") if args.itemlist or args.search: if args.identifier: parser.error("Cannot specify an identifier with --itemlist/--search") if args.file: parser.error("Cannot specify files with --itemlist/--search") else: if not args.identifier: parser.error("Identifier is required when not using --itemlist/--search") def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """ Main entry point for 'ia download'. """ ids: list[File | str] | Search | TextIO validate_args(args, parser) if args.itemlist: ids = [x.strip() for x in args.itemlist if x.strip()] if not ids: parser.error("--itemlist file is empty or contains only whitespace") total_ids = len(ids) elif args.search: try: _search = args.session.search_items(args.search, params=args.search_parameters) total_ids = _search.num_found if total_ids == 0: print(f"error: the query '{args.search}' returned no results", file=sys.stderr) sys.exit(1) ids = _search except ValueError as e: print(f"error: {e}", file=sys.stderr) sys.exit(1) # Download specific files. if args.identifier and args.identifier != "-": if "/" in args.identifier: identifier = args.identifier.split("/")[0] files = ["/".join(args.identifier.split("/")[1:])] else: identifier = args.identifier files = args.file total_ids = 1 ids = [identifier] elif args.identifier == "-": total_ids = 1 ids = sys.stdin files = None else: files = None errors = [] for i, identifier in enumerate(ids): try: identifier = identifier.strip() except AttributeError: identifier = identifier.get("identifier") if total_ids > 1: item_index = f"{i + 1}/{total_ids}" else: item_index = None try: item = args.session.get_item(identifier) except Exception as exc: print(f"{identifier}: failed to retrieve item metadata - errors", file=sys.stderr) if "You are attempting to make an HTTPS" in str(exc): print(f"\n{exc}", file=sys.stderr) sys.exit(1) else: continue # Otherwise, download the entire item. ignore_history_dir = not args.download_history _errors = item.download( files=files, formats=args.format, glob_pattern=args.glob, exclude_pattern=args.exclude, dry_run=args.dry_run, verbose=not args.quiet, ignore_existing=args.ignore_existing, checksum=args.checksum, checksum_archive=args.checksum_archive, destdir=args.destdir, no_directory=args.no_directories, retries=args.retries, item_index=item_index, ignore_errors=True, on_the_fly=args.on_the_fly, no_change_timestamp=args.no_change_timestamp, params=args.parameters, ignore_history_dir=ignore_history_dir, source=args.source, exclude_source=args.exclude_source, stdout=args.stdout, timeout=args.timeout, ) if _errors: errors.append(_errors) if errors: # TODO: add option for a summary/report. sys.exit(1) else: sys.exit(0) python-internetarchive-5.7.2/internetarchive/cli/ia_flag.py000066400000000000000000000057361513674652200241610ustar00rootroot00000000000000""" ia_flag.py 'ia' subcommand for managing flags on archive.org. 
""" # Copyright (C) 2012-2025 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse def setup(subparsers): """Set up argument parser for the 'flag' subcommand. Args: subparsers: argparse subparsers object from main CLI """ parser = subparsers.add_parser( "flag", aliases=["fl"], help="Manage flags", ) parser.add_argument( "identifier", nargs="?", type=str, help="Identifier for the upload", ) parser.add_argument( "-u", "--user", type=str, help="User associated with the flag", ) group = parser.add_argument_group("Add flag operations") group.add_argument( "-a", "--add-flag", metavar="CATEGORY", type=str, help="Add a flag to the item", ) group = parser.add_argument_group("Delete flag operations") group.add_argument( "-d", "--delete-flag", metavar="CATEGORY", type=str, help="Delete a flag from the item", ) parser.set_defaults(func=lambda args: main(args, parser)) def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """Handle flag subcommand execution. Args: args: Parsed command-line arguments parser: Argument parser for error handling """ item = args.session.get_item(args.identifier) if args.user: flag_user = args.user else: flag_user = args.session.config.get("general", {}).get("screenname") if not flag_user.startswith('@'): flag_user = f"@{flag_user}" if args.add_flag: r = item.add_flag(args.add_flag, flag_user) j = r.json() if j.get("status") == "success": print(f"success: added '{args.add_flag}' flag by {flag_user} to {args.identifier}") else: print(f"error: {item.identifier} - {r.text}") elif args.delete_flag: r = item.delete_flag(args.delete_flag, flag_user) j = r.json() if j.get("status") == "success": print(f"success: deleted '{args.delete_flag}' flag by {flag_user} from {args.identifier}") else: print(f"error: {item.identifier} - {r.text}") else: r = item.get_flags() print(r.text) python-internetarchive-5.7.2/internetarchive/cli/ia_list.py000066400000000000000000000113201513674652200242050ustar00rootroot00000000000000""" ia_list.py 'ia' subcommand for listing files from archive.org items. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
import argparse import csv import sys from fnmatch import fnmatch from itertools import chain from internetarchive.cli.cli_utils import validate_identifier def setup(subparsers): """ Setup args for list command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("list", aliases=["ls"], help="list files from archive.org items") # Positional arguments parser.add_argument("identifier", type=validate_identifier, help="identifier of the item") # Options parser.add_argument("-v", "--verbose", action="store_true", help="print column headers") parser.add_argument("-a", "--all", action="store_true", help="list all information available for files") parser.add_argument("-l", "--location", action="store_true", help="print full URL for each file") parser.add_argument("-c", "--columns", action="append", type=prepare_columns, help="list specified file information") parser.add_argument("-g", "--glob", help="only return files matching the given pattern") parser.add_argument("-f", "--format", action="append", help="return files matching FORMAT") parser.set_defaults(func=main) def prepare_columns(columns): """ Validate the path to the configuration file. Returns: str: Validated list of columns """ if columns: if not isinstance(columns, list): columns = [columns] return list(chain.from_iterable([c.split(",") for c in columns])) return None def setup_columns(args, files): """ Setup and adjust columns for output based on args. """ if not args.columns: args.columns = ["name"] else: args.columns = list(chain.from_iterable(args.columns)) if args.all: args.columns = list(set(chain.from_iterable(k for k in files))) # Make "name" the first column always. if "name" in args.columns: args.columns.remove("name") args.columns.insert(0, "name") def filter_files(args, files, item): """ Filter files based on glob patterns or formats. """ if args.glob: patterns = args.glob.split("|") return [f for f in files if any(fnmatch(f["name"], p) for p in patterns)] if args.format: return [f.__dict__ for f in item.get_files(formats=args.format)] return files def generate_output(files, args, dict_writer, item): """ Generate and write output based on filtered files and columns. """ output = [] for f in files: file_dict = {} for key, val in f.items(): if key in args.columns: if isinstance(val, (list, tuple, set)): val = ";".join(val) if key == "name" and args.location: file_dict[key] = (f"https://{args.session.host}" f"/download/{item.identifier}/{val}") else: file_dict[key] = val output.append(file_dict) if args.verbose: dict_writer.writer.writerow(args.columns) if all(x == {} for x in output): sys.exit(1) dict_writer.writerows(output) def main(args: argparse.Namespace) -> None: """ Main entry point for 'ia list'. """ item = args.session.get_item(args.identifier) files = item.files setup_columns(args, files) files = filter_files(args, files, item) dict_writer = csv.DictWriter(sys.stdout, args.columns, delimiter="\t", lineterminator="\n") generate_output(files, args, dict_writer, item) python-internetarchive-5.7.2/internetarchive/cli/ia_metadata.py000066400000000000000000000323761513674652200250300ustar00rootroot00000000000000""" ia_metadata.py 'ia' subcommand for modifying and retrieving metadata from archive.org items. 
""" # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse import csv import sys from collections import defaultdict from copy import copy from typing import Mapping from requests import Request, Response from internetarchive import item from internetarchive.cli.cli_utils import MetadataAction, QueryStringAction from internetarchive.exceptions import ItemLocateError from internetarchive.utils import json def setup(subparsers): """ Setup args for metadata command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("metadata", aliases=["md"], help="Retrieve and modify archive.org item metadata") parser.add_argument("identifier", nargs="?", type=str, help="Identifier for the upload") # Mutually exclusive group for metadata modification options modify_group = parser.add_mutually_exclusive_group() modify_group.add_argument("-m", "--modify", nargs="+", action=MetadataAction, metavar="KEY:VALUE", help="Modify the metadata of an item") modify_group.add_argument("-r", "--remove", nargs="+", action=MetadataAction, metavar="KEY:VALUE", help="Remove KEY:VALUE from a metadata element") modify_group.add_argument("-a", "--append", nargs="+", action=MetadataAction, metavar="KEY:VALUE", help="Append a string to a metadata element") modify_group.add_argument("-A", "--append-list", nargs="+", action=MetadataAction, metavar="KEY:VALUE", help="Append a field to a metadata element") modify_group.add_argument("-i", "--insert", nargs="+", action=MetadataAction, metavar="KEY:VALUE", help=("Insert a value into a multi-value field given " "an index (e.g. 
`--insert=collection[0]:foo`)")) # Additional options parser.add_argument("-E", "--expect", nargs="+", action=MetadataAction, metavar="KEY:VALUE", help=("Test an expectation server-side before applying patch " "to item metadata")) parser.add_argument("-H", "--header", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="S3 HTTP headers to send with your request") parser.add_argument("-t", "--target", metavar="target", default="metadata", help="The metadata target to modify") parser.add_argument("-s", "--spreadsheet", metavar="metadata.csv", help="Modify metadata in bulk using a spreadsheet as input") parser.add_argument("-e", "--exists", action="store_true", help="Check if an item exists") parser.add_argument("-F", "--formats", action="store_true", help="Return the file-formats the given item contains") parser.add_argument("-p", "--priority", metavar="priority", help="Set the task priority") parser.add_argument("--timeout", metavar="value", help="Set a timeout for metadata writes") parser.add_argument("-R", "--reduced-priority", action="store_true", help="Submit task at a reduced priority.") parser.add_argument("-P", "--parameters", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="Parameters to send with your query.") parser.set_defaults(func=lambda args: main(args, parser)) def modify_metadata(item: item.Item, metadata: Mapping, args: argparse.Namespace, parser: argparse.ArgumentParser) -> Response: """ Modify metadata helper function. """ append = bool(args.append) append_list = bool(args.append_list) insert = bool(args.insert) try: r = item.modify_metadata(metadata, target=args.target, append=append, expect=args.expect, priority=args.priority, append_list=append_list, headers=args.header, insert=insert, reduced_priority=args.reduced_priority, timeout=args.timeout) except ItemLocateError as exc: print(f"{item.identifier} - error: {exc}", file=sys.stderr) sys.exit(1) except ValueError as exc: if "append to list" in str(exc): error_msg = ("cannot append string to list metadata with '--append'; " "use '--append-list' instead.") print(f"{item.identifier} - error: {error_msg}", file=sys.stderr) sys.exit(1) if isinstance(r, Request): # TODO: modify_metadata can return a Request object in some cases, # but it does NOT currently in the CLI. If that changes, i.e. if # debug is implemented, handle this here. This exception should # never be raised, but it keeps mypy happy. raise NotImplementedError("Request handling not yet implemented") if not r.json()["success"]: error_msg = r.json()["error"] etype = "warning" if "no changes" in r.text else "error" print(f"{item.identifier} - {etype} ({r.status_code}): {error_msg}", file=sys.stderr) return r print(f"{item.identifier} - success: {r.json()['log']}", file=sys.stderr) return r def remove_metadata(item: item.Item, metadata: Mapping, args: argparse.Namespace, parser: argparse.ArgumentParser) -> Response: """ Remove metadata helper function. 
""" md: dict[str, list | str] = defaultdict(list) for key in metadata: src_md = {} if args.target.startswith("files/"): for f in item.get_files(): if f.name == "/".join(args.target.split("/")[1:]): src_md = f.__dict__.get(key, {}) break else: src_md = copy(item.metadata.get(key, {})) if not src_md: continue if key == "collection": _col = copy(metadata[key]) _src_md = copy(src_md) if not isinstance(_col, list): _col = [_col] if not isinstance(_src_md, list): _src_md = [_src_md] # type: ignore for c in _col: if c not in _src_md: r = item.remove_from_simplelist(c, "holdings") j = r.json() if j.get("success"): print(f"{item.identifier} - success: {item.identifier} no longer in {c}", file=sys.stderr) sys.exit(0) elif j.get("error", "").startswith("no row to delete for"): print(f"{item.identifier} - success: {item.identifier} no longer in {c}", file=sys.stderr) sys.exit(0) else: print(f"{item.identifier} - error: {j.get('error')}", file=sys.stderr) sys.exit(1) if not isinstance(src_md, list): if key == "subject": if isinstance(src_md, str): src_md = src_md.split(";") elif key == "collection": print(f"{item.identifier} - error: all collections would be removed, " "not submitting task.", file=sys.stderr) sys.exit(1) if src_md == metadata[key]: md[key] = "REMOVE_TAG" continue for x in src_md: if isinstance(metadata[key], list): if x not in metadata[key]: md[key].append(x) # type: ignore else: if x != metadata[key]: md[key].append(x) # type: ignore if len(md[key]) == len(src_md): del md[key] if md.get("collection") == []: print(f"{item.identifier} - error: all collections would be removed, not submitting task.", file=sys.stderr) sys.exit(1) elif not md: print(f"{item.identifier} - warning: nothing needed to be removed.", file=sys.stderr) sys.exit(0) r = modify_metadata(item, md, args, parser) return r def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """ Main entry point for 'ia metadata'. """ formats = set() responses: list[bool | Response] = [] item = args.session.get_item(args.identifier, request_kwargs={'params': args.parameters}) # Check existence of item. if args.exists: if item.exists: responses.append(True) print(f"{args.identifier} exists", file=sys.stderr) else: responses.append(False) print(f"{args.identifier} does not exist", file=sys.stderr) if all(r is True for r in responses): sys.exit(0) else: sys.exit(1) # Modify metadata. elif (args.modify or args.append or args.append_list or args.remove or args.insert): # TODO: Find a better way to handle this. if args.modify: metadata = args.modify elif args.append: metadata = args.append elif args.append_list: metadata = args.append_list elif args.insert: metadata = args.insert if args.remove: metadata = args.remove if args.remove: responses.append(remove_metadata(item, metadata, args, parser)) else: responses.append(modify_metadata(item, metadata, args, parser)) if all(r.status_code == 200 for r in responses): # type: ignore sys.exit(0) else: for r in responses: assert isinstance(r, Response) if r.status_code == 200: continue # We still want to exit 0 if the non-200 is a # "no changes to xml" error. elif "no changes" in r.text: continue else: sys.exit(1) # Get metadata. elif args.formats: for f in item.get_files(): formats.add(f.format) print("\n".join(formats)) # Edit metadata for items in bulk, using a spreadsheet as input. 
elif args.spreadsheet: if not args.priority: args.priority = -5 with open(args.spreadsheet, newline="", encoding="utf-8-sig") as csvfp: spreadsheet = csv.DictReader(csvfp) responses = [] for row in spreadsheet: if not row["identifier"]: continue item = args.session.get_item(row["identifier"]) if row.get("file"): del row["file"] metadata = {k.lower(): v for k, v in row.items() if v} responses.append(modify_metadata(item, metadata, args, parser)) if all(r.status_code == 200 for r in responses): # type: ignore sys.exit(0) else: for r in responses: assert isinstance(r, Response) if r.status_code == 200: continue # We still want to exit 0 if the non-200 is a # "no changes to xml" error. elif "no changes" in r.text: continue else: sys.exit(1) # Dump JSON to stdout. else: metadata_str = json.dumps(item.item_metadata) print(metadata_str) python-internetarchive-5.7.2/internetarchive/cli/ia_move.py000066400000000000000000000073401513674652200242070ustar00rootroot00000000000000""" ia_move.py 'ia' subcommand for moving files on archive.org """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import argparse import sys from internetarchive.cli import ia_copy from internetarchive.cli.cli_utils import MetadataAction, QueryStringAction def setup(subparsers): """ Setup args for move command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("move", aliases=["mv"], help="Move and rename files in archive.org items") # Positional arguments parser.add_argument("source", metavar="SOURCE", help="Source file formatted as: identifier/file") parser.add_argument("destination", metavar="DESTINATION", help="Destination file formatted as: identifier/file") # Options parser.add_argument("-m", "--metadata", metavar="KEY:VALUE", nargs="+", action=MetadataAction, help=("Metadata to add to your new item, " "if you are moving the file to a new item")) parser.add_argument("-H", "--header", metavar="KEY:VALUE", nargs="+", action=QueryStringAction, default={}, help="S3 HTTP headers to send with your request") parser.add_argument("--replace-metadata", action="store_true", help=("Only use metadata specified as argument, do not copy any " "from the source item")) parser.add_argument("--ignore-file-metadata", action="store_true", help="Do not copy file metadata") parser.add_argument("-n", "--no-derive", action="store_true", help="Do not derive uploaded files") parser.add_argument("--no-backup", action="store_true", help=("Turn off archive.org backups, " "clobbered files will not be saved to 'history/files/$key.~N~'")) parser.set_defaults(func=lambda args: main(args, parser)) def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """ Main entry point for ia move command. """ # Add keep-old-version by default. if not args.header.get("x-archive-keep-old-version") and not args.no_backup: args.header["x-archive-keep-old-version"] = "1" # Call ia_copy. 
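    # A move is performed as two steps: the source file is copied to the destination
    # via the copy subcommand, and only if a source file object is returned is the
    # original then deleted with cascade_delete, which also removes its derivatives.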
_, src_file = ia_copy.main(args, cmd="move", parser=parser) if src_file: dr = src_file.delete(headers=args.header, cascade_delete=True) else: print(f"error: {src_file} does not exist", file=sys.stderr) sys.exit(1) if dr.status_code == 204: print(f"success: moved '{args.source}' to '{args.destination}'", file=sys.stderr) sys.exit(0) print(f"error: {dr.content}", file=sys.stderr) python-internetarchive-5.7.2/internetarchive/cli/ia_reviews.py000066400000000000000000000121131513674652200247170ustar00rootroot00000000000000""" ia_reviews.py 'ia' subcommand for listing, submitting, and deleting reviews for archive.org items. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import argparse import sys from requests.exceptions import HTTPError def setup(subparsers): """ Setup args for list command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("reviews", aliases=["re"], help="submit and modify reviews for archive.org items") # Positional arguments parser.add_argument("identifier", type=str, help="identifier of the item") # Options parser.add_argument("-d", "--delete", action="store_true", help="delete your review") parser.add_argument("-t", "--title", type=str, help="the title of your review") parser.add_argument("-b", "--body", type=str, help="the body of your review") parser.add_argument("-s", "--stars", type=int, help="the number of stars for your review") parser.add_argument("-i", "--index", action="store_true", help="Index a review") parser.add_argument("-n", "--noindex", action="store_true", help="Remove a review from the index") # Conditional arguments that require --delete delete_group = parser.add_argument_group("delete options", ("these options are used with " "the --delete flag")) delete_group.add_argument("-u", "--username", type=str, help="delete reviews for a specific user given USERNAME") delete_group.add_argument("-S", "--screenname", type=str, help="delete reviews for a specific user given SCREENNAME") delete_group.add_argument("-I", "--itemname", type=str, help="delete reviews for a specific user given ITEMNAME") parser.set_defaults(func=lambda args: main(args, parser)) def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """ Main entry point for 'ia reviews'. 
""" item = args.session.get_item(args.identifier) if args.index: r = item.index_review(username=args.username, screenname=args.screenname, itemname=args.itemname) if r.json().get("success"): print(f"{item.identifier} - success: review indexed", file=sys.stderr) sys.exit(0) elif args.noindex: r = item.noindex_review(username=args.username, screenname=args.screenname, itemname=args.itemname) if r.json().get("success"): print(f"{item.identifier} - success: review removed from index", file=sys.stderr) sys.exit(0) if args.delete: r = item.delete_review(username=args.username, screenname=args.screenname, itemname=args.itemname) elif not args.body and not args.title: try: r = item.get_review() print(r.text) sys.exit(0) except HTTPError as exc: if exc.response.status_code == 404: # type: ignore sys.exit(0) else: raise exc else: if (args.title and not args.body) or (args.body and not args.title): parser.error("both --title and --body must be provided") r = item.review(args.title, args.body, args.stars) j = r.json() if j.get("success") or "no change detected" in j.get("error", "").lower(): task_id = j.get("value", {}).get("task_id") if task_id: print((f"{item.identifier} - success: " f"https://catalogd.archive.org/log/{task_id}"), file=sys.stderr) else: print(f"{item.identifier} - warning: no changes detected!", file=sys.stderr) sys.exit(0) else: print(f"{item.identifier} - error: {j.get('error')}", file=sys.stderr) sys.exit(1) python-internetarchive-5.7.2/internetarchive/cli/ia_search.py000066400000000000000000000146451513674652200245140ustar00rootroot00000000000000""" ia_search.py 'ia' subcommand for searching items on archive.org. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse import sys from itertools import chain from requests.exceptions import ConnectTimeout, ReadTimeout from internetarchive.cli.cli_utils import FlattenListAction, QueryStringAction from internetarchive.exceptions import AuthenticationError from internetarchive.utils import json def setup(subparsers): """ Setup args for search command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("search", aliases=["se"], help="Search items on archive.org") # Positional arguments parser.add_argument("query", type=str, help="Search query or queries.") # Optional arguments parser.add_argument("-p", "--parameters", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="Parameters to send with your query.") parser.add_argument("-H", "--header", nargs="+", action=QueryStringAction, metavar="KEY:VALUE", help="Add custom headers to your search request.") parser.add_argument("-s", "--sort", action="append", help=("Sort search results by specified fields. " "See https://archive.org/advancedsearch.php " "for full list of sort values" " (e.g. 
--sort 'date desc', --sort 'date asc', etc.).")) parser.add_argument("-i", "--itemlist", action="store_true", help="Output identifiers only.") parser.add_argument("-f", "--field", nargs="+", action=FlattenListAction, help="Metadata fields to return.") parser.add_argument("-n", "--num-found", action="store_true", help="Print the number of results to stdout.") parser.add_argument("-F", "--fts", action="store_true", help="Beta support for querying the archive.org full text search API.") parser.add_argument("-D", "--dsl-fts", action="store_true", help="Submit --fts query in dsl.") parser.add_argument("-t", "--timeout", type=float, default=300, help="Set the timeout in seconds.") parser.set_defaults(func=lambda args: main(args, parser)) def prepare_values(value): """ Prepare comma-separated values based on the input value. """ if value: return list(chain.from_iterable([x.split(",") for x in value])) return None def perform_search(args, fields, sorts, r_kwargs): """ Perform the search using the provided arguments and request kwargs. """ return args.session.search_items(args.query, # type: ignore fields=fields, sorts=sorts, params=args.parameters, full_text_search=args.fts, dsl_fts=args.dsl_fts, request_kwargs=r_kwargs) def handle_search_results(args, search): """ Handle search results based on command-line arguments. """ if args.num_found: print(search.num_found) sys.exit(0) for result in search: if args.itemlist: if args.fts or args.dsl_fts: print("\n".join(result.get("fields", {}).get("identifier"))) else: print(result.get("identifier", "")) else: print(json.dumps(result)) if result.get("error"): sys.exit(1) def handle_value_error(exc): """ Handle ValueError exception. """ return f"error: {exc}" def handle_connect_timeout(): """ Handle ConnectTimeout exception. """ return "error: Request timed out. Increase the --timeout and try again." def handle_read_timeout(): """ Handle ReadTimeout exception. """ return "error: The server timed out and failed to return all search results, please try again" def handle_authentication_error(exc): """ Handle AuthenticationError exception. """ return f"error: {exc}" def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """ Main entry point for 'ia search'. """ try: # Prepare fields and sorts. fields = prepare_values(args.field) sorts = prepare_values(args.sort) # Prepare request kwargs. r_kwargs = { "headers": args.header, "timeout": args.timeout, } # Perform search. search = perform_search(args, fields, sorts, r_kwargs) # Handle search results. handle_search_results(args, search) except ValueError as exc: error_message = handle_value_error(exc) print(error_message, file=sys.stderr) sys.exit(1) except ConnectTimeout: error_message = handle_connect_timeout() print(error_message, file=sys.stderr) sys.exit(1) except ReadTimeout: error_message = handle_read_timeout() print(error_message, file=sys.stderr) sys.exit(1) except AuthenticationError as exc: error_message = handle_authentication_error(exc) print(error_message, file=sys.stderr) sys.exit(1) python-internetarchive-5.7.2/internetarchive/cli/ia_simplelists.py000066400000000000000000000105121513674652200256040ustar00rootroot00000000000000""" ia_simplelists.py 'ia' subcommand for managing simplelists on archive.org. 
""" # Copyright (C) 2012-2025 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import annotations import argparse import sys from internetarchive.utils import json def setup(subparsers): """Set up argument parser for the 'simplelists' subcommand. Args: subparsers: argparse subparsers object from main CLI """ parser = subparsers.add_parser("simplelists", aliases=["sl"], help="Manage simplelists") parser.add_argument( "identifier", nargs="?", type=str, help="Identifier for the upload" ) group = parser.add_argument_group("List operations") group.add_argument( "-p", "--list-parents", action="store_true", help="List parent lists for the given identifier" ) group.add_argument( "-c", "--list-children", action="store_true", help="List children in parent list" ) group.add_argument( "-l", "--list-name", type=str, help="Name of the list to operate on" ) group = parser.add_argument_group("Modification operations") group.add_argument( "-s", "--set-parent", metavar="PARENT", type=str, help="Add identifier to specified parent list" ) group.add_argument( "-n", "--notes", metavar="NOTES", type=str, help="Notes to attach to the list membership" ) group.add_argument( "-r", "--remove-parent", metavar="PARENT", type=str, help="Remove identifier from specified parent list" ) parser.set_defaults(func=lambda args: main(args, parser)) def submit_patch(patch, args): """Submit patch request to simplelists API""" data = {"-patch": json.dumps(patch), "-target": "simplelists"} url = f"{args.session.protocol}//{args.session.host}/metadata/{args.identifier}" return args.session.post(url, data=data) def _handle_patch_operation(args, parser, operation): """Handle set/delete patch operations for simplelists. :param operation: The patch operation type ('set' or 'delete') """ if not args.identifier: parser.error("Missing required identifier argument") if not args.list_name: parser.error("Must specify list name with -l/--list-name") patch = { "op": operation, "parent": args.set_parent or args.remove_parent, "list": args.list_name, } if args.notes: patch["notes"] = args.notes r = submit_patch(patch, args) try: r.raise_for_status() print(f"success: {args.identifier}") except Exception as e: print(f"error: {args.identifier} - {e!s}", file=sys.stderr) sys.exit(1) def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """Handle simplelists subcommand execution. 
Args: args: Parsed command-line arguments parser: Argument parser for error handling """ if args.list_parents: item = args.session.get_item(args.identifier) simplelists = item.item_metadata.get("simplelists") if simplelists: print(json.dumps(simplelists)) elif args.list_children: args.list_name = args.list_name or "catchall" query = f"simplelists__{args.list_name}:{args.identifier or '*'}" for result in args.session.search_items(query): print(json.dumps(result)) elif args.set_parent: _handle_patch_operation(args, parser, "set") elif args.remove_parent: _handle_patch_operation(args, parser, "delete") else: parser.print_help() sys.exit(1) python-internetarchive-5.7.2/internetarchive/cli/ia_tasks.py000066400000000000000000000152131513674652200243640ustar00rootroot00000000000000""" ia_tasks.py 'ia' subcommand for retrieving information about archive.org catalog tasks. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import argparse import sys import warnings from internetarchive.cli.cli_utils import PostDataAction, QueryStringAction from internetarchive.utils import json def setup(subparsers): """ Setup args for tasks command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("tasks", aliases=["ta"], help="Retrieve information about your archive.org catalog tasks") parser.add_argument("-t", "--task", nargs="*", help="Return information about the given task.") parser.add_argument("-G", "--get-task-log", help="Return the given tasks task log.") parser.add_argument("-p", "--parameter", nargs="+", action=QueryStringAction, default={}, metavar="KEY:VALUE", help="URL parameters passed to catalog.php.") parser.add_argument("-T", "--tab-output", action="store_true", help="Output task info in tab-delimited columns.") parser.add_argument("-c", "--cmd", type=str, help="The task to submit (e.g., make_dark.php).") parser.add_argument("-C", "--comment", type=str, help="A reasonable explanation for why a task is being submitted.") parser.add_argument("-a", "--task-args", nargs="+", action=QueryStringAction, default={}, metavar="KEY:VALUE", help="Args to submit to the Tasks API.") parser.add_argument("-d", "--data", nargs="+", action=PostDataAction, metavar="KEY:VALUE", default={}, help="Additional data to send when submitting a task.") parser.add_argument("-r", "--reduced-priority", action="store_true", help="Submit task at a reduced priority.") parser.add_argument("-l", "--get-rate-limit", action="store_true", help="Get rate limit info.") parser.add_argument("identifier", type=str, nargs="?", help="Identifier for tasks specific operations.") parser.set_defaults(func=lambda args: main(args, parser)) def handle_task_submission_result(result, cmd): """ Handle the result of a task submission. 
""" if result.get("success"): task_log_url = result.get("value", {}).get("log") print(f"success: {task_log_url}", file=sys.stderr) elif "already queued/running" in result.get("error", ""): print(f"success: {cmd} task already queued/running", file=sys.stderr) else: print(f"error: {result.get('error')}", file=sys.stderr) sys.exit(0 if result.get("success") else 1) def main(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None: """ Main entry point for 'ia tasks'. """ # Tasks write API. if args.cmd: if args.get_rate_limit: r = args.session.get_tasks_api_rate_limit(args.cmd) print(json.dumps(r)) sys.exit(0) args.data["args"] = args.task_args r = args.session.submit_task(args.identifier, args.cmd, comment=args.comment, priority=int(args.data.get("priority", 0)), reduced_priority=args.reduced_priority, data=args.data) handle_task_submission_result(r.json(), args.cmd) sys.exit(0) # Tasks read API. if args.identifier: _params = {"identifier": args.identifier, "catalog": 1, "history": 1} _params.update(args.parameter) args.parameter = _params elif args.get_task_log: log = args.session.get_task_log(args.get_task_log, **args.parameter) print(log.encode("utf-8", errors="surrogateescape") .decode("utf-8", errors="replace")) sys.exit(0) queryable_params = [ "identifier", "task_id", "server", "cmd", "args", "submitter", "priority", "wait_admin", "submittime", ] if not (args.identifier or args.parameter.get("task_id")): _params = {"catalog": 1, "history": 0} _params.update(args.parameter) args.parameter = _params if not any(x in args.parameter for x in queryable_params): _params = {"submitter": args.session.user_email, "catalog": 1, "history": 0, "summary": 0} _params.update(args.parameter) args.parameter = _params if args.tab_output: warn_msg = ("tab-delimited output will be removed in a future release. " "Please switch to the default JSON output.") warnings.warn(warn_msg, stacklevel=2) for t in args.session.get_tasks(params=args.parameter): # Legacy support for tab-delimited output. # Mypy is confused by CatalogTask members being created from kwargs if args.tab_output: color = t.color if t.color else "done" task_args = "\t".join([f"{k}={v}" for k, v in t.args.items()]) # type: ignore output = "\t".join([str(x) for x in [ t.identifier, t.task_id, t.server, t.submittime, t.cmd, color, t.submitter, task_args, ] if x]) print(output, flush=True) else: print(t.json(), flush=True) python-internetarchive-5.7.2/internetarchive/cli/ia_upload.py000066400000000000000000000335551513674652200245340ustar00rootroot00000000000000""" ia_upload.py 'ia' subcommand for uploading files to archive.org. """ # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
import argparse import csv import os import sys import webbrowser from copy import deepcopy from locale import getpreferredencoding from tempfile import TemporaryFile from typing import Union from requests.exceptions import HTTPError from internetarchive.cli.cli_utils import ( MetadataAction, QueryStringAction, get_args_dict, validate_identifier, ) from internetarchive.utils import ( InvalidIdentifierException, JSONDecodeError, is_valid_metadata_key, json, ) def setup(subparsers): """ Setup args for copy command. Args: subparsers: subparser object passed from ia.py """ parser = subparsers.add_parser("upload", aliases=["up"], help="Upload files to archive.org") # Positional arguments parser.add_argument("identifier", type=validate_identifier, nargs="?", default=None, help="Identifier for the upload") parser.add_argument("file", nargs="*", type=validate_file, help="File(s) to upload") # Options parser.add_argument("-q", "--quiet", action="store_true", help="Turn off ia's output") parser.add_argument("-d", "--debug", action="store_true", help=("Print S3 request parameters to stdout and exit without " "sending request")) parser.add_argument("-r", "--remote-name", help=("When uploading data from stdin, " "this option sets the remote filename")) parser.add_argument("-m", "--metadata", nargs="+", action=MetadataAction, metavar="KEY:VALUE", default={}, help="Metadata to add to your item") parser.add_argument("--spreadsheet", type=argparse.FileType("r", encoding="utf-8-sig"), help="Bulk uploading") parser.add_argument("--file-metadata", type=argparse.FileType("r"), help="Upload files with file-level metadata via a file_md.jsonl file") parser.add_argument("-H", "--header", nargs="+", action=QueryStringAction, default={}, help="S3 HTTP headers to send with your request") parser.add_argument("-c", "--checksum", action="store_true", help="Skip based on checksum") parser.add_argument("-v", "--verify", action="store_true", help="Verify that data was not corrupted traversing the network") parser.add_argument("-n", "--no-derive", action="store_true", help="Do not derive uploaded files") parser.add_argument("--size-hint", help="Specify a size-hint for your item") parser.add_argument("--delete", action="store_true", help="Delete files after verifying checksums") parser.add_argument("-R", "--retries", type=int, help="Number of times to retry request if S3 returns a 503 SlowDown error") parser.add_argument("-s", "--sleep", type=int, help="The amount of time to sleep between retries") parser.add_argument("--no-collection-check", action="store_true", help="Skip collection exists check") parser.add_argument("-o", "--open-after-upload", action="store_true", help="Open the details page for an item after upload") parser.add_argument("--no-backup", action="store_true", help="Turn off archive.org backups") parser.add_argument("--keep-directories", action="store_true", help="Keep directories in the supplied file paths for the remote filename") parser.add_argument("--status-check", action="store_true", help="Check if S3 is accepting requests to the given item") parser.set_defaults(func=lambda args: main(args, parser)) def _upload_files(item, files, upload_kwargs, prev_identifier=None): """ Helper function for calling :meth:`Item.upload` """ # Check if the list has any element. if not files: raise FileNotFoundError("No valid file was found. 
Check your paths.") responses = [] if (upload_kwargs["verbose"]) and (prev_identifier != item.identifier): print(f"{item.identifier}:", file=sys.stderr) try: response = item.upload(files, **upload_kwargs) responses += response except HTTPError as exc: responses += [exc.response] except InvalidIdentifierException as exc: print(str(exc), file=sys.stderr) sys.exit(1) finally: # Debug mode. if upload_kwargs["debug"]: for i, r in enumerate(responses): if i != 0: print("---", file=sys.stderr) headers = "\n".join( [f" {k}:{v}" for (k, v) in r.headers.items()] ) print(f"Endpoint:\n {r.url}\n", file=sys.stderr) print(f"HTTP Headers:\n{headers}", file=sys.stderr) return responses def uploading_from_stdin(args): """ Check if the user is uploading from stdin. """ if not args.file: return False elif len(args.file) == 1 and args.file[0] == "-": return True return False def check_if_file_arg_required(args, parser): required_if_no_file = [args.spreadsheet, args.file_metadata, args.status_check] if not args.file and not any(required_if_no_file): parser.error("You must specify a file to upload.") def validate_file(arg): if os.path.exists(arg) or arg == "-": return arg else: raise argparse.ArgumentTypeError(f"'{arg}' is not a valid file or directory") def main(args, parser): # noqa: PLR0912,C901 # TODO: Refactor to deal with PLR0912 and C901 # add type hints """ Main entry point for 'ia upload'. """ check_if_file_arg_required(args, parser) if uploading_from_stdin(args) and not args.remote_name: parser.error("When uploading from stdin, " "you must specify a remote filename with --remote-name") if args.status_check: # TODO: support for checking if a specific bucket is overloaded if args.session.s3_is_overloaded(): print(f"warning: {args.identifier} is over limit, and not accepting requests. " "Expect 503 SlowDown errors.", file=sys.stderr) sys.exit(1) else: print(f"success: {args.identifier} is accepting requests.", file=sys.stderr) sys.exit(0) elif args.identifier: item = args.session.get_item(args.identifier) # Prepare upload headers and kwargs if args.no_derive: queue_derive = False else: queue_derive = True if args.quiet: verbose = False else: verbose = True if args.size_hint: args.header["x-archive-size-hint"] = args.size_hint if not args.header.get("x-archive-keep-old-version") \ and not args.no_backup: args.header["x-archive-keep-old-version"] = "1" if args.file_metadata: try: with open(args.file_metadata.name) as fh: args.file = json.load(fh) except JSONDecodeError: args.file = [] with open(args.file_metadata.name) as fh: for line in fh: j = json.loads(line.strip()) args.file.append(j) upload_kwargs = { "metadata": args.metadata, "headers": args.header, "debug": args.debug, "queue_derive": queue_derive, "verbose": verbose, "verify": args.verify, "checksum": args.checksum, "retries": args.retries, "retries_sleep": args.sleep, "delete": args.delete, "validate_identifier": True, } # Upload files errors = False if not args.spreadsheet: if uploading_from_stdin(args): local_file = TemporaryFile() # sys.stdin normally has the buffer attribute which returns bytes. # However, this might not always be the case, e.g. on mocking for test purposes. # Fall back to reading as str and encoding back to bytes. # Note that the encoding attribute might also be None. In that case, fall back to # locale.getpreferredencoding, the default of io.TextIOWrapper and open(). 
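            # In either case stdin is drained in 1 MiB chunks into the TemporaryFile
            # created above, so the upload code receives an ordinary seekable file object.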
if hasattr(sys.stdin, "buffer"): def read(): return sys.stdin.buffer.read(1048576) else: encoding = sys.stdin.encoding or getpreferredencoding(False) def read(): return sys.stdin.read(1048576).encode(encoding) while True: data = read() if not data: break local_file.write(data) local_file.seek(0) else: local_file = args.file # Properly expand a period to the contents of the current working directory. if isinstance(local_file, str) and "." in local_file: local_file = [p for p in local_file if p != "."] local_file = os.listdir(".") + local_file if isinstance(local_file, (list, tuple, set)) and args.remote_name: local_file = local_file[0] if args.remote_name: files = {args.remote_name: local_file} elif args.keep_directories: files = {f: f for f in local_file} else: files = local_file for _r in _upload_files(item, files, upload_kwargs): if args.debug: break # Check if Response is empty first (i.e. --checksum) # TODO: Should upload return something other than an empty Response # object if checksum is set and the file is already in the item? if _r.status_code is None: pass elif not _r.ok: errors = True else: if args.open_after_upload: url = f"{args.session.protocol}//{args.session.host}/details/{item.identifier}" webbrowser.open_new_tab(url) # Bulk upload using spreadsheet. else: # Use the same session for each upload request. with args.spreadsheet as csvfp: spreadsheet = csv.DictReader(csvfp) prev_identifier = None for row in spreadsheet: for metadata_key in row: if not is_valid_metadata_key(metadata_key): print(f"error: '{metadata_key}' is not a valid metadata key.", file=sys.stderr) sys.exit(1) upload_kwargs_copy = deepcopy(upload_kwargs) if row.get("REMOTE_NAME"): local_file = {row["REMOTE_NAME"]: row["file"]} del row["REMOTE_NAME"] elif args.keep_directories: local_file = {row["file"]: row["file"]} else: local_file = row["file"] identifier = row.get("item", row.get("identifier")) if not identifier: if not prev_identifier: print("error: no identifier column on spreadsheet.", file=sys.stderr) sys.exit(1) identifier = prev_identifier del row["file"] if "identifier" in row: del row["identifier"] if "item" in row: del row["item"] item = args.session.get_item(identifier) # TODO: Clean up how indexed metadata items are coerced # into metadata. md_args = [f"{k.lower()}:{v}" for (k, v) in row.items() if v] metadata = get_args_dict(md_args) upload_kwargs_copy["metadata"].update(metadata) r = _upload_files(item, local_file, upload_kwargs_copy, prev_identifier) for _r in r: if args.debug: break if (not _r.status_code) or (not _r.ok): errors = True else: if args.open_after_upload: url = (f"{args.session.protocol}//{args.session.host}" "/details/{identifier}") webbrowser.open_new_tab(url) prev_identifier = identifier if errors: sys.exit(1) python-internetarchive-5.7.2/internetarchive/cli/py.typed000066400000000000000000000000001513674652200236770ustar00rootroot00000000000000python-internetarchive-5.7.2/internetarchive/config.py000066400000000000000000000167641513674652200233000ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.config ~~~~~~~~~~~~~~~~~~~~~~ :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ from __future__ import annotations import os from collections import defaultdict from configparser import RawConfigParser from time import sleep from typing import DefaultDict, Dict, Mapping import requests from internetarchive import auth from internetarchive.exceptions import AuthenticationError from internetarchive.utils import deep_update def get_auth_config(email: str, password: str, host: str = 'archive.org') -> dict: u = f'https://{host}/services/xauthn/' p = {'op': 'login'} d = {'email': email, 'password': password} r = requests.post(u, params=p, data=d, timeout=10) sleep(2) j = r.json() if not j.get('success'): try: msg = j['values']['reason'] except KeyError: msg = j['error'] if msg == 'account_not_found': msg = 'Account not found, check your email and try again.' elif msg == 'account_bad_password': msg = 'Incorrect password, try again.' else: msg = f'Authentication failed: {msg}' raise AuthenticationError(msg) auth_config = { 's3': { 'access': j['values']['s3']['access'], 'secret': j['values']['s3']['secret'], }, 'cookies': { 'logged-in-user': j['values']['cookies']['logged-in-user'], 'logged-in-sig': j['values']['cookies']['logged-in-sig'], }, 'general': { 'screenname': j['values']['screenname'], } } return auth_config def write_config_file(auth_config: Mapping, config_file=None): config_file, is_xdg, config = parse_config_file(config_file) # S3 Keys. access = auth_config.get('s3', {}).get('access') secret = auth_config.get('s3', {}).get('secret') config.set('s3', 'access', access) config.set('s3', 'secret', secret) # Cookies. cookies = auth_config.get('cookies', {}) config.set('cookies', 'logged-in-user', cookies.get('logged-in-user')) config.set('cookies', 'logged-in-sig', cookies.get('logged-in-sig')) # General. screenname = auth_config.get('general', {}).get('screenname') config.set('general', 'screenname', screenname) # Create directory if needed. config_directory = os.path.dirname(config_file) if is_xdg and not os.path.exists(config_directory): # os.makedirs does not apply the mode for intermediate directories since Python 3.7. # The XDG Base Dir spec requires that the XDG_CONFIG_HOME directory be created with mode 700. # is_xdg will be True iff config_file is ${XDG_CONFIG_HOME}/internetarchive/ia.ini. # So create grandparent first if necessary then parent to ensure both have the right mode. os.makedirs(os.path.dirname(config_directory), mode=0o700, exist_ok=True) os.mkdir(config_directory, 0o700) # Write config file. with open(config_file, 'w') as fh: os.chmod(config_file, 0o600) config.write(fh) return config_file def parse_config_file(config_file=None): config = RawConfigParser() is_xdg = False if not config_file: candidates = [] if os.environ.get('IA_CONFIG_FILE'): candidates.append(os.environ['IA_CONFIG_FILE']) xdg_config_home = os.environ.get('XDG_CONFIG_HOME') if not xdg_config_home or not os.path.isabs(xdg_config_home): # Per the XDG Base Dir specification, this should be $HOME/.config. Unfortunately, $HOME # does not exist on all systems. 
Therefore, we use ~/.config here. On a POSIX-compliant # system, where $HOME must always be set, the XDG spec will be followed precisely. xdg_config_home = os.path.join(os.path.expanduser('~'), '.config') xdg_config_file = os.path.join(xdg_config_home, 'internetarchive', 'ia.ini') candidates.append(xdg_config_file) candidates.append(os.path.join(os.path.expanduser('~'), '.config', 'ia.ini')) candidates.append(os.path.join(os.path.expanduser('~'), '.ia')) for candidate in candidates: if os.path.isfile(candidate): config_file = candidate break else: # None of the candidates exist, default to IA_CONFIG_FILE if set else XDG config_file = os.environ.get('IA_CONFIG_FILE', xdg_config_file) if config_file == xdg_config_file: is_xdg = True config.read(config_file) if not config.has_section('s3'): config.add_section('s3') config.set('s3', 'access', None) config.set('s3', 'secret', None) if not config.has_section('cookies'): config.add_section('cookies') config.set('cookies', 'logged-in-user', None) config.set('cookies', 'logged-in-sig', None) if config.has_section('general'): for k, _v in config.items('general'): if k in ['secure']: config.set('general', k, config.getboolean('general', k)) if not config.get('general', 'screenname'): config.set('general', 'screenname', None) else: config.add_section('general') config.set('general', 'screenname', None) return (config_file, is_xdg, config) def get_config(config=None, config_file=None) -> dict: _config = config or {} config_file, _is_xdg, config_parser = parse_config_file(config_file) # TODO: Use typing.TypedDict when we drop Python 3.8 support # to get rid of noqa: UP006 config_dict: DefaultDict[str, Dict[str, str]] = defaultdict(dict) # noqa: UP006 # Read from config file if it exists if os.path.isfile(config_file): for sec in config_parser.sections(): try: for k, v in config_parser.items(sec): if k is None or v is None: continue config_dict[sec][k] = v except TypeError: pass # Check environment variables and override S3 config if present env_access_key = os.environ.get('IA_ACCESS_KEY_ID') env_secret_key = os.environ.get('IA_SECRET_ACCESS_KEY') # Check if only one environment variable is set if (env_access_key and not env_secret_key) or (not env_access_key and env_secret_key): raise ValueError( "Both IA_ACCESS_KEY_ID and IA_SECRET_ACCESS_KEY environment variables " "must be set together, or neither should be set." ) if env_access_key and env_secret_key: config_dict['s3']['access'] = env_access_key config_dict['s3']['secret'] = env_secret_key # Recursive/deep update with passed config deep_update(config_dict, _config) return {k: v for k, v in config_dict.items() if v is not None} python-internetarchive-5.7.2/internetarchive/exceptions.py000066400000000000000000000037131513674652200242020ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
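# A minimal sketch of how these exceptions surface in practice (based on the CLI code
# in this package; the identifier below is only a placeholder):
#
#   from internetarchive import get_item
#   from internetarchive.exceptions import ItemLocateError
#
#   try:
#       get_item("some-item").modify_metadata({"title": "A New Title"})
#   except ItemLocateError as exc:
#       print(f"item is dark or does not exist: {exc}")
#
# AuthenticationError is raised by config.get_auth_config() when a login fails,
# and InvalidChecksumError is raised by File.download() when a resumed download
# does not match the file's expected MD5.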
""" internetarchive.exceptions ~~~~~~~~~~~~~~~~~~~~~~~~~~ :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ from typing import Dict, Optional class AuthenticationError(Exception): """Authentication Failed""" class ItemLocateError(Exception): def __init__(self, *args, **kwargs): default_message = "Item cannot be located because it is dark or does not exist." if args or kwargs: super().__init__(*args, **kwargs) else: super().__init__(default_message) class InvalidChecksumError(Exception): def __init__(self, *args, **kwargs): default_message = "File corrupt, checksums do not match." if args or kwargs: super().__init__(*args, **kwargs) else: super().__init__(default_message) class AccountAPIError(Exception): """Base exception for Account API-related errors.""" def __init__(self, message: str, error_data: Optional[Dict] = None): super().__init__(message) self.error_data = error_data class DirectoryTraversalError(Exception): """Raised when a computed local file path escapes the intended destination directory.""" python-internetarchive-5.7.2/internetarchive/files.py000066400000000000000000000511511513674652200231220ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.files ~~~~~~~~~~~~~~~~~~~~~ :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ import logging import os import socket import sys from contextlib import nullcontext, suppress from email.utils import parsedate_to_datetime from pathlib import Path from time import sleep from urllib.parse import quote from requests.exceptions import ( ConnectionError, ConnectTimeout, HTTPError, ReadTimeout, RetryError, ) from tqdm import tqdm from internetarchive import auth, exceptions, iarequest, utils log = logging.getLogger(__name__) class BaseFile: def __init__(self, item_metadata, name, file_metadata=None): if file_metadata is None: file_metadata = {} name = name.strip('/') if not file_metadata: for f in item_metadata.get('files', []): if f.get('name') == name: file_metadata = f break self.identifier = item_metadata.get('metadata', {}).get('identifier') self.name = name self.size = None self.source = None self.format = None self.md5 = None self.sha1 = None self.mtime = None self.crc32 = None self.exists = bool(file_metadata) for key in file_metadata: setattr(self, key, file_metadata[key]) # An additional, more orderly way to access file metadata, # which avoids filtering the attributes. self.metadata = file_metadata self.mtime = float(self.mtime) if self.mtime else 0 self.size = int(self.size) if self.size else 0 class File(BaseFile): """This class represents a file in an archive.org item. 
You can use this class to access the file metadata:: >>> import internetarchive >>> item = internetarchive.Item('stairs') >>> file = internetarchive.File(item, 'stairs.avi') >>> print(f.format, f.size) ('Cinepack', '3786730') Or to download a file:: >>> file.download() >>> file.download('fabulous_movie_of_stairs.avi') This class also uses IA's S3-like interface to delete a file from an item. You need to supply your IAS3 credentials in environment variables in order to delete:: >>> file.delete(access_key='Y6oUrAcCEs4sK8ey', ... secret_key='youRSECRETKEYzZzZ') You can retrieve S3 keys here: `https://archive.org/account/s3.php `__ """ def __init__(self, item, name, file_metadata=None): """ :type item: Item :param item: The item that the file is part of. :type name: str :param name: The filename of the file. :type file_metadata: dict :param file_metadata: (optional) a dict of metadata for the given file. """ super().__init__(item.item_metadata, name, file_metadata) self.item = item url_parts = { 'protocol': item.session.protocol, 'id': self.identifier, 'name': quote(name.encode('utf-8')), 'host': item.session.host, } self.url = '{protocol}//{host}/download/{id}/{name}'.format(**url_parts) if self.item.session.access_key and self.item.session.secret_key: self.auth = auth.S3Auth(self.item.session.access_key, self.item.session.secret_key) else: self.auth = None def __repr__(self): return (f'File(identifier={self.identifier!r}, ' f'filename={self.name!r}, ' f'size={self.size!r}, ' f'format={self.format!r})') def download( # noqa: C901,PLR0911,PLR0912,PLR0915 self, file_path=None, verbose=None, ignore_existing=None, checksum=None, checksum_archive=None, destdir=None, retries=None, ignore_errors=None, fileobj=None, return_responses=None, no_change_timestamp=None, params=None, chunk_size=None, stdout=None, ors=None, timeout=None, headers=None, ): """Download the file into the current working directory. :type file_path: str :param file_path: Download file to the given file_path. :type verbose: bool :param verbose: (optional) Turn on verbose output. :type ignore_existing: bool :param ignore_existing: Overwrite local files if they already exist. :type checksum: bool :param checksum: (optional) Skip downloading file based on checksum. :type checksum_archive: bool :param checksum_archive: (optional) Skip downloading file based on checksum, and skip checksum validation if it already succeeded (will create and use _checksum_archive.txt). :type destdir: str :param destdir: (optional) The directory to download files to. :type retries: int :param retries: (optional) The number of times to retry on failed requests. :type ignore_errors: bool :param ignore_errors: (optional) Don't fail if a single file fails to download, continue to download other files. :type fileobj: file-like object :param fileobj: (optional) Write data to the given file-like object (e.g. sys.stdout). :type return_responses: bool :param return_responses: (optional) Rather than downloading files to disk, return a list of response objects. :type no_change_timestamp: bool :param no_change_timestamp: (optional) If True, leave the time stamp as the current time instead of changing it to that given in the original archive. :type stdout: bool :param stdout: (optional) Print contents of file to stdout instead of downloading to file. :type ors: bool :param ors: (optional) Append a newline or $ORS to the end of file. This is mainly intended to be used internally with `stdout`. 
:type params: dict :param params: (optional) URL parameters to send with download request (e.g. `cnt=0`). :rtype: bool :returns: True if file was successfully downloaded. """ verbose = False if verbose is None else verbose ignore_existing = False if ignore_existing is None else ignore_existing checksum = False if checksum is None else checksum checksum_archive = False if checksum_archive is None else checksum_archive retries = retries or 2 ignore_errors = ignore_errors or False return_responses = return_responses or False no_change_timestamp = no_change_timestamp or False params = params or None timeout = 12 if not timeout else timeout headers = headers or {} retries_sleep = 3 # TODO: exponential sleep retrying = False # for retry loop self.item.session.mount_http_adapter(max_retries=retries) file_path = file_path or self.name if os.name == 'nt' and not return_responses: file_path, _ = utils.sanitize_windows_relpath( file_path, verbose=bool(verbose), printer=lambda m: print(m, file=sys.stderr), ) if destdir: if not (return_responses or stdout): try: os.makedirs(destdir, exist_ok=True) except OSError: pass if os.path.isfile(destdir): raise OSError(f'{destdir} is not a directory!') file_path = os.path.join(destdir, file_path) # Windows sanitization handled earlier; legacy comment removed. # Directory traversal guard (all platforms). Ensure target path is inside destdir # (or cwd if none provided). Determine intended base directory. intended_base = destdir if destdir else os.getcwd() try: if not utils.is_path_within_directory(intended_base, os.path.abspath(file_path)): raise exceptions.DirectoryTraversalError( f'Unsafe file path resolved outside destination directory: {file_path}' ) except AttributeError: # Fallback if DirectoryTraversalError not present (older versions); re-raise generic. raise OSError(f'Unsafe file path resolved outside destination directory: {file_path}') parent_dir = os.path.dirname(file_path) # Warn (not fail) if path length may cause Windows issues (>240 chars typical safe limit) if os.name == 'nt' and len(os.path.abspath(file_path)) > 240: log.warning('Long path may cause issues: %s', file_path) if verbose: print(f' warning: long path may cause issues: {file_path}', file=sys.stderr) # Check if we should skip... if not return_responses and os.path.exists(file_path.encode('utf-8')): if checksum_archive: checksum_archive_filename = '_checksum_archive.txt' if not os.path.exists(checksum_archive_filename): with open(checksum_archive_filename, 'w', encoding='utf-8') as f: pass with open(checksum_archive_filename, encoding='utf-8') as f: checksum_archive_data = f.read().splitlines() if file_path in checksum_archive_data: msg = ( f'skipping {file_path}, ' f'file already exists based on checksum_archive.' ) log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) return if ignore_existing: msg = f'skipping {file_path}, file already exists.' log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) return elif checksum or checksum_archive: with open(file_path, 'rb') as fp: md5_sum = utils.get_md5(fp) if md5_sum == self.md5: msg = f'skipping {file_path}, file already exists based on checksum.' 
log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) if checksum_archive: # add file to checksum_archive to skip it next time with open(checksum_archive_filename, 'a', encoding='utf-8') as f: f.write(f'{file_path}\n') return # Retry loop while True: try: if parent_dir != '' and not (return_responses or stdout): os.makedirs(parent_dir, exist_ok=True) if not return_responses \ and not ignore_existing \ and self.name != f'{self.identifier}_files.xml' \ and os.path.exists(file_path.encode('utf-8')): st = os.stat(file_path.encode('utf-8')) if st.st_size != self.size and not (checksum or checksum_archive): headers = {"Range": f"bytes={st.st_size}-"} response = self.item.session.get( self.url, stream=True, timeout=timeout, auth=self.auth, params=params, headers=headers, ) # Get timestamp from Last-Modified header last_mod_header = response.headers.get('Last-Modified') if last_mod_header: dt = parsedate_to_datetime(last_mod_header) last_mod_mtime = dt.timestamp() else: last_mod_mtime = self.mtime response.raise_for_status() # Check if we should skip based on last modified time... if not fileobj and not return_responses and os.path.exists(file_path.encode('utf-8')): st = os.stat(file_path.encode('utf-8')) if st.st_mtime == last_mod_mtime: if self.name == f'{self.identifier}_files.xml' or (st.st_size == self.size): msg = (f'skipping {file_path}, file already exists based on ' 'length and date.') log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) return elif return_responses: return response if verbose: total = int(response.headers.get('content-length', 0)) or None progress_bar = tqdm(desc=f' downloading {self.name}', total=total, unit='iB', unit_scale=True, unit_divisor=1024) else: progress_bar = nullcontext() if not chunk_size: chunk_size = 1048576 if stdout: fileobj = os.fdopen(sys.stdout.fileno(), 'wb', closefd=False) if not fileobj or retrying: if 'Range' in headers: fileobj = open(file_path.encode('utf-8'), 'rb+') else: fileobj = open(file_path.encode('utf-8'), 'wb') with fileobj, progress_bar as bar: if 'Range' in headers: fileobj.seek(st.st_size) for chunk in response.iter_content(chunk_size=chunk_size): if chunk: size = fileobj.write(chunk) if bar is not None: bar.update(size) if ors: fileobj.write(os.environ.get("ORS", "\n").encode("utf-8")) if 'Range' in headers: with open(file_path, 'rb') as fh: local_checksum = utils.get_md5(fh) try: assert local_checksum == self.md5 except AssertionError: msg = (f"\"{file_path}\" corrupt, " "checksums do not match. " "Remote file may have been modified, " "retry download.") os.remove(file_path.encode('utf-8')) raise exceptions.InvalidChecksumError(msg) break except (RetryError, HTTPError, ConnectTimeout, OSError, ReadTimeout, exceptions.InvalidChecksumError) as exc: if retries > 0: retrying = True retries -= 1 msg = ('download failed, sleeping for ' f'{retries_sleep} seconds and retrying. ' f'{retries} retries left.') log.warning(msg) sleep(retries_sleep) continue msg = f'error downloading file {file_path}, exception raised: {exc}' log.error(msg) try: os.remove(file_path) except OSError: pass if verbose: print(f' {msg}', file=sys.stderr) if ignore_errors: return False else: raise exc # Set mtime with timestamp from Last-Modified header if not no_change_timestamp: # If we want to set the timestamp to that of the original archive... with suppress(OSError): # Probably file-like object, e.g. sys.stdout. 
os.utime(file_path.encode('utf-8'), (0,last_mod_mtime)) msg = f'downloaded {self.identifier}/{self.name} to {file_path}' log.info(msg) return True def delete(self, cascade_delete=None, access_key=None, secret_key=None, verbose=None, debug=None, retries=None, headers=None): """Delete a file from the Archive. Note: Some files -- such as _meta.xml -- cannot be deleted. :type cascade_delete: bool :param cascade_delete: (optional) Delete all files associated with the specified file, including upstream derivatives and the original. :type access_key: str :param access_key: (optional) IA-S3 access_key to use when making the given request. :type secret_key: str :param secret_key: (optional) IA-S3 secret_key to use when making the given request. :type verbose: bool :param verbose: (optional) Print actions to stdout. :type debug: bool :param debug: (optional) Set to True to print headers to stdout and exit without sending the delete request. """ cascade_delete = '0' if not cascade_delete else '1' access_key = self.item.session.access_key if not access_key else access_key secret_key = self.item.session.secret_key if not secret_key else secret_key debug = debug or False verbose = verbose or False max_retries = retries or 2 headers = headers or {} if 'x-archive-cascade-delete' not in headers: headers['x-archive-cascade-delete'] = cascade_delete url = f'{self.item.session.protocol}//s3.us.archive.org/{self.identifier}/{quote(self.name)}' self.item.session.mount_http_adapter(max_retries=max_retries, status_forcelist=[503], host='s3.us.archive.org') request = iarequest.S3Request( method='DELETE', url=url, headers=headers, access_key=access_key, secret_key=secret_key ) if debug: return request else: if verbose: msg = f' deleting: {self.name}' if cascade_delete == '1': msg += ' and all derivative files.' print(msg, file=sys.stderr) prepared_request = self.item.session.prepare_request(request) try: resp = self.item.session.send(prepared_request) resp.raise_for_status() except (RetryError, HTTPError, ConnectTimeout, OSError, ReadTimeout) as exc: error_msg = f'Error deleting {url}, {exc}' log.error(error_msg) raise else: return resp finally: # The retry adapter is mounted to the session object. # Make sure to remove it after delete, so it isn't # mounted if and when the session object is used for an # upload. This is important because we use custom retry # handling for IA-S3 uploads. url_prefix = f'{self.item.session.protocol}//s3.us.archive.org' del self.item.session.adapters[url_prefix] class OnTheFlyFile(File): def __init__(self, item, name): """ :type item: Item :param item: The item that the file is part of. :type name: str :param name: The filename of the file. """ super().__init__(item.item_metadata, name) python-internetarchive-5.7.2/internetarchive/iarequest.py000066400000000000000000000463431513674652200240310ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. 
# # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.iarequest ~~~~~~~~~~~~~~~~~~~~~~~~~ :copyright: (C) 2012-2025 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ import copy import logging import re from urllib.parse import quote import requests import requests.models from jsonpatch import make_patch from internetarchive import __version__, auth from internetarchive.exceptions import ItemLocateError from internetarchive.utils import delete_items_from_dict, json, needs_quote logger = logging.getLogger(__name__) class S3Request(requests.models.Request): def __init__(self, metadata=None, file_metadata=None, queue_derive=True, access_key=None, secret_key=None, **kwargs): super().__init__(**kwargs) self.auth = self.auth or auth.S3Auth(access_key, secret_key) self.metadata = metadata or {} self.file_metadata = file_metadata or {} self.queue_derive = queue_derive def prepare(self): p = S3PreparedRequest() p.prepare( method=self.method, url=self.url, headers=self.headers, files=self.files, data=self.data, params=self.params, auth=self.auth, cookies=self.cookies, hooks=self.hooks, # S3Request kwargs. metadata=self.metadata, file_metadata=self.file_metadata, queue_derive=self.queue_derive, ) return p class S3PreparedRequest(requests.models.PreparedRequest): def prepare(self, method=None, url=None, headers=None, files=None, data=None, params=None, auth=None, cookies=None, hooks=None, queue_derive=None, metadata=None, file_metadata=None): self.prepare_method(method) self.prepare_url(url, params) self.prepare_headers(headers, metadata, file_metadata, queue_derive) self.prepare_cookies(cookies) self.prepare_body(data, files) self.prepare_auth(auth, url) # Note that prepare_auth must be last to enable authentication schemes # such as OAuth to work on a fully prepared request. # This MUST go after prepare_auth. 
Authenticators could add a hook self.prepare_hooks(hooks) def prepare_headers(self, headers, metadata, file_metadata, queue_derive): headers = headers.copy() if headers else {} metadata = metadata.copy() if metadata else {} file_metadata = file_metadata.copy() if file_metadata else {} prepared_metadata = prepare_metadata(metadata) prepared_file_metadata = prepare_metadata(file_metadata) headers.setdefault('x-archive-auto-make-bucket', '1') headers['x-archive-queue-derive'] = '0' if queue_derive is False else '1' self._add_metadata_headers(headers, prepared_metadata, 'meta') self._add_metadata_headers(headers, prepared_file_metadata, 'filemeta') super().prepare_headers(headers) def _add_metadata_headers(self, headers, prepared_metadata, meta_type): for key, values in prepared_metadata.items(): if not isinstance(values, list): values = [values] for idx, value in enumerate(values): if not value: continue header_key = f'x-archive-{meta_type}{idx:02d}-{key}'.replace('_', '--') if isinstance(value, str) and needs_quote(value): value = f'uri({quote(value)})' headers[header_key] = value class MetadataRequest(requests.models.Request): def __init__(self, metadata=None, source_metadata=None, target=None, priority=None, access_key=None, secret_key=None, append=None, expect=None, append_list=None, insert=None, reduced_priority=None, **kwargs): super().__init__(**kwargs) self.auth = self.auth or auth.S3PostAuth(access_key, secret_key) self.metadata = metadata or {} self.source_metadata = source_metadata self.target = target self.priority = priority self.append = append self.expect = expect or {} self.append_list = append_list self.insert = insert self.reduced_priority = reduced_priority def prepare(self): p = MetadataPreparedRequest() p.prepare( method=self.method, url=self.url, headers=self.headers, files=self.files, data=self.data, params=self.params, auth=self.auth, cookies=self.cookies, hooks=self.hooks, # MetadataRequest kwargs. metadata=self.metadata, priority=self.priority, source_metadata=self.source_metadata, target=self.target, append=self.append, expect=self.expect, append_list=self.append_list, insert=self.insert, reduced_priority=self.reduced_priority, ) return p class MetadataPreparedRequest(requests.models.PreparedRequest): def prepare(self, method=None, url=None, headers=None, files=None, data=None, params=None, auth=None, cookies=None, hooks=None, metadata=None, source_metadata=None, target=None, priority=None, append=None, expect=None, append_list=None, insert=None, reduced_priority=None): # First handle our custom headers if reduced_priority: headers = headers.copy() if headers else {} headers['X-Accept-Reduced-Priority'] = '1' # Now run full parent preparation super().prepare( method=method, url=url, headers=headers, files=files, data=data, params=params, auth=auth, cookies=cookies, hooks=hooks, ) # Now add our custom handling self.identifier = self.url.split('?')[0].split('/')[-1] self._prepare_request_body( metadata, source_metadata, target, priority, append, append_list, insert, expect, ) self.prepare_auth(auth, url) # Note that prepare_auth must be last to enable authentication schemes # such as OAuth to work on a fully prepared request. # This MUST go after prepare_auth. 
Authenticators could add a hook self.prepare_hooks(hooks) def _prepare_request_body(self, metadata, source_metadata, target, priority, append, append_list, insert, expect): if not source_metadata: r = requests.get(self.url, timeout=10) source_metadata = r.json() if self._is_multi_target(metadata): changes = self._prepare_multi_target_changes( metadata, source_metadata, target, append, expect, append_list, insert, ) self.data = {'-changes': json.dumps(changes), 'priority': priority or -5} else: self._prepare_single_target_body( metadata, source_metadata, target, append, append_list, insert, expect, priority, ) logger.debug(f'submitting metadata request: {self.data}') super().prepare_body(self.data, None) def _is_multi_target(self, metadata): return ( isinstance(metadata, list) or any('/' in k for k in metadata) or all(isinstance(v, dict) for v in metadata.values()) ) def _prepare_multi_target_changes(self, metadata, source_metadata, target, append, expect, append_list, insert): changes = [] if target: metadata = {target: metadata} for key in metadata: patch = self._get_patch_for_target( key, metadata[key], source_metadata, append, expect, append_list, insert, ) changes.append({'target': key, 'patch': patch}) return changes def _prepare_single_target_body(self, metadata, source_metadata, target, append, append_list, insert, expect, priority): target = target or 'metadata' if target == 'metadata': try: patch = prepare_patch( metadata, source_metadata['metadata'], append, expect, append_list, insert, ) except KeyError: raise ItemLocateError( f'{self.identifier} cannot be located ' 'because it is dark or does not exist.' ) elif target.startswith('files/'): patch = prepare_files_patch( metadata, source_metadata['files'], target, append, append_list, insert, expect, ) else: patch = prepare_target_patch( metadata, source_metadata, append, target, append_list, insert, expect, ) self.data = { '-patch': json.dumps(patch), '-target': target, 'priority': priority or -5, } def prepare_patch(metadata, source_metadata, append, expect=None, append_list=None, insert=None): destination = source_metadata.copy() if isinstance(metadata, list): prepared_metadata = metadata if not destination: destination = [] else: prepared_metadata = prepare_metadata( metadata, source_metadata, append, append_list, insert, ) if isinstance(destination, dict): destination.update(prepared_metadata) elif isinstance(metadata, list): destination = prepared_metadata else: if isinstance(prepared_metadata, list): destination = prepared_metadata else: destination = [prepared_metadata] destination = delete_items_from_dict(destination, 'REMOVE_TAG') patch = make_patch(source_metadata, destination).patch patch_tests = _create_patch_tests(expect) return patch_tests + patch def _create_patch_tests(expect): tests = [] for key, value in (expect or {}).items(): if '[' in key: parts = key.split('[') idx = int(parts[1].strip(']')) path = f'/{parts[0]}/{idx}' else: path = f'/{key}' tests.append({'op': 'test', 'path': path, 'value': value}) return tests def prepare_target_patch(metadata, source_metadata, append, target, append_list, insert, expect): def get_nested_value(data, parts): current = data for part in parts: if isinstance(current, list) and part.isdigit(): current = current[int(part)] else: current = current[part] return current key_parts = target.split('/') current_source = get_nested_value(source_metadata, key_parts) return prepare_patch( metadata, current_source, append, expect, append_list, insert, ) def prepare_files_patch(metadata, 
files_metadata, target, append, append_list, insert, expect): filename = target.split('/')[1] for file_meta in files_metadata: if file_meta.get('name') == filename: return prepare_patch( metadata, file_meta, append, expect, append_list, insert, ) return [] def prepare_metadata(metadata, source_metadata=None, append=False, append_list=False, insert=False): """ Normalize and merge metadata before building JSON Patch. Handles both plain key/value metadata and "indexed" keys like `subject[0]`, `subject[1]`, etc. that represent list elements. Args: metadata (dict): New metadata to apply. source_metadata (dict, optional): Existing metadata from the item. append (bool): If True, append values for existing keys (concatenate strings). append_list (bool): If True, append values to lists. insert (bool): If True, insert elements instead of overwriting. Returns: dict: Prepared metadata dictionary ready for patch generation. """ # Deep copy source to avoid mutating input source = copy.deepcopy(source_metadata) if source_metadata else {} prepared = {} # If using insert-mode but metadata has no indexed keys, # rewrite unindexed keys as [0]-indexed to normalize. if insert and not all(_is_indexed_key(k) for k in metadata): for k in list(metadata): if not _is_indexed_key(k): metadata[f"{k}[0]"] = metadata[k] _process_non_indexed_keys(metadata, source, prepared, append, append_list) indexed_keys = _process_indexed_keys(metadata, source, prepared, insert) return prepared def _process_non_indexed_keys(metadata, source, prepared, append, append_list): """ Process plain (non-indexed) metadata keys. Handles: - Numeric value conversion to strings. - String concatenation when `append` is True. - List extension when `append_list` is True. """ for key, value in metadata.items(): # Skip indexed keys; handled in _process_indexed_keys(). if _is_indexed_key(key): continue current_key = key if append_list and isinstance(source, dict) and source.get(current_key): existing = source[current_key] if not isinstance(existing, list): existing = [existing] prepared[current_key] = existing + [value] elif append and source.get(current_key): if isinstance(source[current_key], list): raise ValueError( "Cannot append to list metadata with 'append' flag; " "use 'append_list' instead.") prepared[current_key] = f'{source[current_key]} {value}' else: prepared[current_key] = value def _process_indexed_keys(metadata, source, prepared, insert): """ Process indexed metadata keys such as 'subject[0]', 'subject[1]', etc. Builds list values in `prepared` based on these indexed keys. Merges with any existing list data from `source`, optionally inserting new values when `insert=True` (otherwise existing values are overwritten at given index). Also filters out None and 'REMOVE_TAG' placeholders, which indicate that a list element should be deleted. Args: metadata (dict): Input metadata possibly containing indexed keys. source (dict): Existing metadata for the item. prepared (dict): Dict being built up by prepare_metadata(). insert (bool): If True, insert elements instead of overwriting. Returns: dict: Mapping of base keys to original list lengths (for reference). """ indexed_keys = {} # Track explicit indexes to delete (where value is REMOVE_TAG) remove_indexes = {} for key in list(metadata.keys()): # Skip non-indexed keys; handled in _process_non_indexed_keys(). 
if not _is_indexed_key(key): continue # Extract base key ('subject' from 'subject[2]') base = _get_base_key(key) # Extract list index (2 from 'subject[2]') idx = _get_index(key) if base not in indexed_keys: # Initialize this base key once per group of indexed keys. # Pull any existing list data from the source metadata. source_list = source.get(base, []) if not isinstance(source_list, list): source_list = [source_list] indexed_keys[base] = len(source_list) # Preallocate enough None slots to handle incoming indices. current_metadata_length = len(metadata) prepared[base] = source_list + [None] * ( current_metadata_length - len(source_list) ) # Ensure we're working with a list at this point. if not isinstance(prepared[base], list): prepared[base] = [prepared[base]] # Make sure list is long enough to hold this index. while len(prepared[base]) <= idx: prepared[base].append(None) # Track REMOVE_TAG for later deletion if metadata[key] == 'REMOVE_TAG': remove_indexes.setdefault(base, []).append(idx) prepared[base][idx] = None # Placeholder for now elif insert: # In "insert" mode, insert at index (shift others right), # and remove duplicates if value already exists. if metadata[key] in prepared[base]: prepared[base].remove(metadata[key]) prepared[base].insert(idx, metadata[key]) else: # Default mode: overwrite value at given index. prepared[base][idx] = metadata[key] # Cleanup lists: first remove explicit REMOVE_TAG indexes for base, indexes in remove_indexes.items(): for idx in sorted(indexes, reverse=True): if idx < len(prepared[base]): del prepared[base][idx] # Then remove any remaining None values from preallocation for base in prepared: if isinstance(prepared[base], list): prepared[base] = [v for v in prepared[base] if v is not None] return indexed_keys def _get_base_key(key): """Return the part of a metadata key before any [index] notation.""" return key.split('[')[0] def _is_indexed_key(key): """Return True if key includes [n] list indexing syntax.""" return '[' in key and ']' in key def _get_index(key): """Extract integer index from an indexed metadata key (e.g. 'subject[2]').""" match = re.search(r'(?<=\[)\d+(?=\])', key) return int(match.group()) if match else None python-internetarchive-5.7.2/internetarchive/item.py000066400000000000000000001706441513674652200227670ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.item ~~~~~~~~~~~~~~~~~~~~ :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. 
""" from __future__ import annotations import io import math import os import socket import sys from copy import deepcopy from datetime import datetime from fnmatch import fnmatch from functools import total_ordering from logging import getLogger from time import sleep from typing import Mapping, MutableMapping, Optional from urllib.parse import quote, urlparse from xml.parsers.expat import ExpatError from requests import Request, Response from requests.exceptions import HTTPError from tqdm import tqdm from internetarchive import catalog, exceptions from internetarchive.auth import S3Auth from internetarchive.files import File from internetarchive.iarequest import MetadataRequest, S3Request from internetarchive.utils import ( IdentifierListAsItems, IterableToFileAdapter, chunk_generator, get_file_size, get_md5, get_s3_xml_text, is_dir, iter_directory, json, norm_filepath, recursive_file_count_and_size, validate_s3_identifier, ) log = getLogger(__name__) @total_ordering class BaseItem: EXCLUDED_ITEM_METADATA_KEYS = ('workable_servers', 'server') def __init__( self, identifier: str | None = None, item_metadata: Mapping | None = None, ): # Default attributes. self.identifier = identifier self.item_metadata = item_metadata or {} self.exists = False # Archive.org metadata attributes. self.metadata: dict = {} self.files: list[dict] = [] self.created = None self.d1 = None self.d2 = None self.dir = None self.files_count = None self.item_size = None self.reviews: list = [] self.server = None self.uniq = None self.updated = None self.tasks = None self.is_dark = None # Load item. self.load() def __repr__(self) -> str: notloaded = ', item_metadata={}' if not self.exists else '' return f'{self.__class__.__name__}(identifier={self.identifier!r}{notloaded})' def load(self, item_metadata: Mapping | None = None) -> None: if item_metadata: self.item_metadata = item_metadata self.exists = bool(self.item_metadata) for key in self.item_metadata: setattr(self, key, self.item_metadata[key]) if not self.identifier: self.identifier = self.metadata.get('identifier') mc = self.metadata.get('collection', []) # TODO: The `type: ignore` on the following line should be removed. See #518 self.collection = IdentifierListAsItems(mc, self.session) # type: ignore def __eq__(self, other) -> bool: return (self.item_metadata == other.item_metadata or (self.item_metadata.keys() == other.item_metadata.keys() and all(self.item_metadata[x] == other.item_metadata[x] for x in self.item_metadata if x not in self.EXCLUDED_ITEM_METADATA_KEYS))) def __le__(self, other) -> bool: return self.identifier <= other.identifier def __hash__(self) -> int: without_excluded_keys = { k: v for k, v in self.item_metadata.items() if k not in self.EXCLUDED_ITEM_METADATA_KEYS} return hash(json.dumps(without_excluded_keys, sort_keys=True, check_circular=False)) # type: ignore class Item(BaseItem): """This class represents an archive.org item. Generally this class should not be used directly, but rather via the ``internetarchive.get_item()`` function:: >>> from internetarchive import get_item >>> item = get_item('stairs') >>> print(item.metadata) Or to modify the metadata for an item:: >>> metadata = {'title': 'The Stairs'} >>> item.modify_metadata(metadata) >>> print(item.metadata['title']) 'The Stairs' This class also uses IA's S3-like interface to upload files to an item. You need to supply your IAS3 credentials in environment variables in order to upload:: >>> item.upload('myfile.tar', access_key='Y6oUrAcCEs4sK8ey', ... 
secret_key='youRSECRETKEYzZzZ') True You can retrieve S3 keys here: `https://archive.org/account/s3.php `__ """ def __init__( self, archive_session, identifier: str, item_metadata: Mapping | None = None, ): """ :param archive_session: :class:`ArchiveSession ` :param identifier: The globally unique Archive.org identifier for this item. An identifier is composed of any unique combination of alphanumeric characters, underscore ( _ ) and dash ( - ). While there are no official limits it is strongly suggested that they be between 5 and 80 characters in length. Identifiers must be unique across the entirety of Internet Archive, not simply unique within a single collection. Once defined an identifier can not be changed. It will travel with the item or object and is involved in every manner of accessing or referring to the item. :param item_metadata: The Archive.org item metadata used to initialize this item. If no item metadata is provided, it will be retrieved from Archive.org using the provided identifier. """ self.session = archive_session super().__init__(identifier, item_metadata) self.urls = Item.URLs(self) if self.metadata.get('title'): # A copyable link to the item, in MediaWiki format details = self.urls.details # type: ignore self.wikilink = f'* [{details} {self.identifier}] -- {self.metadata["title"]}' class URLs: def __init__(self, itm_obj): self._itm_obj = itm_obj self._paths = [] self._make_URL('details') self._make_URL('metadata') self._make_URL('download') self._make_URL('history') self._make_URL('edit') self._make_URL('editxml') self._make_URL('manage') if self._itm_obj.metadata.get('mediatype') == 'collection': self._make_tab_URL('about') self._make_tab_URL('collection') def _make_tab_URL(self, tab: str) -> None: """Make URLs for the separate tabs of Collections details page.""" self._make_URL(tab, self.details + f'&tab={tab}') # type: ignore DEFAULT_URL_FORMAT = ('{0.session.protocol}//{0.session.host}' '/{path}/{0.identifier}') def _make_URL(self, path: str, url_format: str = DEFAULT_URL_FORMAT) -> None: setattr(self, path, url_format.format(self._itm_obj, path=path)) self._paths.append(path) def __str__(self) -> str: return f'URLs ({", ".join(self._paths)}) for {self._itm_obj.identifier}' def refresh(self, item_metadata: Mapping | None = None, **kwargs) -> None: if not item_metadata: item_metadata = self.session.get_metadata(self.identifier, **kwargs) self.load(item_metadata) def identifier_available(self) -> bool: """Check if the item identifier is available for creating a new item. :return: `True` if identifier is available, or `False` if it is not available. """ url = f'{self.session.protocol}//{self.session.host}/services/check_identifier.php' params = {'output': 'json', 'identifier': self.identifier} response = self.session.get(url, params=params) availability = response.json()['code'] return availability == 'available' def get_task_summary( self, params: Mapping | None = None, request_kwargs: Mapping | None = None, ) -> dict: """Get a summary of the item's pending tasks. :param params: Params to send with your request. :returns: A summary of the item's pending tasks. """ return self.session.get_tasks_summary(self.identifier, params, request_kwargs) def no_tasks_pending( self, params: Mapping | None = None, request_kwargs: Mapping | None = None, ) -> bool: """Check if there is any pending task for the item. :param params: Params to send with your request. :returns: `True` if no tasks are pending, otherwise `False`. 
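
        Usage (the identifier below is illustrative; the result depends on the
        item's current task queue)::

            >>> from internetarchive import get_item
            >>> item = get_item('nasa')
            >>> item.no_tasks_pending()
            True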
""" return all(x == 0 for x in self.get_task_summary(params, request_kwargs).values()) def get_all_item_tasks( self, params: dict | None = None, request_kwargs: Mapping | None = None, ) -> list[catalog.CatalogTask]: """Get a list of all tasks for the item, pending and complete. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :param request_kwargs: Keyword arguments that :py:func:`requests.get` takes. :returns: A list of all tasks for the item, pending and complete. """ params = params or {} params.update({'catalog': 1, 'history': 1}) return self.session.get_tasks(self.identifier, params, request_kwargs) def get_history( self, params: Mapping | None = None, request_kwargs: Mapping | None = None, ) -> list[catalog.CatalogTask]: """Get a list of completed catalog tasks for the item. :param params: Params to send with your request. :returns: A list of completed catalog tasks for the item. """ return list(self.session.iter_history(self.identifier, params, request_kwargs)) def get_catalog( self, params: Mapping | None = None, request_kwargs: Mapping | None = None, ) -> list[catalog.CatalogTask]: """Get a list of pending catalog tasks for the item. :param params: Params to send with your request. :returns: A list of pending catalog tasks for the item. """ return list(self.session.iter_catalog(self.identifier, params, request_kwargs)) def derive(self, priority: int = 0, remove_derived: str | None = None, reduced_priority: bool = False, data: MutableMapping | None = None, headers: Mapping | None = None, request_kwargs: Mapping | None = None) -> Response: """Derive an item. :param priority: Task priority from 10 to -10 [default: 0] :param remove_derived: You can use wildcards ("globs") to only remove *some* prior derivatives. For example, "*" (typed without the quotation marks) specifies that all derivatives (in the item's top directory) are to be rebuilt. "*.mp4" specifies that all "*.mp4" deriviatives are to be rebuilt. "{*.gif,*thumbs/*.jpg}" specifies that all GIF and thumbs are to be rebuilt. :param reduced_priority: Submit your derive at a lower priority. This option is helpful to get around rate-limiting. Your task will more likely be accepted, but it might not run for a long time. Note that you still may be subject to rate-limiting. :returns: :class:`requests.Response` """ data = data or {} if remove_derived is not None: if not data.get('args'): data['args'] = {'remove_derived': remove_derived} else: data['args'].update({'remove_derived': remove_derived}) r = self.session.submit_task(self.identifier, 'derive.php', priority=priority, data=data, headers=headers, reduced_priority=reduced_priority, request_kwargs=request_kwargs) r.raise_for_status() return r def fixer(self, ops: list | str | None = None, priority: int | str | None = None, reduced_priority: bool = False, data: MutableMapping | None = None, headers: Mapping | None = None, request_kwargs: Mapping | None = None) -> Response: """Submit a fixer task on an item. :param ops: The fixer operation(s) to run on the item [default: noop]. :param priority: The task priority. :param reduced_priority: Submit your derive at a lower priority. This option is helpful to get around rate-limiting. Your task will more likely be accepted, but it might not run for a long time. Note that you still may be subject to rate-limiting. This is different than ``priority`` in that it will allow you to possibly avoid rate-limiting. :param data: Additional parameters to submit with the task. 
:returns: :class:`requests.Response` """ data = data or {} ops = ops or ['noop'] if not isinstance(ops, (list, tuple, set)): ops = [ops] data['args'] = data.get('args') or {} for op in ops: data['args'][op] = '1' r = self.session.submit_task(self.identifier, 'fixer.php', priority=priority, data=data, headers=headers, reduced_priority=reduced_priority, request_kwargs=request_kwargs) r.raise_for_status() return r def undark(self, comment: str, priority: int | str | None = None, reduced_priority: bool = False, data: Mapping | None = None, request_kwargs: Mapping | None = None) -> Response: """Undark the item. :param comment: The curation comment explaining reason for undarking item :param priority: The task priority. :param reduced_priority: Submit your derive at a lower priority. This option is helpful to get around rate-limiting. Your task will more likely be accepted, but it might not run for a long time. Note that you still may be subject to rate-limiting. This is different than ``priority`` in that it will allow you to possibly avoid rate-limiting. :param data: Additional parameters to submit with the task. :returns: :class:`requests.Response` """ r = self.session.submit_task(self.identifier, 'make_undark.php', comment=comment, priority=priority, data=data, reduced_priority=reduced_priority, request_kwargs=request_kwargs) r.raise_for_status() return r # TODO: dark and undark have different order for data and reduced_pripoity def dark(self, comment: str, priority: int | str | None = None, data: Mapping | None = None, reduced_priority: bool = False, request_kwargs: Mapping | None = None) -> Response: """Dark the item. :param comment: The curation comment explaining reason for darking item :param priority: The task priority. :param reduced_priority: Submit your derive at a lower priority. This option is helpful to get around rate-limiting. Your task will more likely be accepted, but it might not run for a long time. Note that you still may be subject to rate-limiting. This is different than ``priority`` in that it will allow you to possibly avoid rate-limiting. :param data: Additional parameters to submit with the task. 
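
        :param request_kwargs: (optional) Keyword arguments to pass to the
                               underlying request (e.g. ``timeout``).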
:returns: :class:`requests.Response` """ r = self.session.submit_task(self.identifier, 'make_dark.php', comment=comment, priority=priority, data=data, reduced_priority=reduced_priority, request_kwargs=request_kwargs) r.raise_for_status() return r def get_review(self) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} a = S3Auth(self.session.access_key, self.session.secret_key) r = self.session.get(u, params=p, auth=a) r.raise_for_status() return r def index_review(self, username=None, screenname=None, itemname=None) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = {'noindex': '0'} if username: d['username'] = username elif screenname: d['screenname'] = screenname elif itemname: d['itemname'] = itemname a = S3Auth(self.session.access_key, self.session.secret_key) r = self.session.put(u, params=p, data=d, auth=a) r.raise_for_status() return r def noindex_review(self, username=None, screenname=None, itemname=None) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = {'noindex': '1'} if username: d['username'] = username elif screenname: d['screenname'] = screenname elif itemname: d['itemname'] = itemname a = S3Auth(self.session.access_key, self.session.secret_key) r = self.session.put(u, params=p, data=d, auth=a) r.raise_for_status() return r def delete_review(self, username=None, screenname=None, itemname=None) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = None if username: d = {'username': username} elif screenname: d = {'screenname': screenname} elif itemname: d = {'itemname': itemname} a = S3Auth(self.session.access_key, self.session.secret_key) r = self.session.delete(u, params=p, data=d, auth=a) r.raise_for_status() return r def review(self, title, body, stars=None) -> Response: u = f'{self.session.protocol}//{self.session.host}/services/reviews.php' p = {'identifier': self.identifier} d = {'title': title, 'body': body} if stars: d['stars'] = stars a = S3Auth(self.session.access_key, self.session.secret_key) r = self.session.post(u, params=p, data=json.dumps(d), auth=a) r.raise_for_status() return r def get_file(self, file_name: str, file_metadata: Mapping | None = None) -> File: """Get a :class:`File ` object for the named file. :param file_metadata: a dict of metadata for the given file. :returns: An :class:`internetarchive.File ` object. """ return File(self, file_name, file_metadata) def get_files(self, files: File | list[File] | None = None, formats: str | list[str] | None = None, glob_pattern: str | list[str] | None = None, exclude_pattern: str | list[str] | None = None, on_the_fly: bool = False): files = files or [] formats = formats or [] exclude_pattern = exclude_pattern or '' on_the_fly = bool(on_the_fly) if not isinstance(files, (list, tuple, set)): files = [files] if not isinstance(formats, (list, tuple, set)): formats = [formats] item_files = deepcopy(self.files) # Add support for on-the-fly files (e.g. EPUB). 
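        # These derivatives are not listed in the item's files.xml; the server can
        # generate them on request, so they are appended to the file list manually.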
if on_the_fly: otf_files = [ ('EPUB', f'{self.identifier}.epub'), ('MOBI', f'{self.identifier}.mobi'), ('DAISY', f'{self.identifier}_daisy.zip'), ('MARCXML', f'{self.identifier}_archive_marc.xml'), ] for format, file_name in otf_files: item_files.append({'name': file_name, 'format': format, 'otf': True}) if not any(k for k in [files, formats, glob_pattern]): for f in item_files: yield self.get_file(str(f.get('name')), file_metadata=f) for f in item_files: if f.get('name') in files: yield self.get_file(str(f.get('name'))) elif f.get('format') in formats: yield self.get_file(str(f.get('name'))) elif glob_pattern: if not isinstance(glob_pattern, list): patterns = glob_pattern.split('|') else: patterns = glob_pattern if not isinstance(exclude_pattern, list): exclude_patterns = exclude_pattern.split('|') else: exclude_patterns = exclude_pattern for p in patterns: if fnmatch(f.get('name', ''), p): if not any(fnmatch(f.get('name', ''), e) for e in exclude_patterns): yield self.get_file(str(f.get('name'))) # ruff: noqa: PLR0912 def download(self, files: File | list[File] | None = None, formats: str | list[str] | None = None, glob_pattern: str | None = None, exclude_pattern: str | None = None, dry_run: bool = False, verbose: bool = False, ignore_existing: bool = False, checksum: bool = False, checksum_archive: bool = False, destdir: str | None = None, no_directory: bool = False, retries: int | None = None, item_index: int | None = None, ignore_errors: bool = False, on_the_fly: bool = False, return_responses: bool = False, no_change_timestamp: bool = False, ignore_history_dir: bool = False, source: str | list[str] | None = None, exclude_source: str | list[str] | None = None, stdout: bool = False, params: Mapping | None = None, timeout: float | tuple[int, float] | None = None ) -> list[Request | Response]: """Download files from an item. :param files: Only download files matching given file names. :param formats: Only download files matching the given Formats. :param glob_pattern: Only download files matching the given glob pattern. :param exclude_pattern: Exclude files whose filename matches the given glob pattern. :param dry_run: Output download URLs to stdout, don't download anything. :param verbose: Turn on verbose output. :param ignore_existing: Skip files that already exist locally. :param checksum: Skip downloading file based on checksum. :param checksum_archive: Skip downloading file based on checksum, and skip checksum validation if it already succeeded (will create and use _checksum_archive.txt). :param destdir: The directory to download files to. :param no_directory: Download files to current working directory rather than creating an item directory. :param retries: The number of times to retry on failed requests. :param item_index: The index of the item for displaying progress in bulk downloads. :param ignore_errors: Don't fail if a single file fails to download, continue to download other files. :param on_the_fly: Download on-the-fly files (i.e. derivative EPUB, MOBI, DAISY files). :param return_responses: Rather than downloading files to disk, return a list of response objects. :param no_change_timestamp: If True, leave the time stamp as the current time instead of changing it to that given in the original archive. :param source: Filter files based on their source value in files.xml (i.e. `original`, `derivative`, `metadata`). :param exclude_source: Filter files based on their source value in files.xml (i.e. `original`, `derivative`, `metadata`). 
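
        :param stdout: Write the contents of the requested files to stdout
                       instead of saving them to disk.

        :param timeout: The number of seconds to wait for the server to send data
                        before giving up; passed through to each file download.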
        :param params: URL parameters to send with
                       download request (e.g. `cnt=0`).

        :param ignore_history_dir: Do not download any files from the history
                                   dir. This param defaults to ``False``.

        :returns: A list of :class:`requests.Response` objects if
                  ``return_responses`` is ``True``, otherwise a list of the
                  names of any files that failed to download (empty if all
                  files were downloaded successfully).
        """
        dry_run = bool(dry_run)
        verbose = bool(verbose)
        ignore_existing = bool(ignore_existing)
        ignore_errors = bool(ignore_errors)
        checksum = bool(checksum)
        checksum_archive = bool(checksum_archive)
        no_directory = bool(no_directory)
        return_responses = bool(return_responses)
        no_change_timestamp = bool(no_change_timestamp)
        ignore_history_dir = bool(ignore_history_dir)
        params = params or None

        if source:
            if not isinstance(source, list):
                source = [source]
        if exclude_source:
            if not isinstance(exclude_source, list):
                exclude_source = [exclude_source]

        if stdout:
            fileobj = os.fdopen(sys.stdout.fileno(), "wb", closefd=False)
            verbose = False
        else:
            fileobj = None

        if not dry_run:
            if item_index and verbose:
                print(f'{self.identifier} ({item_index}):', file=sys.stderr)
            elif item_index is None and verbose:
                print(f'{self.identifier}:', file=sys.stderr)

        if self.is_dark:
            msg = f'skipping {self.identifier}, item is dark'
            log.warning(msg)
            if verbose:
                print(f' {msg}', file=sys.stderr)
            return []
        elif self.metadata == {}:
            msg = f'skipping {self.identifier}, item does not exist.'
            log.warning(msg)
            if verbose:
                print(f' {msg}', file=sys.stderr)
            return []

        if files:
            files = self.get_files(files, on_the_fly=on_the_fly)
        else:
            files = self.get_files(on_the_fly=on_the_fly)
        if formats:
            files = self.get_files(formats=formats, on_the_fly=on_the_fly)
        if glob_pattern:
            files = self.get_files(
                glob_pattern=glob_pattern,
                exclude_pattern=exclude_pattern,
                on_the_fly=on_the_fly
            )
        if stdout:
            files = list(files)  # type: ignore

        errors = []
        downloaded = 0
        responses = []
        file_count = 0

        for f in files:  # type: ignore
            if ignore_history_dir is True:
                if f.name.startswith('history/'):
                    continue
            if source and not any(f.source == x for x in source):
                continue
            if exclude_source and any(f.source == x for x in exclude_source):
                continue
            file_count += 1
            if no_directory:
                path = f.name
            else:
                # Use forward slash as logical separator even on Windows so that
                # downstream sanitization treats backslashes inside remote filenames
                # as data.
                if os.name == 'nt':
                    path = f'{self.identifier}/{f.name}'
                else:
                    path = os.path.join(str(self.identifier), f.name)
            if dry_run:
                print(f.url)
                continue
            if stdout and file_count < len(files):  # type: ignore
                ors = True
            else:
                ors = False
            try:
                r = f.download(path, verbose, ignore_existing, checksum,
                               checksum_archive, destdir, retries, ignore_errors,
                               fileobj, return_responses, no_change_timestamp,
                               params, None, stdout, ors, timeout)
            except exceptions.DirectoryTraversalError as exc:  # type: ignore
                # Record error and continue; do not abort entire download batch.
                msg = f'error: {exc}'
                log.error(msg)
                # Always surface to stderr so user sees the skip.
                print(f' {msg}', file=sys.stderr)
                errors.append(f.name)
                continue
            if return_responses:
                responses.append(r)

            if r is False:
                errors.append(f.name)
            else:
                downloaded += 1

        if file_count == 0:
            msg = f'skipping {self.identifier}, no matching files found.'
log.info(msg) if verbose: print(f' {msg}', file=sys.stderr) return [] return responses if return_responses else errors def modify_metadata(self, metadata: Mapping, target: str | None = None, append: bool = False, expect: Mapping | None = None, append_list: bool = False, insert: bool = False, priority: int = 0, access_key: str | None = None, secret_key: str | None = None, debug: bool = False, headers: Mapping | None = None, reduced_priority: bool = False, request_kwargs: Mapping | None = None, timeout: float | None = None, refresh: bool = True) -> Request | Response: """Modify the metadata of an existing item on Archive.org. Note: The Metadata Write API does not yet comply with the latest Json-Patch standard. It currently complies with `version 02 `__. :param metadata: Metadata used to update the item. :param target: Set the metadata target to update. :param priority: Set task priority. :param append: Append value to an existing multi-value metadata field. :param expect: Provide a dict of expectations to be tested server-side before applying patch to item metadata. :param append_list: Append values to an existing multi-value metadata field. No duplicate values will be added. :param refresh: Refresh the item metadata after the request. :param reduced_priority: Submit your task at a lower priority. This option is helpful to get around rate-limiting. Your task will more likely be accepted, but it might not run for a long time. Note that you still may be subject to rate-limiting. :returns: A Request if debug else a Response. Usage:: >>> import internetarchive >>> item = internetarchive.Item('mapi_test_item1') >>> md = {'new_key': 'new_value', 'foo': ['bar', 'bar2']} >>> item.modify_metadata(md) """ append = bool(append) access_key = access_key or self.session.access_key secret_key = secret_key or self.session.secret_key debug = bool(debug) headers = headers or {} expect = expect or {} request_kwargs = request_kwargs or {} if timeout: request_kwargs["timeout"] = float(timeout) # type: ignore else: request_kwargs["timeout"] = 60 # type: ignore _headers = self.session.headers.copy() _headers.update(headers) url = f'{self.session.protocol}//{self.session.host}/metadata/{self.identifier}' # TODO: currently files and metadata targets do not support dict's, # but they might someday?? refactor this check. source_metadata = self.item_metadata request = MetadataRequest( method='POST', url=url, metadata=metadata, headers=_headers, source_metadata=source_metadata, target=target, priority=priority, access_key=access_key, secret_key=secret_key, append=append, expect=expect, append_list=append_list, insert=insert, reduced_priority=reduced_priority) # Must use Session.prepare_request to make sure session settings # are used on request! prepared_request = request.prepare() if debug: return prepared_request resp = self.session.send(prepared_request, **request_kwargs) # Re-initialize the Item object with the updated metadata. 
if refresh: self.refresh() return resp def delete_flag( self, category: str, user: str | None = None, ) -> Response: if user is None: user = f"@{self.session.config.get('general', {}).get('screenname')}" url = f'{self.session.protocol}//{self.session.host}/services/flags/admin.php' headers = {'Accept': 'text/json'} # must be text/json specifically params = {'identifier': self.identifier, 'category': category, 'user': user} r = self.session.delete(url, headers=headers, params=params) return r def add_flag( self, category: str, user: str | None = None, ) -> Response: if user is None: user = f"@{self.session.config.get('general', {}).get('screenname')}" url = f'{self.session.protocol}//{self.session.host}/services/flags/admin.php' headers = {'Accept': 'text/json'} # must be text/json specifically params = {'identifier': self.identifier, 'category': category, 'user': user} r = self.session.put(url, headers=headers, params=params) return r def get_flags(self) -> Response: url = f'{self.session.protocol}//{self.session.host}/services/flags/admin.php' headers = {'Accept': 'text/json'} # must be text/json specifically params = {'identifier': self.identifier} r = self.session.get(url, headers=headers, params=params) return r # TODO: `list` parameter name shadows the Python builtin def remove_from_simplelist(self, parent, list) -> Response: """Remove item from a simplelist. :returns: :class:`requests.Response` """ patch = { 'op': 'delete', 'parent': parent, 'list': list, } data = { '-patch': json.dumps(patch), '-target': 'simplelists', } r = self.session.post(self.urls.metadata, data=data) # type: ignore return r def upload_file(self, body, # noqa: PLR0915, C901 TODO: Refactor this method to reduce complexity key: str | None = None, metadata: Mapping | None = None, file_metadata: Mapping | None = None, headers: dict | None = None, access_key: str | None = None, secret_key: str | None = None, queue_derive: bool = False, verbose: bool = False, verify: bool = False, checksum: bool = False, delete: bool = False, retries: int | None = None, retries_sleep: int | None = None, debug: bool = False, validate_identifier: bool = False, request_kwargs: MutableMapping | None = None) -> Request | Response: """Upload a single file to an item. The item will be created if it does not exist. :type body: Filepath or file-like object. :param body: File or data to be uploaded. :param key: Remote filename. :param metadata: Metadata used to create a new item. :param file_metadata: File-level metadata to add to the files.xml entry for the file being uploaded. :param headers: Add additional IA-S3 headers to request. :param queue_derive: Set to False to prevent an item from being derived after upload. :param verify: Verify local MD5 checksum matches the MD5 checksum of the file received by IAS3. :param checksum: Skip based on checksum. :param delete: Delete local file after the upload has been successfully verified. :param retries: Number of times to retry the given request if S3 returns a 503 SlowDown error. :param retries_sleep: Amount of time to sleep between ``retries``. :param verbose: Print progress to stdout. :param debug: Set to True to print headers to stdout, and exit without sending the upload request. :param validate_identifier: Set to True to validate the identifier before uploading the file. Usage:: >>> import internetarchive >>> item = internetarchive.Item('identifier') >>> item.upload_file('/path/to/image.jpg', ... key='photos/image1.jpg') True """ # Set defaults. 
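        # Note: ``delete`` implies ``checksum`` (see below), so a local file is never
        # removed without its MD5 first being verified against the remote copy.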
headers = headers or {} metadata = metadata or {} file_metadata = file_metadata or {} access_key = access_key or self.session.access_key secret_key = secret_key or self.session.secret_key queue_derive = bool(queue_derive) verbose = bool(verbose) verify = bool(verify) delete = bool(delete) # Set checksum after delete. checksum = delete or checksum retries = retries or 0 retries_sleep = retries_sleep or 30 debug = bool(debug) validate_identifier = bool(validate_identifier) request_kwargs = request_kwargs or {} if 'timeout' not in request_kwargs: request_kwargs['timeout'] = 120 md5_sum = None _headers = self.session.headers.copy() _headers.update(headers) if not hasattr(body, 'read'): filename = body body = open(body, 'rb') else: filename = key or body.name size = get_file_size(body) # Support for uploading empty files. if size == 0: _headers['Content-Length'] = '0' if not _headers.get('x-archive-size-hint'): _headers['x-archive-size-hint'] = str(size) # Build IA-S3 URL. if validate_identifier: validate_s3_identifier(self.identifier or "") key = norm_filepath(filename).split('/')[-1] if key is None else key base_url = f'{self.session.protocol}//s3.us.archive.org/{self.identifier}' url = f'{base_url}/{quote(norm_filepath(key).lstrip("/").encode("utf-8"))}' # Skip based on checksum. if checksum: md5_sum = get_md5(body) ia_file = self.get_file(key) if (not self.tasks) and (ia_file) and (ia_file.md5 == md5_sum): log.info(f'{key} already exists: {url}') if verbose: print(f' {key} already exists, skipping.', file=sys.stderr) if delete: log.info( f'{key} successfully uploaded to ' f'https://archive.org/download/{self.identifier}/{key} ' 'and verified, deleting local copy') body.close() os.remove(filename) # Return an empty response object if checksums match. # TODO: Is there a better way to handle this? body.close() return Response() # require the Content-MD5 header when delete is True. if verify or delete: if not md5_sum: md5_sum = get_md5(body) _headers['Content-MD5'] = md5_sum def _build_request(): body.seek(0, os.SEEK_SET) if verbose: try: # hack to raise exception so we get some output for # empty files. if size == 0: raise Exception chunk_size = 1048576 expected_size = math.ceil(size / chunk_size) chunks = chunk_generator(body, chunk_size) progress_generator = tqdm(chunks, desc=f' uploading {key}', dynamic_ncols=True, total=expected_size, unit='MiB') data = None # pre_encode is needed because http doesn't know that it # needs to encode a TextIO object when it's wrapped # in the Iterator from tqdm. # So, this FileAdapter provides pre-encoded output data = IterableToFileAdapter( progress_generator, size, pre_encode=isinstance(body, io.TextIOBase) ) except Exception: print(f' uploading {key}', file=sys.stderr) data = body else: data = body request = S3Request(method='PUT', url=url, headers=_headers, data=data, metadata=metadata, file_metadata=file_metadata, access_key=access_key, secret_key=secret_key, queue_derive=queue_derive) return request if debug: prepared_request = self.session.prepare_request(_build_request()) body.close() return prepared_request else: try: first_try = True while True: error_msg = ('s3 is overloaded, sleeping for ' f'{retries_sleep} seconds and retrying. ' f'{retries} retries left.') if retries > 0 and not first_try: try: overloaded = self.session.s3_is_overloaded( access_key=access_key) except Exception as e: error_msg = ('error checking if s3 is overloaded via ' 's3.us.archive.org?check_limit=1, ' f'exception raised: "{e}". 
' f'sleeping for {retries_sleep} seconds and ' f'retrying. {retries} retries left.') overloaded = True if overloaded: sleep(retries_sleep) log.info(error_msg) if verbose: print(f' warning: {error_msg}', file=sys.stderr) retries -= 1 continue request = _build_request() prepared_request = request.prepare() # chunked transfer-encoding is NOT supported by IA-S3. # It should NEVER be set. Requests adds it in certain # scenarios (e.g. if content-length is 0). Stop it. if prepared_request.headers.get('transfer-encoding') == 'chunked': del prepared_request.headers['transfer-encoding'] response = self.session.send(prepared_request, stream=True, **request_kwargs) if (response.status_code == 503) and (retries > 0): if b'appears to be spam' in response.content: log.info('detected as spam, upload failed') break log.info(error_msg) if verbose: print(f' warning: {error_msg}', file=sys.stderr) sleep(retries_sleep) retries -= 1 first_try = False continue else: if response.status_code == 503: log.info('maximum retries exceeded, upload failed.') break response.raise_for_status() log.info(f'uploaded {key} to {url}') if delete and response.status_code == 200: log.info( f'{key} successfully uploaded to ' f'https://archive.org/download/{self.identifier}/{key} and verified, ' 'deleting local copy') body.close() os.remove(filename) response.close() return response except ConnectionResetError as exc: # Get connection info from thread-local storage conn_info = self.session.get_connection_info() # Extract connection details src_ip_port = conn_info.get('src', 'unknown') dst_ip_port = conn_info.get('dst', 'unknown') src_ip = conn_info.get('src_ip', 'unknown') src_port = conn_info.get('src_port', 'unknown') dst_ip = conn_info.get('dst_ip', 'unknown') dst_port = conn_info.get('dst_port', 'unknown') # Get other diagnostic info ip = "unknown" http_path = "unknown" connection_header_value = "unknown" pool_status = "unknown" try: # Parse URL for hostname and path parsed_url = urlparse(prepared_request.url) http_path = parsed_url.path # Use resolved destination IP if available if dst_ip and dst_ip != 'unknown': ip = dst_ip elif parsed_url.hostname: hostname = parsed_url.hostname ip = socket.gethostbyname(hostname) # Check what Connection header was actually sent connection_header_value = prepared_request.headers.get('Connection', 'not-set') # Check if urllib3 pooled the connection for this host adapter = self.session.get_adapter(prepared_request.url) if hasattr(adapter, 'poolmanager'): pool_key = adapter.poolmanager._get_pool_key(prepared_request.url, None) # noqa pool = adapter.poolmanager.pools.get(pool_key) if pool: pool_status = f"requests={pool.num_requests}" else: pool_status = "no-pool" else: pool_status = "no-poolmanager" except Exception: log.debug('error gathering diagnostic info for connection reset error, ' 'Raising original exception.') # Construct enhanced error message with clear diagnostic context error_msg = (f'Connection reset by peer while uploading {key} to ' f'{self.identifier} (src: {src_ip_port}, dst: {dst_ip_port}, ' f'path: {http_path}, UTC: {datetime.utcnow().isoformat()}, ' f'Connection: {connection_header_value}, Pool: {pool_status})') log.error(error_msg) if verbose: print(f' error: {error_msg}', file=sys.stderr) # Re-raise with enhanced message while preserving original traceback raise ConnectionResetError(error_msg) from exc except HTTPError as exc: try: msg = get_s3_xml_text(exc.response.content) # type: ignore except ExpatError: # probably HTTP 500 error and response is invalid XML msg = ('IA 
S3 returned invalid XML ' # type: ignore f'(HTTP status code {exc.response.status_code}). ' 'This is a server side error which is either temporary, ' 'or requires the intervention of IA admins.') error_msg = f' error uploading {key} to {self.identifier}, {msg}' log.error(error_msg) if verbose: print(f' error uploading {key}: {msg}', file=sys.stderr) # Raise HTTPError with error message. raise type(exc)(error_msg, response=exc.response, request=exc.request) finally: body.close() def upload(self, files, metadata: Mapping | None = None, headers: dict | None = None, access_key: str | None = None, secret_key: str | None = None, queue_derive=None, # TODO: True if None?? verbose: bool = False, verify: bool = False, checksum: bool = False, delete: bool = False, retries: int | None = None, retries_sleep: int | None = None, debug: bool = False, validate_identifier: bool = False, request_kwargs: dict | None = None) -> list[Request | Response]: r"""Upload files to an item. The item will be created if it does not exist. :type files: str, file, list, tuple, dict :param files: The filepaths or file-like objects to upload. :param \*\*kwargs: Optional arguments that :func:`Item.upload_file()` takes. :returns: A list of :class:`requests.Response` objects. Usage:: >>> import internetarchive >>> item = internetarchive.Item('identifier') >>> md = {'mediatype': 'image', 'creator': 'Jake Johnson'} >>> item.upload('/path/to/image.jpg', metadata=md, queue_derive=False) [] Uploading multiple files:: >>> r = item.upload(['file1.txt', 'file2.txt']) >>> r = item.upload([fileobj, fileobj2]) >>> r = item.upload(('file1.txt', 'file2.txt')) Uploading file objects: >>> import io >>> f = io.BytesIO(b'some initial binary data: \x00\x01') >>> r = item.upload({'remote-name.txt': f}) >>> f = io.BytesIO(b'some more binary data: \x00\x01') >>> f.name = 'remote-name.txt' >>> r = item.upload(f) *Note: file objects must either have a name attribute, or be uploaded in a dict where the key is the remote-name* Setting the remote filename with a dict:: >>> r = item.upload({'remote-name.txt': '/path/to/local/file.txt'}) """ queue_derive = True if queue_derive is None else queue_derive remote_dir_name = None total_files = 0 if isinstance(files, dict): if files.get('name'): files = [files] total_files = 1 else: files = list(files.items()) if not isinstance(files, (list, tuple)): files = [files] if all(isinstance(f, dict) and f.get('name') for f in files): total_files = len(files) responses = [] file_index = 0 headers = headers or {} if (queue_derive or not headers.get('x-archive-size-hint')) and total_files == 0: total_files, total_size = recursive_file_count_and_size(files, item=self, checksum=checksum) if not headers.get('x-archive-size-hint'): headers['x-archive-size-hint'] = str(total_size) file_metadata = None for f in files: if isinstance(f, dict): if f.get('name'): file_metadata = f.copy() del file_metadata['name'] f = f['name'] if ((isinstance(f, str) and is_dir(f)) or (isinstance(f, tuple) and is_dir(f[-1]))): if isinstance(f, tuple): remote_dir_name = f[0].strip('/') f = f[-1] for filepath, key in iter_directory(f): file_index += 1 # Set derive header if queue_derive is True, # and this is the last request being made. 
if queue_derive is True and file_index >= total_files: _queue_derive = True else: _queue_derive = False if not f.endswith('/'): if remote_dir_name: key = f'{remote_dir_name}{f}/{key}' else: key = f'{f}/{key}' elif remote_dir_name: key = f'{remote_dir_name}/{key}' key = norm_filepath(key) resp = self.upload_file(filepath, key=key, metadata=metadata, file_metadata=file_metadata, headers=headers, access_key=access_key, secret_key=secret_key, queue_derive=_queue_derive, verbose=verbose, verify=verify, checksum=checksum, delete=delete, retries=retries, retries_sleep=retries_sleep, debug=debug, validate_identifier=validate_identifier, request_kwargs=request_kwargs) responses.append(resp) else: file_index += 1 # Set derive header if queue_derive is True, # and this is the last request being made. # if queue_derive is True and file_index >= len(files): if queue_derive is True and file_index >= total_files: _queue_derive = True else: _queue_derive = False if not isinstance(f, (list, tuple)): key, body = (None, f) else: key, body = f if key and not isinstance(key, str): key = str(key) resp = self.upload_file(body, key=key, metadata=metadata, file_metadata=file_metadata, headers=headers, access_key=access_key, secret_key=secret_key, queue_derive=_queue_derive, verbose=verbose, verify=verify, checksum=checksum, delete=delete, retries=retries, retries_sleep=retries_sleep, debug=debug, validate_identifier=validate_identifier, request_kwargs=request_kwargs) responses.append(resp) return responses class Collection(Item): """This class represents an archive.org collection.""" def __init__(self, *args, **kwargs): self.searches = {} if isinstance(args[0], Item): orig = args[0] args = (orig.session, orig.identifier, orig.item_metadata) super().__init__(*args, **kwargs) if self.metadata.get('mediatype', 'collection') != 'collection': raise ValueError('mediatype is not "collection"!') deflt_srh = f'collection:{self.identifier}' self._make_search('contents', self.metadata.get('search_collection', deflt_srh)) self._make_search('subcollections', f'{deflt_srh} AND mediatype:collection') def _do_search(self, name: str, query: str): rtn = self.searches.setdefault( name, self.session.search_items(query, fields=['identifier'])) if not hasattr(self, f'{name}_count'): setattr(self, f'{name}_count', self.searches[name].num_found) return rtn.iter_as_items() def _make_search(self, name: str, query: str): setattr(self, name, lambda: self._do_search(name, query)) python-internetarchive-5.7.2/internetarchive/py.typed000066400000000000000000000000001513674652200231300ustar00rootroot00000000000000python-internetarchive-5.7.2/internetarchive/search.py000066400000000000000000000245311513674652200232670ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . 
""" internetarchive.search ~~~~~~~~~~~~~~~~~~~~~~ This module provides objects for interacting with the Archive.org search engine. :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ import itertools from logging import getLogger from requests.exceptions import ReadTimeout from internetarchive.auth import S3Auth log = getLogger(__name__) class Search: """This class represents an archive.org item search. You can use this class to search for Archive.org items using the advanced search engine. Usage:: >>> from internetarchive.session import ArchiveSession >>> from internetarchive.search import Search >>> s = ArchiveSession() >>> search = Search(s, '(uploader:jake@archive.org)') >>> for result in search: ... print(result['identifier']) """ def __init__(self, archive_session, query, fields=None, sorts=None, params=None, full_text_search=None, dsl_fts=None, request_kwargs=None, max_retries=None): params = params or {} self.session = archive_session self.dsl_fts = False if not dsl_fts else True if self.dsl_fts or full_text_search: self.fts = True else: self.fts = False self.query = query if self.fts and not self.dsl_fts: self.query = f'!L {self.query}' self.fields = fields or [] self.sorts = sorts or [] self.request_kwargs = request_kwargs or {} self._num_found = None self.fts_url = f'{self.session.protocol}//be-api.us.archive.org/ia-pub-fts-api' self.scrape_url = f'{self.session.protocol}//{self.session.host}/services/search/v1/scrape' self.search_url = f'{self.session.protocol}//{self.session.host}/advancedsearch.php' if self.session.access_key and self.session.secret_key: self.auth = S3Auth(self.session.access_key, self.session.secret_key) else: self.auth = None self.max_retries = max_retries if max_retries is not None else 5 # Initialize params. default_params = {'q': self.query} if 'page' not in params: if 'rows' in params: params['page'] = 1 else: default_params['count'] = 10000 else: default_params['output'] = 'json' # In the beta endpoint 'scope' was called 'index'. # Let's support both for a while. if 'index' in params: params['scope'] = params['index'] del params['index'] self.params = default_params.copy() self.params.update(params) # Set timeout. if 'timeout' not in self.request_kwargs: self.request_kwargs['timeout'] = 300 # Set retries. self.session.mount_http_adapter(max_retries=self.max_retries) def __repr__(self): return f'Search(query={self.query!r})' def __iter__(self): return self.iter_as_results() def _advanced_search(self): # Always return identifier. 
if 'identifier' not in self.fields: self.fields.append('identifier') for k, v in enumerate(self.fields): self.params[f'fl[{k}]'] = v for i, field in enumerate(self.sorts): self.params[f'sort[{i}]'] = field self.params['output'] = 'json' r = self.session.get(self.search_url, params=self.params, auth=self.auth, **self.request_kwargs) j = r.json() num_found = int(j.get('response', {}).get('numFound', 0)) if not self._num_found: self._num_found = num_found if j.get('error'): yield j yield from j.get('response', {}).get('docs', []) def _scrape(self): if self.fields: self.params['fields'] = ','.join(self.fields) if self.sorts: self.params['sorts'] = ','.join(self.sorts) i = 0 num_found = None while True: r = self.session.post(self.scrape_url, params=self.params, auth=self.auth, **self.request_kwargs) j = r.json() if j.get('error'): yield j if not num_found: num_found = int(j.get('total') or '0') if not self._num_found: self._num_found = num_found self._handle_scrape_error(j) self.params['cursor'] = j.get('cursor') for item in j['items']: i += 1 yield item if 'cursor' not in j: if i != num_found: raise ReadTimeout('The server failed to return results in the' f' allotted amount of time for {r.request.url}') break def _full_text_search(self): d = { 'q': self.query, 'size': '10000', 'from': '0', 'scroll': 'true', } if 'scope' in self.params: d['scope'] = self.params['scope'] if 'size' in self.params: d['scroll'] = False d['size'] = self.params['size'] while True: r = self.session.post(self.fts_url, json=d, auth=self.auth, **self.request_kwargs) j = r.json() scroll_id = j.get('_scroll_id') hits = j.get('hits', {}).get('hits') if not hits: return yield from hits if not hits or d['scroll'] is False: break d['scroll_id'] = scroll_id def _make_results_generator(self): if self.fts: return self._full_text_search() if 'user_aggs' in self.params: return self._user_aggs() elif 'page' in self.params: return self._advanced_search() else: return self._scrape() def _user_aggs(self): """Experimental support for user aggregations. """ del self.params['count'] # advanced search will error if this param is present! self.params['page'] = '1' self.params['rows'] = '1' self.params['output'] = 'json' r = self.session.get(self.search_url, params=self.params, auth=self.auth, **self.request_kwargs) j = r.json() if j.get('error'): yield j for agg in j.get('response', {}).get('aggregations', {}).items(): yield {agg[0]: agg[1]} @property def num_found(self): if not self._num_found: if not self.fts and 'page' in self.params: p = self.params.copy() p['output'] = 'json' r = self.session.get(self.search_url, params=p, auth=self.auth, **self.request_kwargs) j = r.json() num_found = int(j.get('response', {}).get('numFound', 0)) if not self._num_found: self._num_found = num_found elif not self.fts: p = self.params.copy() p['total_only'] = 'true' r = self.session.post(self.scrape_url, params=p, auth=self.auth, **self.request_kwargs) j = r.json() self._handle_scrape_error(j) self._num_found = j.get('total') else: self.params['q'] = self.query r = self.session.get(self.fts_url, params=self.params, auth=self.auth, **self.request_kwargs) j = r.json() self._num_found = j.get('hits', {}).get('total') return self._num_found def _handle_scrape_error(self, j): if 'error' in j: if all(s in j['error'].lower() for s in ['invalid', 'secret']): if not j['error'].endswith('.'): j['error'] += '.' 
raise ValueError(f"{j['error']} Try running 'ia configure' and retrying.") raise ValueError(j.get('error')) def _get_item_from_search_result(self, search_result): return self.session.get_item(search_result['identifier']) def iter_as_results(self): return SearchIterator(self, self._make_results_generator()) def iter_as_items(self): _map = map(self._get_item_from_search_result, self._make_results_generator()) return SearchIterator(self, _map) def __len__(self): return self.num_found class SearchIterator: """This class is an iterator wrapper for search results. It provides access to the underlying Search, and supports len() (since that is known initially).""" def __init__(self, search, iterator): self.search = search self.iterator = iterator def __len__(self): return self.search.num_found def __next__(self): return next(self.iterator) def __iter__(self): return self def __repr__(self): return f'{self.__class__.__name__}({self.search!r}, {self.iterator!r})' python-internetarchive-5.7.2/internetarchive/session.py000066400000000000000000000636011513674652200235060ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.session ~~~~~~~~~~~~~~~~~~~~~~~ This module provides an ArchiveSession object to manage and persist settings across the internetarchive package. :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ from __future__ import annotations import locale import logging import os import platform import socket import sys import threading import warnings from typing import Iterable, Mapping, MutableMapping from urllib.parse import unquote, urlparse import requests import requests.sessions from requests import Response from requests.adapters import HTTPAdapter from requests.cookies import create_cookie from requests.exceptions import RequestException from requests.utils import default_headers from urllib3 import Retry from internetarchive import __version__, auth, catalog from internetarchive.config import get_config from internetarchive.item import Collection, Item from internetarchive.search import Search from internetarchive.utils import parse_dict_cookies, reraise_modify logger = logging.getLogger(__name__) class ArchiveSession(requests.sessions.Session): """The :class:`ArchiveSession ` object collects together useful functionality from `internetarchive` as well as important data such as configuration information and credentials. It is subclassed from :class:`requests.Session `. 
Usage:: >>> from internetarchive import ArchiveSession >>> s = ArchiveSession() >>> item = s.get_item('nasa') Collection(identifier='nasa', exists=True) """ ITEM_MEDIATYPE_TABLE = { # noqa: RUF012 'collection': Collection, } def __init__(self, config: Mapping | None = None, config_file: str = "", debug: bool = False, http_adapter_kwargs: MutableMapping | None = None): """Initialize :class:`ArchiveSession ` object with config. :param config: A config dict used for initializing the :class:`ArchiveSession ` object. Supports the following keys in the ``general`` section: - ``user_agent_suffix``: Custom string to append to the default User-Agent. The default (including access key) is always sent. - ``secure``: Use HTTPS (default: True). - ``host``: Host to connect to (default: archive.org). :param config_file: Path to config file used for initializing the :class:`ArchiveSession ` object. :param http_adapter_kwargs: Keyword arguments used to initialize the :class:`requests.adapters.HTTPAdapter ` object. :returns: :class:`ArchiveSession` object. """ super().__init__() http_adapter_kwargs = http_adapter_kwargs or {} debug = bool(debug) self.config = get_config(config, config_file) self.config_file = config_file for ck, cv in self.config.get('cookies', {}).items(): raw_cookie = f'{ck}={cv}' cookie_dict = parse_dict_cookies(raw_cookie) if not cookie_dict.get(ck): continue cookie = create_cookie(ck, cookie_dict[ck], domain=cookie_dict.get('domain', '.archive.org'), path=cookie_dict.get('path', '/')) self.cookies.set_cookie(cookie) self.secure: bool = self.config.get('general', {}).get('secure', True) self.host: str = self.config.get('general', {}).get('host', 'archive.org') if 'archive.org' not in self.host: self.host += '.archive.org' self.protocol = 'https:' if self.secure else 'http:' user_email = self.config.get('cookies', {}).get('logged-in-user') if user_email: user_email = user_email.split(';')[0] user_email = unquote(user_email) self.user_email: str = user_email self.access_key: str = self.config.get('s3', {}).get('access') self.secret_key: str = self.config.get('s3', {}).get('secret') self.http_adapter_kwargs: MutableMapping = http_adapter_kwargs or {} self.headers = default_headers() # type: ignore[assignment] default_user_agent = self._get_user_agent_string() user_agent_suffix = self.config.get('general', {}).get('user_agent_suffix') if user_agent_suffix: self.headers.update({'User-Agent': f'{default_user_agent} {user_agent_suffix}'}) else: self.headers.update({'User-Agent': default_user_agent}) self.headers.update({'Connection': 'close'}) self.mount_http_adapter() logging_config = self.config.get('logging', {}) if logging_config.get('level'): if logging_config.get('log_to_stdout'): self.set_stream_logger(logging_config.get('level', 'NOTSET')) if debug or (logger.level <= 10): self.set_stream_logger(logging_config.get('level', 'NOTSET'), "urllib3") else: self.set_file_logger(logging_config.get('level', 'NOTSET'), logging_config.get('file', 'internetarchive.log')) if debug or (logger.level <= 10): self.set_file_logger(logging_config.get('level', 'NOTSET'), logging_config.get('file', 'internetarchive.log'), 'urllib3') # Thread-local storage for connection info self._connection_info_local = threading.local() # Monkey-patch socket.connect self._original_connect = socket.socket.connect def instrumented_connect(sock, address): result = self._original_connect(sock, address) try: src_ip, src_port = sock.getsockname() dst_ip, dst_port = address self._connection_info_local.info = { 'src': 
f"{src_ip}:{src_port}", 'dst': f"{dst_ip}:{dst_port}", 'src_ip': src_ip, 'src_port': src_port, 'dst_ip': dst_ip, 'dst_port': dst_port } except Exception: self._connection_info_local.info = {} return result socket.socket.connect = instrumented_connect # type: ignore[method-assign] def get_connection_info(self): """Get connection info for current thread""" return getattr(self._connection_info_local, 'info', {}) def _get_user_agent_string(self) -> str: """Generate a User-Agent string to be sent with every request.""" uname = platform.uname() try: lang = locale.getlocale()[0][:2] # type: ignore except Exception: lang = '' py_version = '{}.{}.{}'.format(*sys.version_info) return (f'internetarchive/{__version__} ' f'({uname[0]} {uname[-1]}; N; {lang}; {self.access_key}) ' f'Python/{py_version}') def rebuild_auth(self, prepared_request, response): """Never rebuild auth for archive.org URLs. """ u = urlparse(prepared_request.url) if u.netloc.endswith('archive.org'): return super().rebuild_auth(prepared_request, response) def mount_http_adapter(self, protocol: str | None = None, max_retries: int | None = None, status_forcelist: list | None = None, host: str | None = None) -> None: """Mount an HTTP adapter to the :class:`ArchiveSession ` object. :param protocol: HTTP protocol to mount your adapter to (e.g. 'https://'). :param max_retries: The number of times to retry a failed request. This can also be an `urllib3.Retry` object. :param status_forcelist: A list of status codes (as int's) to retry on. :param host: The host to mount your adapter to. """ protocol = protocol or self.protocol host = host or 'archive.org' if max_retries is None: max_retries = self.http_adapter_kwargs.get('max_retries', 3) status_forcelist = status_forcelist or [429, 500, 501, 502, 503, 504] if max_retries and isinstance(max_retries, (int, float)): self.http_adapter_kwargs['max_retries'] = Retry( total=max_retries, connect=max_retries, read=max_retries, redirect=False, allowed_methods=["POST", "HEAD", "GET", "OPTIONS"], status_forcelist=status_forcelist, backoff_factor=1, respect_retry_after_header=True ) else: self.http_adapter_kwargs['max_retries'] = max_retries max_retries_adapter = HTTPAdapter(**self.http_adapter_kwargs) # Don't mount on s3.us.archive.org, only archive.org! # IA-S3 requires a more complicated retry workflow. self.mount(f'{protocol}//{host}', max_retries_adapter) def set_file_logger( self, log_level: str, path: str, logger_name: str = 'internetarchive' ) -> None: """Convenience function to quickly configure any level of logging to a file. :param log_level: A log level as specified in the `logging` module. :param path: Path to the log file. The file will be created if it doesn't already exist. :param logger_name: The name of the logger. """ _log_level = { 'CRITICAL': 50, 'ERROR': 40, 'WARNING': 30, 'INFO': 20, 'DEBUG': 10, 'NOTSET': 0, } log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' _log = logging.getLogger(logger_name) _log.setLevel(logging.DEBUG) fh = logging.FileHandler(path, encoding='utf-8') fh.setLevel(_log_level[log_level]) formatter = logging.Formatter(log_format) fh.setFormatter(formatter) _log.addHandler(fh) def set_stream_logger( self, log_level: str, logger_name: str = 'internetarchive' ) -> None: """Convenience function to quickly configure any level of logging to a stream (stdout). :param log_level: A log level as specified in the `logging` module. :param logger_name: The name of the logger. 
""" _log_level = { 'CRITICAL': 50, 'ERROR': 40, 'WARNING': 30, 'INFO': 20, 'DEBUG': 10, 'NOTSET': 0, } log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' _log = logging.getLogger(logger_name) _log.setLevel(logging.DEBUG) sh = logging.StreamHandler(stream=sys.stdout) sh.setLevel(_log_level[log_level]) formatter = logging.Formatter(log_format) sh.setFormatter(formatter) _log.addHandler(sh) def whoami(self) -> str: """Return the logged-in user email address. :returns: The logged-in user email address. """ u = 'https://archive.org/services/user.php' p = {'op': 'whoami'} # Do not use self/Session.get() here, # to make sure S3 keys are used for validation -- not cookies. r = requests.get(u, params=p, auth=auth.S3Auth(self.access_key, self.secret_key), timeout=12) return r.json() def get_item(self, identifier: str, item_metadata: Mapping | None = None, request_kwargs: MutableMapping | None = None): """A method for creating :class:`internetarchive.Item ` and :class:`internetarchive.Collection ` objects. :param identifier: A globally unique Archive.org identifier. :param item_metadata: A metadata dict used to initialize the Item or Collection object. Metadata will automatically be retrieved from Archive.org if nothing is provided. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.get` request. """ request_kwargs = request_kwargs or {} if not item_metadata: logger.debug(f'no metadata provided for "{identifier}", retrieving now.') item_metadata = self.get_metadata(identifier, request_kwargs) or {} mediatype = item_metadata.get('metadata', {}).get('mediatype') try: item_class = self.ITEM_MEDIATYPE_TABLE.get(mediatype, Item) except TypeError: item_class = Item return item_class(self, identifier, item_metadata) def get_metadata(self, identifier: str, request_kwargs: MutableMapping | None = None): """Get an item's metadata from the `Metadata API `__ :param identifier: Globally unique Archive.org identifier. :returns: Metadata API response. """ request_kwargs = request_kwargs or {} url = f'{self.protocol}//{self.host}/metadata/{identifier}' if 'timeout' not in request_kwargs: request_kwargs['timeout'] = 12 try: if self.access_key and self.secret_key: s3_auth = auth.S3Auth(self.access_key, self.secret_key) else: s3_auth = None resp = self.get(url, auth=s3_auth, **request_kwargs) resp.raise_for_status() except Exception as exc: error_msg = f'Error retrieving metadata from {url}, {exc}' logger.error(error_msg) raise type(exc)(error_msg) return resp.json() def search_items(self, query: str, fields: Iterable[str] | None = None, sorts: Iterable[str] | None = None, params: Mapping | None = None, full_text_search: bool = False, dsl_fts: bool = False, request_kwargs: Mapping | None = None, max_retries: int | Retry | None = None) -> Search: """Search for items on Archive.org. :param query: The Archive.org search query to yield results for. Refer to https://archive.org/advancedsearch.php#raw for help formatting your query. :param fields: The metadata fields to return in the search results. :param params: The URL parameters to send with each request sent to the Archive.org Advancedsearch Api. :param full_text_search: Beta support for querying the archive.org Full Text Search API [default: False]. :param dsl_fts: Beta support for querying the archive.org Full Text Search API in dsl (i.e. do not prepend ``!L `` to the ``full_text_search`` query [default: False]. :returns: A :class:`Search` object, yielding search results. 
""" request_kwargs = request_kwargs or {} return Search(self, query, fields=fields, sorts=sorts, params=params, full_text_search=full_text_search, dsl_fts=dsl_fts, request_kwargs=request_kwargs, max_retries=max_retries) def s3_is_overloaded(self, identifier=None, access_key=None, request_kwargs=None): request_kwargs = request_kwargs or {} if 'timeout' not in request_kwargs: request_kwargs['timeout'] = 12 u = f'{self.protocol}//s3.us.archive.org' p = { 'check_limit': 1, 'accesskey': access_key, 'bucket': identifier, } r = self.get(u, params=p, **request_kwargs) try: j = r.json() except ValueError: return True return j.get('over_limit') != 0 def get_tasks_api_rate_limit(self, cmd: str = 'derive.php', request_kwargs: dict | None = None): return catalog.Catalog(self, request_kwargs).get_rate_limit(cmd=cmd) def submit_task(self, identifier: str, cmd: str, comment: str = '', priority: int = 0, data: dict | None = None, headers: dict | None = None, reduced_priority: bool = False, request_kwargs: Mapping | None = None) -> requests.Response: """Submit an archive.org task. :param identifier: Item identifier. :param cmd: Task command to submit, see `supported task commands `_. :param comment: A reasonable explanation for why the task is being submitted. :param priority: Task priority from 10 to -10 (default: 0). :param data: Extra POST data to submit with the request. Refer to `Tasks API Request Entity `_. :param headers: Add additional headers to request. :param reduced_priority: Submit your derive at a lower priority. This option is helpful to get around rate-limiting. Your task will more likely be accepted, but it might not run for a long time. Note that you still may be subject to rate-limiting. This is different than ``priority`` in that it will allow you to possibly avoid rate-limiting. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.post` request. :returns: :class:`requests.Response` """ headers = headers or {} if reduced_priority: headers.update({'X-Accept-Reduced-Priority': '1'}) return catalog.Catalog(self, request_kwargs).submit_task(identifier, cmd, comment=comment, priority=priority, data=data, headers=headers) def iter_history(self, identifier: str | None, params: dict | None = None, request_kwargs: Mapping | None = None) -> Iterable[catalog.CatalogTask]: """A generator that returns completed tasks. :param identifier: Item identifier. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.get` request. :returns: An iterable of completed CatalogTasks. """ params = params or {} params.update({'identifier': identifier, 'catalog': 0, 'summary': 0, 'history': 1}) c = catalog.Catalog(self, request_kwargs) yield from c.iter_tasks(params) def iter_catalog(self, identifier: str | None = None, params: dict | None = None, request_kwargs: Mapping | None = None) -> Iterable[catalog.CatalogTask]: """A generator that returns queued or running tasks. :param identifier: Item identifier. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.get` request. :returns: An iterable of queued or running CatalogTasks. 
""" params = params or {} params.update({'identifier': identifier, 'catalog': 1, 'summary': 0, 'history': 0}) c = catalog.Catalog(self, request_kwargs) yield from c.iter_tasks(params) def get_tasks_summary(self, identifier: str = "", params: dict | None = None, request_kwargs: Mapping | None = None) -> dict: """Get the total counts of catalog tasks meeting all criteria, organized by run status (queued, running, error, and paused). :param identifier: Item identifier. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.get` request. :returns: Counts of catalog tasks meeting all criteria. """ return catalog.Catalog(self, request_kwargs).get_summary(identifier=identifier, params=params) def get_tasks(self, identifier: str = "", params: dict | None = None, request_kwargs: Mapping | None = None) -> set[catalog.CatalogTask]: """Get a list of all tasks meeting all criteria. The list is ordered by submission time. :param identifier: The item identifier, if provided will return tasks for only this item filtered by other criteria provided in params. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.get` request. :returns: A set of all tasks meeting all criteria. """ params = params or {} if 'history' not in params: params['history'] = 1 if 'catalog' not in params: params['catalog'] = 1 return set(catalog.Catalog(self, request_kwargs).get_tasks( identifier=identifier, params=params) ) def get_my_catalog(self, params: dict | None = None, request_kwargs: Mapping | None = None) -> set[catalog.CatalogTask]: """Get all queued or running tasks. :param params: Query parameters, refer to `Tasks API `_ for available parameters. :param request_kwargs: Keyword arguments to be used in :meth:`requests.sessions.Session.get` request. :returns: A set of all queued or running tasks. """ params = params or {} _params = {'submitter': self.user_email, 'catalog': 1, 'history': 0, 'summary': 0} params.update(_params) return self.get_tasks(params=params, request_kwargs=request_kwargs) def get_task_log(self, task_id: str | int, request_kwargs: Mapping | None = None) -> str: """Get a task log. :param task_id: The task id for the task log you'd like to fetch. :param request_kwargs: Keyword arguments that :py:class:`requests.Request` takes. :returns: The task log as a string. """ return catalog.CatalogTask.get_task_log(task_id, self, request_kwargs) def send(self, request, **kwargs) -> Response: # Catch urllib3 warnings for HTTPS related errors. 
insecure = False with warnings.catch_warnings(record=True) as w: warnings.filterwarnings('always') try: r = super().send(request, **kwargs) except Exception as e: try: reraise_modify(e, e.request.url, prepend=False) # type: ignore except Exception: logger.error(e) raise e if self.protocol == 'http:': return r insecure_warnings = ['SNIMissingWarning', 'InsecurePlatformWarning'] if w: for warning in w: if any(x in str(warning) for x in insecure_warnings): insecure = True break if insecure: msg = ('You are attempting to make an HTTPS request on an insecure platform,' ' please see:\n\n\thttps://archive.org/services/docs/api' '/internetarchive/troubleshooting.html#https-issues\n') raise RequestException(msg) return r python-internetarchive-5.7.2/internetarchive/utils.py000066400000000000000000000557471513674652200231770ustar00rootroot00000000000000# # The internetarchive module is a Python/CLI interface to Archive.org. # # Copyright (C) 2012-2024 Internet Archive # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . """ internetarchive.utils ~~~~~~~~~~~~~~~~~~~~~ This module provides utility functions for the internetarchive library. :copyright: (C) 2012-2024 by Internet Archive. :license: AGPL 3, see LICENSE for more details. """ from __future__ import annotations import hashlib import os import platform import re import sys import warnings from collections.abc import Mapping from typing import Iterable from xml.dom.minidom import parseString # Make preferred JSON package available via `from internetarchive.utils import json` try: import ujson as json # ujson lacks a JSONDecodeError: https://github.com/ultrajson/ultrajson/issues/497 JSONDecodeError = ValueError except ImportError: import json # type: ignore JSONDecodeError = json.JSONDecodeError # type: ignore def deep_update(d: dict, u: Mapping) -> dict: for k, v in u.items(): if isinstance(v, Mapping): r = deep_update(d.get(k, {}), v) d[k] = r else: d[k] = v return d class InvalidIdentifierException(Exception): pass def validate_s3_identifier(string: str) -> bool: legal_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789._-' # periods, underscores, and dashes are legal, but may not be the first # character! if any(string.startswith(c) is True for c in ['.', '_', '-']): raise InvalidIdentifierException('Identifier cannot begin with periods ".", underscores ' '"_", or dashes "-".') if len(string) > 100 or len(string) < 3: raise InvalidIdentifierException('Identifier should be between 3 and 80 characters in ' 'length.') # Support for uploading to user items, e.g. first character can be `@`. if string.startswith('@'): string = string[1:] if any(c not in legal_chars for c in string): raise InvalidIdentifierException('Identifier can only contain alphanumeric characters, ' 'periods ".", underscores "_", or dashes "-". 
However, ' 'identifier cannot begin with periods, underscores, or ' 'dashes.') return True def needs_quote(s: str) -> bool: try: s.encode('ascii') except (UnicodeDecodeError, UnicodeEncodeError): return True return re.search(r'\s', s) is not None def norm_filepath(fp: bytes | str) -> str: if isinstance(fp, bytes): fp = fp.decode('utf-8') fp = fp.replace(os.path.sep, '/') if not fp.startswith('/'): fp = f'/{fp}' return fp def get_md5(file_object) -> str: m = hashlib.md5() while True: data = file_object.read(8192) if not data: break m.update(data) file_object.seek(0, os.SEEK_SET) return m.hexdigest() def chunk_generator(fp, chunk_size: int): while True: chunk = fp.read(chunk_size) if not chunk: break yield chunk def suppress_keyboard_interrupt_message() -> None: """Register a new excepthook to suppress KeyboardInterrupt exception messages, and exit with status code 130. """ old_excepthook = sys.excepthook def new_hook(type, value, traceback): if type is KeyboardInterrupt: sys.exit(130) old_excepthook(type, value, traceback) sys.excepthook = new_hook class IterableToFileAdapter: def __init__(self, iterable, size: int, pre_encode: bool = False): self.iterator = iter(iterable) self.length = size # pre_encode is needed because http doesn't know that it # needs to encode a TextIO object when it's wrapped # in the Iterator from tqdm. # So, this FileAdapter provides pre-encoded output self.pre_encode = pre_encode def read(self, size: int = -1): # TBD: add buffer for `len(data) > size` case if self.pre_encode: # this adapter is intended to emulate the encoding that is usually # done by the http lib. # As of 2022, iso-8859-1 encoding is used to meet the HTTP standard, # see in the cpython repo (https://github.com/python/cpython # Lib/http/client.py lines 246; 1340; or grep 'iso-8859-1' return next(self.iterator, '').encode("iso-8859-1") return next(self.iterator, b'') def __len__(self) -> int: return self.length class IdentifierListAsItems: """This class is a lazily-loaded list of Items, accessible by index or identifier. """ def __init__(self, id_list_or_single_id, session): self.ids = (id_list_or_single_id if isinstance(id_list_or_single_id, list) else [id_list_or_single_id]) self._items = [None] * len(self.ids) self.session = session def __len__(self) -> int: return len(self.ids) def __getitem__(self, idx): for i in (range(*idx.indices(len(self))) if isinstance(idx, slice) else [idx]): if self._items[i] is None: self._items[i] = self.session.get_item(self.ids[i]) return self._items[idx] def __getattr__(self, name): try: return self[self.ids.index(name)] except ValueError: raise AttributeError def __repr__(self) -> str: return f'{self.__class__.__name__}({self.ids!r})' def get_s3_xml_text(xml_str: str) -> str: def _get_tag_text(tag_name, xml_obj): text = '' elements = xml_obj.getElementsByTagName(tag_name) for e in elements: for node in e.childNodes: if node.nodeType == node.TEXT_NODE: text += node.data return text tag_names = ['Message', 'Resource'] try: p = parseString(xml_str) _msg = _get_tag_text('Message', p) _resource = _get_tag_text('Resource', p) # Avoid weird Resource text that contains PUT method. if _resource and "'PUT" not in _resource: return f'{_msg} - {_resource.strip()}' else: return _msg except Exception: return str(xml_str) def get_file_size(file_obj) -> int | None: if is_filelike_obj(file_obj): try: file_obj.seek(0, os.SEEK_END) size = file_obj.tell() # Avoid OverflowError. 
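                # Treat sizes larger than sys.maxsize as unknown (None) rather than
                # risking an OverflowError further down the line.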
if size > sys.maxsize: size = None file_obj.seek(0, os.SEEK_SET) except OSError: size = None else: st = os.stat(file_obj) size = st.st_size return size def iter_directory(directory: str): """Given a directory, yield all files recursively as a two-tuple (filepath, s3key)""" for path, _dir, files in os.walk(directory): for f in files: filepath = os.path.join(path, f) key = os.path.relpath(filepath, directory) yield (filepath, key) def recursive_file_count_and_size(files, item=None, checksum=False): """Given a filepath or list of filepaths, return the total number and size of files. If `checksum` is `True`, skip over files whose MD5 hash matches any file in the `item`. """ if not isinstance(files, (list, set)): files = [files] total_files = 0 total_size = 0 if checksum is True: md5s = [f.get('md5') for f in item.files] else: md5s = [] if isinstance(files, dict): # make sure to use local filenames. _files = files.values() else: if isinstance(files[0], tuple): _files = dict(files).values() else: _files = files for f in _files: try: is_dir = os.path.isdir(f) except TypeError: try: f = f[0] is_dir = os.path.isdir(f) except (AttributeError, TypeError): is_dir = False if is_dir: it = iter_directory(f) else: it = [(f, None)] for x, _ in it: if checksum is True: try: with open(x, 'rb') as fh: lmd5 = get_md5(fh) except TypeError: # Support file-like objects. lmd5 = get_md5(x) if lmd5 in md5s: continue total_size += get_file_size(x) total_files += 1 return total_files, total_size def recursive_file_count(*args, **kwargs): """Like `recursive_file_count_and_size`, but returns only the file count.""" total_files, _ = recursive_file_count_and_size(*args, **kwargs) return total_files def is_dir(obj) -> bool: """Special is_dir function to handle file-like object cases that cannot be stat'd""" try: return os.path.isdir(obj) except TypeError as exc: return False def is_filelike_obj(obj) -> bool: """Distinguish file-like from path-like objects""" try: os.fspath(obj) except TypeError: return True else: return False def reraise_modify( caught_exc: Exception, append_msg: str, prepend: bool = False, ) -> None: """Append message to exception while preserving attributes. Preserves exception class, and exception traceback. Note: This function needs to be called inside an except because an exception must be active in the current scope. Args: caught_exc(Exception): The caught exception object append_msg(str): The message to append to the caught exception prepend(bool): If True prepend the message to args instead of appending Returns: None Side Effects: Re-raises the exception with the preserved data / trace but modified message """ if not caught_exc.args: # If no args, create our own tuple arg_list = [append_msg] else: # Take the last arg # If it is a string # append your message. 
# Otherwise append it to the # arg list(Not as pretty) arg_list = list(caught_exc.args[:-1]) last_arg = caught_exc.args[-1] if isinstance(last_arg, str): if prepend: arg_list.append(append_msg + last_arg) else: arg_list.append(last_arg + append_msg) else: arg_list += [last_arg, append_msg] caught_exc.args = tuple(arg_list) raise # noqa: PLE0704 def remove_none(obj): if isinstance(obj, (list, tuple, set)): lst = type(obj)(remove_none(x) for x in obj if x) try: return [dict(t) for t in {tuple(sorted(d.items())) for d in lst}] except (AttributeError, TypeError): return lst elif isinstance(obj, dict): return type(obj)((remove_none(k), remove_none(v)) for k, v in obj.items() if k is not None and v is not None) else: return obj def delete_items_from_dict(d: dict | list, to_delete): """Recursively deletes items from a dict, if the item's value(s) is in ``to_delete``. """ if not isinstance(to_delete, list): to_delete = [to_delete] if isinstance(d, dict): for single_to_delete in set(to_delete): if single_to_delete in d.values(): for k, v in d.copy().items(): if v == single_to_delete: del d[k] for v in d.values(): delete_items_from_dict(v, to_delete) elif isinstance(d, list): for i in d: delete_items_from_dict(i, to_delete) return remove_none(d) def is_valid_metadata_key(name: str) -> bool: # According to the documentation a metadata key # has to be a valid XML tag name. # # The actual allowed tag names (at least as tested with the metadata API), # are way more restrictive and only allow ".-A-Za-z_", possibly followed # by an index in square brackets e. g. [0]. # On the other hand the Archive allows tags starting with the string "xml". return bool(re.fullmatch(r'[A-Za-z][.\-0-9A-Za-z_]+(?:\[[0-9]+\])?', name)) def merge_dictionaries( dict0: dict | None, dict1: dict | None, keys_to_drop: Iterable | None = None, ) -> dict: """Merge two dictionaries. Items in `dict0` can optionally be dropped before the merge. If equal keys exist in both dictionaries, entries in`dict0` are overwritten. :param dict0: A base dictionary with the bulk of the items. :param dict1: Additional items which overwrite the items in `dict0`. :param keys_to_drop: An iterable of keys to drop from `dict0` before the merge. :returns: A merged dictionary. """ if dict0 is not None: new_dict = dict0.copy() else: new_dict = {} if keys_to_drop is not None: for key in keys_to_drop: new_dict.pop(key, None) # Items from `dict1` take precedence over items from `dict0`. 
if dict1 is not None: new_dict.update(dict1) return new_dict def parse_dict_cookies(value: str) -> dict[str, str | None]: result: dict[str, str | None] = {} for item in value.split(';'): item = item.strip() if not item: continue if '=' not in item: result[item] = None continue name, value = item.split('=', 1) result[name] = value if 'domain' not in result: result['domain'] = '.archive.org' if 'path' not in result: result['path'] = '/' return result def is_valid_email(email): # Regular expression pattern for a valid email address # Ensures the TLD has at least 2 characters pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$' return re.match(pattern, email) is not None # ------------------------------ # Filename sanitization helpers # ------------------------------ _WINDOWS_RESERVED_BASENAMES = { # Device names without extensions (case-insensitive match on stem only) 'CON', 'PRN', 'AUX', 'NUL', *(f'COM{i}' for i in range(1, 10)), *(f'LPT{i}' for i in range(1, 10)), } _WINDOWS_INVALID_CHARS = set('<>:"\\|?*') # plus control chars 0x00-0x1F handled separately def _percent_encode_byte(b: int) -> str: return f'%{b:02X}' def sanitize_windows_filename(name: str) -> tuple[str, bool]: """Return a Windows-safe filename by percent-encoding illegal constructs. Highlights (Windows relevance): * Control chars (0x00-0x1F) encoded. * Characters in _WINDOWS_INVALID_CHARS encoded. * Trailing spaces and periods encoded. * Existing '%' encoded only if another change occurs (to avoid unnecessary churn). * Reserved device names (CON, PRN, AUX, NUL, COM1-9, LPT1-9) including when followed by a dot/extension have their final character encoded. (e.g. "AUX" -> "AU%58", "AUX.txt" -> "AU%58.txt"). Returns (sanitized_name, modified_flag). """ original = name if not name: return name, False # Reserved device name detection (with or without extension). We encode the last character # of the reserved token so that the resulting string no longer triggers Windows device name rules. upper_name = name.upper() reserved_index: int | None = None for base in _WINDOWS_RESERVED_BASENAMES: if upper_name == base or upper_name.startswith(base + '.'): reserved_index = len(base) - 1 break # Determine indexes to encode. encode_indexes: set[int] = set() length = len(name) for idx, ch in enumerate(name): code = ord(ch) if code < 0x20: encode_indexes.add(idx) elif ch in _WINDOWS_INVALID_CHARS: encode_indexes.add(idx) elif ch == '\\': # already included above but explicit for clarity encode_indexes.add(idx) # NOTE: '%' handled later globally # Encode trailing spaces and dots t = length - 1 while t >= 0 and name[t] in (' ', '.'): encode_indexes.add(t) t -= 1 # Reserved device name last character encoding (with or without extension). if reserved_index is not None: encode_indexes.add(reserved_index) modified = bool(encode_indexes) if not modified: # Nothing to do; leave '%' untouched. return name, False # Build output encoding '%' first. 
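    # '%' itself is escaped only when some other character needs encoding, so
    # names that are already Windows-safe pass through unchanged.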
out_chars: list[str] = [] for idx, ch in enumerate(name): if ch == '%': out_chars.append('%25') continue if idx in encode_indexes: out_chars.append(_percent_encode_byte(ord(ch))) else: out_chars.append(ch) sanitized = ''.join(out_chars) return sanitized, sanitized != original def is_path_within_directory(base_dir: str, target_path: str) -> bool: """Return True if target_path is within base_dir (after resolving symlinks).""" base_real = os.path.realpath(base_dir) target_real = os.path.realpath(target_path) # Ensure base path ends with separator for prefix test to avoid /foo/bar vs /foo/barista if not base_real.endswith(os.path.sep): base_real += os.path.sep return target_real.startswith(base_real) def sanitize_windows_relpath(rel_path: str, verbose: bool = False, printer=None) -> tuple[str, bool]: """Sanitize a relative path intended for Windows downloads. Splits only on forward slashes (logical separators we introduce) so that any backslashes present in remote filenames are treated as data and percent-encoded. Returns (sanitized_rel_path, modified_flag). """ if os.name != 'nt': # no-op on non-Windows return rel_path, False if not rel_path: return rel_path, False components = rel_path.split('/') if '/' in rel_path else [rel_path] out_parts: list[str] = [] modified_any = False if printer is None: def noop_printer(msg): pass printer = noop_printer original_components: list[str] = [] for comp in components: original_components.append(comp) sanitized, modified = sanitize_windows_filename(comp) out_parts.append(sanitized) modified_any = modified_any or modified result_path = os.path.join(*out_parts) if verbose and modified_any: original_path_display = os.path.join(*original_components) printer(f'windows path sanitized: {original_path_display} -> {result_path}') return result_path, modified_any def is_windows() -> bool: return ( platform.system().lower() == "windows" or sys.platform.startswith("win") ) def sanitize_filepath(filepath: str, avoid_colon: bool = False) -> str: """ Sanitizes only the filename part of a full file path, leaving the directory path intact. This is useful when you need to ensure the filename is safe for filesystem use without modifying the directory structure. Typically used before creating files or directories to prevent invalid filename characters. Args: filepath (str): The full file path to sanitize. avoid_colon (bool): If True, colon ':' in the filename will be percent-encoded for macOS compatibility. Defaults to False. Returns: str: The sanitized file path with the filename portion percent-encoded as needed. """ parent_dir = os.path.dirname(filepath) filename = os.path.basename(filepath) sanitized_filename = sanitize_filename(filename, avoid_colon) return os.path.join(parent_dir, sanitized_filename) def sanitize_filename(name: str, avoid_colon: bool = False) -> str: """ Sanitizes a filename by replacing invalid characters with percent-encoded values. This function is designed to be compatible with both Windows and POSIX systems. Args: name (str): The original string to sanitize. avoid_colon (bool): If True, colon ':' will be percent-encoded. Returns: str: A sanitized version of the filename. 
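    Example (illustrative; the exact output depends on the host OS)::

        >>> sanitize_filename('notes/2024:draft.txt', avoid_colon=True)
        'notes%2F2024%3Adraft.txt'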
""" original = name if is_windows(): sanitized = sanitize_filename_windows(name) else: sanitized = sanitize_filename_posix(name, avoid_colon) if sanitized != original: warnings.warn( f"Filename sanitized: original='{original}' sanitized='{sanitized}'", UserWarning, stacklevel=2 ) return sanitized def unsanitize_filename(name: str) -> str: """ Reverses percent-encoding of the form %XX back to original characters. Works for filenames sanitized by sanitize_filename (Windows or POSIX). Args: name (str): Sanitized filename string with %XX encodings. Returns: str: Original filename with all %XX sequences decoded. """ if '%' in name: if re.search(r'%[0-9A-Fa-f]{2}', name): warnings.warn( "Filename contains percent-encoded sequences that will be decoded.", UserWarning, stacklevel=2 ) def decode_match(match): hex_value = match.group(1) return chr(int(hex_value, 16)) return re.sub(r'%([0-9A-Fa-f]{2})', decode_match, name) def sanitize_filename_windows(name: str) -> str: r""" Replaces Windows-invalid filename characters with percent-encoded values. Characters replaced: < > : " / \ | ? * % Args: name (str): The original string. Returns: str: A sanitized version safe for filesystem use. """ # Encode `%` so that it's possible to round-trip (i.e. via `unsanitize_filename`) invalid_chars = r'[<>:"/\\|?*\x00-\x1F%]' def encode(char): return f'%{ord(char.group()):02X}' # Replace invalid characters name = re.sub(invalid_chars, encode, name) # Remove trailing dots or spaces (not allowed in Windows filenames) return name.rstrip(' .') def sanitize_filename_posix(name: str, avoid_colon: bool = False) -> str: """ Sanitizes filenames for Linux, BSD, and Unix-like systems. - Percent-encodes forward slash '/' (always) - Optionally percent-encodes colon ':' for macOS compatibility Args: name (str): Original filename string. avoid_colon (bool): If True, colon ':' will be encoded. Returns: str: Sanitized filename safe for POSIX systems. 
""" # Build regex pattern dynamically chars_to_encode = r'/' if avoid_colon: chars_to_encode += ':' pattern = f'[{re.escape(chars_to_encode)}]' def encode_char(match): return f'%{ord(match.group()):02X}' return re.sub(pattern, encode_char, name) python-internetarchive-5.7.2/pex-requirements.txt000066400000000000000000000001501513674652200223230ustar00rootroot00000000000000charset-normalizer==2.1.1 jsonpatch>=0.4 requests>=2.25.0,<3.0.0 setuptools tqdm>=4.0.0 urllib3>=1.26.0 python-internetarchive-5.7.2/pyproject.toml000066400000000000000000000036341513674652200211730ustar00rootroot00000000000000[tool.ruff] line-length = 102 target-version = "py37" [tool.ruff.lint] select = [ "B", # flake8-bugbear "C4", # flake8-comprehensions "C90", # McCabe cyclomatic complexity "E", # pycodestyle "EXE", # flake8-executable "F", # Pyflakes "I", # isort "ICN", # flake8-import-conventions "INT", # flake8-gettext "ISC", # flake8-implicit-str-concat "PIE", # flake8-pie "PL", # Pylint "PT", # flake8-pytest-style "PYI", # flake8-pyi "RSE", # flake8-raise "RUF", # Ruff-specific rules "S", # flake8-bandit "SLF", # flake8-self "T10", # flake8-debugger "TID", # flake8-tidy-imports "UP", # pyupgrade "W", # pycodestyle "YTT", # flake8-2020 # "A", # flake8-builtins # "ANN", # flake8-annotations # "ARG", # flake8-unused-arguments # "BLE", # flake8-blind-except # "COM", # flake8-commas # "D", # pydocstyle # "DJ", # flake8-django # "DTZ", # flake8-datetimez # "EM", # flake8-errmsg # "ERA", # eradicate # "FBT", # flake8-boolean-trap # "G", # flake8-logging-format # "INP", # flake8-no-pep420 # "N", # pep8-naming # "NPY", # NumPy-specific rules # "PD", # pandas-vet # "PGH", # pygrep-hooks # "PTH", # flake8-use-pathlib # "Q", # flake8-quotes # "RET", # flake8-return # "SIM", # flake8-simplify # "T20", # flake8-print # "TCH", # flake8-type-checking # "TRY", # tryceratops ] ignore = [ "B904", "F401", "F841", "PLC1901", "PLR2004", "PLR5501", "PLW0603", "PLW2901", "RUF001", "RUF005", "S101", "S103", "S105", "S106", "S318", "S324", ] [tool.ruff.lint.mccabe] max-complexity = 33 [tool.ruff.lint.pylint] max-args = 24 max-branches = 33 max-statements = 124 [tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402"] "tests/*" = ["PT017", "S101"] "tests/cli/test_ia_list.py" = ["E741"] "tests/test_api.py" = ["E712"] "tests/test_config.py" = ["PT011"] python-internetarchive-5.7.2/setup.cfg000066400000000000000000000037721513674652200201030ustar00rootroot00000000000000[metadata] name = internetarchive version = attr: internetarchive.__version__.__version__ description = A Python interface to archive.org. long_description = file: README.rst long_description_content_type = text/x-rst url = https://github.com/jjjake/internetarchive author = Jacob M. 
Johnson author_email = jake@archive.org license = AGPL-3.0 license_files = LICENSE classifiers = Development Status :: 5 - Production/Stable Intended Audience :: Developers License :: OSI Approved :: GNU Affero General Public License v3 Natural Language :: English Programming Language :: Python Programming Language :: Python :: 3 Programming Language :: Python :: 3 :: Only Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy [options] packages = internetarchive internetarchive.cli install_requires = jsonpatch>=0.4 requests>=2.25.0,<3.0.0 tqdm>=4.0.0 urllib3>=1.26.0 importlib-metadata>=3.6.0 ;python_version <= "3.10" python_requires = >=3.9 include_package_data = True zip_safe = False [options.entry_points] console_scripts = ia = internetarchive.cli.ia:main [options.extras_require] all = %(dev)s %(test)s %(types)s dev = black mypy pre-commit pytest safety setuptools docs = alabaster==0.7.12 docutils<0.18 sphinx==4.5.0 sphinx-autodoc-typehints==1.18.1 test = pytest==8.4.2 responses==0.20.0 ruff==0.14.1 types = tqdm-stubs>=0.2.0 types-colorama types-jsonpatch>=0.1.0a0 types-pygments types-requests>=2.25.0,<3.0.0 types-setuptools types-ujson>=4.2.0 types-urllib3>=1.26.0 [options.package_data] * = py.typed [codespell] ignore-words-list = alers [mypy] exclude = ^\.git/|^__pycache__/|^docs/source/conf.py$|^old/|^build/|^dist/|\.tox python_version = 3.9 pretty = True scripts_are_modules = True show_error_codes = True show_error_context = True [tool:black] line-length = 102 skip-string-normalization = true python-internetarchive-5.7.2/setup.py000066400000000000000000000000461513674652200177630ustar00rootroot00000000000000from setuptools import setup setup() python-internetarchive-5.7.2/snap/000077500000000000000000000000001513674652200172125ustar00rootroot00000000000000python-internetarchive-5.7.2/snap/snapcraft.yaml000066400000000000000000000006411513674652200220600ustar00rootroot00000000000000name: ia version: master summary: A Command-Line Interface to Archive.org description: | This package installs a command-line tool named ia for using Archive.org from the command-line. grade: devel # must be 'stable' to release into candidate/stable channels confinement: strict apps: ia: command: ia plugs: [network, home, removable-media] parts: internetarchive: source: . 
plugin: python python-internetarchive-5.7.2/tests/000077500000000000000000000000001513674652200174135ustar00rootroot00000000000000python-internetarchive-5.7.2/tests/__init__.py000066400000000000000000000000001513674652200215120ustar00rootroot00000000000000python-internetarchive-5.7.2/tests/cli/000077500000000000000000000000001513674652200201625ustar00rootroot00000000000000python-internetarchive-5.7.2/tests/cli/test_cli_utils.py000066400000000000000000000020271513674652200235630ustar00rootroot00000000000000from internetarchive.cli.cli_utils import get_args_dict def test_get_args_dict(): test_input = [ 'collection:test_collection', 'description: Attention: multiple colons', 'unicode_test:தமிழ்', 'subject:subject1, subject1', 'subject:subject2', 'subject:subject3; subject3', ] test_output = { 'collection': 'test_collection', 'description': ' Attention: multiple colons', 'unicode_test': 'தமிழ்', 'subject': ['subject1, subject1', 'subject2', 'subject3; subject3'], } args_dict = get_args_dict(test_input) for key, value in args_dict.items(): assert test_output[key] == value def test_get_args_dict_query_string(): test_input = ['a=b,foo&c=d&e=f', 'foo:bar '] test_output = { 'a': 'b,foo', 'c': 'd', 'e': 'f', 'foo': 'bar ', } args_dict = get_args_dict(test_input, query_string=True) for key, value in args_dict.items(): assert test_output[key] == value python-internetarchive-5.7.2/tests/cli/test_ia.py000066400000000000000000000016411513674652200221660ustar00rootroot00000000000000from tests.conftest import IaRequestsMock, ia_call def test_ia(capsys): ia_call(['ia', '--help']) out, err = capsys.readouterr() assert 'A command line interface to Archive.org.' in out ia_call(['ia', '--insecure', 'ls', 'nasa']) ia_call(['ia', 'nocmd'], expected_exit_code=2) out, err = capsys.readouterr() assert "invalid choice: 'nocmd'" in err def test_user_agent_suffix_option(): """Test that --user-agent-suffix option appends to the default User-Agent.""" custom_suffix = 'TestCLIAgent/1.0' with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') ia_call(['ia', '--user-agent-suffix', custom_suffix, 'metadata', 'nasa']) # Check that the user agent starts with default and ends with custom suffix ua = rsps.calls[0].request.headers['User-Agent'] assert ua.startswith('internetarchive/') assert ua.endswith(custom_suffix) python-internetarchive-5.7.2/tests/cli/test_ia_delete.py000066400000000000000000000115731513674652200235150ustar00rootroot00000000000000import argparse import sys from internetarchive.cli.ia_delete import get_files_to_delete def make_args(**kwargs): args = dict(all=False, file=[], glob=None, format=None, cascade=False) # noqa: C408 args.update(kwargs) return argparse.Namespace(**args) def test_get_files_to_delete_all(nasa_item): args = make_args(all=True) files = list(get_files_to_delete(args, nasa_item)) expected = { 'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_archive.torrent', 'nasa_files.xml', 'nasa_meta.xml', 'nasa_reviews.xml', } assert {f.name for f in files} == expected def test_get_files_to_delete_empty_file_list(nasa_item): args = make_args(file=[]) files = list(get_files_to_delete(args, nasa_item)) expected = { 'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_archive.torrent', 'nasa_files.xml', 'nasa_meta.xml', 'nasa_reviews.xml', } assert {f.name for f in files} == expected def test_get_files_to_delete_with_glob(nasa_item): args = make_args(glob="*xml") files = list(get_files_to_delete(args, nasa_item)) expected = {'nasa_meta.xml', 'nasa_reviews.xml', 'nasa_files.xml'} assert {f.name for f 
in files} == expected args = make_args(glob="*west_*") files = list(get_files_to_delete(args, nasa_item)) expected = {'globe_west_540.jpg'} assert {f.name for f in files} == expected args = make_args(glob="*west_*|*torrent") files = list(get_files_to_delete(args, nasa_item)) expected = {'globe_west_540.jpg', 'nasa_archive.torrent'} assert {f.name for f in files} == expected args = make_args(glob="nasa_[!m]*.xml") files = list(get_files_to_delete(args, nasa_item)) expected = {'nasa_files.xml', 'nasa_reviews.xml'} assert {f.name for f in files} == expected args = make_args(glob="nasa_???????.xml") files = list(get_files_to_delete(args, nasa_item)) expected = {'nasa_reviews.xml'} assert {f.name for f in files} == expected args = make_args(glob="*_[0-9]*") files = list(get_files_to_delete(args, nasa_item)) expected = {'globe_west_540.jpg'} assert {f.name for f in files} == expected # Match JPEG files with uppercase letters in the name prefix args = make_args(glob="[A-Z]*.jpg") files = list(get_files_to_delete(args, nasa_item)) expected = {'NASAarchiveLogo.jpg'} assert {f.name for f in files} == expected # Match lowercase-only names ending in .jpg args = make_args(glob="[a-z]*.jpg") files = list(get_files_to_delete(args, nasa_item)) expected = {'globe_west_540.jpg'} assert {f.name for f in files} == expected args = make_args(glob="nasa_[fm]*.xml") files = list(get_files_to_delete(args, nasa_item)) expected = {'nasa_files.xml', 'nasa_meta.xml'} assert {f.name for f in files} == expected args = make_args(glob="*.[a-z][a-z][a-z]") files = list(get_files_to_delete(args, nasa_item)) expected = { 'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_files.xml', 'nasa_meta.xml', 'nasa_reviews.xml' } assert {f.name for f in files} == expected args = make_args(glob="?a*") files = list(get_files_to_delete(args, nasa_item)) expected = { 'nasa_archive.torrent', 'nasa_files.xml', 'nasa_meta.xml', 'nasa_reviews.xml' } assert {f.name for f in files} == expected args = make_args(glob="g*.jpg") files = list(get_files_to_delete(args, nasa_item)) expected = {'globe_west_540.jpg'} assert {f.name for f in files} == expected args = make_args(glob="nasa_*[st].xml") files = list(get_files_to_delete(args, nasa_item)) expected = {'nasa_files.xml', 'nasa_reviews.xml'} assert {f.name for f in files} == expected args = make_args(glob="[!nN]*") files = list(get_files_to_delete(args, nasa_item)) expected = {'globe_west_540.jpg'} assert {f.name for f in files} == expected def test_get_files_to_delete_with_format(nasa_item): args = make_args(format="JPEG") files = list(get_files_to_delete(args, nasa_item)) expected = {'globe_west_540.jpg'} assert {f.name for f in files} == expected def test_get_files_to_delete_with_explicit_file_list(nasa_item): args = make_args(file=["nasa_meta.xml", "nasa_reviews.xml"]) files = list(get_files_to_delete(args, nasa_item)) expected = {f.name for f in nasa_item.get_files(["nasa_meta.xml", "nasa_reviews.xml"])} assert {f.name for f in files} == expected def test_get_files_to_delete_with_stdin(monkeypatch, nasa_item): args = make_args(file=["-"]) monkeypatch.setattr(sys, "stdin", ["nasa_meta.xml\n", "nasa_reviews.xml\n"]) files = list(get_files_to_delete(args, nasa_item)) expected = {f.name for f in nasa_item.get_files(["nasa_meta.xml", "nasa_reviews.xml"])} assert {f.name for f in files} == expected python-internetarchive-5.7.2/tests/cli/test_ia_download.py000066400000000000000000000150261513674652200240570ustar00rootroot00000000000000import os import sys import time import pytest from 
internetarchive import get_item from internetarchive.utils import json from tests.conftest import ( NASA_EXPECTED_FILES, IaRequestsMock, call_cmd, files_downloaded, load_test_data_file, ) def test_no_args(tmpdir_ch): call_cmd('ia --insecure download nasa') assert files_downloaded(path='nasa') == NASA_EXPECTED_FILES @pytest.mark.xfail("CI" in os.environ, reason="May timeout on continuous integration") def test_https(tmpdir_ch): call_cmd('ia download nasa') assert files_downloaded(path='nasa') == NASA_EXPECTED_FILES def test_dry_run(): nasa_url = 'http://archive.org/download/nasa/' expected_urls = {nasa_url + f for f in NASA_EXPECTED_FILES} stdout, _stderr = call_cmd('ia --insecure download --dry-run nasa') output_lines = stdout.split('\n') dry_run_urls = {x.strip() for x in output_lines if x and 'nasa:' not in x} assert expected_urls == dry_run_urls def test_glob(tmpdir_ch): expected_files = { 'globe_west_540.jpg', 'globe_west_540_thumb.jpg', 'nasa_itemimage.jpg', '__ia_thumb.jpg', } call_cmd('ia --insecure download --glob="*jpg" nasa') assert files_downloaded(path='nasa') == expected_files def test_exclude(tmpdir_ch): expected_files = { 'globe_west_540.jpg', 'nasa_itemimage.jpg', } call_cmd('ia --insecure download --glob="*jpg" --exclude="*thumb*" nasa') assert files_downloaded(path='nasa') == expected_files def test_format(tmpdir_ch): call_cmd('ia --insecure download --format="Archive BitTorrent" nasa') assert files_downloaded(path='nasa') == {'nasa_archive.torrent'} def test_on_the_fly_format(): i = 'wonderfulwizardo00baumiala' stdout, _stderr = call_cmd(f'ia --insecure download --dry-run --format="DAISY" {i}') assert stdout == '' stdout, _stderr = call_cmd(f'ia --insecure download --dry-run --format="DAISY" --on-the-fly {i}') assert stdout == f'http://archive.org/download/{i}/{i}_daisy.zip' def test_clobber(tmpdir_ch): cmd = 'ia --insecure download nasa nasa_meta.xml' call_cmd(cmd) assert files_downloaded('nasa') == {'nasa_meta.xml'} _stdout, stderr = call_cmd(cmd) assert files_downloaded('nasa') == {'nasa_meta.xml'} prefix = 'nasa:\n'.replace('\n', os.linesep) filepath = os.path.join('nasa', 'nasa_meta.xml') expected_stderr = f'{prefix} skipping {filepath}, file already exists based on length and date.' assert expected_stderr == stderr def test_checksum(tmpdir_ch): call_cmd('ia --insecure download nasa nasa_meta.xml') assert files_downloaded('nasa') == {'nasa_meta.xml'} _stdout, stderr = call_cmd('ia --insecure download --checksum nasa nasa_meta.xml') assert files_downloaded('nasa') == {'nasa_meta.xml'} prefix = 'nasa:\n'.replace('\n', os.linesep) filepath = os.path.join('nasa', 'nasa_meta.xml') assert f'{prefix} skipping {filepath}, file already exists based on checksum.' == stderr def test_checksum_archive(tmpdir_ch): call_cmd('ia --insecure download nasa nasa_meta.xml') assert files_downloaded('nasa') == {'nasa_meta.xml'} _stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml') assert files_downloaded('nasa') == {'nasa_meta.xml'} prefix = 'nasa:\n'.replace('\n', os.linesep) filepath = os.path.join('nasa', 'nasa_meta.xml') assert f'{prefix} skipping {filepath}, file already exists based on checksum.' 
== stderr assert '_checksum_archive.txt' in files_downloaded('.') with open(os.path.join('.', '_checksum_archive.txt'), encoding='utf-8') as f: filepath = os.path.join('nasa', 'nasa_meta.xml') assert f.read() == f'{filepath}\n' _stdout, stderr = call_cmd('ia --insecure download --checksum-archive nasa nasa_meta.xml') assert files_downloaded('nasa') == {'nasa_meta.xml'} prefix = 'nasa:\n'.replace('\n', os.linesep) filepath = os.path.join('nasa', 'nasa_meta.xml') assert f'{prefix} skipping {filepath}, file already exists based on checksum_archive.' == stderr def test_no_directories(tmpdir_ch): call_cmd('ia --insecure download --no-directories nasa nasa_meta.xml') assert files_downloaded('.') == {'nasa_meta.xml'} def test_destdir(tmpdir_ch): cmd = 'ia --insecure download --destdir=thisdirdoesnotexist/ nasa nasa_meta.xml' _stdout, stderr = call_cmd(cmd, expected_exit_code=2) assert "--destdir: 'thisdirdoesnotexist/' is not a valid directory" in stderr tmpdir_ch.mkdir('thisdirdoesnotexist/') call_cmd(cmd) assert files_downloaded('thisdirdoesnotexist/nasa') == {'nasa_meta.xml'} tmpdir_ch.mkdir('dir2/') cmd = ('ia --insecure download --no-directories --destdir=dir2/ ' 'nasa nasa_meta.xml') call_cmd(cmd) assert files_downloaded('dir2') == {'nasa_meta.xml'} def test_no_change_timestamp(tmpdir_ch): # TODO: Handle the case of daylight savings time now = time.time() call_cmd('ia --insecure download --no-change-timestamp nasa') for path, dirnames, filenames in os.walk(str(tmpdir_ch)): for d in dirnames: p = os.path.join(path, d) assert os.stat(p).st_mtime >= now for f in filenames: p = os.path.join(path, f) assert os.stat(p).st_mtime >= now def test_download_history_flag(capsys): """Test that --download-history correctly includes/excludes history files. Regression test for https://github.com/jjjake/internetarchive/issues/735 The bug was that --download-history was being passed directly to ignore_history_dir without negation, causing the opposite behavior. 
""" # Add a history file to the nasa metadata nasa_data = json.loads(load_test_data_file('metadata/nasa.json')) nasa_data['files'].append({ 'name': 'history/files/old_file.txt', 'source': 'original', 'size': '100', 'format': 'Text', }) with IaRequestsMock() as mocker: mocker.add_metadata_mock('nasa', body=json.dumps(nasa_data)) item = get_item('nasa') # Without --download-history (ignore_history_dir=True), history files excluded item.download(dry_run=True, ignore_history_dir=True) stdout_without = capsys.readouterr().out assert 'history/files/old_file.txt' not in stdout_without # With --download-history (ignore_history_dir=False), history files included item.download(dry_run=True, ignore_history_dir=False) stdout_with = capsys.readouterr().out assert 'history/files/old_file.txt' in stdout_with python-internetarchive-5.7.2/tests/cli/test_ia_list.py000066400000000000000000000047251513674652200232270ustar00rootroot00000000000000from copy import deepcopy from tests.conftest import IaRequestsMock, ia_call NASA_FILES = { 'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_reviews.xml', 'nasa_meta.xml', 'nasa_archive.torrent', 'nasa_files.xml' } def test_ia_list(capsys, nasa_mocker): ia_call(['ia', 'list', 'nasa']) out, _err = capsys.readouterr() assert {l for l in out.split('\n') if l} == NASA_FILES def test_ia_list_verbose(capsys, nasa_mocker): ia_call(['ia', 'list', '--verbose', 'nasa']) out, _err = capsys.readouterr() _nasa_files = deepcopy(NASA_FILES) _nasa_files.add('name') assert {l for l in out.split('\n') if l} == _nasa_files def test_ia_list_all(capsys, nasa_mocker): ia_call(['ia', 'list', '--all', 'nasa']) out, _err = capsys.readouterr() out = [l for l in out.split('\n') if l] assert len(out) == 6 assert all(len(f.split('\t')) == 9 for f in out) assert all(f.split('\t')[0] in NASA_FILES for f in out) def test_ia_list_location(capsys, nasa_mocker): ia_call(['ia', 'list', '--location', '--glob', '*meta.xml', 'nasa']) out, _err = capsys.readouterr() assert out == 'https://archive.org/download/nasa/nasa_meta.xml\n' def test_ia_list_columns(capsys): with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') ia_call(['ia', 'list', '--columns', 'name,md5', '--glob', '*meta.xml', 'nasa']) out, _err = capsys.readouterr() assert out == 'nasa_meta.xml\t0e339f4a29a8bc42303813cbec9243e5\n' with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') ia_call(['ia', 'list', '--columns', 'md5', '--glob', '*meta.xml', 'nasa']) out, _err = capsys.readouterr() assert out == '0e339f4a29a8bc42303813cbec9243e5\n' def test_ia_list_glob(capsys, nasa_mocker): ia_call(['ia', 'list', '--glob', '*torrent', 'nasa']) out, _err = capsys.readouterr() assert out == 'nasa_archive.torrent\n' def test_ia_list_format(capsys, nasa_mocker): ia_call(['ia', 'list', '--format', 'Metadata', 'nasa']) out, _err = capsys.readouterr() expected_output = { 'nasa_reviews.xml', 'nasa_files.xml', 'nasa_meta.xml', } assert {f for f in out.split('\n') if f} == expected_output def test_ia_list_non_existing(capsys): with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa', body='{}') ia_call(['ia', 'list', 'nasa'], expected_exit_code=1) out, _err = capsys.readouterr() assert out == '' python-internetarchive-5.7.2/tests/cli/test_ia_metadata.py000066400000000000000000000030161513674652200240240ustar00rootroot00000000000000import sys from time import time import responses from tests.conftest import IaRequestsMock, ia_call def test_ia_metadata_exists(capsys): with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') ia_call(['ia', 
'metadata', '--exists', 'nasa'], expected_exit_code=0) _out, err = capsys.readouterr() assert err == 'nasa exists\n' rsps.reset() rsps.add_metadata_mock('nasa', '{}') sys.argv = ['ia', 'metadata', '--exists', 'nasa'] ia_call(['ia', 'metadata', '--exists', 'nasa'], expected_exit_code=1) _out, err = capsys.readouterr() assert err == 'nasa does not exist\n' def test_ia_metadata_formats(capsys, nasa_mocker): ia_call(['ia', 'metadata', '--formats', 'nasa']) out, _err = capsys.readouterr() expected_formats = {'Collection Header', 'Archive BitTorrent', 'JPEG', 'Metadata', ''} assert set(out.split('\n')) == expected_formats def test_ia_metadata_modify(capsys): md_rsp = ('{"success":true,"task_id":447613301,' '"log":"https://catalogd.archive.org/log/447613301"}') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa', method=responses.GET) rsps.add_metadata_mock('nasa', body=md_rsp, method=responses.POST) valid_key = f'foo-{int(time())}' ia_call(['ia', 'metadata', 'nasa', '--modify', f'{valid_key}:test_value']) _out, err = capsys.readouterr() assert err == 'nasa - success: https://catalogd.archive.org/log/447613301\n' python-internetarchive-5.7.2/tests/cli/test_ia_search.py000066400000000000000000000027241513674652200235160ustar00rootroot00000000000000import responses from internetarchive.utils import json from tests.conftest import PROTOCOL, IaRequestsMock, ia_call, load_test_data_file def test_ia_search_itemlist(capsys): test_scrape_response = load_test_data_file('scrape_response.json') with responses.RequestsMock(assert_all_requests_are_fired=False) as rsps: url = f'{PROTOCOL}//archive.org/services/search/v1/scrape' p1 = { 'q': 'collection:attentionkmartshoppers', 'count': '10000' } _j = json.loads(test_scrape_response) del _j['cursor'] _r = json.dumps(_j) rsps.add(responses.POST, url, body=_r, match=[responses.matchers.query_param_matcher(p1)]) ia_call(['ia', 'search', 'collection:attentionkmartshoppers', '--itemlist']) out, _err = capsys.readouterr() assert len(set(out.split())) == 100 def test_ia_search_num_found(capsys): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: url = f'{PROTOCOL}//archive.org/services/search/v1/scrape' p = { 'q': 'collection:nasa', 'total_only': 'true', 'count': '10000' } rsps.add(responses.POST, url, body='{"items":[],"count":0,"total":50}', match=[responses.matchers.query_param_matcher(p)]) ia_call(['ia', 'search', 'collection:nasa', '--num-found']) out, _err = capsys.readouterr() assert out == '50\n' python-internetarchive-5.7.2/tests/cli/test_ia_upload.py000066400000000000000000000350531513674652200235360ustar00rootroot00000000000000import os import sys from contextlib import contextmanager from io import StringIO import responses from internetarchive.utils import json from tests.conftest import IaRequestsMock, ia_call, load_test_data_file PROTOCOL = 'https:' STATUS_CHECK_RESPONSE = load_test_data_file('s3_status_check.json') def test_ia_upload(tmpdir_ch, caplog): with open('test.txt', 'w') as fh: fh.write('foo') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test.txt', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', 'nasa', 'test.txt']) assert f'uploaded test.txt to {PROTOCOL}//s3.us.archive.org/nasa/test.txt' in caplog.text def test_ia_upload_invalid_identifier(capsys, caplog): with open('test.txt', 'w') as fh: fh.write('foo') ia_call(['ia', '--log', 'upload', 'føø', 'test.txt'], expected_exit_code=2) _out, err = capsys.readouterr() assert 
"Identifier can only contain alphanumeric" in err def test_ia_upload_status_check(capsys): with IaRequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org', body=STATUS_CHECK_RESPONSE, content_type='application/json') ia_call(['ia', 'upload', 'nasa', '--status-check']) _out, err = capsys.readouterr() assert 'success: nasa is accepting requests.' in err j = json.loads(STATUS_CHECK_RESPONSE) j['over_limit'] = 1 rsps.reset() rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org', body=json.dumps(j), content_type='application/json') ia_call(['ia', 'upload', 'nasa', '--status-check'], expected_exit_code=1) _out, err = capsys.readouterr() assert ('warning: nasa is over limit, and not accepting requests. ' 'Expect 503 SlowDown errors.') in err def test_ia_upload_debug(capsys, tmpdir_ch, nasa_mocker): with open('test.txt', 'w') as fh: fh.write('foo') ia_call(['ia', 'upload', '--debug', 'nasa', 'test.txt']) _out, err = capsys.readouterr() assert 'User-Agent' in err assert 's3.us.archive.org/nasa/test.txt' in err assert 'Accept:*/*' in err assert 'Authorization:LOW ' in err assert 'Connection:close' in err assert 'Content-Length:3' in err assert 'Accept-Encoding:gzip, deflate' in err def test_ia_upload_403(capsys): s3_error = ('' 'SignatureDoesNotMatch' 'The request signature we calculated does not match ' 'the signature you provided. Check your AWS Secret Access Key ' 'and signing method. For more information, see REST ' 'Authentication and SOAP Authentication for details.' "'PUT\n\n\n\n/iacli-test-item60/test-replace.txt'" '18a9c5ea-088f-42f5-9fcf-70651cc085ca' '') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test_ia_upload.py', body=s3_error, status=403, content_type='text/plain') ia_call(['ia', 'upload', 'nasa', __file__], expected_exit_code=1) _out, err = capsys.readouterr() assert 'error uploading test_ia_upload.py' in err def test_ia_upload_invalid_cmd(capsys): ia_call(['ia', 'upload', 'nasa', 'nofile.txt'], expected_exit_code=2) _out, err = capsys.readouterr() assert "'nofile.txt' is not a valid file or directory" in err def test_ia_upload_size_hint(capsys, tmpdir_ch, nasa_mocker): with open('test.txt', 'w') as fh: fh.write('foo') ia_call(['ia', 'upload', '--debug', '--size-hint', '30', 'nasa', 'test.txt']) _out, err = capsys.readouterr() assert 'User-Agent' in err assert 's3.us.archive.org/nasa/test.txt' in err assert 'x-archive-size-hint:30' in err assert 'Accept:*/*' in err assert 'Authorization:LOW ' in err assert 'Connection:close' in err assert 'Content-Length:3' in err assert 'Accept-Encoding:gzip, deflate' in err def test_ia_upload_automatic_size_hint_files(capsys, tmpdir_ch, nasa_mocker): with open('foo', 'w') as fh: fh.write('foo') with open('bar', 'w') as fh: fh.write('bar') ia_call(['ia', 'upload', '--debug', 'nasa', 'foo', 'bar']) _out, err = capsys.readouterr() assert 'x-archive-size-hint:6' in err def test_ia_upload_automatic_size_hint_dir(capsys, tmpdir_ch, nasa_mocker): with open('foo', 'w') as fh: fh.write('foo') with open('bar', 'w') as fh: fh.write('bar') ia_call(['ia', 'upload', '--debug', 'nasa', '.'], expected_exit_code=2) _out, err = capsys.readouterr() assert 'x-archive-size-hint:115' in err def test_ia_upload_unicode(tmpdir_ch, caplog): with open('தமிழ் - baz ∆.txt', 'w') as fh: fh.write('unicode foo') efname = '%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%20-%20baz%20%E2%88%86.txt' with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: 
rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/{efname}', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', 'nasa', 'தமிழ் - baz ∆.txt', '--metadata', 'foo:∆']) assert (f'uploaded தமிழ் - baz ∆.txt to {PROTOCOL}//s3.us.archive.org/nasa/' '%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D%20-%20' 'baz%20%E2%88%86.txt') in caplog.text def test_ia_upload_remote_name(tmpdir_ch, caplog): with open('test.txt', 'w') as fh: fh.write('foo') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/hi.txt', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', 'nasa', 'test.txt', '--remote-name', 'hi.txt']) assert f'uploaded hi.txt to {PROTOCOL}//s3.us.archive.org/nasa/hi.txt' in caplog.text def test_ia_upload_stdin(tmpdir_ch, caplog): @contextmanager def replace_stdin(f): original_stdin = sys.stdin sys.stdin = f try: yield finally: sys.stdin = original_stdin with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/hi.txt', body='', content_type='text/plain') with replace_stdin(StringIO('foo')): ia_call(['ia', '--log', 'upload', 'nasa', '-', '--remote-name', 'hi.txt']) assert f'uploaded hi.txt to {PROTOCOL}//s3.us.archive.org/nasa/hi.txt' in caplog.text def test_ia_upload_inexistent_file(tmpdir_ch, capsys, caplog): ia_call(['ia', 'upload', 'foo', 'test.txt'], expected_exit_code=2) _out, err = capsys.readouterr() assert "'test.txt' is not a valid file or directory" in err def test_ia_upload_spreadsheet(tmpdir_ch, capsys): with open('foo.txt', 'w') as fh: fh.write('foo') with open('test.txt', 'w') as fh: fh.write('bar') with open('test.csv', 'w') as fh: fh.write('identifier,file,REMOTE_NAME\n') fh.write('nasa,foo.txt,\n') fh.write(',test.txt,bar.txt\n') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/foo.txt', body='', content_type='text/plain') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/bar.txt', body='', content_type='text/plain') ia_call(['ia', 'upload', '--spreadsheet', 'test.csv']) _out, err = capsys.readouterr() assert 'uploading foo.txt' in err assert 'uploading bar.txt' in err def test_ia_upload_spreadsheet_item_column(tmpdir_ch, capsys): with open('test.txt', 'w') as fh: fh.write('foo') with open('test.csv', 'w') as fh: fh.write('item,file\n') fh.write('nasa,test.txt\n') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test.txt', body='', content_type='text/plain') ia_call(['ia', 'upload', '--spreadsheet', 'test.csv']) _out, err = capsys.readouterr() assert 'uploading test.txt' in err def test_ia_upload_spreadsheet_item_and_identifier_column(tmpdir_ch, capsys): # item is preferred, and both are discarded with open('test.txt', 'w') as fh: fh.write('foo') with open('test.csv', 'w') as fh: fh.write('item,identifier,file\n') fh.write('nasa,uhoh,test.txt\n') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test.txt', body='', content_type='text/plain') ia_call(['ia', 'upload', '--spreadsheet', 'test.csv']) # Verify that the item and identifier columns are not in the PUT request headers putCalls = [c for c in rsps.calls if c.request.method == 'PUT'] assert len(putCalls) == 1 assert 'x-archive-meta00-identifier' not in putCalls[0].request.headers assert 
'x-archive-meta00-item' not in putCalls[0].request.headers _out, err = capsys.readouterr() assert 'uploading test.txt' in err def test_ia_upload_spreadsheet_missing_identifier(tmpdir_ch, capsys, caplog): with open('test.txt', 'w') as fh: fh.write('foo') with open('test.csv', 'w') as fh: fh.write('file\n') fh.write('test.txt\n') ia_call(['ia', 'upload', '--spreadsheet', 'test.csv'], expected_exit_code=1) assert 'error: no identifier column on spreadsheet.' in capsys.readouterr().err def test_ia_upload_spreadsheet_empty_identifier(tmpdir_ch, capsys, caplog): with open('test.txt', 'w') as fh: fh.write('foo') with open('test.csv', 'w') as fh: fh.write('identifier,file\n') fh.write(',test.txt\n') ia_call(['ia', 'upload', '--spreadsheet', 'test.csv'], expected_exit_code=1) assert 'error: no identifier column on spreadsheet.' in capsys.readouterr().err def test_ia_upload_spreadsheet_bom(tmpdir_ch, capsys): with open('test.txt', 'w') as fh: fh.write('foo') with open('test.csv', 'wb') as fh: fh.write(b'\xef\xbb\xbf') fh.write(b'identifier,file\n') fh.write(b'nasa,test.txt\n') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test.txt', body='', content_type='text/plain') ia_call(['ia', 'upload', '--spreadsheet', 'test.csv']) _out, err = capsys.readouterr() assert 'uploading test.txt' in err def test_ia_upload_checksum(tmpdir_ch, caplog): with open('test.txt', 'w') as fh: fh.write('foo') # First upload, file not in metadata yet with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test.txt', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', 'nasa', 'test.txt', '--checksum']) assert f'uploaded test.txt to {PROTOCOL}//s3.us.archive.org/nasa/test.txt' in caplog.text caplog.clear() # Second upload with file in metadata def insert_test_txt(body): body = json.loads(body) body['files'].append({'name': 'test.txt', 'md5': 'acbd18db4cc2f85cedef654fccc4a4d8'}) return json.dumps(body) with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa', transform_body=insert_test_txt) ia_call(['ia', '--log', 'upload', 'nasa', 'test.txt', '--checksum'], expected_exit_code=1) assert f'test.txt already exists: {PROTOCOL}//s3.us.archive.org/nasa/test.txt' in caplog.text caplog.clear() # Second upload with spreadsheet with open('test.csv', 'w') as fh: fh.write('identifier,file\n') fh.write('nasa,test.txt\n') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa', transform_body=insert_test_txt) ia_call(['ia', '--log', 'upload', '--spreadsheet', 'test.csv', '--checksum'], expected_exit_code=1) assert f'test.txt already exists: {PROTOCOL}//s3.us.archive.org/nasa/test.txt' in caplog.text def test_ia_upload_keep_directories(tmpdir_ch, caplog): os.mkdir('foo') with open('foo/test.txt', 'w') as fh: fh.write('foo') with open('test.csv', 'w') as fh: fh.write('identifier,file\n') fh.write('nasa,foo/test.txt\n') # Default behaviour with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test.txt', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', 'nasa', 'foo/test.txt']) assert f'uploaded test.txt to {PROTOCOL}//s3.us.archive.org/nasa/test.txt' in caplog.text caplog.clear() with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/test.txt', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', 
'--spreadsheet', 'test.csv']) assert f'uploaded test.txt to {PROTOCOL}//s3.us.archive.org/nasa/test.txt' in caplog.text caplog.clear() # With the option with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/foo/test.txt', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', 'nasa', 'foo/test.txt', '--keep-directories']) assert f'uploaded foo/test.txt to {PROTOCOL}//s3.us.archive.org/nasa/foo/test.txt' in caplog.text caplog.clear() with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.PUT, f'{PROTOCOL}//s3.us.archive.org/nasa/foo/test.txt', body='', content_type='text/plain') ia_call(['ia', '--log', 'upload', '--spreadsheet', 'test.csv', '--keep-directories']) assert f'uploaded foo/test.txt to {PROTOCOL}//s3.us.archive.org/nasa/foo/test.txt' in caplog.text python-internetarchive-5.7.2/tests/conftest.py000066400000000000000000000065651513674652200216260ustar00rootroot00000000000000import os import re import sys from subprocess import PIPE, Popen import pytest import responses from responses import RequestsMock from internetarchive import get_session from internetarchive.api import get_item from internetarchive.cli import ia from internetarchive.utils import json PROTOCOL = 'https:' BASE_URL = 'https://archive.org/' METADATA_URL = f'{BASE_URL}metadata/' ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) TEST_CONFIG = os.path.join(ROOT_DIR, 'tests/ia.ini') NASA_METADATA_PATH = os.path.join(ROOT_DIR, 'tests/data/metadata/nasa.json') NASA_EXPECTED_FILES = { 'globe_west_540.jpg', 'globe_west_540_thumb.jpg', 'nasa_archive.torrent', 'nasa_meta.sqlite', 'nasa_files.xml', 'nasa_meta.xml', 'nasa_reviews.xml', 'nasa_itemimage.jpg', '__ia_thumb.jpg', } def ia_call(argv, expected_exit_code=0): # Use a test config for all `ia` tests. 
argv.insert(1, '--config-file') argv.insert(2, TEST_CONFIG) sys.argv = argv try: ia.main() except SystemExit as exc: exit_code = exc.code if exc.code else 0 assert exit_code == expected_exit_code def files_downloaded(path): found_files = set() try: found_files = set(os.listdir(path)) except OSError: pass return found_files def load_file(filename): with open(filename) as fh: return fh.read() def load_test_data_file(filename): return load_file(os.path.join(ROOT_DIR, 'tests/data/', filename)) def call_cmd(cmd, expected_exit_code=0): proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) # noqa: S602 stdout, stderr = proc.communicate() stdout = stdout.decode('utf-8').strip() stderr = stderr.decode('utf-8').strip() if proc.returncode != expected_exit_code: print(stdout) print(stderr) assert proc.returncode == expected_exit_code return (stdout, stderr) class IaRequestsMock(RequestsMock): def add_metadata_mock(self, identifier, body=None, method=responses.GET, protocol='https?', transform_body=None): url = re.compile(f'{protocol}://archive.org/metadata/{identifier}') if body is None: body = load_test_data_file(f'metadata/{identifier}.json') if transform_body: body = transform_body(body) self.add(method, url, body=body, content_type='application/json') def mock_all_downloads(self, num_calls=1, body='test content', protocol='https?'): url = re.compile(f'{protocol}://archive.org/download/.*') for _ in range(6): self.add(responses.GET, url, body=body) @pytest.fixture def tmpdir_ch(tmpdir): tmpdir.chdir() return tmpdir @pytest.fixture def nasa_mocker(): with IaRequestsMock() as mocker: mocker.add_metadata_mock('nasa') yield mocker @pytest.fixture def nasa_item(): session = get_session() with IaRequestsMock() as mocker: mocker.add_metadata_mock('nasa') yield session.get_item('nasa') @pytest.fixture def session(): return get_session(config={'s3': {'access': 'access', 'secret': 'secret'}}) @pytest.fixture def nasa_metadata(): return json.loads(load_test_data_file('metadata/nasa.json')) # TODO: Why is this function defined twice in this file? See issue #505 @pytest.fixture # type: ignore def nasa_item(nasa_mocker): # noqa: F811 return get_item('nasa') python-internetarchive-5.7.2/tests/data/000077500000000000000000000000001513674652200203245ustar00rootroot00000000000000python-internetarchive-5.7.2/tests/data/advanced_search_response.json000066400000000000000000001071341513674652200262350ustar00rootroot00000000000000{"responseHeader":{"status":0,"QTime":216,"params":{"wt":"json","rows":"50","qin":"collection:nasa","fl":"","start":"0","q":"collection:nasa"}},"response":{"numFound":50,"start":0,"maxScore":5.409824,"docs":[{"mediatype":"movies","title":"Red rover Goes To Mars-Student Astronaut B-Roll Compilation","description":"A brief compilation of b-roll of Student Astronauts Courtney Dressing and Rafael Morozowski working with the Spirit rover team. Master: DVCProHD.","year":2004,"downloads":2,"week":0,"month":0,"identifier":"1redRoverGoesToMars-studentAstronautB-rollCompilation","format":["Metadata"],"collection":["nasa"],"creator":["NASA/JPL-Caltech"],"score":5.409824},{"description":"Douglas XP-91 Airplane wing. Investigation of stability and control characteristics of a full scale model. 
Side view.","mediatype":"image","title":"A-11464","year":1947,"publicdate":"2010-06-30T19:35:59Z","downloads":28,"week":0,"month":0,"identifier":"A-11464","subject":["Where -- Douglas"],"format":["JPEG Thumb","Metadata","TIFF"],"collection":["nasa"],"creator":["NASA Ames Research Center"],"score":5.409824},{"mediatype":"movies","title":"Red rover Goes To Mars-Student Astronaut B-Roll Compilation","description":"A brief compilation of b-roll of Student Astronauts Courtney Dressing and Rafael Morozowski working with the Spirit rover team. Master: DVCProHD.","year":2004,"publicdate":"2009-07-10T01:41:17Z","downloads":12,"week":0,"month":0,"identifier":"Archive_test_redRoverGoesToMars-studentAstronautB-rollCompilation","format":["Metadata"],"collection":["nasa"],"creator":["NASA/JPL-Caltech"],"score":5.409824},{"mediatype":"movies","title":"Physics of Toys","description":"NASA video","downloads":24,"week":0,"month":0,"identifier":"PhysicsOfToys","subject":["nasa","toys","physics"],"format":["Metadata"],"collection":["nasa"],"score":5.409824},{"mediatype":"movies","title":"Vice President Dick Cheney Visits JPL-Video File","description":"Vice President Dick Cheney addressed the Mars Exploration Rover Project Team and the JPL community. NASA's Deputy Administrator Fred Gregory and JPL's Director Dr. Charles Elachi also addresses the JPL community. Master: DVCPro25.Audio 1: Mono mix 2: Mono mix.","year":2004,"downloads":4,"week":0,"month":0,"identifier":"1vicePresidentDickCheneyVisitsJpl-videoFile","format":["Metadata"],"collection":["nasa"],"creator":["NASA/JPL-Caltech"],"score":5.409824},{"mediatype":"movies","title":"Red rover Goes To Mars-Student Astronaut B-Roll Compilation","description":"A brief compilation of b-roll of Student Astronauts Courtney Dressing and Rafael Morozowski working with the Spirit rover team. Master: DVCProHD.","year":2004,"publicdate":"2009-07-10T18:17:02Z","downloads":20,"week":0,"month":0,"identifier":"Nup_test_redRoverGoesToMars-studentAstronautB-rollCompilation","format":["Archive BitTorrent","MPEG1","Metadata"],"collection":["nasa"],"creator":["NASA/JPL-Caltech"],"score":5.409824},{"title":"Physics of Toys","mediatype":"movies","description":"NASA educational video","publicdate":"2010-04-23T17:51:37Z","downloads":18,"week":0,"month":0,"identifier":"PhysicsOfToys_174","subject":["nasa","physics"],"format":["Metadata"],"collection":["nasa"],"score":5.409824},{"mediatype":"movies","title":"Red rover Goes To Mars-Student Astronaut B-Roll Compilation","description":"A brief compilation of b-roll of Student Astronauts Courtney Dressing and Rafael Morozowski working with the Spirit rover team. Master: DVCProHD.","year":2004,"publicdate":"2009-07-09T05:14:49Z","downloads":24,"week":0,"month":0,"identifier":"Test_localredRoverGoesToMars-studentAstronautB-rollCompilation","format":["512Kb MPEG4","Animated GIF","MPEG1","Metadata","Ogg Video","Thumbnail"],"collection":["nasa"],"creator":["NASA/JPL-Caltech"],"score":5.409824},{"mediatype":"movies","title":"MER \"Spirit\" Stand Up Compilation","description":"MER Team reacts to confirmation of stand up deployment of Spirit rover. Master: DVCProHD. Audio 1: Mono mix 2: Mono mix.","year":2004,"downloads":3,"week":0,"month":0,"identifier":"1merspiritStandUpCompilation","format":["Metadata"],"collection":["nasa"],"creator":["NASA/JPL-Caltech"],"score":5.409824},{"date":"2010-09-13T00:00:00Z","description":"NASA Administrator Charles F. Bolden speaks at the Green Aviation Summit, hosted Sept. 
8-9, 2010, by the Aeronautics Research Mission Directorate at NASA's Ames Research Center in Moffett Field, Calif. Experts from NASA, other federal organizations, industry and academia gathered to discuss groundbreaking solutions that NASA and its research partners are developing to reduce aircraft noise, emissions and fuel consumption, and to ensure the safe and manageable growth of aviation. Image Credit: NASA/Eric James","mediatype":"image","source":"http://www.nasa.gov/images/content/481213main_gas_bolden_ames_full.jpg","title":"Administrator Addresses Green Aviation Summit","year":2010,"publicdate":"2010-11-21T00:50:43Z","downloads":84,"week":0,"month":0,"identifier":"481213main_gas_bolden_ames","subject":["Where -- Ames Research Center (ARC)"],"format":["Archive BitTorrent","JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA"],"score":5.409824},{"title":"SOFIA Test Flight","mediatype":"image","description":"An F/A-18 mission support aircraft shadows NASA's Stratospheric Observatory for Infrared Astronomy 747SP during a functional check flight in restricted airspace near Edwards Air Force Base and the Dryden Flight Research Center on Dec. 9, 2009. The flight included an evaluation of the aircraft's systems, including engines, flight controls and communication. Credit: NASA / Jim Ross","publicdate":"2010-03-01T21:21:24Z","downloads":9,"week":0,"month":0,"identifier":"411090main_ED09-0342-03-1-_full.jpg","subject":["Edwards Air Force Base"],"format":["Metadata"],"collection":["nasa"],"score":5.409824},{"title":"NASA Chief Technologist Hosts Town Hall","mediatype":"image","description":"NASA's Chief Technologists, Bobby Braun, hosts a Town Hall meeting to discuss agency-wide technology policy and programs at NASA Headquarters on Tuesday, May 25, 2010, in Washington. Photo Credit: NASA/Carla Cioffi","year":2010,"publicdate":"2010-06-08T18:58:46Z","downloads":32,"week":0,"month":0,"identifier":"201005250003HQ","subject":["Where -- NASA Headquarters","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Carla Cioffi"],"score":5.409824},{"description":"The Soyuz TMA-16 spacecraft is seen as it lands with Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:19:49Z","downloads":50,"week":0,"month":0,"identifier":"201003180014HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"Soyuz TMA-17 Lands","mediatype":"image","description":"Expedition 23 Flight Engineer T.J. Creamer is carried in a chair to the medical tent just minutes after he and fellow crew members Soichi Noguchi and Commander Oleg Kotov landed in their Soyuz TMA-17 capsule near the town of Zhezkazgan, Kazakhstan on Wednesday, June 2, 2010. NASA Astronaut Creamer, Russian Cosmonaut Kotov and Japanese Astronaut Noguchi are returning from six months onboard the International Space Station where they served as members of the Expedition 22 and 23 crews. 
Photo Credit: NASA/Bill Ingalls","year":2010,"publicdate":"2010-06-08T19:01:43Z","downloads":51,"week":0,"month":0,"identifier":"201006020009HQ","subject":["Who -- Soichi Noguchi","Who -- Oleg Kotov","What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"The Soyuz TMA-16 spacecraft is seen as it lands with Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:20:34Z","downloads":50,"week":0,"month":0,"identifier":"201003180008HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"Soyuz TMA-17 Lands","mediatype":"image","description":"Expedition 23 Flight Engineer T.J. Creamer is seen sitting in a chair outside the Soyuz Capsule just minutes after he and fellow crew members Soichi Noguchi and Commander Oleg Kotov landed in their Soyuz TMA-17 capsule near the town of Zhezkazgan, Kazakhstan on Wednesday, June 2, 2010. NASA Astronaut Creamer, Russian Cosmonaut Kotov and Japanese Astronaut Noguchi are returning from six months onboard the International Space Station where they served as members of the Expedition 22 and 23 crews. Photo Credit: NASA/Bill Ingalls","year":2010,"publicdate":"2010-06-08T19:01:06Z","downloads":54,"week":0,"month":0,"identifier":"201006020004HQ","subject":["Who -- Soichi Noguchi","Who -- Oleg Kotov","What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"Expedition 24 Crew Departs Russia","mediatype":"image","description":"Expedition 24 Backup crew members, from left, Italian astronaut Paolo Nespoli, Russian cosmonaut Dmitri Kondratiev, and U.S. astronaut Catherine Coleman talk to the press with Expedition 24 prime crew members from left, U.S. astronaut Shannon Walker, Russian cosmonaut Fyodor Yurchikhin, and U.S. astronaut Doug Wheelock, during an official farewell ceremony before their departure to Baikonur, Kazakhstan from the Gagarin Cosmonaut Training Center in Star City outside Moscow June 3, 2010. Wheelock, Walker and Yurchikhin are scheduled to fly to the International Space Station ISS in a Soyuz TMA-19 spacecraft June 16, 2010. Photo Credit: NASA/Bill Ingalls","year":2010,"publicdate":"2010-06-08T19:02:14Z","downloads":47,"week":0,"month":0,"identifier":"201006030003HQ","subject":["Who -- Catherine Coleman","Who -- Fyodor Yurchikhin","What -- International Space Station ISS","What -- Soyuz TM","Where -- Kazakhstan","Where -- Moscow"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"NASA SCI Files - Hurricane Paths And Landfall","description":"NASA Why? 
Files segment explaining how the NOAA predicts hurricane severity and travel patterns.","source":"http://nasa.ibiblio.org/details.php?videoid=6227&start=100&subject=Science","mediatype":"movies","publicdate":"2009-07-08T23:08:24Z","downloads":140,"week":0,"month":0,"identifier":"NasaSciFiles-HurricanePathsAndLandfall","subject":["NOAA: National Oceanic And Atmospheric Administration","Michelle Meinelli","Prediction","Decadal Scale","Circulation","Air","Landfall"],"format":["Animated GIF","Archive BitTorrent","MPEG1","Metadata","Ogg Video","Thumbnail","Video Index","h.264"],"language":["eng"],"collection":["nasa"],"creator":["NASA LaRC Office of Education"],"score":5.409824},{"title":"NASA Chief Technologist Hosts Town Hall","mediatype":"image","description":"Bobby Braun, right, NASA's Chief Technologist, answers questions during a Town Hall meeting to discuss agency-wide technology policy and programs at NASA Headquarters on Tuesday, May 25, 2010, in Washington. Photo Credit: NASA/Carla Cioffi","year":2010,"publicdate":"2010-06-08T18:58:54Z","downloads":54,"week":0,"month":0,"identifier":"201005250005HQ","subject":["Where -- NASA Headquarters","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Carla Cioffi"],"score":5.409824},{"title":"House Hearing NASA Human Spaceflight Plan","mediatype":"image","description":"Apollo 11 Commander Neil Armstrong, left, testifies during a hearing before the House Science and Technology Committee, Tuesday, May 26, 2010, at the Rayburn House office building on Capitol Hill in Washington as retired Navy Captain and commander of Apollo 17 Eugene Cernan looks on. The hearing was to review proposed human spaceflight plan by NASA. Photo Credit: NASA/Paul E. Alers","year":2010,"publicdate":"2010-06-08T18:59:46Z","downloads":40,"week":0,"month":0,"identifier":"201005260004HQ","subject":["Who -- Neil A. Armstrong","What -- Apollo 11","What -- Apollo 17","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. Alers"],"score":5.409824},{"title":"House Hearing NASA Human Spaceflight Plan","mediatype":"image","description":"Apollo 11 Commander Neil Armstrong, left, and retired Navy Captain and commander of Apollo 17 Eugene Cernana, confer prior to testifying at a hearing before the House Science and Technology Committee, Tuesday, May 26, 2010, at the Rayburn House office building on Capitol Hill in Washington. The hearing was to review proposed human spaceflight plan by NASA. Photo Credit: NASA/Paul E. Alers","year":2010,"publicdate":"2010-06-08T19:00:10Z","downloads":51,"week":1,"month":1,"identifier":"201005260007HQ","subject":["Who -- Neil A. Armstrong","What -- Apollo 11","What -- Apollo 17","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. Alers"],"score":5.409824},{"title":"House Hearing NASA Human Spaceflight Plan","mediatype":"image","description":"Retired Navy Captain and commander of Apollo 17 Eugene Cernan, center, is flanked by Apollo 11 Commander Neil Armstrong, left, and A. Thomaas Young, as he testifies during a hearing before the House Science and Technology Committee, Tuesday, May 26, 2010, at the Rayburn House office building on Capitol Hill in Washington. The hearing was to review proposed human spaceflight plan by NASA. Photo Credit: NASA/Paul E. 
Alers","year":2010,"publicdate":"2010-06-08T19:00:27Z","downloads":46,"week":1,"month":1,"identifier":"201005260009HQ","subject":["Who -- Neil A. Armstrong","What -- Apollo 11","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. Alers"],"score":5.409824},{"title":"NASA SCI Files - Hurricane Game","description":"NASA Why? Files segment involving students in an activity to track the paths of hurricanes.","source":"http://nasa.ibiblio.org/details.php?videoid=6226&start=100&subject=Science","mediatype":"movies","publicdate":"2009-07-08T23:08:03Z","downloads":143,"week":0,"month":0,"identifier":"NasaSciFiles-HurricaneGame","subject":["Niki Srigley","Ashaundria Greene","Molly Gallina","Thompson Elementary School","Tracking","Track Chart","Coordinate","Landfall","Hurricane Watch","Hurricane Warning","Why? Files Kids' Club"],"format":["Animated GIF","Archive BitTorrent","MPEG1","Metadata","Ogg Video","Thumbnail","Video Index","h.264"],"language":["eng"],"collection":["nasa"],"creator":["NASA LaRC Office of Education"],"score":5.409824},{"description":"The Soyuz TMA-16 spacecraft is seen as it lands with Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:20:11Z","downloads":50,"week":1,"month":1,"identifier":"201003180011HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"Expedition 22 Flight Engineer Maxim Suraev is carried in a chair to the medical tent shortly after he and Flight Engineer Maxim Suraev landed in their Soyuz TMA-16 capsule near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:19:03Z","downloads":44,"week":1,"month":1,"identifier":"201003180003HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"NASA SCI Files - Clouds","description":"NASA Why? Files segment describing different cloud types and how they form.","source":"http://nasa.ibiblio.org/details.php?videoid=6221&start=100&subject=Science","mediatype":"movies","publicdate":"2009-07-08T23:07:46Z","downloads":264,"week":1,"month":1,"identifier":"NasaSciFiles-Clouds","subject":["Dr. 
Lin Chambers","Water Vapor","Condensation","Cirrus","Cumulous","Stratus","S'cool","Students' Cloud Observations On-Line"],"format":["Animated GIF","Archive BitTorrent","MPEG1","Metadata","Ogg Video","Thumbnail","h.264"],"language":["eng"],"collection":["nasa"],"creator":["NASA LaRC Office of Education"],"score":5.409824},{"title":"NASA Chief Technologist Hosts Town Hall","mediatype":"image","description":"Bobby Braun, NASA's Chief Technologist, answers questions during a Town Hall meeting to discuss agency-wide technology policy and programs at NASA Headquarters on Tuesday, May 25, 2010, in Washington.","year":2010,"publicdate":"2010-06-08T18:58:13Z","downloads":37,"week":1,"month":1,"identifier":"201005250006HQ","subject":["Where -- NASA Headquarters","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Carla Cioffi"],"score":5.409824},{"description":"Expedition 22 Flight Engineer Maxim Suraev rests in a chair outside the Soyuz Capsule just minutes after he and Commander Jeff Williams landed in their Soyuz TMA-16 spacecraft near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:22:13Z","downloads":57,"week":1,"month":1,"identifier":"201003180019HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"Expedition 22 Commander Jeff Williams talks on a satellite phone as he is carried in a chair to the medical tent shortly after he and Flight Engineer Maxim Suraev landed in their Soyuz TMA-16 capsule near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:19:11Z","downloads":47,"week":1,"month":1,"identifier":"201003180004HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"The Soyuz TMA-16 spacecraft is seen as it lands with Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. 
Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:19:41Z","downloads":51,"week":1,"month":1,"identifier":"201003180015HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"The first three Russian Search and Recovery helicopters are seen on the ground as support teams arrive to the landing of Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev in their Soyuz TMA-16 spacecraft near Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:19:19Z","downloads":49,"week":1,"month":1,"identifier":"201003180018HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"Expedition 22 Flight Engineer Maxim Suraev rests in a chair outside the Soyuz Capsule just minutes after he and Commander Jeff Williams landed in their Soyuz TMA-16 spacecraft near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:21:58Z","downloads":58,"week":1,"month":1,"identifier":"201003180021HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"Super Bowl XLIV Game Coin which flew on STS-129","mediatype":"image","description":"Super Bowl XLIV Game Coin which flew on STS-129","year":2010,"publicdate":"2010-04-06T23:19:42Z","downloads":85,"week":1,"month":1,"identifier":"C2010_00310","subject":["NFL","National Football League","Super Bowl XLIV","Silver medallion","What -- STS-129","Canton, Ohio","Glenn Research Center"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Marv Smith"],"score":5.409824},{"date":"2010-01-01T00:00:00Z","description":"Clarence Sams explains what we learn about the human body and exploration by conducting missions in low Earth orbit onboard the International Space Station.","mediatype":"movies","title":"Lessons Learned for Exploration","year":2010,"publicdate":"2010-07-09T23:58:48Z","downloads":75,"week":1,"month":1,"identifier":"cxemm_earthclip02_1080i","subject":["What -- Earth","What -- International Space Station ISS"],"format":["512Kb MPEG4","Animated GIF","Metadata","Ogg Video","QuickTime","Thumbnail"],"collection":["nasa"],"creator":["nasa"],"score":5.409824},{"title":"House Hearing NASA Human Spaceflight Plan","mediatype":"image","description":"Retired Navy Captain and commander of Apollo 17 Eugene Cernan, center, testifies during a hearing before the House Science and Technology Committee, Tuesday, May 26, 2010, at the Rayburn House office 
building on Capitol Hill in Washington as Apollo 11 Commander Neil Armstrong, left, looks on. The hearing was to review proposed human spaceflight plan by NASA. Photo Credit: NASA/Paul E. Alers","year":2010,"publicdate":"2010-06-08T19:00:02Z","downloads":74,"week":1,"month":1,"identifier":"201005260006HQ","subject":["Who -- Neil A. Armstrong","What -- Apollo 11","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. Alers"],"score":5.409824},{"title":"Tethered Satellites Forces and Motion","mediatype":"movies","description":"NASA video","publicdate":"2010-04-20T04:01:04Z","downloads":71,"week":1,"month":1,"identifier":"TetheredSatellitsForcesAndMotion","subject":["education","nasa","science"],"format":["512Kb MPEG4","Animated GIF","Metadata","Ogg Video","Thumbnail","Windows Media"],"collection":["nasa"],"score":5.409824},{"description":"Producer/Director Toni Myers speaks to the audience prior to the World Premiere of 'Hubble 3D', screened at the Smithsonian's Air and Space Museum Tuesday evening, March 9, 2010, in Washington. Photo Credit: NASA/Paul E. Alers","mediatype":"image","title":"Hubble IMAX Premier","year":2010,"publicdate":"2010-04-06T20:18:17Z","downloads":63,"week":1,"month":1,"identifier":"201003090004HQ","subject":["Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. Alers"],"score":5.409824},{"description":"Moviegoers wear 3D glasses as they watch the World Premiere of 'Hubble 3D', screened at the Smithsonian's Air and Space Museum Tuesday evening, March 9, 2010, in Washington. Photo Credit: NASA/Paul E. Alers","mediatype":"image","title":"Hubble IMAX Premier","year":2010,"publicdate":"2010-04-06T20:18:08Z","downloads":70,"week":1,"month":1,"identifier":"201003090003HQ","subject":["Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. Alers"],"score":5.409824},{"date":"2010-06-17T00:00:00Z","description":"NASA Administrator Charles Bolden, met with professors and graduate students of Science programs at Cairo University, met with the Egyptian Ministry of Higher Education and Scientific Research, and gave a presentation at the American University in Cairo called ''Developments in Space Exploration and the Future of International Cooperation.'' He also talked with secondary school students and teachers, and made a visit to a conference on remote sensing funded jointly by the U.S.-Egypt Science and Technology Fund. His visit to Egypt comes shortly after the first anniversary of President Barack Obama's historic visit to Cairo in which he launched his New Beginning Initiative to enhance cooperation and understanding between the U.S. and the Muslim world, especially in the fields of science and technology. This visit also follows visits by other senior American figures in the fields of science and technology, including the Presidential Science and Technology Envoy Dr. Ahmed Zewail, to explore means for future cooperation between the U.S. and Muslim majority countries. Credit: U.S. 
Embassy, Cairo","mediatype":"image","source":"http://www.nasa.gov/images/content/463755main_bolden_cairo_full.jpg","title":"Administrator Bolden in Cairo","year":2010,"publicdate":"2010-11-21T00:50:56Z","downloads":85,"week":1,"month":1,"identifier":"463755main_bolden_cairo","subject":["Who -- Charles Bolden","Who -- Barack Obama","Where -- Cairo","Where -- Egypt"],"format":["Archive BitTorrent","JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA"],"score":5.409824},{"title":"House Hearing NASA Human Spaceflight Plan","mediatype":"image","description":"Apollo 11 Commander Neil Armstrong makes a point as he testifies during a hearing before the House Science and Technology Committee, Tuesday, May 26, 2010, at the Rayburn House office building on Capitol Hill in Washington. The hearing was to review proposed human spaceflight plan by NASA. Photo Credit: NASA/Paul E. Alers","year":2010,"publicdate":"2010-06-08T19:00:18Z","downloads":52,"week":0,"month":1,"identifier":"201005260008HQ","subject":["Who -- Neil A. Armstrong","What -- Apollo 11","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. Alers"],"score":5.409824},{"title":"Soyuz TMA-17 Lands","mediatype":"image","description":"Expedition 23 Commander Oleg Kotov is seen sitting in a chair outside the Soyuz Capsule just minutes after he and fellow crew members T.J. Creamer and Soichi Noguchi landed in their Soyuz TMA-17 capsule near the town of Zhezkazgan, Kazakhstan on Wednesday, June 2, 2010. NASA Astronaut Creamer, Russian Cosmonaut Kotov and Japanese Astronaut Noguchi are returning from six months onboard the International Space Station where they served as members of the Expedition 22 and 23 crews. Photo Credit: NASA/Bill Ingalls","year":2010,"publicdate":"2010-06-08T19:01:13Z","downloads":40,"week":0,"month":0,"identifier":"201006020005HQ","subject":["Who -- Oleg Kotov","Who -- Soichi Noguchi","What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"Expedition 22 Commander Jeff Williams, left, and NASA Flight Surgeon Steve Gilmore try to catch a glimpse of the Soyuz TMA-16 capsule out the window of their helicopter as it departs the landing scene after Williams and Flight Engineer Maxim Suraev landed in their Soyuz TMA-16 capsule near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. 
Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:21:04Z","downloads":65,"week":0,"month":1,"identifier":"201003180028HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"date":"2009-01-01T00:00:00Z","description":"ASA centers across the United States are working together to design, test and manufacture hardware for this new era in human space exploration.","mediatype":"movies","title":"Constellation Quarterly Report June 2009","year":2009,"publicdate":"2010-07-09T23:58:11Z","downloads":55,"week":0,"month":1,"identifier":"cx_quarterly_june2009_720p","subject":["Where -- United States of America"],"format":["512Kb MPEG4","Animated GIF","Metadata","Ogg Video","QuickTime","Thumbnail"],"collection":["nasa"],"creator":["nasa"],"score":5.409824},{"description":"Support teams are seen at the landing site of the Soyuz TMA-16 that landed with Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev near Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:21:27Z","downloads":56,"week":0,"month":0,"identifier":"201003180025HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"Super Bowl XLIV Game Coin which flew on STS-129","mediatype":"image","description":"Super Bowl XLIV Game Coin which flew on STS-129","year":2010,"publicdate":"2010-04-06T23:19:34Z","downloads":83,"week":0,"month":0,"identifier":"C2010_00308","subject":["NFL","National Football League","Super Bowl XLIV","Silver medallion","What -- STS-129","Canton, Ohio","Glenn Research Center"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Marv Smith"],"score":5.409824},{"description":"Expedition 22 Commander Jeff Williams, left, and Flight Engineer Maxim Suraev sit in chairs outside the Soyuz Capsule just minutes after they landed near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:18:48Z","downloads":41,"week":0,"month":0,"identifier":"201003180002HQ","subject":["What -- International Space Station ISS","What -- Soyuz TM","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"The Soyuz TMA-16 spacecraft is seen as it lands with Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev near the town of Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. 
Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:19:26Z","downloads":45,"week":0,"month":0,"identifier":"201003180017HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"description":"On Launch Pad 17-B at Cape Canaveral Air Force Station in Florida, the first half of the fairing is moved into place around NASA's Kepler spacecraft, atop the United Launch Alliance Delta II rocket. The fairing is a molded structure that fits flush with the outside surface of the rocket and forms an aerodynamically smooth nose cone, protecting the spacecraft during launch and ascent. Photo credit: NASA/Jack Pfaller","mediatype":"image","publicdate":"2009-08-19T23:48:01Z","source":"http://www.nasa.gov/mission_pages/kepler/multimedia/images/09-02-26.html","title":"315467main_keplerpayloadfairing_full","year":2009,"downloads":56,"week":0,"month":1,"identifier":"315467main_keplerpayloadfairing_full","subject":["Kepler, fairing"],"format":["Archive BitTorrent","JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA"],"score":5.409824},{"description":"Support teams are seen outside the inflatable medical tent at the landing site of the Soyuz TMA-16 that landed with Expedition 22 Commander Jeff Williams and Flight Engineer Maxim Suraev near Arkalyk, Kazakhstan on Thursday, March 18, 2010. NASA Astronaut Jeff Williams and Russian Cosmonaut Maxim Suraev are returning from six months onboard the International Space Station where they served as members of the Expedition 21 and 22 crews. Photo Credit: NASA/Bill Ingalls","mediatype":"image","title":"Soyuz TMA-16 Lands","year":2010,"publicdate":"2010-04-06T20:21:35Z","downloads":56,"week":0,"month":1,"identifier":"201003180024HQ","subject":["What -- Soyuz TM","What -- International Space Station ISS","Where -- Kazakhstan"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Bill Ingalls"],"score":5.409824},{"title":"Hubble IMAX Premier","mediatype":"image","description":"STS-125 astronaut Mike Massimino, speaks to a reporter on the red carpet prior to the World Premiere of 'Hubble 3D', screened at the Smithsonian's Air and Space Museum Tuesday evening, March 9, 2010, in Washington. Photo Credit: NASA/Paul E. Alers","year":2010,"publicdate":"2010-04-06T20:18:25Z","downloads":52,"week":0,"month":0,"identifier":"201003090005HQ","subject":["What -- STS-125","Where -- Washington"],"format":["JPEG","JPEG Thumb","Metadata"],"collection":["nasa"],"creator":["NASA/Paul E. 
Alers"],"score":5.409824}]}} python-internetarchive-5.7.2/tests/data/metadata/000077500000000000000000000000001513674652200221045ustar00rootroot00000000000000python-internetarchive-5.7.2/tests/data/metadata/nasa.json000066400000000000000000000166051513674652200237310ustar00rootroot00000000000000{ "created": 1427273784, "d1": "ia902606.us.archive.org", "d2": "ia802606.us.archive.org", "dir": "/7/items/nasa", "files": [ { "name": "NASAarchiveLogo.jpg", "source": "original", "size": "38366", "format": "Collection Header", "mtime": "1236194645", "md5": "64dcc1092df36142eb4aab7cc255a4a6", "crc32": "c554eac4", "sha1": "e1c225a0d37282f08685905a4ba1c5d7aec5fa64" }, { "name": "globe_west_540.jpg", "source": "original", "size": "66065", "format": "JPEG", "mtime": "1245274910", "md5": "9366a4b09386bf673c447e33d806d904", "crc32": "2283b5fd", "sha1": "3e20a009994405f535cdf07cdc2974cef2fce8f2" }, { "name": "nasa_reviews.xml", "source": "metadata", "size": "168", "format": "Metadata", "mtime": "1221069786", "md5": "97d24bade73a4c02f4f0a0359cf28fa6", "crc32": "2430b438", "sha1": "a5c310d5bd9976007ee99d769ebb7c1bf6307def" }, { "name": "nasa_meta.xml", "source": "metadata", "size": "7852", "format": "Metadata", "mtime": "1423450255", "md5": "0e339f4a29a8bc42303813cbec9243e5", "crc32": "3a41fbda", "sha1": "2fbcd566449fbd717b395607c9b79ff72ded8398" }, { "name": "nasa_archive.torrent", "source": "metadata", "btih": "81ea3dbf71b379b0ef1c34d19bc91d731b8101e2", "mtime": "1423450256", "size": "1579", "md5": "385aa96809db514699523ce7c4bb65c5", "crc32": "30d618a2", "sha1": "f371b0955e5b0d9e1278555abff74b0044e71f22", "format": "Archive BitTorrent" }, { "name": "nasa_files.xml", "source": "metadata", "format": "Metadata", "md5": "c0db07dc3ca660c0eb4af0ce7fc80cd3" } ], "files_count": 6, "is_collection": true, "item_size": 114030, "metadata": { "identifier": "nasa", "title": "NASA Images", "mediatype": "collection", "collection": "movies", "description": "
\n \n
\n Search NASA Images:\n \n \n \n \n \n \n \n Advanced Search\n
\n
", "hidden": "true", "publicdate": "2008-09-10 18:05:14", "addeddate": "2008-09-10 18:03:38", "uploader": "traceyj@archive.org", "updater": [ "tracey pooh", "julielefevre", "jhornstein", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "jhornstein", "jhornstein", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "Jake Johnson" ], "updatedate": [ "2008-09-10 18:05:38", "2008-09-10 18:39:01", "2008-09-12 18:48:03", "2009-03-04 19:21:48", "2009-03-04 19:29:52", "2009-03-04 19:30:49", "2009-03-04 20:30:17", "2009-03-04 22:47:33", "2009-03-06 19:33:26", "2009-06-16 22:21:35", "2009-06-17 21:37:01", "2009-06-17 21:48:00", "2009-06-17 21:49:20", "2009-06-17 21:50:23", "2009-06-17 21:51:06", "2009-06-17 21:51:31", "2010-05-11 14:54:17", "2010-05-11 14:55:56", "2010-05-11 17:12:50", "2010-05-11 17:15:01", "2010-05-11 17:25:03", "2010-05-27 18:28:23", "2010-05-27 18:35:40", "2010-05-27 18:42:25", "2010-05-27 18:48:57", "2010-05-27 18:57:17", "2010-05-27 19:01:22", "2010-05-27 19:09:38", "2010-05-27 19:17:18", "2010-05-27 19:20:46", "2010-05-27 19:28:40", "2010-05-27 19:35:59", "2010-05-27 19:44:52", "2010-05-27 19:50:23", "2010-05-27 22:08:57", "2010-05-27 22:50:20", "2010-05-27 22:56:05", "2010-05-27 23:12:25", "2010-05-27 23:25:58", "2010-05-27 23:30:48", "2010-05-27 23:41:34", "2010-05-27 23:49:48", "2010-05-27 23:56:41", "2010-05-28 00:03:41", "2010-05-28 00:09:53", "2010-05-28 00:19:21", "2010-05-28 20:37:49", "2010-05-28 20:45:07", "2010-05-28 20:53:37", "2010-05-28 21:07:25", "2010-05-28 21:15:46", "2010-05-28 21:26:32", "2010-05-28 21:53:15", "2010-06-03 18:01:53", "2010-06-03 18:15:12", "2010-06-07 17:13:22", "2010-06-15 18:20:13", "2010-06-15 18:52:46", "2010-06-22 22:49:30", "2010-06-25 17:31:06", "2010-07-21 18:41:35", "2010-11-17 20:10:55", "2010-11-17 20:25:04", "2010-11-17 20:45:36", "2010-11-17 23:35:20", "2010-12-07 01:26:09", "2011-03-22 18:44:49", "2011-03-22 18:46:23", "2011-03-22 18:47:26", "2011-03-22 18:52:42", "2011-03-22 18:54:28", "2011-03-22 18:56:14", "2011-09-30 18:23:08" ], "rights": "NASA Images Terms & Conditions", "homepage": "http:///www.nasaimages.org", "num_recent_reviews": "10", "num_top_dl": "10", "spotlight_identifier": "APOLLO16MMLAUNCHVIEWS", "show_browse_by_date": "true", "show_hidden_subcollections": "true", "num_subcollections": "15", "related_collection": "nasa_techdocs" }, "reviews": [], "server": "ia902606.us.archive.org", "uniq": 2131998567, "updated": 1427273788, "workable_servers": [ "ia902606.us.archive.org", "ia802606.us.archive.org" ] } 
python-internetarchive-5.7.2/tests/data/nasa_meta.json000066400000000000000000000166051513674652200231570ustar00rootroot00000000000000{ "created": 1427273784, "d1": "ia902606.us.archive.org", "d2": "ia802606.us.archive.org", "dir": "/7/items/nasa", "files": [ { "name": "NASAarchiveLogo.jpg", "source": "original", "size": "38366", "format": "Collection Header", "mtime": "1236194645", "md5": "64dcc1092df36142eb4aab7cc255a4a6", "crc32": "c554eac4", "sha1": "e1c225a0d37282f08685905a4ba1c5d7aec5fa64" }, { "name": "globe_west_540.jpg", "source": "original", "size": "66065", "format": "JPEG", "mtime": "1245274910", "md5": "9366a4b09386bf673c447e33d806d904", "crc32": "2283b5fd", "sha1": "3e20a009994405f535cdf07cdc2974cef2fce8f2" }, { "name": "nasa_reviews.xml", "source": "metadata", "size": "168", "format": "Metadata", "mtime": "1221069786", "md5": "97d24bade73a4c02f4f0a0359cf28fa6", "crc32": "2430b438", "sha1": "a5c310d5bd9976007ee99d769ebb7c1bf6307def" }, { "name": "nasa_meta.xml", "source": "metadata", "size": "7852", "format": "Metadata", "mtime": "1423450255", "md5": "0e339f4a29a8bc42303813cbec9243e5", "crc32": "3a41fbda", "sha1": "2fbcd566449fbd717b395607c9b79ff72ded8398" }, { "name": "nasa_archive.torrent", "source": "metadata", "btih": "81ea3dbf71b379b0ef1c34d19bc91d731b8101e2", "mtime": "1423450256", "size": "1579", "md5": "385aa96809db514699523ce7c4bb65c5", "crc32": "30d618a2", "sha1": "f371b0955e5b0d9e1278555abff74b0044e71f22", "format": "Archive BitTorrent" }, { "name": "nasa_files.xml", "source": "metadata", "format": "Metadata", "md5": "c0db07dc3ca660c0eb4af0ce7fc80cd3" } ], "files_count": 6, "is_collection": true, "item_size": 114030, "metadata": { "identifier": "nasa", "title": "NASA Images", "mediatype": "collection", "collection": "movies", "description": "
\n \n
\n Search NASA Images:\n \n \n \n \n \n \n \n Advanced Search\n
\n
", "hidden": "true", "publicdate": "2008-09-10 18:05:14", "addeddate": "2008-09-10 18:03:38", "uploader": "traceyj@archive.org", "updater": [ "tracey pooh", "julielefevre", "jhornstein", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "BonnieReal", "jhornstein", "jhornstein", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "SJO'Connell", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "jake@archive.org", "Jake Johnson" ], "updatedate": [ "2008-09-10 18:05:38", "2008-09-10 18:39:01", "2008-09-12 18:48:03", "2009-03-04 19:21:48", "2009-03-04 19:29:52", "2009-03-04 19:30:49", "2009-03-04 20:30:17", "2009-03-04 22:47:33", "2009-03-06 19:33:26", "2009-06-16 22:21:35", "2009-06-17 21:37:01", "2009-06-17 21:48:00", "2009-06-17 21:49:20", "2009-06-17 21:50:23", "2009-06-17 21:51:06", "2009-06-17 21:51:31", "2010-05-11 14:54:17", "2010-05-11 14:55:56", "2010-05-11 17:12:50", "2010-05-11 17:15:01", "2010-05-11 17:25:03", "2010-05-27 18:28:23", "2010-05-27 18:35:40", "2010-05-27 18:42:25", "2010-05-27 18:48:57", "2010-05-27 18:57:17", "2010-05-27 19:01:22", "2010-05-27 19:09:38", "2010-05-27 19:17:18", "2010-05-27 19:20:46", "2010-05-27 19:28:40", "2010-05-27 19:35:59", "2010-05-27 19:44:52", "2010-05-27 19:50:23", "2010-05-27 22:08:57", "2010-05-27 22:50:20", "2010-05-27 22:56:05", "2010-05-27 23:12:25", "2010-05-27 23:25:58", "2010-05-27 23:30:48", "2010-05-27 23:41:34", "2010-05-27 23:49:48", "2010-05-27 23:56:41", "2010-05-28 00:03:41", "2010-05-28 00:09:53", "2010-05-28 00:19:21", "2010-05-28 20:37:49", "2010-05-28 20:45:07", "2010-05-28 20:53:37", "2010-05-28 21:07:25", "2010-05-28 21:15:46", "2010-05-28 21:26:32", "2010-05-28 21:53:15", "2010-06-03 18:01:53", "2010-06-03 18:15:12", "2010-06-07 17:13:22", "2010-06-15 18:20:13", "2010-06-15 18:52:46", "2010-06-22 22:49:30", "2010-06-25 17:31:06", "2010-07-21 18:41:35", "2010-11-17 20:10:55", "2010-11-17 20:25:04", "2010-11-17 20:45:36", "2010-11-17 23:35:20", "2010-12-07 01:26:09", "2011-03-22 18:44:49", "2011-03-22 18:46:23", "2011-03-22 18:47:26", "2011-03-22 18:52:42", "2011-03-22 18:54:28", "2011-03-22 18:56:14", "2011-09-30 18:23:08" ], "rights": "NASA Images Terms & Conditions", "homepage": "http:///www.nasaimages.org", "num_recent_reviews": "10", "num_top_dl": "10", "spotlight_identifier": "APOLLO16MMLAUNCHVIEWS", "show_browse_by_date": "true", "show_hidden_subcollections": "true", "num_subcollections": "15", "related_collection": "nasa_techdocs" }, "reviews": [], "server": "ia902606.us.archive.org", "uniq": 2131998567, "updated": 1427273788, "workable_servers": [ "ia902606.us.archive.org", "ia802606.us.archive.org" ] } 
python-internetarchive-5.7.2/tests/data/nasa_meta.xml000066400000000000000000000172541513674652200230070ustar00rootroot00000000000000 nasa NASA Images collection movies <center> <img src="/download/nasa/globe_west_540.jpg" style="height:270px;width:270px;margin-bottom:25px"/> <form style="margin:0;padding:0;" action="/searchresults.php" id="searchform" method="post"> <b>Search NASA Images:</b> <input size="25" name="search" value="" style="font-size: 8pt"/> <input style="vertical-align:bottom; text-align:center; width:21px; height:21px; border:0px" name="gobutton" type="image" id="gobutton" value="Find" src="/images/go-button-gateway.gif"/> <input type="hidden" name="collection" value="nasa"/> <input type="hidden" name="limit" value="100"/> <input type="hidden" name="start" value="0"/> <input type="hidden" name="searchAll" value="yes"/> <input type="hidden" name="submit" value="this was submitted"/> <a href="/advancedsearch.php?q=collection:nasa" class="level3Header level3HeaderSearch">Advanced Search</a> </form> </center> true 2008-09-10 18:05:14 2008-09-10 18:03:38 traceyj@archive.org tracey pooh julielefevre jhornstein BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal BonnieReal jhornstein jhornstein SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell SJO'Connell jake@archive.org jake@archive.org jake@archive.org jake@archive.org jake@archive.org jake@archive.org jake@archive.org jake@archive.org jake@archive.org jake@archive.org jake@archive.org 2008-09-10 18:05:38 2008-09-10 18:39:01 2008-09-12 18:48:03 2009-03-04 19:21:48 2009-03-04 19:29:52 2009-03-04 19:30:49 2009-03-04 20:30:17 2009-03-04 22:47:33 2009-03-06 19:33:26 2009-06-16 22:21:35 2009-06-17 21:37:01 2009-06-17 21:48:00 2009-06-17 21:49:20 2009-06-17 21:50:23 2009-06-17 21:51:06 2009-06-17 21:51:31 2010-05-11 14:54:17 2010-05-11 14:55:56 2010-05-11 17:12:50 2010-05-11 17:15:01 2010-05-11 17:25:03 2010-05-27 18:28:23 2010-05-27 18:35:40 2010-05-27 18:42:25 2010-05-27 18:48:57 2010-05-27 18:57:17 2010-05-27 19:01:22 2010-05-27 19:09:38 2010-05-27 19:17:18 2010-05-27 19:20:46 2010-05-27 19:28:40 2010-05-27 19:35:59 2010-05-27 19:44:52 2010-05-27 19:50:23 2010-05-27 22:08:57 2010-05-27 22:50:20 2010-05-27 22:56:05 2010-05-27 23:12:25 2010-05-27 23:25:58 2010-05-27 23:30:48 2010-05-27 23:41:34 2010-05-27 23:49:48 2010-05-27 23:56:41 2010-05-28 00:03:41 2010-05-28 00:09:53 2010-05-28 00:19:21 2010-05-28 20:37:49 2010-05-28 20:45:07 2010-05-28 20:53:37 2010-05-28 21:07:25 2010-05-28 21:15:46 2010-05-28 21:26:32 2010-05-28 21:53:15 2010-06-03 18:01:53 2010-06-03 18:15:12 2010-06-07 17:13:22 2010-06-15 18:20:13 2010-06-15 18:52:46 2010-06-22 22:49:30 2010-06-25 17:31:06 2010-07-21 18:41:35 2010-11-17 20:10:55 2010-11-17 20:25:04 2010-11-17 20:45:36 2010-11-17 23:35:20 2010-12-07 01:26:09 2011-03-22 18:44:49 2011-03-22 18:46:23 2011-03-22 18:47:26 2011-03-22 18:52:42 2011-03-22 18:54:28 2011-03-22 18:56:14 <a href="http://web.archive.org/web/20121005192929/http://nasaimages.org/Terms.html" rel="nofollow">NASA 
Images Terms & Conditions</a> http:///www.nasaimages.org 10 10 APOLLO16MMLAUNCHVIEWS true true 15 nasa_techdocs 2011-09-30 18:23:08 Jake Johnson python-internetarchive-5.7.2/tests/data/s3_status_check.json000066400000000000000000000006151513674652200243060ustar00rootroot00000000000000{ "accesskey": "test_access", "bucket": "nasa", "detail": { "accesskey_ration": 74, "accesskey_tasks_queued": 0, "bucket_ration": 24, "bucket_tasks_queued": 0, "limit_reason": "", "rationing_engaged": 0, "rationing_level": 1399, "total_global_limit": 1799, "total_tasks_queued": 1017 }, "over_limit": 0 } python-internetarchive-5.7.2/tests/data/scrape_response.json000066400000000000000000000063761513674652200244260ustar00rootroot00000000000000{"items":[{"identifier":"00-042-154"},{"identifier":"00-042-32"},{"identifier":"00-042-43"},{"identifier":"00-042-56"},{"identifier":"00-042-71"},{"identifier":"00-042-94"},{"identifier":"00-050D-01"},{"identifier":"00-057D-01"},{"identifier":"00-062D-03"},{"identifier":"00-068D-01"},{"identifier":"00-068D-02"},{"identifier":"00-068D-03"},{"identifier":"00-068D-04"},{"identifier":"00-101-25"},{"identifier":"00-176-22A"},{"identifier":"00-42-140"},{"identifier":"00-83-9"},{"identifier":"01-012D-01"},{"identifier":"01-046-143"},{"identifier":"01-046-6"},{"identifier":"01-046-63"},{"identifier":"01-085-12"},{"identifier":"01-085-13"},{"identifier":"01-086-30"},{"identifier":"01A_trmm2004126"},{"identifier":"01B_amo2003131"},{"identifier":"01B_tmo2003134"},{"identifier":"02-13-12_NASA-FY-2013-Budget-Briefing"},{"identifier":"0204282315G8I01"},{"identifier":"031913EXP3637PreflightCrewNewsConference"},{"identifier":"031913EXP3637PreflightCrewNewsConference_201303"},{"identifier":"08S_tmo2002365"},{"identifier":"1293AAP"},{"identifier":"130429NybergPSA720p"},{"identifier":"13357AAP"},{"identifier":"14NewAstronautsPressConference"},{"identifier":"150227LeonardNiimoy720p"},{"identifier":"171269main_image_feature_776_ys"},{"identifier":"174156main_image_feature_800A_ys"},{"identifier":"175339main_image_feature_812_ys_full"},{"identifier":"175715main_8892072"},{"identifier":"177715main_image_feature_832_ys_full"},{"identifier":"178073main_image_feature_834_ys_full"},{"identifier":"180886main_image_feature_854_ys"},{"identifier":"187515main_image_feature_899_ys"},{"identifier":"1880-2004_temp_anomaly"},{"identifier":"189293main_drats07"},{"identifier":"1957-L-02098"},{"identifier":"1957-L-03413"},{"identifier":"1958-L-01082"},{"identifier":"1958-L-01150"},{"identifier":"1958-L-01840"},{"identifier":"1958-L-01843"},{"identifier":"1958-L-02861"},{"identifier":"1958-L-03440"},{"identifier":"1958-L-03445"},{"identifier":"1958-L-03446"},{"identifier":"1958-L-03469"},{"identifier":"1959-L-00057"},{"identifier":"1959-L-02333"},{"identifier":"1959-L-02334"},{"identifier":"1959-L-02571"},{"identifier":"1959-L-03980"},{"identifier":"1959-L-03983"},{"identifier":"1959-L-03984"},{"identifier":"1959-L-03985"},{"identifier":"1959-L-04323"},{"identifier":"1959-L-04712"},{"identifier":"1959-L-04882"},{"identifier":"1959-L-05705"},{"identifier":"1959-L-06102"},{"identifier":"1959-L-06848"},{"identifier":"1959-L-07307"},{"identifier":"1959-L-07321"},{"identifier":"1959-L-07567"},{"identifier":"1959-L-07574"},{"identifier":"1960-L-01459"},{"identifier":"1960-L-02449"},{"identifier":"1960-L-02517"},{"identifier":"1960-L-02681"},{"identifier":"1960-L-03017"},{"identifier":"1960-L-03412"},{"identifier":"1960-L-03416"},{"identifier":"1960-L-04166"},{"identifier":"1960-L-05546"},{"identifier":"1960-L-05800"},{
"identifier":"1960-L-05801"},{"identifier":"1960-L-06622"},{"identifier":"1961-L-00278"},{"identifier":"1961-L-00424"},{"identifier":"1961-L-01126"},{"identifier":"1961-L-01734"},{"identifier":"1961-L-02404"},{"identifier":"1961-L-02595"},{"identifier":"1961-L-02892"},{"identifier":"1961-L-03230"},{"identifier":"1961-L-04261"},{"identifier":"1961-L-04703"},{"identifier":"1961-L-05204"},{"identifier":"1961-L-05914"}],"count":100,"cursor":"W3siaWRlbnRpZmllciI6IjE5NjEtTC0wNTkxNCJ9XQ==","total":100} python-internetarchive-5.7.2/tests/ia.ini000066400000000000000000000000371513674652200205050ustar00rootroot00000000000000[s3] access = foo secret = bar python-internetarchive-5.7.2/tests/requirements.txt000066400000000000000000000000551513674652200226770ustar00rootroot00000000000000pytest==8.4.2 responses==0.23.1 ruff==0.14.1 python-internetarchive-5.7.2/tests/test_api.py000066400000000000000000000335271513674652200216070ustar00rootroot00000000000000import os import re import responses import urllib3 from internetarchive import ( download, get_files, get_item, get_session, modify_metadata, search_items, upload, ) from internetarchive.utils import InvalidIdentifierException, json from tests.conftest import ( NASA_METADATA_PATH, PROTOCOL, IaRequestsMock, load_file, load_test_data_file, ) TEST_SEARCH_RESPONSE = load_test_data_file('advanced_search_response.json') TEST_SCRAPE_RESPONSE = load_test_data_file('scrape_response.json') _j = json.loads(TEST_SCRAPE_RESPONSE) del _j['cursor'] _j['items'] = [{'identifier': 'nasa'}] _j['total'] = 1 TEST_SCRAPE_RESPONSE = json.dumps(_j) def test_get_session_with_config(): s = get_session(config={'s3': {'access': 'key'}, 'gengeral': {'secure': False}}) assert s.access_key == 'key' def test_get_session_with_config_file(tmpdir): tmpdir.chdir() test_conf = '[s3]\naccess = key2' with open('ia_test.ini', 'w') as fh: fh.write(test_conf) s = get_session(config_file='ia_test.ini') assert s.access_key == 'key2' def test_get_item(nasa_mocker): item = get_item('nasa') assert item.identifier == 'nasa' def test_get_item_with_config(nasa_mocker): item = get_item('nasa', config={'s3': {'access': 'key'}}) assert item.session.access_key == 'key' def test_get_item_with_config_file(tmpdir, nasa_mocker): tmpdir.chdir() test_conf = '[s3]\naccess = key2' with open('ia_test.ini', 'w') as fh: fh.write(test_conf) item = get_item('nasa', config_file='ia_test.ini') assert item.session.access_key == 'key2' def test_get_item_with_archive_session(nasa_mocker): s = get_session(config={'s3': {'access': 'key3'}}) item = get_item('nasa', archive_session=s) assert item.session.access_key == 'key3' def test_get_item_with_kwargs(): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add_metadata_mock('nasa') item = get_item('nasa', http_adapter_kwargs={'max_retries': 13}) assert isinstance(item.session.adapters[f'{PROTOCOL}//'].max_retries, urllib3.Retry) try: get_item('nasa', request_kwargs={'timeout': .0000000000001}) except Exception as exc: assert 'timed out' in str(exc) def test_get_files(): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add_metadata_mock('nasa') files = get_files('nasa') expected_files = { 'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_reviews.xml', 'nasa_meta.xml', 'nasa_archive.torrent', 'nasa_files.xml', } assert {f.name for f in files} == expected_files def test_get_files_with_get_item_kwargs(tmpdir): tmpdir.chdir() with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add_metadata_mock('nasa') s = 
get_session(config={'s3': {'access': 'key'}}) files = get_files('nasa', files='nasa_meta.xml', archive_session=s) files = list(files) assert len(files) == 1 assert files[0].name == 'nasa_meta.xml' files = get_files('nasa', files='nasa_meta.xml', config={'logging': {'level': 'INFO'}}) files = list(files) assert len(files) == 1 assert files[0].name == 'nasa_meta.xml' test_conf = '[s3]\naccess = key2' with open('ia_test.ini', 'w') as fh: fh.write(test_conf) files = get_files('nasa', files='nasa_meta.xml', config_file='ia_test.ini') files = list(files) assert len(files) == 1 assert files[0].name == 'nasa_meta.xml' files = get_files('nasa', files='nasa_meta.xml', http_adapter_kwargs={'max_retries': 3}) files = list(files) assert len(files) == 1 assert files[0].name == 'nasa_meta.xml' files = get_files('nasa', files='nasa_meta.xml', request_kwargs={'timeout': 4}) files = list(files) assert len(files) == 1 assert files[0].name == 'nasa_meta.xml' def test_get_files_non_existing(nasa_mocker): files = get_files('nasa', files='none') assert list(files) == [] def test_get_files_multiple(nasa_mocker): _files = ['nasa_meta.xml', 'nasa_files.xml'] files = get_files('nasa', files=_files) for f in files: assert f.name in _files def test_get_files_formats(): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add_metadata_mock('nasa') files = get_files('nasa', formats='JPEG') files = list(files) assert len(files) == 1 assert files[0].name == 'globe_west_540.jpg' files = get_files('nasa', formats=['JPEG', 'Collection Header']) expected_files = { 'globe_west_540.jpg', 'NASAarchiveLogo.jpg', } assert {f.name for f in files} == expected_files def test_get_files_glob_pattern(): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add_metadata_mock('nasa') files = get_files('nasa', glob_pattern='*torrent') files = list(files) assert len(files) == 1 assert files[0].name == 'nasa_archive.torrent' files = get_files('nasa', glob_pattern='*torrent|*jpg') expected_files = { 'globe_west_540.jpg', 'NASAarchiveLogo.jpg', 'nasa_archive.torrent', } assert {f.name for f in files} == expected_files def test_modify_metadata(): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/nasa', body='{"metadata":{"title":"foo"}}') rsps.add(responses.POST, f'{PROTOCOL}//archive.org/metadata/nasa', body=('{"success":true,"task_id":423444944,' '"log":"https://catalogd.archive.org/log/423444944"}')) r = modify_metadata('nasa', {'foo': 1}) assert r.status_code == 200 assert r.json() == { 'task_id': 423444944, 'success': True, 'log': 'https://catalogd.archive.org/log/423444944' } def test_upload(): expected_s3_headers = { 'content-length': '7557', 'x-archive-queue-derive': '1', 'x-archive-size-hint': '7557', 'x-archive-auto-make-bucket': '1', 'authorization': 'LOW test_access:test_secret', } with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.PUT, re.compile(r'.*s3.us.archive.org/.*'), adding_headers=expected_s3_headers) rsps.add_metadata_mock('nasa') rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/nasa', body='{}') _responses = upload('nasa', NASA_METADATA_PATH, access_key='test_access', secret_key='test_secret') for response in _responses: req = response.request headers = {k.lower(): str(v) for k, v in req.headers.items()} assert 'user-agent' in headers del headers['accept'] del headers['accept-encoding'] del headers['connection'] del headers['user-agent'] assert headers == 
expected_s3_headers assert req.url == f'{PROTOCOL}//s3.us.archive.org/nasa/nasa.json' def test_upload_validate_identifier(): try: upload('føø', NASA_METADATA_PATH, access_key='test_access', secret_key='test_secret', validate_identifier=True) raise AssertionError("Given invalid identifier was not correctly validated.") except Exception as exc: assert isinstance(exc, InvalidIdentifierException) expected_s3_headers = { 'content-length': '7557', 'x-archive-queue-derive': '1', 'x-archive-size-hint': '7557', 'x-archive-auto-make-bucket': '1', 'authorization': 'LOW test_access:test_secret', } with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.PUT, re.compile(r'.*s3.us.archive.org/.*'), adding_headers=expected_s3_headers) rsps.add_metadata_mock('nasa') rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/nasa', body='{}') upload('nasa', NASA_METADATA_PATH, access_key='test_access', secret_key='test_secret', validate_identifier=True) assert True def test_download(tmpdir): tmpdir.chdir() last_mod_header = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"} with IaRequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org/download/nasa/nasa_meta.xml', body='test content', adding_headers=last_mod_header) rsps.add_metadata_mock('nasa') download('nasa', 'nasa_meta.xml') p = os.path.join(str(tmpdir), 'nasa') assert len(os.listdir(p)) == 1 assert load_file('nasa/nasa_meta.xml') == 'test content' def test_search_items(session): url = f'{PROTOCOL}//archive.org/services/search/v1/scrape' p1 = { 'q': 'identifier:nasa', 'count': '10000', } p2 = p1.copy() p2['total_only'] = 'true' with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.POST, url, body=TEST_SCRAPE_RESPONSE, match=[responses.matchers.query_param_matcher(p1)]) rsps.add(responses.POST, url, body='{"items":[],"count":0,"total":1}', match=[responses.matchers.query_param_matcher(p2)], content_type='application/json; charset=UTF-8') r = search_items('identifier:nasa', archive_session=session) expected_results = [{'identifier': 'nasa'}] assert r.num_found == 1 assert iter(r).search == r assert len(iter(r)) == 1 assert len(r.iter_as_results()) == 1 assert list(r) == expected_results assert list(r.iter_as_results()) == expected_results assert r.fts == False def test_search_items_with_fields(session): _j = json.loads(TEST_SCRAPE_RESPONSE) _j['items'] = [ {'identifier': 'nasa', 'title': 'NASA Images'} ] search_response_str = json.dumps(_j) url = f'{PROTOCOL}//archive.org/services/search/v1/scrape' p1 = { 'q': 'identifier:nasa', 'count': '10000', 'fields': 'identifier,title', } p2 = { 'q': 'identifier:nasa', 'total_only': 'true', 'count': '10000', } with IaRequestsMock() as rsps: rsps.add(responses.POST, url, match=[responses.matchers.query_param_matcher(p1)], body=search_response_str) rsps.add(responses.POST, url, body='{"items":[],"count":0,"total":1}', match=[responses.matchers.query_param_matcher(p2)], content_type='application/json; charset=UTF-8') r = search_items('identifier:nasa', fields=['identifier', 'title'], archive_session=session) assert list(r) == [{'identifier': 'nasa', 'title': 'NASA Images'}] def test_search_items_as_items(session): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.POST, f'{PROTOCOL}//archive.org/services/search/v1/scrape', body=TEST_SCRAPE_RESPONSE) rsps.add_metadata_mock('nasa') r = search_items('identifier:nasa', archive_session=session) assert [x.identifier for x in r.iter_as_items()] == ['nasa'] assert 
r.iter_as_items().search == r def test_search_items_fts(session): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.POST, f'{PROTOCOL}//be-api.us.archive.org/ia-pub-fts-api', body=TEST_SCRAPE_RESPONSE) rsps.add_metadata_mock('nasa') r = search_items('nina simone', full_text_search=True, archive_session=session) print(r.search_url) assert r.fts == True assert r.dsl_fts == False assert r.query == '!L nina simone' assert r.params == {'count': 10000, 'q': '!L nina simone'} r = search_items('nina simone', full_text_search=True, dsl_fts=True, archive_session=session) assert r.fts == True assert r.dsl_fts == True assert r.query == 'nina simone' assert r.params == {'count': 10000, 'q': 'nina simone'} r = search_items('nina simone', dsl_fts=True, archive_session=session) assert r.fts == True assert r.dsl_fts == True assert r.query == 'nina simone' assert r.params == {'count': 10000, 'q': 'nina simone'} def test_page_row_specification(session): _j = json.loads(TEST_SEARCH_RESPONSE) _j['response']['items'] = [{'identifier': 'nasa'}] _j['response']['numFound'] = 1 _search_r = json.dumps(_j) with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org/advancedsearch.php', body=_search_r) rsps.add_metadata_mock('nasa') rsps.add(responses.POST, f'{PROTOCOL}//archive.org/services/search/v1/scrape', body='{"items":[],"count":0,"total":1}', content_type='application/json; charset=UTF-8') r = search_items('identifier:nasa', params={'page': '1', 'rows': '1'}, archive_session=session) assert r.iter_as_items().search == r assert len(r.iter_as_items()) == 1 python-internetarchive-5.7.2/tests/test_bad_data.py000066400000000000000000000006421513674652200225450ustar00rootroot00000000000000from internetarchive.api import get_item from tests.conftest import IaRequestsMock def test_bad_mediatype(): # this identifier actually has this malformed data ident = 'CIA-RDP96-00789R000700210007-5' body = '{"metadata": {"mediatype":["texts","texts"]}}' with IaRequestsMock() as rsps: rsps.add_metadata_mock(ident, body=body) # should complete without error get_item(ident) python-internetarchive-5.7.2/tests/test_config.py000066400000000000000000000350631513674652200223000ustar00rootroot00000000000000import contextlib import os import tempfile from unittest import mock import pytest import requests.adapters import responses import internetarchive.config import internetarchive.session from internetarchive.exceptions import AuthenticationError @responses.activate def test_get_auth_config(): test_body = """{ "success": true, "values": { "cookies": { "logged-in-sig": "foo-sig", "logged-in-user": "foo%40example.com" }, "email": "foo@example.com", "itemname": "@jakej", "s3": { "access": "Ac3ssK3y", "secret": "S3cretK3y" }, "screenname":"jakej" }, "version": 1}""" responses.add(responses.POST, 'https://archive.org/services/xauthn/', body=test_body) r = internetarchive.config.get_auth_config('test@example.com', 'password1') assert r['s3']['access'] == 'Ac3ssK3y' assert r['s3']['secret'] == 'S3cretK3y' assert r['cookies']['logged-in-user'] == 'foo%40example.com' assert r['cookies']['logged-in-sig'] == 'foo-sig' @responses.activate def test_get_auth_config_auth_fail(): # No logged-in-sig cookie set raises AuthenticationError. 
responses.add(responses.POST, 'https://archive.org/services/xauthn/', body='{"error": "failed"}') try: r = internetarchive.config.get_auth_config('test@example.com', 'password1') except AuthenticationError as exc: return assert str(exc) == ('Authentication failed. Please check your credentials ' 'and try again.') def test_get_config(): config = internetarchive.config.get_config() assert isinstance(config, dict) def test_get_config_with_config_file(tmpdir): test_conf = ('[s3]\n' 'access = test-access\n' 'secret = test-secret\n' '[cookies]\n' 'logged-in-sig = test-sig\n' 'logged-in-user = test@archive.org\n') tmpdir.chdir() with open('ia_test.ini', 'w') as fp: fp.write(test_conf) config = internetarchive.config.get_config(config_file='ia_test.ini', config={'custom': 'test'}) assert config['cookies']['logged-in-sig'] == 'test-sig' assert config['cookies']['logged-in-user'] == 'test@archive.org' assert config['s3']['access'] == 'test-access' assert config['s3']['secret'] == 'test-secret' assert config['custom'] == 'test' def test_get_config_no_config_file(): os.environ['HOME'] = '' config = internetarchive.config.get_config() assert config == {} def test_get_config_with_config(): test_conf = { 's3': { 'access': 'custom-access', 'secret': 'custom-secret', }, 'cookies': { 'logged-in-user': 'test@archive.org', 'logged-in-sig': 'test-sig', }, } os.environ['HOME'] = '' config = internetarchive.config.get_config(config=test_conf) assert config['cookies']['logged-in-sig'] == 'test-sig' assert config['cookies']['logged-in-user'] == 'test@archive.org' assert config['s3']['access'] == 'custom-access' assert config['s3']['secret'] == 'custom-secret' def test_get_config_home_not_set(): os.environ['HOME'] = '/none' config = internetarchive.config.get_config() assert isinstance(config, dict) def test_get_config_home_not_set_with_config(): test_conf = { 's3': { 'access': 'no-home-access', 'secret': 'no-home-secret', }, } os.environ['HOME'] = '/none' config = internetarchive.config.get_config(config=test_conf) assert isinstance(config, dict) assert config['s3']['access'] == 'no-home-access' assert config['s3']['secret'] == 'no-home-secret' def test_get_config_config_and_config_file(tmpdir): test_conf = ('[s3]\n' 'access = test-access\n' 'secret = test-secret\n' '[cookies]\n' 'logged-in-sig = test-sig\n' 'logged-in-user = test@archive.org\n') tmpdir.chdir() with open('ia_test.ini', 'w') as fp: fp.write(test_conf) test_conf = { 's3': { 'access': 'custom-access', 'secret': 'custom-secret', }, 'cookies': { 'logged-in-user': 'test@archive.org', 'logged-in-sig': 'test-sig', }, } del test_conf['s3']['access'] config = internetarchive.config.get_config(config_file='ia_test.ini', config=test_conf) assert config['cookies']['logged-in-sig'] == 'test-sig' assert config['cookies']['logged-in-user'] == 'test@archive.org' assert config['s3']['access'] == 'test-access' assert config['s3']['secret'] == 'custom-secret' @contextlib.contextmanager def _environ(**kwargs): old_values = {k: os.environ.get(k) for k in kwargs} try: for k, v in kwargs.items(): if v is not None: os.environ[k] = v else: del os.environ[k] yield finally: for k, v in old_values.items(): if v is not None: os.environ[k] = v else: del os.environ[k] def _test_parse_config_file( expected_result, config_file_contents='', config_file_paths=None, home=None, xdg_config_home=None, config_file_param=None): # expected_result: (config_file_path, is_xdg); config isn't compared. 
# config_file_contents: str # config_file_paths: list of filenames to write config_file_contents to # home: str, override HOME env var; default: path of the temporary dir # xdg_config_home: str, set XDG_CONFIG_HOME # config_file_param: str, filename to pass to parse_config_file # All paths starting with '$TMPTESTDIR/' get evaluated relative to the temp dir. if not config_file_paths: config_file_paths = [] with tempfile.TemporaryDirectory() as tmp_test_dir: def _replace_path(s): if s and s.startswith('$TMPTESTDIR/'): return os.path.join(tmp_test_dir, s.split('/', 1)[1]) return s expected_result = (_replace_path(expected_result[0]), expected_result[1]) config_file_paths = [_replace_path(x) for x in config_file_paths] home = _replace_path(home) xdg_config_home = _replace_path(xdg_config_home) config_file_param = _replace_path(config_file_param) for p in config_file_paths: os.makedirs(os.path.dirname(p), exist_ok=True) with open(p, 'w') as fp: fp.write(config_file_contents) if home is None: home = tmp_test_dir env = {'HOME': home} if xdg_config_home is not None: env['XDG_CONFIG_HOME'] = xdg_config_home with _environ(**env): config_file_path, is_xdg, _config = internetarchive.config.parse_config_file( config_file=config_file_param) assert (config_file_path, is_xdg) == expected_result[0:2] def test_parse_config_file_blank(): _test_parse_config_file( expected_result=('$TMPTESTDIR/.config/internetarchive/ia.ini', True) ) def test_parse_config_file_existing_config_ia(): _test_parse_config_file( expected_result=('$TMPTESTDIR/.config/ia.ini', False), config_file_paths=['$TMPTESTDIR/.config/ia.ini'], ) def test_parse_config_file_existing_dotia(): _test_parse_config_file( expected_result=('$TMPTESTDIR/.ia', False), config_file_paths=['$TMPTESTDIR/.ia'], ) def test_parse_config_file_existing_config_ia_and_dotia(): _test_parse_config_file( expected_result=('$TMPTESTDIR/.config/ia.ini', False), config_file_paths=['$TMPTESTDIR/.config/ia.ini', '$TMPTESTDIR/.ia'], ) def test_parse_config_file_existing_all(): _test_parse_config_file( expected_result=('$TMPTESTDIR/.config/internetarchive/ia.ini', True), config_file_paths=[ '$TMPTESTDIR/.config/internetarchive/ia.ini', '$TMPTESTDIR/.config/ia.ini', '$TMPTESTDIR/.ia' ], ) def test_parse_config_file_custom_xdg(): _test_parse_config_file( expected_result=('$TMPTESTDIR/.xdg/internetarchive/ia.ini', True), xdg_config_home='$TMPTESTDIR/.xdg', ) def test_parse_config_file_empty_xdg(): # Empty XDG_CONFIG_HOME should be treated as if not set, i.e. default _test_parse_config_file( expected_result=('$TMPTESTDIR/.config/internetarchive/ia.ini', True), xdg_config_home='', ) def test_parse_config_file_relative_xdg(): # Relative XDG_CONFIG_HOME is invalid and should be ignored, i.e. 
default ~/.config used instead _test_parse_config_file( expected_result=('$TMPTESTDIR/.config/internetarchive/ia.ini', True), xdg_config_home='relative/.config', ) def test_parse_config_file_direct_path_overrides_existing_files(): _test_parse_config_file( expected_result=('/path/to/ia.ini', False), config_file_paths=[ '$TMPTESTDIR/.config/internetarchive/ia.ini', '$TMPTESTDIR/.config/ia.ini', '$TMPTESTDIR/.ia' ], config_file_param='/path/to/ia.ini', ) def test_parse_config_file_with_environment_variable(): with _environ(IA_CONFIG_FILE='/inexistent.ia.ini'): _test_parse_config_file( expected_result=('/inexistent.ia.ini', False), ) def test_parse_config_file_with_environment_variable_and_parameter(): with _environ(IA_CONFIG_FILE='/inexistent.ia.ini'): _test_parse_config_file( expected_result=('/inexistent.other.ia.ini', False), config_file_param='/inexistent.other.ia.ini', ) def _test_write_config_file( expected_config_file, expected_modes, dirs=None, create_expected_file=False, config_file_param=None): # expected_config_file: str # expected_modes: list of (path, mode) tuples # dirs: list of str, directories to create before running write_config_file # create_expected_file: bool, create the expected_config_file if True # config_file_param: str, filename to pass to write_config_file # Both dirs and the config file are created with mode 777 (minus umask). # All paths are evaluated relative to a temporary HOME. # Mode comparison accounts for the umask; expected_modes does not need to care about it. with tempfile.TemporaryDirectory() as temp_home_dir: expected_config_file = os.path.join(temp_home_dir, expected_config_file) if dirs: dirs = [os.path.join(temp_home_dir, d) for d in dirs] expected_modes = [(os.path.join(temp_home_dir, p), m) for p, m in expected_modes] if config_file_param: config_file_param = os.path.join(temp_home_dir, config_file_param) with _environ(HOME=temp_home_dir): # Need to account for the umask in the expected_modes comparisons. # The umask can't just be retrieved, so set and then restore previous value. 
umask = os.umask(0) os.umask(umask) if dirs: for d in dirs: os.mkdir(d) if create_expected_file: with open(expected_config_file, 'w') as fp: os.chmod(expected_config_file, 0o777) config_file = internetarchive.config.write_config_file({}, config_file_param) assert config_file == expected_config_file assert os.path.isfile(config_file) for path, mode in expected_modes: actual_mode = os.stat(path).st_mode & 0o777 assert actual_mode == mode & ~umask def test_write_config_file_blank(): """Test that a blank HOME is populated with expected dirs and modes.""" _test_write_config_file( expected_config_file='.config/internetarchive/ia.ini', expected_modes=[ ('.config/internetarchive/ia.ini', 0o600), ('.config/internetarchive', 0o700), ('.config', 0o700), ], ) def test_write_config_file_config_existing(): """Test that .config's permissions remain but ia gets created correctly.""" _test_write_config_file( dirs=['.config'], expected_config_file='.config/internetarchive/ia.ini', expected_modes=[ ('.config/internetarchive/ia.ini', 0o600), ('.config/internetarchive', 0o700), ('.config', 0o777), ], ) def test_write_config_file_config_internetarchive_existing(): """Test that directory permissions are left as is""" _test_write_config_file( dirs=['.config', '.config/internetarchive'], expected_config_file='.config/internetarchive/ia.ini', expected_modes=[ ('.config/internetarchive/ia.ini', 0o600), ('.config/internetarchive', 0o777), ('.config', 0o777), ], ) def test_write_config_file_existing_file(): """Test that the permissions of the file are forced to 600""" _test_write_config_file( dirs=['.config', '.config/internetarchive'], expected_config_file='.config/internetarchive/ia.ini', create_expected_file=True, expected_modes=[ ('.config/internetarchive/ia.ini', 0o600), ('.config/internetarchive', 0o777), ('.config', 0o777), ], ) def test_write_config_file_existing_other_file(): """Test that the permissions of the file are forced to 600 even outside XDG""" _test_write_config_file( dirs=['foo'], expected_config_file='foo/ia.ini', create_expected_file=True, config_file_param='foo/ia.ini', expected_modes=[ ('foo/ia.ini', 0o600), ('foo', 0o777), ], ) def test_write_config_file_custom_path_existing(): """Test the creation of a config file at a custom location""" _test_write_config_file( dirs=['foo'], expected_config_file='foo/ia.ini', config_file_param='foo/ia.ini', expected_modes=[ ('foo/ia.ini', 0o600), ('foo', 0o777), ], ) def test_write_config_file_custom_path_not_existing(): """Ensure that an exception is thrown if the custom path dir doesn't exist""" with tempfile.TemporaryDirectory() as temp_home_dir: with _environ(HOME=temp_home_dir): config_file = os.path.join(temp_home_dir, 'foo/ia.ini') with pytest.raises(IOError): internetarchive.config.write_config_file({}, config_file) python-internetarchive-5.7.2/tests/test_exceptions.py000066400000000000000000000003631513674652200232070ustar00rootroot00000000000000import internetarchive.exceptions def test_AuthenticationError(): try: raise internetarchive.exceptions.AuthenticationError('Authentication Failed') except Exception as exc: assert str(exc) == 'Authentication Failed' python-internetarchive-5.7.2/tests/test_files.py000066400000000000000000000035551513674652200221360ustar00rootroot00000000000000import os import re from unittest.mock import patch import pytest import responses from internetarchive.exceptions import DirectoryTraversalError from internetarchive.utils import sanitize_filename from tests.conftest import PROTOCOL, IaRequestsMock DOWNLOAD_URL_RE = 
re.compile(f'{PROTOCOL}//archive.org/download/.*') EXPECTED_LAST_MOD_HEADER = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"} def test_file_download_sanitizes_filename(tmpdir, nasa_item): tmpdir.chdir() # Mock is_windows to return True to test Windows-style sanitization with patch('internetarchive.utils.is_windows', return_value=True): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content', adding_headers=EXPECTED_LAST_MOD_HEADER) # Test filename with Windows-invalid characters file_obj = nasa_item.get_file('nasa_meta.xml') problematic_name = 'file:withchars.xml' sanitized_name = sanitize_filename(problematic_name) expected_path = os.path.join(str(tmpdir), sanitized_name) file_obj.download(file_path=sanitized_name, destdir=str(tmpdir)) assert os.path.exists(expected_path) def test_file_download_prevents_directory_traversal(tmpdir, nasa_item): tmpdir.chdir() # Don't mock the request since it won't be made due to the security check with IaRequestsMock(assert_all_requests_are_fired=False): # Test directory traversal attempt by getting the file and calling download directly file_obj = nasa_item.get_file('nasa_meta.xml') malicious_path = os.path.join('..', 'nasa_meta.xml') with pytest.raises(DirectoryTraversalError, match=r"outside.*directory"): file_obj.download(file_path=malicious_path, destdir=str(tmpdir)) python-internetarchive-5.7.2/tests/test_iarequest.py000066400000000000000000000226151513674652200230340ustar00rootroot00000000000000import copy import json import pytest from internetarchive.iarequest import ( MetadataRequest, S3Request, prepare_files_patch, prepare_metadata, prepare_patch, prepare_target_patch, ) from tests.conftest import PROTOCOL, IaRequestsMock @pytest.fixture def sample_metadata(): return copy.deepcopy({ "metadata": {"title": "Test"}, "files": [ {"name": "test.txt", "custom": {"tags": ["old"]}, "foo": "bar"}, ], "dupe_pallet_index": { "IA9999": ["IA999901"] } }) @pytest.mark.parametrize(("metadata", "expected"), [ ({"custom": ["new"]}, [{'op': 'add', 'path': '/custom', 'value': ['new']}]), ({"title": "New Title"}, [{'op': 'replace', 'path': '/title', 'value': 'New Title'}]), ({"title": "REMOVE_TAG"}, [{'op': 'remove', 'path': '/title'}]), ]) def test_metadata_patch_operations(metadata, expected, sample_metadata): patch = prepare_patch( metadata=metadata, source_metadata=sample_metadata["metadata"], append=False, append_list=False, insert=False, ) assert patch == expected @pytest.mark.parametrize(("metadata", "expected"), [ ({"new-key": ["new", "new2"]}, [{'op': 'add', 'path': '/new-key', 'value': ['new', 'new2']}]), ({"custom": "foo new"}, [{'op': 'replace', 'path': '/custom', 'value': 'foo new'}]), ({"custom": "REMOVE_TAG"}, [{'op': 'remove', 'path': '/custom'}]), ]) def test_file_metadata_patch_operations(metadata, expected, sample_metadata): patch = prepare_files_patch( metadata=metadata, files_metadata=sample_metadata["files"], target="files/test.txt", append=False, append_list=False, insert=False, expect={} ) assert patch == expected @pytest.mark.parametrize(("metadata", "expected"), [ ( {"IA9999": ["UPDATED"], "NEW_ITEM": ["NEW123"]}, [ {'op': 'add', 'path': '/NEW_ITEM', 'value': ['NEW123']}, {'op': 'replace', 'path': '/IA9999/0', 'value': 'UPDATED'} ] ), ]) def test_target_patch_add_and_replace(metadata, expected, sample_metadata): patch = prepare_target_patch( metadata=metadata, source_metadata=sample_metadata, target="dupe_pallet_index", append=False, append_list=False, insert=False, 
expect={} ) assert patch == expected @pytest.mark.parametrize(("metadata", "expected"), [ ( {"IA9999": ["IA999901", "IA999902", "IA999903"]}, [{'op': 'add', 'path': '/IA9999/1', 'value': ['IA999901', 'IA999902', 'IA999903']}] ), ( {"IA9999": "IA999902"}, [{'op': 'add', 'path': '/IA9999/1', 'value': 'IA999902'}] ), ]) def test_target_patch_append_list(metadata, expected, sample_metadata): patch = prepare_target_patch( metadata=metadata, source_metadata=sample_metadata, target="dupe_pallet_index", append=False, append_list=True, insert=False, expect={} ) assert patch == expected def test_metadata_request_patch_key(sample_metadata): with IaRequestsMock() as rsps: rsps.add_metadata_mock('test_item', body=json.dumps(sample_metadata)) req = MetadataRequest( metadata={"title": "New Title"}, url=f"{PROTOCOL}//archive.org/metadata/test_item" ) prepared = req.prepare() assert any(k.endswith('-patch') for k in prepared.data) @pytest.mark.parametrize(("test_value", "expected"), [ ( "http://example.com/foo bar?q=✓", "uri(http%3A//example.com/foo%20bar%3Fq%3D%E2%9C%93)" ), ]) def test_metadata_header_uri_encoding(test_value, expected): req = S3Request( method='PUT', url=f"{PROTOCOL}//s3.us.archive.org/test_item", metadata={"source": test_value}, access_key='test_access', secret_key='test_secret' ) prepared = req.prepare() header = prepared.headers.get('x-archive-meta00-source', '') assert header == expected @pytest.mark.parametrize(("metadata", "source", "insert", "expected"), [ # Shift existing elements in insert mode ({"collection[0]": "noaa-hawaii"}, {"collection": ["internetarchivebooks", "noaa-hawaii", "noaa", "democracys-library"]}, True, {"collection": ["noaa-hawaii", "internetarchivebooks", "noaa", "democracys-library"]}), # Simple overwrite of an indexed key ({"subject[0]": "Math"}, {"subject": "Science"}, False, {"subject": ["Math"]}), # Indexed key overwriting existing list ({"subject[1]": "Science"}, {"subject": ["Math"]}, False, {"subject": ["Math", "Science"]}), # Insert mode: shifts existing elements ({"subject[1]": "History"}, {"subject": ["Math", "Science"]}, True, {"subject": ["Math", "History", "Science"]}), # REMOVE_TAG removes an element ({"subject[0]": "REMOVE_TAG"}, {"subject": ["Math", "Science"]}, False, {"subject": ["Science"]}), # Multiple indexed keys out of order ({"subject[2]": "Art", "subject[0]": "Math"}, {}, False, {"subject": ["Math", "Art"]}), # Insert at beginning of an existing list ({"subject[0]": "Physics"}, {"subject": ["Math", "Chemistry"]}, True, {"subject": ["Physics", "Math", "Chemistry"]}), # Insert at end of an existing list ({"subject[2]": "Biology"}, {"subject": ["Math", "Chemistry"]}, True, {"subject": ["Math", "Chemistry", "Biology"]}), # Overwrite numeric value ({"page[0]": 42}, {"page": [1]}, False, {"page": [42]}), # Mixed non-indexed and indexed keys ({"subject": "History", "topic[0]": "Algebra"}, {"subject": "Math", "topic": ["English", "Geometry"]}, False, {"subject": "History", "topic": ["Algebra", "Geometry"]}), # Remove multiple elements with REMOVE_TAG ({"subject[0]": "REMOVE_TAG", "subject[1]": "REMOVE_TAG"}, {"subject": ["Math", "Science", "History"]}, False, {"subject": ["History"]}), # Indexed key beyond current list length ({"subject[5]": "Philosophy"}, {"subject": ["Math"]}, False, {"subject": ["Math", "Philosophy"]}), # Insert mode with duplicate prevention ({"subject[1]": "Math"}, {"subject": ["Math", "Science"]}, True, {"subject": ["Science", "Math"]}), ]) def test_prepare_metadata_indexed_keys(metadata, source, insert, expected): 
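    # Note (editorial comment): the parametrized cases above exercise indexed keys
    # such as 'subject[0]', which address a single position within a list-valued
    # metadata field rather than replacing the whole field.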
result = prepare_metadata(metadata, source_metadata=source, insert=insert) # remove None placeholders for comparison for k, v in result.items(): if isinstance(v, list): result[k] = [i for i in v if i is not None] assert result == expected def test_prepare_metadata_insert_mode_and_duplicates(): source = {"tags": ["foo", "bar"]} metadata = {"tags[1]": "foo"} # duplicate value result = prepare_metadata(metadata, source_metadata=source, insert=True) # Duplicate should be removed and value inserted at index 1 assert result["tags"] == ["bar", "foo"] def test_prepare_metadata_with_preallocation_and_none_cleanup(): source = {"keywords": ["python"]} metadata = {"keywords[3]": "testing"} result = prepare_metadata(metadata, source_metadata=source) # Index 1 and 2 are None and should be removed assert result["keywords"] == ["python", "testing"] def test_prepare_metadata_numeric_conversion_and_append(): source = {"page": 5} metadata = {"page": 10} result = prepare_metadata(metadata, source_metadata=source, append=True) # Numeric values should be converted to strings and concatenated assert result["page"] == "5 10" def test_prepare_metadata_append_list(): source = {"tags": ["foo"]} metadata = {"tags": ["bar"]} result = prepare_metadata(metadata, source_metadata=source, append_list=True) assert result["tags"] == ["foo", ["bar"]] @pytest.mark.parametrize(("metadata", "source", "insert", "expected"), [ # Multiple REMOVE_TAGs interleaved with inserts ({"tags[0]": "REMOVE_TAG", "tags[2]": "new", "tags[1]": "REMOVE_TAG"}, {"tags": ["foo", "bar", "baz"]}, True, {"tags": ["new", "baz"]}), # Sparse indices beyond current list length, insert mode ({"keywords[5]": "python", "keywords[2]": "pytest"}, {"keywords": ["testing"]}, True, {"keywords": ["testing", "pytest", "python"]}), # Duplicate prevention with insert mode ({"categories[1]": "Tech"}, {"categories": ["Tech", "Science"]}, True, {"categories": ["Science", "Tech"]}), # Indexed key overwrite where source is a non-list ({"page[0]": 99}, {"page": 42}, False, {"page": [99]}), # Mixed string and list in source ({"authors[1]": "Alice"}, {"authors": "Bob"}, True, {"authors": ["Bob", "Alice"]}), # REMOVE_TAG at the end of list ({"items[2]": "REMOVE_TAG"}, {"items": ["A", "B", "C"]}, False, {"items": ["A", "B"]}), # Multiple sparse inserts with duplicates ({"tags[0]": "python", "tags[3]": "python"}, {"tags": ["python", "pytest"]}, True, {"tags": ["pytest", "python"]}), ]) def test_prepare_metadata_edge_cases(metadata, source, insert, expected): result = prepare_metadata(metadata, source_metadata=source, insert=insert) # remove None placeholders for comparison for k, v in result.items(): if isinstance(v, list): result[k] = [i for i in v if i is not None] assert result == expected python-internetarchive-5.7.2/tests/test_item.py000066400000000000000000000702321513674652200217660ustar00rootroot00000000000000import logging import os import re import sys import types from copy import deepcopy import pytest import responses from requests.exceptions import ConnectionError, HTTPError import internetarchive.files from internetarchive import get_session from internetarchive.api import get_item from internetarchive.exceptions import InvalidChecksumError from internetarchive.utils import InvalidIdentifierException, json, norm_filepath from tests.conftest import ( NASA_METADATA_PATH, PROTOCOL, IaRequestsMock, load_file, load_test_data_file, ) S3_URL = f'{PROTOCOL}//s3.us.archive.org/' DOWNLOAD_URL_RE = re.compile(f'{PROTOCOL}//archive.org/download/.*') S3_URL_RE = 
re.compile(r'.*s3.us.archive.org/.*') EXPECTED_LAST_MOD_HEADER = {"Last-Modified": "Tue, 14 Nov 2023 20:25:48 GMT"} EXPECTED_S3_HEADERS = { 'content-length': '7557', 'x-archive-queue-derive': '1', 'x-archive-size-hint': '7557', 'x-archive-auto-make-bucket': '1', 'authorization': 'LOW a:b', 'accept': '*/*', 'accept-encoding': 'gzip, deflate', 'connection': 'close', } # `compression.zstd` is added to the Standard Library in Python >= 3.14. if sys.version_info >= (3, 14): EXPECTED_S3_HEADERS['accept-encoding'] += ', zstd' def test_get_item(nasa_metadata, nasa_item, session): assert nasa_item.item_metadata == nasa_metadata assert nasa_item.identifier == 'nasa' assert nasa_item.exists is True assert isinstance(nasa_item.metadata, dict) assert isinstance(nasa_item.files, list) assert isinstance(nasa_item.reviews, list) assert nasa_item.created == 1427273784 assert nasa_item.d1 == 'ia902606.us.archive.org' assert nasa_item.d2 == 'ia802606.us.archive.org' assert nasa_item.dir == '/7/items/nasa' assert nasa_item.files_count == 6 assert nasa_item.item_size == 114030 assert nasa_item.server == 'ia902606.us.archive.org' assert nasa_item.uniq == 2131998567 assert nasa_item.updated == 1427273788 assert nasa_item.tasks is None assert len(nasa_item.collection) == 1 def test_get_file(nasa_item): file = nasa_item.get_file('nasa_meta.xml') assert type(file) is internetarchive.files.File assert file.name == 'nasa_meta.xml' def test_get_files(nasa_item): files = nasa_item.get_files() assert isinstance(files, types.GeneratorType) expected_files = {'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_reviews.xml', 'nasa_meta.xml', 'nasa_archive.torrent', 'nasa_files.xml'} files = {x.name for x in files} assert files == expected_files def test_get_files_by_name(nasa_item): files = nasa_item.get_files('globe_west_540.jpg') assert {f.name for f in files} == {'globe_west_540.jpg'} files = nasa_item.get_files(['globe_west_540.jpg', 'nasa_meta.xml']) assert {f.name for f in files} == {'globe_west_540.jpg', 'nasa_meta.xml'} def test_get_files_by_formats(nasa_item): files = {f.name for f in nasa_item.get_files(formats='Archive BitTorrent')} expected_files = {'nasa_archive.torrent'} assert files == expected_files files = {f.name for f in nasa_item.get_files(formats=['Archive BitTorrent', 'JPEG'])} expected_files = {'nasa_archive.torrent', 'globe_west_540.jpg'} assert files == expected_files def test_get_files_by_glob(nasa_item): files = {f.name for f in nasa_item.get_files(glob_pattern='*jpg|*torrent')} expected_files = {'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_archive.torrent'} assert files == expected_files files = {f.name for f in nasa_item.get_files(glob_pattern=['*jpg', '*torrent'])} expected_files = {'NASAarchiveLogo.jpg', 'globe_west_540.jpg', 'nasa_archive.torrent'} assert files == expected_files def test_get_files_by_glob_with_exclude(nasa_item): files = { f.name for f in nasa_item.get_files( glob_pattern="*jpg|*torrent", exclude_pattern="*540*|*Logo*" ) } expected_files = {"nasa_archive.torrent"} assert files == expected_files files = { f.name for f in nasa_item.get_files( glob_pattern=["*jpg", "*torrent"], exclude_pattern=["*540*", "*Logo*"] ) } expected_files = {"nasa_archive.torrent"} assert files == expected_files def test_get_files_with_multiple_filters(nasa_item): files = {f.name for f in nasa_item.get_files(formats='JPEG', glob_pattern='*xml')} expected_files = {'globe_west_540.jpg', 'nasa_reviews.xml', 'nasa_meta.xml', 'nasa_files.xml'} assert files == expected_files def 
test_get_files_no_matches(nasa_item): assert list(nasa_item.get_files(formats='none')) == [] def test_download(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock() as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') assert len(tmpdir.listdir()) == 1 os.remove('nasa/nasa_meta.xml') with IaRequestsMock() as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') with open('nasa/nasa_meta.xml') as fh: assert fh.read() == 'new test content' def test_download_io_error(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock() as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') rsps.reset() with pytest.raises(ConnectionError): nasa_item.download(files='nasa_meta.xml') def test_download_ignore_errors(tmpdir, nasa_item): with IaRequestsMock() as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml') nasa_item.download(files='nasa_meta.xml', ignore_errors=True) def test_download_ignore_existing(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock( assert_all_requests_are_fired=False) as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', ignore_existing=True) rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new test content', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', ignore_existing=True) with open('nasa/nasa_meta.xml') as fh: assert fh.read() == 'test content' def test_download_checksum(tmpdir, caplog): tmpdir.chdir() # test overwrite based on checksum. if os.path.exists('nasa/nasa_meta.xml'): os.remove('nasa/nasa_meta.xml') with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') rsps.add(responses.GET, DOWNLOAD_URL_RE, body='test content', adding_headers=EXPECTED_LAST_MOD_HEADER) rsps.add(responses.GET, DOWNLOAD_URL_RE, body='overwrite based on md5', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item = get_item('nasa') nasa_item.download(files='nasa_meta.xml') try: nasa_item.download(files='nasa_meta.xml', checksum=True) except InvalidChecksumError as exc: assert "corrupt, checksums do not match." in str(exc) # test no overwrite based on checksum. with caplog.at_level(logging.DEBUG): rsps.reset() rsps.add(responses.GET, DOWNLOAD_URL_RE, body=load_test_data_file('nasa_meta.xml'), adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True) nasa_item.download(files='nasa_meta.xml', checksum=True, verbose=True) assert 'skipping nasa' in caplog.text assert 'nasa_meta.xml, file already exists based on checksum.' 
in caplog.text def test_download_destdir(tmpdir, nasa_item): tmpdir.chdir() with IaRequestsMock() as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='new destdir', adding_headers=EXPECTED_LAST_MOD_HEADER) dest = os.path.join(str(tmpdir), 'new destdir') nasa_item.download(files='nasa_meta.xml', destdir=dest) assert 'nasa' in os.listdir(dest) with open(os.path.join(dest, 'nasa/nasa_meta.xml')) as fh: assert fh.read() == 'new destdir' def test_download_no_directory(tmpdir, nasa_item): url_re = re.compile(f'{PROTOCOL}//archive.org/download/.*') tmpdir.chdir() with IaRequestsMock() as rsps: rsps.add(responses.GET, url_re, body='no dest dir', adding_headers=EXPECTED_LAST_MOD_HEADER) nasa_item.download(files='nasa_meta.xml', no_directory=True) with open(os.path.join(str(tmpdir), 'nasa_meta.xml')) as fh: assert fh.read() == 'no dest dir' def test_download_dry_run(tmpdir, capsys, nasa_item): tmpdir.chdir() with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='no dest dir', adding_headers={'content-length': '100'}) nasa_item.download(formats='Metadata', dry_run=True) expected = {'nasa_reviews.xml', 'nasa_meta.xml', 'nasa_files.xml'} out, _err = capsys.readouterr() assert {x.split('/')[-1] for x in out.split('\n') if x} == expected def test_download_dry_run_on_the_fly_formats(tmpdir, capsys, nasa_item): tmpdir.chdir() with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.GET, DOWNLOAD_URL_RE, body='no dest dir', adding_headers={'content-length': '100'}) nasa_item.download(formats='MARCXML', on_the_fly=True, dry_run=True) expected = {'nasa_archive_marc.xml'} out, _err = capsys.readouterr() assert {x.split('/')[-1] for x in out.split('\n') if x} == expected def test_download_verbose(tmpdir, capsys, nasa_item): tmpdir.chdir() with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: headers = {'content-length': '11'} headers.update(EXPECTED_LAST_MOD_HEADER) rsps.add(responses.GET, DOWNLOAD_URL_RE, body='no dest dir', adding_headers=headers) nasa_item.download(files='nasa_meta.xml', verbose=True) _out, err = capsys.readouterr() assert 'downloading nasa_meta.xml' in err def test_download_dark_item(tmpdir, capsys, nasa_metadata, session): tmpdir.chdir() with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: nasa_metadata['metadata']['identifier'] = 'dark-item' nasa_metadata['is_dark'] = True _item_metadata = json.dumps(nasa_metadata) rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/dark-item', body=_item_metadata, content_type='application/json') _item = session.get_item('dark-item') rsps.add(responses.GET, DOWNLOAD_URL_RE, body='no dest dir', status=403, adding_headers={'content-length': '100'}) _item.download(files='nasa_meta.xml', verbose=True) _out, err = capsys.readouterr() assert 'skipping dark-item, item is dark' in err def test_upload(nasa_item): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.PUT, S3_URL_RE, adding_headers=EXPECTED_S3_HEADERS) _responses = nasa_item.upload(NASA_METADATA_PATH, access_key='a', secret_key='b') for resp in _responses: request = resp.request headers = {k.lower(): str(v) for k, v in request.headers.items()} assert 'user-agent' in headers del headers['user-agent'] assert headers == EXPECTED_S3_HEADERS assert request.url == f'{PROTOCOL}//s3.us.archive.org/nasa/nasa.json' def test_upload_validate_identifier(session): item = session.get_item('føø') with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: 
        rsps.add(responses.PUT, S3_URL_RE,
                 adding_headers=EXPECTED_S3_HEADERS)
        try:
            item.upload(NASA_METADATA_PATH, access_key='a', secret_key='b',
                        validate_identifier=True)
            raise AssertionError("Given invalid identifier was not correctly validated.")
        except Exception as exc:
            assert isinstance(exc, InvalidIdentifierException)

    valid_item = session.get_item('foo')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE,
                 adding_headers=EXPECTED_S3_HEADERS)
        valid_item.upload(NASA_METADATA_PATH, access_key='a', secret_key='b',
                          validate_identifier=True)
        assert True


def test_upload_secure_session():
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        c = {'s3': {'access': 'foo', 'secret': 'bar'}, 'general': {'secure': True}}
        s = get_session(config=c)
        rsps.add_metadata_mock('nasa')
        item = s.get_item('nasa')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE)
        r = item.upload(NASA_METADATA_PATH)
    assert r[0].url == 'https://s3.us.archive.org/nasa/nasa.json'


def test_upload_metadata(nasa_item):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        _expected_headers = deepcopy(EXPECTED_S3_HEADERS)
        _expected_headers['x-archive-meta00-foo'] = 'bar'
        _expected_headers['x-archive-meta00-subject'] = 'first'
        _expected_headers['x-archive-meta01-subject'] = 'second'
        _expected_headers['x-archive-meta00-baz'] = (
            'uri(%D0%9F%D0%BE%D1%87%D0%B5%D0%BC'
            '%D1%83%20%D0%B1%D1%8B%20%D0%B8%20%'
            'D0%BD%D0%B5%D1%82...)')
        _expected_headers['x-archive-meta00-baz2'] = (
            'uri(%D0%9F%D0%BE%D1%87%D0%B5%D0%BC'
            '%D1%83%20%D0%B1%D1%8B%20%D0%B8%20%'
            'D0%BD%D0%B5%D1%82...)')
        rsps.add(responses.PUT, S3_URL_RE,
                 adding_headers=_expected_headers)
        md = {
            'foo': 'bar',
            'subject': ['first', 'second'],
            'baz': 'Почему бы и нет...',
            'baz2': ('\u041f\u043e\u0447\u0435\u043c\u0443 \u0431\u044b \u0438 '
                     '\u043d\u0435\u0442...'),
        }
        _responses = nasa_item.upload(NASA_METADATA_PATH, metadata=md,
                                      access_key='a', secret_key='b')
        for resp in _responses:
            request = resp.request
            headers = {k.lower(): str(v) for k, v in request.headers.items()}
            assert 'user-agent' in headers
            del headers['user-agent']
            assert headers == _expected_headers


def test_upload_503(capsys, nasa_item):
    body = ("<?xml version='1.0' encoding='UTF-8'?>"
            '<Error><Code>SlowDown</Code>'
            '<Message>Please reduce your request rate.</Message>'
            '<Resource>simulated error caused by x-(amz|archive)-simulate-error'
            ', try x-archive-simulate-error:help</Resource>'
            '<RequestId>d36ec445-8d4a-4a64-'
            'a110-f67af6ee2c2a</RequestId></Error>')
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        _expected_headers = deepcopy(EXPECTED_S3_HEADERS)
        rsps.add(responses.GET, S3_URL_RE,
                 body='{"over_limit": "1"}',
                 adding_headers={'content-length': '19'})
        _expected_headers['content-length'] = '296'
        rsps.add(responses.PUT, S3_URL_RE,
                 body=body,
                 adding_headers=_expected_headers,
                 status=503)
        try:
            nasa_item.upload(NASA_METADATA_PATH, access_key='a', secret_key='b',
                             retries=1, retries_sleep=.1, verbose=True)
        except Exception as exc:
            assert 'Please reduce your request rate' in str(exc)
        _out, err = capsys.readouterr()
        assert 'warning: s3 is overloaded' in err


def test_upload_file_keys(nasa_item):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE, adding_headers=EXPECTED_S3_HEADERS)
        files = {'new_key.txt': NASA_METADATA_PATH, '222': NASA_METADATA_PATH}
        _responses = nasa_item.upload(files, access_key='a', secret_key='b')
        expected_urls = [
            f'{PROTOCOL}//s3.us.archive.org/nasa/new_key.txt',
            f'{PROTOCOL}//s3.us.archive.org/nasa/222',
        ]
        for resp in _responses:
            assert resp.request.url in expected_urls


def test_upload_dir(tmpdir, nasa_item):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add(responses.PUT, S3_URL_RE, adding_headers=EXPECTED_S3_HEADERS)

        tmpdir.mkdir('dir_test')
        with open(os.path.join(str(tmpdir), 'dir_test', 'foo.txt'), 'w') as fh:
            fh.write('hi')
        with open(os.path.join(str(tmpdir), 'dir_test', 'foo2.txt'), 'w') as fh:
            fh.write('hi 2')

        # Test no-slash upload, dir is not in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), 'dir_test') + '/',
                                      access_key='a', secret_key='b')
        expected_eps = [
            f'{S3_URL}nasa/foo.txt',
            f'{S3_URL}nasa/foo2.txt',
        ]
        for resp in _responses:
            assert resp.request.url in expected_eps

        # Test slash upload, dir is in key name.
        _responses = nasa_item.upload(os.path.join(str(tmpdir), 'dir_test'),
                                      access_key='a', secret_key='b')
        tmp_path = norm_filepath(str(tmpdir))
        expected_eps = [
            f'{S3_URL}nasa{tmp_path}/dir_test/foo.txt',
            f'{S3_URL}nasa{tmp_path}/dir_test/foo2.txt',
        ]
        for resp in _responses:
            assert resp.request.url in expected_eps


def test_upload_queue_derive(nasa_item):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        _expected_headers = deepcopy(EXPECTED_S3_HEADERS)
        _expected_headers['x-archive-queue-derive'] = '1'
        rsps.add(responses.PUT, S3_URL_RE, adding_headers=_expected_headers)
        _responses = nasa_item.upload(NASA_METADATA_PATH, access_key='a', secret_key='b')
        for resp in _responses:
            headers = {k.lower(): str(v) for k, v in resp.request.headers.items()}
            assert 'user-agent' in headers
            del headers['user-agent']
            assert headers == _expected_headers


def test_upload_delete(tmpdir, nasa_item):
    body = ("<?xml version='1.0' encoding='UTF-8'?>"
            '<Error><Code>BadDigest</Code>'
            '<Message>The Content-MD5 you specified did not '
            'match what we received.</Message>'
            '<Resource>content-md5 submitted with PUT: '
            'foo != received data md5: 70871f9fce8dd23853d6e42417356b05also not equal to '
            'base64 version: cIcfn86N0jhT1uQkFzVrBQ==</Resource>'
            '<RequestId>ec03fe7c-e123-'
            '4133-a207-3141d4d74096</RequestId></Error>')
    _expected_headers = deepcopy(EXPECTED_S3_HEADERS)
    _expected_headers['content-length'] = '383'
    tmpdir.chdir()
    test_file = os.path.join(str(tmpdir), 'test.txt')
    with open(test_file, 'w') as fh:
        fh.write('test delete')

    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        # Non-matching md5, should not delete.
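        # Note (editorial comment): the mocked 400 BadDigest response below should
        # surface as an HTTPError, and the local file must survive, since
        # delete=True only removes the source file after a verified upload.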
rsps.add(responses.PUT, S3_URL_RE, body=body, adding_headers=_expected_headers, status=400) with pytest.raises(HTTPError): nasa_item.upload(test_file, access_key='a', secret_key='b', delete=True, queue_derive=True) assert len(tmpdir.listdir()) == 1 _expected_headers = deepcopy(EXPECTED_S3_HEADERS) test_file = os.path.join(str(tmpdir), 'test.txt') with open(test_file, 'w') as fh: fh.write('test delete') # Matching md5, should delete. with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.PUT, S3_URL_RE, adding_headers=_expected_headers) resp = nasa_item.upload(test_file, access_key='a', secret_key='b', delete=True, queue_derive=True) for r in resp: headers = {k.lower(): str(v) for k, v in r.headers.items()} del headers['content-type'] assert headers == _expected_headers assert len(tmpdir.listdir()) == 0 def test_upload_checksum(tmpdir, nasa_item): with IaRequestsMock() as rsps: rsps.add_metadata_mock('nasa') nasa_item = get_item('nasa') _expected_headers = deepcopy(EXPECTED_S3_HEADERS) _expected_headers['content-md5'] = '6f1834f5c70c0eabf93dea675ccf90c4' test_file = os.path.join(str(tmpdir), 'checksum_test.txt') with open(test_file, 'wb') as fh: fh.write(b'test delete') # No skip. rsps.add(responses.PUT, S3_URL_RE, adding_headers=_expected_headers) resp = nasa_item.upload(test_file, access_key='a', secret_key='b', checksum=True) for r in resp: headers = {k.lower(): str(v) for k, v in r.headers.items()} del headers['content-type'] assert headers == _expected_headers assert r.status_code == 200 # Skip. nasa_item.item_metadata['files'].append( {'name': 'checksum_test.txt', 'md5': '33213e7683c1e6d15b2a658f3c567717'}) resp = nasa_item.upload(test_file, access_key='a', secret_key='b', checksum=True) for r in resp: headers = {k.lower(): str(v) for k, v in r.headers.items()} assert r.status_code is None def test_upload_automatic_size_hint(tmpdir, nasa_item): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: _expected_headers = deepcopy(EXPECTED_S3_HEADERS) del _expected_headers['x-archive-size-hint'] _expected_headers['x-archive-size-hint'] = '15' rsps.add(responses.PUT, S3_URL_RE, adding_headers=_expected_headers) files = [] with open(os.path.join(tmpdir, 'file'), 'w') as fh: fh.write('a') files.append(os.path.join(tmpdir, 'file')) os.mkdir(os.path.join(tmpdir, 'dir')) with open(os.path.join(tmpdir, 'dir', 'file0'), 'w') as fh: fh.write('bb') with open(os.path.join(tmpdir, 'dir', 'file1'), 'w') as fh: fh.write('cccc') files.append(os.path.join(tmpdir, 'dir')) with open(os.path.join(tmpdir, 'obj'), 'wb') as fh: fh.write(b'dddddddd') fh.seek(0, os.SEEK_SET) files.append(fh) _responses = nasa_item.upload(files, access_key='a', secret_key='b') for r in _responses: headers = {k.lower(): str(v) for k, v in r.headers.items()} del headers['content-type'] assert headers == _expected_headers def test_modify_metadata(nasa_item, nasa_metadata): with IaRequestsMock(assert_all_requests_are_fired=False) as rsps: rsps.add(responses.POST, f'{PROTOCOL}//archive.org/metadata/nasa') # Test simple add. md = {'foo': 'bar'} p = nasa_item.modify_metadata(md, debug=True) _patch = json.dumps([ {'add': '/foo', 'value': 'bar'}, ]) expected_data = { 'priority': -5, '-target': 'metadata', '-patch': _patch, } assert set(p.data.keys()) == set(expected_data.keys()) assert p.data['priority'] == expected_data['priority'] assert p.data['-target'] == expected_data['-target'] assert all(v in p.data['-patch'] for v in ['/foo', 'bar']) # Test no changes. 
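        # Submitting metadata identical to what is already on the item should
        # result in an empty patch ('[]'), as asserted below.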
md = {'title': 'NASA Images'} p = nasa_item.modify_metadata(md, debug=True) expected_data = {'priority': -5, '-target': 'metadata', '-patch': '[]'} assert p.data == expected_data md = {'title': 'REMOVE_TAG'} p = nasa_item.modify_metadata(md, debug=True) expected_data = { 'priority': -5, '-target': 'metadata', '-patch': json.dumps([{'remove': '/title'}]) } assert set(p.data.keys()) == set(expected_data.keys()) assert p.data['priority'] == expected_data['priority'] assert p.data['-target'] == expected_data['-target'] assert '/title' in str(p.data['-patch']) assert 'remove' in str(p.data['-patch']) # Test add array. md = {'subject': ['one', 'two', 'last']} p = nasa_item.modify_metadata(md, debug=True, priority=-1) expected_data = { 'priority': -1, '-target': 'metadata', '-patch': json.dumps([{'add': '/subject', 'value': ['one', 'two', 'last']}]) } assert set(p.data.keys()) == set(expected_data.keys()) assert p.data['priority'] == expected_data['priority'] assert p.data['-target'] == expected_data['-target'] assert '["one", "two", "last"]' in str(p.data['-patch']) \ or '["one","two","last"]' in str(p.data['-patch']) # Test indexed mod. nasa_item.item_metadata['metadata']['subject'] = ['first', 'middle', 'last'] md = {'subject[2]': 'new first'} p = nasa_item.modify_metadata(md, debug=True) expected_data = { 'priority': -5, '-target': 'metadata', '-patch': json.dumps([{'value': 'new first', 'replace': '/subject/2'}]) } # Avoid comparing the json strings, because they are not in a canonical form assert set(p.data.keys()) == set(expected_data.keys()) assert all(p.data[k] == expected_data[k] for k in ['priority', '-target']) assert '/subject/2' in p.data['-patch'] or r'\/subject\/2' in p.data['-patch'] # Test priority. md = {'title': 'NASA Images'} p = nasa_item.modify_metadata(md, priority=3, debug=True) expected_data = {'priority': 3, '-target': 'metadata', '-patch': '[]'} assert p.data == expected_data # Test auth. md = {'title': 'NASA Images'} p = nasa_item.modify_metadata(md, access_key='a', secret_key='b', debug=True) assert 'access=a' in p.body assert 'secret=b' in p.body # Test change. 
md = {'title': 'new title'} nasa_metadata['metadata']['title'] = 'new title' _item_metadata = json.dumps(nasa_metadata) rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/nasa', body=_item_metadata) nasa_item.modify_metadata(md, access_key='a', secret_key='b') # Test that item re-initializes assert nasa_item.metadata['title'] == 'new title' python-internetarchive-5.7.2/tests/test_session.py000066400000000000000000000157661513674652200225260ustar00rootroot00000000000000import os import responses import internetarchive.session from internetarchive import __version__ from tests.conftest import NASA_METADATA_PATH, PROTOCOL, IaRequestsMock CONFIG = { 's3': { 'access': 'test_access', 'secret': 'test_secret', }, 'cookies': { 'logged-in-user': 'test%40example.com; path=/; domain=.archive.org', 'logged-in-sig': 'testsig; path=/; domain=.archive.org', }, 'logging': { 'level': 'INFO', 'file': 'test.log', }, } def test_archive_session(tmpdir): tmpdir.chdir() # Use an empty config file to avoid merging with user's real config empty_config_file = str(tmpdir.join('empty_ia.ini')) with open(empty_config_file, 'w'): pass s = internetarchive.session.ArchiveSession(CONFIG, config_file=empty_config_file) assert os.path.isfile('test.log') assert CONFIG == s.config assert set(s.cookies.keys()) == set(CONFIG['cookies'].keys()) assert s.secure is True assert s.protocol == PROTOCOL assert s.access_key == 'test_access' assert s.secret_key == 'test_secret' assert s.headers['user-agent'].startswith(f'internetarchive/{__version__}') def test_get_item(tmpdir): tmpdir.chdir() with open(NASA_METADATA_PATH) as fh: item_metadata = fh.read().strip() with responses.RequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/nasa', body=item_metadata, content_type='application/json') s = internetarchive.session.ArchiveSession() item = s.get_item('nasa') assert item.exists is True assert item.identifier == 'nasa' with responses.RequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org/metadata/nasa', body=item_metadata, status=400, content_type='application/json') s = internetarchive.session.ArchiveSession(CONFIG) try: item = s.get_item('nasa') except Exception: with open('test.log') as fh: assert '400 Client Error' in fh.read() def test_s3_is_overloaded(): test_body = """{ "accesskey": "test_access", "bucket": "nasa", "detail": { "accesskey_ration": 74, "accesskey_tasks_queued": 0, "bucket_ration": 24, "bucket_tasks_queued": 0, "limit_reason": "", "rationing_engaged": 0, "rationing_level": 1399, "total_global_limit": 1799, "total_tasks_queued": 308 }, "over_limit": 0 }""" with IaRequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org', body=test_body, content_type='application/json') s = internetarchive.session.ArchiveSession(CONFIG) r = s.s3_is_overloaded('nasa') assert r is False test_body = """{ "accesskey": "test_access", "bucket": "nasa", "detail": { "accesskey_ration": 74, "accesskey_tasks_queued": 0, "bucket_ration": 24, "bucket_tasks_queued": 0, "limit_reason": "", "rationing_engaged": 0, "rationing_level": 1399, "total_global_limit": 1799, "total_tasks_queued": 308 }, "over_limit": 1 }""" with responses.RequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//s3.us.archive.org', body=test_body, content_type='application/json') s = internetarchive.session.ArchiveSession(CONFIG) r = s.s3_is_overloaded('nasa') assert r is True def test_user_agent_suffix(): """Test that a custom user agent suffix is appended to the default UA.""" custom_suffix = 
'MyCustomApp/1.0 (test bot)' config = { 's3': { 'access': 'test_access', 'secret': 'test_secret', }, 'general': { 'user_agent_suffix': custom_suffix, }, } s = internetarchive.session.ArchiveSession(config) # Verify the UA starts with the default and ends with the custom suffix assert s.headers['User-Agent'].startswith(f'internetarchive/{__version__}') assert s.headers['User-Agent'].endswith(custom_suffix) # Verify access key is present in the UA assert 'test_access' in s.headers['User-Agent'] def test_default_user_agent_when_not_specified(): """Test that default user agent is used when custom is not specified.""" config = { 's3': { 'access': 'test_access', 'secret': 'test_secret', }, } s = internetarchive.session.ArchiveSession(config) assert s.headers['user-agent'].startswith(f'internetarchive/{__version__}') def test_user_agent_suffix_in_requests(): """Test that the user agent suffix is appended and sent in requests.""" custom_suffix = 'TestAgent/2.0' config = { 's3': { 'access': 'test_access', 'secret': 'test_secret', }, 'general': { 'user_agent_suffix': custom_suffix, }, } with responses.RequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org') s = internetarchive.session.ArchiveSession(config) r = s.get(f'{PROTOCOL}//archive.org') # Verify the UA starts with the default and ends with the custom suffix assert r.request.headers['User-Agent'].startswith(f'internetarchive/{__version__}') assert r.request.headers['User-Agent'].endswith(custom_suffix) # Verify access key is present in the UA assert 'test_access' in r.request.headers['User-Agent'] def test_access_key_always_in_user_agent(): """Test that the access key is always present in the User-Agent.""" config = { 's3': { 'access': 'MY_ACCESS_KEY', 'secret': 'test_secret', }, } s = internetarchive.session.ArchiveSession(config) assert 'MY_ACCESS_KEY' in s.headers['User-Agent'] assert s.headers['User-Agent'].startswith(f'internetarchive/{__version__}') def test_cookies(): with responses.RequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//archive.org') s = internetarchive.session.ArchiveSession(CONFIG) r = s.get(f'{PROTOCOL}//archive.org') assert 'logged-in-sig' in r.request.headers['Cookie'] assert 'logged-in-user' in r.request.headers['Cookie'] for c in s.cookies: if c.name.startswith('logged-in-'): assert c.domain == '.archive.org' with responses.RequestsMock() as rsps: rsps.add(responses.GET, f'{PROTOCOL}//example.com') s = internetarchive.session.ArchiveSession(CONFIG) r = s.get(f'{PROTOCOL}//example.com') assert 'logged-in-sig' not in r.request.headers.get('Cookie', '') assert 'logged-in-user' not in r.request.headers.get('Cookie', '') assert '.archive.org' not in r.request.headers.get('Cookie', '') python-internetarchive-5.7.2/tests/test_utils.py000066400000000000000000000154141513674652200221710ustar00rootroot00000000000000import string import warnings from unittest.mock import patch import pytest import internetarchive.utils from tests.conftest import NASA_METADATA_PATH, IaRequestsMock def test_utils(): with open(__file__, encoding='utf-8') as fh: list(internetarchive.utils.chunk_generator(fh, 10)) ifp = internetarchive.utils.IterableToFileAdapter([1, 2], 200) assert len(ifp) == 200 ifp.read() def test_needs_quote(): notascii = ('ȧƈƈḗƞŧḗḓ ŧḗẋŧ ƒǿř ŧḗşŧīƞɠ, ℛℯα∂α♭ℓℯ ♭ʊ☂ η☺т Ѧ$☾ℐℐ, ' '¡ooʇ ןnɟǝsn sı uʍop-ǝpısdn') assert internetarchive.utils.needs_quote(notascii) assert internetarchive.utils.needs_quote(string.whitespace) assert not internetarchive.utils.needs_quote(string.ascii_letters + string.digits) 
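# Editorial note: needs_quote() flags values containing non-ASCII characters or
# whitespace; values like these are the ones that end up uri(...)-encoded in S3
# metadata headers (compare test_metadata_header_uri_encoding in test_iarequest.py).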
def test_validate_s3_identifier():
    id1 = 'valid-Id-123-_foo'
    id2 = '!invalid-Id-123-_foo'
    id3 = 'invalid-Id-123-_foo+bar'
    id4 = 'invalid-Id-123-_føø'
    id5 = 'i'

    valid = internetarchive.utils.validate_s3_identifier(id1)
    assert valid

    for invalid_id in [id2, id3, id4, id5]:
        try:
            internetarchive.utils.validate_s3_identifier(invalid_id)
        except Exception as exc:
            assert isinstance(exc, internetarchive.utils.InvalidIdentifierException)


def test_get_md5():
    with open(__file__, 'rb') as fp:
        md5 = internetarchive.utils.get_md5(fp)
    assert isinstance(md5, str)


def test_IdentifierListAsItems(session):
    with IaRequestsMock(assert_all_requests_are_fired=False) as rsps:
        rsps.add_metadata_mock('nasa')
        it = internetarchive.utils.IdentifierListAsItems('nasa', session)
        assert it[0].identifier == 'nasa'
        assert it.nasa.identifier == 'nasa'


def test_IdentifierListAsItems_len(session):
    assert len(internetarchive.utils.IdentifierListAsItems(['foo', 'bar'], session)) == 2


# TODO: Add test of slice access to IdentifierListAsItems


def test_get_s3_xml_text():
    xml_str = ('<Error><Code>NoSuchBucket</Code>'
               '<Message>The specified bucket does not exist.</Message>'
               '<Resource>'
               'does-not-exist-! not found by Metadata::get_obj()[server]'
               '</Resource>'
               '<RequestId>d56bdc63-169b-4b4f-8c47-0fac6de39040</RequestId></Error>')
    expected_txt = internetarchive.utils.get_s3_xml_text(xml_str)
    assert expected_txt == ('The specified bucket does not exist. - does-not-exist-! '
                            'not found by Metadata::get_obj()[server]')


def test_get_file_size():
    try:
        s = internetarchive.utils.get_file_size(NASA_METADATA_PATH)
    except AttributeError as exc:
        assert "object has no attribute 'seek'" in str(exc)
    with open(NASA_METADATA_PATH) as fp:
        s = internetarchive.utils.get_file_size(fp)
    assert s == 7557


def test_is_valid_metadata_key():
    # Keys starting with "xml" should also be invalid
    # due to the XML specification, but are supported
    # by the Internet Archive.
    valid = ('adaptive_ocr', 'bookreader-defaults', 'frames_per_second', 'identifier',
             'possible-copyright-status', 'index[0]')
    invalid = ('Analog Format', "Date of transfer (probably today's date)",
               '_metadata_key', '58', '_', '', 'a')
    for metadata_key in valid:
        assert internetarchive.utils.is_valid_metadata_key(metadata_key)
    for metadata_key in invalid:
        assert not internetarchive.utils.is_valid_metadata_key(metadata_key)


def test_is_windows():
    with patch('platform.system', return_value='Windows'), \
         patch('sys.platform', 'win32'):
        assert internetarchive.utils.is_windows() is True

    with patch('platform.system', return_value='Linux'), \
         patch('sys.platform', 'linux'):
        assert internetarchive.utils.is_windows() is False


def test_sanitize_filename_windows():
    test_cases = [
        ('file:name.txt', 'file%3Aname.txt'),
        ('file%name.txt', 'file%25name.txt'),
        ('con.txt', 'con.txt'),  # Reserved name, but no invalid chars so unchanged
        ('file .txt', 'file .txt'),  # Internal space preserved (not trailing)
        ('file ', 'file'),  # Trailing spaces removed
        ('file..', 'file'),  # Trailing dots removed
        ('file . 
', 'file'), # Trailing space and dot removed ] for input_name, expected in test_cases: result = internetarchive.utils.sanitize_filename_windows(input_name) assert result == expected def test_sanitize_filename_posix(): # Test without colon encoding result = internetarchive.utils.sanitize_filename_posix('file/name.txt', False) assert result == 'file%2Fname.txt' # Test with colon encoding result = internetarchive.utils.sanitize_filename_posix('file:name.txt', True) assert result == 'file%3Aname.txt' # Test mixed encoding result = internetarchive.utils.sanitize_filename_posix('file/:name.txt', True) assert result == 'file%2F%3Aname.txt' def test_unsanitize_filename(): test_cases = [ ('file%3Aname.txt', 'file:name.txt'), ('file%2Fname.txt', 'file/name.txt'), ('file%25name.txt', 'file%name.txt'), # Percent sign ('normal.txt', 'normal.txt'), # No encoding ] for input_name, expected in test_cases: with warnings.catch_warnings(record=True) as w: result = internetarchive.utils.unsanitize_filename(input_name) assert result == expected if '%' in input_name: assert len(w) == 1 assert issubclass(w[0].category, UserWarning) def test_sanitize_filename(): # Test Windows path with patch('internetarchive.utils.is_windows', return_value=True): with warnings.catch_warnings(record=True) as w: result = internetarchive.utils.sanitize_filename('file:name.txt') assert result == 'file%3Aname.txt' assert len(w) == 1 assert "sanitized" in str(w[0].message) # Test POSIX path with patch('internetarchive.utils.is_windows', return_value=False): result = internetarchive.utils.sanitize_filename('file/name.txt', False) assert result == 'file%2Fname.txt' def test_sanitize_filepath(): # Test with colon encoding result = internetarchive.utils.sanitize_filepath('/path/to/file:name.txt', True) assert result == '/path/to/file%3Aname.txt' # Test without colon encoding result = internetarchive.utils.sanitize_filepath('/path/to/file:name.txt', False) assert result == '/path/to/file:name.txt' # Colon not encoded on POSIX by default # Test Windows path (mocked) with patch('internetarchive.utils.is_windows', return_value=True): result = internetarchive.utils.sanitize_filepath('/path/to/con.txt') assert result == '/path/to/con.txt' # Reserved name sanitized python-internetarchive-5.7.2/tests/test_windows_filenames.py000066400000000000000000000125501513674652200245440ustar00rootroot00000000000000import os import sys import pytest from internetarchive import get_item from internetarchive.exceptions import DirectoryTraversalError from internetarchive.files import File from internetarchive.item import Item from internetarchive.utils import ( is_path_within_directory, sanitize_windows_filename, sanitize_windows_relpath, ) IS_WIN = os.name == 'nt' pytestmark = pytest.mark.skipif(not IS_WIN, reason='Windows specific tests') def test_control_char_encoding(): name = 'bad\x05name' sanitized, modified = sanitize_windows_filename(name) assert modified assert sanitized == 'bad%05name' @pytest.mark.parametrize(('reserved','expected'), [ ('AUX', 'AU%58'), ('CON', 'CO%4E'), ('COM1', 'COM%31'), ('LPT9', 'LPT%39'), ('NUL', 'NU%4C'), ]) def test_reserved_names(reserved, expected): sanitized, modified = sanitize_windows_filename(reserved) assert modified assert sanitized == expected @pytest.mark.parametrize(('filename','expected'), [ ('AUX.txt', 'AU%58.txt'), ('con.log', 'co%6E.log'), ('Com1.bin', 'Com%31.bin'), ('COM3.txt.txt', 'COM%33.txt.txt'), ]) def test_reserved_with_extension_sanitized(filename, expected): sanitized, modified = 
sanitize_windows_filename(filename) assert modified assert sanitized == expected @pytest.mark.parametrize(('filename','expected'), [ ('name.', 'name%2E'), ('name..', 'name%2E%2E'), ('trailspace ', 'trailspace%20'), ('both. ', 'both%2E%20'), ]) def test_trailing_dot_space(filename, expected): sanitized, modified = sanitize_windows_filename(filename) assert modified assert sanitized == expected @pytest.mark.parametrize(('ch','enc'), [ (':', '%3A'), ('*', '%2A'), ('<', '%3C'), ('>', '%3E'), ('|', '%7C'), ('?', '%3F'), ('\\', '%5C'), ('"', '%22') ]) def test_invalid_chars(ch, enc): sanitized, modified = sanitize_windows_filename(f'a{ch}b') assert modified assert sanitized == f'a{enc}b' @pytest.mark.parametrize('name', [ 'back\\slash', 'dir\\\\file' ]) def test_backslash_always_encoded(name): sanitized, _modified = sanitize_windows_filename(name) assert '%5C' in sanitized def test_full_filename_combined_sanitization(tmp_path, monkeypatch): """Simulate downloading a file whose remote name contains many invalid characters including a backslash. We only test the sanitization logic up to path formation (not actual network download).""" remote_name = 'hello < > : " \\ | ? *.txt' # Use direct sanitize to assert expected output sanitized, modified = sanitize_windows_filename(remote_name) assert modified # Ensure each invalid char encoded for ch in ['<','>','|','?','*',':','\\','"',' ']: assert ch not in sanitized or ch == ' ' # trailing/inner spaces become %20 assert '%5C' in sanitized # backslash def test_reserved_identifier_directory_sanitized(tmp_path): """Ensure that an item identifier that is a reserved device name is sanitized when constructing download paths.""" # This test focuses on sanitize_windows_filename, as item.Download path building now # sanitizes components. reserved = 'AUX' sanitized, modified = sanitize_windows_filename(reserved) assert modified assert (sanitized.startswith('AU') and sanitized.endswith(b'X'.hex().upper()[:])) \ or sanitized == 'AU%58' def test_directory_traversal_exception_handled(monkeypatch, tmp_path): # Use is_path_within_directory directly base = tmp_path outside = tmp_path.parent / 'outside.txt' outside.write_text('x') assert not is_path_within_directory(str(base), str(outside)) @pytest.mark.parametrize('attempt', [ '../evil.txt', '..\\evil.txt', '..%2Fevil.txt', '%2e%2e/evil.txt' ]) def test_traversal_attempt_sanitization(attempt): # sanitize_windows_relpath should NOT remove traversal but higher layer blocks it; # here we just ensure it encodes backslashes sanitized, _ = sanitize_windows_relpath(attempt, verbose=False) # Backslashes encoded if '\\' in attempt: assert '%5C' in sanitized or sanitized.replace('\\', '%5C') @pytest.mark.parametrize('name', [ 'hello%20world', '%41already' ]) def test_existing_percent_sequences(name): # If no other encoding needed, percent remains unless part of %HH sequence # and no other changes? 
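    # e.g. 'hello%20world' already looks like a valid %HH sequence, so it comes
    # back untouched when nothing else in the name triggers encoding.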
sanitized, _modified = sanitize_windows_filename(name) # existing sequences remain unchanged because no other encoding triggered assert sanitized == name @pytest.mark.parametrize('name', [ 'needs:encoding%20plus', 'AUX%41' # reserved triggers change ]) def test_percent_gets_encoded_when_other_modifications(name): sanitized, modified = sanitize_windows_filename(name) if '%' in name and modified: assert '%25' in sanitized or name.count('%') == sanitized.count('%25') # Directory traversal guard logic tests # (cross-platform semantics validated on Windows here) def test_is_path_within_directory_true(tmp_path): base = tmp_path target = base / 'subdir' / 'file.txt' target.parent.mkdir() target.write_text('x') assert is_path_within_directory(str(base), str(target)) def test_is_path_within_directory_false(tmp_path): base = tmp_path / 'a' other = tmp_path / 'b' / 'file.txt' base.mkdir() (tmp_path / 'b').mkdir() other.write_text('x') assert not is_path_within_directory(str(base), str(other)) python-internetarchive-5.7.2/tox.ini000066400000000000000000000003031513674652200175600ustar00rootroot00000000000000[tox] envlist = py{39,310,311,312,313,314},pypy{311} [testenv] deps = -r tests/requirements.txt # See setup.cfg for changes to default settings commands = ruff check pytest {posargs}