pax_global_header00006660000000000000000000000064151412110550014505gustar00rootroot0000000000000052 comment=1181d335955418f081a1d0b94c3d8350cea0751f s3fs-2026.2.0/000077500000000000000000000000001514121105500126145ustar00rootroot00000000000000s3fs-2026.2.0/.coveragerc000066400000000000000000000001771514121105500147420ustar00rootroot00000000000000[run] include = s3fs/* omit = s3fs/tests/test* [report] show_missing = True [html] directory = coverage_html_report s3fs-2026.2.0/.gitattributes000066400000000000000000000000361514121105500155060ustar00rootroot00000000000000s3fs/_version.py export-subst s3fs-2026.2.0/.github/000077500000000000000000000000001514121105500141545ustar00rootroot00000000000000s3fs-2026.2.0/.github/workflows/000077500000000000000000000000001514121105500162115ustar00rootroot00000000000000s3fs-2026.2.0/.github/workflows/ci.yml000066400000000000000000000025561514121105500173370ustar00rootroot00000000000000name: CI on: [push, pull_request] jobs: test: name: Python ${{ matrix.python-version }} - AioBotocore ${{ matrix.aiobotocore-version }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: - "3.10" - "3.11" - "3.12" - "3.13" - "3.14" aiobotocore-version: [">=2.19.0,<2.20.0", "<3.0.0", "<4.0.0"] env: BOTO_CONFIG: /dev/null AWS_ACCESS_KEY_ID: foobar_key AWS_SECRET_ACCESS_KEY: foobar_secret steps: - name: Checkout source uses: actions/checkout@v5 with: fetch-depth: 0 - name: Setup conda uses: conda-incubator/setup-miniconda@v3 with: environment-file: ci/env.yaml python-version: ${{ matrix.python-version }} - name: Install shell: bash -l {0} run: | pip install git+https://github.com/fsspec/filesystem_spec pip install --upgrade "aiobotocore${{ matrix.aiobotocore-version }}" pip install . 
--no-deps pip list - name: Run Tests shell: bash -l {0} run: pytest -vv -s s3fs pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v5 - uses: actions/setup-python@v6 with: python-version: "3.11" - uses: pre-commit/action@v3.0.1 s3fs-2026.2.0/.gitignore000066400000000000000000000001361514121105500146040ustar00rootroot00000000000000*.pyc .cache/ .pytest_cache/ .python-version .idea/ __pycache__ dist/ *.egg-info build/ venv/ s3fs-2026.2.0/.pre-commit-config.yaml000066400000000000000000000006601514121105500170770ustar00rootroot00000000000000repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/psf/black-pre-commit-mirror rev: 25.9.0 hooks: - id: black exclude: ^docs/ - repo: https://github.com/pycqa/flake8 rev: 7.3.0 hooks: - id: flake8 exclude: tests/|^docs/|__init__.py s3fs-2026.2.0/.readthedocs.yaml000066400000000000000000000003601514121105500160420ustar00rootroot00000000000000version: 2 build: os: ubuntu-22.04 tools: python: miniconda3-4.7 conda: environment: docs/environment.yml python: install: - method: pip path: . sphinx: configuration: docs/source/conf.py fail_on_warning: true s3fs-2026.2.0/CONTRIBUTING.md000066400000000000000000000002161514121105500150440ustar00rootroot00000000000000s3fs is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more. s3fs-2026.2.0/LICENSE.txt000066400000000000000000000027411514121105500144430ustar00rootroot00000000000000Copyright (c) 2016, Continuum Analytics, Inc. and contributors All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of Continuum Analytics nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. s3fs-2026.2.0/MANIFEST.in000066400000000000000000000003411514121105500143500ustar00rootroot00000000000000recursive-include s3fs *.py recursive-include docs *.rst include setup.py include README.rst include LICENSE.txt include MANIFEST.in include requirements.txt prune docs/_build include versioneer.py include s3fs/_version.py s3fs-2026.2.0/README.md000066400000000000000000000013071514121105500140740ustar00rootroot00000000000000s3fs ==== [|Build Status|](https://github.com/fsspec/s3fs/actions) [|Documentation|](https://s3fs.readthedocs.io/en/latest/?badge=latest) S3FS builds on [aiobotocore](https://aiobotocore.readthedocs.io/en/latest/) to provide a convenient Python filesystem interface for S3. Support ------- Work on this repository is supported in part by: "Anaconda, Inc. - Advancing AI through open source." 
anaconda logo s3fs-2026.2.0/ci/000077500000000000000000000000001514121105500132075ustar00rootroot00000000000000s3fs-2026.2.0/ci/env.yaml000066400000000000000000000003611514121105500146630ustar00rootroot00000000000000name: test_env channels: - conda-forge dependencies: - pytest - pytest-asyncio - pip - pytest - ujson - requests - decorator - pytest-timeout - flake8 - black - httpretty - aiobotocore - moto - flask - fsspec s3fs-2026.2.0/docs/000077500000000000000000000000001514121105500135445ustar00rootroot00000000000000s3fs-2026.2.0/docs/Makefile000066400000000000000000000167511514121105500152160ustar00rootroot00000000000000# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. 
PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" .PHONY: clean clean: rm -rf $(BUILDDIR)/* .PHONY: html html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." .PHONY: dirhtml dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/dirhtml." .PHONY: singlehtml singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." .PHONY: pickle pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." .PHONY: json json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." .PHONY: htmlhelp htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." .PHONY: qthelp qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/S3Fs.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/S3Fs.qhc" .PHONY: applehelp applehelp: $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp @echo @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." @echo "N.B. You won't be able to view it unless you put it in" \ "~/Library/Documentation/Help or install it in your application" \ "bundle." .PHONY: devhelp devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." @echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/S3Fs" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/S3Fs" @echo "# devhelp" .PHONY: epub epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." .PHONY: latex latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
@echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." .PHONY: latexpdf latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." .PHONY: latexpdfja latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." .PHONY: text text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." .PHONY: man man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." .PHONY: texinfo texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." .PHONY: info info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." .PHONY: gettext gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." .PHONY: changes changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." .PHONY: linkcheck linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." 
.PHONY: doctest doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." .PHONY: coverage coverage: $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." .PHONY: xml xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." .PHONY: pseudoxml pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." s3fs-2026.2.0/docs/environment.yml000066400000000000000000000002001514121105500166230ustar00rootroot00000000000000name: s3fs channels: - defaults dependencies: - python= 3.10 - botocore - docutils<0.17 - sphinx - sphinx_rtd_theme s3fs-2026.2.0/docs/make.bat000066400000000000000000000161261514121105500151570ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source set I18NSPHINXOPTS=%SPHINXOPTS% source if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. 
latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled echo. coverage to run coverage check of the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) REM Check if sphinx-build is available and fallback to Python version if any %SPHINXBUILD% 1>NUL 2>NUL if errorlevel 9009 goto sphinx_python goto sphinx_ok :sphinx_python set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) :sphinx_ok if "%1" == "html" ( %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 
goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\S3Fs.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\S3Fs.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "coverage" ( %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage if errorlevel 1 exit /b 1 echo. echo.Testing of coverage in the sources finished, look at the ^ results in %BUILDDIR%/coverage/python.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
goto end ) :end s3fs-2026.2.0/docs/source/000077500000000000000000000000001514121105500150445ustar00rootroot00000000000000s3fs-2026.2.0/docs/source/_static/000077500000000000000000000000001514121105500164725ustar00rootroot00000000000000s3fs-2026.2.0/docs/source/_static/custom.css000066400000000000000000000001241514121105500205130ustar00rootroot00000000000000.classifier:before { font-style: normal; margin: 0.5em; content: ":"; } s3fs-2026.2.0/docs/source/api.rst000066400000000000000000000016121514121105500163470ustar00rootroot00000000000000API === .. currentmodule:: s3fs.core .. autosummary:: S3FileSystem S3FileSystem.cat S3FileSystem.du S3FileSystem.exists S3FileSystem.find S3FileSystem.get S3FileSystem.glob S3FileSystem.info S3FileSystem.ls S3FileSystem.mkdir S3FileSystem.mv S3FileSystem.open S3FileSystem.put S3FileSystem.read_block S3FileSystem.rm S3FileSystem.tail S3FileSystem.touch .. autosummary:: S3File S3File.close S3File.flush S3File.info S3File.read S3File.seek S3File.tell S3File.write .. currentmodule:: s3fs.mapping .. autosummary:: S3Map .. currentmodule:: s3fs.core .. autoclass:: S3FileSystem :members: :inherited-members: .. autoclass:: S3File :members: :inherited-members: .. currentmodule:: s3fs.mapping .. autofunction:: S3Map .. currentmodule:: s3fs.utils .. autoclass:: ParamKwargsHelper .. 
autoclass:: SSEParams s3fs-2026.2.0/docs/source/changelog.rst000066400000000000000000000164701514121105500175350ustar00rootroot00000000000000Changelog ========= 2026.2.0 -------- - add custom error handling (#1003) - do delete placeholders with rm(recursive=True) (#1005) - force new session if it was explicitly closed (#1002) 2026.1.0 -------- - allow aiobotocore 3 (#998) 2025.12.0 --------- - remove optional dependencies (#995) - add support for py3.14 and remove 3.9 (#993) - add link docs->repo (#992) 2025.10.0 --------- - get bucket info on demand (#987) - add CoC (#986) - add goatcounter tracker (#985) 2025.9.0 -------- - update README for distribution compliance (#977) 2025.7.0 -------- - fix exclusive write for small files (#974) - acknowledge Anaconda support (#972) - fix test typo (#970) 2025.5.1 -------- no changes 2025.5.0 -------- - simpler requirements syntax (#958) - use head_bucket for info(bucket) (#961) 2025.3.2 -------- no changes 2025.3.1 -------- - get_event_loop -> get_running_loop at shutdown (#954) 2025.3.0 -------- - recreate sessino object on refresh (#939) - re-enable CI tests (#940) 2025.2.0 -------- - update docstrings to new default values (#934) - fix CI (#936) 2024.12.0 --------- - CI fixes (#922) - smaller threshold for copy_managed (#921) - exclusive write (#917) - fix bug in _find (#913) - parse query without upstream infer_storage_options (#912) - bug in _upload_file_part_concurrent (#910) 2024.10.0 --------- - invalidate cache in one-shot pipe file (#904) - make pipe() concurrent (#901) - add py3.13 (#898) - suppoert R2 multi-part uploads (#888) 2024.9.0 -------- no change 2024.6.1 -------- no changes 2024.6.0 -------- no changes 2024.5.0 -------- - widen fsspec req version (#869) - _bulk_delete must return list (#866) - retry on "reduce request rate" (#865) 2024.3.1 -------- - accept kwargs in get_file (#863) 2024.3.0 -------- - don't fail ls is parent is unaccessible (#860) - allow checksum error to retry (#858) - don't 
lsbuckets for isdir(bucket) (#856) - concurrent uplads of parts in put_file (#848) 2024.2.0 -------- - fix cache lookup in _info (#840) 2023.12.2 --------- no changes 2023.12.1 --------- - revert fallback to anon (#835) 2023.12.0 --------- - fall back to anon if no creds are found or passed at all (#823) - **relax version bounds for aiobotocore** (#829) - avoid key error if LastModified missing (#828) - add make_mucket_versioned method (#825) - retain TZ on modified time (#818) 2023.10.0 --------- - make protocol attribute a tuple (#812) - update to aiobotocore 2.7.0 (#809) - fix in _get_file following failure after connect (#805) - test for du of nonexistent (#803) 2023.9.2 -------- - allow size= in fs.open() (#797) - rmdir for non-bucket (#975) - moto updates (#973) - fix CI warnings (#792) - dircache usage with depth (#791) 2023.9.1 -------- - retry ClientPayloadError while reading after initial connection (#787) - don't pass ACL if not specified (#785) 2023.9.0 -------- - aiobotocore to 2.5.4 - better ** support in bulk ops/glob (#769) - default ACL to "private" rather than blank (#764) - invalidate cache in rm_file (#762) - closing client in running loop (#760) 2023.6.0 -------- - allow versions in info.exists (#746) - streaming file to update it's size for tell (#745, 741) 2023.5.0 -------- - Fix "_" in xattrs tests (#732) - Fix file pointer already at end of file when retrying put (#731) - Fix repeated find corrupting cache (#730) - Remove duplicate class definition (#727) - return list of deleted keys in bulk deleted (#726) 2023.4.0 -------- - Add streaming async read file (#722) - Doc fixes (#721) - aiobotocore to 2.5.0 (#710) 2023.3.0 -------- - Allow setting endpoint_url as top-level kwarg (#704) - minimum python version 3.8 (#702) - Update docs config (#697) - get/put/cp recursive extra tests (#691) 2023.1.0 -------- - parse lambda ARNs (#686) - recursive on chmod (#679) - default cache to be readahead (#678) - temporary redirects in headBucket (#676) - 
async iterator for listings (#670) 2022.11.0 --------- - optionally listing versions with ls (#661) 2022.10.0 --------- - directory cache race condition (#655) - version aware find (#654) 2022.8.1 -------- (no change) 2022.8.0 -------- - aiobotocore 2.4.0 (#643) - del/list multipart uploads (#645) - disallow prerelease aiohttp (#640) - docs syntax (#634) 2022.7.1 -------- No changes 2022.7.0 -------- - aiobotocore 2.3.4 (#633) 2022.5.0 -------- - aiobotocore 2.3 (#622, fixes #558) - rate limiting (#619, #620) 2022.3.0 -------- - pre-commit (#612) - aiobotocore 2.2 (#609) - empty ETag (#605) - HTTPClientError retry (#597) - new callbacks support (#590) 2022.02.0 --------- - callbacks fixes (#594, 590) - drop py36 (#582) - metadata fixes (#575, 579) 2022.01.0 --------- - aiobotocore dep to 2.1.0 (#564) - docs for non-aws (#567) - ContentType in info (#570) - small-file ACL (#574) 2021.11.1 --------- - deal with missing ETag (#557) - ClientPayloadError to retryable (#556) - pin aiobotocore (#555) 2021.11.0 --------- - move to fsspec org - doc tweaks (#546, 540) - redondant argument in _rm_versioned_bucket_contents (#439) - allow client_method in url/sign (POST, etc) (#536) - revert list_v2->head for info (#545) 2021.10.1 --------- - allow other methods than GET to url/sign (#536) 2021.10.0 --------- No changes (just released to keep pin with fsspec) 2021.09.0 --------- - check for bucket also with get_bucket_location (#533) - update versioneer (#531) 2021.08.1 --------- - retry on IncompleteRead (#525) - fix isdir for missing bucket (#522) - raise for glob("*") (#5167) 2021.08.0 --------- - fix for aiobotocore update (#510) 2021.07.0 --------- - make bucket in put(recursive) (#496) - non-truthy prefixes (#497) - implement rm_file (#499) 2021.06.1 --------- - bucket region caching (#495) 2021.06.0 --------- - support "prefix" in directory listings (#486) - support negative index in cat_file (#487, 488) - don't requite ETag in file details (#480) 2021.05.0 --------- - 
optimize ``info``,``exists`` (and related) calls for non-version aware mode - copy with entries without ETag (#480) - find not to corrupts parent listing (#476) - short listing to determine directory (#472, 471) Version 2021.04.0 ----------------- - switch to calver and fsspec pin - py36 (#462) - async fixes (#456, 452) Version 0.6.0 ------------- - update for fsspec 0.9.0 (#448) - better errors (#443) - cp to preserve ETAG (#441) - CI (#435, #427, #395) - 5GB PUT (#425) - partial cat (#389) - direct find (#360) Version 0.5.0 ------------- - Asynchronous filesystem based on ``aiobotocore`` Version 0.4.0 ------------- - New instances no longer need reconnect (:pr:`244`) by `Martin Durant`_ - Always use multipart uploads when not autocommitting (:pr:`243`) by `Marius van Niekerk`_ - Create ``CONTRIBUTING.md`` (:pr:`248`) by `Jacob Tomlinson`_ - Use autofunction for ``S3Map`` sphinx autosummary (:pr:`251`) by `James Bourbeau`_ - Miscellaneous doc updates (:pr:`252`) by `James Bourbeau`_ - Support for Python 3.8 (:pr:`264`) by `Tom Augspurger`_ - Improved performance for ``isdir`` (:pr:`259`) by `Nate Yoder`_ - Increased the minimum required version of fsspec to 0.6.0 .. _`Martin Durant`: https://github.com/martindurant .. _`Marius van Niekerk`: https://github.com/mariusvniekerk .. _`Jacob Tomlinson`: https://github.com/jacobtomlinson .. _`James Bourbeau`: https://github.com/jrbourbeau .. _`Tom Augspurger`: https://github.com/TomAugspurger .. _`Nate Yoder`: https://github.com/nateyoder s3fs-2026.2.0/docs/source/code-of-conduct.rst000066400000000000000000000126361514121105500205570ustar00rootroot00000000000000Code of Conduct =============== All participants in the fsspec community are expected to adhere to a Code of Conduct. 
As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. We are committed to making participation in this project a harassment-free experience for everyone, treating everyone as unique humans deserving of respect. Examples of unacceptable behaviour by participants include: - The use of sexualized language or imagery - Personal attacks - Trolling or insulting/derogatory comments - Public or private harassment - Publishing other's private information, such as physical or electronic addresses, without explicit permission - Other unethical or unprofessional conduct Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviours that they deem inappropriate, threatening, offensive, or harmful. By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. This code of conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. If you feel the code of conduct has been violated, please report the incident to the fsspec core team. Reporting --------- If you believe someone is violating theCode of Conduct we ask that you report it to the Project by emailing community@anaconda.com. All reports will be kept confidential. In some cases we may determine that a public statement will need to be made. 
If that's the case, the identities of all victims and reporters will remain confidential unless those individuals instruct us otherwise. If you believe anyone is in physical danger, please notify appropriate law enforcement first. In your report please include: - Your contact info - Names (real, nicknames, or pseudonyms) of any individuals involved. If there were other witnesses besides you, please try to include them as well. - When and where the incident occurred. Please be as specific as possible. - Your account of what occurred. If there is a publicly available record please include a link. - Any extra context you believe existed for the incident. - If you believe this incident is ongoing. - If you believe any member of the core team has a conflict of interest in adjudicating the incident. - What, if any, corrective response you believe would be appropriate. - Any other information you believe we should have. Core team members are obligated to maintain confidentiality with regard to the reporter and details of an incident. What happens next? ~~~~~~~~~~~~~~~~~~ You will receive an email acknowledging receipt of your complaint. The core team will immediately meet to review the incident and determine: - What happened. - Whether this event constitutes a code of conduct violation. - Who the bad actor was. - Whether this is an ongoing situation, or if there is a threat to anyone's physical safety. - If this is determined to be an ongoing incident or a threat to physical safety, the working groups' immediate priority will be to protect everyone involved. If a member of the core team is one of the named parties, they will not be included in any discussions, and will not be provided with any confidential details from the reporter. If anyone on the core team believes they have a conflict of interest in adjudicating on a reported issue, they will inform the other core team members, and exempt themselves from any discussion about the issue. 
Following this declaration, they will not be provided with any confidential details from the reporter. Once the working group has a complete account of the events they will make a decision as to how to response. Responses may include: - Nothing (if we determine no violation occurred). - A private reprimand from the working group to the individual(s) involved. - A public reprimand. - An imposed vacation - A permanent or temporary ban from some or all spaces (GitHub repositories, etc.) - A request for a public or private apology. We'll respond within one week to the person who filed the report with either a resolution or an explanation of why the situation is not yet resolved. Once we've determined our final action, we'll contact the original reporter to let them know what action (if any) we'll be taking. We'll take into account feedback from the reporter on the appropriateness of our response, but we don't guarantee we'll act on it. Acknowledgement --------------- This CoC is modified from the one by `BeeWare`_, which in turn refers to the `Contributor Covenant`_ and the `Django`_ project. .. _BeeWare: https://beeware.org/community/behavior/code-of-conduct/ .. _Contributor Covenant: https://www.contributor-covenant.org/version/1/3/0/code-of-conduct/ .. _Django: https://www.djangoproject.com/conduct/reporting/ .. raw:: html s3fs-2026.2.0/docs/source/conf.py000066400000000000000000000217261514121105500163530ustar00rootroot00000000000000#!/usr/bin/env python3 # # S3Fs documentation build configuration file, created by # sphinx-quickstart on Mon Mar 21 15:20:01 2016. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. 
import os

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.todo',
    'sphinx.ext.ifconfig',
    'sphinx.ext.viewcode',
    'sphinx.ext.autosummary',
    'sphinx.ext.extlinks',
    'sphinx.ext.napoleon',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The encoding of source files.
#source_encoding = 'utf-8-sig'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'S3Fs'
copyright = '2016, Continuum Analytics'
author = 'Continuum Analytics'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
# NOTE: the displayed version is read from the installed s3fs package, so the
# package must be importable (installed) when the docs are built.
import s3fs
version = s3fs.__version__
# The full version, including alpha/beta/rc tags.
release = version

# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []

# The reST default role (used for this markup: `text`) to use for all
# documents.
#default_role = None

# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True

# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True

# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []

# If true, keep warnings as "system message" paragraphs in the built documents.
#keep_warnings = False

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False

# Shorthand roles for external links, e.g. :pr:`123` -> the GitHub pull request.
extlinks = {
    "pr": ("https://github.com/fsspec/s3fs/pull/%s", "PR #%s"),
}

# -- Options for HTML output ----------------------------------------------

html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}

# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []

# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None

# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None

# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None

# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom CSS file to override read the docs default CSS.
# Contains workaround for issue #790.
html_css_files = ["custom.css"]

# Add any extra paths that contain custom files (such as robots.txt or
# .htaccess) here, relative to this directory. These files are copied
# directly to the root of the documentation.
#html_extra_path = []

# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'

# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True

# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}

# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}

# If false, no module index is generated.
#html_domain_indices = True

# If false, no index is generated.
#html_use_index = True

# If true, the index is split into individual pages for each letter.
#html_split_index = False

# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True

# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True

# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''

# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None

# Language to be used for generating the HTML full-text search index.
# Sphinx supports the following languages:
#   'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
#   'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr'
#html_search_language = 'en'

# A dictionary with options for the search language support, empty by default.
# Now only 'ja' uses this config value
#html_search_options = {'type': 'default'}

# The name of a javascript file (relative to the configuration directory) that
# implements a search results scorer. If empty, the default will be used.
#html_search_scorer = 'scorer.js'

# Output file base name for HTML help builder.
htmlhelp_basename = 'S3Fsdoc'

# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #'preamble': '',

    # Latex figure (float) alignment
    #'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'S3Fs.tex', 'S3Fs Documentation',
     'Continuum Analytics', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None

# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False

# If true, show page references after internal links.
#latex_show_pagerefs = False

# If true, show URL addresses after external links.
#latex_show_urls = False

# Documents to append as an appendix to all manuals.
#latex_appendices = []

# If false, no module index is generated.
#latex_domain_indices = True

# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [ (master_doc, 's3fs', 'S3Fs Documentation', [author], 1) ] # If true, show URL addresses after external links. #man_show_urls = False # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'S3Fs', 'S3Fs Documentation', author, 'S3Fs', 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] # If false, no module index is generated. #texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. #texinfo_no_detailmenu = False s3fs-2026.2.0/docs/source/development.rst000066400000000000000000000002271514121105500201210ustar00rootroot00000000000000Development =========== Create a development environment:: $ pip install -r requirements.txt -r test_requirements.txt Run tests:: $ pytest s3fs-2026.2.0/docs/source/index.rst000066400000000000000000000334221514121105500167110ustar00rootroot00000000000000S3Fs ==== S3Fs is a Pythonic file interface to S3. It builds on top of botocore_. The project is hosted on `GitHub `_ |github_stars| .. |github_stars| image:: https://img.shields.io/github/stars/fsspec/s3fs?style=social :target: https://github.com/fsspec/s3fs :alt: GitHub Repository The top-level class :py:class:`.S3FileSystem` holds connection information and allows typical file-system style operations like ``cp``, ``mv``, ``ls``, ``du``, ``glob``, etc., as well as put/get of local files to/from S3. The connection can be anonymous - in which case only publicly-available, read-only buckets are accessible - or via credentials explicitly supplied or in configuration files. 
Calling ``open()`` on a :py:class:`.S3FileSystem` (typically using a context manager) provides an :py:class:`.S3File` for read or write access to a particular key. The object emulates the standard ``File`` protocol (``read``, ``write``, ``tell``, ``seek``), such that functions expecting a file can access S3. Only binary read and write modes are implemented, with blocked caching. S3Fs uses and is based upon `fsspec`_. .. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ Examples -------- Simple locate and read a file: .. code-block:: python >>> import s3fs >>> s3 = s3fs.S3FileSystem(anon=True) >>> s3.ls('my-bucket') ['my-file.txt'] >>> with s3.open('my-bucket/my-file.txt', 'rb') as f: ... print(f.read()) b'Hello, world' (see also ``walk`` and ``glob``) Reading with delimited blocks: .. code-block:: python >>> s3.read_block(path, offset=1000, length=10, delimiter=b'\n') b'A whole line of text\n' Writing with blocked caching: .. code-block:: python >>> s3 = s3fs.S3FileSystem(anon=False) # uses default credentials >>> with s3.open('mybucket/new-file', 'wb') as f: ... f.write(2*2**20 * b'a') ... f.write(2*2**20 * b'a') # data is flushed and file closed >>> s3.du('mybucket/new-file') {'mybucket/new-file': 4194304} Because S3Fs faithfully copies the Python file interface it can be used smoothly with other projects that consume the file interface like ``gzip`` or ``pandas``. .. code-block:: python >>> with s3.open('mybucket/my-file.csv.gz', 'rb') as f: ... g = gzip.GzipFile(fileobj=f) # Decompress data with gzip ... df = pd.read_csv(g) # Read CSV file with Pandas Integration ----------- The libraries ``intake``, ``pandas`` and ``dask`` accept URLs with the prefix "s3://", and will use s3fs to complete the IO operation in question. The IO functions take an argument ``storage_options``, which will be passed to :py:class:`.S3FileSystem`, for example: .. 
code-block:: python df = pd.read_excel("s3://bucket/path/file.xls", storage_options={"anon": True}) This gives the chance to pass any credentials or other necessary arguments needed to s3fs. Async ----- ``s3fs`` is implemented using ``aiobotocore``, and offers async functionality. A number of methods of :py:class:`.S3FileSystem` are ``async``, for each of these, there is also a synchronous version with the same name and lack of a ``_`` prefix. If you wish to call ``s3fs`` from async code, then you should pass ``asynchronous=True, loop=`` to the constructor (the latter is optional, if you wish to use both async and sync methods). You must also explicitly await the client creation before making any S3 call. .. code-block:: python async def run_program(): s3 = S3FileSystem(..., asynchronous=True) session = await s3.set_session() ... # perform work await session.close() asyncio.run(run_program()) # or call from your async code Concurrent async operations are also used internally for bulk operations such as ``pipe/cat``, ``get/put``, ``cp/mv/rm``. The async calls are hidden behind a synchronisation layer, so are designed to be called from normal code. If you are *not* using async-style programming, you do not need to know about how this works, but you might find the implementation interesting. Multiprocessing --------------- When using Python's `multiprocessing`_, the start method must be set to either ``spawn`` or ``forkserver``. ``fork`` is not safe to use because of the open sockets and async thread used by s3fs, and may lead to hard-to-find bugs and occasional deadlocks. Read more about the available `start methods`_. .. _multiprocessing: https://docs.python.org/3/library/multiprocessing.html .. _start methods: https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods Limitations ----------- This project is meant for convenience, rather than feature completeness.
The following are known current omissions: - file access is always binary (although ``readline`` and iterating by line are possible) - no permissions/access-control (i.e., no ``chmod``/``chown`` methods) Logging ------- The logger named ``s3fs`` provides information about the operations of the file system. To quickly see all messages, you can set the environment variable ``S3FS_LOGGING_LEVEL=DEBUG``. The presence of this environment variable will install a handler for the logger that prints messages to stderr and set the log level to the given value. More advanced logging configuration is possible using Python's standard `logging framework`_. .. _logging framework: https://docs.python.org/3/library/logging.html Errors ------ The ``s3fs`` library includes a built-in mechanism to automatically retry operations when specific transient errors occur. You can customize this behavior by adding specific exception types or defining complex logic via custom handlers. Default Retryable Errors ~~~~~~~~~~~~~~~~~~~~~~~~ By default, ``s3fs`` will retry the following exception types: - ``socket.timeout`` - ``HTTPClientError`` - ``IncompleteRead`` - ``FSTimeoutError`` - ``ResponseParserError`` - ``aiohttp.ClientPayloadError`` (if available) Registering Custom Error Types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ To include additional exception types in the default retry logic, use the ``add_retryable_error`` function. This is useful for simple type-based retries. .. code-block:: python >>> class MyCustomError(Exception): pass >>> s3fs.add_retryable_error(MyCustomError) Implementing Custom Error Handlers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For more complex scenarios, such as retrying based on an error message rather than just the type, you can register a custom error handler using ``set_custom_error_handler``. The handler should be a callable that accepts an exception instance and returns ``True`` if the error should be retried, or ``False`` otherwise. ..
code-block:: python >>> def my_handler(e): return isinstance(e, MyCustomError) and "some condition" in str(e) >>> s3fs.set_custom_error_handler(my_handler) Handling AWS ClientErrors ~~~~~~~~~~~~~~~~~~~~~~~~~ ``s3fs`` provides specialized handling for ``botocore.exceptions.ClientError``. While ``s3fs`` checks these against internal patterns (like throttling), you can extend this behavior using a custom handler. Note that the internal patterns will still be checked and handled before the custom handler. .. code-block:: python >>> def another_handler(e): return isinstance(e, ClientError) and "Throttling" in str(e) >>> s3fs.set_custom_error_handler(another_handler) Credentials ----------- The AWS key and secret may be provided explicitly when creating an :py:class:`.S3FileSystem`. A more secure way, not including the credentials directly in code, is to allow boto to establish the credentials automatically. Boto will try the following methods, in order: - ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, and ``AWS_SESSION_TOKEN`` environment variables - configuration files such as ``~/.aws/credentials`` - for nodes on EC2, the IAM metadata provider You can specify a profile using ``s3fs.S3FileSystem(profile='PROFILE')``. Otherwise ``s3fs`` will use authentication via `boto environment variables`_. .. _boto environment variables: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables In a distributed environment, it is not expected that raw credentials should be passed between machines. In the explicitly provided credentials case, the method :py:meth:`.S3FileSystem.get_delegated_s3pars` can be used to obtain temporary credentials. When not using explicit credentials, it should be expected that every machine also has the appropriate environment variables, config files or IAM roles available.
If none of the credential methods are available, only anonymous access will work, and ``anon=True`` must be passed to the constructor. Furthermore, :py:meth:`.S3FileSystem.current` will return the most-recently created instance, so this method could be used in preference to the constructor in cases where the code must be agnostic of the credentials/config used. S3 Compatible Storage --------------------- To use ``s3fs`` against an S3 compatible storage, like `MinIO`_ or `Ceph Object Gateway`_, you'll probably need to pass extra parameters when creating the ``s3fs`` filesystem. Here are some sample configurations: For a self-hosted MinIO instance: .. code-block:: python # When relying on auto discovery for credentials >>> s3 = s3fs.S3FileSystem( anon=False, endpoint_url='https://...' ) # Or passing the credentials directly >>> s3 = s3fs.S3FileSystem( key='miniokey...', secret='asecretkey...', endpoint_url='https://...' ) It is also possible to set credentials through environment variables: .. code-block:: python # export FSSPEC_S3_ENDPOINT_URL=https://... # export FSSPEC_S3_KEY='miniokey...' # export FSSPEC_S3_SECRET='asecretkey...' >>> s3 = s3fs.S3FileSystem() # or ... >>> f = fsspec.open("s3://minio-bucket/...") For Storj DCS via the `S3-compatible Gateway `_: .. code-block:: python # When relying on auto discovery for credentials >>> s3 = s3fs.S3FileSystem( anon=False, endpoint_url='https://gateway.storjshare.io' ) # Or passing the credentials directly >>> s3 = s3fs.S3FileSystem( key='accesskey...', secret='asecretkey...', endpoint_url='https://gateway.storjshare.io' ) For a Scaleway s3-compatible storage in the ``fr-par`` zone: .. code-block:: python >>> s3 = s3fs.S3FileSystem( key='scaleway-api-key...', secret='scaleway-secretkey...', endpoint_url='https://s3.fr-par.scw.cloud', client_kwargs={ 'region_name': 'fr-par' } ) For an OVH s3-compatible storage in the ``GRA`` zone: .. 
code-block:: python >>> s3 = s3fs.S3FileSystem( key='ovh-s3-key...', secret='ovh-s3-secretkey...', endpoint_url='https://s3.GRA.cloud.ovh.net', client_kwargs={ 'region_name': 'GRA' }, config_kwargs={ 'signature_version': 's3v4' } ) .. _MinIO: https://min.io .. _Ceph Object Gateway: https://docs.ceph.com/docs/master/radosgw/ Requester Pays Buckets ---------------------- Some buckets, such as the `arXiv raw data `__, are configured so that the requester of the data pays any transfer fees. You must be authenticated to access these buckets and (because these charges may be unexpected) Amazon requires an additional key on many of the API calls. To enable ``RequesterPays`` create your file system as .. code-block:: python >>> s3 = s3fs.S3FileSystem(anon=False, requester_pays=True) Serverside Encryption --------------------- For some buckets/files you may want to use some of s3's server side encryption features. ``s3fs`` supports these in a few ways .. code-block:: python >>> s3 = s3fs.S3FileSystem( ... s3_additional_kwargs={'ServerSideEncryption': 'AES256'}) This will create an s3 filesystem instance that will append the ServerSideEncryption argument to all s3 calls (where applicable). The same applies for ``s3.open``. Most of the methods on the filesystem object will also accept and forward keyword arguments to the underlying calls. The most recently specified argument is applied last in the case where both ``s3_additional_kwargs`` and a method's ``**kwargs`` are used. The ``s3.utils.SSEParams`` provides some convenient helpers for the serverside encryption parameters in particular. An instance can be passed instead of a regular python dictionary as the ``s3_additional_kwargs`` parameter. Bucket Version Awareness ------------------------ If your bucket has object versioning enabled then you can add version-aware support to ``s3fs``. This ensures that if a file is opened at a particular point in time that version will be used for reading.
This mitigates the issue where more than one user is concurrently reading and writing to the same object. .. code-block:: python >>> s3 = s3fs.S3FileSystem(version_aware=True) # Open the file at the latest version >>> fo = s3.open('versioned_bucket/object') >>> versions = s3.object_version_info('versioned_bucket/object') # Open the file at a particular version >>> fo_old_version = s3.open('versioned_bucket/object', version_id='SOMEVERSIONID') In order for this to function the user must have the necessary IAM permissions to perform a GetObjectVersion Contents ======== .. toctree:: install development api changelog code-of-conduct :maxdepth: 2 .. _botocore: https://botocore.readthedocs.io/en/latest/ Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` These docs pages collect anonymous tracking data using goatcounter, and the dashboard is available to the public: https://s3fs.goatcounter.com/ . .. raw:: html s3fs-2026.2.0/docs/source/install.rst000066400000000000000000000010221514121105500172370ustar00rootroot00000000000000Installation ============ Conda ----- The ``s3fs`` library and its dependencies can be installed from the `conda-forge `_ repository using `conda `_:: $ conda install s3fs -c conda-forge PyPI ---- You can install ``s3fs`` with pip:: pip install s3fs Install from source ------------------- You can also download the ``s3fs`` library from Github and install normally:: git clone git@github.com:fsspec/s3fs cd s3fs python setup.py install s3fs-2026.2.0/pytest.ini000066400000000000000000000000321514121105500146400ustar00rootroot00000000000000[pytest] testpaths = s3fs s3fs-2026.2.0/release-procedure.md000066400000000000000000000010611514121105500165420ustar00rootroot000000000000001. Verify tests on Linux, OS-X, and Windows 2. Complete entries in `docs/source/changelog.rst`. There's no need for changing version numbers in source files. The release version will be determined from the git tag (see below). 3. 
Tag the commit git tag 1.2.3 -m "Version 1.2.3" 4. Push new version bump commit and tag to github git push fsspec main --tags 5. Build source and wheel packages rm -rf dist/ python setup.py sdist bdist_wheel --universal 6. Upload packages to PyPI twine upload dist/* s3fs-2026.2.0/requirements.txt000066400000000000000000000001101514121105500160700ustar00rootroot00000000000000aiobotocore>=2.19.0,<4.0.0 fsspec==2026.2.0 aiohttp!=4.0.0a0, !=4.0.0a1 s3fs-2026.2.0/s3fs/000077500000000000000000000000001514121105500134725ustar00rootroot00000000000000s3fs-2026.2.0/s3fs/__init__.py000066400000000000000000000003171514121105500156040ustar00rootroot00000000000000from .core import S3FileSystem, S3File, add_retryable_error, set_custom_error_handler from .mapping import S3Map from ._version import get_versions __version__ = get_versions()["version"] del get_versions s3fs-2026.2.0/s3fs/_version.py000066400000000000000000000601171514121105500156750ustar00rootroot00000000000000# This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. # Generated by versioneer-0.29 # https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" import errno import os import re import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Tuple import functools def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
git_refnames = " (HEAD -> main, tag: 2026.2.0)" git_full = "1181d335955418f081a1d0b94c3d8350cea0751f" git_date = "2026-02-05 16:57:01 -0500" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" VCS: str style: str tag_prefix: str parentdir_prefix: str versionfile_source: str verbose: bool def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "pep440" cfg.tag_prefix = "" cfg.parentdir_prefix = "None" cfg.versionfile_source = "s3fs/_version.py" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command( commands: List[str], args: List[str], cwd: Optional[str] = None, verbose: bool = False, hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs: Dict[str, Any] = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = subprocess.Popen( [command] + args, 
cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs, ) break except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, process.returncode return stdout, process.returncode def versions_from_parentdir( parentdir_prefix: str, root: str, verbose: bool, ) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return { "version": dirname[len(parentdir_prefix) :], "full-revisionid": None, "dirty": False, "error": None, "date": None, } rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print( "Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix) ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords( keywords: Dict[str, str], tag_prefix: str, verbose: bool, ) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. 
The old git %d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r"\d", r): continue if verbose: print("picking %s" % r) return { "version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date, } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return { "version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None, } @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. 
env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner( GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*", ], cwd=root, ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. 
Good or bad. branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( full_tag, tag_prefix, ) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . 
    Note that if you get a tagged build and then dirty it, you'll get
    TAG+0.gHEX.dirty

    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def render_pep440_branch(pieces: Dict[str, Any]) -> str:
    """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .

    The ".dev0" means not master branch. Note that .dev0 sorts backwards
    (a feature branch will appear "older" than the master branch).

    Exceptions:
    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0"
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
    """Split pep440 version string at the post-release segment.

    Returns the release segments before the post-release and the
    post-release version number (or None if no post-release segment is
    present; an empty ".post" counts as 0).
    """
    vc = str.split(ver, ".post")
    return vc[0], int(vc[1] or 0) if len(vc) == 2 else None


def render_pep440_pre(pieces: Dict[str, Any]) -> str:
    """TAG[.postN.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post0.devDISTANCE
    """
    if pieces["closest-tag"]:
        if pieces["distance"]:
            # update the post release segment
            tag_version, post_version = pep440_split_post(pieces["closest-tag"])
            rendered = tag_version
            if post_version is not None:
                rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
            else:
                rendered += ".post0.dev%d" % (pieces["distance"])
        else:
            # no commits, use the tag as the version
            rendered = pieces["closest-tag"]
    else:
        # exception #1
        rendered = "0.post0.dev%d" % pieces["distance"]
    return rendered


def render_pep440_post(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "g%s" % pieces["short"]
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
        rendered += "+g%s" % pieces["short"]
    return rendered


def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .

    The ".dev0" means not master branch.

    Exceptions:
    1: no tags.
    0.postDISTANCE[.dev0]+gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "g%s" % pieces["short"]
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+g%s" % pieces["short"]
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def render_pep440_old(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
    return rendered


def render_git_describe(pieces: Dict[str, Any]) -> str:
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"]:
            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered


def render_git_describe_long(pieces: Dict[str, Any]) -> str:
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always -long'.
    The distance/hash is unconditional.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered


def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
    """Render the given version pieces into the requested style."""
    if pieces["error"]:
        # Errors propagate as "unknown" version with the error message kept.
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": pieces["error"],
            "date": None,
        }

    if not style or style == "default":
        style = "pep440"  # the default

    if style == "pep440":
        rendered = render_pep440(pieces)
    elif style == "pep440-branch":
        rendered = render_pep440_branch(pieces)
    elif style == "pep440-pre":
        rendered = render_pep440_pre(pieces)
    elif style == "pep440-post":
        rendered = render_pep440_post(pieces)
    elif style == "pep440-post-branch":
        rendered = render_pep440_post_branch(pieces)
    elif style == "pep440-old":
        rendered = render_pep440_old(pieces)
    elif style == "git-describe":
        rendered = render_git_describe(pieces)
    elif style == "git-describe-long":
        rendered = render_git_describe_long(pieces)
    else:
        raise ValueError("unknown style '%s'" % style)

    return {
        "version": rendered,
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }


def get_versions() -> Dict[str, Any]:
    """Get version information or return default if unable to do so."""
    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
    # __file__, we can work backwards from there to the root. Some
    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
    # case we can only use expanded keywords.
cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. for _ in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: return { "version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None, } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return { "version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None, } s3fs-2026.2.0/s3fs/core.py000066400000000000000000002727471514121105500150170ustar00rootroot00000000000000import asyncio import errno import io import logging import math import mimetypes import os import socket import weakref import re from urllib3.exceptions import IncompleteRead import fsspec # noqa: F401 from fsspec.spec import AbstractBufferedFile from fsspec.utils import tokenize, setup_logging as setup_logger from fsspec.asyn import ( AsyncFileSystem, AbstractAsyncStreamedFile, sync, sync_wrapper, FSTimeoutError, _run_coros_in_chunks, ) from fsspec.callbacks import _DEFAULT_CALLBACK import aiobotocore import botocore import aiobotocore.session from aiobotocore.config import AioConfig from botocore.exceptions import ClientError, HTTPClientError, ParamValidationError from botocore.parsers import ResponseParserError from s3fs.errors import translate_boto_error from s3fs.utils import S3BucketRegionCache, ParamKwargsHelper, _get_brange, FileExpired # ClientPayloadError can be thrown during an incomplete read. 
# aiohttp is a dependency of
# aiobotocore, we guard the import here in case this dependency is replaced in a future version
# of aiobotocore.
try:
    from aiohttp import ClientPayloadError
except ImportError:
    ClientPayloadError = None

logger = logging.getLogger("s3fs")


def setup_logging(level=None):
    # Level comes from the argument, falling back to the S3FS_LOGGING_LEVEL
    # environment variable.
    setup_logger(logger=logger, level=(level or os.environ["S3FS_LOGGING_LEVEL"]))


if "S3FS_LOGGING_LEVEL" in os.environ:
    setup_logging()

# NOTE(review): presumably the size threshold above which copy operations
# switch to multipart ("managed") copy — confirm against the copy logic later
# in this module.
MANAGED_COPY_THRESHOLD = 150 * 2**20

# Certain rate-limiting responses can send invalid XML
# (see https://github.com/fsspec/s3fs/issues/484), which can result in a parser error
# deep within botocore. So we treat those as retryable as well, even though there could
# be some false positives.
S3_RETRYABLE_ERRORS = (
    socket.timeout,
    HTTPClientError,
    IncompleteRead,
    FSTimeoutError,
    ResponseParserError,
)

MAX_UPLOAD_PARTS = 10_000  # maximum number of parts for S3 multipart upload

if ClientPayloadError is not None:
    S3_RETRYABLE_ERRORS += (ClientPayloadError,)


def add_retryable_error(exc):
    """
    Add an exception type to the list of retryable S3 errors.

    Parameters
    ----------
    exc : Exception
        The exception type to add to the retryable errors.

    Examples
    ----------
    >>> class MyCustomError(Exception):  # doctest: +SKIP
    ...     pass  # doctest: +SKIP
    >>> add_retryable_error(MyCustomError)  # doctest: +SKIP
    """
    global S3_RETRYABLE_ERRORS
    S3_RETRYABLE_ERRORS += (exc,)


# Default custom handler: never retry.  Replace via set_custom_error_handler().
CUSTOM_ERROR_HANDLER = lambda _: False


def set_custom_error_handler(func):
    """Set a custom error handler function for S3 retryable errors.

    The function should take an exception instance as its only argument,
    and return True if the operation should be retried, or False otherwise.
    This can also be used for custom behavior on `ClientError` exceptions,
    such as retrying other patterns.

    Parameters
    ----------
    func : callable[[Exception], bool]
        The custom error handler function.

    Examples
    ----------
    >>> def my_handler(e):  # doctest: +SKIP
    ...     return isinstance(e, MyCustomError) and "some condition" in str(e)  # doctest: +SKIP
    >>> set_custom_error_handler(my_handler)  # doctest: +SKIP
    >>> def another_handler(e):  # doctest: +SKIP
    ...     return isinstance(e, ClientError) and "Throttling" in str(e)  # doctest: +SKIP
    >>> set_custom_error_handler(another_handler)  # doctest: +SKIP
    """
    global CUSTOM_ERROR_HANDLER
    CUSTOM_ERROR_HANDLER = func


_VALID_FILE_MODES = {"r", "w", "a", "rb", "wb", "ab"}

# Object attributes carried over when S3 methods accept them (e.g. on copy).
_PRESERVE_KWARGS = [
    "CacheControl",
    "ContentDisposition",
    "ContentEncoding",
    "ContentLanguage",
    "ContentLength",
    "ContentType",
    "Expires",
    "WebsiteRedirectLocation",
    "ServerSideEncryption",
    "SSECustomerAlgorithm",
    "SSEKMSKeyId",
    "BucketKeyEnabled",
    "StorageClass",
    "ObjectLockMode",
    "ObjectLockRetainUntilDate",
    "ObjectLockLegalHoldStatus",
    "Metadata",
]

# Canned ACL names accepted for objects (keys) and for buckets, respectively.
key_acls = {
    "private",
    "public-read",
    "public-read-write",
    "authenticated-read",
    "aws-exec-read",
    "bucket-owner-read",
    "bucket-owner-full-control",
}
buck_acls = {"private", "public-read", "public-read-write", "authenticated-read"}


async def _error_wrapper(func, *, args=(), kwargs=None, retries):
    # Await ``func(*args, **kwargs)``, retrying with exponential backoff on
    # known-retryable errors (S3_RETRYABLE_ERRORS, throttling ClientErrors,
    # and anything the CUSTOM_ERROR_HANDLER approves).
    if kwargs is None:
        kwargs = {}
    err = None
    for i in range(retries):
        # exponential backoff: 0.1 * 1.7**i seconds, capped at 15s
        wait_time = min(1.7**i * 0.1, 15)
        try:
            return await func(*args, **kwargs)
        except S3_RETRYABLE_ERRORS as e:
            err = e
            logger.debug("Retryable error: %s", e)
            await asyncio.sleep(wait_time)
        except ClientError as e:
            logger.debug("Client error (maybe retryable): %s", e)
            err = e
            # Throttling-style responses are retried even though ClientError
            # is not generally retryable.
            matched = False
            for pattern in [
                "SlowDown",
                "reduce your request rate",
                "XAmzContentSHA256Mismatch",
            ]:
                if pattern in str(e):
                    matched = True
                    break
            if matched:
                await asyncio.sleep(wait_time)
            else:
                should_retry = CUSTOM_ERROR_HANDLER(e)
                if should_retry:
                    await asyncio.sleep(wait_time)
                else:
                    break
        except Exception as e:
            err = e
            should_retry = CUSTOM_ERROR_HANDLER(e)
            if should_retry:
                await asyncio.sleep(wait_time)
            else:
                logger.debug("Nonretryable error: %s", e)
                break

    if "'coroutine'" in str(err):
        # aiobotocore internal error - fetch original botocore error
        tb = err.__traceback__
        # walk to the innermost frame, where the real response object lives
        while tb.tb_next:
            tb = tb.tb_next
        try:
            await tb.tb_frame.f_locals["response"]
        except Exception as e:
            err = e
    err = translate_boto_error(err)
    raise err


def version_id_kw(version_id):
    """Helper to make versionId kwargs.

    Not all boto3 methods accept a None / empty versionId so dictionary
    expansion solves that problem.
    """
    if version_id:
        return {"VersionId": version_id}
    else:
        return {}


def _coalesce_version_id(*args):
    """Helper to coalesce a list of version_ids down to one"""
    version_ids = set(args)
    if None in version_ids:
        version_ids.remove(None)
    if len(version_ids) > 1:
        raise ValueError(
            "Cannot coalesce version_ids where more than one are defined,"
            f" {version_ids}"
        )
    elif len(version_ids) == 0:
        return None
    else:
        return version_ids.pop()


def calculate_chunksize(filesize, chunksize=None, max_parts=MAX_UPLOAD_PARTS) -> int:
    # Choose a multipart-upload part size that keeps the number of parts for
    # ``filesize`` within ``max_parts``; the requested/default size is only
    # increased, never decreased.
    if chunksize is None:
        chunksize = 50 * 2**20  # default chunksize set to 50 MiB

    required_chunks = math.ceil(filesize / chunksize)
    # increase chunksize to fit within the max_parts limit
    if required_chunks > max_parts:
        # S3 supports uploading objects up to 5 TiB in size,
        # so each chunk can be up to ~524 MiB.
        chunksize = math.ceil(filesize / max_parts)

    return chunksize


class S3FileSystem(AsyncFileSystem):
    """
    Access S3 as if it were a file system.

    This exposes a filesystem-like API (ls, cp, open, etc.) on top of S3
    storage.

    Provide credentials either explicitly (``key=``, ``secret=``) or depend
    on boto's credential methods. See botocore documentation for more
    information. If no credentials are available, use ``anon=True``.

    Parameters
    ----------
    anon : bool (False)
        Whether to use anonymous connection (public buckets only). If False,
        uses the key/secret given, or boto's credential resolver
        (client_kwargs, environment, variables, config files, EC2 IAM
        server, in that order)
    endpoint_url : string (None)
        Use this endpoint_url, if specified. Needed for connecting to
        non-AWS S3 buckets.
Takes precedence over `endpoint_url` in client_kwargs. key : string (None) If not anonymous, use this access key ID, if specified. Takes precedence over `aws_access_key_id` in client_kwargs. secret : string (None) If not anonymous, use this secret access key, if specified. Takes precedence over `aws_secret_access_key` in client_kwargs. token : string (None) If not anonymous, use this security token, if specified use_ssl : bool (True) Whether to use SSL in connections to S3; may be faster without, but insecure. If ``use_ssl`` is also set in ``client_kwargs``, the value set in ``client_kwargs`` will take priority. s3_additional_kwargs : dict of parameters that are used when calling s3 api methods. Typically used for things like "ServerSideEncryption". client_kwargs : dict of parameters for the botocore client requester_pays : bool (False) If RequesterPays buckets are supported. default_block_size: int (None) If given, the default block size value used for ``open()``, if no specific value is given at all time. The built-in default is 50MB. default_fill_cache : Bool (True) Whether to use cache filling with open by default. Refer to ``S3File.open``. default_cache_type : string ("readahead") If given, the default cache_type value used for ``open()``. Set to "none" if no caching is desired. See fsspec's documentation for other available cache_type values. Default cache_type is "readahead". version_aware : bool (False) Whether to support bucket versioning. If enable this will require the user to have the necessary IAM permissions for dealing with versioned objects. Note that in the event that you only need to work with the latest version of objects in a versioned bucket, and do not need the VersionId for those objects, you should set ``version_aware`` to False for performance reasons. When set to True, filesystem instances will use the S3 ListObjectVersions API call to list directory contents, which requires listing all historical object versions. 
cache_regions : bool (False) Whether to cache bucket regions or not. Whenever a new bucket is used, it will first find out which region it belongs and then use the client for that region. asynchronous : bool (False) Whether this instance is to be used from inside coroutines. config_kwargs : dict of parameters passed to ``botocore.client.Config`` kwargs : other parameters for core session. session : aiobotocore AioSession object to be used for all connections. This session will be used inplace of creating a new session inside S3FileSystem. For example: aiobotocore.session.AioSession(profile='test_user') max_concurrency : int (10) The maximum number of concurrent transfers to use per file for multipart upload (``put()``) operations. Defaults to 10. When used in conjunction with ``S3FileSystem.put(batch_size=...)`` the maximum number of simultaneous connections is ``max_concurrency * batch_size``. We may extend this parameter to affect ``pipe()``, ``cat()`` and ``get()``. Increasing this value will result in higher memory usage during multipart upload operations (by ``max_concurrency * chunksize`` bytes per file). fixed_upload_size : bool (False) Use same chunk size for all parts in multipart upload (last part can be smaller). Cloudflare R2 storage requires fixed_upload_size=True for multipart uploads. The following parameters are passed on to fsspec: skip_instance_cache: to control reuse of instances use_listings_cache, listings_expiry_time, max_paths: to control reuse of directory listings Examples -------- >>> s3 = S3FileSystem(anon=False) # doctest: +SKIP >>> s3.ls('my-bucket/') # doctest: +SKIP ['my-file.txt'] >>> with s3.open('my-bucket/my-file.txt', mode='rb') as f: # doctest: +SKIP ... print(f.read()) # doctest: +SKIP b'Hello, world!' 
""" root_marker = "" connect_timeout = 5 retries = 5 read_timeout = 15 default_block_size = 50 * 2**20 protocol = ("s3", "s3a") _extra_tokenize_attributes = ("default_block_size",) def __init__( self, anon=False, endpoint_url=None, key=None, secret=None, token=None, use_ssl=True, client_kwargs=None, requester_pays=False, default_block_size=None, default_fill_cache=True, default_cache_type="readahead", version_aware=False, config_kwargs=None, s3_additional_kwargs=None, session=None, username=None, password=None, cache_regions=False, asynchronous=False, loop=None, max_concurrency=10, fixed_upload_size: bool = False, **kwargs, ): if key and username: raise KeyError("Supply either key or username, not both") if secret and password: raise KeyError("Supply secret or password, not both") if username: key = username if password: secret = password self.endpoint_url = endpoint_url self.anon = anon self.key = key self.secret = secret self.token = token self.kwargs = kwargs super_kwargs = { k: kwargs.pop(k) for k in ["use_listings_cache", "listings_expiry_time", "max_paths"] if k in kwargs } # passed to fsspec superclass super().__init__(loop=loop, asynchronous=asynchronous, **super_kwargs) self.default_block_size = default_block_size or self.default_block_size self.default_fill_cache = default_fill_cache self.default_cache_type = default_cache_type self.version_aware = version_aware self.client_kwargs = client_kwargs or {} self.config_kwargs = config_kwargs or {} self.req_kw = {"RequestPayer": "requester"} if requester_pays else {} self.s3_additional_kwargs = s3_additional_kwargs or {} self.use_ssl = use_ssl self.cache_regions = cache_regions self._s3 = None self.session = session self.fixed_upload_size = fixed_upload_size if max_concurrency < 1: raise ValueError("max_concurrency must be >= 1") self.max_concurrency = max_concurrency @property def s3(self): if self._s3 is None: if self.asynchronous: raise RuntimeError("please await ``.set_session`` before anything else") 
self.connect() return self._s3 def _filter_kwargs(self, s3_method, kwargs): return self._kwargs_helper.filter_dict(s3_method.__name__, kwargs) async def get_s3(self, bucket=None): if self.cache_regions and bucket is not None: return await self._s3creator.get_bucket_client(bucket) else: return self._s3 async def _call_s3(self, method, *akwarglist, **kwargs): await self.set_session() s3 = await self.get_s3(kwargs.get("Bucket")) method = getattr(s3, method) kw2 = kwargs.copy() kw2.pop("Body", None) logger.debug("CALL: %s - %s - %s", method.__name__, akwarglist, kw2) additional_kwargs = self._get_s3_method_kwargs(method, *akwarglist, **kwargs) return await _error_wrapper( method, kwargs=additional_kwargs, retries=self.retries ) call_s3 = sync_wrapper(_call_s3) def _get_s3_method_kwargs(self, method, *akwarglist, **kwargs): additional_kwargs = self.s3_additional_kwargs.copy() for akwargs in akwarglist: additional_kwargs.update(akwargs) # Add the normal kwargs in additional_kwargs.update(kwargs) # filter all kwargs return self._filter_kwargs(method, additional_kwargs) @staticmethod def _get_kwargs_from_urls(urlpath): """ When we have a urlpath that contains a ?versionId= Assume that we want to use version_aware mode for the filesystem. """ from urllib.parse import urlsplit url_query = urlsplit(urlpath).query out = {} if url_query is not None: from urllib.parse import parse_qs parsed = parse_qs(url_query) if "versionId" in parsed: out["version_aware"] = True return out def _find_bucket_key(self, s3_path): """ This is a helper function that given an s3 path such that the path is of the form: bucket/key It will return the bucket and the key represented by the s3 path """ bucket_format_list = [ re.compile( r"^(?Parn:(aws).*:s3:[a-z\-0-9]*:[0-9]{12}:accesspoint[:/][^/]+)/?" 
r"(?P.*)$" ), re.compile( r"^(?Parn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:]" r"[a-zA-Z0-9\-]{1,63}[/:](bucket|accesspoint)[/:][a-zA-Z0-9\-]{1,63})[/:]?(?P.*)$" ), re.compile( r"^(?Parn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:]" r"[a-zA-Z0-9\-]{1,63}[/:]bucket[/:]" r"[a-zA-Z0-9\-]{1,63})[/:]?(?P.*)$" ), re.compile( r"^(?Parn:(aws).*:s3-object-lambda:[a-z\-0-9]+:[0-9]{12}:" r"accesspoint[/:][a-zA-Z0-9\-]{1,63})[/:]?(?P.*)$" ), ] for bucket_format in bucket_format_list: match = bucket_format.match(s3_path) if match: return match.group("bucket"), match.group("key") s3_components = s3_path.split("/", 1) bucket = s3_components[0] s3_key = "" if len(s3_components) > 1: s3_key = s3_components[1] return bucket, s3_key def split_path(self, path) -> tuple[str, str, str | None]: """ Normalise S3 path string into bucket and key. Parameters ---------- path : string Input path, like `s3://mybucket/path/to/file` Examples -------- >>> split_path("s3://mybucket/path/to/file") ['mybucket', 'path/to/file', None] >>> split_path("s3://mybucket/path/to/versioned_file?versionId=some_version_id") ['mybucket', 'path/to/versioned_file', 'some_version_id'] """ trail = path[len(path.rstrip("/")) :] path = self._strip_protocol(path) path = path.lstrip("/") if "/" not in path: return path, "", None else: bucket, keypart = self._find_bucket_key(path) key, _, version_id = keypart.partition("?versionId=") key += trail # restore trailing slashes removed by AbstractFileSystem._strip_protocol return ( bucket, key, version_id if self.version_aware and version_id else None, ) def _prepare_config_kwargs(self): config_kwargs = self.config_kwargs.copy() if "connect_timeout" not in config_kwargs.keys(): config_kwargs["connect_timeout"] = self.connect_timeout if "read_timeout" not in config_kwargs.keys(): config_kwargs["read_timeout"] = self.read_timeout return config_kwargs async def set_session(self, refresh=False, kwargs={}): """Establish S3 connection object. 
        This async method is called by any operation on an ``S3FileSystem``
        instance. The ``refresh=True`` argument is useful if new credentials
        have been created and the instance needs to be reestablished.
        ``connect`` is a blocking version of ``set_session``.

        Parameters
        ----------
        refresh : bool (False)
            If True, create a new session even if one already exists.
        kwargs : dict
            Currently unused.

        Returns
        -------
        Session to be closed later with await .close()

        Examples
        --------
        >>> s3 = S3FileSystem(profile="")  # doctest: +SKIP
        # use in an async coroutine to assign the client object to a local variable
        >>> await s3.set_session()  # doctest: +SKIP
        # blocking version of set_session
        >>> s3.connect(refresh=True)  # doctest: +SKIP
        """
        if self._s3 is not None and not refresh:
            # Reuse the existing client unless all of its underlying HTTP
            # sessions have been closed, in which case force a refresh.
            hsess = getattr(getattr(self._s3, "_endpoint", None), "http_session", None)
            if hsess is not None:
                if all(_.closed for _ in hsess._sessions.values()):
                    refresh = True
            if not refresh:
                return self._s3
        logger.debug("Setting up s3fs instance")

        client_kwargs = self.client_kwargs.copy()
        init_kwargs = dict(
            aws_access_key_id=self.key,
            aws_secret_access_key=self.secret,
            aws_session_token=self.token,
            endpoint_url=self.endpoint_url,
        )
        # Explicit constructor values take precedence over client_kwargs;
        # drop Nones and values client_kwargs already supplies.
        init_kwargs = {
            key: value
            for key, value in init_kwargs.items()
            if value is not None and value != client_kwargs.get(key)
        }
        if "use_ssl" not in client_kwargs.keys():
            init_kwargs["use_ssl"] = self.use_ssl
        config_kwargs = self._prepare_config_kwargs()
        if self.anon:
            # Anonymous access: strip all credentials and sign nothing.
            from botocore import UNSIGNED

            drop_keys = {
                "aws_access_key_id",
                "aws_secret_access_key",
                "aws_session_token",
            }
            init_kwargs = {
                key: value for key, value in init_kwargs.items() if key not in drop_keys
            }
            client_kwargs = {
                key: value
                for key, value in client_kwargs.items()
                if key not in drop_keys
            }
            config_kwargs["signature_version"] = UNSIGNED

        conf = AioConfig(**config_kwargs)
        if self.session is None or refresh:
            self.session = aiobotocore.session.AioSession(**self.kwargs)

        # Region caching is disabled whenever a region or endpoint is pinned
        # explicitly in any of the parameter sources.
        for parameters in (config_kwargs, self.kwargs, init_kwargs, client_kwargs):
            for option in ("region_name", "endpoint_url"):
                if parameters.get(option):
                    self.cache_regions = False
                    break
        else:
            # NOTE(review): the outer loop contains no ``break``, so this
            # for-else always runs — confirm intent against upstream.
            cache_regions = self.cache_regions

        logger.debug(
            "RC: caching enabled? %r (explicit option is %r)",
            cache_regions,
            self.cache_regions,
        )
        self.cache_regions = cache_regions

        if self.cache_regions:
            # One client per bucket region, resolved lazily.
            s3creator = S3BucketRegionCache(
                self.session, config=conf, **init_kwargs, **client_kwargs
            )
            self._s3 = await s3creator.get_client()
        else:
            s3creator = self.session.create_client(
                "s3", config=conf, **init_kwargs, **client_kwargs
            )
            self._s3 = await s3creator.__aenter__()

        self._s3creator = s3creator
        # the following actually closes the aiohttp connection; use of privates
        # might break in the future, would cause exception at gc time
        if not self.asynchronous:
            weakref.finalize(self, self.close_session, self.loop, self._s3creator)
        self._kwargs_helper = ParamKwargsHelper(self._s3)
        return self._s3

    _connect = set_session

    connect = sync_wrapper(set_session)

    @staticmethod
    def close_session(loop, s3):
        # Best-effort teardown of the client; tries the running loop, then a
        # short synchronous exit, then closing the raw connector directly.
        if loop is not None and loop.is_running():
            try:
                loop = asyncio.get_running_loop()
                loop.create_task(s3.__aexit__(None, None, None))
                return
            except RuntimeError:
                pass
        try:
            sync(loop, s3.__aexit__, None, None, None, timeout=0.1)
            return
        except FSTimeoutError:
            pass
        try:
            # close the actual socket
            s3._client._endpoint.http_session._connector._close()
        except AttributeError:
            # but during shutdown, it may have gone
            pass

    async def _get_delegated_s3pars(self, exp=3600):
        """Get temporary credentials from STS, appropriate for sending across a
        network. Only relevant where the key/secret were explicitly provided.
Parameters ---------- exp : int Time in seconds that credentials are good for Returns ------- dict of parameters """ if self.anon: return {"anon": True} if self.token: # already has temporary cred return { "key": self.key, "secret": self.secret, "token": self.token, "anon": False, } if self.key is None or self.secret is None: # automatic credentials return {"anon": False} async with self.session.create_client("sts") as sts: cred = sts.get_session_token(DurationSeconds=exp)["Credentials"] return { "key": cred["AccessKeyId"], "secret": cred["SecretAccessKey"], "token": cred["SessionToken"], "anon": False, } get_delegated_s3pars = sync_wrapper(_get_delegated_s3pars) def _open( self, path, mode="rb", block_size=None, acl=False, version_id=None, fill_cache=None, cache_type=None, autocommit=True, size=None, requester_pays=None, cache_options=None, **kwargs, ): """Open a file for reading or writing Parameters ---------- path: string Path of file on S3 mode: string One of 'r', 'w', 'a', 'rb', 'wb', or 'ab'. These have the same meaning as they do for the built-in `open` function. "x" mode, exclusive write, is only known to work on AWS S3, and requires botocore>1.35.20. If the file is multi-part (i.e., has more than one block), the condition is only checked on commit; if this fails, the MPU is aborted. block_size: int Size of data-node blocks if reading fill_cache: bool If seeking to new a part of the file beyond the current buffer, with this True, the buffer will be filled between the sections to best support random access. When reading only a few specific chunks out of a file, performance may be better if False. acl: str Canned ACL to set when writing. False sends no parameter and uses the bucket's preset default; otherwise it should be a member of the `key_acls` set. version_id : str Explicit version of the object to open. This requires that the s3 filesystem is version aware and bucket versioning is enabled on the relevant bucket. 
encoding : str The encoding to use if opening the file in text mode. The platform's default text encoding is used if not given. cache_type : str See fsspec's documentation for available cache_type values. Set to "none" if no caching is desired. If None, defaults to ``self.default_cache_type``. requester_pays : bool (optional) If RequesterPays buckets are supported. If None, defaults to the value used when creating the S3FileSystem (which defaults to False.) kwargs: dict-like Additional parameters used for s3 methods. Typically used for ServerSideEncryption. """ if block_size is None: block_size = self.default_block_size if fill_cache is None: fill_cache = self.default_fill_cache if requester_pays is None: requester_pays = bool(self.req_kw) acl = ( acl or self.s3_additional_kwargs.get("ACL", False) or self.s3_additional_kwargs.get("acl", False) ) kw = self.s3_additional_kwargs.copy() kw.update(kwargs) if not self.version_aware and version_id: raise ValueError( "version_id cannot be specified if the filesystem " "is not version aware" ) if cache_type is None: cache_type = self.default_cache_type return S3File( self, path, mode, block_size=block_size, acl=acl, version_id=version_id, fill_cache=fill_cache, s3_additional_kwargs=kw, cache_type=cache_type, autocommit=autocommit, requester_pays=requester_pays, cache_options=cache_options, size=size, ) async def _lsdir( self, path, refresh=False, max_items=None, delimiter="/", prefix="", versions=False, ): bucket, key, _ = self.split_path(path) if not prefix: prefix = "" if key: prefix = key.lstrip("/") + "/" + prefix if path not in self.dircache or refresh or not delimiter or versions: try: logger.debug("Get directory listing page for %s" % path) dirs = [] files = [] async for c in self._iterdir( bucket, max_items=max_items, delimiter=delimiter, prefix=prefix, versions=versions, ): if c["type"] == "directory": dirs.append(c) else: files.append(c) files += dirs files.sort(key=lambda f: f["name"]) except ClientError as e: 
                raise translate_boto_error(e)

            # Only cache complete, non-versioned, delimiter-based listings;
            # prefix/flat listings are partial views and would poison the cache.
            if delimiter and files and not versions:
                self.dircache[path] = files
            return files
        return self.dircache[path]

    async def _iterdir(
        self, bucket, max_items=None, delimiter="/", prefix="", versions=False
    ):
        """Iterate asynchronously over files and directories under `prefix`.

        The contents are yielded in arbitrary order as info dicts.
        """
        if versions and not self.version_aware:
            raise ValueError(
                "versions cannot be specified if the filesystem is not version aware"
            )
        await self.set_session()
        s3 = await self.get_s3(bucket)
        # Version-aware filesystems must use the versions listing API so that
        # older object versions are visible at all.
        if self.version_aware:
            method = "list_object_versions"
            contents_key = "Versions"
        else:
            method = "list_objects_v2"
            contents_key = "Contents"
        pag = s3.get_paginator(method)
        config = {}
        if max_items is not None:
            config.update(MaxItems=max_items, PageSize=2 * max_items)
        it = pag.paginate(
            Bucket=bucket,
            Prefix=prefix,
            Delimiter=delimiter,
            PaginationConfig=config,
            **self.req_kw,
        )
        async for i in it:
            # CommonPrefixes are the pseudo-directories implied by the
            # delimiter; strip the trailing "/" from the prefix for the key.
            for l in i.get("CommonPrefixes", []):
                c = {
                    "Key": l["Prefix"][:-1],
                    "Size": 0,
                    "StorageClass": "DIRECTORY",
                    "type": "directory",
                }
                self._fill_info(c, bucket, versions=False)
                yield c
            for c in i.get(contents_key, []):
                # When version aware but not asked for all versions, only the
                # latest version of each key is reported.
                if not self.version_aware or c.get("IsLatest") or versions:
                    c["type"] = "file"
                    c["size"] = c["Size"]
                    self._fill_info(c, bucket, versions=versions)
                    yield c

    @staticmethod
    def _fill_info(f, bucket, versions=False):
        # Normalise a raw S3 listing entry in place: prepend the bucket to the
        # key and mirror Size/Key into the lowercase fsspec-style fields.
        f["size"] = f["Size"]
        f["Key"] = "/".join([bucket, f["Key"]])
        f["name"] = f["Key"]
        version_id = f.get("VersionId")
        # "null" is S3's placeholder version id for objects written before
        # versioning was enabled; don't expose it in the name.
        if versions and version_id and version_id != "null":
            f["name"] += f"?versionId={version_id}"

    async def _glob(self, path, **kwargs):
        # A leading "*" would require listing every bucket; refuse it.
        if path.startswith("*"):
            raise ValueError("Cannot traverse all of S3")
        return await super()._glob(path, **kwargs)

    async def _find(
        self, path, maxdepth=None, withdirs=None, detail=False, prefix="", **kwargs
    ):
        """List all files below path.
Like posix ``find`` command without conditions Parameters ---------- path : str maxdepth: int or None If not None, the maximum number of levels to descend withdirs: bool Whether to include directory paths in the output. This is True when used by glob, but users usually only want files. prefix: str Only return files that match ``^{path}/{prefix}`` (if there is an exact match ``filename == {path}/{prefix}``, it also will be included) """ path = self._strip_protocol(path) bucket, key, _ = self.split_path(path) if not bucket: raise ValueError("Cannot traverse all of S3") if (withdirs or maxdepth) and prefix: # TODO: perhaps propagate these to a glob(f"path/{prefix}*") call raise ValueError( "Can not specify 'prefix' option alongside 'withdirs'/'maxdepth' options." ) if maxdepth: return await super()._find( bucket + "/" + key, maxdepth=maxdepth, withdirs=withdirs, detail=detail, **kwargs, ) # TODO: implement find from dircache, if all listings are present # if refresh is False: # out = incomplete_tree_dirs(self.dircache, path) # if len(out) == 1: # await self._find(out[0]) # return super().find(path) # elif len(out) == 0: # return super().find(path) # # else: we refresh anyway, having at least two missing trees out = await self._lsdir(path, delimiter="", prefix=prefix, **kwargs) if (not out and key) and not prefix: try: out = [await self._info(path)] except FileNotFoundError: out = [] dirs = [] sdirs = set() thisdircache = {} for o in out: # not self._parent, because that strips "/" from placeholders par = o["name"].rsplit("/", maxsplit=1)[0] o["Key"] = o["name"] name = o["name"] while "/" in par: if par not in sdirs: sdirs.add(par) d = False if len(path) <= len(par): d = { "Key": par, "Size": 0, "name": par, "StorageClass": "DIRECTORY", "type": "directory", "size": 0, } dirs.append(d) thisdircache[par] = [] ppar = self._parent(par) if ppar in thisdircache: if d and d not in thisdircache[ppar]: thisdircache[ppar].append(d) if par in sdirs and not name.endswith("/"): # 
exclude placeholdees, they do not belong in the directory listing thisdircache[par].append(o) par, name, o = par.rsplit("/", maxsplit=1)[0], par, d if par in thisdircache or par in self.dircache: break # Explicitly add directories to their parents in the dircache for d in dirs: par = self._parent(d["name"]) # extra condition here (in any()) to deal with directory-marking files if par in thisdircache and not any( _["name"] == d["name"] for _ in thisdircache[par] ): thisdircache[par].append(d) if not prefix: for k, v in thisdircache.items(): if k not in self.dircache and len(k) >= len(path): self.dircache[k] = sorted(v, key=lambda x: x["name"]) if withdirs: out = sorted(out + dirs, key=lambda x: x["name"]) if detail: return {o["name"]: o for o in out} return [o["name"] for o in out] find = sync_wrapper(_find) async def _mkdir(self, path, acl=False, create_parents=True, **kwargs): path = self._strip_protocol(path).rstrip("/") if not path: raise ValueError bucket, key, _ = self.split_path(path) if await self._exists(bucket): if not key: # requested to create bucket, but bucket already exist raise FileExistsError # else: # do nothing as bucket is already created. elif not key or create_parents: if acl and acl not in buck_acls: raise ValueError("ACL not in %s", buck_acls) try: params = {"Bucket": bucket} if acl: params["ACL"] = acl region_name = kwargs.get("region_name", None) or self.client_kwargs.get( "region_name", None ) if region_name: params["CreateBucketConfiguration"] = { "LocationConstraint": region_name } await self._call_s3("create_bucket", **params) self.invalidate_cache("") self.invalidate_cache(bucket) except ClientError as e: raise translate_boto_error(e) except ParamValidationError as e: raise ValueError("Bucket create failed %r: %s" % (bucket, e)) else: # raises if bucket doesn't exist and doesn't get create flag. 
            # raises FileNotFoundError if the bucket is absent and
            # create_parents was not requested
            await self._ls(bucket)

    mkdir = sync_wrapper(_mkdir)

    async def _makedirs(self, path, exist_ok=False):
        """Recursively create directories (buckets); optionally tolerate
        an already-existing target."""
        try:
            await self._mkdir(path, create_parents=True)
        except FileExistsError:
            if exist_ok:
                pass
            else:
                raise

    makedirs = sync_wrapper(_makedirs)

    async def _rmdir(self, path):
        """Remove an empty bucket; keys cannot be removed this way."""
        bucket, key, _ = self.split_path(path)
        if key:
            # Only whole buckets can be rmdir'd on S3.
            if await self._exists(path):
                # User may have meant rm(path, recursive=True)
                raise FileExistsError
            raise FileNotFoundError
        try:
            await self._call_s3("delete_bucket", Bucket=path)
        except botocore.exceptions.ClientError as e:
            # Map the two common S3 failure modes onto builtin exceptions.
            if "NoSuchBucket" in str(e):
                raise FileNotFoundError(path) from e
            if "BucketNotEmpty" in str(e):
                raise OSError from e
            raise
        self.invalidate_cache(path)
        self.invalidate_cache("")

    rmdir = sync_wrapper(_rmdir)

    async def _lsbuckets(self, refresh=False):
        """List all buckets owned by the login, caching under the "" key."""
        if "" not in self.dircache or refresh:
            if self.anon:
                # cannot list buckets if not logged in
                return []
            try:
                files = (await self._call_s3("list_buckets"))["Buckets"]
            except ClientError:
                # listbucket permission missing
                return []
            # Rewrite raw bucket records into fsspec-style info dicts.
            for f in files:
                f["Key"] = f["Name"]
                f["Size"] = 0
                f["StorageClass"] = "BUCKET"
                f["size"] = 0
                f["type"] = "directory"
                f["name"] = f["Name"]
                del f["Name"]
            self.dircache[""] = files
            return files
        return self.dircache[""]

    async def _ls(self, path, detail=False, refresh=False, versions=False):
        """List files in given bucket, or list of buckets.

        Listing is cached unless `refresh=True`.

        Note: only your buckets associated with the login will be listed by
        `ls('')`, not any public buckets (even if already accessed).
Parameters ---------- path : string/bytes location at which to list files refresh : bool (=False) if False, look in local cache for file details first """ path = self._strip_protocol(path).rstrip("/") if path in ["", "/"]: files = await self._lsbuckets(refresh) else: files = await self._lsdir(path, refresh, versions=versions) if not files and "/" in path: try: files = await self._lsdir( self._parent(path), refresh=refresh, versions=versions ) except OSError: pass files = [ o for o in files if o["name"].rstrip("/") == path and o["type"] != "directory" ] if not files: raise FileNotFoundError(path) if detail: return files return files if detail else sorted([o["name"] for o in files]) def _exists_in_cache(self, path, bucket, key, version_id): fullpath = "/".join((bucket, key)) try: entries = self._ls_from_cache(fullpath) except FileNotFoundError: return False if entries is None: return None if not self.version_aware or version_id is None: return True for entry in entries: if entry["name"] == fullpath and entry.get("VersionId") == version_id: return True # dircache doesn't support multiple versions, so we really can't tell if # the one we want exists. 
        # dircache holds at most one version per name, so absence here is
        # inconclusive for version-aware lookups.
        return None

    async def _exists(self, path):
        """Return whether bucket/key exists, consulting caches before S3."""
        if path in ["", "/"]:
            # the root always exists, even if anon
            return True
        path = self._strip_protocol(path)
        bucket, key, version_id = self.split_path(path)
        if key:
            # Tri-state cache check: True/False are definitive, None means
            # the cache cannot answer and we must ask S3.
            exists_in_cache = self._exists_in_cache(path, bucket, key, version_id)
            if exists_in_cache is not None:
                return exists_in_cache

            try:
                await self._info(path, bucket, key, version_id=version_id)
                return True
            except FileNotFoundError:
                return False
        elif self.dircache.get(bucket, False):
            return True
        else:
            try:
                if self._ls_from_cache(bucket):
                    return True
            except FileNotFoundError:
                # might still be a bucket we can access but don't own
                pass
            # Fall back to two successively weaker remote probes; either one
            # succeeding proves the bucket is reachable.
            try:
                await self._call_s3("head_bucket", Bucket=bucket, **self.req_kw)
                return True
            except Exception:
                pass
            try:
                await self._call_s3("get_bucket_location", Bucket=bucket, **self.req_kw)
                return True
            except Exception:
                return False

    exists = sync_wrapper(_exists)

    async def _touch(self, path, truncate=True, data=None, **kwargs):
        """Create empty file or truncate"""
        bucket, key, version_id = self.split_path(path)
        if version_id:
            raise ValueError("S3 does not support touching existing versions of files")
        if not truncate and await self._exists(path):
            raise ValueError("S3 does not support touching existent files")
        try:
            # put_object with no Body creates a zero-byte object (or truncates
            # an existing one).
            write_result = await self._call_s3(
                "put_object", Bucket=bucket, Key=key, **kwargs
            )
        except ClientError as ex:
            raise translate_boto_error(ex)
        self.invalidate_cache(self._parent(path))
        return write_result

    touch = sync_wrapper(_touch)

    async def _cat_file(self, path, version_id=None, start=None, end=None):
        """Fetch (a byte range of) a single object's contents."""
        bucket, key, vers = self.split_path(path)

        if start is not None or end is not None:
            # Translate start/end into an HTTP Range header value.
            head = {"Range": await self._process_limits(path, start, end)}
        else:
            head = {}

        async def _call_and_read():
            resp = await self._call_s3(
                "get_object",
                Bucket=bucket,
                Key=key,
                **version_id_kw(version_id or vers),
                **head,
                **self.req_kw,
            )
            try:
                return await resp["Body"].read()
            finally:
                # Always release the streaming body, even if read() raised.
                resp["Body"].close()

        return await _error_wrapper(_call_and_read,
retries=self.retries) async def _pipe_file( self, path, data, chunksize=50 * 2**20, max_concurrency=None, mode="overwrite", **kwargs, ): """ mode=="create", exclusive write, is only known to work on AWS S3, and requires botocore>1.35.20 """ bucket, key, _ = self.split_path(path) concurrency = max_concurrency or self.max_concurrency size = len(data) if mode == "create": match = {"IfNoneMatch": "*"} else: match = {} # 5 GB is the limit for an S3 PUT if size < min(5 * 2**30, 2 * chunksize): out = await self._call_s3( "put_object", Bucket=bucket, Key=key, Body=data, **kwargs, **match ) self.invalidate_cache(path) return out else: mpu = await self._call_s3( "create_multipart_upload", Bucket=bucket, Key=key, **kwargs ) ranges = list(range(0, len(data), chunksize)) inds = list(range(0, len(ranges), concurrency)) + [len(ranges)] parts = [] try: for start, stop in zip(inds[:-1], inds[1:]): out = await asyncio.gather( *[ self._call_s3( "upload_part", Bucket=bucket, PartNumber=i + 1, UploadId=mpu["UploadId"], Body=data[ranges[i] : ranges[i] + chunksize], Key=key, ) for i in range(start, stop) ] ) parts.extend( {"PartNumber": i + 1, "ETag": o["ETag"]} for i, o in zip(range(start, stop), out) ) await self._call_s3( "complete_multipart_upload", Bucket=bucket, Key=key, UploadId=mpu["UploadId"], MultipartUpload={"Parts": parts}, **match, ) self.invalidate_cache(path) except Exception: await self._abort_mpu(bucket, key, mpu["UploadId"]) raise async def _put_file( self, lpath, rpath, callback=_DEFAULT_CALLBACK, chunksize=None, max_concurrency=None, mode="overwrite", **kwargs, ): """ mode=="create", exclusive write, is only known to work on AWS S3, and requires botocore>1.35.20 """ bucket, key, _ = self.split_path(rpath) if os.path.isdir(lpath): if key: # don't make remote "directory" return else: await self._mkdir(lpath) size = os.path.getsize(lpath) callback.set_size(size) if mode == "create": match = {"IfNoneMatch": "*"} else: match = {} if "ContentType" not in kwargs: 
content_type, _ = mimetypes.guess_type(lpath) if content_type is not None: kwargs["ContentType"] = content_type chunksize = calculate_chunksize(size, chunksize=chunksize) with open(lpath, "rb") as f0: if size < min(5 * 2**30, 2 * chunksize): chunk = f0.read() await self._call_s3( "put_object", Bucket=bucket, Key=key, Body=chunk, **kwargs, **match ) callback.relative_update(size) else: mpu = await self._call_s3( "create_multipart_upload", Bucket=bucket, Key=key, **kwargs ) try: out = await self._upload_file_part_concurrent( bucket, key, mpu, f0, chunksize, callback=callback, max_concurrency=max_concurrency, ) parts = [ {"PartNumber": i + 1, "ETag": o["ETag"]} for i, o in enumerate(out) ] await self._call_s3( "complete_multipart_upload", Bucket=bucket, Key=key, UploadId=mpu["UploadId"], MultipartUpload={"Parts": parts}, **match, ) except Exception: await self._abort_mpu(bucket, key, mpu["UploadId"]) raise while rpath: self.invalidate_cache(rpath) rpath = self._parent(rpath) async def _upload_file_part_concurrent( self, bucket, key, mpu, f0, chunksize, callback=_DEFAULT_CALLBACK, max_concurrency=None, ): max_concurrency = max_concurrency or self.max_concurrency if max_concurrency < 1: raise ValueError("max_concurrency must be >= 1") async def _upload_chunk(chunk, part_number): result = await self._call_s3( "upload_part", Bucket=bucket, PartNumber=part_number, UploadId=mpu["UploadId"], Body=chunk, Key=key, ) callback.relative_update(len(chunk)) return result out = [] while True: chunks = [] for i in range(max_concurrency): chunk = f0.read(chunksize) if chunk: chunks.append(chunk) if not chunks: break out.extend( await asyncio.gather( *[ _upload_chunk(chunk, len(out) + i) for i, chunk in enumerate(chunks, 1) ] ) ) return out async def _get_file( self, rpath, lpath, callback=_DEFAULT_CALLBACK, version_id=None, **kwargs ): if os.path.isdir(lpath): return bucket, key, vers = self.split_path(rpath) async def _open_file(range: int): kw = self.req_kw.copy() if range: 
                # NOTE: the parameter shadows the `range` builtin; it is the
                # first byte offset to resume the download from.
                kw["Range"] = f"bytes={range}-"
            resp = await self._call_s3(
                "get_object",
                Bucket=bucket,
                Key=key,
                **version_id_kw(version_id or vers),
                **kw,
            )
            return resp["Body"], resp.get("ContentLength", None)

        body, content_length = await _open_file(range=0)
        callback.set_size(content_length)

        failed_reads = 0
        bytes_read = 0

        try:
            with open(lpath, "wb") as f0:
                while True:
                    try:
                        # 64 KiB chunks from the streaming body.
                        chunk = await body.read(2**16)
                    except S3_RETRYABLE_ERRORS:
                        failed_reads += 1
                        if failed_reads >= self.retries:
                            # Give up if we've failed too many times.
                            raise
                        # Closing the body may result in an exception if we've failed to read from it.
                        try:
                            body.close()
                        except Exception:
                            pass

                        # Exponential backoff, capped at 15 seconds.
                        await asyncio.sleep(min(1.7**failed_reads * 0.1, 15))

                        # Byte ranges are inclusive, which means we need to be careful to not read the same data twice
                        # in a failure.
                        # Examples:
                        # Read 1 byte -> failure, retry with read_range=0, byte range should be 0-
                        # Read 1 byte, success. Read 1 byte: failure. Retry with read_range=1, byte-range should be 1-
                        # Read 1 bytes, success. Read 1 bytes: success. Read 1 byte, failure. Retry with read_range=2,
                        # byte-range should be 2-.
body, _ = await _open_file(bytes_read) continue if not chunk: break bytes_read += len(chunk) segment_len = f0.write(chunk) callback.relative_update(segment_len) finally: try: body.close() except Exception: pass async def _info(self, path, bucket=None, key=None, refresh=False, version_id=None): path = self._strip_protocol(path) bucket, key, path_version_id = self.split_path(path) fullpath = "/".join((bucket, key)) if version_id is not None: if not self.version_aware: raise ValueError( "version_id cannot be specified if the " "filesystem is not version aware" ) if path in ["/", ""]: return {"name": path, "size": 0, "type": "directory"} version_id = _coalesce_version_id(path_version_id, version_id) if not refresh: out = self._ls_from_cache(fullpath) if out is not None: if self.version_aware and version_id is not None: # If cached info does not match requested version_id, # fallback to calling head_object out = [ o for o in out if o["name"] == fullpath and version_id == o.get("VersionId") ] if out: return out[0] else: out = [o for o in out if o["name"] == fullpath] if out: return out[0] return {"name": path, "size": 0, "type": "directory"} if key: try: out = await self._call_s3( "head_object", self.kwargs, Bucket=bucket, Key=key, **version_id_kw(version_id), **self.req_kw, ) return { "ETag": out.get("ETag", ""), "LastModified": out.get("LastModified", ""), "size": out["ContentLength"], "name": "/".join([bucket, key]), "type": "file", "StorageClass": out.get("StorageClass", "STANDARD"), "VersionId": out.get("VersionId"), "ContentType": out.get("ContentType"), } except FileNotFoundError: pass except ClientError as e: raise translate_boto_error(e, set_cause=False) else: try: out = await self._call_s3("head_bucket", Bucket=bucket, **self.req_kw) return { "name": bucket, "type": "directory", "size": 0, "StorageClass": "DIRECTORY", "VersionId": out.get("VersionId"), } except ClientError as e: raise translate_boto_error(e, set_cause=False) try: # We check to see if the path 
is a directory by attempting to list its # contexts. If anything is found, it is indeed a directory out = await self._call_s3( "list_objects_v2", self.kwargs, Bucket=bucket, Prefix=key.rstrip("/") + "/" if key else "", Delimiter="/", MaxKeys=1, **self.req_kw, ) if ( out.get("KeyCount", 0) > 0 or out.get("Contents", []) or out.get("CommonPrefixes", []) ): return { "name": "/".join([bucket, key]), "type": "directory", "size": 0, "StorageClass": "DIRECTORY", } raise FileNotFoundError(path) except ClientError as e: raise translate_boto_error(e, set_cause=False) except ParamValidationError as e: raise ValueError("Failed to list path %r: %s" % (path, e)) async def _checksum(self, path, refresh=False): """ Unique value for current version of file If the checksum is the same from one moment to another, the contents are guaranteed to be the same. If the checksum changes, the contents *might* have changed. Parameters ---------- path : string/bytes path of file to get checksum for refresh : bool (=False) if False, look in local cache for file details first """ info = await self._info(path, refresh=refresh) if info["type"] != "directory": return int(info["ETag"].strip('"').split("-")[0], 16) else: return int(tokenize(info), 16) checksum = sync_wrapper(_checksum) async def _isdir(self, path): path = self._strip_protocol(path).strip("/") # Send buckets to super if "/" not in path: if path == "": return True try: out = await self._lsdir(path) return True except FileNotFoundError: return False if path in self.dircache: for fp in self.dircache[path]: # For files the dircache can contain itself. # If it contains anything other than itself it is a directory. 
                if fp["name"] != path:
                    return True
            return False

        parent = self._parent(path)
        if parent in self.dircache:
            for f in self.dircache[parent]:
                if f["name"] == path:
                    # If we find ourselves return whether we are a directory
                    return f["type"] == "directory"
            return False

        # This only returns things within the path and NOT the path object itself
        try:
            return bool(await self._lsdir(path))
        except FileNotFoundError:
            return False

    isdir = sync_wrapper(_isdir)

    async def _object_version_info(self, path, **kwargs):
        """Return the full list of version records for a single key.

        Pages through ``list_object_versions`` until ``IsTruncated`` is False.
        """
        if not self.version_aware:
            raise ValueError(
                "version specific functionality is disabled for "
                "non-version aware filesystems"
            )
        bucket, key, _ = self.split_path(path)
        # NOTE(review): caller-supplied **kwargs are discarded here; the local
        # dict is reused purely to carry pagination markers between calls.
        kwargs = {}
        out = {"IsTruncated": True}
        versions = []
        while out["IsTruncated"]:
            out = await self._call_s3(
                "list_object_versions",
                kwargs,
                Bucket=bucket,
                Prefix=key,
                **self.req_kw,
            )
            versions.extend(out["Versions"])
            # Continuation markers for the next page.
            kwargs.update(
                {
                    "VersionIdMarker": out.get("NextVersionIdMarker", ""),
                    "KeyMarker": out.get("NextKeyMarker", ""),
                }
            )
        return versions

    object_version_info = sync_wrapper(_object_version_info)

    # Class-level cache mapping path -> last metadata dict written by setxattr.
    _metadata_cache = {}

    async def _metadata(self, path, refresh=False, **kwargs):
        """Return metadata of path.
Parameters ---------- path : string/bytes filename to get metadata for refresh : bool (=False) (ignored) """ bucket, key, version_id = self.split_path(path) response = await self._call_s3( "head_object", kwargs, Bucket=bucket, Key=key, **version_id_kw(version_id), **self.req_kw, ) meta = {k.replace("_", "-"): v for k, v in response["Metadata"].items()} return meta metadata = sync_wrapper(_metadata) def get_tags(self, path): """Retrieve tag key/values for the given path Returns ------- {str: str} """ bucket, key, version_id = self.split_path(path) response = self.call_s3( "get_object_tagging", Bucket=bucket, Key=key, **version_id_kw(version_id), ) return {v["Key"]: v["Value"] for v in response["TagSet"]} def put_tags(self, path, tags, mode="o"): """Set tags for given existing key Tags are a str:str mapping that can be attached to any key, see https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/allocation-tag-restrictions.html This is similar to, but distinct from, key metadata, which is usually set at key creation time. Parameters ---------- path: str Existing key to attach tags to tags: dict str, str Tags to apply. mode: One of 'o' or 'm' 'o': Will over-write any existing tags. 'm': Will merge in new tags with existing tags. Incurs two remote calls. """ bucket, key, version_id = self.split_path(path) if mode == "m": existing_tags = self.get_tags(path=path) existing_tags.update(tags) new_tags = [{"Key": k, "Value": v} for k, v in existing_tags.items()] elif mode == "o": new_tags = [{"Key": k, "Value": v} for k, v in tags.items()] else: raise ValueError("Mode must be {'o', 'm'}, not %s" % mode) tag = {"TagSet": new_tags} self.call_s3( "put_object_tagging", Bucket=bucket, Key=key, Tagging=tag, **version_id_kw(version_id), ) async def _getxattr(self, path, attr_name, **kwargs): """Get an attribute from the metadata. 
Examples -------- >>> mys3fs.getxattr('mykey', 'attribute_1') # doctest: +SKIP 'value_1' """ attr_name = attr_name.replace("_", "-") xattr = await self._metadata(path, **kwargs) if attr_name in xattr: return xattr[attr_name] return None getxattr = sync_wrapper(_getxattr) async def _setxattr(self, path, copy_kwargs=None, **kw_args): """Set metadata. Attributes have to be of the form documented in the `Metadata Reference`_. Parameters ---------- kw_args : key-value pairs like field="value", where the values must be strings. Does not alter existing fields, unless the field appears here - if the value is None, delete the field. copy_kwargs : dict, optional dictionary of additional params to use for the underlying s3.copy_object. Examples -------- >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2') # doctest: +SKIP # Example for use with copy_args >>> mys3file.setxattr(copy_kwargs={'ContentType': 'application/pdf'}, ... attribute_1='value1') # doctest: +SKIP .. _Metadata Reference: http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html#object-metadata """ kw_args = {k.replace("_", "-"): v for k, v in kw_args.items()} bucket, key, version_id = self.split_path(path) metadata = await self._metadata(path) metadata.update(**kw_args) copy_kwargs = copy_kwargs or {} # remove all keys that are None for kw_key in kw_args: if kw_args[kw_key] is None: metadata.pop(kw_key, None) src = {"Bucket": bucket, "Key": key} if version_id: src["VersionId"] = version_id await self._call_s3( "copy_object", copy_kwargs, CopySource=src, Bucket=bucket, Key=key, Metadata=metadata, MetadataDirective="REPLACE", ) # refresh metadata self._metadata_cache[path] = metadata setxattr = sync_wrapper(_setxattr) async def _chmod(self, path, acl, recursive=False, **kwargs): """Set Access Control on a bucket/key See http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl Parameters ---------- path : string the object to set acl : string the value of ACL to apply 
recursive : bool whether to apply the ACL to all keys below the given path too """ bucket, key, version_id = self.split_path(path) if recursive: allfiles = await self._find(path, withdirs=False) await asyncio.gather( *[self._chmod(p, acl, recursive=False) for p in allfiles] ) elif key: if acl not in key_acls: raise ValueError("ACL not in %s", key_acls) await self._call_s3( "put_object_acl", kwargs, Bucket=bucket, Key=key, ACL=acl, **version_id_kw(version_id), ) if not key: if acl not in buck_acls: raise ValueError("ACL not in %s", buck_acls) await self._call_s3("put_bucket_acl", kwargs, Bucket=bucket, ACL=acl) chmod = sync_wrapper(_chmod) async def _url(self, path, expires=3600, client_method="get_object", **kwargs): """Generate presigned URL to access path by HTTP Parameters ---------- path : string the key path we are interested in expires : int the number of seconds this signature will be good for. """ bucket, key, version_id = self.split_path(path) await self.set_session() s3 = await self.get_s3(bucket) return await s3.generate_presigned_url( ClientMethod=client_method, Params=dict(Bucket=bucket, Key=key, **version_id_kw(version_id), **kwargs), ExpiresIn=expires, ) url = sync_wrapper(_url) async def _merge(self, path, filelist, **kwargs): """Create single S3 file from list of S3 files Uses multi-part, no data is downloaded. The original files are not deleted. Parameters ---------- path : str The final file to produce filelist : list of str The paths, in order, to assemble into the final file. """ bucket, key, version_id = self.split_path(path) if version_id: raise ValueError("Cannot write to an explicit versioned file!") mpu = await self._call_s3( "create_multipart_upload", kwargs, Bucket=bucket, Key=key ) # TODO: Make this support versions? 
out = await asyncio.gather( *[ self._call_s3( "upload_part_copy", kwargs, Bucket=bucket, Key=key, UploadId=mpu["UploadId"], CopySource=f, PartNumber=i + 1, ) for (i, f) in enumerate(filelist) ] ) parts = [ {"PartNumber": i + 1, "ETag": o["CopyPartResult"]["ETag"]} for (i, o) in enumerate(out) ] part_info = {"Parts": parts} await self._call_s3( "complete_multipart_upload", Bucket=bucket, Key=key, UploadId=mpu["UploadId"], MultipartUpload=part_info, ) self.invalidate_cache(path) merge = sync_wrapper(_merge) async def _copy_basic(self, path1, path2, **kwargs): """Copy file between locations on S3 Not allowed where the origin is >5GB - use copy_managed """ buc1, key1, ver1 = self.split_path(path1) buc2, key2, ver2 = self.split_path(path2) if ver2: raise ValueError("Cannot copy to a versioned file!") try: copy_src = {"Bucket": buc1, "Key": key1} if ver1: copy_src["VersionId"] = ver1 await self._call_s3( "copy_object", kwargs, Bucket=buc2, Key=key2, CopySource=copy_src ) except ClientError as e: raise translate_boto_error(e) except ParamValidationError as e: raise ValueError("Copy failed (%r -> %r): %s" % (path1, path2, e)) from e self.invalidate_cache(path2) async def _copy_etag_preserved(self, path1, path2, size, total_parts, **kwargs): """Copy file between locations on S3 as multi-part while preserving the etag (using the same part sizes for each part""" bucket1, key1, version1 = self.split_path(path1) bucket2, key2, version2 = self.split_path(path2) mpu = await self._call_s3( "create_multipart_upload", Bucket=bucket2, Key=key2, **kwargs ) part_infos = await asyncio.gather( *[ self._call_s3("head_object", Bucket=bucket1, Key=key1, PartNumber=i) for i in range(1, total_parts + 1) ] ) parts = [] brange_first = 0 for i, part_info in enumerate(part_infos, 1): part_size = part_info["ContentLength"] brange_last = brange_first + part_size - 1 if brange_last > size: brange_last = size - 1 part = await self._call_s3( "upload_part_copy", Bucket=bucket2, Key=key2, PartNumber=i, 
UploadId=mpu["UploadId"], CopySource=path1, CopySourceRange="bytes=%i-%i" % (brange_first, brange_last), ) parts.append({"PartNumber": i, "ETag": part["CopyPartResult"]["ETag"]}) brange_first += part_size await self._call_s3( "complete_multipart_upload", Bucket=bucket2, Key=key2, UploadId=mpu["UploadId"], MultipartUpload={"Parts": parts}, ) self.invalidate_cache(path2) async def _copy_managed(self, path1, path2, size, block=50 * 2**20, **kwargs): """Copy file between locations on S3 as multi-part block: int The size of the pieces, must be larger than 5MB and at most 5GB. Smaller blocks mean more calls, only useful for testing. """ if block < 5 * 2**20 or block > 5 * 2**30: raise ValueError("Copy block size must be 5MB<=block<=5GB") bucket, key, version = self.split_path(path2) mpu = await self._call_s3( "create_multipart_upload", Bucket=bucket, Key=key, **kwargs ) # attempting to do the following calls concurrently with gather causes # occasional "upload is smaller than the minimum allowed" out = [ await self._call_s3( "upload_part_copy", Bucket=bucket, Key=key, PartNumber=i + 1, UploadId=mpu["UploadId"], CopySource=self._strip_protocol(path1), CopySourceRange="bytes=%i-%i" % (brange_first, brange_last), ) for i, (brange_first, brange_last) in enumerate(_get_brange(size, block)) ] parts = [ {"PartNumber": i + 1, "ETag": o["CopyPartResult"]["ETag"]} for i, o in enumerate(out) ] await self._call_s3( "complete_multipart_upload", Bucket=bucket, Key=key, UploadId=mpu["UploadId"], MultipartUpload={"Parts": parts}, ) self.invalidate_cache(path2) async def _cp_file(self, path1, path2, preserve_etag=None, **kwargs): """Copy file between locations on S3. preserve_etag: bool Whether to preserve etag while copying. If the file is uploaded as a single part, then it will be always equalivent to the md5 hash of the file hence etag will always be preserved. 
            But if the file is uploaded in multi parts, then this option will
            try to reproduce the same multipart upload while copying and
            preserve the generated etag.
        """
        path1 = self._strip_protocol(path1)
        bucket, key, vers = self.split_path(path1)

        info = await self._info(path1, bucket, key, version_id=vers)
        size = info["size"]

        # A multipart ETag has the form "<md5>-<part count>"; the suffix tells
        # us how many parts the original upload used.
        _, _, parts_suffix = info.get("ETag", "").strip('"').partition("-")
        if preserve_etag and parts_suffix:
            await self._copy_etag_preserved(
                path1, path2, size, total_parts=int(parts_suffix)
            )
        elif size <= MANAGED_COPY_THRESHOLD:
            # simple copy allowed for <5GB
            await self._copy_basic(path1, path2, **kwargs)
        else:
            # if the preserve_etag is true, either the file is uploaded
            # on multiple parts or the size is lower than 5GB
            assert not preserve_etag
            # serial multipart copy
            await self._copy_managed(path1, path2, size, **kwargs)

    async def _list_multipart_uploads(self, bucket):
        # Different S3 implementations report in-progress uploads under
        # different keys; accept either.
        out = await self._call_s3("list_multipart_uploads", Bucket=bucket)
        return out.get("Contents", []) or out.get("Uploads", [])

    list_multipart_uploads = sync_wrapper(_list_multipart_uploads)

    async def _abort_mpu(self, bucket, key, mpu):
        """Abort one multipart upload identified by its UploadId."""
        await self._call_s3(
            "abort_multipart_upload",
            Bucket=bucket,
            Key=key,
            UploadId=mpu,
        )

    abort_mpu = sync_wrapper(_abort_mpu)

    async def _clear_multipart_uploads(self, bucket):
        """Remove any partial uploads in the bucket"""
        await asyncio.gather(
            *[
                self._abort_mpu(bucket, upload["Key"], upload["UploadId"])
                for upload in await self._list_multipart_uploads(bucket)
            ]
        )

    clear_multipart_uploads = sync_wrapper(_clear_multipart_uploads)

    async def _bulk_delete(self, pathlist, **kwargs):
        """
        Remove multiple keys with one call

        Parameters
        ----------
        pathlist : list(str)
            The keys to remove, must all be in the same bucket.
Must have 0 < len <= 1000 """ if not pathlist: return [] buckets = {self.split_path(path)[0] for path in pathlist} if len(buckets) > 1: raise ValueError("Bulk delete files should refer to only one bucket") bucket = buckets.pop() if len(pathlist) > 1000: raise ValueError("Max number of files to delete in one call is 1000") delete_keys = { "Objects": [{"Key": self.split_path(path)[1]} for path in pathlist], "Quiet": True, } for path in pathlist: self.invalidate_cache(self._parent(path)) out = await self._call_s3( "delete_objects", kwargs, Bucket=bucket, Delete=delete_keys ) # TODO: we report on successes but don't raise on any errors, effectively # on_error="omit" return [f"{bucket}/{_['Key']}" for _ in out.get("Deleted", [])] async def _rm_file(self, path, **kwargs): bucket, key, _ = self.split_path(path) self.invalidate_cache(path) try: await self._call_s3("delete_object", Bucket=bucket, Key=key) except ClientError as e: raise translate_boto_error(e) async def _rm(self, path, recursive=False, **kwargs): if recursive and isinstance(path, str): bucket, key, _ = self.split_path(path) if not key and await self._is_bucket_versioned(bucket): # special path to completely remove versioned bucket await self._rm_versioned_bucket_contents(bucket) paths = await self._expand_path(path, recursive=recursive) files = [p for p in paths if self.split_path(p)[1]] dirs = [p for p in paths if not self.split_path(p)[1]] # TODO: fails if more than one bucket in list out = await _run_coros_in_chunks( [ self._bulk_delete(files[i : i + 1000]) for i in range(0, len(files), 1000) ], batch_size=3, nofiles=True, ) await asyncio.gather(*[self._rmdir(d) for d in dirs]) [ (self.invalidate_cache(p), self.invalidate_cache(self._parent(p))) for p in paths ] return sum(out, []) async def _is_bucket_versioned(self, bucket): return (await self._call_s3("get_bucket_versioning", Bucket=bucket)).get( "Status", "" ) == "Enabled" is_bucket_versioned = sync_wrapper(_is_bucket_versioned) async def 
_make_bucket_versioned(self, bucket, versioned: bool = True): """Set bucket versioning status""" status = "Enabled" if versioned else "Suspended" return await self._call_s3( "put_bucket_versioning", Bucket=bucket, VersioningConfiguration={"Status": status}, ) make_bucket_versioned = sync_wrapper(_make_bucket_versioned) async def _rm_versioned_bucket_contents(self, bucket): """Remove a versioned bucket and all contents""" await self.set_session() s3 = await self.get_s3(bucket) pag = s3.get_paginator("list_object_versions") async for plist in pag.paginate(Bucket=bucket): obs = plist.get("Versions", []) + plist.get("DeleteMarkers", []) delete_keys = { "Objects": [ {"Key": i["Key"], "VersionId": i["VersionId"]} for i in obs ], "Quiet": True, } if obs: await self._call_s3("delete_objects", Bucket=bucket, Delete=delete_keys) def invalidate_cache(self, path=None): if path is None: self.dircache.clear() else: path = self._strip_protocol(path) self.dircache.pop(path, None) while path: self.dircache.pop(path, None) path = self._parent(path) async def _walk(self, path, maxdepth=None, **kwargs): if path in ["", "*"] + [f"{p}://" for p in self.protocol]: raise ValueError("Cannot crawl all of S3") async for _ in super()._walk(path, maxdepth=maxdepth, **kwargs): yield _ def modified(self, path, version_id=None, refresh=False): """Return the last modified timestamp of file at `path` as a datetime""" info = self.info(path=path, version_id=version_id, refresh=refresh) if "LastModified" not in info: # This path is a bucket or folder, which do not currently have a modified date raise IsADirectoryError return info["LastModified"] def sign(self, path, expiration=100, **kwargs): return self.url(path, expires=expiration, **kwargs) async def _invalidate_region_cache(self): """Invalidate the region cache (associated with buckets) if ``cache_regions`` is turned on.""" if not self.cache_regions: return None # If the region cache is not initialized, then # do nothing. 
cache = getattr(self, "_s3creator", None) if cache is not None: await cache.clear() invalidate_region_cache = sync_wrapper(_invalidate_region_cache) async def open_async(self, path, mode="rb", **kwargs): if "b" not in mode or kwargs.get("compression"): raise ValueError return S3AsyncStreamedFile(self, path, mode) class S3File(AbstractBufferedFile): """ Open S3 key as a file. Data is only loaded and cached on demand. Parameters ---------- s3 : S3FileSystem botocore connection path : string S3 bucket/key to access mode : str One of 'rb', 'wb', 'ab'. These have the same meaning as they do for the built-in `open` function. block_size : int read-ahead size for finding delimiters fill_cache : bool If seeking to new a part of the file beyond the current buffer, with this True, the buffer will be filled between the sections to best support random access. When reading only a few specific chunks out of a file, performance may be better if False. acl: str Canned ACL to apply version_id : str Optional version to read the file at. If not specified this will default to the current version of the object. This is only used for reading. requester_pays : bool (False) If RequesterPays buckets are supported. Examples -------- >>> s3 = S3FileSystem() # doctest: +SKIP >>> with s3.open('my-bucket/my-file.txt', mode='rb') as f: # doctest: +SKIP ... ... 
# doctest: +SKIP See Also -------- S3FileSystem.open: used to create ``S3File`` objects """ retries = 5 part_min = 5 * 2**20 part_max = 5 * 2**30 def __init__( self, s3, path, mode="rb", block_size=50 * 2**20, acl=False, version_id=None, fill_cache=True, s3_additional_kwargs=None, autocommit=True, cache_type="readahead", requester_pays=False, cache_options=None, size=None, ): bucket, key, path_version_id = s3.split_path(path) if not key: raise ValueError("Attempt to open non key-like path: %s" % path) self.bucket = bucket self.key = key self.version_id = _coalesce_version_id(version_id, path_version_id) self.acl = acl if self.acl and self.acl not in key_acls: raise ValueError("ACL not in %s", key_acls) self.mpu = None self.parts = None self.fill_cache = fill_cache self.s3_additional_kwargs = s3_additional_kwargs or {} self.req_kw = {"RequestPayer": "requester"} if requester_pays else {} if "r" not in mode: if block_size < 5 * 2**20: raise ValueError("Block size must be >=5MB") else: if version_id and s3.version_aware: self.version_id = version_id self.details = s3.info(path, version_id=version_id) self.size = self.details["size"] elif s3.version_aware: # In this case we have not managed to get the VersionId out of details and # we should invalidate the cache and perform a full head_object since it # has likely been partially populated by ls. 
s3.invalidate_cache(path) self.details = s3.info(path) self.version_id = self.details.get("VersionId") super().__init__( s3, path, mode, block_size, autocommit=autocommit, cache_type=cache_type, cache_options=cache_options, size=size, ) self.s3 = self.fs # compatibility # when not using autocommit we want to have transactional state to manage self.append_block = False if "a" in mode and s3.exists(path): # See: # put: https://boto3.amazonaws.com/v1/documentation/api/latest # /reference/services/s3.html#S3.Client.put_object # # head: https://boto3.amazonaws.com/v1/documentation/api/latest # /reference/services/s3.html#S3.Client.head_object head = self._call_s3( "head_object", self.kwargs, Bucket=bucket, Key=key, **version_id_kw(version_id), **self.req_kw, ) head = { key: value for key, value in head.items() if key in _PRESERVE_KWARGS and key not in self.s3_additional_kwargs } loc = head.pop("ContentLength") if loc < 5 * 2**20: # existing file too small for multi-upload: download self.write(self.fs.cat(self.path)) else: self.append_block = True self.loc = loc # Reflect head self.s3_additional_kwargs.update(head) if "r" in mode and size is None and "ETag" in self.details: self.req_kw["IfMatch"] = self.details["ETag"] def _call_s3(self, method, *kwarglist, **kwargs): return self.fs.call_s3(method, self.s3_additional_kwargs, *kwarglist, **kwargs) def _initiate_upload(self): if self.autocommit and not self.append_block and self.tell() < self.blocksize: # only happens when closing small file, use on-shot PUT return logger.debug("Initiate upload for %s" % self) self.parts = [] kw = dict( Bucket=self.bucket, Key=self.key, ) if self.acl: kw["ACL"] = self.acl self.mpu = self._call_s3("create_multipart_upload", **kw) if self.append_block: # use existing data in key when appending, # and block is big enough out = self._call_s3( "upload_part_copy", self.s3_additional_kwargs, Bucket=self.bucket, Key=self.key, PartNumber=1, UploadId=self.mpu["UploadId"], CopySource=self.path, ) 
self.parts.append({"PartNumber": 1, "ETag": out["CopyPartResult"]["ETag"]}) def metadata(self, refresh=False, **kwargs): """Return metadata of file. See :func:`~s3fs.S3Filesystem.metadata`. Metadata is cached unless `refresh=True`. """ return self.fs.metadata(self.path, refresh, **kwargs) def getxattr(self, xattr_name, **kwargs): """Get an attribute from the metadata. See :func:`~s3fs.S3Filesystem.getxattr`. Examples -------- >>> mys3file.getxattr('attribute_1') # doctest: +SKIP 'value_1' """ return self.fs.getxattr(self.path, xattr_name, **kwargs) def setxattr(self, copy_kwargs=None, **kwargs): """Set metadata. See :func:`~s3fs.S3Filesystem.setxattr`. Examples -------- >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2') # doctest: +SKIP """ if self.writable(): raise NotImplementedError( "cannot update metadata while file is open for writing" ) return self.fs.setxattr(self.path, copy_kwargs=copy_kwargs, **kwargs) def url(self, **kwargs): """HTTP URL to read this file (if it already exists)""" return self.fs.url(self.path, **kwargs) def _fetch_range(self, start, end): try: return _fetch_range( self.fs, self.bucket, self.key, self.version_id, start, end, req_kw=self.req_kw, ) except OSError as ex: if ex.args[0] == errno.EINVAL and "pre-conditions" in ex.args[1]: raise FileExpired( filename=self.details["name"], e_tag=self.details.get("ETag") ) from ex else: raise def _upload_chunk(self, final=False): bucket, key, _ = self.fs.split_path(self.path) logger.debug( "Upload for %s, final=%s, loc=%s, buffer loc=%s" % (self, final, self.loc, self.buffer.tell()) ) if ( self.autocommit and not self.append_block and final and self.tell() < self.blocksize ): # only happens when closing small file, use one-shot PUT pass else: self.buffer.seek(0) def upload_part(part_data: bytes): if len(part_data) == 0: return part = len(self.parts) + 1 logger.debug( "Upload chunk %s, %s; %s bytes" % (self, part, len(part_data)) ) out = self._call_s3( "upload_part", Bucket=bucket, 
PartNumber=part, UploadId=self.mpu["UploadId"], Body=part_data, Key=key, ) part_header = {"PartNumber": part, "ETag": out["ETag"]} if "ChecksumSHA256" in out: part_header["ChecksumSHA256"] = out["ChecksumSHA256"] self.parts.append(part_header) def n_bytes_left() -> int: return len(self.buffer.getbuffer()) - self.buffer.tell() min_chunk = 1 if final else self.blocksize # TODO: concurrent here if self.fs.fixed_upload_size: # all chunks have fixed size, exception: last one can be smaller while n_bytes_left() >= min_chunk: upload_part(self.buffer.read(self.blocksize)) else: while n_bytes_left() >= min_chunk: upload_part(self.buffer.read(self.part_max)) if self.autocommit and final: self.commit() else: # update 'upload offset' self.offset += self.buffer.tell() # create new smaller buffer, seek to file end self.buffer = io.BytesIO(self.buffer.read()) self.buffer.seek(0, 2) return False # instruct fsspec.flush to NOT clear self.buffer def commit(self): logger.debug("Commit %s" % self) match = {"IfNoneMatch": "*"} if "x" in self.mode else {} if self.tell() == 0: if self.buffer is not None: logger.debug("Empty file committed %s" % self) self._abort_mpu() write_result = self.fs.touch(self.path, **self.kwargs) elif not self.parts: if self.buffer is not None: logger.debug("One-shot upload of %s" % self) self.buffer.seek(0) data = self.buffer.read() kw = dict(Key=self.key, Bucket=self.bucket, Body=data, **self.kwargs) if self.acl: kw["ACL"] = self.acl write_result = self._call_s3("put_object", **kw, **match) else: raise RuntimeError else: logger.debug("Complete multi-part upload for %s " % self) part_info = {"Parts": self.parts} write_result = self._call_s3( "complete_multipart_upload", Bucket=self.bucket, Key=self.key, UploadId=self.mpu["UploadId"], MultipartUpload=part_info, **match, ) if self.fs.version_aware: self.version_id = write_result.get("VersionId") # complex cache invalidation, since file's appearance can cause several # directories self.buffer = None parts = 
self.path.split("/") path = parts[0] for p in parts[1:]: if path in self.fs.dircache and not [ True for f in self.fs.dircache[path] if f["name"] == path + "/" + p ]: self.fs.invalidate_cache(path) path = path + "/" + p def discard(self): self._abort_mpu() self.buffer = None # file becomes unusable def _abort_mpu(self): if self.mpu: self.fs.abort_mpu(self.bucket, self.key, self.mpu["UploadId"]) self.mpu = None class S3AsyncStreamedFile(AbstractAsyncStreamedFile): def __init__(self, fs, path, mode): self.fs = fs self.path = path self.mode = mode self.r = None self.loc = 0 self.size = None async def read(self, length=-1): if self.r is None: bucket, key, gen = self.fs.split_path(self.path) r = await self.fs._call_s3("get_object", Bucket=bucket, Key=key) self.size = int(r["ResponseMetadata"]["HTTPHeaders"]["content-length"]) self.r = r["Body"] out = await self.r.read(length) self.loc += len(out) return out def _fetch_range(fs, bucket, key, version_id, start, end, req_kw=None): if req_kw is None: req_kw = {} if start == end: logger.debug( "skip fetch for negative range - bucket=%s,key=%s,start=%d,end=%d", bucket, key, start, end, ) return b"" logger.debug("Fetch: %s/%s, %s-%s", bucket, key, start, end) return sync(fs.loop, _inner_fetch, fs, bucket, key, version_id, start, end, req_kw) async def _inner_fetch(fs, bucket, key, version_id, start, end, req_kw=None): async def _call_and_read(): resp = await fs._call_s3( "get_object", Bucket=bucket, Key=key, Range="bytes=%i-%i" % (start, end - 1), **version_id_kw(version_id), **req_kw, ) try: return await resp["Body"].read() finally: resp["Body"].close() return await _error_wrapper(_call_and_read, retries=fs.retries) s3fs-2026.2.0/s3fs/errors.py000066400000000000000000000174311514121105500153660ustar00rootroot00000000000000"""S3 error codes adapted into more natural Python ones. 
Adapted from: https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html """ import errno import functools # Fallback values since some systems might not have these. ENAMETOOLONG = getattr(errno, "ENAMETOOLONG", errno.EINVAL) ENOTEMPTY = getattr(errno, "ENOTEMPTY", errno.EINVAL) EMSGSIZE = getattr(errno, "EMSGSIZE", errno.EINVAL) EREMOTEIO = getattr(errno, "EREMOTEIO", errno.EIO) EREMCHG = getattr(errno, "EREMCHG", errno.ENOENT) ERROR_CODE_TO_EXCEPTION = { "AccessDenied": PermissionError, "AccountProblem": PermissionError, "AllAccessDisabled": PermissionError, "AmbiguousGrantByEmailAddress": functools.partial(IOError, errno.EINVAL), "AuthorizationHeaderMalformed": functools.partial(IOError, errno.EINVAL), "BadDigest": functools.partial(IOError, errno.EINVAL), "BucketAlreadyExists": FileExistsError, "BucketAlreadyOwnedByYou": FileExistsError, "BucketNotEmpty": functools.partial(IOError, ENOTEMPTY), "CredentialsNotSupported": functools.partial(IOError, errno.EINVAL), "CrossLocationLoggingProhibited": PermissionError, "EntityTooSmall": functools.partial(IOError, errno.EINVAL), "EntityTooLarge": functools.partial(IOError, EMSGSIZE), "ExpiredToken": PermissionError, "IllegalLocationConstraintException": PermissionError, "IllegalVersioningConfigurationException": functools.partial(IOError, errno.EINVAL), "IncompleteBody": functools.partial(IOError, errno.EINVAL), "IncorrectNumberOfFilesInPostRequest": functools.partial(IOError, errno.EINVAL), "InlineDataTooLarge": functools.partial(IOError, EMSGSIZE), "InternalError": functools.partial(IOError, EREMOTEIO), "InvalidAccessKeyId": PermissionError, "InvalidAddressingHeader": functools.partial(IOError, errno.EINVAL), "InvalidArgument": functools.partial(IOError, errno.EINVAL), "InvalidBucketName": functools.partial(IOError, errno.EINVAL), "InvalidBucketState": functools.partial(IOError, errno.EPERM), "InvalidDigest": functools.partial(IOError, errno.EINVAL), "InvalidEncryptionAlgorithmError": 
functools.partial(IOError, errno.EINVAL), "InvalidLocationConstraint": functools.partial(IOError, errno.EINVAL), "InvalidObjectState": PermissionError, "InvalidPart": functools.partial(IOError, errno.EINVAL), "InvalidPartOrder": functools.partial(IOError, errno.EINVAL), "InvalidPayer": PermissionError, "InvalidPolicyDocument": functools.partial(IOError, errno.EINVAL), "InvalidRange": functools.partial(IOError, errno.EINVAL), "InvalidRequest": functools.partial(IOError, errno.EINVAL), "InvalidSecurity": PermissionError, "InvalidSOAPRequest": functools.partial(IOError, errno.EINVAL), "InvalidStorageClass": functools.partial(IOError, errno.EINVAL), "InvalidTargetBucketForLogging": functools.partial(IOError, errno.EINVAL), "InvalidToken": functools.partial(IOError, errno.EINVAL), "InvalidURI": functools.partial(IOError, errno.EINVAL), "KeyTooLongError": functools.partial(IOError, ENAMETOOLONG), "MalformedACLError": functools.partial(IOError, errno.EINVAL), "MalformedPOSTRequest": functools.partial(IOError, errno.EINVAL), "MalformedXML": functools.partial(IOError, errno.EINVAL), "MaxMessageLengthExceeded": functools.partial(IOError, EMSGSIZE), "MaxPostPreDataLengthExceededError": functools.partial(IOError, EMSGSIZE), "MetadataTooLarge": functools.partial(IOError, EMSGSIZE), "MethodNotAllowed": functools.partial(IOError, errno.EPERM), "MissingAttachment": functools.partial(IOError, errno.EINVAL), "MissingContentLength": functools.partial(IOError, errno.EINVAL), "MissingRequestBodyError": functools.partial(IOError, errno.EINVAL), "MissingSecurityElement": functools.partial(IOError, errno.EINVAL), "MissingSecurityHeader": functools.partial(IOError, errno.EINVAL), "NoLoggingStatusForKey": functools.partial(IOError, errno.EINVAL), "NoSuchBucket": FileNotFoundError, "NoSuchBucketPolicy": FileNotFoundError, "NoSuchKey": FileNotFoundError, "NoSuchLifecycleConfiguration": FileNotFoundError, "NoSuchUpload": FileNotFoundError, "NoSuchVersion": FileNotFoundError, "NotImplemented": 
functools.partial(IOError, errno.ENOSYS), "NotSignedUp": PermissionError, "OperationAborted": functools.partial(IOError, errno.EBUSY), "PermanentRedirect": functools.partial(IOError, EREMCHG), "PreconditionFailed": functools.partial(IOError, errno.EINVAL), "Redirect": functools.partial(IOError, EREMCHG), "RestoreAlreadyInProgress": functools.partial(IOError, errno.EBUSY), "RequestIsNotMultiPartContent": functools.partial(IOError, errno.EINVAL), "RequestTimeout": TimeoutError, "RequestTimeTooSkewed": PermissionError, "RequestTorrentOfBucketError": functools.partial(IOError, errno.EPERM), "SignatureDoesNotMatch": PermissionError, "ServiceUnavailable": functools.partial(IOError, errno.EBUSY), "SlowDown": functools.partial(IOError, errno.EBUSY), "TemporaryRedirect": functools.partial(IOError, EREMCHG), "TokenRefreshRequired": functools.partial(IOError, errno.EINVAL), "TooManyBuckets": functools.partial(IOError, errno.EINVAL), "UnexpectedContent": functools.partial(IOError, errno.EINVAL), "UnresolvableGrantByEmailAddress": functools.partial(IOError, errno.EINVAL), "UserKeyMustBeSpecified": functools.partial(IOError, errno.EINVAL), "301": functools.partial(IOError, EREMCHG), # PermanentRedirect "307": functools.partial(IOError, EREMCHG), # Redirect "400": functools.partial(IOError, errno.EINVAL), "403": PermissionError, "404": FileNotFoundError, "405": functools.partial(IOError, errno.EPERM), "409": functools.partial(IOError, errno.EBUSY), "412": functools.partial(IOError, errno.EINVAL), # PreconditionFailed "416": functools.partial(IOError, errno.EINVAL), # InvalidRange "500": functools.partial(IOError, EREMOTEIO), # InternalError "501": functools.partial(IOError, errno.ENOSYS), # NotImplemented "503": functools.partial(IOError, errno.EBUSY), # SlowDown } def translate_boto_error(error, message=None, set_cause=True, *args, **kwargs): """Convert a ClientError exception into a Python one. 
Parameters ---------- error : botocore.exceptions.ClientError The exception returned by the boto API. message : str An error message to use for the returned exception. If not given, the error message returned by the server is used instead. set_cause : bool Whether to set the __cause__ attribute to the previous exception if the exception is translated. *args, **kwargs : Additional arguments to pass to the exception constructor, after the error message. Useful for passing the filename arguments to ``IOError``. Returns ------- An instantiated exception ready to be thrown. If the error code isn't recognized, an IOError with the original error message is returned. """ error_response = getattr(error, "response", None) if error_response is None: # non-http error, or response is None: return error code = error_response["Error"].get("Code") if ( code == "PreconditionFailed" and error_response["Error"].get("Condition", "") == "If-None-Match" ): constructor = FileExistsError else: constructor = ERROR_CODE_TO_EXCEPTION.get(code) if constructor: if not message: message = error_response["Error"].get("Message", str(error)) custom_exc = constructor(message, *args, **kwargs) else: # No match found, wrap this in an IOError with the appropriate message. 
custom_exc = OSError(errno.EIO, message or str(error), *args) if set_cause: custom_exc.__cause__ = error return custom_exc s3fs-2026.2.0/s3fs/mapping.py000066400000000000000000000003551514121105500155020ustar00rootroot00000000000000from .core import S3FileSystem def S3Map(root, s3, check=False, create=False): """Mirror previous class, not implemented in fsspec""" s3 = s3 or S3FileSystem.current() return s3.get_mapper(root, check=check, create=create) s3fs-2026.2.0/s3fs/tests/000077500000000000000000000000001514121105500146345ustar00rootroot00000000000000s3fs-2026.2.0/s3fs/tests/__init__.py000066400000000000000000000000001514121105500167330ustar00rootroot00000000000000s3fs-2026.2.0/s3fs/tests/derived/000077500000000000000000000000001514121105500162565ustar00rootroot00000000000000s3fs-2026.2.0/s3fs/tests/derived/__init__.py000066400000000000000000000000001514121105500203550ustar00rootroot00000000000000s3fs-2026.2.0/s3fs/tests/derived/s3fs_fixtures.py000066400000000000000000000054511514121105500214440ustar00rootroot00000000000000import json import os import pytest import requests import time from fsspec.tests.abstract import AbstractFixtures from s3fs.core import S3FileSystem test_bucket_name = "test" secure_bucket_name = "test-secure" versioned_bucket_name = "test-versioned" port = 5556 endpoint_uri = "http://127.0.0.1:%s/" % port class S3fsFixtures(AbstractFixtures): @pytest.fixture(scope="class") def fs(self, _s3_base, _get_boto3_client): client = _get_boto3_client client.create_bucket(Bucket=test_bucket_name, ACL="public-read") client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read") client.put_bucket_versioning( Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"} ) # initialize secure bucket client.create_bucket(Bucket=secure_bucket_name, ACL="public-read") policy = json.dumps( { "Version": "2012-10-17", "Id": "PutObjPolicy", "Statement": [ { "Sid": "DenyUnEncryptedObjectUploads", "Effect": "Deny", "Principal": "*", "Action": 
"s3:PutObject", "Resource": f"arn:aws:s3:::{secure_bucket_name}/*", "Condition": { "StringNotEquals": { "s3:x-amz-server-side-encryption": "aws:kms" } }, } ], } ) client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy) S3FileSystem.clear_instance_cache() s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) s3.invalidate_cache() yield s3 @pytest.fixture def fs_path(self): return test_bucket_name @pytest.fixture def supports_empty_directories(self): return False @pytest.fixture(scope="class") def _get_boto3_client(self): from botocore.session import Session # NB: we use the sync botocore client for setup session = Session() return session.create_client("s3", endpoint_url=endpoint_uri) @pytest.fixture(scope="class") def _s3_base(self): # copy of s3_base in test_s3fs from moto.moto_server.threaded_moto_server import ThreadedMotoServer server = ThreadedMotoServer(ip_address="127.0.0.1", port=port) server.start() if "AWS_SECRET_ACCESS_KEY" not in os.environ: os.environ["AWS_SECRET_ACCESS_KEY"] = "foo" if "AWS_ACCESS_KEY_ID" not in os.environ: os.environ["AWS_ACCESS_KEY_ID"] = "foo" print("server up") yield print("moto done") server.stop() s3fs-2026.2.0/s3fs/tests/derived/s3fs_test.py000066400000000000000000000017661514121105500205570ustar00rootroot00000000000000import pytest import fsspec.tests.abstract as abstract from s3fs.tests.derived.s3fs_fixtures import S3fsFixtures class TestS3fsCopy(abstract.AbstractCopyTests, S3fsFixtures): pass class TestS3fsGet(abstract.AbstractGetTests, S3fsFixtures): pass class TestS3fsPut(abstract.AbstractPutTests, S3fsFixtures): pass def botocore_too_old(): import botocore from packaging.version import parse MIN_BOTOCORE_VERSION = "1.33.2" return parse(botocore.__version__) < parse(MIN_BOTOCORE_VERSION) class TestS3fsPipe(abstract.AbstractPipeTests, S3fsFixtures): test_pipe_exclusive = pytest.mark.skipif( botocore_too_old(), reason="Older botocore doesn't support exclusive writes" 
)(abstract.AbstractPipeTests.test_pipe_exclusive) class TestS3fsOpen(abstract.AbstractOpenTests, S3fsFixtures): test_open_exclusive = pytest.mark.xfail( reason="complete_multipart_upload doesn't implement condition in moto" )(abstract.AbstractOpenTests.test_open_exclusive) s3fs-2026.2.0/s3fs/tests/test_custom_error_handler.py000066400000000000000000000165401514121105500224730ustar00rootroot00000000000000"""Tests for custom error handler functionality.""" import asyncio import pytest from botocore.exceptions import ClientError import s3fs.core from s3fs.core import ( S3FileSystem, _error_wrapper, set_custom_error_handler, add_retryable_error, ) # Custom exception types for testing class CustomRetryableError(Exception): """A custom exception that should be retried.""" pass class CustomNonRetryableError(Exception): """A custom exception that should not be retried.""" pass @pytest.fixture(autouse=True) def reset_error_handler(): """Reset the custom error handler and retryable errors after each test.""" original_errors = s3fs.core.S3_RETRYABLE_ERRORS yield # Reset to default handler s3fs.core.CUSTOM_ERROR_HANDLER = lambda e: False # Reset retryable errors tuple s3fs.core.S3_RETRYABLE_ERRORS = original_errors def test_handler_retry_on_custom_exception(): """Test that custom error handler allows retrying on custom exceptions.""" call_count = 0 async def failing_func(): nonlocal call_count call_count += 1 if call_count < 3: raise CustomRetryableError("Custom error that should retry") return "success" # Set up custom handler to retry CustomRetryableError def custom_handler(e): return isinstance(e, CustomRetryableError) set_custom_error_handler(custom_handler) # Should retry and eventually succeed async def run_test(): result = await _error_wrapper(failing_func, retries=5) assert result == "success" assert call_count == 3 # Failed twice, succeeded on third attempt asyncio.run(run_test()) def test_handler_no_retry_on_other_exception(): """Test that custom error handler does 
not retry exceptions it doesn't handle.""" call_count = 0 async def failing_func(): nonlocal call_count call_count += 1 raise CustomNonRetryableError("Custom error that should not retry") # Set up custom handler that only retries CustomRetryableError def custom_handler(e): return isinstance(e, CustomRetryableError) set_custom_error_handler(custom_handler) # Should not retry and fail immediately async def run_test(): with pytest.raises(CustomNonRetryableError): await _error_wrapper(failing_func, retries=5) assert call_count == 1 # Should only be called once asyncio.run(run_test()) def test_handler_with_client_error(): """Test that custom handler can make ClientError retryable.""" call_count = 0 async def failing_func(): nonlocal call_count call_count += 1 if call_count < 3: # Create a ClientError that doesn't match the built-in retry patterns error_response = { "Error": { "Code": "CustomThrottlingError", "Message": "Custom throttling message", } } raise ClientError(error_response, "operation_name") return "success" # Set up custom handler to retry on specific ClientError codes def custom_handler(e): if isinstance(e, ClientError): return e.response.get("Error", {}).get("Code") == "CustomThrottlingError" return False set_custom_error_handler(custom_handler) # Should retry and eventually succeed async def run_test(): result = await _error_wrapper(failing_func, retries=5) assert result == "success" assert call_count == 3 asyncio.run(run_test()) def test_handler_preserves_builtin_retry_pattern(): """Test that custom handler doesn't interfere with built-in retry logic.""" call_count = 0 async def failing_func(): nonlocal call_count call_count += 1 if call_count < 3: # SlowDown is a built-in retryable pattern error_response = { "Error": { "Code": "SlowDown", "Message": "Please reduce your request rate", } } raise ClientError(error_response, "operation_name") return "success" # Set up a custom handler that handles something else def custom_handler(e): return isinstance(e, 
CustomRetryableError) set_custom_error_handler(custom_handler) # Should still retry SlowDown errors due to built-in logic async def run_test(): result = await _error_wrapper(failing_func, retries=5) assert result == "success" assert call_count == 3 asyncio.run(run_test()) def test_handler_max_retries(): """Test that custom handler respects max retries.""" call_count = 0 async def always_failing_func(): nonlocal call_count call_count += 1 raise CustomRetryableError("Always fails") def custom_handler(e): return isinstance(e, CustomRetryableError) set_custom_error_handler(custom_handler) # Should retry up to retries limit then raise async def run_test(): with pytest.raises(CustomRetryableError): await _error_wrapper(always_failing_func, retries=3) assert call_count == 3 asyncio.run(run_test()) def test_handler_sleep_behavior(): """Test that retries due to custom handler also wait between attempts.""" call_times = [] async def failing_func(): call_times.append(asyncio.get_event_loop().time()) raise CustomRetryableError("Retry me") def custom_handler(e): return isinstance(e, CustomRetryableError) set_custom_error_handler(custom_handler) async def run_test(): with pytest.raises(CustomRetryableError): await _error_wrapper(failing_func, retries=3) # Should have made 3 attempts assert len(call_times) == 3 # Check that there was a delay between attempts # The wait time formula is min(1.7**i * 0.1, 15) # For i=0: min(0.1, 15) = 0.1 # For i=1: min(0.17, 15) = 0.17 if len(call_times) >= 2: time_between_first_and_second = call_times[1] - call_times[0] # Should be roughly 0.1 seconds (with some tolerance) assert time_between_first_and_second >= 0.05 asyncio.run(run_test()) def test_default_handler(): """Test behavior when custom handler is not set explicitly.""" call_count = 0 async def failing_func(): nonlocal call_count call_count += 1 raise ValueError("Regular exception") # Don't set a custom handler, use default (returns False) # Should not retry regular exceptions async def 
run_test(): with pytest.raises(ValueError): await _error_wrapper(failing_func, retries=5) assert call_count == 1 asyncio.run(run_test()) def test_add_retryable_error(): """Test adding a custom exception to the retryable errors tuple.""" call_count = 0 async def failing_func(): nonlocal call_count call_count += 1 if call_count < 3: raise CustomRetryableError("Custom error") return "success" # Add CustomRetryableError to the retryable errors add_retryable_error(CustomRetryableError) # Should now be retried automatically without custom handler async def run_test(): result = await _error_wrapper(failing_func, retries=5) assert result == "success" assert call_count == 3 asyncio.run(run_test()) s3fs-2026.2.0/s3fs/tests/test_mapping.py000066400000000000000000000054531514121105500177070ustar00rootroot00000000000000import pytest from s3fs.tests.test_s3fs import s3_base, s3, test_bucket_name from s3fs import S3Map, S3FileSystem root = test_bucket_name + "/mapping" def test_simple(s3): d = s3.get_mapper(root) assert not d assert list(d) == list(d.keys()) == [] assert list(d.values()) == [] assert list(d.items()) == [] s3.get_mapper(root) def test_default_s3filesystem(s3): d = s3.get_mapper(root) assert d.fs is s3 def test_errors(s3): d = s3.get_mapper(root) with pytest.raises(KeyError): d["nonexistent"] try: s3.get_mapper("does-not-exist", check=True) except Exception as e: assert "does-not-exist" in str(e) def test_with_data(s3): d = s3.get_mapper(root) d["x"] = b"123" assert list(d) == list(d.keys()) == ["x"] assert list(d.values()) == [b"123"] assert list(d.items()) == [("x", b"123")] assert d["x"] == b"123" assert bool(d) assert s3.find(root) == [test_bucket_name + "/mapping/x"] d["x"] = b"000" assert d["x"] == b"000" d["y"] = b"456" assert d["y"] == b"456" assert set(d) == {"x", "y"} d.clear() assert list(d) == [] def test_complex_keys(s3): d = s3.get_mapper(root) d[1] = b"hello" assert d[1] == b"hello" del d[1] d[1, 2] = b"world" assert d[1, 2] == b"world" del d[1, 2] 
d["x", 1, 2] = b"hello world" assert d["x", 1, 2] == b"hello world" assert ("x", 1, 2) in d def test_clear_empty(s3): d = s3.get_mapper(root) d.clear() assert list(d) == [] d[1] = b"1" assert list(d) == ["1"] d.clear() assert list(d) == [] def test_no_dircache(s3): from s3fs.tests.test_s3fs import endpoint_uri import fsspec d = fsspec.get_mapper( "s3://" + root, anon=False, client_kwargs={"endpoint_url": endpoint_uri}, use_listings_cache=False, ) d.clear() assert list(d) == [] d[1] = b"1" assert list(d) == ["1"] d.clear() assert list(d) == [] def test_pickle(s3): d = s3.get_mapper(root) d["x"] = b"1" import pickle d2 = pickle.loads(pickle.dumps(d)) assert d2["x"] == b"1" def test_array(s3): from array import array d = s3.get_mapper(root) d["x"] = array("B", [65] * 1000) assert d["x"] == b"A" * 1000 def test_bytearray(s3): d = s3.get_mapper(root) d["x"] = bytearray(b"123") assert d["x"] == b"123" def test_new_bucket(s3): try: s3.get_mapper("new-bucket", check=True) assert False except ValueError as e: assert "create" in str(e) d = s3.get_mapper("new-bucket", create=True) assert not d d = s3.get_mapper("new-bucket/new-directory") assert not d def test_old_api(s3): import fsspec.mapping assert isinstance(S3Map(root, s3), fsspec.mapping.FSMap) s3fs-2026.2.0/s3fs/tests/test_s3fs.py000066400000000000000000002731101514121105500171270ustar00rootroot00000000000000import asyncio import errno import datetime from contextlib import contextmanager import json from concurrent.futures import ProcessPoolExecutor import io import os import random import requests import time import sys import pytest import moto from moto.moto_server.threaded_moto_server import ThreadedMotoServer from itertools import chain import fsspec.core from dateutil.tz import tzutc import botocore import s3fs.core from s3fs.core import MAX_UPLOAD_PARTS, S3FileSystem, calculate_chunksize from s3fs.utils import ignoring, SSEParams from botocore.exceptions import NoCredentialsError from fsspec.asyn import sync 
from fsspec.callbacks import Callback
from packaging import version

# Bucket names used throughout the suite; created fresh against the local
# moto server by the ``s3`` fixture below.
test_bucket_name = "test"
secure_bucket_name = "test-secure"
versioned_bucket_name = "test-versioned"

# Seed objects uploaded into ``test_bucket_name`` before each test.
files = {
    "test/accounts.1.json": (
        b'{"amount": 100, "name": "Alice"}\n'
        b'{"amount": 200, "name": "Bob"}\n'
        b'{"amount": 300, "name": "Charlie"}\n'
        b'{"amount": 400, "name": "Dennis"}\n'
    ),
    "test/accounts.2.json": (
        b'{"amount": 500, "name": "Alice"}\n'
        b'{"amount": 600, "name": "Bob"}\n'
        b'{"amount": 700, "name": "Charlie"}\n'
        b'{"amount": 800, "name": "Dennis"}\n'
    ),
}

csv_files = {
    "2014-01-01.csv": (
        b"name,amount,id\n" b"Alice,100,1\n" b"Bob,200,2\n" b"Charlie,300,3\n"
    ),
    "2014-01-02.csv": (b"name,amount,id\n"),
    "2014-01-03.csv": (
        b"name,amount,id\n" b"Dennis,400,4\n" b"Edith,500,5\n" b"Frank,600,6\n"
    ),
}
text_files = {
    "nested/file1": b"hello\n",
    "nested/file2": b"world",
    "nested/nested2/file1": b"hello\n",
    "nested/nested2/file2": b"world",
}
glob_files = {"file.dat": b"", "filexdat": b""}

# Shorthand scratch paths reused by many tests.
a = test_bucket_name + "/tmp/test/a"
b = test_bucket_name + "/tmp/test/b"
c = test_bucket_name + "/tmp/test/c"
d = test_bucket_name + "/tmp/test/d"

port = 5555
endpoint_uri = "http://127.0.0.1:%s/" % port


@pytest.fixture(scope="module")
def s3_base():
    """Start a writable local S3 system (moto server) for the whole module.

    This fixture is module-scoped, meaning that we can re-use the MotoServer
    across all tests.  Dummy AWS credentials are injected so botocore never
    tries to resolve real ones.
    """
    server = ThreadedMotoServer(ip_address="127.0.0.1", port=port)
    server.start()
    if "AWS_SECRET_ACCESS_KEY" not in os.environ:
        os.environ["AWS_SECRET_ACCESS_KEY"] = "foo"
    if "AWS_ACCESS_KEY_ID" not in os.environ:
        os.environ["AWS_ACCESS_KEY_ID"] = "foo"
    # A configured profile would override the dummy credentials above.
    os.environ.pop("AWS_PROFILE", None)

    print("server up")
    yield
    print("moto done")
    server.stop()


@pytest.fixture(autouse=True)
def reset_s3_fixture():
    """Wipe the shared MotoServer's state so every test starts clean."""
    # We reuse the MotoServer for all tests, but we do want a clean state
    # for every test.  The reset is best-effort: if the server is not (yet)
    # reachable we simply continue.  A bare ``except:`` would also swallow
    # KeyboardInterrupt/SystemExit, so catch only requests-level failures.
    try:
        requests.post(f"{endpoint_uri}/moto-api/reset")
    except requests.exceptions.RequestException:
        pass


def get_boto3_client():
    """Return a synchronous botocore S3 client pointed at the moto server."""
    from botocore.session import Session

    # NB: we use the sync botocore client for setup
    session = Session()
    return session.create_client("s3", endpoint_url=endpoint_uri)


@pytest.fixture()
def s3(s3_base):
    """Yield an ``S3FileSystem`` with the standard test buckets pre-populated.

    Creates the plain, versioned and encryption-enforcing buckets, uploads
    the seed files, and returns a fresh (cache-cleared) filesystem instance.
    """
    client = get_boto3_client()
    client.create_bucket(Bucket=test_bucket_name, ACL="public-read")

    client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read")
    client.put_bucket_versioning(
        Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"}
    )

    # initialize secure bucket: deny any PUT that is not KMS-encrypted
    client.create_bucket(Bucket=secure_bucket_name, ACL="public-read")
    policy = json.dumps(
        {
            "Version": "2012-10-17",
            "Id": "PutObjPolicy",
            "Statement": [
                {
                    "Sid": "DenyUnEncryptedObjectUploads",
                    "Effect": "Deny",
                    "Principal": "*",
                    "Action": "s3:PutObject",
                    "Resource": f"arn:aws:s3:::{secure_bucket_name}/*",
                    "Condition": {
                        "StringNotEquals": {
                            "s3:x-amz-server-side-encryption": "aws:kms"
                        }
                    },
                }
            ],
        }
    )

    client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy)
    for flist in [files, csv_files, text_files, glob_files]:
        for f, data in flist.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)

    # Start from a clean instance cache so no state leaks between tests.
    S3FileSystem.clear_instance_cache()
    s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri})
    s3.invalidate_cache()
    yield s3


@contextmanager
def expect_errno(expected_errno):
    """Expect an OSError and validate its errno code."""
    with pytest.raises(OSError) as error:
        yield
    assert error.value.errno == expected_errno, "OSError has wrong error code."
def test_simple(s3): data = b"a" * (10 * 2**20) with s3.open(a, "wb") as f: f.write(data) with s3.open(a, "rb") as f: out = f.read(len(data)) assert len(data) == len(out) assert out == data def test_with_size(s3): data = b"a" * (10 * 2**20) with s3.open(a, "wb") as f: f.write(data) with s3.open(a, "rb", size=100) as f: assert f.size == 100 out = f.read() assert len(out) == 100 @pytest.mark.parametrize("default_cache_type", ["none", "bytes", "mmap", "readahead"]) def test_default_cache_type(s3, default_cache_type): data = b"a" * (10 * 2**20) s3 = S3FileSystem( anon=False, default_cache_type=default_cache_type, client_kwargs={"endpoint_url": endpoint_uri}, ) with s3.open(a, "wb") as f: f.write(data) with s3.open(a, "rb") as f: assert isinstance(f.cache, fsspec.core.caches[default_cache_type]) out = f.read(len(data)) assert len(data) == len(out) assert out == data def test_ssl_off(): s3 = S3FileSystem(use_ssl=False, client_kwargs={"endpoint_url": endpoint_uri}) assert s3.s3.meta.endpoint_url.startswith("http://") def test_client_kwargs(): s3 = S3FileSystem(client_kwargs={"endpoint_url": "http://foo"}) assert s3.s3.meta.endpoint_url.startswith("http://foo") def test_config_kwargs(): s3 = S3FileSystem( config_kwargs={"signature_version": "s3v4"}, client_kwargs={"endpoint_url": endpoint_uri}, ) assert s3.connect().meta.config.signature_version == "s3v4" def test_config_kwargs_class_attributes_default(): s3 = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri}) assert s3.connect().meta.config.connect_timeout == 5 assert s3.connect().meta.config.read_timeout == 15 def test_config_kwargs_class_attributes_override(): s3 = S3FileSystem( config_kwargs={ "connect_timeout": 60, "read_timeout": 120, }, client_kwargs={"endpoint_url": endpoint_uri}, ) assert s3.connect().meta.config.connect_timeout == 60 assert s3.connect().meta.config.read_timeout == 120 def test_user_session_is_preserved(): from aiobotocore.session import get_session session = get_session() s3 = 
S3FileSystem(session=session) s3.connect() assert s3.session == session def test_idempotent_connect(s3): stale_s3 = s3.s3 stale_session = s3.session s3.connect(refresh=True) assert stale_s3 is not s3.s3 assert stale_session is not s3.session def test_multiple_objects(s3): s3.connect() s3.ls("test") s32 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) assert s32.session assert s3.ls("test") == s32.ls("test") def test_info(s3): s3.touch(a) s3.touch(b) info = s3.info(a) linfo = s3.ls(a, detail=True)[0] assert abs(info.pop("LastModified") - linfo.pop("LastModified")).seconds < 1 info.pop("VersionId") info.pop("ContentType") linfo.pop("Key") linfo.pop("Size") linfo.pop("ChecksumAlgorithm", None) # field DNE in some S3-compatible providers assert info == linfo parent = a.rsplit("/", 1)[0] s3.invalidate_cache() # remove full path from the cache s3.ls(parent) # fill the cache with parent dir assert s3.info(a) == s3.dircache[parent][0] # correct value assert id(s3.info(a)) == id(s3.dircache[parent][0]) # is object from cache assert id(s3.info(f"/{a}")) == id(s3.dircache[parent][0]) # is object from cache new_parent = test_bucket_name + "/foo" s3.mkdir(new_parent) with pytest.raises(FileNotFoundError): s3.info(new_parent) with pytest.raises(FileNotFoundError): s3.ls(new_parent) with pytest.raises(FileNotFoundError): s3.info(new_parent) def test_info_cached(s3): path = test_bucket_name + "/tmp/" fqpath = "s3://" + path s3.touch(path + "test") info = s3.info(fqpath) assert info == s3.info(fqpath) assert info == s3.info(path) def test_checksum(s3): bucket = test_bucket_name d = "checksum" prefix = d + "/e" o1 = prefix + "1" o2 = prefix + "2" path1 = bucket + "/" + o1 path2 = bucket + "/" + o2 client = s3.s3 # init client and files sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="") sync(s3.loop, client.put_object, Bucket=bucket, Key=o2, Body="") # change one file, using cache sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, 
Body="foo") checksum = s3.checksum(path1) s3.ls(path1) # force caching sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar") # refresh == False => checksum doesn't change assert checksum == s3.checksum(path1) # change one file, without cache sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="foo") checksum = s3.checksum(path1, refresh=True) s3.ls(path1) # force caching sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar") # refresh == True => checksum changes assert checksum != s3.checksum(path1, refresh=True) # Test for nonexistent file sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar") s3.ls(path1) # force caching sync(s3.loop, client.delete_object, Bucket=bucket, Key=o1) with pytest.raises(FileNotFoundError): s3.checksum(o1, refresh=True) # Test multipart upload upload_id = sync( s3.loop, client.create_multipart_upload, Bucket=bucket, Key=o1, )["UploadId"] etag1 = sync( s3.loop, client.upload_part, Bucket=bucket, Key=o1, UploadId=upload_id, PartNumber=1, Body="0" * (5 * 1024 * 1024), )["ETag"] etag2 = sync( s3.loop, client.upload_part, Bucket=bucket, Key=o1, UploadId=upload_id, PartNumber=2, Body="0", )["ETag"] sync( s3.loop, client.complete_multipart_upload, Bucket=bucket, Key=o1, UploadId=upload_id, MultipartUpload={ "Parts": [ {"PartNumber": 1, "ETag": etag1}, {"PartNumber": 2, "ETag": etag2}, ] }, ) s3.checksum(path1, refresh=True) def test_multi_checksum(s3): # Moto accepts the request to add checksum, and accepts the checksum mode, # but doesn't actually return the checksum # So, this is mostly a stub test file_key = "checksum" path = test_bucket_name + "/" + file_key s3 = S3FileSystem( anon=False, client_kwargs={"endpoint_url": endpoint_uri}, s3_additional_kwargs={"ChecksumAlgorithm": "SHA256"}, ) with s3.open( path, "wb", blocksize=5 * 2**20, ) as f: f.write(b"0" * (5 * 2**20 + 1)) # starts multipart and puts first part f.write(b"data") # any extra data assert s3.cat(path) == b"0" * (5 * 2**20 + 1) 
+ b"data" FileHead = sync( s3.loop, s3.s3.head_object, Bucket=test_bucket_name, Key=file_key, ChecksumMode="ENABLED", ) # assert "ChecksumSHA256" in FileHead test_xattr_sample_metadata = {"testxattr": "1"} def test_xattr(s3): bucket, key = (test_bucket_name, "tmp/test/xattr") filename = bucket + "/" + key body = b"aaaa" public_read_acl = { "Permission": "READ", "Grantee": { "URI": "http://acs.amazonaws.com/groups/global/AllUsers", "Type": "Group", }, } resp = sync( s3.loop, s3.s3.put_object, Bucket=bucket, Key=key, ACL="public-read", Metadata=test_xattr_sample_metadata, Body=body, ) # save etag for later etag = s3.info(filename)["ETag"] assert ( public_read_acl in sync(s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key)["Grants"] ) assert s3.getxattr(filename, "testxattr") == test_xattr_sample_metadata["testxattr"] assert s3.metadata(filename) == {"testxattr": "1"} # note _ became - s3file = s3.open(filename) assert s3file.getxattr("testxattr") == test_xattr_sample_metadata["testxattr"] assert s3file.metadata() == {"testxattr": "1"} # note _ became - s3file.setxattr(testxattr="2") assert s3file.getxattr("testxattr") == "2" s3file.setxattr(**{"testxattr": None}) assert s3file.metadata() == {} assert s3.cat(filename) == body # check that ACL and ETag are preserved after updating metadata assert ( public_read_acl in sync(s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key)["Grants"] ) assert s3.info(filename)["ETag"] == etag def test_xattr_setxattr_in_write_mode(s3): s3file = s3.open(a, "wb") with pytest.raises(NotImplementedError): s3file.setxattr(test_xattr="1") @pytest.mark.xfail() def test_delegate(s3): out = s3.get_delegated_s3pars() assert out assert out["token"] s32 = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri}, **out) assert not s32.anon assert out == s32.get_delegated_s3pars() def test_not_delegate(): s3 = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri}) out = s3.get_delegated_s3pars() assert out == {"anon": True} s3 = 
S3FileSystem( anon=False, client_kwargs={"endpoint_url": endpoint_uri} ) # auto credentials out = s3.get_delegated_s3pars() assert out == {"anon": False} def test_ls(s3): assert set(s3.ls("", detail=False)) == { test_bucket_name, secure_bucket_name, versioned_bucket_name, } with pytest.raises(FileNotFoundError): s3.ls("nonexistent") fn = test_bucket_name + "/test/accounts.1.json" assert fn in s3.ls(test_bucket_name + "/test", detail=False) def test_pickle(s3): import pickle s32 = pickle.loads(pickle.dumps(s3)) assert s3.ls("test") == s32.ls("test") s33 = pickle.loads(pickle.dumps(s32)) assert s3.ls("test") == s33.ls("test") def test_ls_touch(s3): assert not s3.exists(test_bucket_name + "/tmp/test") s3.touch(a) s3.touch(b) L = s3.ls(test_bucket_name + "/tmp/test", True) assert {d["Key"] for d in L} == {a, b} L = s3.ls(test_bucket_name + "/tmp/test", False) assert set(L) == {a, b} @pytest.mark.parametrize("version_aware", [True, False]) def test_exists_versioned(s3, version_aware): """Test to ensure that a prefix exists when using a versioned bucket""" import uuid n = 3 s3 = S3FileSystem( anon=False, version_aware=version_aware, client_kwargs={"endpoint_url": endpoint_uri}, ) segments = [versioned_bucket_name] + [str(uuid.uuid4()) for _ in range(n)] path = "/".join(segments) for i in range(2, n + 1): assert not s3.exists("/".join(segments[:i])) s3.touch(path) for i in range(2, n + 1): assert s3.exists("/".join(segments[:i])) def test_isfile(s3): assert not s3.isfile("") assert not s3.isfile("/") assert not s3.isfile(test_bucket_name) assert not s3.isfile(test_bucket_name + "/test") assert not s3.isfile(test_bucket_name + "/test/foo") assert s3.isfile(test_bucket_name + "/test/accounts.1.json") assert s3.isfile(test_bucket_name + "/test/accounts.2.json") assert not s3.isfile(a) s3.touch(a) assert s3.isfile(a) assert not s3.isfile(b) assert not s3.isfile(b + "/") s3.mkdir(b) assert not s3.isfile(b) assert not s3.isfile(b + "/") assert not s3.isfile(c) assert not 
s3.isfile(c + "/") s3.mkdir(c + "/") assert not s3.isfile(c) assert not s3.isfile(c + "/") def test_isdir(s3): assert s3.isdir("") assert s3.isdir("/") assert s3.isdir(test_bucket_name) assert s3.isdir(test_bucket_name + "/test") assert not s3.isdir(test_bucket_name + "/test/foo") assert not s3.isdir(test_bucket_name + "/test/accounts.1.json") assert not s3.isdir(test_bucket_name + "/test/accounts.2.json") assert not s3.isdir(a) s3.touch(a) assert not s3.isdir(a) assert not s3.isdir(b) assert not s3.isdir(b + "/") assert not s3.isdir(c) assert not s3.isdir(c + "/") # test cache s3.invalidate_cache() assert not s3.dircache s3.ls(test_bucket_name + "/nested") assert test_bucket_name + "/nested" in s3.dircache assert not s3.isdir(test_bucket_name + "/nested/file1") assert not s3.isdir(test_bucket_name + "/nested/file2") assert s3.isdir(test_bucket_name + "/nested/nested2") assert s3.isdir(test_bucket_name + "/nested/nested2/") def test_rm(s3): assert not s3.exists(a) s3.touch(a) assert s3.exists(a) s3.rm(a) assert not s3.exists(a) # the API is OK with deleting non-files; maybe this is an effect of using bulk # with pytest.raises(FileNotFoundError): # s3.rm(test_bucket_name + '/nonexistent') with pytest.raises(FileNotFoundError): s3.rm("nonexistent") out = s3.rm(test_bucket_name + "/nested", recursive=True) assert test_bucket_name + "/nested/nested2/file1" in out assert not s3.exists(test_bucket_name + "/nested/nested2/file1") # whole bucket out = s3.rm(test_bucket_name, recursive=True) assert test_bucket_name + "/2014-01-01.csv" in out assert not s3.exists(test_bucket_name + "/2014-01-01.csv") assert not s3.exists(test_bucket_name) def test_rmdir(s3): bucket = "test1_bucket" s3.mkdir(bucket) s3.rmdir(bucket) assert bucket not in s3.ls("/") # Issue 689, s3fs rmdir command returns error when given a valid s3 path. 
dir = test_bucket_name + "/dir" assert not s3.exists(dir) with pytest.raises(FileNotFoundError): s3.rmdir(dir) s3.touch(dir + "/file") assert s3.exists(dir) assert s3.exists(dir + "/file") with pytest.raises(FileExistsError): s3.rmdir(dir) with pytest.raises(OSError): s3.rmdir(test_bucket_name) def test_mkdir(s3): bucket = "test1_bucket" s3.mkdir(bucket) assert bucket in s3.ls("/") def test_mkdir_existing_bucket(s3): # mkdir called on existing bucket should be no-op and not calling create_bucket # creating a s3 bucket bucket = "test1_bucket" s3.mkdir(bucket) assert bucket in s3.ls("/") # a second call. with pytest.raises(FileExistsError): s3.mkdir(bucket) def test_mkdir_bucket_and_key_1(s3): bucket = "test1_bucket" file = bucket + "/a/b/c" s3.mkdir(file, create_parents=True) assert bucket in s3.ls("/") def test_mkdir_bucket_and_key_2(s3): bucket = "test1_bucket" file = bucket + "/a/b/c" with pytest.raises(FileNotFoundError): s3.mkdir(file, create_parents=False) assert bucket not in s3.ls("/") def test_mkdir_region_name(s3): bucket = "test2_bucket" s3.mkdir(bucket, region_name="eu-central-1") assert bucket in s3.ls("/") def test_mkdir_client_region_name(s3): bucket = "test3_bucket" s3 = S3FileSystem( anon=False, client_kwargs={"region_name": "eu-central-1", "endpoint_url": endpoint_uri}, ) s3.mkdir(bucket) assert bucket in s3.ls("/") def test_makedirs(s3): bucket = "test_makedirs_bucket" test_file = bucket + "/a/b/c/file" s3.makedirs(test_file) assert bucket in s3.ls("/") def test_makedirs_existing_bucket(s3): bucket = "test_makedirs_bucket" s3.mkdir(bucket) assert bucket in s3.ls("/") test_file = bucket + "/a/b/c/file" # no-op, and no error. 
s3.makedirs(test_file) def test_makedirs_pure_bucket_exist_ok(s3): bucket = "test1_bucket" s3.mkdir(bucket) s3.makedirs(bucket, exist_ok=True) def test_makedirs_pure_bucket_error_on_exist(s3): bucket = "test1_bucket" s3.mkdir(bucket) with pytest.raises(FileExistsError): s3.makedirs(bucket, exist_ok=False) def test_bulk_delete(s3): with pytest.raises(FileNotFoundError): s3.rm(["nonexistent/file"]) filelist = s3.find(test_bucket_name + "/nested") s3.rm(filelist) assert not s3.exists(test_bucket_name + "/nested/nested2/file1") @pytest.mark.xfail(reason="anon user is still privileged on moto") def test_anonymous_access(s3): with ignoring(NoCredentialsError): s3 = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri}) assert s3.ls("") == [] # TODO: public bucket doesn't work through moto with pytest.raises(PermissionError): s3.mkdir("newbucket") def test_s3_file_access(s3): fn = test_bucket_name + "/nested/file1" data = b"hello\n" assert s3.cat(fn) == data assert s3.head(fn, 3) == data[:3] assert s3.tail(fn, 3) == data[-3:] assert s3.tail(fn, 10000) == data def test_s3_file_info(s3): fn = test_bucket_name + "/nested/file1" data = b"hello\n" assert fn in s3.find(test_bucket_name) assert s3.exists(fn) assert not s3.exists(fn + "another") assert s3.info(fn)["Size"] == len(data) with pytest.raises(FileNotFoundError): s3.info(fn + "another") def test_content_type_is_set(s3, tmpdir): test_file = str(tmpdir) + "/test.json" destination = test_bucket_name + "/test.json" open(test_file, "w").write("text") s3.put(test_file, destination) assert s3.info(destination)["ContentType"] == "application/json" def test_content_type_is_not_overrided(s3, tmpdir): test_file = os.path.join(str(tmpdir), "test.json") destination = os.path.join(test_bucket_name, "test.json") open(test_file, "w").write("text") s3.put(test_file, destination, ContentType="text/css") assert s3.info(destination)["ContentType"] == "text/css" def test_bucket_exists(s3): assert s3.exists(test_bucket_name) 
assert not s3.exists(test_bucket_name + "x") s3 = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri}) assert s3.exists(test_bucket_name) assert not s3.exists(test_bucket_name + "x") def test_du(s3): d = s3.du(test_bucket_name, total=False) assert all(isinstance(v, int) and v >= 0 for v in d.values()) assert test_bucket_name + "/nested/file1" in d assert s3.du(test_bucket_name + "/test/", total=True) == sum( map(len, files.values()) ) assert s3.du(test_bucket_name) == s3.du("s3://" + test_bucket_name) # Issue 450, s3.du of non-existent directory dir = test_bucket_name + "/does-not-exist" assert not s3.exists(dir) assert s3.du(dir) == 0 assert s3.du(dir + "/") == 0 def test_s3_ls(s3): fn = test_bucket_name + "/nested/file1" assert fn not in s3.ls(test_bucket_name + "/") assert fn in s3.ls(test_bucket_name + "/nested/") assert fn in s3.ls(test_bucket_name + "/nested") assert s3.ls("s3://" + test_bucket_name + "/nested/") == s3.ls( test_bucket_name + "/nested" ) def test_s3_big_ls(s3): for x in range(1200): s3.touch(test_bucket_name + "/thousand/%i.part" % x) assert len(s3.find(test_bucket_name)) > 1200 s3.rm(test_bucket_name + "/thousand/", recursive=True) assert len(s3.find(test_bucket_name + "/thousand/")) == 0 def test_s3_ls_detail(s3): L = s3.ls(test_bucket_name + "/nested", detail=True) assert all(isinstance(item, dict) for item in L) def test_s3_glob(s3): fn = test_bucket_name + "/nested/file1" assert fn not in s3.glob(test_bucket_name + "/") assert fn not in s3.glob(test_bucket_name + "/*") assert fn not in s3.glob(test_bucket_name + "/nested") assert fn in s3.glob(test_bucket_name + "/nested/*") assert fn in s3.glob(test_bucket_name + "/nested/file*") assert fn in s3.glob(test_bucket_name + "/*/*") assert all( any(p.startswith(f + "/") or p == f for p in s3.find(test_bucket_name)) for f in s3.glob(test_bucket_name + "/nested/*") ) assert [test_bucket_name + "/nested/nested2"] == s3.glob( test_bucket_name + "/nested/nested2" ) out = 
s3.glob(test_bucket_name + "/nested/nested2/*") assert {"test/nested/nested2/file1", "test/nested/nested2/file2"} == set(out) with pytest.raises(ValueError): s3.glob("*") # Make sure glob() deals with the dot character (.) correctly. assert test_bucket_name + "/file.dat" in s3.glob(test_bucket_name + "/file.*") assert test_bucket_name + "/filexdat" not in s3.glob(test_bucket_name + "/file.*") def test_get_list_of_summary_objects(s3): L = s3.ls(test_bucket_name + "/test") assert len(L) == 2 assert [l.lstrip(test_bucket_name).lstrip("/") for l in sorted(L)] == sorted( list(files) ) L2 = s3.ls("s3://" + test_bucket_name + "/test") assert L == L2 def test_read_keys_from_bucket(s3): for k, data in files.items(): file_contents = s3.cat("/".join([test_bucket_name, k])) assert file_contents == data assert s3.cat("/".join([test_bucket_name, k])) == s3.cat( "s3://" + "/".join([test_bucket_name, k]) ) def test_url(s3): fn = test_bucket_name + "/nested/file1" url = s3.url(fn, expires=100) assert "http" in url import urllib.parse components = urllib.parse.urlparse(url) query = urllib.parse.parse_qs(components.query) exp = int(query["Expires"][0]) delta = abs(exp - time.time() - 100) assert delta < 5 with s3.open(fn) as f: assert "http" in f.url() def test_seek(s3): with s3.open(a, "wb") as f: f.write(b"123") with s3.open(a) as f: assert f.read() == b"123" with s3.open(a) as f: f.seek(1000) with pytest.raises(ValueError): f.seek(-1) with pytest.raises(ValueError): f.seek(-5, 2) with pytest.raises(ValueError): f.seek(0, 10) f.seek(0) assert f.read(1) == b"1" f.seek(0) assert f.read(1) == b"1" f.seek(3) assert f.read(1) == b"" f.seek(-1, 2) assert f.read(1) == b"3" f.seek(-1, 1) f.seek(-1, 1) assert f.read(1) == b"2" for i in range(4): assert f.seek(i) == i def test_bad_open(s3): with pytest.raises(ValueError): s3.open("") def test_copy(s3): fn = test_bucket_name + "/test/accounts.1.json" s3.copy(fn, fn + "2") assert s3.cat(fn) == s3.cat(fn + "2") def test_copy_managed(s3): data = 
b"abc" * 12 * 2**20 fn = test_bucket_name + "/test/biggerfile" with s3.open(fn, "wb") as f: f.write(data) sync(s3.loop, s3._copy_managed, fn, fn + "2", size=len(data), block=5 * 2**20) assert s3.cat(fn) == s3.cat(fn + "2") with pytest.raises(ValueError): sync(s3.loop, s3._copy_managed, fn, fn + "3", size=len(data), block=4 * 2**20) with pytest.raises(ValueError): sync(s3.loop, s3._copy_managed, fn, fn + "3", size=len(data), block=6 * 2**30) @pytest.mark.parametrize("recursive", [True, False]) def test_move(s3, recursive): fn = test_bucket_name + "/test/accounts.1.json" data = s3.cat(fn) s3.mv(fn, fn + "2", recursive=recursive) assert s3.cat(fn + "2") == data assert not s3.exists(fn) def test_get_put(s3, tmpdir): test_file = str(tmpdir.join("test.json")) s3.get(test_bucket_name + "/test/accounts.1.json", test_file) data = files["test/accounts.1.json"] assert open(test_file, "rb").read() == data s3.put(test_file, test_bucket_name + "/temp") assert s3.du(test_bucket_name + "/temp", total=False)[ test_bucket_name + "/temp" ] == len(data) assert s3.cat(test_bucket_name + "/temp") == data def test_get_put_big(s3, tmpdir): test_file = str(tmpdir.join("test")) data = b"1234567890A" * 2**20 open(test_file, "wb").write(data) s3.put(test_file, test_bucket_name + "/bigfile") test_file = str(tmpdir.join("test2")) s3.get(test_bucket_name + "/bigfile", test_file) assert open(test_file, "rb").read() == data def test_get_put_with_callback(s3, tmpdir): test_file = str(tmpdir.join("test.json")) class BranchingCallback(Callback): def branch(self, path_1, path_2, kwargs): kwargs["callback"] = BranchingCallback() cb = BranchingCallback() s3.get(test_bucket_name + "/test/accounts.1.json", test_file, callback=cb) assert cb.size == 1 assert cb.value == 1 cb = BranchingCallback() s3.put(test_file, test_bucket_name + "/temp", callback=cb) assert cb.size == 1 assert cb.value == 1 def test_get_file_with_callback(s3, tmpdir): test_file = str(tmpdir.join("test.json")) cb = Callback() 
s3.get_file(test_bucket_name + "/test/accounts.1.json", test_file, callback=cb) assert cb.size == os.stat(test_file).st_size assert cb.value == cb.size def test_get_file_with_kwargs(s3, tmpdir): test_file = str(tmpdir.join("test.json")) get_file_kwargs = {"max_concurency": 1, "random_kwarg": "value"} s3.get_file( test_bucket_name + "/test/accounts.1.json", test_file, **get_file_kwargs ) @pytest.mark.parametrize("size", [2**10, 10 * 2**20]) def test_put_file_with_callback(s3, tmpdir, size): test_file = str(tmpdir.join("test.json")) with open(test_file, "wb") as f: f.write(b"1234567890A" * size) cb = Callback() s3.put_file(test_file, test_bucket_name + "/temp", callback=cb) assert cb.size == os.stat(test_file).st_size assert cb.value == cb.size assert s3.size(test_bucket_name + "/temp") == 11 * size @pytest.mark.parametrize("factor", [1, 5, 6]) def test_put_file_does_not_truncate(s3, tmpdir, factor): test_file = str(tmpdir.join("test.json")) chunksize = 5 * 2**20 block = b"x" * chunksize with open(test_file, "wb") as f: f.write(block * factor) s3.put_file( test_file, test_bucket_name + "/temp", max_concurrency=5, chunksize=chunksize ) assert s3.size(test_bucket_name + "/temp") == factor * chunksize @pytest.mark.parametrize("size", [2**10, 2**20, 10 * 2**20]) def test_pipe_cat_big(s3, size): data = b"1234567890A" * size s3.pipe(test_bucket_name + "/bigfile", data) assert s3.cat(test_bucket_name + "/bigfile") == data def test_errors(s3): with pytest.raises(FileNotFoundError): s3.open(test_bucket_name + "/tmp/test/shfoshf", "rb") # This is fine, no need for interleaving directories on S3 # with pytest.raises((IOError, OSError)): # s3.touch('tmp/test/shfoshf/x') # Deleting nonexistent or zero paths is allowed for now # with pytest.raises(FileNotFoundError): # s3.rm(test_bucket_name + '/tmp/test/shfoshf/x') with pytest.raises(FileNotFoundError): s3.mv(test_bucket_name + "/tmp/test/shfoshf/x", "tmp/test/shfoshf/y") with pytest.raises(ValueError): s3.open("x", "rb") with 
pytest.raises(FileNotFoundError): s3.rm("unknown") with pytest.raises(ValueError): with s3.open(test_bucket_name + "/temp", "wb") as f: f.read() with pytest.raises(ValueError): f = s3.open(test_bucket_name + "/temp", "rb") f.close() f.read() with pytest.raises(ValueError): s3.mkdir("/") with pytest.raises(ValueError): s3.find("") with pytest.raises(ValueError): s3.find("s3://") def test_errors_cause_preservings(monkeypatch, s3): # We translate the error, and preserve the original one with pytest.raises(FileNotFoundError) as exc: s3.rm("unknown") assert type(exc.value.__cause__).__name__ == "NoSuchBucket" async def head_object(*args, **kwargs): raise NoCredentialsError monkeypatch.setattr(type(s3.s3), "head_object", head_object) # Since the error is not translate, the __cause__ would # be None with pytest.raises(NoCredentialsError) as exc: s3.info("test/a.txt") assert exc.value.__cause__ is None def test_read_small(s3): fn = test_bucket_name + "/2014-01-01.csv" with s3.open(fn, "rb", block_size=10, cache_type="bytes") as f: out = [] while True: data = f.read(3) if data == b"": break out.append(data) assert s3.cat(fn) == b"".join(out) # cache drop assert len(f.cache) < len(out) def test_read_s3_block(s3): data = files["test/accounts.1.json"] lines = io.BytesIO(data).readlines() path = test_bucket_name + "/test/accounts.1.json" assert s3.read_block(path, 1, 35, b"\n") == lines[1] assert s3.read_block(path, 0, 30, b"\n") == lines[0] assert s3.read_block(path, 0, 35, b"\n") == lines[0] + lines[1] assert s3.read_block(path, 0, 5000, b"\n") == data assert len(s3.read_block(path, 0, 5)) == 5 assert len(s3.read_block(path, 4, 5000)) == len(data) - 4 assert s3.read_block(path, 5000, 5010) == b"" assert s3.read_block(path, 5, None) == s3.read_block(path, 5, 1000) def test_new_bucket(s3): assert not s3.exists("new") s3.mkdir("new") assert s3.exists("new") with s3.open("new/temp", "wb") as f: f.write(b"hello") with pytest.raises(OSError): s3.rmdir("new") s3.rm("new/temp") 
s3.rmdir("new") assert "new" not in s3.ls("") assert not s3.exists("new") with pytest.raises(FileNotFoundError): s3.ls("new") def test_new_bucket_auto(s3): assert not s3.exists("new") with pytest.raises(Exception): s3.mkdir("new/other", create_parents=False) s3.mkdir("new/other", create_parents=True) assert s3.exists("new") s3.touch("new/afile") with pytest.raises(Exception): s3.rm("new") with pytest.raises(Exception): s3.rmdir("new") s3.rm("new", recursive=True) assert not s3.exists("new") def test_dynamic_add_rm(s3): s3.mkdir("one") s3.mkdir("one/two") assert s3.exists("one") s3.ls("one") s3.touch("one/two/file_a") assert s3.exists("one/two/file_a") s3.rm("one", recursive=True) assert not s3.exists("one") def test_write_small(s3): with s3.open(test_bucket_name + "/test", "wb") as f: f.write(b"hello") assert s3.cat(test_bucket_name + "/test") == b"hello" s3.open(test_bucket_name + "/test", "wb").close() assert s3.info(test_bucket_name + "/test")["size"] == 0 def test_write_small_with_acl(s3): bucket, key = (test_bucket_name, "test-acl") filename = bucket + "/" + key body = b"hello" public_read_acl = { "Permission": "READ", "Grantee": { "URI": "http://acs.amazonaws.com/groups/global/AllUsers", "Type": "Group", }, } with s3.open(filename, "wb", acl="public-read") as f: f.write(body) assert s3.cat(filename) == body assert ( public_read_acl in sync(s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key)["Grants"] ) def test_write_large(s3): "flush() chunks buffer when processing large singular payload" mb = 2**20 payload_size = int(2.5 * 5 * mb) payload = b"0" * payload_size with s3.open(test_bucket_name + "/test", "wb") as fd: fd.write(payload) assert s3.cat(test_bucket_name + "/test") == payload assert s3.info(test_bucket_name + "/test")["size"] == payload_size def test_write_limit(s3): "flush() respects part_max when processing large singular payload" mb = 2**20 block_size = 15 * mb payload_size = 44 * mb payload = b"0" * payload_size with s3.open(test_bucket_name + 
"/test", "wb", blocksize=block_size) as fd: fd.write(payload) assert s3.cat(test_bucket_name + "/test") == payload assert s3.info(test_bucket_name + "/test")["size"] == payload_size def test_write_small_secure(s3): s3 = S3FileSystem( s3_additional_kwargs={"ServerSideEncryption": "aws:kms"}, client_kwargs={"endpoint_url": endpoint_uri}, ) s3.mkdir("mybucket") with s3.open("mybucket/test", "wb") as f: f.write(b"hello") assert s3.cat("mybucket/test") == b"hello" sync(s3.loop, s3.s3.head_object, Bucket="mybucket", Key="test") def test_write_large_secure(s3): # build our own s3fs with the relevant additional kwarg s3 = S3FileSystem( s3_additional_kwargs={"ServerSideEncryption": "AES256"}, client_kwargs={"endpoint_url": endpoint_uri}, ) s3.mkdir("mybucket") with s3.open("mybucket/myfile", "wb") as f: f.write(b"hello hello" * 10**6) assert s3.cat("mybucket/myfile") == b"hello hello" * 10**6 def test_write_fails(s3): with pytest.raises(ValueError): s3.touch(test_bucket_name + "/temp") s3.open(test_bucket_name + "/temp", "rb").write(b"hello") with pytest.raises(ValueError): s3.open(test_bucket_name + "/temp", "wb", block_size=10) f = s3.open(test_bucket_name + "/temp", "wb") f.close() with pytest.raises(ValueError): f.write(b"hello") with pytest.raises(FileNotFoundError): s3.open("nonexistentbucket/temp", "wb").close() def test_write_blocks(s3): with s3.open(test_bucket_name + "/temp", "wb", block_size=5 * 2**20) as f: f.write(b"a" * 2 * 2**20) assert f.buffer.tell() == 2 * 2**20 assert not (f.parts) f.flush() assert f.buffer.tell() == 2 * 2**20 assert not (f.parts) f.write(b"a" * 2 * 2**20) f.write(b"a" * 2 * 2**20) assert f.mpu assert f.parts assert s3.info(test_bucket_name + "/temp")["size"] == 6 * 2**20 with s3.open(test_bucket_name + "/temp2", "wb", block_size=10 * 2**20) as f: f.write(b"a" * 15 * 2**20) assert f.buffer.tell() == 0 assert s3.info(test_bucket_name + "/temp2")["size"] == 15 * 2**20 def test_readline(s3): all_items = chain.from_iterable( [files.items(), 
csv_files.items(), text_files.items()] ) for k, data in all_items: with s3.open("/".join([test_bucket_name, k]), "rb") as f: result = f.readline() expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"") assert result == expected def test_readline_empty(s3): data = b"" with s3.open(a, "wb") as f: f.write(data) with s3.open(a, "rb") as f: result = f.readline() assert result == data def test_readline_blocksize(s3): data = b"ab\n" + b"a" * (10 * 2**20) + b"\nab" with s3.open(a, "wb") as f: f.write(data) with s3.open(a, "rb") as f: result = f.readline() expected = b"ab\n" assert result == expected result = f.readline() expected = b"a" * (10 * 2**20) + b"\n" assert result == expected result = f.readline() expected = b"ab" assert result == expected def test_next(s3): expected = csv_files["2014-01-01.csv"].split(b"\n")[0] + b"\n" with s3.open(test_bucket_name + "/2014-01-01.csv") as f: result = next(f) assert result == expected def test_iterable(s3): data = b"abc\n123" with s3.open(a, "wb") as f: f.write(data) with s3.open(a) as f, io.BytesIO(data) as g: for froms3, fromio in zip(f, g): assert froms3 == fromio f.seek(0) assert f.readline() == b"abc\n" assert f.readline() == b"123" f.seek(1) assert f.readline() == b"bc\n" with s3.open(a) as f: out = list(f) with s3.open(a) as f: out2 = f.readlines() assert out == out2 assert b"".join(out) == data def test_readable(s3): with s3.open(a, "wb") as f: assert not f.readable() with s3.open(a, "rb") as f: assert f.readable() def test_seekable(s3): with s3.open(a, "wb") as f: assert not f.seekable() with s3.open(a, "rb") as f: assert f.seekable() def test_writable(s3): with s3.open(a, "wb") as f: assert f.writable() with s3.open(a, "rb") as f: assert not f.writable() def test_merge(s3): with s3.open(a, "wb") as f: f.write(b"a" * 10 * 2**20) with s3.open(b, "wb") as f: f.write(b"a" * 10 * 2**20) s3.merge(test_bucket_name + "/joined", [a, b]) assert s3.info(test_bucket_name + "/joined")["size"] == 2 * 10 * 2**20 def 
test_append(s3): data = text_files["nested/file1"] with s3.open(test_bucket_name + "/nested/file1", "ab") as f: assert f.tell() == len(data) # append, no write, small file assert s3.cat(test_bucket_name + "/nested/file1") == data with s3.open(test_bucket_name + "/nested/file1", "ab") as f: f.write(b"extra") # append, write, small file assert s3.cat(test_bucket_name + "/nested/file1") == data + b"extra" with s3.open(a, "wb") as f: f.write(b"a" * 10 * 2**20) with s3.open(a, "ab") as f: pass # append, no write, big file data = s3.cat(a) assert len(data) == 10 * 2**20 and set(data) == set(b"a") with s3.open(a, "ab") as f: assert f.parts is None f._initiate_upload() assert f.parts assert f.tell() == 10 * 2**20 f.write(b"extra") # append, small write, big file data = s3.cat(a) assert len(data) == 10 * 2**20 + len(b"extra") assert data[-5:] == b"extra" with s3.open(a, "ab") as f: assert f.tell() == 10 * 2**20 + 5 f.write(b"b" * 10 * 2**20) # append, big write, big file assert f.tell() == 20 * 2**20 + 5 data = s3.cat(a) assert len(data) == 10 * 2**20 + len(b"extra") + 10 * 2**20 assert data[10 * 2**20 : 10 * 2**20 + 5] == b"extra" assert set(data[-10 * 2**20 :]) == set(b"b") # Keep Head Metadata head = dict( CacheControl="public", ContentDisposition="string", ContentEncoding="gzip", ContentLanguage="ru-RU", ContentType="text/csv", Expires=datetime.datetime(2015, 1, 1, 0, 0, tzinfo=tzutc()), Metadata={"string": "string"}, ServerSideEncryption="AES256", StorageClass="REDUCED_REDUNDANCY", WebsiteRedirectLocation="https://www.example.com/", ) with s3.open(a, "wb", **head) as f: f.write(b"data") with s3.open(a, "ab") as f: f.write(b"other") with s3.open(a) as f: filehead = { k: v for k, v in f._call_s3( "head_object", f.kwargs, Bucket=f.bucket, Key=f.key ).items() if k in head } assert filehead == head def test_bigger_than_block_read(s3): with s3.open(test_bucket_name + "/2014-01-01.csv", "rb", block_size=3) as f: out = [] while True: data = f.read(20) out.append(data) if 
len(data) == 0: break assert b"".join(out) == csv_files["2014-01-01.csv"] def test_current(s3): s3._cache.clear() s3 = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri}) assert s3.current() is s3 assert S3FileSystem.current() is s3 def test_array(s3): from array import array data = array("B", [65] * 1000) with s3.open(a, "wb") as f: f.write(data) with s3.open(a, "rb") as f: out = f.read() assert out == b"A" * 1000 def _get_s3_id(s3): return id(s3.s3) @pytest.mark.parametrize( "method", [ "spawn", pytest.param( "forkserver", marks=pytest.mark.skipif( sys.platform.startswith("win"), reason="'forkserver' not available on windows", ), ), ], ) def test_no_connection_sharing_among_processes(s3, method): import multiprocessing as mp ctx = mp.get_context(method) executor = ProcessPoolExecutor(mp_context=ctx) conn_id = executor.submit(_get_s3_id, s3).result() assert id(s3.connect()) != conn_id, "Processes should not share S3 connections." @pytest.mark.xfail() def test_public_file(s3): # works on real s3, not on moto test_bucket_name = "s3fs_public_test" other_bucket_name = "s3fs_private_test" s3.touch(test_bucket_name) s3.touch(test_bucket_name + "/afile") s3.touch(other_bucket_name, acl="public-read") s3.touch(other_bucket_name + "/afile", acl="public-read") s = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri}) with pytest.raises(PermissionError): s.ls(test_bucket_name) s.ls(other_bucket_name) s3.chmod(test_bucket_name, acl="public-read") s3.chmod(other_bucket_name, acl="private") with pytest.raises(PermissionError): s.ls(other_bucket_name, refresh=True) assert s.ls(test_bucket_name, refresh=True) # public file in private bucket with s3.open(other_bucket_name + "/see_me", "wb", acl="public-read") as f: f.write(b"hello") assert s.cat(other_bucket_name + "/see_me") == b"hello" def test_upload_with_s3fs_prefix(s3): path = "s3://test/prefix/key" with s3.open(path, "wb") as f: f.write(b"a" * (10 * 2**20)) with s3.open(path, "ab") as f: f.write(b"b" * 
(10 * 2**20)) def test_multipart_upload_blocksize(s3): blocksize = 5 * (2**20) expected_parts = 3 s3f = s3.open(a, "wb", block_size=blocksize) for _ in range(3): data = b"b" * blocksize s3f.write(data) # Ensure that the multipart upload consists of only 3 parts assert len(s3f.parts) == expected_parts s3f.close() def test_default_pars(s3): s3 = S3FileSystem( default_block_size=20, default_fill_cache=False, client_kwargs={"endpoint_url": endpoint_uri}, ) fn = test_bucket_name + "/" + list(files)[0] with s3.open(fn) as f: assert f.blocksize == 20 assert f.fill_cache is False with s3.open(fn, block_size=40, fill_cache=True) as f: assert f.blocksize == 40 assert f.fill_cache is True def test_tags(s3): tagset = {"tag1": "value1", "tag2": "value2"} fname = list(files)[0] s3.touch(fname) s3.put_tags(fname, tagset) assert s3.get_tags(fname) == tagset # Ensure merge mode updates value of existing key and adds new one new_tagset = {"tag2": "updatedvalue2", "tag3": "value3"} s3.put_tags(fname, new_tagset, mode="m") tagset.update(new_tagset) assert s3.get_tags(fname) == tagset @pytest.mark.parametrize("prefix", ["", "/dir", "/dir/subdir"]) def test_versions(s3, prefix): parent = versioned_bucket_name + prefix versioned_file = parent + "/versioned_file" s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) with s3.open(versioned_file, "wb") as fo: fo.write(b"1") first_version = fo.version_id with s3.open(versioned_file, "wb") as fo: fo.write(b"2") second_version = fo.version_id assert s3.isfile(versioned_file) versions = s3.object_version_info(versioned_file) assert len(versions) == 2 assert {version["VersionId"] for version in versions} == { first_version, second_version, } with s3.open(versioned_file) as fo: assert fo.version_id == second_version assert fo.read() == b"2" with s3.open(versioned_file, version_id=first_version) as fo: assert fo.version_id == first_version assert fo.read() == b"1" versioned_file_v1 = 
f"{versioned_file}?versionId={first_version}" versioned_file_v2 = f"{versioned_file}?versionId={second_version}" assert s3.ls(parent) == [versioned_file] assert set(s3.ls(parent, versions=True)) == {versioned_file_v1, versioned_file_v2} assert s3.exists(versioned_file_v1) assert s3.info(versioned_file_v1) assert s3.exists(versioned_file_v2) assert s3.info(versioned_file_v2) def test_list_versions_many(s3): # moto doesn't actually behave in the same way that s3 does here so this doesn't test # anything really in moto 1.2 s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) versioned_file = versioned_bucket_name + "/versioned_file2" for i in range(1200): with s3.open(versioned_file, "wb") as fo: fo.write(b"1") versions = s3.object_version_info(versioned_file) assert len(versions) == 1200 def test_fsspec_versions_multiple(s3): """Test that the standard fsspec.core.get_fs_token_paths behaves as expected for versionId urls""" s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) versioned_file = versioned_bucket_name + "/versioned_file3" version_lookup = {} for i in range(20): contents = str(i).encode() with s3.open(versioned_file, "wb") as fo: fo.write(contents) version_lookup[fo.version_id] = contents urls = [ f"s3://{versioned_file}?versionId={version}" for version in version_lookup.keys() ] fs, token, paths = fsspec.core.get_fs_token_paths( urls, storage_options=dict(client_kwargs={"endpoint_url": endpoint_uri}) ) assert isinstance(fs, S3FileSystem) assert fs.version_aware for path in paths: with fs.open(path, "rb") as fo: contents = fo.read() assert contents == version_lookup[fo.version_id] def test_versioned_file_fullpath(s3): versioned_file = versioned_bucket_name + "/versioned_file_fullpath" s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) with s3.open(versioned_file, "wb") as fo: fo.write(b"1") # moto doesn't correctly return 
a versionId for a multipart upload. So we resort to this. # version_id = fo.version_id versions = s3.object_version_info(versioned_file) version_ids = [version["VersionId"] for version in versions] version_id = version_ids[0] with s3.open(versioned_file, "wb") as fo: fo.write(b"2") file_with_version = f"{versioned_file}?versionId={version_id}" with s3.open(file_with_version, "rb") as fo: assert fo.version_id == version_id assert fo.read() == b"1" versions = s3.object_version_info(versioned_file) version_ids = [version["VersionId"] for version in versions] assert set(s3.ls(versioned_bucket_name, versions=True)) == { f"{versioned_file}?versionId={vid}" for vid in version_ids } def test_versions_unaware(s3): versioned_file = versioned_bucket_name + "/versioned_file3" s3 = S3FileSystem( anon=False, version_aware=False, client_kwargs={"endpoint_url": endpoint_uri} ) with s3.open(versioned_file, "wb") as fo: fo.write(b"1") with s3.open(versioned_file, "wb") as fo: fo.write(b"2") with s3.open(versioned_file) as fo: assert fo.version_id is None assert fo.read() == b"2" with pytest.raises(ValueError): with s3.open(versioned_file, version_id="0"): fo.read() def test_versions_dircached(s3): versioned_file = versioned_bucket_name + "/dir/versioned_file" s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) with s3.open(versioned_file, "wb") as fo: fo.write(b"1") first_version = fo.version_id with s3.open(versioned_file, "wb") as fo: fo.write(b"2") second_version = fo.version_id s3.find(versioned_bucket_name) cached = s3.dircache[versioned_bucket_name + "/dir"][0] assert cached.get("VersionId") == second_version assert s3.info(versioned_file) == cached assert ( s3.info(versioned_file, version_id=first_version).get("VersionId") == first_version ) assert ( s3.info(versioned_file, version_id=second_version).get("VersionId") == second_version ) def test_text_io__stream_wrapper_works(s3): """Ensure using TextIOWrapper works.""" 
s3.mkdir("bucket") with s3.open("bucket/file.txt", "wb") as fd: fd.write("\u00af\\_(\u30c4)_/\u00af".encode("utf-16-le")) with s3.open("bucket/file.txt", "rb") as fd: with io.TextIOWrapper(fd, "utf-16-le") as stream: assert stream.readline() == "\u00af\\_(\u30c4)_/\u00af" def test_text_io__basic(s3): """Text mode is now allowed.""" s3.mkdir("bucket") with s3.open("bucket/file.txt", "w", encoding="utf-8") as fd: fd.write("\u00af\\_(\u30c4)_/\u00af") with s3.open("bucket/file.txt", "r", encoding="utf-8") as fd: assert fd.read() == "\u00af\\_(\u30c4)_/\u00af" def test_text_io__override_encoding(s3): """Allow overriding the default text encoding.""" s3.mkdir("bucket") with s3.open("bucket/file.txt", "w", encoding="ibm500") as fd: fd.write("Hello, World!") with s3.open("bucket/file.txt", "r", encoding="ibm500") as fd: assert fd.read() == "Hello, World!" def test_readinto(s3): s3.mkdir("bucket") with s3.open("bucket/file.txt", "wb") as fd: fd.write(b"Hello, World!") contents = bytearray(15) with s3.open("bucket/file.txt", "rb") as fd: assert fd.readinto(contents) == 13 assert contents.startswith(b"Hello, World!") def test_change_defaults_only_subsequent(): """Test for Issue #135 Ensure that changing the default block size doesn't affect existing file systems that were created using that default. It should only affect file systems created after the change. 
""" try: S3FileSystem.cachable = False # don't reuse instances with same pars fs_default = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri}) assert fs_default.default_block_size == 50 * (1024**2) fs_overridden = S3FileSystem( default_block_size=64 * (1024**2), client_kwargs={"endpoint_url": endpoint_uri}, ) assert fs_overridden.default_block_size == 64 * (1024**2) # Suppose I want all subsequent file systems to have a block size of 1 GiB # instead of 5 MiB: S3FileSystem.default_block_size = 1024**3 fs_big = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri}) assert fs_big.default_block_size == 1024**3 # Test the other file systems created to see if their block sizes changed assert fs_overridden.default_block_size == 64 * (1024**2) assert fs_default.default_block_size == 50 * (1024**2) finally: S3FileSystem.default_block_size = 5 * (1024**2) S3FileSystem.cachable = True def test_cache_after_copy(s3): # https://github.com/dask/dask/issues/5134 s3.touch("test/afile") assert "test/afile" in s3.ls("s3://test", False) s3.cp("test/afile", "test/bfile") assert "test/bfile" in s3.ls("s3://test", False) def test_autocommit(s3): auto_file = test_bucket_name + "/auto_file" committed_file = test_bucket_name + "/commit_file" aborted_file = test_bucket_name + "/aborted_file" s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) def write_and_flush(path, autocommit): with s3.open(path, "wb", autocommit=autocommit) as fo: fo.write(b"1") return fo # regular behavior fo = write_and_flush(auto_file, autocommit=True) assert fo.autocommit assert s3.exists(auto_file) fo = write_and_flush(committed_file, autocommit=False) assert not fo.autocommit assert not s3.exists(committed_file) fo.commit() assert s3.exists(committed_file) fo = write_and_flush(aborted_file, autocommit=False) assert not s3.exists(aborted_file) fo.discard() assert not s3.exists(aborted_file) # Cannot commit a file that was discarded with 
pytest.raises(Exception): fo.commit() def test_autocommit_mpu(s3): """When not autocommitting we always want to use multipart uploads""" path = test_bucket_name + "/auto_commit_with_mpu" with s3.open(path, "wb", autocommit=False) as fo: fo.write(b"1") assert fo.mpu is not None assert len(fo.parts) == 1 def test_touch(s3): # create fn = test_bucket_name + "/touched" assert not s3.exists(fn) s3.touch(fn) assert s3.exists(fn) assert s3.size(fn) == 0 # truncates with s3.open(fn, "wb") as f: f.write(b"data") assert s3.size(fn) == 4 s3.touch(fn, truncate=True) assert s3.size(fn) == 0 # exists error with s3.open(fn, "wb") as f: f.write(b"data") assert s3.size(fn) == 4 with pytest.raises(ValueError): s3.touch(fn, truncate=False) assert s3.size(fn) == 4 def test_touch_versions(s3): versioned_file = versioned_bucket_name + "/versioned_file" s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) with s3.open(versioned_file, "wb") as fo: fo.write(b"1") first_version = fo.version_id with s3.open(versioned_file, "wb") as fo: fo.write(b"") second_version = fo.version_id assert s3.isfile(versioned_file) versions = s3.object_version_info(versioned_file) assert len(versions) == 2 assert {version["VersionId"] for version in versions} == { first_version, second_version, } with s3.open(versioned_file) as fo: assert fo.version_id == second_version assert fo.read() == b"" with s3.open(versioned_file, version_id=first_version) as fo: assert fo.version_id == first_version assert fo.read() == b"1" def test_cat_missing(s3): fn0 = test_bucket_name + "/file0" fn1 = test_bucket_name + "/file1" s3.touch(fn0) with pytest.raises(FileNotFoundError): s3.cat([fn0, fn1], on_error="raise") out = s3.cat([fn0, fn1], on_error="omit") assert list(out) == [fn0] out = s3.cat([fn0, fn1], on_error="return") assert fn1 in out assert isinstance(out[fn1], FileNotFoundError) def test_get_directories(s3, tmpdir): s3.touch(test_bucket_name + "/dir/dirkey/key0") 
s3.touch(test_bucket_name + "/dir/dirkey/key1") s3.touch(test_bucket_name + "/dir/dirkey") s3.touch(test_bucket_name + "/dir/dir/key") d = str(tmpdir) # Target directory with trailing slash s3.get(test_bucket_name + "/dir/", d, recursive=True) assert {"dirkey", "dir"} == set(os.listdir(d)) assert ["key"] == os.listdir(os.path.join(d, "dir")) assert {"key0", "key1"} == set(os.listdir(os.path.join(d, "dirkey"))) local_fs = fsspec.filesystem("file") local_fs.rm(os.path.join(d, "dir"), recursive=True) local_fs.rm(os.path.join(d, "dirkey"), recursive=True) # Target directory without trailing slash s3.get(test_bucket_name + "/dir", d, recursive=True) assert ["dir"] == os.listdir(d) assert {"dirkey", "dir"} == set(os.listdir(os.path.join(d, "dir"))) assert {"key0", "key1"} == set(os.listdir(os.path.join(d, "dir", "dirkey"))) def test_seek_reads(s3): fn = test_bucket_name + "/myfile" with s3.open(fn, "wb") as f: f.write(b"a" * 175627146) with s3.open(fn, "rb", blocksize=100) as f: f.seek(175561610) d1 = f.read(65536) f.seek(4) size = 17562198 d2 = f.read(size) assert len(d2) == size f.seek(17562288) size = 17562187 d3 = f.read(size) assert len(d3) == size def test_connect_many(s3): from multiprocessing.pool import ThreadPool def task(i): S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}).ls("") return True pool = ThreadPool(processes=20) out = pool.map(task, range(40)) assert all(out) pool.close() pool.join() def test_requester_pays(s3): fn = test_bucket_name + "/myfile" s3 = S3FileSystem(requester_pays=True, client_kwargs={"endpoint_url": endpoint_uri}) assert s3.req_kw["RequestPayer"] == "requester" s3.touch(fn) with s3.open(fn, "rb") as f: assert f.req_kw["RequestPayer"] == "requester" def test_credentials(): s3 = S3FileSystem( key="foo", secret="foo", client_kwargs={"endpoint_url": endpoint_uri} ) assert s3.s3._request_signer._credentials.access_key == "foo" assert s3.s3._request_signer._credentials.secret_key == "foo" s3 = S3FileSystem( 
client_kwargs={ "aws_access_key_id": "bar", "aws_secret_access_key": "bar", "endpoint_url": endpoint_uri, } ) assert s3.s3._request_signer._credentials.access_key == "bar" assert s3.s3._request_signer._credentials.secret_key == "bar" s3 = S3FileSystem( key="foo", client_kwargs={"aws_secret_access_key": "bar", "endpoint_url": endpoint_uri}, ) assert s3.s3._request_signer._credentials.access_key == "foo" assert s3.s3._request_signer._credentials.secret_key == "bar" s3 = S3FileSystem( key="foobar", secret="foobar", client_kwargs={ "aws_access_key_id": "foobar", "aws_secret_access_key": "foobar", "endpoint_url": endpoint_uri, }, ) assert s3.s3._request_signer._credentials.access_key == "foobar" assert s3.s3._request_signer._credentials.secret_key == "foobar" with pytest.raises((TypeError, KeyError)): # should be TypeError: arg passed twice; but in moto can be KeyError S3FileSystem( key="foo", secret="foo", client_kwargs={ "aws_access_key_id": "bar", "aws_secret_access_key": "bar", "endpoint_url": endpoint_uri, }, ).s3 def test_modified(s3): dir_path = test_bucket_name + "/modified" file_path = dir_path + "/file" # Test file s3.touch(file_path) modified = s3.modified(path=file_path) assert isinstance(modified, datetime.datetime) assert modified.tzinfo is not None # Test directory with pytest.raises(IsADirectoryError): modified = s3.modified(path=dir_path) # Test bucket with pytest.raises(IsADirectoryError): s3.modified(path=test_bucket_name) def test_async_s3(s3): async def _(): s3 = S3FileSystem( anon=False, asynchronous=True, loop=asyncio.get_running_loop(), client_kwargs={"region_name": "eu-central-1", "endpoint_url": endpoint_uri}, ) fn = test_bucket_name + "/nested/file1" data = b"hello\n" # Is good with or without connect() await s3._cat_file(fn) session = await s3.set_session() # creates client assert await s3._cat_file(fn) == data assert await s3._cat_file(fn, start=0, end=3) == data[:3] # TODO: file IO is *not* async # with s3.open(fn, "rb") as f: # assert 
f.read() == data try: await session.close() except AttributeError: # bug in aiobotocore 1.4.1 await session._endpoint.http_session._session.close() asyncio.run(_()) def test_cat_ranges(s3): data = b"a string to select from" fn = test_bucket_name + "/parts" s3.pipe(fn, data) assert s3.cat_file(fn) == data assert s3.cat_file(fn, start=5) == data[5:] assert s3.cat_file(fn, end=5) == data[:5] assert s3.cat_file(fn, start=1, end=-1) == data[1:-1] assert s3.cat_file(fn, start=-5) == data[-5:] def test_async_s3_old(s3): async def _(): s3 = S3FileSystem( anon=False, asynchronous=True, loop=asyncio.get_running_loop(), client_kwargs={"region_name": "eu-central-1", "endpoint_url": endpoint_uri}, ) fn = test_bucket_name + "/nested/file1" data = b"hello\n" # Check old API session = await s3._connect() assert await s3._cat_file(fn, start=0, end=3) == data[:3] try: await session.close() except AttributeError: # bug in aiobotocore 1.4.1 await session._endpoint.http_session._session.close() asyncio.run(_()) def test_via_fsspec(s3): import fsspec s3.mkdir("mine") with fsspec.open( "s3://mine/oi", "wb", client_kwargs={"endpoint_url": endpoint_uri} ) as f: f.write(b"hello") with fsspec.open( "s3://mine/oi", "rb", client_kwargs={"endpoint_url": endpoint_uri} ) as f: assert f.read() == b"hello" @pytest.mark.parametrize( ["raw_url", "expected_url", "expected_version_aware"], [ ( "s3://arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg", "arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg", False, ), ( "s3://arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg?versionId=some_version_id", "arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg?versionId=some_version_id", True, ), ( "s3://xyz/abc/123.jpg", "xyz/abc/123.jpg", False, ), ( "s3://xyz/abc/123.jpg?versionId=some_version_id", "xyz/abc/123.jpg?versionId=some_version_id", True, ), ], ) def test_fsspec_url_to_fs_compatability( s3, raw_url, expected_url, expected_version_aware ): import fsspec fs, url = 
fsspec.url_to_fs(raw_url) assert isinstance(fs, type(s3)) assert fs.version_aware is expected_version_aware assert url == expected_url def test_repeat_exists(s3): fn = "s3://" + test_bucket_name + "/file1" s3.touch(fn) assert s3.exists(fn) assert s3.exists(fn) def test_with_xzarr(s3): da = pytest.importorskip("dask.array") xr = pytest.importorskip("xarray") name = "sample" nana = xr.DataArray(da.random.random((1024, 1024, 10, 9, 1))) s3_path = f"{test_bucket_name}/{name}" s3store = s3.get_mapper(s3_path) s3.ls("") nana.to_dataset().to_zarr(store=s3store, mode="w", consolidated=True, compute=True) def test_async_close(): async def _(): loop = asyncio.get_event_loop() s3 = S3FileSystem(anon=False, asynchronous=True, loop=loop) await s3._connect() fn = test_bucket_name + "/afile" async def async_wrapper(): coros = [ asyncio.ensure_future(s3._get_file(fn, "/nonexistent/a/b/c"), loop=loop) for _ in range(3) ] completed, pending = await asyncio.wait(coros) for future in completed: with pytest.raises(OSError): future.result() await asyncio.gather(*[async_wrapper() for __ in range(2)]) try: await s3._s3.close() except AttributeError: # bug in aiobotocore 1.4.1 await s3._s3._endpoint.http_session._session.close() asyncio.run(_()) def test_put_single(s3, tmpdir): fn = os.path.join(str(tmpdir), "dir") os.mkdir(fn) open(os.path.join(fn, "abc"), "w").write("text") # Put with trailing slash s3.put(fn + "/", test_bucket_name) # no-op, no files assert not s3.exists(test_bucket_name + "/abc") assert not s3.exists(test_bucket_name + "/dir") s3.put(fn + "/", test_bucket_name, recursive=True) assert s3.cat(test_bucket_name + "/abc") == b"text" # Put without trailing slash s3.put(fn, test_bucket_name, recursive=True) assert s3.cat(test_bucket_name + "/dir/abc") == b"text" def test_shallow_find(s3): """Test that find method respects maxdepth. Verify that the ``find`` method respects the ``maxdepth`` parameter. 
With ``maxdepth=1``, the results of ``find`` should be the same as those of ``ls``, without returning subdirectories. See also issue 378. """ ls_output = s3.ls(test_bucket_name) assert sorted(ls_output + [test_bucket_name]) == s3.find( test_bucket_name, maxdepth=1, withdirs=True ) assert ls_output == s3.glob(test_bucket_name + "/*") def test_multi_find(s3): s3.mkdir("bucket/test") s3.mkdir("bucket/test/sub") s3.write_text("bucket/test/file.txt", "some_text") s3.write_text("bucket/test/sub/file.txt", "some_text") out1 = s3.find("bucket", withdirs=True) out2 = s3.find("bucket", withdirs=True) assert ( out1 == out2 == [ "bucket/test", "bucket/test/file.txt", "bucket/test/sub", "bucket/test/sub/file.txt", ] ) out1 = s3.find("bucket", withdirs=False) out2 = s3.find("bucket", withdirs=False) assert out1 == out2 == ["bucket/test/file.txt", "bucket/test/sub/file.txt"] def test_version_sizes(s3): # protect against caching of incorrect version details s3 = S3FileSystem( anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri} ) import gzip path = f"s3://{versioned_bucket_name}/test.txt.gz" versions = [ s3.pipe_file(path, gzip.compress(text)) for text in ( b"good morning!", b"hello!", b"hi!", b"hello!", ) ] for version in versions: version_id = version["VersionId"] with s3.open(path, version_id=version_id) as f: with gzip.GzipFile(fileobj=f) as zfp: zfp.read() def test_find_no_side_effect(s3): infos1 = s3.find(test_bucket_name, maxdepth=1, withdirs=True, detail=True) s3.find(test_bucket_name, maxdepth=None, withdirs=True, detail=True) infos3 = s3.find(test_bucket_name, maxdepth=1, withdirs=True, detail=True) assert infos1.keys() == infos3.keys() def test_get_file_info_with_selector(s3): fs = s3 base_dir = "selector-dir/" file_a = "selector-dir/test_file_a" file_b = "selector-dir/test_file_b" dir_a = "selector-dir/test_dir_a" file_c = "selector-dir/test_dir_a/test_file_c" try: fs.mkdir(base_dir) with fs.open(file_a, mode="wb"): pass with fs.open(file_b, 
mode="wb"): pass fs.mkdir(dir_a) with fs.open(file_c, mode="wb"): pass infos = fs.find(base_dir, maxdepth=None, withdirs=True, detail=True) assert len(infos) == 4 # includes base_dir directory for info in infos.values(): if info["name"].endswith(file_a): assert info["type"] == "file" elif info["name"].endswith(file_b): assert info["type"] == "file" elif info["name"].endswith(file_c): assert info["type"] == "file" elif info["name"].rstrip("/").endswith(dir_a): assert info["type"] == "directory" finally: fs.rm(base_dir, recursive=True) @pytest.mark.xfail( condition=version.parse(moto.__version__) <= version.parse("1.3.16"), reason="Moto 1.3.16 is not supporting pre-conditions.", ) def test_raise_exception_when_file_has_changed_during_reading(s3): test_file_name = "file1" test_file = "s3://" + test_bucket_name + "/" + test_file_name content1 = b"123" content2 = b"ABCDEFG" boto3_client = get_boto3_client() def create_file(content: bytes): boto3_client.put_object( Bucket=test_bucket_name, Key=test_file_name, Body=content ) create_file(b"123") with s3.open(test_file, "rb") as f: content = f.read() assert content == content1 with s3.open(test_file, "rb") as f: create_file(content2) with expect_errno(errno.EBUSY): f.read() def test_s3fs_etag_preserving_multipart_copy(monkeypatch, s3): # Set this to a lower value so that we can actually # test this without creating giant objects in memory monkeypatch.setattr(s3fs.core, "MANAGED_COPY_THRESHOLD", 5 * 2**20) test_file1 = test_bucket_name + "/test/multipart-upload.txt" test_file2 = test_bucket_name + "/test/multipart-upload-copy.txt" with s3.open(test_file1, "wb", block_size=5 * 2**21) as stream: for _ in range(5): stream.write(b"b" * (stream.blocksize + random.randrange(200))) file_1 = s3.info(test_file1) s3.copy(test_file1, test_file2) file_2 = s3.info(test_file2) s3.rm(test_file2) # normal copy() uses a block size of 5GB assert file_1["ETag"] != file_2["ETag"] s3.copy(test_file1, test_file2, preserve_etag=True) file_2 = 
s3.info(test_file2) s3.rm(test_file2) # etag preserving copy() determines each part size for the destination # by checking out the matching part's size on the source assert file_1["ETag"] == file_2["ETag"] s3.rm(test_file1) def test_sync_from_wihin_async(s3): # if treating as sync but within an even loop, e.g., calling from jupyter; # IO happens on dedicated thread. async def f(): S3FileSystem.clear_instance_cache() s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}) assert s3.ls(test_bucket_name) asyncio.run(f()) def test_token_paths(s3): fs, tok, files = fsspec.get_fs_token_paths( "s3://" + test_bucket_name + "/*.csv", storage_options={"client_kwargs": {"endpoint_url": endpoint_uri}}, ) assert files def test_same_name_but_no_exact(s3): s3.touch(test_bucket_name + "/very/similar/prefix1") s3.touch(test_bucket_name + "/very/similar/prefix2") s3.touch(test_bucket_name + "/very/similar/prefix3/something") assert not s3.exists(test_bucket_name + "/very/similar/prefix") assert not s3.exists(test_bucket_name + "/very/similar/prefi") assert not s3.exists(test_bucket_name + "/very/similar/pref") assert s3.exists(test_bucket_name + "/very/similar/") assert s3.exists(test_bucket_name + "/very/similar/prefix1") assert s3.exists(test_bucket_name + "/very/similar/prefix2") assert s3.exists(test_bucket_name + "/very/similar/prefix3") assert s3.exists(test_bucket_name + "/very/similar/prefix3/") assert s3.exists(test_bucket_name + "/very/similar/prefix3/something") assert not s3.exists(test_bucket_name + "/very/similar/prefix3/some") s3.touch(test_bucket_name + "/starting/very/similar/prefix") assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix1") assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix2") assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix3") assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix3/") assert not s3.exists(test_bucket_name + 
"/starting/very/similar/prefix3/something") assert s3.exists(test_bucket_name + "/starting/very/similar/prefix") assert s3.exists(test_bucket_name + "/starting/very/similar/prefix/") def test_leading_forward_slash(s3): s3.touch(test_bucket_name + "/some/file") assert s3.ls(test_bucket_name + "/some/") assert s3.exists(test_bucket_name + "/some/file") assert s3.exists("s3://" + test_bucket_name + "/some/file") def test_lsdir(s3): # https://github.com/fsspec/s3fs/issues/475 s3.find(test_bucket_name) d = test_bucket_name + "/test" assert d in s3.ls(test_bucket_name) def test_rm_recursive_folder(s3): s3.touch(test_bucket_name + "/sub/file") s3.rm(test_bucket_name + "/sub", recursive=True) assert not s3.exists(test_bucket_name + "/sub/file") assert not s3.exists(test_bucket_name + "/sub") s3.touch(test_bucket_name + "/sub/file") s3.touch(test_bucket_name + "/sub/") # placeholder s3.rm(test_bucket_name + "/sub", recursive=True) assert not s3.exists(test_bucket_name + "/sub/file") assert not s3.exists(test_bucket_name + "/sub") s3.touch(test_bucket_name + "/sub/file") s3.rm(test_bucket_name, recursive=True) assert not s3.exists(test_bucket_name + "/sub/file") assert not s3.exists(test_bucket_name + "/sub") assert not s3.exists(test_bucket_name) def test_copy_file_without_etag(s3, monkeypatch): s3.touch(test_bucket_name + "/copy_tests/file") s3.ls(test_bucket_name + "/copy_tests/") [file] = s3.dircache[test_bucket_name + "/copy_tests"] assert file["name"] == test_bucket_name + "/copy_tests/file" file.pop("ETag") assert s3.info(file["name"]).get("ETag", None) is None s3.cp_file(file["name"], test_bucket_name + "/copy_tests/file2") assert s3.info(test_bucket_name + "/copy_tests/file2")["ETag"] is not None def test_find_with_prefix(s3): for cursor in range(100): s3.touch(test_bucket_name + f"/prefixes/test_{cursor}") s3.touch(test_bucket_name + "/prefixes2") assert len(s3.find(test_bucket_name + "/prefixes")) == 100 assert len(s3.find(test_bucket_name, prefix="prefixes")) == 
101 assert len(s3.find(test_bucket_name + "/prefixes", prefix="test2_")) == 0 assert len(s3.find(test_bucket_name + "/prefixes/test_")) == 0 assert len(s3.find(test_bucket_name + "/prefixes", prefix="test_")) == 100 assert len(s3.find(test_bucket_name + "/prefixes/", prefix="test_")) == 100 test_1s = s3.find(test_bucket_name + "/prefixes/test_1") assert len(test_1s) == 1 assert test_1s[0] == test_bucket_name + "/prefixes/test_1" test_1s = s3.find(test_bucket_name + "/prefixes/", prefix="test_1") assert len(test_1s) == 11 assert test_1s == [test_bucket_name + "/prefixes/test_1"] + [ test_bucket_name + f"/prefixes/test_{cursor}" for cursor in range(10, 20) ] assert s3.find(test_bucket_name + "/prefixes/") == s3.find( test_bucket_name + "/prefixes/", prefix=None ) def test_list_after_find(s3): before = s3.ls("s3://test") s3.invalidate_cache("s3://test/2014-01-01.csv") s3.find("s3://test/2014-01-01.csv") after = s3.ls("s3://test") assert before == after def test_upload_recursive_to_bucket(s3, tmpdir): # GH#491 folders = [os.path.join(tmpdir, d) for d in ["outer", "outer/inner"]] files = [os.path.join(tmpdir, f) for f in ["outer/afile", "outer/inner/bfile"]] for d in folders: os.mkdir(d) for f in files: open(f, "w").write("hello") s3.put(folders[0], "newbucket", recursive=True) def test_rm_file(s3): target = test_bucket_name + "/to_be_removed/file" s3.touch(target) s3.rm_file(target) assert not s3.exists(target) assert not s3.exists(test_bucket_name + "/to_be_removed") def test_exists_isdir(s3): bad_path = "s3://nyc-tlc-asdfasdf/trip data/" assert not s3.exists(bad_path) assert not s3.isdir(bad_path) def test_list_del_multipart(s3): path = test_bucket_name + "/afile" f = s3.open(path, "wb") f.write(b"0" * 6 * 2**20) out = s3.list_multipart_uploads(test_bucket_name) assert [_ for _ in out if _["Key"] == "afile"] s3.clear_multipart_uploads(test_bucket_name) out = s3.list_multipart_uploads(test_bucket_name) assert not [_ for _ in out if _["Key"] == "afile"] try: f.close() 
# may error except Exception: pass def test_split_path(s3): buckets = [ "my-test-bucket", "arn:aws:s3:region:123456789012:accesspoint/my-access-point-name", "arn:aws:s3-outposts:region:123456789012:outpost/outpost-id/bucket/my-test-bucket", "arn:aws:s3-outposts:region:123456789012:outpost/outpost-id/accesspoint/my-accesspoint-name", "arn:aws:s3-object-lambda:region:123456789012:accesspoint/my-lambda-object-name", ] test_key = "my/test/path" for test_bucket in buckets: bucket, key, _ = s3.split_path("s3://" + test_bucket + "/" + test_key) assert bucket == test_bucket assert key == test_key def test_cp_directory_recursive(s3): src = test_bucket_name + "/src" src_file = src + "/file" s3.mkdir(src) s3.touch(src_file) target = test_bucket_name + "/target" # cp without slash assert not s3.exists(target) for loop in range(2): s3.cp(src, target, recursive=True) assert s3.isdir(target) if loop == 0: correct = [target + "/file"] assert s3.find(target) == correct else: correct = [target + "/file", target + "/src/file"] assert sorted(s3.find(target)) == correct s3.rm(target, recursive=True) # cp with slash assert not s3.exists(target) for loop in range(2): s3.cp(src + "/", target, recursive=True) assert s3.isdir(target) correct = [target + "/file"] assert s3.find(target) == correct def test_get_directory_recursive(s3, tmpdir): src = test_bucket_name + "/src" src_file = src + "/file" s3.mkdir(src) s3.touch(src_file) target = os.path.join(tmpdir, "target") target_fs = fsspec.filesystem("file") # get without slash assert not target_fs.exists(target) for loop in range(2): s3.get(src, target, recursive=True) assert target_fs.isdir(target) if loop == 0: assert target_fs.find(target) == [os.path.join(target, "file")] else: assert sorted(target_fs.find(target)) == [ os.path.join(target, "file"), os.path.join(target, "src", "file"), ] target_fs.rm(target, recursive=True) # get with slash assert not target_fs.exists(target) for loop in range(2): s3.get(src + "/", target, recursive=True) 
assert target_fs.isdir(target) assert target_fs.find(target) == [os.path.join(target, "file")] def test_put_directory_recursive(s3, tmpdir): src = os.path.join(tmpdir, "src") src_file = os.path.join(src, "file") source_fs = fsspec.filesystem("file") source_fs.mkdir(src) source_fs.touch(src_file) target = test_bucket_name + "/target" # put without slash assert not s3.exists(target) for loop in range(2): s3.put(src, target, recursive=True) assert s3.isdir(target) if loop == 0: assert s3.find(target) == [target + "/file"] else: assert sorted(s3.find(target)) == [target + "/file", target + "/src/file"] s3.rm(target, recursive=True) # put with slash assert not s3.exists(target) for loop in range(2): s3.put(src + "/", target, recursive=True) assert s3.isdir(target) assert s3.find(target) == [target + "/file"] def test_cp_two_files(s3): src = test_bucket_name + "/src" file0 = src + "/file0" file1 = src + "/file1" s3.mkdir(src) s3.touch(file0) s3.touch(file1) target = test_bucket_name + "/target" assert not s3.exists(target) s3.cp([file0, file1], target) assert s3.isdir(target) assert sorted(s3.find(target)) == [ target + "/file0", target + "/file1", ] def test_async_stream(s3_base): fn = test_bucket_name + "/target" data = b"hello world" * 1000 out = [] async def read_stream(): fs = S3FileSystem( anon=False, client_kwargs={"endpoint_url": endpoint_uri}, skip_instance_cache=True, ) await fs._mkdir(test_bucket_name) await fs._pipe(fn, data) f = await fs.open_async(fn, mode="rb", block_size=1000) while True: got = await f.read(1000) assert f.size == len(data) assert f.tell() if not got: break out.append(got) asyncio.run(read_stream()) assert b"".join(out) == data def test_rm_invalidates_cache(s3): # Issue 761: rm_file does not invalidate cache fn = test_bucket_name + "/2014-01-01.csv" assert s3.exists(fn) assert fn in s3.ls(test_bucket_name) s3.rm(fn) assert not s3.exists(fn) assert fn not in s3.ls(test_bucket_name) fn = test_bucket_name + "/2014-01-02.csv" assert 
s3.exists(fn) assert fn in s3.ls(test_bucket_name) s3.rm_file(fn) assert not s3.exists(fn) assert fn not in s3.ls(test_bucket_name) def test_cache_handles_find_with_maxdepth(s3): # Issue 773: invalidate_cache should not be needed when find is called with different maxdepth base_name = test_bucket_name + "/main" dir = base_name + "/dir1/fileB" file = base_name + "/fileA" s3.touch(dir) s3.touch(file) # Find with maxdepth=None f = s3.find(base_name, maxdepth=None, withdirs=False) assert base_name + "/fileA" in f assert base_name + "/dir1" not in f assert base_name + "/dir1/fileB" in f # Find with maxdepth=1. # Performed twice with cache invalidated between them which should give same result for _ in range(2): f = s3.find(base_name, maxdepth=1, withdirs=True) assert base_name + "/fileA" in f assert base_name + "/dir1" in f assert base_name + "/dir1/fileB" not in f s3.invalidate_cache() def test_bucket_versioning(s3): s3.mkdir("maybe_versioned") assert not s3.is_bucket_versioned("maybe_versioned") s3.make_bucket_versioned("maybe_versioned") assert s3.is_bucket_versioned("maybe_versioned") s3.make_bucket_versioned("maybe_versioned", False) assert not s3.is_bucket_versioned("maybe_versioned") @pytest.fixture() def s3_fixed_upload_size(s3): s3_fixed = S3FileSystem( anon=False, client_kwargs={"endpoint_url": endpoint_uri}, fixed_upload_size=True, ) s3_fixed.invalidate_cache() yield s3_fixed def test_upload_parts(s3_fixed_upload_size): with s3_fixed_upload_size.open(a, "wb", block_size=6_000_000) as f: f.write(b" " * 6_001_000) assert len(f.buffer.getbuffer()) == 1000 # check we are at the right position assert f.tell() == 6_001_000 # offset is introduced in fsspec.core, but never used. 
# apparently it should keep offset for part that is already uploaded assert f.offset == 6_000_000 f.write(b" " * 6_001_000) assert len(f.buffer.getbuffer()) == 2000 assert f.tell() == 2 * 6_001_000 assert f.offset == 2 * 6_000_000 with s3_fixed_upload_size.open(a, "r") as f: assert len(f.read()) == 6_001_000 * 2 def test_upload_part_with_prime_pads(s3_fixed_upload_size): block = 6_000_000 pad1, pad2 = 1013, 1019 # prime pad sizes to exclude divisibility with s3_fixed_upload_size.open(a, "wb", block_size=block) as f: f.write(b" " * (block + pad1)) assert len(f.buffer.getbuffer()) == pad1 # check we are at the right position assert f.tell() == block + pad1 assert f.offset == block f.write(b" " * (block + pad2)) assert len(f.buffer.getbuffer()) == pad1 + pad2 assert f.tell() == 2 * block + pad1 + pad2 assert f.offset == 2 * block with s3_fixed_upload_size.open(a, "r") as f: assert len(f.read()) == 2 * block + pad1 + pad2 @pytest.mark.asyncio async def test_invalidate_cache(s3: s3fs.S3FileSystem) -> None: await s3._call_s3("put_object", Bucket=test_bucket_name, Key="a/b.txt", Body=b"abc") before = await s3._ls(f"{test_bucket_name}/a/") assert sorted(before) == ["test/a/b.txt"] await s3._pipe_file(f"{test_bucket_name}/a/c.txt", data=b"abc") after = await s3._ls(f"{test_bucket_name}/a/") assert sorted(after) == ["test/a/b.txt", "test/a/c.txt"] def test_exist_after_delete(s3): test_dir = f"{test_bucket_name}/test/checkpoint_dir" s3.touch(f"{test_dir}/file.txt") assert s3.exists(test_dir) s3.rm(test_dir, recursive=True) assert not s3.exists(test_dir) # condition: True if running on botocore < 1.36.0 # The below tests for exclusive writes will fail on older versions of botocore. 
old_botocore = version.parse(botocore.__version__) < version.parse("1.36.0") @pytest.mark.xfail( reason="moto doesn't support IfNoneMatch for MPU when object created via MPU" ) def test_pipe_exclusive_big(s3): chunksize = 5 * 2**20 # minimum allowed data = b"x" * chunksize * 3 s3.pipe(f"{test_bucket_name}/afile", data, mode="overwrite", chunksize=chunksize) s3.pipe(f"{test_bucket_name}/afile", data, mode="overwrite", chunksize=chunksize) with pytest.raises(FileExistsError): s3.pipe(f"{test_bucket_name}/afile", data, mode="create", chunksize=chunksize) assert not s3.list_multipart_uploads(test_bucket_name) @pytest.mark.xfail( old_botocore, reason="botocore<1.33.0 lacks IfNoneMatch support", strict=True ) def test_pipe_exclusive_big_after_small(s3): """Test conditional MPU after creating object via put_object This test is required because moto's implementation of IfNoneMatch for MPU only works when the object is initially created via put_object and not via MPU. """ chunksize = 5 * 2**20 # minimum allowed # First, create object via put_object (small upload) s3.pipe(f"{test_bucket_name}/afile", b"small", mode="overwrite") # Now try multipart upload with mode="create" (should fail) with pytest.raises(FileExistsError): s3.pipe( f"{test_bucket_name}/afile", b"c" * chunksize * 3, mode="create", chunksize=chunksize, ) assert not s3.list_multipart_uploads(test_bucket_name) @pytest.mark.xfail( reason="moto doesn't support IfNoneMatch for MPU when object created via MPU" ) def test_put_exclusive_big(s3, tmpdir): chunksize = 5 * 2**20 # minimum allowed fn = f"{tmpdir}/afile" with open(fn, "wb") as f: f.write(b"x" * chunksize * 3) s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite", chunksize=chunksize) s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite", chunksize=chunksize) with pytest.raises(FileExistsError): s3.put(fn, f"{test_bucket_name}/afile", mode="create", chunksize=chunksize) assert not s3.list_multipart_uploads(test_bucket_name) @pytest.mark.xfail( 
old_botocore, reason="botocore<1.33.0 lacks IfNoneMatch support", strict=True ) def test_put_exclusive_big_after_small(s3, tmpdir): """Test conditional MPU after creating object via put_object. This test is required because moto's implementation of IfNoneMatch for MPU only works when the object is initially created via put_object and not via MPU. """ chunksize = 5 * 2**20 # minimum allowed fn = str(tmpdir.join("afile")) with open(fn, "wb") as f: f.write(b"x" * chunksize * 3) # First, create object via put_object (small upload) s3.pipe(f"{test_bucket_name}/afile", b"small", mode="overwrite") # Now try multipart upload with mode="create" (should fail) with pytest.raises(FileExistsError): s3.put(fn, f"{test_bucket_name}/afile", mode="create", chunksize=chunksize) assert not s3.list_multipart_uploads(test_bucket_name) @pytest.mark.xfail( old_botocore, reason="botocore<1.33.0 lacks IfNoneMatch support", strict=True ) def test_put_exclusive_small(s3, tmpdir): fn = f"{tmpdir}/afile" with open(fn, "wb") as f: f.write(b"x") s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite") s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite") with pytest.raises(FileExistsError): s3.put(fn, f"{test_bucket_name}/afile", mode="create") assert not s3.list_multipart_uploads(test_bucket_name) def test_bucket_info(s3): info = s3.info(test_bucket_name) assert "VersionId" in info assert info["type"] == "directory" assert info["name"] == test_bucket_name MB = 2**20 GB = 2**30 TB = 2**40 @pytest.mark.parametrize( ["filesize", "chunksize", "expected"], [ # small file, use default chunksize (1000, None, 50 * MB), # exact boundary, use default chunksize (50 * MB * MAX_UPLOAD_PARTS, None, 50 * MB), # file requiring increased chunksize (50 * MB * (MAX_UPLOAD_PARTS + 1), None, 52_434_043), # very large files, expect increased chunksize (1 * TB, None, 109_951_163), (5 * TB, None, 549_755_814), # respect explicit chunksize (5 * GB, 10 * MB, 10 * MB), ], ) def test_calculate_chunksize(filesize, 
chunksize, expected): assert calculate_chunksize(filesize, chunksize) == expected def test_find_ls_fail(s3): # beacuse of https://github.com/fsspec/s3fs/pull/989 client = get_boto3_client() files = { f"{test_bucket_name}/find/a/a": b"data", f"{test_bucket_name}/find/a/b": b"data", f"{test_bucket_name}/find/a": b"", # duplicate of dir, without "/" f"{test_bucket_name}/find/b": b"", # empty file without "/" and no children f"{test_bucket_name}/find/c/c": b"data", # directory with no placeholder f"{test_bucket_name}/find/d/d": b"data", # dir will acquire placeholder with "/" } client.put_object(Bucket=test_bucket_name, Key="find/d/", Body=b"") client.put_object( Bucket=test_bucket_name, Key="find/e/", Body=b"" ) # placeholder only s3.pipe(files) out0 = s3.ls(f"{test_bucket_name}/find", detail=True) s3.find(test_bucket_name, detail=False) out = s3.ls(f"{test_bucket_name}/find", detail=True) assert out == out0 s3.invalidate_cache() s3.find(f"{test_bucket_name}/find", detail=False) out = s3.ls(f"{test_bucket_name}/find", detail=True) assert out == out0 def test_find_missing_ls(s3): # https://github.com/fsspec/s3fs/issues/988#issuecomment-3436727753 BUCKET = test_bucket_name BASE_PREFIX = "disappearing-folders/" BASE = f"s3://{BUCKET}/{BASE_PREFIX}" s3_with_cache = S3FileSystem( anon=False, use_listings_cache=True, client_kwargs={"endpoint_url": endpoint_uri}, ) s3_no_cache = S3FileSystem( anon=False, use_listings_cache=False, client_kwargs={"endpoint_url": endpoint_uri}, ) s3_with_cache.pipe({f"{BASE}folder/foo/1.txt": b"", f"{BASE}bar.txt": b""}) s3_with_cache.find(BASE) listed_cached = s3_with_cache.ls(BASE, detail=False) listed_no_cache = s3_no_cache.ls(BASE, detail=False) assert set(listed_cached) == set(listed_no_cache) def test_session_close(): async def run_program(run): s3 = s3fs.S3FileSystem(anon=True, asynchronous=True) session = await s3.set_session() files = await s3._ls( "s3://noaa-hrrr-bdp-pds/hrrr.20140730/conus/" ) # Random open data store print(f"Number 
of files {len(files)}") await session.close() import aiobotocore.httpsession aiobotocore.httpsession.AIOHTTPSession asyncio.run(run_program(True)) asyncio.run(run_program(False)) def test_rm_recursive_prfix(s3): prefix = "logs/" # must end with "/" # Create empty "directory" in S3 client = get_boto3_client() client.put_object(Bucket=test_bucket_name, Key=prefix, Body=b"") logs_path = f"s3://{test_bucket_name}/{prefix}" s3.rm(logs_path, recursive=True) assert not s3.isdir(logs_path) s3fs-2026.2.0/s3fs/tests/test_utils.py000066400000000000000000000005621514121105500174100ustar00rootroot00000000000000import s3fs.utils as utils def test_get_brange(): assert list(utils._get_brange(100, 24)) == [ (0, 23), (24, 47), (48, 71), (72, 95), (96, 99), ] assert list(utils._get_brange(100, 25)) == [(0, 24), (25, 49), (50, 74), (75, 99)] assert list(utils._get_brange(100, 26)) == [(0, 25), (26, 51), (52, 77), (78, 99)] s3fs-2026.2.0/s3fs/utils.py000066400000000000000000000121311514121105500152020ustar00rootroot00000000000000import errno import logging from contextlib import contextmanager, AsyncExitStack from botocore.exceptions import ClientError logger = logging.getLogger("s3fs") @contextmanager def ignoring(*exceptions): try: yield except exceptions: pass class S3BucketRegionCache: # See https://github.com/aio-libs/aiobotocore/issues/866 # for details. 
def __init__(self, session, **client_kwargs): self._session = session self._stack = AsyncExitStack() self._client = None self._client_kwargs = client_kwargs self._buckets = {} self._regions = {} async def get_bucket_client(self, bucket_name=None): if bucket_name in self._buckets: return self._buckets[bucket_name] general_client = await self.get_client() if bucket_name is None: return general_client try: response = await general_client.head_bucket(Bucket=bucket_name) except ClientError as e: logger.debug("RC: HEAD_BUCKET call for %r has failed", bucket_name) response = e.response region = ( response["ResponseMetadata"] .get("HTTPHeaders", {}) .get("x-amz-bucket-region") ) if not region: logger.debug( "RC: No region in HEAD_BUCKET call response for %r, returning the general client", bucket_name, ) return general_client if region not in self._regions: logger.debug( "RC: Creating a new regional client for %r on the region %r", bucket_name, region, ) self._regions[region] = await self._stack.enter_async_context( self._session.create_client( "s3", region_name=region, **self._client_kwargs ) ) client = self._buckets[bucket_name] = self._regions[region] return client async def get_client(self): if not self._client: self._client = await self._stack.enter_async_context( self._session.create_client("s3", **self._client_kwargs) ) return self._client async def clear(self): logger.debug("RC: discarding all clients") self._buckets.clear() self._regions.clear() self._client = None await self._stack.aclose() async def __aenter__(self): return self async def __aexit__(self, *exc_args): await self.clear() class FileExpired(IOError): """ Is raised, when the file content has been changed from a different process after opening the file. Reading the file would lead to invalid or inconsistent output. This can also be triggered by outdated file-information inside the directory cache. 
In this case ``S3FileSystem.invalidate_cache`` can be used to force an update of the file-information when opening the file. """ def __init__(self, filename: str, e_tag: str): super().__init__( errno.EBUSY, "The remote file corresponding to filename %s and Etag %s no longer exists." % (filename, e_tag), ) def title_case(string): """ TitleCases a given string. Parameters ---------- string : underscore separated string """ return "".join(x.capitalize() for x in string.split("_")) class ParamKwargsHelper: """ Utility class to help extract the subset of keys that an s3 method is actually using Parameters ---------- s3 : boto S3FileSystem """ _kwarg_cache = {} def __init__(self, s3): self.s3 = s3 def _get_valid_keys(self, model_name): if model_name not in self._kwarg_cache: model = self.s3.meta.service_model.operation_model(model_name) valid_keys = ( set(model.input_shape.members.keys()) if model.input_shape is not None else set() ) self._kwarg_cache[model_name] = valid_keys return self._kwarg_cache[model_name] def filter_dict(self, method_name, d): model_name = title_case(method_name) valid_keys = self._get_valid_keys(model_name) if isinstance(d, SSEParams): d = d.to_kwargs() return {k: v for k, v in d.items() if k in valid_keys} class SSEParams: def __init__( self, server_side_encryption=None, sse_customer_algorithm=None, sse_customer_key=None, sse_kms_key_id=None, ): self.ServerSideEncryption = server_side_encryption self.SSECustomerAlgorithm = sse_customer_algorithm self.SSECustomerKey = sse_customer_key self.SSEKMSKeyId = sse_kms_key_id def to_kwargs(self): return {k: v for k, v in self.__dict__.items() if v is not None} def _get_brange(size, block): """ Chunk up a file into zero-based byte ranges Parameters ---------- size : file size block : block size """ for offset in range(0, size, block): yield offset, min(offset + block - 1, size - 1) s3fs-2026.2.0/setup.cfg000066400000000000000000000023601514121105500144360ustar00rootroot00000000000000[metadata] 
long_description: file: README.rst [versioneer] VCS = git style = pep440 versionfile_source = s3fs/_version.py versionfile_build = s3fs/_version.py tag_prefix = "" [flake8] exclude = __init__.py,versioneer.py,s3fs/tests/ max-line-length = 95 ignore = # Extra space in brackets E20, # Multiple spaces around "," E231,E241, # Comments E26, # Import formatting E4, # Comparing types instead of isinstance E721, # Assigning lambda expression E731, # continuation line under-indented for hanging indent E121, # continuation line over-indented for hanging indent E126, # continuation line over-indented for visual indent E127, # E128 continuation line under-indented for visual indent E128, # multiple statements on one line (semicolon) E702, # line break before binary operator W503, # visually indented line with same indent as next logical line E129, # unexpected indentation E116, # redefinition of unused 'loop' from line 10 F811, # local variable is assigned to but never used F841, # Ambiguous variable names E741 # line break after binary operator W504, # line too long (leave it to black!) 
E501, s3fs-2026.2.0/setup.py000077500000000000000000000021001514121105500143220ustar00rootroot00000000000000#!/usr/bin/env python from setuptools import setup import versioneer setup( name="s3fs", version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", ], description="Convenient Filesystem interface over S3", url="http://github.com/fsspec/s3fs/", maintainer="Martin Durant", maintainer_email="mdurant@continuum.io", license="BSD", keywords="s3, boto", packages=["s3fs"], python_requires=">= 3.10", install_requires=[open("requirements.txt").read().strip().split("\n")], long_description="README.md", long_description_content_type="text/markdown", zip_safe=False, ) s3fs-2026.2.0/test_requirements.txt000066400000000000000000000001171514121105500171360ustar00rootroot00000000000000mock; python_version < '3.3' moto>=4 flask flask_cors pytest>=4.2.0 pytest-env s3fs-2026.2.0/versioneer.py000066400000000000000000002515061514121105500153600ustar00rootroot00000000000000# Version: 0.29 """The Versioneer - like a rocketeer, but for versions. The Versioneer ============== * like a rocketeer, but for versions! * https://github.com/python-versioneer/python-versioneer * Brian Warner * License: Public Domain (Unlicense) * Compatible with: Python 3.7, 3.8, 3.9, 3.10, 3.11 and pypy3 * [![Latest Version][pypi-image]][pypi-url] * [![Build Status][travis-image]][travis-url] This is a tool for managing a recorded version number in setuptools-based python projects. The goal is to remove the tedious and error-prone "update the embedded version string" step from your release process. 
Making a new release should be as easy as recording a new tag in your version-control system, and maybe making new tarballs. ## Quick Install Versioneer provides two installation modes. The "classic" vendored mode installs a copy of versioneer into your repository. The experimental build-time dependency mode is intended to allow you to skip this step and simplify the process of upgrading. ### Vendored mode * `pip install versioneer` to somewhere in your $PATH * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is available, so you can also use `conda install -c conda-forge versioneer` * add a `[tool.versioneer]` section to your `pyproject.toml` or a `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) * Note that you will need to add `tomli; python_version < "3.11"` to your build-time dependencies if you use `pyproject.toml` * run `versioneer install --vendor` in your source tree, commit the results * verify version information with `python setup.py version` ### Build-time dependency mode * `pip install versioneer` to somewhere in your $PATH * A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is available, so you can also use `conda install -c conda-forge versioneer` * add a `[tool.versioneer]` section to your `pyproject.toml` or a `[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md)) * add `versioneer` (with `[toml]` extra, if configuring in `pyproject.toml`) to the `requires` key of the `build-system` table in `pyproject.toml`: ```toml [build-system] requires = ["setuptools", "versioneer[toml]"] build-backend = "setuptools.build_meta" ``` * run `versioneer install --no-vendor` in your source tree, commit the results * verify version information with `python setup.py version` ## Version Identifiers Source trees come from a variety of places: * a version-control system checkout (mostly used by developers) * a nightly tarball, produced by build automation * a snapshot tarball, 
produced by a web-based VCS browser, like github's "tarball from tag" feature * a release tarball, produced by "setup.py sdist", distributed through PyPI Within each source tree, the version identifier (either a string or a number, this tool is format-agnostic) can come from a variety of places: * ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows about recent "tags" and an absolute revision-id * the name of the directory into which the tarball was unpacked * an expanded VCS keyword ($Id$, etc) * a `_version.py` created by some earlier build step For released software, the version identifier is closely related to a VCS tag. Some projects use tag names that include more than just the version string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool needs to strip the tag prefix to extract the version identifier. For unreleased software (between tags), the version identifier should provide enough information to help developers recreate the same tree, while also giving them an idea of roughly how old the tree is (after version 1.2, before version 1.3). Many VCS systems can report a description that captures this, for example `git describe --tags --dirty --always` reports things like "0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the 0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has uncommitted changes). The version identifier is used for multiple purposes: * to allow the module to self-identify its version: `myproject.__version__` * to choose a name and prefix for a 'setup.py sdist' tarball ## Theory of Operation Versioneer works by adding a special `_version.py` file into your source tree, where your `__init__.py` can import it. This `_version.py` knows how to dynamically ask the VCS tool for version information at import time. 
`_version.py` also contains `$Revision$` markers, and the installation process marks `_version.py` to have this marker rewritten with a tag name during the `git archive` command. As a result, generated tarballs will contain enough information to get the proper version. To allow `setup.py` to compute a version too, a `versioneer.py` is added to the top level of your source tree, next to `setup.py` and the `setup.cfg` that configures it. This overrides several distutils/setuptools commands to compute the version when invoked, and changes `setup.py build` and `setup.py sdist` to replace `_version.py` with a small static file that contains just the generated version data. ## Installation See [INSTALL.md](./INSTALL.md) for detailed installation instructions. ## Version-String Flavors Code which uses Versioneer can learn about its version string at runtime by importing `_version` from your main `__init__.py` file and running the `get_versions()` function. From the "outside" (e.g. in `setup.py`), you can import the top-level `versioneer.py` and run `get_versions()`. Both functions return a dictionary with different flavors of version information: * `['version']`: A condensed version string, rendered using the selected style. This is the most commonly used value for the project's version string. The default "pep440" style yields strings like `0.11`, `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section below for alternative styles. * `['full-revisionid']`: detailed revision identifier. For Git, this is the full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". * `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the commit date in ISO 8601 format. This will be None if the date is not available. * `['dirty']`: a boolean, True if the tree has uncommitted changes. 
Note that this is only accurate if run in a VCS checkout, otherwise it is likely to be False or None * `['error']`: if the version string could not be computed, this will be set to a string describing the problem, otherwise it will be None. It may be useful to throw an exception in setup.py if this is set, to avoid e.g. creating tarballs with a version string of "unknown". Some variants are more useful than others. Including `full-revisionid` in a bug report should allow developers to reconstruct the exact code being tested (or indicate the presence of local changes that should be shared with the developers). `version` is suitable for display in an "about" box or a CLI `--version` output: it can be easily compared against release notes and lists of bugs fixed in various releases. The installer adds the following text to your `__init__.py` to place a basic version in `YOURPROJECT.__version__`: from ._version import get_versions __version__ = get_versions()['version'] del get_versions ## Styles The setup.cfg `style=` configuration controls how the VCS information is rendered into a version string. The default style, "pep440", produces a PEP440-compliant string, equal to the un-prefixed tag name for actual releases, and containing an additional "local version" section with more detail for in-between builds. For Git, this is TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags --dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and that this commit is two revisions ("+2") beyond the "0.11" tag. For released software (exactly equal to a known tag), the identifier will only contain the stripped tag, e.g. "0.11". Other styles are available. See [details.md](details.md) in the Versioneer source tree for descriptions. ## Debugging Versioneer tries to avoid fatal errors: if something goes wrong, it will tend to return a version of "0+unknown". 
To investigate the problem, run `setup.py version`, which will run the version-lookup code in a verbose mode, and will display the full contents of `get_versions()` (including the `error` string, which may help identify what went wrong). ## Known Limitations Some situations are known to cause problems for Versioneer. This details the most significant ones. More can be found on Github [issues page](https://github.com/python-versioneer/python-versioneer/issues). ### Subprojects Versioneer has limited support for source trees in which `setup.py` is not in the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are two common reasons why `setup.py` might not be in the root: * Source trees which contain multiple subprojects, such as [Buildbot](https://github.com/buildbot/buildbot), which contains both "master" and "slave" subprojects, each with their own `setup.py`, `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI distributions (and upload multiple independently-installable tarballs). * Source trees whose main purpose is to contain a C library, but which also provide bindings to Python (and perhaps other languages) in subdirectories. Versioneer will look for `.git` in parent directories, and most operations should get the right version string. However `pip` and `setuptools` have bugs and implementation details which frequently cause `pip install .` from a subproject directory to fail to find a correct version string (so it usually defaults to `0+unknown`). `pip install --editable .` should work correctly. `setup.py install` might work too. Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in some later version. [Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking this issue. The discussion in [PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the issue from the Versioneer side in more detail. 
[pip PR#3176](https://github.com/pypa/pip/pull/3176) and [pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve pip to let Versioneer work correctly. Versioneer-0.16 and earlier only looked for a `.git` directory next to the `setup.cfg`, so subprojects were completely unsupported with those releases. ### Editable installs with setuptools <= 18.5 `setup.py develop` and `pip install --editable .` allow you to install a project into a virtualenv once, then continue editing the source code (and test) without re-installing after every change. "Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a convenient way to specify executable scripts that should be installed along with the python package. These both work as expected when using modern setuptools. When using setuptools-18.5 or earlier, however, certain operations will cause `pkg_resources.DistributionNotFound` errors when running the entrypoint script, which must be resolved by re-installing the package. This happens when the install happens with one version, then the egg_info data is regenerated while a different version is checked out. Many setup.py commands cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into a different virtualenv), so this can be surprising. [Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes this one, but upgrading to a newer version of setuptools should probably resolve it. ## Updating Versioneer To upgrade your project to a new release of Versioneer, do the following: * install the new Versioneer (`pip install -U versioneer` or equivalent) * edit `setup.cfg` and `pyproject.toml`, if necessary, to include any new configuration settings indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. 
* re-run `versioneer install --[no-]vendor` in your source tree, to replace `SRC/_version.py` * commit any changed files ## Future Directions This tool is designed to make it easily extended to other version-control systems: all VCS-specific components are in separate directories like src/git/ . The top-level `versioneer.py` script is assembled from these components by running make-versioneer.py . In the future, make-versioneer.py will take a VCS name as an argument, and will construct a version of `versioneer.py` that is specific to the given VCS. It might also take the configuration arguments that are currently provided manually during installation by editing setup.py . Alternatively, it might go the other direction and include code from all supported VCS systems, reducing the number of intermediate scripts. ## Similar projects * [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time dependency * [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of versioneer * [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools plugin ## License To make Versioneer easier to embed, all its code is dedicated to the public domain. The `_version.py` that it creates is also in the public domain. Specifically, both are released under the "Unlicense", as described in https://unlicense.org/. 
def get_root() -> str:
    """Get the project root directory.

    We require that all commands are run from the project root, i.e. the
    directory that contains setup.py, setup.cfg, and versioneer.py .

    Returns:
        The absolute (real) path of the project root.

    Raises:
        VersioneerBadRootError: if neither the current working directory nor
            the directory containing ``sys.argv[0]`` contains any of the
            project marker files (setup.py, pyproject.toml, versioneer.py).
    """

    def _looks_like_root(candidate: str) -> bool:
        # A project root is recognized by the presence of any marker file.
        return any(
            os.path.exists(os.path.join(candidate, marker))
            for marker in ("setup.py", "pyproject.toml", "versioneer.py")
        )

    root = os.path.realpath(os.path.abspath(os.getcwd()))
    if not _looks_like_root(root):
        # allow 'python path/to/setup.py COMMAND'
        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
        if not _looks_like_root(root):
            # NOTE: message fixed from the ungrammatical "unable to run the
            # project root directory" to "unable to locate ...".
            err = (
                "Versioneer was unable to locate the project root directory. "
                "Versioneer requires setup.py to be executed from "
                "its immediate directory (like 'python setup.py COMMAND'), "
                "or in a way that lets it use sys.argv[0] to find the root "
                "(like 'python path/to/setup.py COMMAND')."
            )
            raise VersioneerBadRootError(err)
    try:
        # Certain runtime workflows (setup.py install/develop in a setuptools
        # tree) execute all dependencies in a single python process, so
        # "versioneer" may be imported multiple times, and python's shared
        # module-import table will cache the first one. So we can't use
        # os.path.dirname(__file__), as that will find whichever
        # versioneer.py was first imported, even in later projects.
        versioneer_py = os.path.join(root, "versioneer.py")
        my_path = os.path.realpath(os.path.abspath(__file__))
        me_dir = os.path.normcase(os.path.splitext(my_path)[0])
        vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
        if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals():
            print(
                "Warning: build in %s is using versioneer.py from %s"
                % (os.path.dirname(my_path), versioneer_py)
            )
    except NameError:
        # No __file__ (e.g. frozen interpreter): skip the sanity warning.
        pass
    return root
def get_config_from_root(root: str) -> VersioneerConfig:
    """Read the project setup.cfg file to determine Versioneer config."""
    # This might raise OSError (if setup.cfg is missing), or
    # configparser.NoSectionError (if it lacks a [versioneer] section), or
    # configparser.NoOptionError (if it lacks "VCS="). See the docstring at
    # the top of versioneer.py for instructions on writing your setup.cfg .
    project_dir = Path(root)
    pyproject_toml = project_dir / "pyproject.toml"
    setup_cfg = project_dir / "setup.cfg"
    section: Union[Dict[str, Any], configparser.SectionProxy, None] = None

    # Prefer [tool.versioneer] from pyproject.toml when a TOML parser is
    # available; fall back to setup.cfg on any parse or lookup failure.
    if pyproject_toml.exists() and have_tomllib:
        try:
            with open(pyproject_toml, "rb") as stream:
                section = tomllib.load(stream)["tool"]["versioneer"]
        except (tomllib.TOMLDecodeError, KeyError) as e:
            print(f"Failed to load config from {pyproject_toml}: {e}")
            print("Try to load it from setup.cfg")
    if not section:
        ini = configparser.ConfigParser()
        with open(setup_cfg) as handle:
            ini.read_file(handle)
        ini.get("versioneer", "VCS")  # raise error if missing
        section = ini["versioneer"]

    # `cast` really shouldn't be used, but it is simplest for the common
    # VersioneerConfig users at the moment. We verify against `None`
    # values elsewhere where it matters.
    cfg = VersioneerConfig()
    cfg.VCS = section["VCS"]
    cfg.style = section.get("style", "")
    cfg.versionfile_source = cast(str, section.get("versionfile_source"))
    cfg.versionfile_build = section.get("versionfile_build")
    cfg.tag_prefix = cast(str, section.get("tag_prefix"))
    if cfg.tag_prefix in ("''", '""', None):
        # A quoted-empty prefix in the config file means "no prefix".
        cfg.tag_prefix = ""
    cfg.parentdir_prefix = section.get("parentdir_prefix")
    if isinstance(section, configparser.SectionProxy):
        # configparser stores strings; make sure "verbose" becomes a bool.
        cfg.verbose = section.getboolean("verbose")
    else:
        cfg.verbose = section.get("verbose")
    return cfg
except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %s" % dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %s" % (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) return None, process.returncode return stdout, process.returncode LONG_VERSION_PY[ "git" ] = r''' # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). Distribution tarballs (built by setup.py sdist) and build # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. # This file is released into the public domain. # Generated by versioneer-0.29 # https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" import errno import os import re import subprocess import sys from typing import Any, Callable, Dict, List, Optional, Tuple import functools def get_keywords() -> Dict[str, str]: """Get the keywords needed to look up the version information.""" # these strings will be replaced by git during git-archive. # setup.py/versioneer.py will grep for the variable names, so they must # each be defined on a line of their own. _version.py will just call # get_keywords(). 
git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} return keywords class VersioneerConfig: """Container for Versioneer configuration parameters.""" VCS: str style: str tag_prefix: str parentdir_prefix: str versionfile_source: str verbose: bool def get_config() -> VersioneerConfig: """Create, populate and return the VersioneerConfig() object.""" # these strings are filled in when 'setup.py versioneer' creates # _version.py cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "%(STYLE)s" cfg.tag_prefix = "%(TAG_PREFIX)s" cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" cfg.verbose = False return cfg class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" LONG_VERSION_PY: Dict[str, str] = {} HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f return decorate def run_command( commands: List[str], args: List[str], cwd: Optional[str] = None, verbose: bool = False, hide_stderr: bool = False, env: Optional[Dict[str, str]] = None, ) -> Tuple[Optional[str], Optional[int]]: """Call the given command(s).""" assert isinstance(commands, list) process = None popen_kwargs: Dict[str, Any] = {} if sys.platform == "win32": # This hides the console window if pythonw.exe is used startupinfo = subprocess.STARTUPINFO() startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW popen_kwargs["startupinfo"] = startupinfo for command in commands: try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git process = 
subprocess.Popen([command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), **popen_kwargs) break except OSError as e: if e.errno == errno.ENOENT: continue if verbose: print("unable to run %%s" %% dispcmd) print(e) return None, None else: if verbose: print("unable to find command, tried %%s" %% (commands,)) return None, None stdout = process.communicate()[0].strip().decode() if process.returncode != 0: if verbose: print("unable to run %%s (error)" %% dispcmd) print("stdout was %%s" %% stdout) return None, process.returncode return stdout, process.returncode def versions_from_parentdir( parentdir_prefix: str, root: str, verbose: bool, ) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return {"version": dirname[len(parentdir_prefix):], "full-revisionid": None, "dirty": False, "error": None, "date": None} rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print("Tried directories %%s but none started with prefix %%s" %% (str(rootdirs), parentdir_prefix)) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
keywords: Dict[str, str] = {} try: with open(versionfile_abs, "r") as fobj: for line in fobj: if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["refnames"] = mo.group(1) if line.strip().startswith("git_full ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["full"] = mo.group(1) if line.strip().startswith("git_date ="): mo = re.search(r'=\s*"(.*)"', line) if mo: keywords["date"] = mo.group(1) except OSError: pass return keywords @register_vcs_handler("git", "keywords") def git_versions_from_keywords( keywords: Dict[str, str], tag_prefix: str, verbose: bool, ) -> Dict[str, Any]: """Get version information from git keywords.""" if "refnames" not in keywords: raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because # it's been around since git-1.5.3, and it's too difficult to # discover which version we're using, or to work around using an # older one. date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) refnames = keywords["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. 
The old git %%d # expansion behaves like git log --decorate=short and strips out the # refs/heads/ and refs/tags/ prefixes that would let us distinguish # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r'\d', r)} if verbose: print("discarding '%%s', no digits" %% ",".join(refs - tags)) if verbose: print("likely tags: %%s" %% ",".join(sorted(tags))) for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix):] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') if not re.match(r'\d', r): continue if verbose: print("picking %%s" %% r) return {"version": r, "full-revisionid": keywords["full"].strip(), "dirty": False, "error": None, "date": date} # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") return {"version": "0+unknown", "full-revisionid": keywords["full"].strip(), "dirty": False, "error": "no suitable tags", "date": None} @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* expanded, and _version.py hasn't already been rewritten with a short version string, meaning we're inside a checked out source tree. """ GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] # GIT_DIR can interfere with correct operation of Versioneer. # It may be intended to be passed to the Versioneer-versioned project, # but that should not change where we get our version from. 
env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %%s not under git control" %% root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner(GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*" ], cwd=root) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. Good or bad. 
branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[:git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? pieces["error"] = ("unable to parse git-describe output: '%%s'" %% describe_out) return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%%s' doesn't start with prefix '%%s'" print(fmt %% (full_tag, tag_prefix)) pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" %% (full_tag, tag_prefix)) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . 
Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . The ".dev0" means not master branch. Note that .dev0 sorts backwards (a feature branch will appear "older" than the master branch). Exceptions: 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]: """Split pep440 version string at the post-release segment. Returns the release segments before the post-release and the post-release version number (or -1 if no post-release segment is present). """ vc = str.split(ver, ".post") return vc[0], int(vc[1] or 0) if len(vc) == 2 else None def render_pep440_pre(pieces: Dict[str, Any]) -> str: """TAG[.postN.devDISTANCE] -- No -dirty. Exceptions: 1: no tags. 
0.post0.devDISTANCE """ if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment tag_version, post_version = pep440_split_post(pieces["closest-tag"]) rendered = tag_version if post_version is not None: rendered += ".post%%d.dev%%d" %% (post_version + 1, pieces["distance"]) else: rendered += ".post0.dev%%d" %% (pieces["distance"]) else: # no commits, use the tag as the version rendered = pieces["closest-tag"] else: # exception #1 rendered = "0.post0.dev%%d" %% pieces["distance"] return rendered def render_pep440_post(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX] . The ".dev0" means dirty. Note that .dev0 sorts backwards (a dirty tree will appear "older" than the corresponding clean one), but you shouldn't be releasing software with -dirty anyways. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] return rendered def render_pep440_post_branch(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . The ".dev0" means not master branch. Exceptions: 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += plus_or_dot(pieces) rendered += "g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["branch"] != "master": rendered += ".dev0" rendered += "+g%%s" %% pieces["short"] if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_old(pieces: Dict[str, Any]) -> str: """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += ".post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" else: # exception #1 rendered = "0.post%%d" %% pieces["distance"] if pieces["dirty"]: rendered += ".dev0" return rendered def render_git_describe(pieces: Dict[str, Any]) -> str: """TAG[-DISTANCE-gHEX][-dirty]. Like 'git describe --tags --dirty --always'. Exceptions: 1: no tags. HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render_git_describe_long(pieces: Dict[str, Any]) -> str: """TAG-DISTANCE-gHEX[-dirty]. Like 'git describe --tags --dirty --always -long'. The distance/hash is unconditional. Exceptions: 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) else: # exception #1 rendered = pieces["short"] if pieces["dirty"]: rendered += "-dirty" return rendered def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: return {"version": "unknown", "full-revisionid": pieces.get("long"), "dirty": None, "error": pieces["error"], "date": None} if not style or style == "default": style = "pep440" # the default if style == "pep440": rendered = render_pep440(pieces) elif style == "pep440-branch": rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) elif style == "pep440-post-branch": rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": rendered = render_git_describe(pieces) elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: raise ValueError("unknown style '%%s'" %% style) return {"version": rendered, "full-revisionid": pieces["long"], "dirty": pieces["dirty"], "error": None, "date": pieces.get("date")} def get_versions() -> Dict[str, Any]: """Get version information or return default if unable to do so.""" # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have # __file__, we can work backwards from there to the root. Some # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which # case we can only use expanded keywords. 
cfg = get_config() verbose = cfg.verbose try: return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) except NotThisMethod: pass try: root = os.path.realpath(__file__) # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. for _ in cfg.versionfile_source.split('/'): root = os.path.dirname(root) except NameError: return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to find root of source tree", "date": None} try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) return render(pieces, cfg.style) except NotThisMethod: pass try: if cfg.parentdir_prefix: return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) except NotThisMethod: pass return {"version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None} ''' @register_vcs_handler("git", "get_keywords") def git_get_keywords(versionfile_abs: str) -> Dict[str, str]: """Extract version information from the given file.""" # the code embedded in _version.py can just fetch the value of these # keywords. When used from setup.py, we don't want to import _version.py, # so we do it with a regexp instead. This function is not used from # _version.py. 
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(
    keywords: Dict[str, str],
    tag_prefix: str,
    verbose: bool,
) -> Dict[str, Any]:
    """Get version information from git keywords."""
    if "refnames" not in keywords:
        raise NotThisMethod("Short version file found")
    date = keywords.get("date")
    if date is not None:
        # Keep only the final line; earlier lines may hold a GPG signature.
        date = date.splitlines()[-1]
        # git-2.2.0 added "%cI", which expands to an ISO-8601-compliant
        # datestamp. However we prefer "%ci" (an "ISO-8601-like" string that
        # must be edited to become compliant), because it has existed since
        # git-1.5.3 and detecting/working around the git version is too
        # difficult. Turn "Y-m-d H:M:S +zzzz" into "Y-m-dTH:M:S+zzzz".
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        # The archive substitution never ran: not a git-archive tarball.
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = {entry.strip() for entry in refnames.strip("()").split(",")}
    # Starting in git-1.8.3 tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0"; when that prefix is present, prefer those entries.
    TAG = "tag: "
    tags = {entry[len(TAG):] for entry in refs if entry.startswith(TAG)}
    if not tags:
        # Either git < 1.8.3, or there really are no tags. Heuristic:
        # assume all version tags contain a digit, which filters out
        # branch names like "release"/"stabilization" plus "HEAD"/"master".
        tags = {entry for entry in refs if re.search(r"\d", entry)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if not ref.startswith(tag_prefix):
            continue
        candidate = ref[len(tag_prefix):]
        # Filter out refs that exactly match the prefix or that don't
        # continue with a digit once the prefix is stripped (mostly a
        # concern when the prefix is '').
        if not re.match(r"\d", candidate):
            continue
        if verbose:
            print("picking %s" % candidate)
        return {
            "version": candidate,
            "full-revisionid": keywords["full"].strip(),
            "dirty": False,
            "error": None,
            "date": date,
        }
    # No suitable tag: version is "0+unknown", but the full hex survives.
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {
        "version": "0+unknown",
        "full-revisionid": keywords["full"].strip(),
        "dirty": False,
        "error": "no suitable tags",
        "date": None,
    }
env = os.environ.copy() env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) raise NotThisMethod("'git rev-parse --git-dir' returned error") # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) describe_out, rc = runner( GITS, [ "describe", "--tags", "--dirty", "--always", "--long", "--match", f"{tag_prefix}[[:digit:]]*", ], cwd=root, ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() pieces: Dict[str, Any] = {} pieces["long"] = full_out pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") branch_name = branch_name.strip() if branch_name == "HEAD": # If we aren't exactly on a branch, pick a branch which represents # the current commit. If all else fails, we are on a branchless # commit. branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) # --contains was added in git-1.5.4 if rc != 0 or branches is None: raise NotThisMethod("'git branch --contains' returned error") branches = branches.split("\n") # Remove the first line if we're running detached if "(" in branches[0]: branches.pop(0) # Strip off the leading "* " from the list of branches. branches = [branch[2:] for branch in branches] if "master" in branches: branch_name = "master" elif not branches: branch_name = None else: # Pick the first branch that is returned. 
Good or bad. branch_name = branches[0] pieces["branch"] = branch_name # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out # look for -dirty suffix dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( full_tag, tag_prefix, ) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) # commit: short hex revision ID pieces["short"] = mo.group(3) else: # HEX: no tags pieces["closest-tag"] = None out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces def do_vcs_install(versionfile_source: str, ipy: Optional[str]) -> None: """Git-specific installation logic for Versioneer. For Git, this means creating/changing .gitattributes to mark _version.py for export-subst keyword substitution. 
""" GITS = ["git"] if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] files = [versionfile_source] if ipy: files.append(ipy) if "VERSIONEER_PEP518" not in globals(): try: my_path = __file__ if my_path.endswith((".pyc", ".pyo")): my_path = os.path.splitext(my_path)[0] + ".py" versioneer_file = os.path.relpath(my_path) except NameError: versioneer_file = "versioneer.py" files.append(versioneer_file) present = False try: with open(".gitattributes", "r") as fobj: for line in fobj: if line.strip().startswith(versionfile_source): if "export-subst" in line.strip().split()[1:]: present = True break except OSError: pass if not present: with open(".gitattributes", "a+") as fobj: fobj.write(f"{versionfile_source} export-subst\n") files.append(".gitattributes") run_command(GITS, ["add", "--"] + files) def versions_from_parentdir( parentdir_prefix: str, root: str, verbose: bool, ) -> Dict[str, Any]: """Try to determine the version from the parent directory name. Source tarballs conventionally unpack into a directory that includes both the project name and a version string. We will also support searching up two directory levels for an appropriately named parent directory """ rootdirs = [] for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return { "version": dirname[len(parentdir_prefix) :], "full-revisionid": None, "dirty": False, "error": None, "date": None, } rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: print( "Tried directories %s but none started with prefix %s" % (str(rootdirs), parentdir_prefix) ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") SHORT_VERSION_PY = """ # This file was generated by 'versioneer.py' (0.29) from # revision-control system data, or from the parent directory name of an # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. 
import json version_json = ''' %s ''' # END VERSION_JSON def get_versions(): return json.loads(version_json) """ def versions_from_file(filename: str) -> Dict[str, Any]: """Try to determine the version from _version.py if present.""" try: with open(filename) as f: contents = f.read() except OSError: raise NotThisMethod("unable to read _version.py") mo = re.search( r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S ) if not mo: mo = re.search( r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S ) if not mo: raise NotThisMethod("no version_json in _version.py") return json.loads(mo.group(1)) def write_to_version_file(filename: str, versions: Dict[str, Any]) -> None: """Write the given version number to the given _version.py file.""" contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) print("set %s to '%s'" % (filename, versions["version"])) def plus_or_dot(pieces: Dict[str, Any]) -> str: """Return a + if we don't already have one, else return a .""" if "+" in pieces.get("closest-tag", ""): return "." return "+" def render_pep440(pieces: Dict[str, Any]) -> str: """Build up version string, with post-release "local version identifier". Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty Exceptions: 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" else: # exception #1 rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered def render_pep440_branch(pieces: Dict[str, Any]) -> str: """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . 
    The ".dev0" means not master branch.  Note that .dev0 sorts backwards
    (a feature branch will appear "older" than the master branch).

    Exceptions:
    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0"
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
    """Split pep440 version string at the post-release segment.

    Returns the release segments before the post-release and the
    post-release version number (or None if no post-release segment
    is present).
    """
    vc = str.split(ver, ".post")
    return vc[0], int(vc[1] or 0) if len(vc) == 2 else None


def render_pep440_pre(pieces: Dict[str, Any]) -> str:
    """TAG[.postN.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post0.devDISTANCE
    """
    if pieces["closest-tag"]:
        if pieces["distance"]:
            # update the post release segment
            tag_version, post_version = pep440_split_post(pieces["closest-tag"])
            rendered = tag_version
            if post_version is not None:
                rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
            else:
                rendered += ".post0.dev%d" % (pieces["distance"])
        else:
            # no commits, use the tag as the version
            rendered = pieces["closest-tag"]
    else:
        # exception #1
        rendered = "0.post0.dev%d" % pieces["distance"]
    return rendered


def render_pep440_post(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty.  Note that .dev0 sorts backwards (a dirty tree
    will appear "older" than the corresponding clean one), but you shouldn't
    be releasing software with -dirty anyways.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "g%s" % pieces["short"]
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
        rendered += "+g%s" % pieces["short"]
    return rendered


def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .

    The ".dev0" means not master branch.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "g%s" % pieces["short"]
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+g%s" % pieces["short"]
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def render_pep440_old(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
    return rendered


def render_git_describe(pieces: Dict[str, Any]) -> str:
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.

    Exceptions:
    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"]:
            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered


def render_git_describe_long(pieces: Dict[str, Any]) -> str:
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always --long'.
    The distance/hash is unconditional.

    Exceptions:
    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered


def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
    """Render the given version pieces into the requested style.

    Dispatches on ``style`` to one of the render_* helpers above; an empty
    or "default" style means "pep440".  A pieces dict carrying an "error"
    short-circuits to a "version": "unknown" result.
    """
    if pieces["error"]:
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": pieces["error"],
            "date": None,
        }

    if not style or style == "default":
        style = "pep440"  # the default

    if style == "pep440":
        rendered = render_pep440(pieces)
    elif style == "pep440-branch":
        rendered = render_pep440_branch(pieces)
    elif style == "pep440-pre":
        rendered = render_pep440_pre(pieces)
    elif style == "pep440-post":
        rendered = render_pep440_post(pieces)
    elif style == "pep440-post-branch":
        rendered = render_pep440_post_branch(pieces)
    elif style == "pep440-old":
        rendered = render_pep440_old(pieces)
    elif style == "git-describe":
        rendered = render_git_describe(pieces)
    elif style == "git-describe-long":
        rendered = render_git_describe_long(pieces)
    else:
        raise ValueError("unknown style '%s'" % style)

    return {
        "version": rendered,
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }


class VersioneerBadRootError(Exception):
    """The project root directory is unknown or missing key files."""


def get_versions(verbose: bool = False) -> Dict[str,
Any]: """Get the project version from whatever source is available. Returns dict with two keys: 'version' and 'full'. """ if "versioneer" in sys.modules: # see the discussion in cmdclass.py:get_cmdclass() del sys.modules["versioneer"] root = get_root() cfg = get_config_from_root(root) assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" handlers = HANDLERS.get(cfg.VCS) assert handlers, "unrecognized VCS '%s'" % cfg.VCS verbose = verbose or bool(cfg.verbose) # `bool()` used to avoid `None` assert ( cfg.versionfile_source is not None ), "please set versioneer.versionfile_source" assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" versionfile_abs = os.path.join(root, cfg.versionfile_source) # extract version from first of: _version.py, VCS command (e.g. 'git # describe'), parentdir. This is meant to work for developers using a # source checkout, for users of a tarball created by 'setup.py sdist', # and for users of a tarball/zipball created by 'git archive' or github's # download-from-tag feature or the equivalent in other VCSes. 
get_keywords_f = handlers.get("get_keywords") from_keywords_f = handlers.get("keywords") if get_keywords_f and from_keywords_f: try: keywords = get_keywords_f(versionfile_abs) ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) if verbose: print("got version from expanded keyword %s" % ver) return ver except NotThisMethod: pass try: ver = versions_from_file(versionfile_abs) if verbose: print("got version from file %s %s" % (versionfile_abs, ver)) return ver except NotThisMethod: pass from_vcs_f = handlers.get("pieces_from_vcs") if from_vcs_f: try: pieces = from_vcs_f(cfg.tag_prefix, root, verbose) ver = render(pieces, cfg.style) if verbose: print("got version from VCS %s" % ver) return ver except NotThisMethod: pass try: if cfg.parentdir_prefix: ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) if verbose: print("got version from parentdir %s" % ver) return ver except NotThisMethod: pass if verbose: print("unable to compute version") return { "version": "0+unknown", "full-revisionid": None, "dirty": None, "error": "unable to compute version", "date": None, } def get_version() -> str: """Get the short version string for this project.""" return get_versions()["version"] def get_cmdclass(cmdclass: Optional[Dict[str, Any]] = None): """Get the custom setuptools subclasses used by Versioneer. If the package uses a different cmdclass (e.g. one from numpy), it should be provide as an argument. """ if "versioneer" in sys.modules: del sys.modules["versioneer"] # this fixes the "python setup.py develop" case (also 'install' and # 'easy_install .'), in which subdependencies of the main project are # built (using setup.py bdist_egg) in the same python process. Assume # a main project A and a dependency B, which use different versions # of Versioneer. A's setup.py imports A's Versioneer, leaving it in # sys.modules by the time B's setup.py is executed, causing B to run # with the wrong versioneer. 
Setuptools wraps the sub-dep builds in a # sandbox that restores sys.modules to it's pre-build state, so the # parent is protected against the child's "import versioneer". By # removing ourselves from sys.modules here, before the child build # happens, we protect the child from the parent's versioneer too. # Also see https://github.com/python-versioneer/python-versioneer/issues/52 cmds = {} if cmdclass is None else cmdclass.copy() # we add "version" to setuptools from setuptools import Command class cmd_version(Command): description = "report generated version string" user_options: List[Tuple[str, str, str]] = [] boolean_options: List[str] = [] def initialize_options(self) -> None: pass def finalize_options(self) -> None: pass def run(self) -> None: vers = get_versions(verbose=True) print("Version: %s" % vers["version"]) print(" full-revisionid: %s" % vers.get("full-revisionid")) print(" dirty: %s" % vers.get("dirty")) print(" date: %s" % vers.get("date")) if vers["error"]: print(" error: %s" % vers["error"]) cmds["version"] = cmd_version # we override "build_py" in setuptools # # most invocation pathways end up running build_py: # distutils/build -> build_py # distutils/install -> distutils/build ->.. # setuptools/bdist_wheel -> distutils/install ->.. # setuptools/bdist_egg -> distutils/install_lib -> build_py # setuptools/install -> bdist_egg ->.. # setuptools/develop -> ? # pip install: # copies source tree to a tempdir before running egg_info/etc # if .git isn't copied too, 'git describe' will fail # then does setup.py bdist_wheel, or sometimes setup.py install # setup.py egg_info -> ? # pip install -e . and setuptool/editable_wheel will invoke build_py # but the build_py command is not expected to copy any files. 
# we override different "build_py" commands for both environments if "build_py" in cmds: _build_py: Any = cmds["build_py"] else: from setuptools.command.build_py import build_py as _build_py class cmd_build_py(_build_py): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_py.run(self) if getattr(self, "editable_mode", False): # During editable installs `.py` and data files are # not copied to build_lib return # now locate _version.py in the new build/ directory and replace # it with an updated value if cfg.versionfile_build: target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_py"] = cmd_build_py if "build_ext" in cmds: _build_ext: Any = cmds["build_ext"] else: from setuptools.command.build_ext import build_ext as _build_ext class cmd_build_ext(_build_ext): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() _build_ext.run(self) if self.inplace: # build_ext --inplace will only build extensions in # build/lib<..> dir with no _version.py to write to. # As in place builds will already have a _version.py # in the module dir, we do not need to write one. return # now locate _version.py in the new build/ directory and replace # it with an updated value if not cfg.versionfile_build: return target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) if not os.path.exists(target_versionfile): print( f"Warning: {target_versionfile} does not exist, skipping " "version update. This can happen if you are running build_ext " "without first running build_py." ) return print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) cmds["build_ext"] = cmd_build_ext if "cx_Freeze" in sys.modules: # cx_freeze enabled? 
from cx_Freeze.dist import build_exe as _build_exe # type: ignore # nczeczulin reports that py2exe won't like the pep440-style string # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. # setup(console=[{ # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION # "product_version": versioneer.get_version(), # ... class cmd_build_exe(_build_exe): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _build_exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write( LONG % { "DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, } ) cmds["build_exe"] = cmd_build_exe del cmds["build_py"] if "py2exe" in sys.modules: # py2exe enabled? 
try: from py2exe.setuptools_buildexe import py2exe as _py2exe # type: ignore except ImportError: from py2exe.distutils_buildexe import py2exe as _py2exe # type: ignore class cmd_py2exe(_py2exe): def run(self) -> None: root = get_root() cfg = get_config_from_root(root) versions = get_versions() target_versionfile = cfg.versionfile_source print("UPDATING %s" % target_versionfile) write_to_version_file(target_versionfile, versions) _py2exe.run(self) os.unlink(target_versionfile) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write( LONG % { "DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, } ) cmds["py2exe"] = cmd_py2exe # sdist farms its file list building out to egg_info if "egg_info" in cmds: _egg_info: Any = cmds["egg_info"] else: from setuptools.command.egg_info import egg_info as _egg_info class cmd_egg_info(_egg_info): def find_sources(self) -> None: # egg_info.find_sources builds the manifest list and writes it # in one shot super().find_sources() # Modify the filelist and normalize it root = get_root() cfg = get_config_from_root(root) self.filelist.append("versioneer.py") if cfg.versionfile_source: # There are rare cases where versionfile_source might not be # included by default, so we must be explicit self.filelist.append(cfg.versionfile_source) self.filelist.sort() self.filelist.remove_duplicates() # The write method is hidden in the manifest_maker instance that # generated the filelist and was thrown away # We will instead replicate their final normalization (to unicode, # and POSIX-style paths) from setuptools import unicode_utils normalized = [ unicode_utils.filesys_decode(f).replace(os.sep, "/") for f in self.filelist.files ] manifest_filename = os.path.join(self.egg_info, "SOURCES.txt") with open(manifest_filename, "w") as fobj: fobj.write("\n".join(normalized)) cmds["egg_info"] = cmd_egg_info # we override different 
"sdist" commands for both environments if "sdist" in cmds: _sdist: Any = cmds["sdist"] else: from setuptools.command.sdist import sdist as _sdist class cmd_sdist(_sdist): def run(self) -> None: versions = get_versions() self._versioneer_generated_versions = versions # unless we update this, the command will keep using the old # version self.distribution.metadata.version = versions["version"] return _sdist.run(self) def make_release_tree(self, base_dir: str, files: List[str]) -> None: root = get_root() cfg = get_config_from_root(root) _sdist.make_release_tree(self, base_dir, files) # now locate _version.py in the new base_dir directory # (remembering that it may be a hardlink) and replace it with an # updated value target_versionfile = os.path.join(base_dir, cfg.versionfile_source) print("UPDATING %s" % target_versionfile) write_to_version_file( target_versionfile, self._versioneer_generated_versions ) cmds["sdist"] = cmd_sdist return cmds CONFIG_ERROR = """ setup.cfg is missing the necessary Versioneer configuration. You need a section like: [versioneer] VCS = git style = pep440 versionfile_source = src/myproject/_version.py versionfile_build = myproject/_version.py tag_prefix = parentdir_prefix = myproject- You will also need to edit your setup.py to use the results: import versioneer setup(version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), ...) Please read the docstring in ./versioneer.py for configuration instructions, edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. """ SAMPLE_CONFIG = """ # See the docstring in versioneer.py for instructions. Note that you must # re-run 'versioneer.py setup' after changing this section, and commit the # resulting files. [versioneer] #VCS = git #style = pep440 #versionfile_source = #versionfile_build = #tag_prefix = #parentdir_prefix = """ OLD_SNIPPET = """ from ._version import get_versions __version__ = get_versions()['version'] del get_versions """ INIT_PY_SNIPPET = """ from . 
import {0} __version__ = {0}.get_versions()['version'] """ def do_setup() -> int: """Do main VCS-independent setup function for installing Versioneer.""" root = get_root() try: cfg = get_config_from_root(root) except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (OSError, configparser.NoSectionError)): print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: f.write(SAMPLE_CONFIG) print(CONFIG_ERROR, file=sys.stderr) return 1 print(" creating %s" % cfg.versionfile_source) with open(cfg.versionfile_source, "w") as f: LONG = LONG_VERSION_PY[cfg.VCS] f.write( LONG % { "DOLLAR": "$", "STYLE": cfg.style, "TAG_PREFIX": cfg.tag_prefix, "PARENTDIR_PREFIX": cfg.parentdir_prefix, "VERSIONFILE_SOURCE": cfg.versionfile_source, } ) ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") maybe_ipy: Optional[str] = ipy if os.path.exists(ipy): try: with open(ipy, "r") as f: old = f.read() except OSError: old = "" module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] snippet = INIT_PY_SNIPPET.format(module) if OLD_SNIPPET in old: print(" replacing boilerplate in %s" % ipy) with open(ipy, "w") as f: f.write(old.replace(OLD_SNIPPET, snippet)) elif snippet not in old: print(" appending to %s" % ipy) with open(ipy, "a") as f: f.write(snippet) else: print(" %s unmodified" % ipy) else: print(" %s doesn't exist, ok" % ipy) maybe_ipy = None # Make VCS-specific changes. For git, this means creating/changing # .gitattributes to mark _version.py for export-subst keyword # substitution. 
    # Make VCS-specific changes (for git: .gitattributes export-subst).
    do_vcs_install(cfg.versionfile_source, maybe_ipy)
    return 0


def scan_setup_py() -> int:
    """Validate the contents of setup.py against Versioneer's expectations.

    Returns the number of problems found (0 means setup.py looks OK).
    """
    found = set()
    setters = False
    errors = 0
    with open("setup.py", "r") as f:
        for line in f.readlines():
            if "import versioneer" in line:
                found.add("import")
            if "versioneer.get_cmdclass()" in line:
                found.add("cmdclass")
            if "versioneer.get_version()" in line:
                found.add("get_version")
            if "versioneer.VCS" in line:
                setters = True
            if "versioneer.versionfile_source" in line:
                setters = True
    if len(found) != 3:
        print("")
        print("Your setup.py appears to be missing some important items")
        print("(but I might be wrong). Please make sure it has something")
        print("roughly like the following:")
        print("")
        print(" import versioneer")
        print(" setup( version=versioneer.get_version(),")
        print("        cmdclass=versioneer.get_cmdclass(),  ...)")
        print("")
        errors += 1
    if setters:
        # pre-0.12 style configuration via module attributes is obsolete
        print("You should remove lines like 'versioneer.VCS = ' and")
        print("'versioneer.versionfile_source = ' . This configuration")
        print("now lives in setup.cfg, and should be removed from setup.py")
        print("")
        errors += 1
    return errors


def setup_command() -> NoReturn:
    """Set up Versioneer and exit with appropriate error code."""
    errors = do_setup()
    errors += scan_setup_py()
    sys.exit(1 if errors else 0)


if __name__ == "__main__":
    cmd = sys.argv[1]
    if cmd == "setup":
        setup_command()