pax_global_header 0000666 0000000 0000000 00000000064 15141211055 0014505 g ustar 00root root 0000000 0000000 52 comment=1181d335955418f081a1d0b94c3d8350cea0751f
s3fs-2026.2.0/ 0000775 0000000 0000000 00000000000 15141211055 0012614 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/.coveragerc 0000664 0000000 0000000 00000000177 15141211055 0014742 0 ustar 00root root 0000000 0000000 [run]
include =
s3fs/*
omit =
s3fs/tests/test*
[report]
show_missing = True
[html]
directory = coverage_html_report
s3fs-2026.2.0/.gitattributes 0000664 0000000 0000000 00000000036 15141211055 0015506 0 ustar 00root root 0000000 0000000 s3fs/_version.py export-subst
s3fs-2026.2.0/.github/ 0000775 0000000 0000000 00000000000 15141211055 0014154 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/.github/workflows/ 0000775 0000000 0000000 00000000000 15141211055 0016211 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/.github/workflows/ci.yml 0000664 0000000 0000000 00000002556 15141211055 0017337 0 ustar 00root root 0000000 0000000 name: CI
on: [push, pull_request]
jobs:
test:
name: Python ${{ matrix.python-version }} - AioBotocore ${{ matrix.aiobotocore-version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version:
- "3.10"
- "3.11"
- "3.12"
- "3.13"
- "3.14"
aiobotocore-version: [">=2.19.0,<2.20.0", "<3.0.0", "<4.0.0"]
env:
BOTO_CONFIG: /dev/null
AWS_ACCESS_KEY_ID: foobar_key
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- name: Checkout source
uses: actions/checkout@v5
with:
fetch-depth: 0
- name: Setup conda
uses: conda-incubator/setup-miniconda@v3
with:
environment-file: ci/env.yaml
python-version: ${{ matrix.python-version }}
- name: Install
shell: bash -l {0}
run: |
pip install git+https://github.com/fsspec/filesystem_spec
pip install --upgrade "aiobotocore${{ matrix.aiobotocore-version }}"
pip install . --no-deps
pip list
- name: Run Tests
shell: bash -l {0}
run: pytest -vv -s s3fs
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
- uses: actions/setup-python@v6
with:
python-version: "3.11"
- uses: pre-commit/action@v3.0.1
s3fs-2026.2.0/.gitignore 0000664 0000000 0000000 00000000136 15141211055 0014604 0 ustar 00root root 0000000 0000000 *.pyc
.cache/
.pytest_cache/
.python-version
.idea/
__pycache__
dist/
*.egg-info
build/
venv/
s3fs-2026.2.0/.pre-commit-config.yaml 0000664 0000000 0000000 00000000660 15141211055 0017077 0 ustar 00root root 0000000 0000000 repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v6.0.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 25.9.0
hooks:
- id: black
exclude: ^docs/
- repo: https://github.com/pycqa/flake8
rev: 7.3.0
hooks:
- id: flake8
exclude: tests/|^docs/|__init__.py
s3fs-2026.2.0/.readthedocs.yaml 0000664 0000000 0000000 00000000360 15141211055 0016042 0 ustar 00root root 0000000 0000000 version: 2
build:
os: ubuntu-22.04
tools:
python: miniconda3-4.7
conda:
environment: docs/environment.yml
python:
install:
- method: pip
path: .
sphinx:
configuration: docs/source/conf.py
fail_on_warning: true
s3fs-2026.2.0/CONTRIBUTING.md 0000664 0000000 0000000 00000000216 15141211055 0015044 0 ustar 00root root 0000000 0000000 s3fs is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more.
s3fs-2026.2.0/LICENSE.txt 0000664 0000000 0000000 00000002741 15141211055 0014443 0 ustar 00root root 0000000 0000000 Copyright (c) 2016, Continuum Analytics, Inc. and contributors
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
Neither the name of Continuum Analytics nor the names of any contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
s3fs-2026.2.0/MANIFEST.in 0000664 0000000 0000000 00000000341 15141211055 0014350 0 ustar 00root root 0000000 0000000 recursive-include s3fs *.py
recursive-include docs *.rst
include setup.py
include README.rst
include LICENSE.txt
include MANIFEST.in
include requirements.txt
prune docs/_build
include versioneer.py
include s3fs/_version.py
s3fs-2026.2.0/README.md 0000664 0000000 0000000 00000001307 15141211055 0014074 0 ustar 00root root 0000000 0000000 s3fs
====
[|Build Status|](https://github.com/fsspec/s3fs/actions)
[|Documentation|](https://s3fs.readthedocs.io/en/latest/?badge=latest)
S3FS builds on [aiobotocore](https://aiobotocore.readthedocs.io/en/latest/)
to provide a convenient Python filesystem interface for S3.
Support
-------
Work on this repository is supported in part by:
"Anaconda, Inc. - Advancing AI through open source."
s3fs-2026.2.0/ci/ 0000775 0000000 0000000 00000000000 15141211055 0013207 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/ci/env.yaml 0000664 0000000 0000000 00000000361 15141211055 0014663 0 ustar 00root root 0000000 0000000 name: test_env
channels:
- conda-forge
dependencies:
- pytest
- pytest-asyncio
- pip
- pytest
- ujson
- requests
- decorator
- pytest-timeout
- flake8
- black
- httpretty
- aiobotocore
- moto
- flask
- fsspec
s3fs-2026.2.0/docs/ 0000775 0000000 0000000 00000000000 15141211055 0013544 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/docs/Makefile 0000664 0000000 0000000 00000016751 15141211055 0015216 0 ustar 00root root 0000000 0000000 # Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
.PHONY: help
help:
@echo "Please use \`make ' where is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " applehelp to make an Apple Help Book"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " xml to make Docutils-native XML files"
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
@echo " coverage to run coverage check of the documentation (if enabled)"
.PHONY: clean
clean:
rm -rf $(BUILDDIR)/*
.PHONY: html
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
.PHONY: dirhtml
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
.PHONY: singlehtml
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
.PHONY: pickle
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
.PHONY: json
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
.PHONY: htmlhelp
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
.PHONY: qthelp
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/S3Fs.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/S3Fs.qhc"
.PHONY: applehelp
applehelp:
$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
@echo
@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
@echo "N.B. You won't be able to view it unless you put it in" \
"~/Library/Documentation/Help or install it in your application" \
"bundle."
.PHONY: devhelp
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/S3Fs"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/S3Fs"
@echo "# devhelp"
.PHONY: epub
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
.PHONY: latex
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
.PHONY: latexpdf
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
.PHONY: latexpdfja
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
.PHONY: text
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
.PHONY: man
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
.PHONY: texinfo
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
.PHONY: info
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
.PHONY: gettext
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
.PHONY: changes
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
.PHONY: linkcheck
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
.PHONY: doctest
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
.PHONY: coverage
coverage:
$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
@echo "Testing of coverage in the sources finished, look at the " \
"results in $(BUILDDIR)/coverage/python.txt."
.PHONY: xml
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
.PHONY: pseudoxml
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
s3fs-2026.2.0/docs/environment.yml 0000664 0000000 0000000 00000000200 15141211055 0016623 0 ustar 00root root 0000000 0000000 name: s3fs
channels:
- defaults
dependencies:
- python= 3.10
- botocore
- docutils<0.17
- sphinx
- sphinx_rtd_theme
s3fs-2026.2.0/docs/make.bat 0000664 0000000 0000000 00000016126 15141211055 0015157 0 ustar 00root root 0000000 0000000 @ECHO OFF
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set BUILDDIR=build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
set I18NSPHINXOPTS=%SPHINXOPTS% source
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^` where ^ is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. singlehtml to make a single large HTML file
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. devhelp to make HTML files and a Devhelp project
echo. epub to make an epub
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. text to make text files
echo. man to make manual pages
echo. texinfo to make Texinfo files
echo. gettext to make PO message catalogs
echo. changes to make an overview over all changed/added/deprecated items
echo. xml to make Docutils-native XML files
echo. pseudoxml to make pseudoxml-XML files for display purposes
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
echo. coverage to run coverage check of the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
REM Check if sphinx-build is available and fallback to Python version if any
%SPHINXBUILD% 1>NUL 2>NUL
if errorlevel 9009 goto sphinx_python
goto sphinx_ok
:sphinx_python
set SPHINXBUILD=python -m sphinx.__init__
%SPHINXBUILD% 2> nul
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
:sphinx_ok
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "singlehtml" (
%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\S3Fs.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\S3Fs.ghc
goto end
)
if "%1" == "devhelp" (
%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
if errorlevel 1 exit /b 1
echo.
echo.Build finished.
goto end
)
if "%1" == "epub" (
%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The epub file is in %BUILDDIR%/epub.
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
if errorlevel 1 exit /b 1
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdf" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf
cd %~dp0
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "latexpdfja" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
cd %BUILDDIR%/latex
make all-pdf-ja
cd %~dp0
echo.
echo.Build finished; the PDF files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "text" (
%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The text files are in %BUILDDIR%/text.
goto end
)
if "%1" == "man" (
%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The manual pages are in %BUILDDIR%/man.
goto end
)
if "%1" == "texinfo" (
%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
goto end
)
if "%1" == "gettext" (
%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
if errorlevel 1 exit /b 1
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
if errorlevel 1 exit /b 1
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
if errorlevel 1 exit /b 1
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
if "%1" == "coverage" (
%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
if errorlevel 1 exit /b 1
echo.
echo.Testing of coverage in the sources finished, look at the ^
results in %BUILDDIR%/coverage/python.txt.
goto end
)
if "%1" == "xml" (
%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The XML files are in %BUILDDIR%/xml.
goto end
)
if "%1" == "pseudoxml" (
%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
if errorlevel 1 exit /b 1
echo.
echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
goto end
)
:end
s3fs-2026.2.0/docs/source/ 0000775 0000000 0000000 00000000000 15141211055 0015044 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/docs/source/_static/ 0000775 0000000 0000000 00000000000 15141211055 0016472 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/docs/source/_static/custom.css 0000664 0000000 0000000 00000000124 15141211055 0020513 0 ustar 00root root 0000000 0000000 .classifier:before {
font-style: normal;
margin: 0.5em;
content: ":";
}
s3fs-2026.2.0/docs/source/api.rst 0000664 0000000 0000000 00000001612 15141211055 0016347 0 ustar 00root root 0000000 0000000 API
===
.. currentmodule:: s3fs.core
.. autosummary::
S3FileSystem
S3FileSystem.cat
S3FileSystem.du
S3FileSystem.exists
S3FileSystem.find
S3FileSystem.get
S3FileSystem.glob
S3FileSystem.info
S3FileSystem.ls
S3FileSystem.mkdir
S3FileSystem.mv
S3FileSystem.open
S3FileSystem.put
S3FileSystem.read_block
S3FileSystem.rm
S3FileSystem.tail
S3FileSystem.touch
.. autosummary::
S3File
S3File.close
S3File.flush
S3File.info
S3File.read
S3File.seek
S3File.tell
S3File.write
.. currentmodule:: s3fs.mapping
.. autosummary::
S3Map
.. currentmodule:: s3fs.core
.. autoclass:: S3FileSystem
:members:
:inherited-members:
.. autoclass:: S3File
:members:
:inherited-members:
.. currentmodule:: s3fs.mapping
.. autofunction:: S3Map
.. currentmodule:: s3fs.utils
.. autoclass:: ParamKwargsHelper
.. autoclass:: SSEParams
s3fs-2026.2.0/docs/source/changelog.rst 0000664 0000000 0000000 00000016470 15141211055 0017535 0 ustar 00root root 0000000 0000000 Changelog
=========
2026.2.0
--------
- add custom error handling (#1003)
- do delete placeholders with rm(recursive=True) (#1005)
- force new session if it was explicitly closed (#1002)
2026.1.0
--------
- allow aiobotocore 3 (#998)
2025.12.0
---------
- remove optional dependencies (#995)
- add support for py3.14 and remove 3.9 (#993)
- add link docs->repo (#992)
2025.10.0
---------
- get bucket info on demand (#987)
- add CoC (#986)
- add goatcounter tracker (#985)
2025.9.0
--------
- update README for distribution compliance (#977)
2025.7.0
--------
- fix exclusive write for small files (#974)
- acknowledge Anaconda support (#972)
- fix test typo (#970)
2025.5.1
--------
no changes
2025.5.0
--------
- simpler requirements syntax (#958)
- use head_bucket for info(bucket) (#961)
2025.3.2
--------
no changes
2025.3.1
--------
- get_event_loop -> get_running_loop at shutdown (#954)
2025.3.0
--------
- recreate sessino object on refresh (#939)
- re-enable CI tests (#940)
2025.2.0
--------
- update docstrings to new default values (#934)
- fix CI (#936)
2024.12.0
---------
- CI fixes (#922)
- smaller threshold for copy_managed (#921)
- exclusive write (#917)
- fix bug in _find (#913)
- parse query without upstream infer_storage_options (#912)
- bug in _upload_file_part_concurrent (#910)
2024.10.0
---------
- invalidate cache in one-shot pipe file (#904)
- make pipe() concurrent (#901)
- add py3.13 (#898)
- suppoert R2 multi-part uploads (#888)
2024.9.0
--------
no change
2024.6.1
--------
no changes
2024.6.0
--------
no changes
2024.5.0
--------
- widen fsspec req version (#869)
- _bulk_delete must return list (#866)
- retry on "reduce request rate" (#865)
2024.3.1
--------
- accept kwargs in get_file (#863)
2024.3.0
--------
- don't fail ls is parent is unaccessible (#860)
- allow checksum error to retry (#858)
- don't lsbuckets for isdir(bucket) (#856)
- concurrent uplads of parts in put_file (#848)
2024.2.0
--------
- fix cache lookup in _info (#840)
2023.12.2
---------
no changes
2023.12.1
---------
- revert fallback to anon (#835)
2023.12.0
---------
- fall back to anon if no creds are found or passed at all (#823)
- **relax version bounds for aiobotocore** (#829)
- avoid key error if LastModified missing (#828)
- add make_mucket_versioned method (#825)
- retain TZ on modified time (#818)
2023.10.0
---------
- make protocol attribute a tuple (#812)
- update to aiobotocore 2.7.0 (#809)
- fix in _get_file following failure after connect (#805)
- test for du of nonexistent (#803)
2023.9.2
--------
- allow size= in fs.open() (#797)
- rmdir for non-bucket (#975)
- moto updates (#973)
- fix CI warnings (#792)
- dircache usage with depth (#791)
2023.9.1
--------
- retry ClientPayloadError while reading after initial connection (#787)
- don't pass ACL if not specified (#785)
2023.9.0
--------
- aiobotocore to 2.5.4
- better ** support in bulk ops/glob (#769)
- default ACL to "private" rather than blank (#764)
- invalidate cache in rm_file (#762)
- closing client in running loop (#760)
2023.6.0
--------
- allow versions in info.exists (#746)
- streaming file to update it's size for tell (#745, 741)
2023.5.0
--------
- Fix "_" in xattrs tests (#732)
- Fix file pointer already at end of file when retrying put (#731)
- Fix repeated find corrupting cache (#730)
- Remove duplicate class definition (#727)
- return list of deleted keys in bulk deleted (#726)
2023.4.0
--------
- Add streaming async read file (#722)
- Doc fixes (#721)
- aiobotocore to 2.5.0 (#710)
2023.3.0
--------
- Allow setting endpoint_url as top-level kwarg (#704)
- minimum python version 3.8 (#702)
- Update docs config (#697)
- get/put/cp recursive extra tests (#691)
2023.1.0
--------
- parse lambda ARNs (#686)
- recursive on chmod (#679)
- default cache to be readahead (#678)
- temporary redirects in headBucket (#676)
- async iterator for listings (#670)
2022.11.0
---------
- optionally listing versions with ls (#661)
2022.10.0
---------
- directory cache race condition (#655)
- version aware find (#654)
2022.8.1
--------
(no change)
2022.8.0
--------
- aiobotocore 2.4.0 (#643)
- del/list multipart uploads (#645)
- disallow prerelease aiohttp (#640)
- docs syntax (#634)
2022.7.1
--------
No changes
2022.7.0
--------
- aiobotocore 2.3.4 (#633)
2022.5.0
--------
- aiobotocore 2.3 (#622, fixes #558)
- rate limiting (#619, #620)
2022.3.0
--------
- pre-commit (#612)
- aiobotocore 2.2 (#609)
- empty ETag (#605)
- HTTPClientError retry (#597)
- new callbacks support (#590)
2022.02.0
---------
- callbacks fixes (#594, 590)
- drop py36 (#582)
- metadata fixes (#575, 579)
2022.01.0
---------
- aiobotocore dep to 2.1.0 (#564)
- docs for non-aws (#567)
- ContentType in info (#570)
- small-file ACL (#574)
2021.11.1
---------
- deal with missing ETag (#557)
- ClientPayloadError to retryable (#556)
- pin aiobotocore (#555)
2021.11.0
---------
- move to fsspec org
- doc tweaks (#546, 540)
- redondant argument in _rm_versioned_bucket_contents (#439)
- allow client_method in url/sign (POST, etc) (#536)
- revert list_v2->head for info (#545)
2021.10.1
---------
- allow other methods than GET to url/sign (#536)
2021.10.0
---------
No changes (just released to keep pin with fsspec)
2021.09.0
---------
- check for bucket also with get_bucket_location (#533)
- update versioneer (#531)
2021.08.1
---------
- retry on IncompleteRead (#525)
- fix isdir for missing bucket (#522)
- raise for glob("*") (#5167)
2021.08.0
---------
- fix for aiobotocore update (#510)
2021.07.0
---------
- make bucket in put(recursive) (#496)
- non-truthy prefixes (#497)
- implement rm_file (#499)
2021.06.1
---------
- bucket region caching (#495)
2021.06.0
---------
- support "prefix" in directory listings (#486)
- support negative index in cat_file (#487, 488)
- don't requite ETag in file details (#480)
2021.05.0
---------
- optimize ``info``,``exists`` (and related) calls for non-version aware mode
- copy with entries without ETag (#480)
- find not to corrupts parent listing (#476)
- short listing to determine directory (#472, 471)
Version 2021.04.0
-----------------
- switch to calver and fsspec pin
- py36 (#462)
- async fixes (#456, 452)
Version 0.6.0
-------------
- update for fsspec 0.9.0 (#448)
- better errors (#443)
- cp to preserve ETAG (#441)
- CI (#435, #427, #395)
- 5GB PUT (#425)
- partial cat (#389)
- direct find (#360)
Version 0.5.0
-------------
- Asynchronous filesystem based on ``aiobotocore``
Version 0.4.0
-------------
- New instances no longer need reconnect (:pr:`244`) by `Martin Durant`_
- Always use multipart uploads when not autocommitting (:pr:`243`) by `Marius van Niekerk`_
- Create ``CONTRIBUTING.md`` (:pr:`248`) by `Jacob Tomlinson`_
- Use autofunction for ``S3Map`` sphinx autosummary (:pr:`251`) by `James Bourbeau`_
- Miscellaneous doc updates (:pr:`252`) by `James Bourbeau`_
- Support for Python 3.8 (:pr:`264`) by `Tom Augspurger`_
- Improved performance for ``isdir`` (:pr:`259`) by `Nate Yoder`_
- Increased the minimum required version of fsspec to 0.6.0
.. _`Martin Durant`: https://github.com/martindurant
.. _`Marius van Niekerk`: https://github.com/mariusvniekerk
.. _`Jacob Tomlinson`: https://github.com/jacobtomlinson
.. _`James Bourbeau`: https://github.com/jrbourbeau
.. _`Tom Augspurger`: https://github.com/TomAugspurger
.. _`Nate Yoder`: https://github.com/nateyoder
s3fs-2026.2.0/docs/source/code-of-conduct.rst 0000664 0000000 0000000 00000012636 15141211055 0020557 0 ustar 00root root 0000000 0000000 Code of Conduct
===============
All participants in the fsspec community are expected to adhere to a Code of Conduct.
As contributors and maintainers of this project, and in the interest of
fostering an open and welcoming community, we pledge to respect all people who
contribute through reporting issues, posting feature requests, updating
documentation, submitting pull requests or patches, and other activities.
We are committed to making participation in this project a harassment-free
experience for everyone, treating everyone as unique humans deserving of
respect.
Examples of unacceptable behaviour by participants include:
- The use of sexualized language or imagery
- Personal attacks
- Trolling or insulting/derogatory comments
- Public or private harassment
- Publishing other's private information, such as physical or electronic
addresses, without explicit permission
- Other unethical or unprofessional conduct
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviours that they deem inappropriate,
threatening, offensive, or harmful.
By adopting this Code of Conduct, project maintainers commit themselves
to fairly and consistently applying these principles to every aspect of
managing this project. Project maintainers who do not follow or enforce
the Code of Conduct may be permanently removed from the project team.
This code of conduct applies both within project spaces and in public
spaces when an individual is representing the project or its community.
If you feel the code of conduct has been violated, please report the
incident to the fsspec core team.
Reporting
---------
If you believe someone is violating theCode of Conduct we ask that you report it
to the Project by emailing community@anaconda.com. All reports will be kept
confidential. In some cases we may determine that a public statement will need
to be made. If that's the case, the identities of all victims and reporters
will remain confidential unless those individuals instruct us otherwise.
If you believe anyone is in physical danger, please notify appropriate law
enforcement first.
In your report please include:
- Your contact info
- Names (real, nicknames, or pseudonyms) of any individuals involved.
If there were other witnesses besides you, please try to include them as well.
- When and where the incident occurred. Please be as specific as possible.
- Your account of what occurred. If there is a publicly available record
please include a link.
- Any extra context you believe existed for the incident.
- If you believe this incident is ongoing.
- If you believe any member of the core team has a conflict of interest
in adjudicating the incident.
- What, if any, corrective response you believe would be appropriate.
- Any other information you believe we should have.
Core team members are obligated to maintain confidentiality with regard
to the reporter and details of an incident.
What happens next?
~~~~~~~~~~~~~~~~~~
You will receive an email acknowledging receipt of your complaint.
The core team will immediately meet to review the incident and determine:
- What happened.
- Whether this event constitutes a code of conduct violation.
- Who the bad actor was.
- Whether this is an ongoing situation, or if there is a threat to anyone's
physical safety.
- If this is determined to be an ongoing incident or a threat to physical safety,
the working groups' immediate priority will be to protect everyone involved.
If a member of the core team is one of the named parties, they will not be
included in any discussions, and will not be provided with any confidential
details from the reporter.
If anyone on the core team believes they have a conflict of interest in
adjudicating on a reported issue, they will inform the other core team
members, and exempt themselves from any discussion about the issue.
Following this declaration, they will not be provided with any confidential
details from the reporter.
Once the working group has a complete account of the events they will make a
decision as to how to response. Responses may include:
- Nothing (if we determine no violation occurred).
- A private reprimand from the working group to the individual(s) involved.
- A public reprimand.
- An imposed vacation
- A permanent or temporary ban from some or all spaces (GitHub repositories, etc.)
- A request for a public or private apology.
We'll respond within one week to the person who filed the report with either a
resolution or an explanation of why the situation is not yet resolved.
Once we've determined our final action, we'll contact the original reporter
to let them know what action (if any) we'll be taking. We'll take into account
feedback from the reporter on the appropriateness of our response, but we
don't guarantee we'll act on it.
Acknowledgement
---------------
This CoC is modified from the one by `BeeWare`_, which in turn refers to
the `Contributor Covenant`_ and the `Django`_ project.
.. _BeeWare: https://beeware.org/community/behavior/code-of-conduct/
.. _Contributor Covenant: https://www.contributor-covenant.org/version/1/3/0/code-of-conduct/
.. _Django: https://www.djangoproject.com/conduct/reporting/
.. raw:: html
s3fs-2026.2.0/docs/source/conf.py 0000664 0000000 0000000 00000021726 15141211055 0016353 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python3
#
# S3Fs documentation build configuration file, originally created by
# sphinx-quickstart on Mon Mar 21 15:20:01 2016.
#
# This file is execfile()d with the current directory set to its containing
# dir.  Only values that differ from the Sphinx defaults are set here; see
# the Sphinx configuration reference for everything else that could be
# overridden.

import os

# -- General configuration ------------------------------------------------

# Sphinx extension modules, both bundled ('sphinx.ext.*') and custom.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.todo',
    'sphinx.ext.ifconfig',
    'sphinx.ext.viewcode',
    'sphinx.ext.autosummary',
    'sphinx.ext.extlinks',
    'sphinx.ext.napoleon',
]

# Paths that contain templates, relative to this directory.
templates_path = ['_templates']

# The suffix of source filenames.
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'S3Fs'
copyright = '2016, Continuum Analytics'
author = 'Continuum Analytics'

# The version info for the project being documented: the short X.Y version
# and the full release string (identical here, both taken from the installed
# package so the docs always match the code).
import s3fs

version = s3fs.__version__
release = version

# Patterns, relative to the source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False

# Shorthand roles for external links, e.g. :pr:`123`.
extlinks = {
    "pr": ("https://github.com/fsspec/s3fs/pull/%s", "PR #%s"),
}

# -- Options for HTML output ----------------------------------------------

html_theme = 'sphinx_rtd_theme'

# Paths that contain custom static files (such as style sheets), relative to
# this directory.  They are copied after the builtin static files, so a file
# named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom CSS file to override the Read the Docs default CSS.
# Contains a workaround for issue #790.
html_css_files = ["custom.css"]

# Output file base name for the HTML help builder.
htmlhelp_basename = 'S3Fsdoc'

# -- Options for LaTeX output ---------------------------------------------

# All LaTeX knobs (paper size, point size, preamble, figure alignment) are
# left at their Sphinx defaults.
latex_elements = {
}

# Grouping the document tree into LaTeX files. List of tuples:
# (source start file, target name, title, author,
#  documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'S3Fs.tex', 'S3Fs Documentation',
     'Continuum Analytics', 'manual'),
]

# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples:
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 's3fs', 'S3Fs Documentation',
     [author], 1)
]

# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples:
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'S3Fs', 'S3Fs Documentation',
     author, 'S3Fs', 'One line description of project.',
     'Miscellaneous'),
]
s3fs-2026.2.0/docs/source/development.rst 0000664 0000000 0000000 00000000227 15141211055 0020121 0 ustar 00root root 0000000 0000000 Development
===========
Create a development environment::
$ pip install -r requirements.txt -r test_requirements.txt
Run tests::
$ pytest
s3fs-2026.2.0/docs/source/index.rst 0000664 0000000 0000000 00000033422 15141211055 0016711 0 ustar 00root root 0000000 0000000 S3Fs
====
S3Fs is a Pythonic file interface to S3. It builds on top of botocore_. The project is hosted on `GitHub `_ |github_stars|
.. |github_stars| image:: https://img.shields.io/github/stars/fsspec/s3fs?style=social
:target: https://github.com/fsspec/s3fs
:alt: GitHub Repository
The top-level class :py:class:`.S3FileSystem` holds connection information and allows
typical file-system style operations like ``cp``, ``mv``, ``ls``, ``du``,
``glob``, etc., as well as put/get of local files to/from S3.
The connection can be anonymous - in which case only publicly-available,
read-only buckets are accessible - or via credentials explicitly supplied
or in configuration files.
Calling ``open()`` on a :py:class:`.S3FileSystem` (typically using a context manager)
provides an :py:class:`.S3File` for read or write access to a particular key. The object
emulates the standard ``File`` protocol (``read``, ``write``, ``tell``,
``seek``), such that functions expecting a file can access S3. Only binary read
and write modes are implemented, with blocked caching.
S3Fs uses and is based upon `fsspec`_.
.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
Examples
--------
Simple locate and read a file:
.. code-block:: python
>>> import s3fs
>>> s3 = s3fs.S3FileSystem(anon=True)
>>> s3.ls('my-bucket')
['my-file.txt']
>>> with s3.open('my-bucket/my-file.txt', 'rb') as f:
... print(f.read())
b'Hello, world'
(see also ``walk`` and ``glob``)
Reading with delimited blocks:
.. code-block:: python
>>> s3.read_block(path, offset=1000, length=10, delimiter=b'\n')
b'A whole line of text\n'
Writing with blocked caching:
.. code-block:: python
>>> s3 = s3fs.S3FileSystem(anon=False) # uses default credentials
>>> with s3.open('mybucket/new-file', 'wb') as f:
... f.write(2*2**20 * b'a')
... f.write(2*2**20 * b'a') # data is flushed and file closed
>>> s3.du('mybucket/new-file')
{'mybucket/new-file': 4194304}
Because S3Fs faithfully copies the Python file interface it can be used
smoothly with other projects that consume the file interface like ``gzip`` or
``pandas``.
.. code-block:: python
>>> with s3.open('mybucket/my-file.csv.gz', 'rb') as f:
... g = gzip.GzipFile(fileobj=f) # Decompress data with gzip
... df = pd.read_csv(g) # Read CSV file with Pandas
Integration
-----------
The libraries ``intake``, ``pandas`` and ``dask`` accept URLs with the prefix
"s3://", and will use s3fs to complete the IO operation in question. The
IO functions take an argument ``storage_options``, which will be passed
to :py:class:`.S3FileSystem`, for example:
.. code-block:: python
df = pd.read_excel("s3://bucket/path/file.xls",
storage_options={"anon": True})
This gives the chance to pass any credentials or other necessary
arguments needed to s3fs.
Async
-----
``s3fs`` is implemented using ``aiobotocore``, and offers async functionality.
A number of methods of :py:class:`.S3FileSystem` are ``async``, and for each of these,
there is also a synchronous version with the same name and lack of a ``_``
prefix.
If you wish to call ``s3fs`` from async code, then you should pass
``asynchronous=True, loop=`` to the constructor (the latter is optional,
if you wish to use both async and sync methods). You must also explicitly
await the client creation before making any S3 call.
.. code-block:: python
async def run_program():
s3 = S3FileSystem(..., asynchronous=True)
session = await s3.set_session()
... # perform work
await session.close()
asyncio.run(run_program()) # or call from your async code
Concurrent async operations are also used internally for bulk operations
such as ``pipe/cat``, ``get/put``, ``cp/mv/rm``. The async calls are
hidden behind a synchronisation layer, so are designed to be called
from normal code. If you are *not*
using async-style programming, you do not need to know about how this
works, but you might find the implementation interesting.
Multiprocessing
---------------
When using Python's `multiprocessing`_, the start method must be set to either
``spawn`` or ``forkserver``. ``fork`` is not safe to use because of the open sockets
and async thread used by s3fs, and may lead to
hard-to-find bugs and occasional deadlocks. Read more about the available
`start methods`_.
.. _multiprocessing: https://docs.python.org/3/library/multiprocessing.html
.. _start methods: https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
Limitations
-----------
This project is meant for convenience, rather than feature completeness.
The following are known current omissions:
- file access is always binary (although ``readline`` and iterating by line
are possible)
- no permissions/access-control (i.e., no ``chmod``/``chown`` methods)
Logging
-------
The logger named ``s3fs`` provides information about the operations of the file
system. To quickly see all messages, you can set the environment variable
``S3FS_LOGGING_LEVEL=DEBUG``. The presence of this environment variable will
install a handler for the logger that prints messages to stderr and set the log
level to the given value. More advanced logging configuration is possible using
Python's standard `logging framework`_.
.. _logging framework: https://docs.python.org/3/library/logging.html
Errors
------
The ``s3fs`` library includes a built-in mechanism to automatically retry
operations when specific transient errors occur. You can customize this behavior
by adding specific exception types or defining complex logic via custom handlers.
Default Retryable Errors
~~~~~~~~~~~~~~~~~~~~~~~~
By default, ``s3fs`` will retry the following exception types:
- ``socket.timeout``
- ``HTTPClientError``
- ``IncompleteRead``
- ``FSTimeoutError``
- ``ResponseParserError``
- ``aiohttp.ClientPayloadError`` (if available)
Registering Custom Error Types
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To include additional exception types in the default retry logic, use the
``add_retryable_error`` function. This is useful for simple type-based retries.
.. code-block:: python
>>> class MyCustomError(Exception):
pass
>>> s3fs.add_retryable_error(MyCustomError)
Implementing Custom Error Handlers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For more complex scenarios, such as retrying based on an error message rather than
just the type, you can register a custom error handler using ``set_custom_error_handler``.
The handler should be a callable that accepts an exception instance and returns ``True``
if the error should be retried, or ``False`` otherwise.
.. code-block:: python
>>> def my_handler(e):
return isinstance(e, MyCustomError) and "some condition" in str(e)
>>> s3fs.set_custom_error_handler(my_handler)
Handling AWS ClientErrors
~~~~~~~~~~~~~~~~~~~~~~~~~
``s3fs`` provides specialized handling for ``botocore.exceptions.ClientError``.
While ``s3fs`` checks these against internal patterns (like throttling),
you can extend this behavior using a custom handler. Note that the internal
patterns will still be checked and handled before the custom handler.
.. code-block:: python
>>> def another_handler(e):
return isinstance(e, ClientError) and "Throttling" in str(e)
>>> s3fs.set_custom_error_handler(another_handler)
Credentials
-----------
The AWS key and secret may be provided explicitly when creating an :py:class:`.S3FileSystem`.
A more secure way, not including the credentials directly in code, is to allow
boto to establish the credentials automatically. Boto will try the following
methods, in order:
- ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY``, and ``AWS_SESSION_TOKEN``
environment variables
- configuration files such as ``~/.aws/credentials``
- for nodes on EC2, the IAM metadata provider
You can specify a profile using ``s3fs.S3FileSystem(profile='PROFILE')``.
Otherwise ``s3fs`` will use authentication via `boto environment variables`_.
.. _boto environment variables: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables
In a distributed environment, it is not expected that raw credentials should
be passed between machines. In the explicitly provided credentials case, the
method :py:meth:`.S3FileSystem.get_delegated_s3pars` can be used to obtain temporary credentials.
When not using explicit credentials, it should be expected that every machine
also has the appropriate environment variables, config files or IAM roles
available.
If none of the credential methods are available, only anonymous access will
work, and ``anon=True`` must be passed to the constructor.
Furthermore, :py:meth:`.S3FileSystem.current` will return the most-recently created
instance, so this method could be used in preference to the constructor in
cases where the code must be agnostic of the credentials/config used.
S3 Compatible Storage
---------------------
To use ``s3fs`` against an S3 compatible storage, like `MinIO`_ or
`Ceph Object Gateway`_, you'll probably need to pass extra parameters when
creating the ``s3fs`` filesystem. Here are some sample configurations:
For a self-hosted MinIO instance:
.. code-block:: python
# When relying on auto discovery for credentials
>>> s3 = s3fs.S3FileSystem(
anon=False,
endpoint_url='https://...'
)
# Or passing the credentials directly
>>> s3 = s3fs.S3FileSystem(
key='miniokey...',
secret='asecretkey...',
endpoint_url='https://...'
)
It is also possible to set credentials through environment variables:
.. code-block:: python
# export FSSPEC_S3_ENDPOINT_URL=https://...
# export FSSPEC_S3_KEY='miniokey...'
# export FSSPEC_S3_SECRET='asecretkey...'
>>> s3 = s3fs.S3FileSystem()
# or ...
>>> f = fsspec.open("s3://minio-bucket/...")
For Storj DCS via the `S3-compatible Gateway `_:
.. code-block:: python
# When relying on auto discovery for credentials
>>> s3 = s3fs.S3FileSystem(
anon=False,
endpoint_url='https://gateway.storjshare.io'
)
# Or passing the credentials directly
>>> s3 = s3fs.S3FileSystem(
key='accesskey...',
secret='asecretkey...',
endpoint_url='https://gateway.storjshare.io'
)
For a Scaleway s3-compatible storage in the ``fr-par`` zone:
.. code-block:: python
>>> s3 = s3fs.S3FileSystem(
key='scaleway-api-key...',
secret='scaleway-secretkey...',
endpoint_url='https://s3.fr-par.scw.cloud',
client_kwargs={
'region_name': 'fr-par'
}
)
For an OVH s3-compatible storage in the ``GRA`` zone:
.. code-block:: python
>>> s3 = s3fs.S3FileSystem(
key='ovh-s3-key...',
secret='ovh-s3-secretkey...',
endpoint_url='https://s3.GRA.cloud.ovh.net',
client_kwargs={
'region_name': 'GRA'
},
config_kwargs={
'signature_version': 's3v4'
}
)
.. _MinIO: https://min.io
.. _Ceph Object Gateway: https://docs.ceph.com/docs/master/radosgw/
Requester Pays Buckets
----------------------
Some buckets, such as the `arXiv raw data
`__, are configured so that the
requester of the data pays any transfer fees. You must be
authenticated to access these buckets and (because these charges may be
unexpected) Amazon requires an additional key on many of the API
calls. To enable ``RequesterPays`` create your file system as
.. code-block:: python
>>> s3 = s3fs.S3FileSystem(anon=False, requester_pays=True)
Serverside Encryption
---------------------
For some buckets/files you may want to use some of s3's server side encryption
features. ``s3fs`` supports these in a few ways
.. code-block:: python
>>> s3 = s3fs.S3FileSystem(
... s3_additional_kwargs={'ServerSideEncryption': 'AES256'})
This will create an s3 filesystem instance that will append the
ServerSideEncryption argument to all s3 calls (where applicable).
The same applies for ``s3.open``. Most of the methods on the filesystem object
will also accept and forward keyword arguments to the underlying calls. The
most recently specified argument is applied last in the case where both
``s3_additional_kwargs`` and a method's ``**kwargs`` are used.
The ``s3.utils.SSEParams`` provides some convenient helpers for the serverside
encryption parameters in particular. An instance can be passed instead of a
regular python dictionary as the ``s3_additional_kwargs`` parameter.
Bucket Version Awareness
------------------------
If your bucket has object versioning enabled then you can add version-aware support
to ``s3fs``. This ensures that if a file is opened at a particular point in time that
version will be used for reading.
This mitigates the issue where more than one user is concurrently reading and writing
to the same object.
.. code-block:: python
>>> s3 = s3fs.S3FileSystem(version_aware=True)
# Open the file at the latest version
>>> fo = s3.open('versioned_bucket/object')
>>> versions = s3.object_version_info('versioned_bucket/object')
# Open the file at a particular version
>>> fo_old_version = s3.open('versioned_bucket/object', version_id='SOMEVERSIONID')
In order for this to function the user must have the necessary IAM permissions to perform
a ``GetObjectVersion`` operation.
Contents
========
.. toctree::
install
development
api
changelog
code-of-conduct
:maxdepth: 2
.. _botocore: https://botocore.readthedocs.io/en/latest/
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
These docs pages collect anonymous tracking data using goatcounter, and the
dashboard is available to the public: https://s3fs.goatcounter.com/ .
.. raw:: html
s3fs-2026.2.0/docs/source/install.rst 0000664 0000000 0000000 00000001022 15141211055 0017237 0 ustar 00root root 0000000 0000000 Installation
============
Conda
-----
The ``s3fs`` library and its dependencies can be installed from the
`conda-forge `_ repository using
`conda `_::
$ conda install s3fs -c conda-forge
PyPI
----
You can install ``s3fs`` with pip::
pip install s3fs
Install from source
-------------------
You can also download the ``s3fs`` library from Github and install normally::
git clone git@github.com:fsspec/s3fs
cd s3fs
python setup.py install
s3fs-2026.2.0/pytest.ini 0000664 0000000 0000000 00000000032 15141211055 0014640 0 ustar 00root root 0000000 0000000 [pytest]
testpaths = s3fs
s3fs-2026.2.0/release-procedure.md 0000664 0000000 0000000 00000001061 15141211055 0016542 0 ustar 00root root 0000000 0000000 1. Verify tests on Linux, OS-X, and Windows
2. Complete entries in `docs/source/changelog.rst`.
There's no need for changing version numbers in source files.
The release version will be determined from the git tag (see below).
3. Tag the commit
git tag 1.2.3 -m "Version 1.2.3"
4. Push new version bump commit and tag to github
git push fsspec main --tags
5. Build source and wheel packages
rm -rf dist/
python setup.py sdist bdist_wheel --universal
6. Upload packages to PyPI
twine upload dist/*
s3fs-2026.2.0/requirements.txt 0000664 0000000 0000000 00000000110 15141211055 0016070 0 ustar 00root root 0000000 0000000 aiobotocore>=2.19.0,<4.0.0
fsspec==2026.2.0
aiohttp!=4.0.0a0, !=4.0.0a1
s3fs-2026.2.0/s3fs/ 0000775 0000000 0000000 00000000000 15141211055 0013472 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/s3fs/__init__.py 0000664 0000000 0000000 00000000317 15141211055 0015604 0 ustar 00root root 0000000 0000000 from .core import S3FileSystem, S3File, add_retryable_error, set_custom_error_handler
from .mapping import S3Map
from ._version import get_versions

# Resolve the package version from versioneer metadata, then delete the
# helper so it does not leak into the public ``s3fs`` namespace.
__version__ = get_versions()["version"]
del get_versions
s3fs-2026.2.0/s3fs/_version.py 0000664 0000000 0000000 00000060117 15141211055 0015675 0 ustar 00root root 0000000 0000000 # This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
# This file is released into the public domain.
# Generated by versioneer-0.29
# https://github.com/python-versioneer/python-versioneer
"""Git implementation of _version.py."""
import errno
import os
import re
import subprocess
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple
import functools
def get_keywords() -> Dict[str, str]:
    """Return the keywords needed to look up the version information."""
    # These literal values are substituted by git during git-archive.
    # setup.py/versioneer.py greps for the variable names, so each
    # assignment must remain on a line of its own with its name unchanged;
    # _version.py will just call get_keywords().
    git_refnames = " (HEAD -> main, tag: 2026.2.0)"
    git_full = "1181d335955418f081a1d0b94c3d8350cea0751f"
    git_date = "2026-02-05 16:57:01 -0500"
    return {
        "refnames": git_refnames,
        "full": git_full,
        "date": git_date,
    }
class VersioneerConfig:
    """Container for Versioneer configuration parameters."""

    # Version-control system name; "git" for this project (see get_config).
    VCS: str
    # Version-string rendering style, e.g. "pep440".
    style: str
    # Prefix expected on release tags; empty here, so tags are bare
    # version numbers such as "2026.2.0".
    tag_prefix: str
    # Directory-name prefix of an unpacked source tarball, used as a
    # fallback when no VCS metadata is available.
    parentdir_prefix: str
    # Path of the generated _version.py relative to the project root.
    versionfile_source: str
    # Whether the lookup helpers should print diagnostic output.
    verbose: bool
def get_config() -> VersioneerConfig:
    """Create, populate and return the VersioneerConfig() object."""
    # These values were baked in when 'setup.py versioneer' generated
    # _version.py.
    settings = {
        "VCS": "git",
        "style": "pep440",
        "tag_prefix": "",
        "parentdir_prefix": "None",
        "versionfile_source": "s3fs/_version.py",
        "verbose": False,
    }
    config = VersioneerConfig()
    for attr, value in settings.items():
        setattr(config, attr, value)
    return config
class NotThisMethod(Exception):
    """Signal that a version-discovery strategy does not apply here.

    Raised by the individual lookup helpers so the caller can fall
    through to the next strategy.
    """
# Template store for generated _version.py text (unused in this short form).
LONG_VERSION_PY: Dict[str, str] = {}
# Registry of lookup functions, keyed by VCS name then method name.
HANDLERS: Dict[str, Dict[str, Callable]] = {}


def register_vcs_handler(vcs: str, method: str) -> Callable:  # decorator
    """Return a decorator that files a function under HANDLERS[vcs][method]."""

    def decorate(f: Callable) -> Callable:
        """Record f in the handler registry and hand it back unchanged."""
        HANDLERS.setdefault(vcs, {})[method] = f
        return f

    return decorate
def run_command(
    commands: List[str],
    args: List[str],
    cwd: Optional[str] = None,
    verbose: bool = False,
    hide_stderr: bool = False,
    env: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[str], Optional[int]]:
    """Try each candidate executable in turn, running the first that exists.

    Returns ``(stdout, returncode)`` for the first command that could be
    launched, or ``(None, None)`` when none of the candidates could be
    started (or launching failed for a reason other than "not found").
    """
    assert isinstance(commands, list)
    popen_kwargs: Dict[str, Any] = {}
    if sys.platform == "win32":
        # Hide the console window that would otherwise appear under
        # pythonw.exe.
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        popen_kwargs["startupinfo"] = startupinfo

    process = None
    for candidate in commands:
        dispcmd = str([candidate] + args)
        try:
            # shell=False, so on Windows the caller must pass git.cmd /
            # git.exe explicitly rather than bare "git".
            process = subprocess.Popen(
                [candidate] + args,
                cwd=cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=(subprocess.PIPE if hide_stderr else None),
                **popen_kwargs,
            )
            break
        except OSError as e:
            if e.errno == errno.ENOENT:
                # This candidate is not installed; try the next one.
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(e)
            return None, None
    else:
        # Loop exhausted without a successful Popen.
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None

    stdout = process.communicate()[0].strip().decode()
    if process.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, process.returncode
    return stdout, process.returncode
def versions_from_parentdir(
    parentdir_prefix: str,
    root: str,
    verbose: bool,
) -> Dict[str, Any]:
    """Derive the version from the name of a parent directory.

    Source tarballs conventionally unpack into a directory that combines
    the project name and a version string; this checks ``root`` and up to
    two directory levels above it for such a name.

    Raises NotThisMethod when no suitably named ancestor exists.
    """
    tried = []
    level = root
    for _ in range(3):
        name = os.path.basename(level)
        if name.startswith(parentdir_prefix):
            return {
                "version": name[len(parentdir_prefix):],
                "full-revisionid": None,
                "dirty": False,
                "error": None,
                "date": None,
            }
        tried.append(level)
        level = os.path.dirname(level)  # move up one level
    if verbose:
        print(
            "Tried directories %s but none started with prefix %s"
            % (str(tried), parentdir_prefix)
        )
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
    """Extract the git-archive keyword values from the given file.

    The code embedded in _version.py can just evaluate those variables,
    but when used from setup.py we must not import _version.py, so the
    assignments are scraped with a regexp instead.  This function is not
    used from _version.py itself.
    """
    keywords: Dict[str, str] = {}
    # Maps "start of the assignment line" -> key under which to store it.
    wanted = {
        "git_refnames =": "refnames",
        "git_full =": "full",
        "git_date =": "date",
    }
    try:
        with open(versionfile_abs, "r") as fobj:
            for line in fobj:
                stripped = line.strip()
                for prefix, key in wanted.items():
                    if stripped.startswith(prefix):
                        mo = re.search(r'=\s*"(.*)"', line)
                        if mo:
                            keywords[key] = mo.group(1)
    except OSError:
        # Missing or unreadable file: return whatever was gathered so far
        # (typically nothing); the caller treats absent keys as failure.
        pass
    return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(
    keywords: Dict[str, str],
    tag_prefix: str,
    verbose: bool,
) -> Dict[str, Any]:
    """Get version information from git keywords.

    ``keywords`` holds the expanded git-archive substitutions gathered by
    get_keywords()/git_get_keywords(); ``tag_prefix`` is stripped from tag
    names before they are treated as version numbers.  Returns the standard
    versioneer pieces dict.  Raises NotThisMethod when the keywords are
    missing or were never expanded by git-archive.
    """
    if "refnames" not in keywords:
        raise NotThisMethod("Short version file found")
    date = keywords.get("date")
    if date is not None:
        # Use only the last line.  Previous lines may contain GPG signature
        # information.
        date = date.splitlines()[-1]

        # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
        # datestamp. However we prefer "%ci" (which expands to an "ISO-8601
        # -like" string, which we must then edit to make compliant), because
        # it's been around since git-1.5.3, and it's too difficult to
        # discover which version we're using, or to work around using an
        # older one.
        date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    refnames = keywords["refnames"].strip()
    if refnames.startswith("$Format"):
        # Unexpanded substitution marker: this is a plain checkout, not a
        # git-archive tarball, so these keywords carry no information.
        if verbose:
            print("keywords are unexpanded, not using")
        raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
    refs = {r.strip() for r in refnames.strip("()").split(",")}
    # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
    # just "foo-1.0". If we see a "tag: " prefix, prefer those.
    TAG = "tag: "
    tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
    if not tags:
        # Either we're using git < 1.8.3, or there really are no tags. We use
        # a heuristic: assume all version tags have a digit. The old git %d
        # expansion behaves like git log --decorate=short and strips out the
        # refs/heads/ and refs/tags/ prefixes that would let us distinguish
        # between branches and tags. By ignoring refnames without digits, we
        # filter out many common branch names like "release" and
        # "stabilization", as well as "HEAD" and "master".
        tags = {r for r in refs if re.search(r"\d", r)}
        if verbose:
            print("discarding '%s', no digits" % ",".join(refs - tags))
    if verbose:
        print("likely tags: %s" % ",".join(sorted(tags)))
    for ref in sorted(tags):
        # sorting will prefer e.g. "2.0" over "2.0rc1"
        if ref.startswith(tag_prefix):
            r = ref[len(tag_prefix):]
            # Filter out refs that exactly match prefix or that don't start
            # with a number once the prefix is stripped (mostly a concern
            # when prefix is '')
            if not re.match(r"\d", r):
                continue
            if verbose:
                print("picking %s" % r)
            return {
                "version": r,
                "full-revisionid": keywords["full"].strip(),
                "dirty": False,
                "error": None,
                "date": date,
            }
    # no suitable tags, so version is "0+unknown", but full hex is still there
    if verbose:
        print("no suitable tags, using unknown + full revision id")
    return {
        "version": "0+unknown",
        "full-revisionid": keywords["full"].strip(),
        "dirty": False,
        "error": "no suitable tags",
        "date": None,
    }
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(
    tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command
) -> Dict[str, Any]:
    """Get version from 'git describe' in the root of the source tree.
    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Returns a "pieces" dict (keys: long, short, branch, closest-tag,
    distance, dirty, date, error) that the render_* helpers turn into a
    version string. Raises NotThisMethod when git is unusable here.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    # GIT_DIR can interfere with correct operation of Versioneer.
    # It may be intended to be passed to the Versioneer-versioned project,
    # but that should not change where we get our version from.
    env = os.environ.copy()
    env.pop("GIT_DIR", None)
    runner = functools.partial(runner, env=env)
    _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
        raise NotThisMethod("'git rev-parse --git-dir' returned error")
    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
    describe_out, rc = runner(
        GITS,
        [
            "describe",
            "--tags",
            "--dirty",
            "--always",
            "--long",
            "--match",
            f"{tag_prefix}[[:digit:]]*",
        ],
        cwd=root,
    )
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
    full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()
    pieces: Dict[str, Any] = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None
    branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root)
    # --abbrev-ref was added in git-1.6.3
    if rc != 0 or branch_name is None:
        raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
    branch_name = branch_name.strip()
    if branch_name == "HEAD":
        # If we aren't exactly on a branch, pick a branch which represents
        # the current commit. If all else fails, we are on a branchless
        # commit.
        branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
        # --contains was added in git-1.5.4
        if rc != 0 or branches is None:
            raise NotThisMethod("'git branch --contains' returned error")
        branches = branches.split("\n")
        # Remove the first line if we're running detached
        if "(" in branches[0]:
            branches.pop(0)
        # Strip off the leading "* " from the list of branches.
        branches = [branch[2:] for branch in branches]
        if "master" in branches:
            branch_name = "master"
        elif not branches:
            branch_name = None
        else:
            # Pick the first branch that is returned. Good or bad.
            branch_name = branches[0]
    pieces["branch"] = branch_name
    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out
    # look for -dirty suffix
    dirty = git_describe.endswith("-dirty")
    pieces["dirty"] = dirty
    if dirty:
        git_describe = git_describe[: git_describe.rindex("-dirty")]
    # now we have TAG-NUM-gHEX or HEX
    if "-" in git_describe:
        # TAG-NUM-gHEX
        mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
        if not mo:
            # unparsable. Maybe git-describe is misbehaving?
            pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
            return pieces
        # tag
        full_tag = mo.group(1)
        if not full_tag.startswith(tag_prefix):
            if verbose:
                fmt = "tag '%s' doesn't start with prefix '%s'"
                print(fmt % (full_tag, tag_prefix))
            pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
                full_tag,
                tag_prefix,
            )
            return pieces
        pieces["closest-tag"] = full_tag[len(tag_prefix) :]
        # distance: number of commits since tag
        pieces["distance"] = int(mo.group(2))
        # commit: short hex revision ID
        pieces["short"] = mo.group(3)
    else:
        # HEX: no tags
        pieces["closest-tag"] = None
        out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
        pieces["distance"] = len(out.split())  # total number of commits
    # commit date: see ISO-8601 comment in git_versions_from_keywords()
    date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
    # Use only the last line. Previous lines may contain GPG signature
    # information.
    date = date.splitlines()[-1]
    pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    return pieces
def plus_or_dot(pieces: Dict[str, Any]) -> str:
    """Return a + if we don't already have one, else return a .

    Note: ``closest-tag`` may be present but ``None`` (no tags found), in
    which case ``pieces.get("closest-tag", "")`` would return ``None`` and
    ``"+" in None`` would raise TypeError; ``or ""`` guards against that.
    """
    if "+" in (pieces.get("closest-tag") or ""):
        return "."
    return "+"
def render_pep440(pieces: Dict[str, Any]) -> str:
    """Build up version string, with post-release "local version identifier".
    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    dirty_mark = ".dirty" if pieces["dirty"] else ""
    if not tag:
        # exception #1: no tag reachable at all
        return "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + dirty_mark
    if not (pieces["distance"] or pieces["dirty"]):
        # clean build exactly on the tag
        return tag
    local = "%d.g%s" % (pieces["distance"], pieces["short"])
    return tag + plus_or_dot(pieces) + local + dirty_mark
def render_pep440_branch(pieces: Dict[str, Any]) -> str:
    """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
    The ".dev0" means not master branch. Note that .dev0 sorts backwards
    (a feature branch will appear "older" than the master branch).
    Exceptions:
    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    off_master = pieces["branch"] != "master"
    dirty_mark = ".dirty" if pieces["dirty"] else ""
    if tag:
        if not (pieces["distance"] or pieces["dirty"]):
            # clean build exactly on the tag
            return tag
        dev = ".dev0" if off_master else ""
        local = "%d.g%s" % (pieces["distance"], pieces["short"])
        return tag + dev + plus_or_dot(pieces) + local + dirty_mark
    # exception #1: no tags at all
    base = "0.dev0" if off_master else "0"
    return base + "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + dirty_mark
def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
    """Split pep440 version string at the post-release segment.

    Returns the release segments before the post-release and the
    post-release version number, or ``None`` if no post-release segment is
    present. (The previous docstring claimed ``-1`` for the missing case;
    the code has always returned ``None``.) A bare ``.post`` with no number
    counts as post-release 0.
    """
    parts = ver.split(".post")
    if len(parts) == 2:
        return parts[0], int(parts[1] or 0)
    # No ".post" segment (or a pathological repeated one): no post number.
    return parts[0], None
def render_pep440_pre(pieces: Dict[str, Any]) -> str:
    """TAG[.postN.devDISTANCE] -- No -dirty.
    Exceptions:
    1: no tags. 0.post0.devDISTANCE
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        return "0.post0.dev%d" % pieces["distance"]
    if not pieces["distance"]:
        # no commits since the tag: the tag is the version
        return tag
    # bump (or start) the post-release segment and attach a dev segment
    tag_version, post_version = pep440_split_post(tag)
    if post_version is None:
        return tag_version + ".post0.dev%d" % pieces["distance"]
    return tag_version + ".post%d.dev%d" % (post_version + 1, pieces["distance"])
def render_pep440_post(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX] .
    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.
    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    dev = ".dev0" if pieces["dirty"] else ""
    if tag:
        if not (pieces["distance"] or pieces["dirty"]):
            # clean build exactly on the tag
            return tag
        out = [tag, ".post%d" % pieces["distance"], dev]
        out.append(plus_or_dot(pieces))
        out.append("g%s" % pieces["short"])
        return "".join(out)
    # exception #1
    return "0.post%d" % pieces["distance"] + dev + "+g%s" % pieces["short"]
def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
    The ".dev0" means not master branch.
    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    dev = ".dev0" if pieces["branch"] != "master" else ""
    dirty_mark = ".dirty" if pieces["dirty"] else ""
    if tag:
        if not (pieces["distance"] or pieces["dirty"]):
            # clean build exactly on the tag
            return tag
        post = ".post%d" % pieces["distance"]
        return tag + post + dev + plus_or_dot(pieces) + "g%s" % pieces["short"] + dirty_mark
    # exception #1
    return "0.post%d" % pieces["distance"] + dev + "+g%s" % pieces["short"] + dirty_mark
def render_pep440_old(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]] .
    The ".dev0" means dirty.
    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if tag and not (pieces["distance"] or pieces["dirty"]):
        # clean build exactly on the tag
        return tag
    base = tag if tag else "0"  # exception #1 falls back to "0"
    rendered = base + ".post%d" % pieces["distance"]
    if pieces["dirty"]:
        rendered += ".dev0"
    return rendered
def render_git_describe(pieces: Dict[str, Any]) -> str:
    """TAG[-DISTANCE-gHEX][-dirty].
    Like 'git describe --tags --dirty --always'.
    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1: bare short hash
        rendered = pieces["short"]
    elif pieces["distance"]:
        rendered = "%s-%d-g%s" % (tag, pieces["distance"], pieces["short"])
    else:
        rendered = tag
    return rendered + ("-dirty" if pieces["dirty"] else "")
def render_git_describe_long(pieces: Dict[str, Any]) -> str:
    """TAG-DISTANCE-gHEX[-dirty].
    Like 'git describe --tags --dirty --always -long'.
    The distance/hash is unconditional.
    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        rendered = "%s-%d-g%s" % (tag, pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    return rendered + ("-dirty" if pieces["dirty"] else "")
def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
    """Render the given version pieces into the requested style."""
    if pieces["error"]:
        # Unparsable describe output: report the error instead of a version.
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": pieces["error"],
            "date": None,
        }
    if not style or style == "default":
        style = "pep440"  # the default
    # Dispatch table of known styles to their renderers.
    renderers = {
        "pep440": render_pep440,
        "pep440-branch": render_pep440_branch,
        "pep440-pre": render_pep440_pre,
        "pep440-post": render_pep440_post,
        "pep440-post-branch": render_pep440_post_branch,
        "pep440-old": render_pep440_old,
        "git-describe": render_git_describe,
        "git-describe-long": render_git_describe_long,
    }
    if style not in renderers:
        raise ValueError("unknown style '%s'" % style)
    return {
        "version": renderers[style](pieces),
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }
def get_versions() -> Dict[str, Any]:
    """Get version information or return default if unable to do so."""
    # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
    # __file__, we can work backwards from there to the root. Some
    # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
    # case we can only use expanded keywords.
    cfg = get_config()
    verbose = cfg.verbose

    def _unknown(error: str) -> Dict[str, Any]:
        # Common shape for "could not determine a version" results.
        return {
            "version": "0+unknown",
            "full-revisionid": None,
            "dirty": None,
            "error": error,
            "date": None,
        }

    # Strategy 1: expanded git-archive keywords.
    try:
        return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose)
    except NotThisMethod:
        pass
    # Locate the repository root by walking up from this file.
    try:
        root = os.path.realpath(__file__)
        # versionfile_source is the relative path from the top of the source
        # tree (where the .git directory might live) to this file. Invert
        # this to find the root from __file__.
        for _ in cfg.versionfile_source.split("/"):
            root = os.path.dirname(root)
    except NameError:
        # Frozen/embedded interpreters may not define __file__.
        return _unknown("unable to find root of source tree")
    # Strategy 2: ask git directly.
    try:
        return render(git_pieces_from_vcs(cfg.tag_prefix, root, verbose), cfg.style)
    except NotThisMethod:
        pass
    # Strategy 3: infer the version from the parent directory name.
    try:
        if cfg.parentdir_prefix:
            return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
    except NotThisMethod:
        pass
    return _unknown("unable to compute version")
s3fs-2026.2.0/s3fs/core.py 0000664 0000000 0000000 00000272747 15141211055 0015017 0 ustar 00root root 0000000 0000000 import asyncio
import errno
import io
import logging
import math
import mimetypes
import os
import socket
import weakref
import re
from urllib3.exceptions import IncompleteRead
import fsspec # noqa: F401
from fsspec.spec import AbstractBufferedFile
from fsspec.utils import tokenize, setup_logging as setup_logger
from fsspec.asyn import (
AsyncFileSystem,
AbstractAsyncStreamedFile,
sync,
sync_wrapper,
FSTimeoutError,
_run_coros_in_chunks,
)
from fsspec.callbacks import _DEFAULT_CALLBACK
import aiobotocore
import botocore
import aiobotocore.session
from aiobotocore.config import AioConfig
from botocore.exceptions import ClientError, HTTPClientError, ParamValidationError
from botocore.parsers import ResponseParserError
from s3fs.errors import translate_boto_error
from s3fs.utils import S3BucketRegionCache, ParamKwargsHelper, _get_brange, FileExpired
# ClientPayloadError can be thrown during an incomplete read. aiohttp is a dependency of
# aiobotocore, we guard the import here in case this dependency is replaced in a future version
# of aiobotocore.
try:
from aiohttp import ClientPayloadError
except ImportError:
ClientPayloadError = None
# Module-level logger; configured by setup_logging() below.
logger = logging.getLogger("s3fs")
def setup_logging(level=None):
    """Configure the "s3fs" logger at ``level``, falling back to the
    S3FS_LOGGING_LEVEL environment variable when no level is given."""
    chosen = level or os.environ["S3FS_LOGGING_LEVEL"]
    setup_logger(logger=logger, level=chosen)
# Enable logging immediately when the env var is present at import time.
if "S3FS_LOGGING_LEVEL" in os.environ:
    setup_logging()
# Objects at least this large use multipart (managed) copy server-side.
MANAGED_COPY_THRESHOLD = 150 * 2**20
# Certain rate-limiting responses can send invalid XML
# (see https://github.com/fsspec/s3fs/issues/484), which can result in a parser error
# deep within botocore. So we treat those as retryable as well, even though there could
# be some false positives.
S3_RETRYABLE_ERRORS = (
    socket.timeout,
    HTTPClientError,
    IncompleteRead,
    FSTimeoutError,
    ResponseParserError,
)
MAX_UPLOAD_PARTS = 10_000  # maximum number of parts for S3 multipart upload
# aiohttp may abort a streamed body mid-read; retry those too when available.
if ClientPayloadError is not None:
    S3_RETRYABLE_ERRORS += (ClientPayloadError,)
def add_retryable_error(exc):
    """
    Add an exception type to the list of retryable S3 errors.
    Parameters
    ----------
    exc : Exception
        The exception type to add to the retryable errors.
    Examples
    ----------
    >>> class MyCustomError(Exception):  # doctest: +SKIP
    ...     pass  # doctest: +SKIP
    >>> add_retryable_error(MyCustomError)  # doctest: +SKIP
    """
    global S3_RETRYABLE_ERRORS
    # Rebind (rather than mutate) the module-level tuple of retryables.
    S3_RETRYABLE_ERRORS = S3_RETRYABLE_ERRORS + (exc,)
def CUSTOM_ERROR_HANDLER(_):
    """Default retry predicate: never retry.

    Replaced at runtime via ``set_custom_error_handler()``. Defined with
    ``def`` rather than a lambda assignment (PEP 8 E731) so tracebacks and
    reprs carry a useful name.
    """
    return False
def set_custom_error_handler(func):
    """Set a custom error handler function for S3 retryable errors.

    The function should take an exception instance as its only argument,
    and return True if the operation should be retried, or False otherwise.
    This can also be used for custom behavior on `ClientError` exceptions,
    such as retrying other patterns.

    Parameters
    ----------
    func : callable[[Exception], bool]
        The custom error handler function.

    Examples
    ----------
    >>> def my_handler(e):  # doctest: +SKIP
    ...     return isinstance(e, MyCustomError) and "some condition" in str(e)  # doctest: +SKIP
    >>> set_custom_error_handler(my_handler)  # doctest: +SKIP
    >>> def another_handler(e):  # doctest: +SKIP
    ...     return isinstance(e, ClientError) and "Throttling" in str(e)  # doctest: +SKIP
    >>> set_custom_error_handler(another_handler)  # doctest: +SKIP
    """
    # Fixed: the second doctest example previously had a stray trailing
    # quote (`str(e)"`) that made the example a SyntaxError if run.
    global CUSTOM_ERROR_HANDLER
    CUSTOM_ERROR_HANDLER = func
# File modes accepted for opening S3 files.
_VALID_FILE_MODES = {"r", "w", "a", "rb", "wb", "ab"}
# Object metadata/encryption fields carried over to the destination on copies.
_PRESERVE_KWARGS = [
    "CacheControl",
    "ContentDisposition",
    "ContentEncoding",
    "ContentLanguage",
    "ContentLength",
    "ContentType",
    "Expires",
    "WebsiteRedirectLocation",
    "ServerSideEncryption",
    "SSECustomerAlgorithm",
    "SSEKMSKeyId",
    "BucketKeyEnabled",
    "StorageClass",
    "ObjectLockMode",
    "ObjectLockRetainUntilDate",
    "ObjectLockLegalHoldStatus",
    "Metadata",
]
# Canned ACLs S3 accepts for objects (keys).
key_acls = {
    "private",
    "public-read",
    "public-read-write",
    "authenticated-read",
    "aws-exec-read",
    "bucket-owner-read",
    "bucket-owner-full-control",
}
# Canned ACLs S3 accepts for buckets.
buck_acls = {"private", "public-read", "public-read-write", "authenticated-read"}
async def _error_wrapper(func, *, args=(), kwargs=None, retries):
    """Await ``func(*args, **kwargs)``, retrying with exponential backoff.

    Retries on S3_RETRYABLE_ERRORS, on ClientErrors matching known
    throttling/checksum patterns, and on anything the module-level
    CUSTOM_ERROR_HANDLER approves. The final error is translated via
    translate_boto_error before being raised.
    """
    if kwargs is None:
        kwargs = {}
    err = None
    for i in range(retries):
        # Exponential backoff, capped at 15 seconds.
        wait_time = min(1.7**i * 0.1, 15)
        try:
            return await func(*args, **kwargs)
        except S3_RETRYABLE_ERRORS as e:
            err = e
            logger.debug("Retryable error: %s", e)
            await asyncio.sleep(wait_time)
        except ClientError as e:
            logger.debug("Client error (maybe retryable): %s", e)
            err = e
            matched = False
            # Throttling and content-checksum-mismatch responses are worth
            # retrying even though ClientError is not generally retryable.
            for pattern in [
                "SlowDown",
                "reduce your request rate",
                "XAmzContentSHA256Mismatch",
            ]:
                if pattern in str(e):
                    matched = True
                    break
            if matched:
                await asyncio.sleep(wait_time)
            else:
                should_retry = CUSTOM_ERROR_HANDLER(e)
                if should_retry:
                    await asyncio.sleep(wait_time)
                else:
                    break
        except Exception as e:
            err = e
            # Give the user-installed handler a chance to request a retry.
            should_retry = CUSTOM_ERROR_HANDLER(e)
            if should_retry:
                await asyncio.sleep(wait_time)
            else:
                logger.debug("Nonretryable error: %s", e)
                break
    if "'coroutine'" in str(err):
        # aiobotocore internal error - fetch original botocore error
        tb = err.__traceback__
        while tb.tb_next:
            tb = tb.tb_next
        try:
            await tb.tb_frame.f_locals["response"]
        except Exception as e:
            err = e
    err = translate_boto_error(err)
    raise err
def version_id_kw(version_id):
    """Helper to make versionId kwargs.
    Not all boto3 methods accept a None / empty versionId so dictionary expansion solves
    that problem.
    """
    return {"VersionId": version_id} if version_id else {}
def _coalesce_version_id(*args):
"""Helper to coalesce a list of version_ids down to one"""
version_ids = set(args)
if None in version_ids:
version_ids.remove(None)
if len(version_ids) > 1:
raise ValueError(
"Cannot coalesce version_ids where more than one are defined,"
f" {version_ids}"
)
elif len(version_ids) == 0:
return None
else:
return version_ids.pop()
def calculate_chunksize(filesize, chunksize=None, max_parts=MAX_UPLOAD_PARTS) -> int:
    """Pick a multipart chunksize keeping the part count within ``max_parts``."""
    if chunksize is None:
        # default chunksize set to 50 MiB
        chunksize = 50 * 2**20
    # increase chunksize to fit within the max_parts limit
    if math.ceil(filesize / chunksize) > max_parts:
        # S3 supports uploading objects up to 5 TiB in size,
        # so each chunk can be up to ~524 MiB.
        chunksize = math.ceil(filesize / max_parts)
    return chunksize
class S3FileSystem(AsyncFileSystem):
"""
Access S3 as if it were a file system.
This exposes a filesystem-like API (ls, cp, open, etc.) on top of S3
storage.
Provide credentials either explicitly (``key=``, ``secret=``) or depend
on boto's credential methods. See botocore documentation for more
information. If no credentials are available, use ``anon=True``.
Parameters
----------
anon : bool (False)
Whether to use anonymous connection (public buckets only). If False,
uses the key/secret given, or boto's credential resolver (client_kwargs,
environment variables, config files, EC2 IAM server, in that order)
endpoint_url : string (None)
Use this endpoint_url, if specified. Needed for connecting to non-AWS
S3 buckets. Takes precedence over `endpoint_url` in client_kwargs.
key : string (None)
If not anonymous, use this access key ID, if specified. Takes precedence
over `aws_access_key_id` in client_kwargs.
secret : string (None)
If not anonymous, use this secret access key, if specified. Takes
precedence over `aws_secret_access_key` in client_kwargs.
token : string (None)
If not anonymous, use this security token, if specified
use_ssl : bool (True)
Whether to use SSL in connections to S3; may be faster without, but
insecure. If ``use_ssl`` is also set in ``client_kwargs``,
the value set in ``client_kwargs`` will take priority.
s3_additional_kwargs : dict of parameters that are used when calling s3 api
methods. Typically used for things like "ServerSideEncryption".
client_kwargs : dict of parameters for the botocore client
requester_pays : bool (False)
If RequesterPays buckets are supported.
default_block_size: int (None)
If given, the default block size value used for ``open()``, if no
specific value is given at all time. The built-in default is 50MB.
default_fill_cache : Bool (True)
Whether to use cache filling with open by default. Refer to
``S3File.open``.
default_cache_type : string ("readahead")
If given, the default cache_type value used for ``open()``. Set to "none"
if no caching is desired. See fsspec's documentation for other available
cache_type values. Default cache_type is "readahead".
version_aware : bool (False)
Whether to support bucket versioning. If enable this will require the
user to have the necessary IAM permissions for dealing with versioned
objects. Note that in the event that you only need to work with the
latest version of objects in a versioned bucket, and do not need the
VersionId for those objects, you should set ``version_aware`` to False
for performance reasons. When set to True, filesystem instances will
use the S3 ListObjectVersions API call to list directory contents,
which requires listing all historical object versions.
cache_regions : bool (False)
Whether to cache bucket regions or not. Whenever a new bucket is used,
it will first find out which region it belongs and then use the client
for that region.
asynchronous : bool (False)
Whether this instance is to be used from inside coroutines.
config_kwargs : dict of parameters passed to ``botocore.client.Config``
kwargs : other parameters for core session.
session : aiobotocore AioSession object to be used for all connections.
This session will be used inplace of creating a new session inside S3FileSystem.
For example: aiobotocore.session.AioSession(profile='test_user')
max_concurrency : int (10)
The maximum number of concurrent transfers to use per file for multipart
upload (``put()``) operations. Defaults to 10. When used in
conjunction with ``S3FileSystem.put(batch_size=...)`` the maximum number of
simultaneous connections is ``max_concurrency * batch_size``. We may extend
this parameter to affect ``pipe()``, ``cat()`` and ``get()``. Increasing this
value will result in higher memory usage during multipart upload operations (by
``max_concurrency * chunksize`` bytes per file).
fixed_upload_size : bool (False)
Use same chunk size for all parts in multipart upload (last part can be smaller).
Cloudflare R2 storage requires fixed_upload_size=True for multipart uploads.
The following parameters are passed on to fsspec:
skip_instance_cache: to control reuse of instances
use_listings_cache, listings_expiry_time, max_paths: to control reuse of directory listings
Examples
--------
>>> s3 = S3FileSystem(anon=False) # doctest: +SKIP
>>> s3.ls('my-bucket/') # doctest: +SKIP
['my-file.txt']
>>> with s3.open('my-bucket/my-file.txt', mode='rb') as f: # doctest: +SKIP
... print(f.read()) # doctest: +SKIP
b'Hello, world!'
"""
root_marker = ""
connect_timeout = 5
retries = 5
read_timeout = 15
default_block_size = 50 * 2**20
protocol = ("s3", "s3a")
_extra_tokenize_attributes = ("default_block_size",)
    def __init__(
        self,
        anon=False,
        endpoint_url=None,
        key=None,
        secret=None,
        token=None,
        use_ssl=True,
        client_kwargs=None,
        requester_pays=False,
        default_block_size=None,
        default_fill_cache=True,
        default_cache_type="readahead",
        version_aware=False,
        config_kwargs=None,
        s3_additional_kwargs=None,
        session=None,
        username=None,
        password=None,
        cache_regions=False,
        asynchronous=False,
        loop=None,
        max_concurrency=10,
        fixed_upload_size: bool = False,
        **kwargs,
    ):
        """Create the filesystem; see the class docstring for parameters."""
        # username/password are aliases for key/secret; both together is ambiguous.
        if key and username:
            raise KeyError("Supply either key or username, not both")
        if secret and password:
            raise KeyError("Supply secret or password, not both")
        if username:
            key = username
        if password:
            secret = password
        self.endpoint_url = endpoint_url
        self.anon = anon
        self.key = key
        self.secret = secret
        self.token = token
        self.kwargs = kwargs
        # Listing-cache options are handled by the fsspec superclass, not here.
        super_kwargs = {
            k: kwargs.pop(k)
            for k in ["use_listings_cache", "listings_expiry_time", "max_paths"]
            if k in kwargs
        }  # passed to fsspec superclass
        super().__init__(loop=loop, asynchronous=asynchronous, **super_kwargs)
        self.default_block_size = default_block_size or self.default_block_size
        self.default_fill_cache = default_fill_cache
        self.default_cache_type = default_cache_type
        self.version_aware = version_aware
        self.client_kwargs = client_kwargs or {}
        self.config_kwargs = config_kwargs or {}
        self.req_kw = {"RequestPayer": "requester"} if requester_pays else {}
        self.s3_additional_kwargs = s3_additional_kwargs or {}
        self.use_ssl = use_ssl
        self.cache_regions = cache_regions
        # The client is created lazily by set_session()/connect().
        self._s3 = None
        self.session = session
        self.fixed_upload_size = fixed_upload_size
        if max_concurrency < 1:
            raise ValueError("max_concurrency must be >= 1")
        self.max_concurrency = max_concurrency
    @property
    def s3(self):
        """The aiobotocore S3 client, connecting lazily on first access."""
        if self._s3 is None:
            if self.asynchronous:
                # Cannot block in async mode; the caller must set up the session.
                raise RuntimeError("please await ``.set_session`` before anything else")
            self.connect()
        return self._s3
    def _filter_kwargs(self, s3_method, kwargs):
        # Keep only the kwargs that this particular client method accepts.
        return self._kwargs_helper.filter_dict(s3_method.__name__, kwargs)
    async def get_s3(self, bucket=None):
        """Return the client to use for ``bucket``; region-specific when
        ``cache_regions`` is enabled, otherwise the shared client."""
        if self.cache_regions and bucket is not None:
            return await self._s3creator.get_bucket_client(bucket)
        else:
            return self._s3
    async def _call_s3(self, method, *akwarglist, **kwargs):
        """Invoke the named client method with merged, filtered kwargs,
        retrying through _error_wrapper."""
        await self.set_session()
        s3 = await self.get_s3(kwargs.get("Bucket"))
        method = getattr(s3, method)
        kw2 = kwargs.copy()
        # Don't log the request body, which may be large binary data.
        kw2.pop("Body", None)
        logger.debug("CALL: %s - %s - %s", method.__name__, akwarglist, kw2)
        additional_kwargs = self._get_s3_method_kwargs(method, *akwarglist, **kwargs)
        return await _error_wrapper(
            method, kwargs=additional_kwargs, retries=self.retries
        )

    # Blocking variant for use outside coroutines.
    call_s3 = sync_wrapper(_call_s3)
def _get_s3_method_kwargs(self, method, *akwarglist, **kwargs):
additional_kwargs = self.s3_additional_kwargs.copy()
for akwargs in akwarglist:
additional_kwargs.update(akwargs)
# Add the normal kwargs in
additional_kwargs.update(kwargs)
# filter all kwargs
return self._filter_kwargs(method, additional_kwargs)
@staticmethod
def _get_kwargs_from_urls(urlpath):
"""
When we have a urlpath that contains a ?versionId=
Assume that we want to use version_aware mode for
the filesystem.
"""
from urllib.parse import urlsplit
url_query = urlsplit(urlpath).query
out = {}
if url_query is not None:
from urllib.parse import parse_qs
parsed = parse_qs(url_query)
if "versionId" in parsed:
out["version_aware"] = True
return out
def _find_bucket_key(self, s3_path):
"""
This is a helper function that given an s3 path such that the path is of
the form: bucket/key
It will return the bucket and the key represented by the s3 path
"""
bucket_format_list = [
re.compile(
r"^(?Parn:(aws).*:s3:[a-z\-0-9]*:[0-9]{12}:accesspoint[:/][^/]+)/?"
r"(?P.*)$"
),
re.compile(
r"^(?Parn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:]"
r"[a-zA-Z0-9\-]{1,63}[/:](bucket|accesspoint)[/:][a-zA-Z0-9\-]{1,63})[/:]?(?P.*)$"
),
re.compile(
r"^(?Parn:(aws).*:s3-outposts:[a-z\-0-9]+:[0-9]{12}:outpost[/:]"
r"[a-zA-Z0-9\-]{1,63}[/:]bucket[/:]"
r"[a-zA-Z0-9\-]{1,63})[/:]?(?P.*)$"
),
re.compile(
r"^(?Parn:(aws).*:s3-object-lambda:[a-z\-0-9]+:[0-9]{12}:"
r"accesspoint[/:][a-zA-Z0-9\-]{1,63})[/:]?(?P.*)$"
),
]
for bucket_format in bucket_format_list:
match = bucket_format.match(s3_path)
if match:
return match.group("bucket"), match.group("key")
s3_components = s3_path.split("/", 1)
bucket = s3_components[0]
s3_key = ""
if len(s3_components) > 1:
s3_key = s3_components[1]
return bucket, s3_key
    def split_path(self, path) -> tuple[str, str, str | None]:
        """
        Normalise S3 path string into bucket and key.

        Parameters
        ----------
        path : string
            Input path, like `s3://mybucket/path/to/file`

        Returns
        -------
        (bucket, key, version_id) tuple; version_id is only populated when
        the instance is ``version_aware``.

        Examples
        --------
        >>> split_path("s3://mybucket/path/to/file")
        ('mybucket', 'path/to/file', None)
        >>> split_path("s3://mybucket/path/to/versioned_file?versionId=some_version_id")
        ('mybucket', 'path/to/versioned_file', 'some_version_id')
        """
        # Remember any trailing slashes; _strip_protocol removes them.
        trail = path[len(path.rstrip("/")) :]
        path = self._strip_protocol(path)
        path = path.lstrip("/")
        if "/" not in path:
            # Bucket-only path.
            return path, "", None
        else:
            bucket, keypart = self._find_bucket_key(path)
            key, _, version_id = keypart.partition("?versionId=")
            key += trail  # restore trailing slashes removed by AbstractFileSystem._strip_protocol
            return (
                bucket,
                key,
                # Version ids are only honoured in version_aware mode.
                version_id if self.version_aware and version_id else None,
            )
def _prepare_config_kwargs(self):
config_kwargs = self.config_kwargs.copy()
if "connect_timeout" not in config_kwargs.keys():
config_kwargs["connect_timeout"] = self.connect_timeout
if "read_timeout" not in config_kwargs.keys():
config_kwargs["read_timeout"] = self.read_timeout
return config_kwargs
    async def set_session(self, refresh=False, kwargs={}):
        """Establish S3 connection object.
        This async method is called by any operation on an ``S3FileSystem`` instance.
        The ``refresh=True`` argument is useful if new credentials have been created
        and the instance needs to be reestablished. ``connect`` is a blocking
        version of ``set_session``.

        Parameters
        ----------
        refresh : bool (False)
            If True, create a new session even if one already exists.
        kwargs : dict
            Currently unused.

        Returns
        -------
        Session to be closed later with await .close()

        Examples
        --------
        >>> s3 = S3FileSystem(profile="")  # doctest: +SKIP
        # use in an async coroutine to assign the client object to a local variable
        >>> await s3.set_session()  # doctest: +SKIP
        # blocking version of set_session
        >>> s3.connect(refresh=True)  # doctest: +SKIP
        """
        if self._s3 is not None and not refresh:
            # Reuse the existing client unless every underlying aiohttp
            # session has been closed (e.g. after event-loop shutdown),
            # in which case force a refresh.
            hsess = getattr(getattr(self._s3, "_endpoint", None), "http_session", None)
            if hsess is not None:
                if all(_.closed for _ in hsess._sessions.values()):
                    refresh = True
            if not refresh:
                return self._s3
        logger.debug("Setting up s3fs instance")
        client_kwargs = self.client_kwargs.copy()
        init_kwargs = dict(
            aws_access_key_id=self.key,
            aws_secret_access_key=self.secret,
            aws_session_token=self.token,
            endpoint_url=self.endpoint_url,
        )
        # Drop unset values and values already supplied via client_kwargs.
        init_kwargs = {
            key: value
            for key, value in init_kwargs.items()
            if value is not None and value != client_kwargs.get(key)
        }
        if "use_ssl" not in client_kwargs.keys():
            init_kwargs["use_ssl"] = self.use_ssl
        config_kwargs = self._prepare_config_kwargs()
        if self.anon:
            from botocore import UNSIGNED

            # Anonymous access: strip any credentials and disable signing.
            drop_keys = {
                "aws_access_key_id",
                "aws_secret_access_key",
                "aws_session_token",
            }
            init_kwargs = {
                key: value for key, value in init_kwargs.items() if key not in drop_keys
            }
            client_kwargs = {
                key: value
                for key, value in client_kwargs.items()
                if key not in drop_keys
            }
            config_kwargs["signature_version"] = UNSIGNED
        conf = AioConfig(**config_kwargs)
        if self.session is None or refresh:
            self.session = aiobotocore.session.AioSession(**self.kwargs)
        # An explicit region or endpoint anywhere disables region caching.
        for parameters in (config_kwargs, self.kwargs, init_kwargs, client_kwargs):
            for option in ("region_name", "endpoint_url"):
                if parameters.get(option):
                    self.cache_regions = False
                    break
        else:
            cache_regions = self.cache_regions
        logger.debug(
            "RC: caching enabled? %r (explicit option is %r)",
            cache_regions,
            self.cache_regions,
        )
        self.cache_regions = cache_regions
        if self.cache_regions:
            s3creator = S3BucketRegionCache(
                self.session, config=conf, **init_kwargs, **client_kwargs
            )
            self._s3 = await s3creator.get_client()
        else:
            s3creator = self.session.create_client(
                "s3", config=conf, **init_kwargs, **client_kwargs
            )
            self._s3 = await s3creator.__aenter__()
        self._s3creator = s3creator
        # the following actually closes the aiohttp connection; use of privates
        # might break in the future, would cause exception at gc time
        if not self.asynchronous:
            weakref.finalize(self, self.close_session, self.loop, self._s3creator)
        self._kwargs_helper = ParamKwargsHelper(self._s3)
        return self._s3

    _connect = set_session
    # Blocking version of set_session.
    connect = sync_wrapper(set_session)
    @staticmethod
    def close_session(loop, s3):
        """Tear down the client; registered via weakref.finalize at GC time."""
        if loop is not None and loop.is_running():
            try:
                loop = asyncio.get_running_loop()
                loop.create_task(s3.__aexit__(None, None, None))
                return
            except RuntimeError:
                pass
            try:
                sync(loop, s3.__aexit__, None, None, None, timeout=0.1)
                return
            except FSTimeoutError:
                pass
        try:
            # close the actual socket
            s3._client._endpoint.http_session._connector._close()
        except AttributeError:
            # but during shutdown, it may have gone
            pass
async def _get_delegated_s3pars(self, exp=3600):
"""Get temporary credentials from STS, appropriate for sending across a
network. Only relevant where the key/secret were explicitly provided.
Parameters
----------
exp : int
Time in seconds that credentials are good for
Returns
-------
dict of parameters
"""
if self.anon:
return {"anon": True}
if self.token: # already has temporary cred
return {
"key": self.key,
"secret": self.secret,
"token": self.token,
"anon": False,
}
if self.key is None or self.secret is None: # automatic credentials
return {"anon": False}
async with self.session.create_client("sts") as sts:
cred = sts.get_session_token(DurationSeconds=exp)["Credentials"]
return {
"key": cred["AccessKeyId"],
"secret": cred["SecretAccessKey"],
"token": cred["SessionToken"],
"anon": False,
}
get_delegated_s3pars = sync_wrapper(_get_delegated_s3pars)
def _open(
    self,
    path,
    mode="rb",
    block_size=None,
    acl=False,
    version_id=None,
    fill_cache=None,
    cache_type=None,
    autocommit=True,
    size=None,
    requester_pays=None,
    cache_options=None,
    **kwargs,
):
    """Open a file for reading or writing

    Parameters
    ----------
    path: string
        Path of file on S3
    mode: string
        One of 'r', 'w', 'a', 'rb', 'wb', or 'ab'; same meaning as for the
        built-in `open` function. "x" mode, exclusive write, is only known
        to work on AWS S3 and requires botocore>1.35.20; for multi-part
        files the condition is only checked on commit, and the MPU is
        aborted if it fails.
    block_size: int
        Size of data-node blocks if reading
    fill_cache: bool
        If seeking to a new part of the file beyond the current buffer,
        with this True, the buffer will be filled between the sections to
        best support random access. When reading only a few specific chunks
        out of a file, performance may be better if False.
    acl: str
        Canned ACL to set when writing. False sends no parameter and uses
        the bucket's preset default; otherwise it should be a member of the
        `key_acls` set.
    version_id : str
        Explicit version of the object to open. Requires that the
        filesystem is version aware and bucket versioning is enabled.
    cache_type : str
        See fsspec's documentation for available cache_type values. Set to
        "none" if no caching is desired. If None, defaults to
        ``self.default_cache_type``.
    requester_pays : bool (optional)
        If RequesterPays buckets are supported. If None, defaults to the
        value used when creating the S3FileSystem (which defaults to False.)
    kwargs: dict-like
        Additional parameters used for s3 methods. Typically used for
        ServerSideEncryption.
    """
    if not self.version_aware and version_id:
        raise ValueError(
            "version_id cannot be specified if the filesystem "
            "is not version aware"
        )
    # fill in filesystem-level defaults for anything not given explicitly
    block_size = self.default_block_size if block_size is None else block_size
    fill_cache = self.default_fill_cache if fill_cache is None else fill_cache
    cache_type = self.default_cache_type if cache_type is None else cache_type
    if requester_pays is None:
        requester_pays = bool(self.req_kw)
    if not acl:
        # fall back on any ACL supplied at filesystem construction time
        acl = self.s3_additional_kwargs.get(
            "ACL", False
        ) or self.s3_additional_kwargs.get("acl", False)
    # call-specific kwargs override the filesystem-wide extras
    extra_kwargs = {**self.s3_additional_kwargs, **kwargs}
    return S3File(
        self,
        path,
        mode,
        block_size=block_size,
        acl=acl,
        version_id=version_id,
        fill_cache=fill_cache,
        s3_additional_kwargs=extra_kwargs,
        cache_type=cache_type,
        autocommit=autocommit,
        requester_pays=requester_pays,
        cache_options=cache_options,
        size=size,
    )
async def _lsdir(
    self,
    path,
    refresh=False,
    max_items=None,
    delimiter="/",
    prefix="",
    versions=False,
):
    """List one level of a path (or, with ``delimiter=""``, the whole
    subtree), using the dircache where possible.

    Parameters
    ----------
    path : str
        bucket or bucket/keyprefix
    refresh : bool
        bypass any cached listing
    max_items : int, optional
        cap on the number of entries fetched
    delimiter : str
        "/" for a single level; "" for a recursive listing
    prefix : str
        additional key prefix below ``path``
    versions : bool
        include all object versions (version-aware filesystems only)

    Notes
    -----
    Only plain single-level, non-versioned listings are written to the
    dircache.
    """
    bucket, key, _ = self.split_path(path)
    if not prefix:
        prefix = ""
    if key:
        prefix = key.lstrip("/") + "/" + prefix
    if path not in self.dircache or refresh or not delimiter or versions:
        try:
            # lazy %-args: only interpolated when debug logging is enabled
            logger.debug("Get directory listing page for %s", path)
            dirs = []
            files = []
            async for c in self._iterdir(
                bucket,
                max_items=max_items,
                delimiter=delimiter,
                prefix=prefix,
                versions=versions,
            ):
                if c["type"] == "directory":
                    dirs.append(c)
                else:
                    files.append(c)
            files += dirs
            files.sort(key=lambda f: f["name"])
        except ClientError as e:
            raise translate_boto_error(e)

        if delimiter and files and not versions:
            self.dircache[path] = files
        return files
    return self.dircache[path]
async def _iterdir(
    self, bucket, max_items=None, delimiter="/", prefix="", versions=False
):
    """Iterate asynchronously over files and directories under `prefix`.

    The contents are yielded in arbitrary order as info dicts.

    Parameters
    ----------
    bucket : str
        bucket name (no protocol, no key)
    max_items : int, optional
        cap on the number of listing entries requested from the paginator
    delimiter : str
        "/" for one level only; "" lists recursively
    prefix : str
        only yield keys starting with this
    versions : bool
        yield every object version, not only the latest (requires a
        version-aware filesystem)
    """
    if versions and not self.version_aware:
        raise ValueError(
            "versions cannot be specified if the filesystem is not version aware"
        )
    await self.set_session()
    s3 = await self.get_s3(bucket)
    # version-aware listing uses a different API and response key
    if self.version_aware:
        method = "list_object_versions"
        contents_key = "Versions"
    else:
        method = "list_objects_v2"
        contents_key = "Contents"
    pag = s3.get_paginator(method)
    config = {}
    if max_items is not None:
        # PageSize > MaxItems so the cap is normally satisfied in one page
        config.update(MaxItems=max_items, PageSize=2 * max_items)
    it = pag.paginate(
        Bucket=bucket,
        Prefix=prefix,
        Delimiter=delimiter,
        PaginationConfig=config,
        **self.req_kw,
    )
    async for i in it:
        # CommonPrefixes are the pseudo-directories at this level
        for l in i.get("CommonPrefixes", []):
            c = {
                "Key": l["Prefix"][:-1],  # drop trailing delimiter
                "Size": 0,
                "StorageClass": "DIRECTORY",
                "type": "directory",
            }
            self._fill_info(c, bucket, versions=False)
            yield c
        for c in i.get(contents_key, []):
            # when listing versions, skip non-latest entries unless
            # versions were explicitly requested
            if not self.version_aware or c.get("IsLatest") or versions:
                c["type"] = "file"
                c["size"] = c["Size"]
                self._fill_info(c, bucket, versions=versions)
                yield c
@staticmethod
def _fill_info(f, bucket, versions=False):
f["size"] = f["Size"]
f["Key"] = "/".join([bucket, f["Key"]])
f["name"] = f["Key"]
version_id = f.get("VersionId")
if versions and version_id and version_id != "null":
f["name"] += f"?versionId={version_id}"
async def _glob(self, path, **kwargs):
if path.startswith("*"):
raise ValueError("Cannot traverse all of S3")
return await super()._glob(path, **kwargs)
async def _find(
    self, path, maxdepth=None, withdirs=None, detail=False, prefix="", **kwargs
):
    """List all files below path.

    Like posix ``find`` command without conditions

    Parameters
    ----------
    path : str
    maxdepth: int or None
        If not None, the maximum number of levels to descend
    withdirs: bool
        Whether to include directory paths in the output. This is True
        when used by glob, but users usually only want files.
    prefix: str
        Only return files that match ``^{path}/{prefix}`` (if there is an
        exact match ``filename == {path}/{prefix}``, it also will be included)
    """
    path = self._strip_protocol(path)
    bucket, key, _ = self.split_path(path)
    if not bucket:
        raise ValueError("Cannot traverse all of S3")
    if (withdirs or maxdepth) and prefix:
        # TODO: perhaps propagate these to a glob(f"path/{prefix}*") call
        raise ValueError(
            "Can not specify 'prefix' option alongside 'withdirs'/'maxdepth' options."
        )
    if maxdepth:
        # depth-limited walk cannot use the flat recursive listing below
        return await super()._find(
            bucket + "/" + key,
            maxdepth=maxdepth,
            withdirs=withdirs,
            detail=detail,
            **kwargs,
        )
    # TODO: implement find from dircache, if all listings are present
    # if refresh is False:
    #     out = incomplete_tree_dirs(self.dircache, path)
    #     if len(out) == 1:
    #         await self._find(out[0])
    #         return super().find(path)
    #     elif len(out) == 0:
    #         return super().find(path)
    #     # else: we refresh anyway, having at least two missing trees
    out = await self._lsdir(path, delimiter="", prefix=prefix, **kwargs)
    if (not out and key) and not prefix:
        # path may name a single object rather than a prefix
        try:
            out = [await self._info(path)]
        except FileNotFoundError:
            out = []
    dirs = []
    sdirs = set()
    thisdircache = {}
    for o in out:
        # not self._parent, because that strips "/" from placeholders
        par = o["name"].rsplit("/", maxsplit=1)[0]
        o["Key"] = o["name"]
        name = o["name"]
        # walk up the ancestry, synthesising directory entries and
        # accumulating per-directory listings for the dircache
        while "/" in par:
            if par not in sdirs:
                sdirs.add(par)
                d = False
                if len(path) <= len(par):
                    d = {
                        "Key": par,
                        "Size": 0,
                        "name": par,
                        "StorageClass": "DIRECTORY",
                        "type": "directory",
                        "size": 0,
                    }
                    dirs.append(d)
                thisdircache[par] = []
                ppar = self._parent(par)
                if ppar in thisdircache:
                    if d and d not in thisdircache[ppar]:
                        thisdircache[ppar].append(d)
            if par in sdirs and not name.endswith("/"):
                # exclude placeholders, they do not belong in the directory listing
                thisdircache[par].append(o)
            par, name, o = par.rsplit("/", maxsplit=1)[0], par, d
            if par in thisdircache or par in self.dircache:
                break
    # Explicitly add directories to their parents in the dircache
    for d in dirs:
        par = self._parent(d["name"])
        # extra condition here (in any()) to deal with directory-marking files
        if par in thisdircache and not any(
            _["name"] == d["name"] for _ in thisdircache[par]
        ):
            thisdircache[par].append(d)
    if not prefix:
        # opportunistically populate the dircache with complete listings
        for k, v in thisdircache.items():
            if k not in self.dircache and len(k) >= len(path):
                self.dircache[k] = sorted(v, key=lambda x: x["name"])
    if withdirs:
        out = sorted(out + dirs, key=lambda x: x["name"])
    if detail:
        return {o["name"]: o for o in out}
    return [o["name"] for o in out]

find = sync_wrapper(_find)
async def _mkdir(self, path, acl=False, create_parents=True, **kwargs):
    """Make a "directory": create the bucket if needed; below bucket
    level this is effectively a no-op (S3 has no real directories).

    Parameters
    ----------
    path : str
        bucket, or bucket/key (for the latter, only the bucket may be
        created, and only when ``create_parents`` is True)
    acl : str
        canned ACL for a newly created bucket; must be in ``buck_acls``
    create_parents : bool
        whether a missing bucket may be created when path includes a key

    Raises
    ------
    FileExistsError
        asked to create a bucket that already exists
    FileNotFoundError
        (via ``_ls``) bucket absent and ``create_parents`` is False
    """
    path = self._strip_protocol(path).rstrip("/")
    if not path:
        raise ValueError
    bucket, key, _ = self.split_path(path)
    if await self._exists(bucket):
        if not key:
            # requested to create bucket, but bucket already exist
            raise FileExistsError
        # else: # do nothing as bucket is already created.
    elif not key or create_parents:
        if acl and acl not in buck_acls:
            # %-interpolate so the permitted values appear in the message
            raise ValueError("ACL not in %s" % (buck_acls,))
        try:
            params = {"Bucket": bucket}
            if acl:
                params["ACL"] = acl
            # an explicit region (call kwarg or client_kwargs) requires a
            # CreateBucketConfiguration
            region_name = kwargs.get("region_name", None) or self.client_kwargs.get(
                "region_name", None
            )
            if region_name:
                params["CreateBucketConfiguration"] = {
                    "LocationConstraint": region_name
                }
            await self._call_s3("create_bucket", **params)
            self.invalidate_cache("")
            self.invalidate_cache(bucket)
        except ClientError as e:
            raise translate_boto_error(e)
        except ParamValidationError as e:
            raise ValueError("Bucket create failed %r: %s" % (bucket, e))
    else:
        # raises if bucket doesn't exist and doesn't get create flag.
        await self._ls(bucket)

mkdir = sync_wrapper(_mkdir)
async def _makedirs(self, path, exist_ok=False):
try:
await self._mkdir(path, create_parents=True)
except FileExistsError:
if exist_ok:
pass
else:
raise
makedirs = sync_wrapper(_makedirs)
async def _rmdir(self, path):
bucket, key, _ = self.split_path(path)
if key:
if await self._exists(path):
# User may have meant rm(path, recursive=True)
raise FileExistsError
raise FileNotFoundError
try:
await self._call_s3("delete_bucket", Bucket=path)
except botocore.exceptions.ClientError as e:
if "NoSuchBucket" in str(e):
raise FileNotFoundError(path) from e
if "BucketNotEmpty" in str(e):
raise OSError from e
raise
self.invalidate_cache(path)
self.invalidate_cache("")
rmdir = sync_wrapper(_rmdir)
async def _lsbuckets(self, refresh=False):
if "" not in self.dircache or refresh:
if self.anon:
# cannot list buckets if not logged in
return []
try:
files = (await self._call_s3("list_buckets"))["Buckets"]
except ClientError:
# listbucket permission missing
return []
for f in files:
f["Key"] = f["Name"]
f["Size"] = 0
f["StorageClass"] = "BUCKET"
f["size"] = 0
f["type"] = "directory"
f["name"] = f["Name"]
del f["Name"]
self.dircache[""] = files
return files
return self.dircache[""]
async def _ls(self, path, detail=False, refresh=False, versions=False):
    """List files in given bucket, or list of buckets.

    Listing is cached unless `refresh=True`.

    Note: only your buckets associated with the login will be listed by
    `ls('')`, not any public buckets (even if already accessed).

    Parameters
    ----------
    path : string/bytes
        location at which to list files
    detail : bool
        whether to return full info dicts rather than just names
    refresh : bool (=False)
        if False, look in local cache for file details first
    versions : bool
        whether to include all object versions (version-aware only)
    """
    path = self._strip_protocol(path).rstrip("/")
    if path in ["", "/"]:
        files = await self._lsbuckets(refresh)
    else:
        files = await self._lsdir(path, refresh, versions=versions)
        if not files and "/" in path:
            # maybe a file rather than a prefix: look for the exact
            # entry in the parent's listing
            try:
                files = await self._lsdir(
                    self._parent(path), refresh=refresh, versions=versions
                )
            except OSError:
                pass
            files = [
                o
                for o in files
                if o["name"].rstrip("/") == path and o["type"] != "directory"
            ]
        if not files:
            raise FileNotFoundError(path)
    if detail:
        return files
    # NOTE(review): `detail` is already handled above, so this line always
    # takes the sorted-names branch; kept as-is.
    return files if detail else sorted([o["name"] for o in files])
def _exists_in_cache(self, path, bucket, key, version_id):
fullpath = "/".join((bucket, key))
try:
entries = self._ls_from_cache(fullpath)
except FileNotFoundError:
return False
if entries is None:
return None
if not self.version_aware or version_id is None:
return True
for entry in entries:
if entry["name"] == fullpath and entry.get("VersionId") == version_id:
return True
# dircache doesn't support multiple versions, so we really can't tell if
# the one we want exists.
return None
async def _exists(self, path):
    """Whether a bucket or key exists, preferring cached knowledge.

    For keys: consult the dircache first (tri-state), then fetch info.
    For buckets: any cached listing counts as existence; otherwise try
    head_bucket and finally get_bucket_location (which can succeed for
    accessible buckets we do not own).
    """
    if path in ["", "/"]:
        # the root always exists, even if anon
        return True
    path = self._strip_protocol(path)
    bucket, key, version_id = self.split_path(path)
    if key:
        exists_in_cache = self._exists_in_cache(path, bucket, key, version_id)
        if exists_in_cache is not None:
            return exists_in_cache

        try:
            await self._info(path, bucket, key, version_id=version_id)
            return True
        except FileNotFoundError:
            return False
    elif self.dircache.get(bucket, False):
        return True
    else:
        try:
            if self._ls_from_cache(bucket):
                return True
        except FileNotFoundError:
            # might still be a bucket we can access but don't own
            pass
        try:
            await self._call_s3("head_bucket", Bucket=bucket, **self.req_kw)
            return True
        except Exception:
            pass
        try:
            await self._call_s3("get_bucket_location", Bucket=bucket, **self.req_kw)
            return True
        except Exception:
            return False

exists = sync_wrapper(_exists)
async def _touch(self, path, truncate=True, data=None, **kwargs):
"""Create empty file or truncate"""
bucket, key, version_id = self.split_path(path)
if version_id:
raise ValueError("S3 does not support touching existing versions of files")
if not truncate and await self._exists(path):
raise ValueError("S3 does not support touching existent files")
try:
write_result = await self._call_s3(
"put_object", Bucket=bucket, Key=key, **kwargs
)
except ClientError as ex:
raise translate_boto_error(ex)
self.invalidate_cache(self._parent(path))
return write_result
touch = sync_wrapper(_touch)
async def _cat_file(self, path, version_id=None, start=None, end=None):
    """Fetch the bytes of an object, optionally a sub-range.

    ``start``/``end`` are converted to an HTTP Range header; the read is
    wrapped in the retrying `_error_wrapper` and the body stream is
    always closed.
    """
    bucket, key, vers = self.split_path(path)
    if start is not None or end is not None:
        head = {"Range": await self._process_limits(path, start, end)}
    else:
        head = {}

    async def _call_and_read():
        # one GET attempt; _error_wrapper retries the whole call
        resp = await self._call_s3(
            "get_object",
            Bucket=bucket,
            Key=key,
            **version_id_kw(version_id or vers),
            **head,
            **self.req_kw,
        )
        try:
            return await resp["Body"].read()
        finally:
            resp["Body"].close()

    return await _error_wrapper(_call_and_read, retries=self.retries)
async def _pipe_file(
    self,
    path,
    data,
    chunksize=50 * 2**20,
    max_concurrency=None,
    mode="overwrite",
    **kwargs,
):
    """Write in-memory ``data`` to ``path``, via a single PUT for small
    payloads or a concurrent multi-part upload otherwise.

    mode=="create", exclusive write, is only known to work on AWS S3, and
    requires botocore>1.35.20
    """
    bucket, key, _ = self.split_path(path)
    concurrency = max_concurrency or self.max_concurrency
    size = len(data)
    if mode == "create":
        # exclusive-create condition; applied to the PUT, or to the
        # complete-MPU call for multi-part uploads
        match = {"IfNoneMatch": "*"}
    else:
        match = {}
    # 5 GB is the limit for an S3 PUT
    if size < min(5 * 2**30, 2 * chunksize):
        out = await self._call_s3(
            "put_object", Bucket=bucket, Key=key, Body=data, **kwargs, **match
        )
        self.invalidate_cache(path)
        return out
    else:
        mpu = await self._call_s3(
            "create_multipart_upload", Bucket=bucket, Key=key, **kwargs
        )
        # chunk start offsets, processed in batches of `concurrency`
        ranges = list(range(0, len(data), chunksize))
        inds = list(range(0, len(ranges), concurrency)) + [len(ranges)]
        parts = []
        try:
            for start, stop in zip(inds[:-1], inds[1:]):
                out = await asyncio.gather(
                    *[
                        self._call_s3(
                            "upload_part",
                            Bucket=bucket,
                            PartNumber=i + 1,
                            UploadId=mpu["UploadId"],
                            Body=data[ranges[i] : ranges[i] + chunksize],
                            Key=key,
                        )
                        for i in range(start, stop)
                    ]
                )
                parts.extend(
                    {"PartNumber": i + 1, "ETag": o["ETag"]}
                    for i, o in zip(range(start, stop), out)
                )
            await self._call_s3(
                "complete_multipart_upload",
                Bucket=bucket,
                Key=key,
                UploadId=mpu["UploadId"],
                MultipartUpload={"Parts": parts},
                **match,
            )
            self.invalidate_cache(path)
        except Exception:
            # abort the MPU so partial parts are not stored (and billed)
            await self._abort_mpu(bucket, key, mpu["UploadId"])
            raise
async def _put_file(
    self,
    lpath,
    rpath,
    callback=_DEFAULT_CALLBACK,
    chunksize=None,
    max_concurrency=None,
    mode="overwrite",
    **kwargs,
):
    """Upload local file ``lpath`` to ``rpath``; small files via a single
    put_object, larger ones as a concurrent multi-part upload.

    mode=="create", exclusive write, is only known to work on AWS S3, and
    requires botocore>1.35.20
    """
    bucket, key, _ = self.split_path(rpath)
    if os.path.isdir(lpath):
        if key:
            # don't make remote "directory"
            return
        else:
            # NOTE(review): creates a bucket named after the *local*
            # directory path; presumably lpath equals the bucket name
            # here — verify (the bucket from rpath may be intended)
            await self._mkdir(lpath)
    size = os.path.getsize(lpath)
    callback.set_size(size)
    if mode == "create":
        # exclusive-create condition; checked at PUT or MPU-complete time
        match = {"IfNoneMatch": "*"}
    else:
        match = {}

    if "ContentType" not in kwargs:
        # infer the content type from the filename if not given
        content_type, _ = mimetypes.guess_type(lpath)
        if content_type is not None:
            kwargs["ContentType"] = content_type

    chunksize = calculate_chunksize(size, chunksize=chunksize)

    with open(lpath, "rb") as f0:
        if size < min(5 * 2**30, 2 * chunksize):
            # single-request upload (5 GB is the PUT limit)
            chunk = f0.read()
            await self._call_s3(
                "put_object", Bucket=bucket, Key=key, Body=chunk, **kwargs, **match
            )
            callback.relative_update(size)
        else:
            mpu = await self._call_s3(
                "create_multipart_upload", Bucket=bucket, Key=key, **kwargs
            )

            try:
                out = await self._upload_file_part_concurrent(
                    bucket,
                    key,
                    mpu,
                    f0,
                    chunksize,
                    callback=callback,
                    max_concurrency=max_concurrency,
                )
                parts = [
                    {"PartNumber": i + 1, "ETag": o["ETag"]}
                    for i, o in enumerate(out)
                ]
                await self._call_s3(
                    "complete_multipart_upload",
                    Bucket=bucket,
                    Key=key,
                    UploadId=mpu["UploadId"],
                    MultipartUpload={"Parts": parts},
                    **match,
                )
            except Exception:
                # abort so partial parts are not stored (and billed) forever
                await self._abort_mpu(bucket, key, mpu["UploadId"])
                raise
    # invalidate cached listings for the target and all its ancestors
    while rpath:
        self.invalidate_cache(rpath)
        rpath = self._parent(rpath)
async def _upload_file_part_concurrent(
    self,
    bucket,
    key,
    mpu,
    f0,
    chunksize,
    callback=_DEFAULT_CALLBACK,
    max_concurrency=None,
):
    """Upload file-object ``f0`` as MPU parts of ``chunksize`` bytes,
    at most ``max_concurrency`` parts in flight at once.

    Returns the list of upload_part responses, in part order.
    """
    max_concurrency = max_concurrency or self.max_concurrency
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be >= 1")

    async def _upload_chunk(chunk, part_number):
        # single upload_part call; progress reported as each part lands
        result = await self._call_s3(
            "upload_part",
            Bucket=bucket,
            PartNumber=part_number,
            UploadId=mpu["UploadId"],
            Body=chunk,
            Key=key,
        )
        callback.relative_update(len(chunk))
        return result

    out = []
    while True:
        chunks = []
        for _ in range(max_concurrency):
            chunk = f0.read(chunksize)
            if not chunk:
                # EOF: stop reading rather than issuing further empty reads
                break
            chunks.append(chunk)
        if not chunks:
            break
        out.extend(
            await asyncio.gather(
                *[
                    _upload_chunk(chunk, len(out) + i)
                    for i, chunk in enumerate(chunks, 1)
                ]
            )
        )
    return out
async def _get_file(
    self, rpath, lpath, callback=_DEFAULT_CALLBACK, version_id=None, **kwargs
):
    """Download ``rpath`` to local file ``lpath``, retrying interrupted
    reads by re-requesting from the last successfully written byte."""
    if os.path.isdir(lpath):
        # cannot write a file over a local directory
        return
    bucket, key, vers = self.split_path(rpath)

    async def _open_file(range: int):
        # start (or restart) the GET, optionally from a byte offset
        kw = self.req_kw.copy()
        if range:
            kw["Range"] = f"bytes={range}-"
        resp = await self._call_s3(
            "get_object",
            Bucket=bucket,
            Key=key,
            **version_id_kw(version_id or vers),
            **kw,
        )
        return resp["Body"], resp.get("ContentLength", None)

    body, content_length = await _open_file(range=0)
    callback.set_size(content_length)

    failed_reads = 0
    bytes_read = 0

    try:
        with open(lpath, "wb") as f0:
            while True:
                try:
                    chunk = await body.read(2**16)
                except S3_RETRYABLE_ERRORS:
                    failed_reads += 1
                    if failed_reads >= self.retries:
                        # Give up if we've failed too many times.
                        raise
                    # Closing the body may result in an exception if we've failed to read from it.
                    try:
                        body.close()
                    except Exception:
                        pass

                    # exponential backoff (capped) before restarting
                    await asyncio.sleep(min(1.7**failed_reads * 0.1, 15))
                    # Byte ranges are inclusive, which means we need to be careful to not read the same data twice
                    # in a failure.
                    # Examples:
                    # Read 1 byte -> failure, retry with read_range=0, byte range should be 0-
                    # Read 1 byte, success. Read 1 byte: failure. Retry with read_range=1, byte-range should be 1-
                    # Read 1 bytes, success. Read 1 bytes: success. Read 1 byte, failure. Retry with read_range=2,
                    # byte-range should be 2-.
                    body, _ = await _open_file(bytes_read)
                    continue

                if not chunk:
                    break

                bytes_read += len(chunk)
                segment_len = f0.write(chunk)
                callback.relative_update(segment_len)
    finally:
        try:
            body.close()
        except Exception:
            pass
async def _info(self, path, bucket=None, key=None, refresh=False, version_id=None):
    """Info dict for a single path (file, bucket or directory).

    ``bucket``/``key`` parameters are accepted for API compatibility but
    immediately re-derived from ``path``. With ``refresh=False`` the
    dircache is consulted first; otherwise head_object/head_bucket is
    called, finally falling back to a one-key listing to detect a
    "directory" prefix.
    """
    path = self._strip_protocol(path)
    bucket, key, path_version_id = self.split_path(path)
    fullpath = "/".join((bucket, key))
    if version_id is not None:
        if not self.version_aware:
            raise ValueError(
                "version_id cannot be specified if the "
                "filesystem is not version aware"
            )

    if path in ["/", ""]:
        # root of the store: always a directory
        return {"name": path, "size": 0, "type": "directory"}
    version_id = _coalesce_version_id(path_version_id, version_id)
    if not refresh:
        out = self._ls_from_cache(fullpath)
        if out is not None:
            if self.version_aware and version_id is not None:
                # If cached info does not match requested version_id,
                # fallback to calling head_object
                out = [
                    o
                    for o in out
                    if o["name"] == fullpath and version_id == o.get("VersionId")
                ]
                if out:
                    return out[0]
            else:
                out = [o for o in out if o["name"] == fullpath]
                if out:
                    return out[0]
                # a cached listing exists but contains no entry for the
                # path itself: it is a directory
                return {"name": path, "size": 0, "type": "directory"}

    if key:
        try:
            out = await self._call_s3(
                "head_object",
                self.kwargs,
                Bucket=bucket,
                Key=key,
                **version_id_kw(version_id),
                **self.req_kw,
            )
            return {
                "ETag": out.get("ETag", ""),
                "LastModified": out.get("LastModified", ""),
                "size": out["ContentLength"],
                "name": "/".join([bucket, key]),
                "type": "file",
                "StorageClass": out.get("StorageClass", "STANDARD"),
                "VersionId": out.get("VersionId"),
                "ContentType": out.get("ContentType"),
            }
        except FileNotFoundError:
            # no object at this exact key; it may still be a "directory"
            pass
        except ClientError as e:
            raise translate_boto_error(e, set_cause=False)
    else:
        try:
            out = await self._call_s3("head_bucket", Bucket=bucket, **self.req_kw)
            return {
                "name": bucket,
                "type": "directory",
                "size": 0,
                "StorageClass": "DIRECTORY",
                "VersionId": out.get("VersionId"),
            }
        except ClientError as e:
            raise translate_boto_error(e, set_cause=False)

    try:
        # We check to see if the path is a directory by attempting to list its
        # contents. If anything is found, it is indeed a directory
        out = await self._call_s3(
            "list_objects_v2",
            self.kwargs,
            Bucket=bucket,
            Prefix=key.rstrip("/") + "/" if key else "",
            Delimiter="/",
            MaxKeys=1,
            **self.req_kw,
        )
        if (
            out.get("KeyCount", 0) > 0
            or out.get("Contents", [])
            or out.get("CommonPrefixes", [])
        ):
            return {
                "name": "/".join([bucket, key]),
                "type": "directory",
                "size": 0,
                "StorageClass": "DIRECTORY",
            }

        raise FileNotFoundError(path)
    except ClientError as e:
        raise translate_boto_error(e, set_cause=False)
    except ParamValidationError as e:
        raise ValueError("Failed to list path %r: %s" % (path, e))
async def _checksum(self, path, refresh=False):
"""
Unique value for current version of file
If the checksum is the same from one moment to another, the contents
are guaranteed to be the same. If the checksum changes, the contents
*might* have changed.
Parameters
----------
path : string/bytes
path of file to get checksum for
refresh : bool (=False)
if False, look in local cache for file details first
"""
info = await self._info(path, refresh=refresh)
if info["type"] != "directory":
return int(info["ETag"].strip('"').split("-")[0], 16)
else:
return int(tokenize(info), 16)
checksum = sync_wrapper(_checksum)
async def _isdir(self, path):
    """Whether path is a directory (a bucket, or a key prefix with
    contents), answered from the dircache where possible."""
    path = self._strip_protocol(path).strip("/")
    # bucket-level path: existence of the bucket decides
    if "/" not in path:
        if path == "":
            return True
        try:
            out = await self._lsdir(path)
            return True
        except FileNotFoundError:
            return False

    if path in self.dircache:
        for fp in self.dircache[path]:
            # For files the dircache can contain itself.
            # If it contains anything other than itself it is a directory.
            if fp["name"] != path:
                return True
        return False

    parent = self._parent(path)
    if parent in self.dircache:
        for f in self.dircache[parent]:
            if f["name"] == path:
                # If we find ourselves return whether we are a directory
                return f["type"] == "directory"
        return False

    # This only returns things within the path and NOT the path object itself
    try:
        return bool(await self._lsdir(path))
    except FileNotFoundError:
        return False

isdir = sync_wrapper(_isdir)
async def _object_version_info(self, path, **kwargs):
if not self.version_aware:
raise ValueError(
"version specific functionality is disabled for "
"non-version aware filesystems"
)
bucket, key, _ = self.split_path(path)
kwargs = {}
out = {"IsTruncated": True}
versions = []
while out["IsTruncated"]:
out = await self._call_s3(
"list_object_versions",
kwargs,
Bucket=bucket,
Prefix=key,
**self.req_kw,
)
versions.extend(out["Versions"])
kwargs.update(
{
"VersionIdMarker": out.get("NextVersionIdMarker", ""),
"KeyMarker": out.get("NextKeyMarker", ""),
}
)
return versions
object_version_info = sync_wrapper(_object_version_info)
# class-level cache of metadata last written via _setxattr
_metadata_cache = {}

async def _metadata(self, path, refresh=False, **kwargs):
    """Return metadata of path.

    Keys are reported with '_' replaced by '-'.

    Parameters
    ----------
    path : string/bytes
        filename to get metadata for
    refresh : bool (=False)
        accepted for API symmetry but ignored; the call always goes to S3
    """
    bucket, key, version_id = self.split_path(path)
    response = await self._call_s3(
        "head_object",
        kwargs,
        Bucket=bucket,
        Key=key,
        **version_id_kw(version_id),
        **self.req_kw,
    )
    return {k.replace("_", "-"): v for k, v in response["Metadata"].items()}

metadata = sync_wrapper(_metadata)
def get_tags(self, path):
    """Retrieve tag key/values for the given path

    Returns
    -------
    {str: str}
    """
    bucket, key, version_id = self.split_path(path)
    response = self.call_s3(
        "get_object_tagging",
        Bucket=bucket,
        Key=key,
        **version_id_kw(version_id),
    )
    tags = {}
    for entry in response["TagSet"]:
        tags[entry["Key"]] = entry["Value"]
    return tags
def put_tags(self, path, tags, mode="o"):
    """Set tags for given existing key

    Tags are a str:str mapping that can be attached to any key, see
    https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/allocation-tag-restrictions.html

    This is similar to, but distinct from, key metadata, which is usually
    set at key creation time.

    Parameters
    ----------
    path: str
        Existing key to attach tags to
    tags: dict str, str
        Tags to apply.
    mode:
        One of 'o' or 'm'
        'o': Will over-write any existing tags.
        'm': Will merge in new tags with existing tags.  Incurs two remote
        calls.
    """
    bucket, key, version_id = self.split_path(path)
    if mode == "m":
        # merge: fetch the current tags first (the extra remote call)
        merged = self.get_tags(path=path)
        merged.update(tags)
    elif mode == "o":
        merged = dict(tags)
    else:
        raise ValueError("Mode must be {'o', 'm'}, not %s" % mode)
    tag_set = [{"Key": k, "Value": v} for k, v in merged.items()]
    self.call_s3(
        "put_object_tagging",
        Bucket=bucket,
        Key=key,
        Tagging={"TagSet": tag_set},
        **version_id_kw(version_id),
    )
async def _getxattr(self, path, attr_name, **kwargs):
"""Get an attribute from the metadata.
Examples
--------
>>> mys3fs.getxattr('mykey', 'attribute_1') # doctest: +SKIP
'value_1'
"""
attr_name = attr_name.replace("_", "-")
xattr = await self._metadata(path, **kwargs)
if attr_name in xattr:
return xattr[attr_name]
return None
getxattr = sync_wrapper(_getxattr)
async def _setxattr(self, path, copy_kwargs=None, **kw_args):
    """Set metadata.

    Attributes have to be of the form documented in the
    `Metadata Reference`_.

    Parameters
    ----------
    kw_args : key-value pairs like field="value", where the values must be
        strings. Does not alter existing fields, unless
        the field appears here - if the value is None, delete the
        field.
    copy_kwargs : dict, optional
        dictionary of additional params to use for the underlying
        s3.copy_object.

    Examples
    --------
    >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2')  # doctest: +SKIP
    # Example for use with copy_args
    >>> mys3file.setxattr(copy_kwargs={'ContentType': 'application/pdf'},
    ...     attribute_1='value1')  # doctest: +SKIP

    .. _Metadata Reference: http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html#object-metadata
    """
    # metadata keys use '-' on the wire; accept '_' in Python identifiers
    kw_args = {k.replace("_", "-"): v for k, v in kw_args.items()}
    bucket, key, version_id = self.split_path(path)
    metadata = await self._metadata(path)
    metadata.update(**kw_args)
    copy_kwargs = copy_kwargs or {}

    # remove all keys that are None
    for kw_key in kw_args:
        if kw_args[kw_key] is None:
            metadata.pop(kw_key, None)

    src = {"Bucket": bucket, "Key": key}
    if version_id:
        src["VersionId"] = version_id

    # S3 metadata is immutable: rewrite the object onto itself with the
    # REPLACE directive to change it
    await self._call_s3(
        "copy_object",
        copy_kwargs,
        CopySource=src,
        Bucket=bucket,
        Key=key,
        Metadata=metadata,
        MetadataDirective="REPLACE",
    )

    # refresh metadata
    self._metadata_cache[path] = metadata

setxattr = sync_wrapper(_setxattr)
async def _chmod(self, path, acl, recursive=False, **kwargs):
    """Set Access Control on a bucket/key

    See http://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl

    Parameters
    ----------
    path : string
        the object to set
    acl : string
        the value of ACL to apply
    recursive : bool
        whether to apply the ACL to all keys below the given path too
    """
    bucket, key, version_id = self.split_path(path)
    if recursive:
        allfiles = await self._find(path, withdirs=False)
        await asyncio.gather(
            *[self._chmod(p, acl, recursive=False) for p in allfiles]
        )
    elif key:
        if acl not in key_acls:
            # %-interpolate so the permitted values appear in the message
            raise ValueError("ACL not in %s" % (key_acls,))
        await self._call_s3(
            "put_object_acl",
            kwargs,
            Bucket=bucket,
            Key=key,
            ACL=acl,
            **version_id_kw(version_id),
        )
    if not key:
        # bucket-level ACL (also applied after a recursive run on a bucket)
        if acl not in buck_acls:
            raise ValueError("ACL not in %s" % (buck_acls,))
        await self._call_s3("put_bucket_acl", kwargs, Bucket=bucket, ACL=acl)

chmod = sync_wrapper(_chmod)
async def _url(self, path, expires=3600, client_method="get_object", **kwargs):
    """Generate presigned URL to access path by HTTP

    Parameters
    ----------
    path : string
        the key path we are interested in
    expires : int
        the number of seconds this signature will be good for.
    client_method : string
        the S3 method the URL authorises, e.g. "get_object" or
        "put_object"
    """
    bucket, key, version_id = self.split_path(path)
    await self.set_session()
    s3 = await self.get_s3(bucket)
    params = dict(Bucket=bucket, Key=key, **version_id_kw(version_id), **kwargs)
    return await s3.generate_presigned_url(
        ClientMethod=client_method,
        Params=params,
        ExpiresIn=expires,
    )

url = sync_wrapper(_url)
async def _merge(self, path, filelist, **kwargs):
    """Create single S3 file from list of S3 files

    Uses multi-part, no data is downloaded. The original files are
    not deleted.

    Parameters
    ----------
    path : str
        The final file to produce
    filelist : list of str
        The paths, in order, to assemble into the final file.
    """
    bucket, key, version_id = self.split_path(path)
    if version_id:
        raise ValueError("Cannot write to an explicit versioned file!")
    mpu = await self._call_s3(
        "create_multipart_upload", kwargs, Bucket=bucket, Key=key
    )
    # TODO: Make this support versions?
    # server-side copy of each source file as one MPU part, concurrently
    out = await asyncio.gather(
        *[
            self._call_s3(
                "upload_part_copy",
                kwargs,
                Bucket=bucket,
                Key=key,
                UploadId=mpu["UploadId"],
                CopySource=f,
                PartNumber=i + 1,
            )
            for (i, f) in enumerate(filelist)
        ]
    )
    parts = [
        {"PartNumber": i + 1, "ETag": o["CopyPartResult"]["ETag"]}
        for (i, o) in enumerate(out)
    ]
    part_info = {"Parts": parts}
    await self._call_s3(
        "complete_multipart_upload",
        Bucket=bucket,
        Key=key,
        UploadId=mpu["UploadId"],
        MultipartUpload=part_info,
    )
    self.invalidate_cache(path)

merge = sync_wrapper(_merge)
async def _copy_basic(self, path1, path2, **kwargs):
"""Copy file between locations on S3
Not allowed where the origin is >5GB - use copy_managed
"""
buc1, key1, ver1 = self.split_path(path1)
buc2, key2, ver2 = self.split_path(path2)
if ver2:
raise ValueError("Cannot copy to a versioned file!")
try:
copy_src = {"Bucket": buc1, "Key": key1}
if ver1:
copy_src["VersionId"] = ver1
await self._call_s3(
"copy_object", kwargs, Bucket=buc2, Key=key2, CopySource=copy_src
)
except ClientError as e:
raise translate_boto_error(e)
except ParamValidationError as e:
raise ValueError("Copy failed (%r -> %r): %s" % (path1, path2, e)) from e
self.invalidate_cache(path2)
async def _copy_etag_preserved(self, path1, path2, size, total_parts, **kwargs):
    """Copy file between locations on S3 as multi-part while preserving
    the etag (using the same part sizes for each part"""
    bucket1, key1, version1 = self.split_path(path1)
    bucket2, key2, version2 = self.split_path(path2)

    mpu = await self._call_s3(
        "create_multipart_upload", Bucket=bucket2, Key=key2, **kwargs
    )

    # head each original part to learn the source part sizes
    part_infos = await asyncio.gather(
        *[
            self._call_s3("head_object", Bucket=bucket1, Key=key1, PartNumber=i)
            for i in range(1, total_parts + 1)
        ]
    )

    parts = []
    brange_first = 0
    for i, part_info in enumerate(part_infos, 1):
        part_size = part_info["ContentLength"]
        brange_last = brange_first + part_size - 1
        if brange_last > size:
            # byte ranges are inclusive; clamp the final part to the end
            brange_last = size - 1

        part = await self._call_s3(
            "upload_part_copy",
            Bucket=bucket2,
            Key=key2,
            PartNumber=i,
            UploadId=mpu["UploadId"],
            CopySource=path1,
            CopySourceRange="bytes=%i-%i" % (brange_first, brange_last),
        )
        parts.append({"PartNumber": i, "ETag": part["CopyPartResult"]["ETag"]})
        brange_first += part_size

    await self._call_s3(
        "complete_multipart_upload",
        Bucket=bucket2,
        Key=key2,
        UploadId=mpu["UploadId"],
        MultipartUpload={"Parts": parts},
    )
    self.invalidate_cache(path2)
async def _copy_managed(self, path1, path2, size, block=50 * 2**20, **kwargs):
    """Copy file between locations on S3 as multi-part

    block: int
        The size of the pieces, must be larger than 5MB and at most 5GB.
        Smaller blocks mean more calls, only useful for testing.
    """
    if block < 5 * 2**20 or block > 5 * 2**30:
        raise ValueError("Copy block size must be 5MB<=block<=5GB")
    bucket, key, version = self.split_path(path2)
    mpu = await self._call_s3(
        "create_multipart_upload", Bucket=bucket, Key=key, **kwargs
    )
    # attempting to do the following calls concurrently with gather causes
    # occasional "upload is smaller than the minimum allowed"
    out = [
        await self._call_s3(
            "upload_part_copy",
            Bucket=bucket,
            Key=key,
            PartNumber=i + 1,
            UploadId=mpu["UploadId"],
            CopySource=self._strip_protocol(path1),
            CopySourceRange="bytes=%i-%i" % (brange_first, brange_last),
        )
        for i, (brange_first, brange_last) in enumerate(_get_brange(size, block))
    ]
    # collect the per-part etags needed to finalise the multipart object
    parts = [
        {"PartNumber": i + 1, "ETag": o["CopyPartResult"]["ETag"]}
        for i, o in enumerate(out)
    ]
    await self._call_s3(
        "complete_multipart_upload",
        Bucket=bucket,
        Key=key,
        UploadId=mpu["UploadId"],
        MultipartUpload={"Parts": parts},
    )
    self.invalidate_cache(path2)
async def _cp_file(self, path1, path2, preserve_etag=None, **kwargs):
    """Copy file between locations on S3.

    preserve_etag: bool
        Whether to preserve etag while copying. If the file is uploaded
        as a single part, then it will be always equivalent to the md5
        hash of the file hence etag will always be preserved. But if the
        file is uploaded in multi parts, then this option will try to
        reproduce the same multipart upload while copying and preserve
        the generated etag.
    """
    path1 = self._strip_protocol(path1)
    bucket, key, vers = self.split_path(path1)

    info = await self._info(path1, bucket, key, version_id=vers)
    size = info["size"]

    # a multipart etag looks like "<md5>-<n_parts>"; a non-empty suffix
    # tells us how many parts the original upload used
    _, _, parts_suffix = info.get("ETag", "").strip('"').partition("-")
    if preserve_etag and parts_suffix:
        await self._copy_etag_preserved(
            path1, path2, size, total_parts=int(parts_suffix)
        )
    elif size <= MANAGED_COPY_THRESHOLD:
        # simple copy allowed for <5GB
        await self._copy_basic(path1, path2, **kwargs)
    else:
        # if the preserve_etag is true, either the file is uploaded
        # on multiple parts or the size is lower than 5GB
        assert not preserve_etag
        # serial multipart copy
        await self._copy_managed(path1, path2, size, **kwargs)
async def _list_multipart_uploads(self, bucket):
    """List in-progress multipart uploads for ``bucket``."""
    response = await self._call_s3("list_multipart_uploads", Bucket=bucket)
    # some S3 implementations answer with "Contents", AWS with "Uploads"
    return response.get("Contents", []) or response.get("Uploads", [])

list_multipart_uploads = sync_wrapper(_list_multipart_uploads)
async def _abort_mpu(self, bucket, key, mpu):
    """Abort the multipart upload identified by UploadId ``mpu``."""
    await self._call_s3(
        "abort_multipart_upload", Bucket=bucket, Key=key, UploadId=mpu
    )

abort_mpu = sync_wrapper(_abort_mpu)
async def _clear_multipart_uploads(self, bucket):
    """Remove any partial uploads in the bucket"""
    uploads = await self._list_multipart_uploads(bucket)
    # abort all pending uploads concurrently
    aborts = [
        self._abort_mpu(bucket, upload["Key"], upload["UploadId"])
        for upload in uploads
    ]
    await asyncio.gather(*aborts)

clear_multipart_uploads = sync_wrapper(_clear_multipart_uploads)
async def _bulk_delete(self, pathlist, **kwargs):
    """
    Remove multiple keys with one call

    Parameters
    ----------
    pathlist : list(str)
        The keys to remove, must all be in the same bucket.
        Must have 0 < len <= 1000

    Returns
    -------
    List of successfully deleted paths as "bucket/key"; per-key failures
    are not raised (see TODO below).
    """
    if not pathlist:
        return []
    buckets = {self.split_path(path)[0] for path in pathlist}
    if len(buckets) > 1:
        raise ValueError("Bulk delete files should refer to only one bucket")
    bucket = buckets.pop()
    if len(pathlist) > 1000:
        raise ValueError("Max number of files to delete in one call is 1000")
    delete_keys = {
        "Objects": [{"Key": self.split_path(path)[1]} for path in pathlist],
        "Quiet": True,
    }
    # drop cached listings for all affected parent directories
    for path in pathlist:
        self.invalidate_cache(self._parent(path))
    out = await self._call_s3(
        "delete_objects", kwargs, Bucket=bucket, Delete=delete_keys
    )
    # TODO: we report on successes but don't raise on any errors, effectively
    # on_error="omit"
    return [f"{bucket}/{_['Key']}" for _ in out.get("Deleted", [])]
async def _rm_file(self, path, **kwargs):
    """Delete a single key, translating S3 client errors to OS errors."""
    bucket, key, _ = self.split_path(path)
    self.invalidate_cache(path)
    try:
        await self._call_s3("delete_object", Bucket=bucket, Key=key)
    except ClientError as exc:
        raise translate_boto_error(exc)
async def _rm(self, path, recursive=False, **kwargs):
    """Delete keys under ``path``; with ``recursive`` also empty buckets.

    Parameters
    ----------
    path : str or list(str)
        Path(s) to remove
    recursive : bool
        If True, expand the path and remove everything below it
    """
    if recursive and isinstance(path, str):
        bucket, key, _ = self.split_path(path)
        if not key and await self._is_bucket_versioned(bucket):
            # special path to completely remove versioned bucket
            await self._rm_versioned_bucket_contents(bucket)
            # previously this fell through to _expand_path on the now
            # empty bucket; instead, remove the bucket itself and stop
            return await self._rmdir(bucket)
    paths = await self._expand_path(path, recursive=recursive)
    files = [p for p in paths if self.split_path(p)[1]]
    dirs = [p for p in paths if not self.split_path(p)[1]]
    # TODO: fails if more than one bucket in list
    # delete in batches of up to 1000 keys (the S3 API maximum per call)
    out = await _run_coros_in_chunks(
        [
            self._bulk_delete(files[i : i + 1000])
            for i in range(0, len(files), 1000)
        ],
        batch_size=3,
        nofiles=True,
    )
    await asyncio.gather(*[self._rmdir(d) for d in dirs])
    for p in paths:
        self.invalidate_cache(p)
        self.invalidate_cache(self._parent(p))
    return sum(out, [])
async def _is_bucket_versioned(self, bucket):
    """Return True when versioning is currently enabled on ``bucket``."""
    conf = await self._call_s3("get_bucket_versioning", Bucket=bucket)
    return conf.get("Status", "") == "Enabled"

is_bucket_versioned = sync_wrapper(_is_bucket_versioned)
async def _make_bucket_versioned(self, bucket, versioned: bool = True):
    """Set bucket versioning status"""
    config = {"Status": "Enabled" if versioned else "Suspended"}
    return await self._call_s3(
        "put_bucket_versioning", Bucket=bucket, VersioningConfiguration=config
    )

make_bucket_versioned = sync_wrapper(_make_bucket_versioned)
async def _rm_versioned_bucket_contents(self, bucket):
    """Remove a versioned bucket and all contents"""
    await self.set_session()
    s3 = await self.get_s3(bucket)
    pag = s3.get_paginator("list_object_versions")
    async for plist in pag.paginate(Bucket=bucket):
        # both object versions and delete markers must be removed for the
        # bucket to become truly empty
        obs = plist.get("Versions", []) + plist.get("DeleteMarkers", [])
        delete_keys = {
            "Objects": [
                {"Key": i["Key"], "VersionId": i["VersionId"]} for i in obs
            ],
            "Quiet": True,
        }
        if obs:
            await self._call_s3("delete_objects", Bucket=bucket, Delete=delete_keys)
def invalidate_cache(self, path=None):
    """Drop cached listings for ``path`` and all its ancestors, or everything."""
    if path is None:
        self.dircache.clear()
        return
    path = self._strip_protocol(path)
    # the path's presence/absence may change every ancestor's listing, so
    # walk up the tree popping each level (the loop also pops path itself)
    while path:
        self.dircache.pop(path, None)
        path = self._parent(path)
async def _walk(self, path, maxdepth=None, **kwargs):
    """Walk ``path`` recursively; refuses to crawl the whole of S3."""
    forbidden = ["", "*"] + [f"{p}://" for p in self.protocol]
    if path in forbidden:
        raise ValueError("Cannot crawl all of S3")
    async for entry in super()._walk(path, maxdepth=maxdepth, **kwargs):
        yield entry
def modified(self, path, version_id=None, refresh=False):
    """Return the last modified timestamp of file at `path` as a datetime"""
    details = self.info(path=path, version_id=version_id, refresh=refresh)
    if "LastModified" in details:
        return details["LastModified"]
    # buckets and folders do not currently have a modified date
    raise IsADirectoryError
def sign(self, path, expiration=100, **kwargs):
    """Return a signed HTTP URL for ``path``, valid for ``expiration``
    seconds; delegates to ``url``."""
    return self.url(path, expires=expiration, **kwargs)
async def _invalidate_region_cache(self):
    """Invalidate the region cache (associated with buckets)
    if ``cache_regions`` is turned on."""
    if not self.cache_regions:
        return None
    # the creator is only set once region caching has been initialized;
    # nothing to clear otherwise
    creator = getattr(self, "_s3creator", None)
    if creator is not None:
        await creator.clear()

invalidate_region_cache = sync_wrapper(_invalidate_region_cache)
async def open_async(self, path, mode="rb", **kwargs):
    """Return an async streaming file for ``path``.

    Only binary modes are supported, and compression is not available;
    previously this raised a bare ValueError with no diagnostic message.
    """
    if "b" not in mode or kwargs.get("compression"):
        raise ValueError("Only binary modes without compression are supported")
    return S3AsyncStreamedFile(self, path, mode)
class S3File(AbstractBufferedFile):
    """
    Open S3 key as a file. Data is only loaded and cached on demand.

    Parameters
    ----------
    s3 : S3FileSystem
        botocore connection
    path : string
        S3 bucket/key to access
    mode : str
        One of 'rb', 'wb', 'ab'. These have the same meaning
        as they do for the built-in `open` function.
    block_size : int
        read-ahead size for finding delimiters
    fill_cache : bool
        If seeking to new a part of the file beyond the current buffer,
        with this True, the buffer will be filled between the sections to
        best support random access. When reading only a few specific chunks
        out of a file, performance may be better if False.
    acl: str
        Canned ACL to apply
    version_id : str
        Optional version to read the file at. If not specified this will
        default to the current version of the object. This is only used for
        reading.
    requester_pays : bool (False)
        If RequesterPays buckets are supported.

    Examples
    --------
    >>> s3 = S3FileSystem()  # doctest: +SKIP
    >>> with s3.open('my-bucket/my-file.txt', mode='rb') as f:  # doctest: +SKIP
    ...     ...  # doctest: +SKIP

    See Also
    --------
    S3FileSystem.open: used to create ``S3File`` objects
    """

    retries = 5  # attempts used for ranged reads (see _fetch_range)
    part_min = 5 * 2**20  # AWS minimum multipart part size (5MB)
    part_max = 5 * 2**30  # AWS maximum multipart part size (5GB)

    def __init__(
        self,
        s3,
        path,
        mode="rb",
        block_size=50 * 2**20,
        acl=False,
        version_id=None,
        fill_cache=True,
        s3_additional_kwargs=None,
        autocommit=True,
        cache_type="readahead",
        requester_pays=False,
        cache_options=None,
        size=None,
    ):
        bucket, key, path_version_id = s3.split_path(path)
        if not key:
            raise ValueError("Attempt to open non key-like path: %s" % path)
        self.bucket = bucket
        self.key = key
        self.version_id = _coalesce_version_id(version_id, path_version_id)
        self.acl = acl
        if self.acl and self.acl not in key_acls:
            # fix: key_acls was previously passed as a second positional
            # argument to ValueError instead of being formatted into the
            # message, producing an unhelpful tuple repr
            raise ValueError("ACL not in %s" % key_acls)
        self.mpu = None
        self.parts = None
        self.fill_cache = fill_cache
        self.s3_additional_kwargs = s3_additional_kwargs or {}
        self.req_kw = {"RequestPayer": "requester"} if requester_pays else {}
        if "r" not in mode:
            if block_size < 5 * 2**20:
                raise ValueError("Block size must be >=5MB")
        else:
            if version_id and s3.version_aware:
                self.version_id = version_id
                self.details = s3.info(path, version_id=version_id)
                self.size = self.details["size"]
            elif s3.version_aware:
                # In this case we have not managed to get the VersionId out of details and
                # we should invalidate the cache and perform a full head_object since it
                # has likely been partially populated by ls.
                s3.invalidate_cache(path)
                self.details = s3.info(path)
                self.version_id = self.details.get("VersionId")
        super().__init__(
            s3,
            path,
            mode,
            block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options,
            size=size,
        )
        self.s3 = self.fs  # compatibility

        # when not using autocommit we want to have transactional state to manage
        self.append_block = False

        if "a" in mode and s3.exists(path):
            # See:
            # put: https://boto3.amazonaws.com/v1/documentation/api/latest
            # /reference/services/s3.html#S3.Client.put_object
            #
            # head: https://boto3.amazonaws.com/v1/documentation/api/latest
            # /reference/services/s3.html#S3.Client.head_object
            head = self._call_s3(
                "head_object",
                self.kwargs,
                Bucket=bucket,
                Key=key,
                **version_id_kw(version_id),
                **self.req_kw,
            )

            # keep only the head fields that must survive the re-upload and
            # that the caller did not explicitly override
            head = {
                key: value
                for key, value in head.items()
                if key in _PRESERVE_KWARGS and key not in self.s3_additional_kwargs
            }

            loc = head.pop("ContentLength")
            if loc < 5 * 2**20:
                # existing file too small for multi-upload: download
                self.write(self.fs.cat(self.path))
            else:
                self.append_block = True
            self.loc = loc

            # Reflect head
            self.s3_additional_kwargs.update(head)

        if "r" in mode and size is None and "ETag" in self.details:
            # pin reads to the version we saw at open time; a changed etag
            # later surfaces as FileExpired (see _fetch_range)
            self.req_kw["IfMatch"] = self.details["ETag"]

    def _call_s3(self, method, *kwarglist, **kwargs):
        """Delegate to the filesystem's call_s3, merging additional kwargs."""
        return self.fs.call_s3(method, self.s3_additional_kwargs, *kwarglist, **kwargs)

    def _initiate_upload(self):
        """Begin a multipart upload (and copy existing data when appending)."""
        if self.autocommit and not self.append_block and self.tell() < self.blocksize:
            # only happens when closing small file, use one-shot PUT
            return
        logger.debug("Initiate upload for %s" % self)
        self.parts = []
        kw = dict(
            Bucket=self.bucket,
            Key=self.key,
        )
        if self.acl:
            kw["ACL"] = self.acl
        self.mpu = self._call_s3("create_multipart_upload", **kw)

        if self.append_block:
            # use existing data in key when appending,
            # and block is big enough
            out = self._call_s3(
                "upload_part_copy",
                self.s3_additional_kwargs,
                Bucket=self.bucket,
                Key=self.key,
                PartNumber=1,
                UploadId=self.mpu["UploadId"],
                CopySource=self.path,
            )
            self.parts.append({"PartNumber": 1, "ETag": out["CopyPartResult"]["ETag"]})

    def metadata(self, refresh=False, **kwargs):
        """Return metadata of file.
        See :func:`~s3fs.S3Filesystem.metadata`.

        Metadata is cached unless `refresh=True`.
        """
        return self.fs.metadata(self.path, refresh, **kwargs)

    def getxattr(self, xattr_name, **kwargs):
        """Get an attribute from the metadata.
        See :func:`~s3fs.S3Filesystem.getxattr`.

        Examples
        --------
        >>> mys3file.getxattr('attribute_1')  # doctest: +SKIP
        'value_1'
        """
        return self.fs.getxattr(self.path, xattr_name, **kwargs)

    def setxattr(self, copy_kwargs=None, **kwargs):
        """Set metadata.
        See :func:`~s3fs.S3Filesystem.setxattr`.

        Examples
        --------
        >>> mys3file.setxattr(attribute_1='value1', attribute_2='value2')  # doctest: +SKIP
        """
        if self.writable():
            raise NotImplementedError(
                "cannot update metadata while file is open for writing"
            )
        return self.fs.setxattr(self.path, copy_kwargs=copy_kwargs, **kwargs)

    def url(self, **kwargs):
        """HTTP URL to read this file (if it already exists)"""
        return self.fs.url(self.path, **kwargs)

    def _fetch_range(self, start, end):
        """Read [start, end) bytes, mapping an etag mismatch to FileExpired."""
        try:
            return _fetch_range(
                self.fs,
                self.bucket,
                self.key,
                self.version_id,
                start,
                end,
                req_kw=self.req_kw,
            )
        except OSError as ex:
            if ex.args[0] == errno.EINVAL and "pre-conditions" in ex.args[1]:
                raise FileExpired(
                    filename=self.details["name"], e_tag=self.details.get("ETag")
                ) from ex
            else:
                raise

    def _upload_chunk(self, final=False):
        """Flush buffered data as one or more multipart parts.

        Returns False to tell fsspec's flush not to clear the buffer, since
        any remainder smaller than a part is carried over.
        """
        bucket, key, _ = self.fs.split_path(self.path)
        logger.debug(
            "Upload for %s, final=%s, loc=%s, buffer loc=%s"
            % (self, final, self.loc, self.buffer.tell())
        )
        if (
            self.autocommit
            and not self.append_block
            and final
            and self.tell() < self.blocksize
        ):
            # only happens when closing small file, use one-shot PUT
            pass
        else:
            self.buffer.seek(0)

            def upload_part(part_data: bytes):
                # upload one part and record its etag (and checksum if given)
                if len(part_data) == 0:
                    return
                part = len(self.parts) + 1
                logger.debug(
                    "Upload chunk %s, %s; %s bytes" % (self, part, len(part_data))
                )

                out = self._call_s3(
                    "upload_part",
                    Bucket=bucket,
                    PartNumber=part,
                    UploadId=self.mpu["UploadId"],
                    Body=part_data,
                    Key=key,
                )

                part_header = {"PartNumber": part, "ETag": out["ETag"]}
                if "ChecksumSHA256" in out:
                    part_header["ChecksumSHA256"] = out["ChecksumSHA256"]
                self.parts.append(part_header)

            def n_bytes_left() -> int:
                return len(self.buffer.getbuffer()) - self.buffer.tell()

            min_chunk = 1 if final else self.blocksize
            # TODO: concurrent here
            if self.fs.fixed_upload_size:
                # all chunks have fixed size, exception: last one can be smaller
                while n_bytes_left() >= min_chunk:
                    upload_part(self.buffer.read(self.blocksize))
            else:
                while n_bytes_left() >= min_chunk:
                    upload_part(self.buffer.read(self.part_max))

        if self.autocommit and final:
            self.commit()
        else:
            # update 'upload offset'
            self.offset += self.buffer.tell()
            # create new smaller buffer, seek to file end
            self.buffer = io.BytesIO(self.buffer.read())
            self.buffer.seek(0, 2)

        return False  # instruct fsspec.flush to NOT clear self.buffer

    def commit(self):
        """Finalise the write: touch, one-shot PUT, or complete the MPU."""
        logger.debug("Commit %s" % self)
        # "x" mode requests exclusive creation
        match = {"IfNoneMatch": "*"} if "x" in self.mode else {}
        if self.tell() == 0:
            if self.buffer is not None:
                logger.debug("Empty file committed %s" % self)
                self._abort_mpu()
                write_result = self.fs.touch(self.path, **self.kwargs)
        elif not self.parts:
            if self.buffer is not None:
                logger.debug("One-shot upload of %s" % self)
                self.buffer.seek(0)
                data = self.buffer.read()
                kw = dict(Key=self.key, Bucket=self.bucket, Body=data, **self.kwargs)
                if self.acl:
                    kw["ACL"] = self.acl
                write_result = self._call_s3("put_object", **kw, **match)
            else:
                raise RuntimeError
        else:
            logger.debug("Complete multi-part upload for %s " % self)
            part_info = {"Parts": self.parts}
            write_result = self._call_s3(
                "complete_multipart_upload",
                Bucket=self.bucket,
                Key=self.key,
                UploadId=self.mpu["UploadId"],
                MultipartUpload=part_info,
                **match,
            )
        if self.fs.version_aware:
            self.version_id = write_result.get("VersionId")

        # complex cache invalidation, since file's appearance can cause several
        # directories
        self.buffer = None
        parts = self.path.split("/")
        path = parts[0]
        for p in parts[1:]:
            if path in self.fs.dircache and not [
                True for f in self.fs.dircache[path] if f["name"] == path + "/" + p
            ]:
                self.fs.invalidate_cache(path)
            path = path + "/" + p

    def discard(self):
        """Abort any in-flight multipart upload and make the file unusable."""
        self._abort_mpu()
        self.buffer = None  # file becomes unusable

    def _abort_mpu(self):
        # safe to call repeatedly; only acts while an upload is in flight
        if self.mpu:
            self.fs.abort_mpu(self.bucket, self.key, self.mpu["UploadId"])
            self.mpu = None
class S3AsyncStreamedFile(AbstractAsyncStreamedFile):
    """Forward-only async reader over a single S3 object, opened lazily."""

    def __init__(self, fs, path, mode):
        self.fs = fs
        self.path = path
        self.mode = mode
        self.r = None  # streaming response body, created on first read
        self.loc = 0  # number of bytes consumed so far
        self.size = None  # total object size, known after the first read

    async def read(self, length=-1):
        # open the object stream on first use
        if self.r is None:
            bucket, key, gen = self.fs.split_path(self.path)
            r = await self.fs._call_s3("get_object", Bucket=bucket, Key=key)
            self.size = int(r["ResponseMetadata"]["HTTPHeaders"]["content-length"])
            self.r = r["Body"]
        # NOTE(review): a version id embedded in the path is not passed to
        # get_object here — confirm whether versioned reads are intended
        out = await self.r.read(length)
        self.loc += len(out)
        return out
def _fetch_range(fs, bucket, key, version_id, start, end, req_kw=None):
    """Synchronously read bytes ``start`` (inclusive) to ``end`` (exclusive)
    of an S3 object, by running the async fetch on the filesystem's loop."""
    if req_kw is None:
        req_kw = {}
    if start == end:
        # zero-length request cannot be expressed as an S3 Range header
        logger.debug(
            "skip fetch for negative range - bucket=%s,key=%s,start=%d,end=%d",
            bucket,
            key,
            start,
            end,
        )
        return b""
    logger.debug("Fetch: %s/%s, %s-%s", bucket, key, start, end)
    return sync(fs.loop, _inner_fetch, fs, bucket, key, version_id, start, end, req_kw)
async def _inner_fetch(fs, bucket, key, version_id, start, end, req_kw=None):
    """Fetch one byte range of an S3 object, with retries.

    Parameters
    ----------
    fs : S3FileSystem
        Filesystem providing ``_call_s3`` and ``retries``
    start, end : int
        Byte range to fetch; ``end`` is exclusive (translated to an
        inclusive S3 Range header)
    req_kw : dict, optional
        Extra request keywords (e.g. RequestPayer, IfMatch)
    """
    if req_kw is None:
        # previously ``**req_kw`` raised TypeError whenever the default
        # None was actually used; normalise to an empty mapping
        req_kw = {}

    async def _call_and_read():
        resp = await fs._call_s3(
            "get_object",
            Bucket=bucket,
            Key=key,
            Range="bytes=%i-%i" % (start, end - 1),
            **version_id_kw(version_id),
            **req_kw,
        )
        try:
            return await resp["Body"].read()
        finally:
            # always release the streaming body, even if read fails
            resp["Body"].close()

    return await _error_wrapper(_call_and_read, retries=fs.retries)
s3fs-2026.2.0/s3fs/errors.py 0000664 0000000 0000000 00000017431 15141211055 0015366 0 ustar 00root root 0000000 0000000 """S3 error codes adapted into more natural Python ones.
Adapted from: https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html
"""
import errno
import functools
# Fallback values since some systems might not have these.
ENAMETOOLONG = getattr(errno, "ENAMETOOLONG", errno.EINVAL)
ENOTEMPTY = getattr(errno, "ENOTEMPTY", errno.EINVAL)
EMSGSIZE = getattr(errno, "EMSGSIZE", errno.EINVAL)
EREMOTEIO = getattr(errno, "EREMOTEIO", errno.EIO)
EREMCHG = getattr(errno, "EREMCHG", errno.ENOENT)
# Map of S3 error codes (and bare HTTP status strings) to the Python
# exception used by translate_boto_error below. Entries are either an
# exception class or a functools.partial binding an errno to IOError.
ERROR_CODE_TO_EXCEPTION = {
    "AccessDenied": PermissionError,
    "AccountProblem": PermissionError,
    "AllAccessDisabled": PermissionError,
    "AmbiguousGrantByEmailAddress": functools.partial(IOError, errno.EINVAL),
    "AuthorizationHeaderMalformed": functools.partial(IOError, errno.EINVAL),
    "BadDigest": functools.partial(IOError, errno.EINVAL),
    "BucketAlreadyExists": FileExistsError,
    "BucketAlreadyOwnedByYou": FileExistsError,
    "BucketNotEmpty": functools.partial(IOError, ENOTEMPTY),
    "CredentialsNotSupported": functools.partial(IOError, errno.EINVAL),
    "CrossLocationLoggingProhibited": PermissionError,
    "EntityTooSmall": functools.partial(IOError, errno.EINVAL),
    "EntityTooLarge": functools.partial(IOError, EMSGSIZE),
    "ExpiredToken": PermissionError,
    "IllegalLocationConstraintException": PermissionError,
    "IllegalVersioningConfigurationException": functools.partial(IOError, errno.EINVAL),
    "IncompleteBody": functools.partial(IOError, errno.EINVAL),
    "IncorrectNumberOfFilesInPostRequest": functools.partial(IOError, errno.EINVAL),
    "InlineDataTooLarge": functools.partial(IOError, EMSGSIZE),
    "InternalError": functools.partial(IOError, EREMOTEIO),
    "InvalidAccessKeyId": PermissionError,
    "InvalidAddressingHeader": functools.partial(IOError, errno.EINVAL),
    "InvalidArgument": functools.partial(IOError, errno.EINVAL),
    "InvalidBucketName": functools.partial(IOError, errno.EINVAL),
    "InvalidBucketState": functools.partial(IOError, errno.EPERM),
    "InvalidDigest": functools.partial(IOError, errno.EINVAL),
    "InvalidEncryptionAlgorithmError": functools.partial(IOError, errno.EINVAL),
    "InvalidLocationConstraint": functools.partial(IOError, errno.EINVAL),
    "InvalidObjectState": PermissionError,
    "InvalidPart": functools.partial(IOError, errno.EINVAL),
    "InvalidPartOrder": functools.partial(IOError, errno.EINVAL),
    "InvalidPayer": PermissionError,
    "InvalidPolicyDocument": functools.partial(IOError, errno.EINVAL),
    "InvalidRange": functools.partial(IOError, errno.EINVAL),
    "InvalidRequest": functools.partial(IOError, errno.EINVAL),
    "InvalidSecurity": PermissionError,
    "InvalidSOAPRequest": functools.partial(IOError, errno.EINVAL),
    "InvalidStorageClass": functools.partial(IOError, errno.EINVAL),
    "InvalidTargetBucketForLogging": functools.partial(IOError, errno.EINVAL),
    "InvalidToken": functools.partial(IOError, errno.EINVAL),
    "InvalidURI": functools.partial(IOError, errno.EINVAL),
    "KeyTooLongError": functools.partial(IOError, ENAMETOOLONG),
    "MalformedACLError": functools.partial(IOError, errno.EINVAL),
    "MalformedPOSTRequest": functools.partial(IOError, errno.EINVAL),
    "MalformedXML": functools.partial(IOError, errno.EINVAL),
    "MaxMessageLengthExceeded": functools.partial(IOError, EMSGSIZE),
    "MaxPostPreDataLengthExceededError": functools.partial(IOError, EMSGSIZE),
    "MetadataTooLarge": functools.partial(IOError, EMSGSIZE),
    "MethodNotAllowed": functools.partial(IOError, errno.EPERM),
    "MissingAttachment": functools.partial(IOError, errno.EINVAL),
    "MissingContentLength": functools.partial(IOError, errno.EINVAL),
    "MissingRequestBodyError": functools.partial(IOError, errno.EINVAL),
    "MissingSecurityElement": functools.partial(IOError, errno.EINVAL),
    "MissingSecurityHeader": functools.partial(IOError, errno.EINVAL),
    "NoLoggingStatusForKey": functools.partial(IOError, errno.EINVAL),
    "NoSuchBucket": FileNotFoundError,
    "NoSuchBucketPolicy": FileNotFoundError,
    "NoSuchKey": FileNotFoundError,
    "NoSuchLifecycleConfiguration": FileNotFoundError,
    "NoSuchUpload": FileNotFoundError,
    "NoSuchVersion": FileNotFoundError,
    "NotImplemented": functools.partial(IOError, errno.ENOSYS),
    "NotSignedUp": PermissionError,
    "OperationAborted": functools.partial(IOError, errno.EBUSY),
    "PermanentRedirect": functools.partial(IOError, EREMCHG),
    "PreconditionFailed": functools.partial(IOError, errno.EINVAL),
    "Redirect": functools.partial(IOError, EREMCHG),
    "RestoreAlreadyInProgress": functools.partial(IOError, errno.EBUSY),
    "RequestIsNotMultiPartContent": functools.partial(IOError, errno.EINVAL),
    "RequestTimeout": TimeoutError,
    "RequestTimeTooSkewed": PermissionError,
    "RequestTorrentOfBucketError": functools.partial(IOError, errno.EPERM),
    "SignatureDoesNotMatch": PermissionError,
    "ServiceUnavailable": functools.partial(IOError, errno.EBUSY),
    "SlowDown": functools.partial(IOError, errno.EBUSY),
    "TemporaryRedirect": functools.partial(IOError, EREMCHG),
    "TokenRefreshRequired": functools.partial(IOError, errno.EINVAL),
    "TooManyBuckets": functools.partial(IOError, errno.EINVAL),
    "UnexpectedContent": functools.partial(IOError, errno.EINVAL),
    "UnresolvableGrantByEmailAddress": functools.partial(IOError, errno.EINVAL),
    "UserKeyMustBeSpecified": functools.partial(IOError, errno.EINVAL),
    # some backends report only the numeric HTTP status as the code
    "301": functools.partial(IOError, EREMCHG),  # PermanentRedirect
    "307": functools.partial(IOError, EREMCHG),  # Redirect
    "400": functools.partial(IOError, errno.EINVAL),
    "403": PermissionError,
    "404": FileNotFoundError,
    "405": functools.partial(IOError, errno.EPERM),
    "409": functools.partial(IOError, errno.EBUSY),
    "412": functools.partial(IOError, errno.EINVAL),  # PreconditionFailed
    "416": functools.partial(IOError, errno.EINVAL),  # InvalidRange
    "500": functools.partial(IOError, EREMOTEIO),  # InternalError
    "501": functools.partial(IOError, errno.ENOSYS),  # NotImplemented
    "503": functools.partial(IOError, errno.EBUSY),  # SlowDown
}
def translate_boto_error(error, message=None, set_cause=True, *args, **kwargs):
    """Convert a ClientError exception into a Python one.

    Parameters
    ----------
    error : botocore.exceptions.ClientError
        The exception returned by the boto API.
    message : str
        An error message to use for the returned exception. If not given, the
        error message returned by the server is used instead.
    set_cause : bool
        Whether to set the __cause__ attribute to the previous exception if the
        exception is translated.
    *args, **kwargs :
        Additional arguments to pass to the exception constructor, after the
        error message. Useful for passing the filename arguments to ``IOError``.

    Returns
    -------
    An instantiated exception ready to be thrown. If the error code isn't
    recognized, an IOError with the original error message is returned.
    """
    response = getattr(error, "response", None)
    if response is None:
        # not an http-style error, or no response attached: pass through
        return error
    error_info = response["Error"]
    code = error_info.get("Code")
    # a failed If-None-Match precondition means the object already exists
    is_if_none_match = (
        code == "PreconditionFailed"
        and error_info.get("Condition", "") == "If-None-Match"
    )
    constructor = (
        FileExistsError if is_if_none_match else ERROR_CODE_TO_EXCEPTION.get(code)
    )
    if constructor:
        custom_exc = constructor(
            message or error_info.get("Message", str(error)), *args, **kwargs
        )
    else:
        # No match found, wrap this in an IOError with the appropriate message.
        custom_exc = OSError(errno.EIO, message or str(error), *args)
    if set_cause:
        custom_exc.__cause__ = error
    return custom_exc
s3fs-2026.2.0/s3fs/mapping.py 0000664 0000000 0000000 00000000355 15141211055 0015502 0 ustar 00root root 0000000 0000000 from .core import S3FileSystem
def S3Map(root, s3, check=False, create=False):
    """Mirror previous class, not implemented in fsspec"""
    if not s3:
        # fall back to the most recently instantiated filesystem
        s3 = S3FileSystem.current()
    return s3.get_mapper(root, check=check, create=create)
s3fs-2026.2.0/s3fs/tests/ 0000775 0000000 0000000 00000000000 15141211055 0014634 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/s3fs/tests/__init__.py 0000664 0000000 0000000 00000000000 15141211055 0016733 0 ustar 00root root 0000000 0000000 s3fs-2026.2.0/s3fs/tests/derived/ 0000775 0000000 0000000 00000000000 15141211055 0016256 5 ustar 00root root 0000000 0000000 s3fs-2026.2.0/s3fs/tests/derived/__init__.py 0000664 0000000 0000000 00000000000 15141211055 0020355 0 ustar 00root root 0000000 0000000 s3fs-2026.2.0/s3fs/tests/derived/s3fs_fixtures.py 0000664 0000000 0000000 00000005451 15141211055 0021444 0 ustar 00root root 0000000 0000000 import json
import os
import pytest
import requests
import time
from fsspec.tests.abstract import AbstractFixtures
from s3fs.core import S3FileSystem
# Bucket names created by the fixtures below
test_bucket_name = "test"
secure_bucket_name = "test-secure"  # bucket whose policy denies unencrypted PUTs
versioned_bucket_name = "test-versioned"
port = 5556  # local moto S3 server port
endpoint_uri = "http://127.0.0.1:%s/" % port
class S3fsFixtures(AbstractFixtures):
    """Fixtures wiring fsspec's generic abstract tests to a moto-backed S3."""

    @pytest.fixture(scope="class")
    def fs(self, _s3_base, _get_boto3_client):
        # create the plain, versioned and policy-protected buckets once per
        # test class, then hand out a fresh S3FileSystem instance
        client = _get_boto3_client
        client.create_bucket(Bucket=test_bucket_name, ACL="public-read")
        client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read")
        client.put_bucket_versioning(
            Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"}
        )

        # initialize secure bucket
        client.create_bucket(Bucket=secure_bucket_name, ACL="public-read")
        policy = json.dumps(
            {
                "Version": "2012-10-17",
                "Id": "PutObjPolicy",
                "Statement": [
                    {
                        "Sid": "DenyUnEncryptedObjectUploads",
                        "Effect": "Deny",
                        "Principal": "*",
                        "Action": "s3:PutObject",
                        "Resource": f"arn:aws:s3:::{secure_bucket_name}/*",
                        "Condition": {
                            "StringNotEquals": {
                                "s3:x-amz-server-side-encryption": "aws:kms"
                            }
                        },
                    }
                ],
            }
        )
        client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy)
        S3FileSystem.clear_instance_cache()
        s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri})
        s3.invalidate_cache()
        yield s3

    @pytest.fixture
    def fs_path(self):
        # root path used by the abstract test suite
        return test_bucket_name

    @pytest.fixture
    def supports_empty_directories(self):
        # S3 has no real directories, so empty "directories" cannot persist
        return False

    @pytest.fixture(scope="class")
    def _get_boto3_client(self):
        from botocore.session import Session

        # NB: we use the sync botocore client for setup
        session = Session()
        return session.create_client("s3", endpoint_url=endpoint_uri)

    @pytest.fixture(scope="class")
    def _s3_base(self):
        # copy of s3_base in test_s3fs
        from moto.moto_server.threaded_moto_server import ThreadedMotoServer

        server = ThreadedMotoServer(ip_address="127.0.0.1", port=port)
        server.start()
        # moto requires some credentials to be present in the environment
        if "AWS_SECRET_ACCESS_KEY" not in os.environ:
            os.environ["AWS_SECRET_ACCESS_KEY"] = "foo"
        if "AWS_ACCESS_KEY_ID" not in os.environ:
            os.environ["AWS_ACCESS_KEY_ID"] = "foo"
        print("server up")
        yield
        print("moto done")
        server.stop()
s3fs-2026.2.0/s3fs/tests/derived/s3fs_test.py 0000664 0000000 0000000 00000001766 15141211055 0020557 0 ustar 00root root 0000000 0000000 import pytest
import fsspec.tests.abstract as abstract
from s3fs.tests.derived.s3fs_fixtures import S3fsFixtures
class TestS3fsCopy(abstract.AbstractCopyTests, S3fsFixtures):
    """Run fsspec's generic copy test suite against s3fs."""

    pass
class TestS3fsGet(abstract.AbstractGetTests, S3fsFixtures):
    """Run fsspec's generic get (download) test suite against s3fs."""

    pass
class TestS3fsPut(abstract.AbstractPutTests, S3fsFixtures):
    """Run fsspec's generic put (upload) test suite against s3fs."""

    pass
def botocore_too_old():
    """True when the installed botocore predates the minimum tested version."""
    from packaging.version import parse

    import botocore

    minimum = "1.33.2"
    return parse(botocore.__version__) < parse(minimum)
class TestS3fsPipe(abstract.AbstractPipeTests, S3fsFixtures):
    """Generic pipe tests; exclusive writes need a sufficiently new botocore."""

    test_pipe_exclusive = pytest.mark.skipif(
        botocore_too_old(), reason="Older botocore doesn't support exclusive writes"
    )(abstract.AbstractPipeTests.test_pipe_exclusive)
class TestS3fsOpen(abstract.AbstractOpenTests, S3fsFixtures):
    """Generic open tests; exclusive-open is a known gap in moto."""

    test_open_exclusive = pytest.mark.xfail(
        reason="complete_multipart_upload doesn't implement condition in moto"
    )(abstract.AbstractOpenTests.test_open_exclusive)
s3fs-2026.2.0/s3fs/tests/test_custom_error_handler.py 0000664 0000000 0000000 00000016540 15141211055 0022473 0 ustar 00root root 0000000 0000000 """Tests for custom error handler functionality."""
import asyncio
import pytest
from botocore.exceptions import ClientError
import s3fs.core
from s3fs.core import (
S3FileSystem,
_error_wrapper,
set_custom_error_handler,
add_retryable_error,
)
# Custom exception types for testing
class CustomRetryableError(Exception):
    """Test-only exception representing an error that should be retried."""
class CustomNonRetryableError(Exception):
    """Test-only exception representing an error that must not be retried."""
@pytest.fixture(autouse=True)
def reset_error_handler():
    """Reset the custom error handler and retryable errors after each test."""
    # snapshot module state so each test starts from clean defaults
    original_errors = s3fs.core.S3_RETRYABLE_ERRORS
    yield
    # Reset to default handler
    s3fs.core.CUSTOM_ERROR_HANDLER = lambda e: False
    # Reset retryable errors tuple
    s3fs.core.S3_RETRYABLE_ERRORS = original_errors
def test_handler_retry_on_custom_exception():
    """Test that custom error handler allows retrying on custom exceptions."""
    call_count = 0

    async def failing_func():
        # fails twice, then succeeds; counts invocations to prove retries ran
        nonlocal call_count
        call_count += 1
        if call_count < 3:
            raise CustomRetryableError("Custom error that should retry")
        return "success"

    # Set up custom handler to retry CustomRetryableError
    def custom_handler(e):
        return isinstance(e, CustomRetryableError)

    set_custom_error_handler(custom_handler)

    # Should retry and eventually succeed
    async def run_test():
        result = await _error_wrapper(failing_func, retries=5)
        assert result == "success"
        assert call_count == 3  # Failed twice, succeeded on third attempt

    asyncio.run(run_test())
def test_handler_no_retry_on_other_exception():
    """Test that custom error handler does not retry exceptions it doesn't handle."""
    call_count = 0

    async def failing_func():
        # always fails with an exception the handler does not cover
        nonlocal call_count
        call_count += 1
        raise CustomNonRetryableError("Custom error that should not retry")

    # Set up custom handler that only retries CustomRetryableError
    def custom_handler(e):
        return isinstance(e, CustomRetryableError)

    set_custom_error_handler(custom_handler)

    # Should not retry and fail immediately
    async def run_test():
        with pytest.raises(CustomNonRetryableError):
            await _error_wrapper(failing_func, retries=5)
        assert call_count == 1  # Should only be called once

    asyncio.run(run_test())
def test_handler_with_client_error():
    """Test that custom handler can make ClientError retryable."""
    call_count = 0

    async def failing_func():
        nonlocal call_count
        call_count += 1
        if call_count < 3:
            # Create a ClientError that doesn't match the built-in retry patterns
            error_response = {
                "Error": {
                    "Code": "CustomThrottlingError",
                    "Message": "Custom throttling message",
                }
            }
            raise ClientError(error_response, "operation_name")
        return "success"

    # Set up custom handler to retry on specific ClientError codes
    def custom_handler(e):
        if isinstance(e, ClientError):
            return e.response.get("Error", {}).get("Code") == "CustomThrottlingError"
        return False

    set_custom_error_handler(custom_handler)

    # Should retry and eventually succeed
    async def run_test():
        result = await _error_wrapper(failing_func, retries=5)
        assert result == "success"
        assert call_count == 3

    asyncio.run(run_test())
def test_handler_preserves_builtin_retry_pattern():
"""Test that custom handler doesn't interfere with built-in retry logic."""
call_count = 0
async def failing_func():
nonlocal call_count
call_count += 1
if call_count < 3:
# SlowDown is a built-in retryable pattern
error_response = {
"Error": {
"Code": "SlowDown",
"Message": "Please reduce your request rate",
}
}
raise ClientError(error_response, "operation_name")
return "success"
# Set up a custom handler that handles something else
def custom_handler(e):
return isinstance(e, CustomRetryableError)
set_custom_error_handler(custom_handler)
# Should still retry SlowDown errors due to built-in logic
async def run_test():
result = await _error_wrapper(failing_func, retries=5)
assert result == "success"
assert call_count == 3
asyncio.run(run_test())
def test_handler_max_retries():
    """Test that custom handler respects max retries."""
    call_count = 0
    async def always_failing_func():
        # Never succeeds, so the wrapper must eventually give up.
        nonlocal call_count
        call_count += 1
        raise CustomRetryableError("Always fails")
    def custom_handler(e):
        return isinstance(e, CustomRetryableError)
    set_custom_error_handler(custom_handler)
    # Should retry up to retries limit then raise
    async def run_test():
        with pytest.raises(CustomRetryableError):
            await _error_wrapper(always_failing_func, retries=3)
        assert call_count == 3  # exactly `retries` attempts, no more
    asyncio.run(run_test())
def test_handler_sleep_behavior():
    """Test that retries due to custom handler also wait between attempts."""
    call_times = []
    async def failing_func():
        # get_running_loop() is the supported call from inside a coroutine;
        # asyncio.get_event_loop() is deprecated in that context since 3.10.
        call_times.append(asyncio.get_running_loop().time())
        raise CustomRetryableError("Retry me")
    def custom_handler(e):
        return isinstance(e, CustomRetryableError)
    set_custom_error_handler(custom_handler)
    async def run_test():
        with pytest.raises(CustomRetryableError):
            await _error_wrapper(failing_func, retries=3)
        # Should have made 3 attempts
        assert len(call_times) == 3
        # Check that there was a delay between attempts
        # The wait time formula is min(1.7**i * 0.1, 15)
        # For i=0: min(0.1, 15) = 0.1
        # For i=1: min(0.17, 15) = 0.17
        if len(call_times) >= 2:
            time_between_first_and_second = call_times[1] - call_times[0]
            # Should be roughly 0.1 seconds (with some tolerance)
            assert time_between_first_and_second >= 0.05
    asyncio.run(run_test())
def test_default_handler():
    """Test behavior when custom handler is not set explicitly."""
    call_count = 0
    async def failing_func():
        # Raises a plain stdlib exception that nothing marks retryable.
        nonlocal call_count
        call_count += 1
        raise ValueError("Regular exception")
    # Don't set a custom handler, use default (returns False)
    # Should not retry regular exceptions
    async def run_test():
        with pytest.raises(ValueError):
            await _error_wrapper(failing_func, retries=5)
        assert call_count == 1
    asyncio.run(run_test())
def test_add_retryable_error():
    """Test adding a custom exception to the retryable errors tuple."""
    call_count = 0
    async def failing_func():
        # Fails twice, then succeeds; retried via the extended tuple.
        nonlocal call_count
        call_count += 1
        if call_count < 3:
            raise CustomRetryableError("Custom error")
        return "success"
    # Add CustomRetryableError to the retryable errors
    add_retryable_error(CustomRetryableError)
    # Should now be retried automatically without custom handler
    async def run_test():
        result = await _error_wrapper(failing_func, retries=5)
        assert result == "success"
        assert call_count == 3
    asyncio.run(run_test())
s3fs-2026.2.0/s3fs/tests/test_mapping.py 0000664 0000000 0000000 00000005453 15141211055 0017707 0 ustar 00root root 0000000 0000000 import pytest
from s3fs.tests.test_s3fs import s3_base, s3, test_bucket_name
from s3fs import S3Map, S3FileSystem
# All mapper tests below operate under this prefix inside the shared test bucket.
root = test_bucket_name + "/mapping"
def test_simple(s3):
    """A fresh mapper over an empty prefix behaves like an empty dict."""
    d = s3.get_mapper(root)
    assert not d
    assert list(d) == list(d.keys()) == []
    assert list(d.values()) == []
    assert list(d.items()) == []
    s3.get_mapper(root)
def test_default_s3filesystem(s3):
    """The mapper reuses the filesystem instance it was created from."""
    d = s3.get_mapper(root)
    assert d.fs is s3
def test_errors(s3):
    """Missing keys raise KeyError; check=True on a missing root names the path."""
    d = s3.get_mapper(root)
    with pytest.raises(KeyError):
        d["nonexistent"]
    # The original try/except silently passed when no exception was raised at
    # all; pytest.raises makes the expectation explicit and also checks that
    # the offending path appears in the error message.
    with pytest.raises(Exception, match="does-not-exist"):
        s3.get_mapper("does-not-exist", check=True)
def test_with_data(s3):
    """Writes through the mapper are visible via dict protocol and s3.find."""
    d = s3.get_mapper(root)
    d["x"] = b"123"
    assert list(d) == list(d.keys()) == ["x"]
    assert list(d.values()) == [b"123"]
    assert list(d.items()) == [("x", b"123")]
    assert d["x"] == b"123"
    assert bool(d)
    assert s3.find(root) == [test_bucket_name + "/mapping/x"]
    # overwriting an existing key replaces its value
    d["x"] = b"000"
    assert d["x"] == b"000"
    d["y"] = b"456"
    assert d["y"] == b"456"
    assert set(d) == {"x", "y"}
    d.clear()
    assert list(d) == []
def test_complex_keys(s3):
    """Non-string keys (ints, tuples) are accepted and round-trip correctly."""
    d = s3.get_mapper(root)
    d[1] = b"hello"
    assert d[1] == b"hello"
    del d[1]
    d[1, 2] = b"world"
    assert d[1, 2] == b"world"
    del d[1, 2]
    d["x", 1, 2] = b"hello world"
    assert d["x", 1, 2] == b"hello world"
    assert ("x", 1, 2) in d
def test_clear_empty(s3):
    """clear() empties the mapper and is a no-op when already empty."""
    d = s3.get_mapper(root)
    d.clear()
    assert list(d) == []
    d[1] = b"1"
    assert list(d) == ["1"]  # int key is stored under its string form
    d.clear()
    assert list(d) == []
def test_no_dircache(s3):
    """Mapper operations work with the listings cache disabled."""
    from s3fs.tests.test_s3fs import endpoint_uri
    import fsspec
    d = fsspec.get_mapper(
        "s3://" + root,
        anon=False,
        client_kwargs={"endpoint_url": endpoint_uri},
        use_listings_cache=False,
    )
    d.clear()
    assert list(d) == []
    d[1] = b"1"
    assert list(d) == ["1"]
    d.clear()
    assert list(d) == []
def test_pickle(s3):
    """A mapper survives a pickle round-trip and still sees the same data."""
    d = s3.get_mapper(root)
    d["x"] = b"1"
    import pickle
    d2 = pickle.loads(pickle.dumps(d))
    assert d2["x"] == b"1"
def test_array(s3):
    """Buffer-protocol values (array.array) are stored as their raw bytes."""
    from array import array
    d = s3.get_mapper(root)
    d["x"] = array("B", [65] * 1000)
    assert d["x"] == b"A" * 1000
def test_bytearray(s3):
    """bytearray values are stored and read back as bytes."""
    d = s3.get_mapper(root)
    d["x"] = bytearray(b"123")
    assert d["x"] == b"123"
def test_new_bucket(s3):
    """get_mapper on a missing bucket: check=True fails, create=True creates it."""
    # pytest.raises replaces the try/assert-False/except anti-pattern and
    # verifies the error message mentions the `create` option.
    with pytest.raises(ValueError, match="create"):
        s3.get_mapper("new-bucket", check=True)
    d = s3.get_mapper("new-bucket", create=True)
    assert not d
    # Sub-directories of an existing bucket need no explicit creation.
    d = s3.get_mapper("new-bucket/new-directory")
    assert not d
def test_old_api(s3):
    """The legacy S3Map constructor still returns an fsspec FSMap."""
    import fsspec.mapping
    assert isinstance(S3Map(root, s3), fsspec.mapping.FSMap)
s3fs-2026.2.0/s3fs/tests/test_s3fs.py 0000664 0000000 0000000 00000273110 15141211055 0017127 0 ustar 00root root 0000000 0000000 import asyncio
import errno
import datetime
from contextlib import contextmanager
import json
from concurrent.futures import ProcessPoolExecutor
import io
import os
import random
import requests
import time
import sys
import pytest
import moto
from moto.moto_server.threaded_moto_server import ThreadedMotoServer
from itertools import chain
import fsspec.core
from dateutil.tz import tzutc
import botocore
import s3fs.core
from s3fs.core import MAX_UPLOAD_PARTS, S3FileSystem, calculate_chunksize
from s3fs.utils import ignoring, SSEParams
from botocore.exceptions import NoCredentialsError
from fsspec.asyn import sync
from fsspec.callbacks import Callback
from packaging import version
# Names of the buckets created fresh for every test via the `s3` fixture.
test_bucket_name = "test"
secure_bucket_name = "test-secure"
versioned_bucket_name = "test-versioned"
# Fixture objects uploaded into the test bucket before each test.
files = {
    "test/accounts.1.json": (
        b'{"amount": 100, "name": "Alice"}\n'
        b'{"amount": 200, "name": "Bob"}\n'
        b'{"amount": 300, "name": "Charlie"}\n'
        b'{"amount": 400, "name": "Dennis"}\n'
    ),
    "test/accounts.2.json": (
        b'{"amount": 500, "name": "Alice"}\n'
        b'{"amount": 600, "name": "Bob"}\n'
        b'{"amount": 700, "name": "Charlie"}\n'
        b'{"amount": 800, "name": "Dennis"}\n'
    ),
}
csv_files = {
    "2014-01-01.csv": (
        b"name,amount,id\n" b"Alice,100,1\n" b"Bob,200,2\n" b"Charlie,300,3\n"
    ),
    "2014-01-02.csv": (b"name,amount,id\n"),
    "2014-01-03.csv": (
        b"name,amount,id\n" b"Dennis,400,4\n" b"Edith,500,5\n" b"Frank,600,6\n"
    ),
}
text_files = {
    "nested/file1": b"hello\n",
    "nested/file2": b"world",
    "nested/nested2/file1": b"hello\n",
    "nested/nested2/file2": b"world",
}
# Two names differing only at the "." position, for glob dot-handling tests.
glob_files = {"file.dat": b"", "filexdat": b""}
# Shorthand object paths reused across many tests.
a = test_bucket_name + "/tmp/test/a"
b = test_bucket_name + "/tmp/test/b"
c = test_bucket_name + "/tmp/test/c"
d = test_bucket_name + "/tmp/test/d"
# Address of the local moto server used as the S3 endpoint.
port = 5555
endpoint_uri = "http://127.0.0.1:%s/" % port
@pytest.fixture(scope="module")
def s3_base():
    """Run a local moto S3 server for the whole test module."""
    # writable local S3 system
    # This fixture is module-scoped, meaning that we can re-use the MotoServer across all tests
    server = ThreadedMotoServer(ip_address="127.0.0.1", port=port)
    server.start()
    # moto requires *some* credentials to be present in the environment
    if "AWS_SECRET_ACCESS_KEY" not in os.environ:
        os.environ["AWS_SECRET_ACCESS_KEY"] = "foo"
    if "AWS_ACCESS_KEY_ID" not in os.environ:
        os.environ["AWS_ACCESS_KEY_ID"] = "foo"
    # a configured profile could override the fake credentials above
    os.environ.pop("AWS_PROFILE", None)
    print("server up")
    yield
    print("moto done")
    server.stop()
@pytest.fixture(autouse=True)
def reset_s3_fixture():
    """Wipe all moto state before each test so every test starts clean."""
    # We reuse the MotoServer for all tests
    # But we do want a clean state for every test
    try:
        requests.post(f"{endpoint_uri}/moto-api/reset")
    except requests.exceptions.RequestException:
        # Best-effort: the server may not be running for tests that don't
        # use it. A bare `except:` here would also swallow KeyboardInterrupt.
        pass
def get_boto3_client():
    """Return a synchronous botocore S3 client pointed at the moto server."""
    from botocore.session import Session
    # NB: we use the sync botocore client for setup
    session = Session()
    return session.create_client("s3", endpoint_url=endpoint_uri)
@pytest.fixture()
def s3(s3_base):
    """Create the standard/versioned/secure buckets, upload fixtures, yield an S3FileSystem."""
    client = get_boto3_client()
    client.create_bucket(Bucket=test_bucket_name, ACL="public-read")
    client.create_bucket(Bucket=versioned_bucket_name, ACL="public-read")
    client.put_bucket_versioning(
        Bucket=versioned_bucket_name, VersioningConfiguration={"Status": "Enabled"}
    )
    # initialize secure bucket
    client.create_bucket(Bucket=secure_bucket_name, ACL="public-read")
    # bucket policy that rejects uploads without KMS server-side encryption
    policy = json.dumps(
        {
            "Version": "2012-10-17",
            "Id": "PutObjPolicy",
            "Statement": [
                {
                    "Sid": "DenyUnEncryptedObjectUploads",
                    "Effect": "Deny",
                    "Principal": "*",
                    "Action": "s3:PutObject",
                    "Resource": f"arn:aws:s3:::{secure_bucket_name}/*",
                    "Condition": {
                        "StringNotEquals": {
                            "s3:x-amz-server-side-encryption": "aws:kms"
                        }
                    },
                }
            ],
        }
    )
    client.put_bucket_policy(Bucket=secure_bucket_name, Policy=policy)
    # seed the test bucket with all fixture objects
    for flist in [files, csv_files, text_files, glob_files]:
        for f, data in flist.items():
            client.put_object(Bucket=test_bucket_name, Key=f, Body=data)
    # fresh filesystem instance per test, with no cached state
    S3FileSystem.clear_instance_cache()
    s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri})
    s3.invalidate_cache()
    yield s3
@contextmanager
def expect_errno(expected_errno):
    """Expect an OSError and validate its errno code."""
    with pytest.raises(OSError) as error:
        yield
    # error.value is populated once the with-block above has exited
    assert error.value.errno == expected_errno, "OSError has wrong error code."
def test_simple(s3):
    """A 10 MiB write/read round-trip returns identical bytes."""
    data = b"a" * (10 * 2**20)
    with s3.open(a, "wb") as f:
        f.write(data)
    with s3.open(a, "rb") as f:
        out = f.read(len(data))
        assert len(data) == len(out)
        assert out == data
def test_with_size(s3):
    """An explicit size= on open caps the apparent file size and reads."""
    data = b"a" * (10 * 2**20)
    with s3.open(a, "wb") as f:
        f.write(data)
    with s3.open(a, "rb", size=100) as f:
        assert f.size == 100
        out = f.read()
        assert len(out) == 100
@pytest.mark.parametrize("default_cache_type", ["none", "bytes", "mmap", "readahead"])
def test_default_cache_type(s3, default_cache_type):
    """default_cache_type selects the fsspec cache class used by open files."""
    data = b"a" * (10 * 2**20)
    s3 = S3FileSystem(
        anon=False,
        default_cache_type=default_cache_type,
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    with s3.open(a, "wb") as f:
        f.write(data)
    with s3.open(a, "rb") as f:
        assert isinstance(f.cache, fsspec.core.caches[default_cache_type])
        out = f.read(len(data))
        assert len(data) == len(out)
        assert out == data
def test_ssl_off():
    """use_ssl=False produces an http:// endpoint."""
    s3 = S3FileSystem(use_ssl=False, client_kwargs={"endpoint_url": endpoint_uri})
    assert s3.s3.meta.endpoint_url.startswith("http://")
def test_client_kwargs():
    """client_kwargs endpoint_url is passed through to the underlying client."""
    s3 = S3FileSystem(client_kwargs={"endpoint_url": "http://foo"})
    assert s3.s3.meta.endpoint_url.startswith("http://foo")
def test_config_kwargs():
    """config_kwargs are applied to the botocore client config."""
    s3 = S3FileSystem(
        config_kwargs={"signature_version": "s3v4"},
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    assert s3.connect().meta.config.signature_version == "s3v4"
def test_config_kwargs_class_attributes_default():
    """Without overrides, the class default timeouts (5s/15s) are used."""
    s3 = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri})
    assert s3.connect().meta.config.connect_timeout == 5
    assert s3.connect().meta.config.read_timeout == 15
def test_config_kwargs_class_attributes_override():
    """config_kwargs timeouts override the class defaults."""
    s3 = S3FileSystem(
        config_kwargs={
            "connect_timeout": 60,
            "read_timeout": 120,
        },
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    assert s3.connect().meta.config.connect_timeout == 60
    assert s3.connect().meta.config.read_timeout == 120
def test_user_session_is_preserved():
    """A user-supplied aiobotocore session is kept, not replaced, on connect."""
    from aiobotocore.session import get_session
    session = get_session()
    s3 = S3FileSystem(session=session)
    s3.connect()
    assert s3.session == session
def test_idempotent_connect(s3):
    """connect(refresh=True) replaces both the client and the session."""
    stale_s3 = s3.s3
    stale_session = s3.session
    s3.connect(refresh=True)
    assert stale_s3 is not s3.s3
    assert stale_session is not s3.session
def test_multiple_objects(s3):
    """Two independent S3FileSystem instances see the same bucket listing."""
    s3.connect()
    s3.ls("test")
    s32 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri})
    assert s32.session
    assert s3.ls("test") == s32.ls("test")
def test_info(s3):
    """info() agrees with ls(detail=True) and is served from the dircache."""
    s3.touch(a)
    s3.touch(b)
    info = s3.info(a)
    linfo = s3.ls(a, detail=True)[0]
    # timestamps from the two code paths may differ by sub-second amounts
    assert abs(info.pop("LastModified") - linfo.pop("LastModified")).seconds < 1
    info.pop("VersionId")
    info.pop("ContentType")
    linfo.pop("Key")
    linfo.pop("Size")
    linfo.pop("ChecksumAlgorithm", None)  # field DNE in some S3-compatible providers
    assert info == linfo
    parent = a.rsplit("/", 1)[0]
    s3.invalidate_cache()  # remove full path from the cache
    s3.ls(parent)  # fill the cache with parent dir
    assert s3.info(a) == s3.dircache[parent][0]  # correct value
    assert id(s3.info(a)) == id(s3.dircache[parent][0])  # is object from cache
    assert id(s3.info(f"/{a}")) == id(s3.dircache[parent][0])  # is object from cache
    # an empty "directory" has no object behind it, so info/ls must fail
    new_parent = test_bucket_name + "/foo"
    s3.mkdir(new_parent)
    with pytest.raises(FileNotFoundError):
        s3.info(new_parent)
    with pytest.raises(FileNotFoundError):
        s3.ls(new_parent)
    with pytest.raises(FileNotFoundError):
        s3.info(new_parent)
def test_info_cached(s3):
    """info() results are identical for protocol-qualified and bare paths."""
    path = test_bucket_name + "/tmp/"
    fqpath = "s3://" + path
    s3.touch(path + "test")
    info = s3.info(fqpath)
    assert info == s3.info(fqpath)
    assert info == s3.info(path)
def test_checksum(s3):
    """checksum() honours refresh=, raises on missing keys, and handles multipart objects."""
    bucket = test_bucket_name
    d = "checksum"
    prefix = d + "/e"
    o1 = prefix + "1"
    o2 = prefix + "2"
    path1 = bucket + "/" + o1
    path2 = bucket + "/" + o2
    client = s3.s3
    # init client and files
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="")
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o2, Body="")
    # change one file, using cache
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="foo")
    checksum = s3.checksum(path1)
    s3.ls(path1)  # force caching
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    # refresh == False => checksum doesn't change
    assert checksum == s3.checksum(path1)
    # change one file, without cache
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="foo")
    checksum = s3.checksum(path1, refresh=True)
    s3.ls(path1)  # force caching
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    # refresh == True => checksum changes
    assert checksum != s3.checksum(path1, refresh=True)
    # Test for nonexistent file
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    s3.ls(path1)  # force caching
    sync(s3.loop, client.delete_object, Bucket=bucket, Key=o1)
    with pytest.raises(FileNotFoundError):
        s3.checksum(o1, refresh=True)
    # Test multipart upload
    upload_id = sync(
        s3.loop,
        client.create_multipart_upload,
        Bucket=bucket,
        Key=o1,
    )["UploadId"]
    etag1 = sync(
        s3.loop,
        client.upload_part,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        PartNumber=1,
        Body="0" * (5 * 1024 * 1024),
    )["ETag"]
    etag2 = sync(
        s3.loop,
        client.upload_part,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        PartNumber=2,
        Body="0",
    )["ETag"]
    sync(
        s3.loop,
        client.complete_multipart_upload,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        MultipartUpload={
            "Parts": [
                {"PartNumber": 1, "ETag": etag1},
                {"PartNumber": 2, "ETag": etag2},
            ]
        },
    )
    # must not raise on a multipart (composite-ETag) object
    s3.checksum(path1, refresh=True)
def test_multi_checksum(s3):
    """Uploads with ChecksumAlgorithm set succeed through the multipart path."""
    # Moto accepts the request to add checksum, and accepts the checksum mode,
    # but doesn't actually return the checksum
    # So, this is mostly a stub test
    file_key = "checksum"
    path = test_bucket_name + "/" + file_key
    s3 = S3FileSystem(
        anon=False,
        client_kwargs={"endpoint_url": endpoint_uri},
        s3_additional_kwargs={"ChecksumAlgorithm": "SHA256"},
    )
    with s3.open(
        path,
        "wb",
        blocksize=5 * 2**20,
    ) as f:
        f.write(b"0" * (5 * 2**20 + 1))  # starts multipart and puts first part
        f.write(b"data")  # any extra data
    assert s3.cat(path) == b"0" * (5 * 2**20 + 1) + b"data"
    FileHead = sync(
        s3.loop,
        s3.s3.head_object,
        Bucket=test_bucket_name,
        Key=file_key,
        ChecksumMode="ENABLED",
    )
    # assert "ChecksumSHA256" in FileHead
# Metadata payload reused by the xattr tests below.
test_xattr_sample_metadata = {"testxattr": "1"}
def test_xattr(s3):
    """get/set xattr round-trips metadata and preserves the ACL and ETag."""
    bucket, key = (test_bucket_name, "tmp/test/xattr")
    filename = bucket + "/" + key
    body = b"aaaa"
    public_read_acl = {
        "Permission": "READ",
        "Grantee": {
            "URI": "http://acs.amazonaws.com/groups/global/AllUsers",
            "Type": "Group",
        },
    }
    resp = sync(
        s3.loop,
        s3.s3.put_object,
        Bucket=bucket,
        Key=key,
        ACL="public-read",
        Metadata=test_xattr_sample_metadata,
        Body=body,
    )
    # save etag for later
    etag = s3.info(filename)["ETag"]
    assert (
        public_read_acl
        in sync(s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key)["Grants"]
    )
    assert s3.getxattr(filename, "testxattr") == test_xattr_sample_metadata["testxattr"]
    assert s3.metadata(filename) == {"testxattr": "1"}  # note _ became -
    s3file = s3.open(filename)
    assert s3file.getxattr("testxattr") == test_xattr_sample_metadata["testxattr"]
    assert s3file.metadata() == {"testxattr": "1"}  # note _ became -
    s3file.setxattr(testxattr="2")
    assert s3file.getxattr("testxattr") == "2"
    # setting a key to None removes it from the metadata
    s3file.setxattr(**{"testxattr": None})
    assert s3file.metadata() == {}
    assert s3.cat(filename) == body
    # check that ACL and ETag are preserved after updating metadata
    assert (
        public_read_acl
        in sync(s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key)["Grants"]
    )
    assert s3.info(filename)["ETag"] == etag
def test_xattr_setxattr_in_write_mode(s3):
    """setxattr is rejected for files opened in write mode."""
    s3file = s3.open(a, "wb")
    with pytest.raises(NotImplementedError):
        s3file.setxattr(test_xattr="1")
@pytest.mark.xfail()
def test_delegate(s3):
    """Delegated credentials can be exported and used to build a new filesystem."""
    out = s3.get_delegated_s3pars()
    assert out
    assert out["token"]
    s32 = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri}, **out)
    assert not s32.anon
    assert out == s32.get_delegated_s3pars()
def test_not_delegate():
    """Without STS delegation, only the anon flag is reported."""
    s3 = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri})
    out = s3.get_delegated_s3pars()
    assert out == {"anon": True}
    s3 = S3FileSystem(
        anon=False, client_kwargs={"endpoint_url": endpoint_uri}
    )  # auto credentials
    out = s3.get_delegated_s3pars()
    assert out == {"anon": False}
def test_ls(s3):
    """ls('') lists all buckets; missing paths raise FileNotFoundError."""
    assert set(s3.ls("", detail=False)) == {
        test_bucket_name,
        secure_bucket_name,
        versioned_bucket_name,
    }
    with pytest.raises(FileNotFoundError):
        s3.ls("nonexistent")
    fn = test_bucket_name + "/test/accounts.1.json"
    assert fn in s3.ls(test_bucket_name + "/test", detail=False)
def test_pickle(s3):
    """A filesystem survives repeated pickle round-trips."""
    import pickle
    s32 = pickle.loads(pickle.dumps(s3))
    assert s3.ls("test") == s32.ls("test")
    s33 = pickle.loads(pickle.dumps(s32))
    assert s3.ls("test") == s33.ls("test")
def test_ls_touch(s3):
    """touch() creates listable objects, with and without detail."""
    assert not s3.exists(test_bucket_name + "/tmp/test")
    s3.touch(a)
    s3.touch(b)
    L = s3.ls(test_bucket_name + "/tmp/test", True)
    assert {d["Key"] for d in L} == {a, b}
    L = s3.ls(test_bucket_name + "/tmp/test", False)
    assert set(L) == {a, b}
@pytest.mark.parametrize("version_aware", [True, False])
def test_exists_versioned(s3, version_aware):
    """Test to ensure that a prefix exists when using a versioned bucket"""
    import uuid
    n = 3
    s3 = S3FileSystem(
        anon=False,
        version_aware=version_aware,
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    # build a random n-level deep path inside the versioned bucket
    segments = [versioned_bucket_name] + [str(uuid.uuid4()) for _ in range(n)]
    path = "/".join(segments)
    # before touch, no intermediate prefix exists
    for i in range(2, n + 1):
        assert not s3.exists("/".join(segments[:i]))
    s3.touch(path)
    # after touch, every intermediate prefix exists
    for i in range(2, n + 1):
        assert s3.exists("/".join(segments[:i]))
def test_isfile(s3):
    """isfile() is true only for real objects, never buckets or prefixes."""
    assert not s3.isfile("")
    assert not s3.isfile("/")
    assert not s3.isfile(test_bucket_name)
    assert not s3.isfile(test_bucket_name + "/test")
    assert not s3.isfile(test_bucket_name + "/test/foo")
    assert s3.isfile(test_bucket_name + "/test/accounts.1.json")
    assert s3.isfile(test_bucket_name + "/test/accounts.2.json")
    assert not s3.isfile(a)
    s3.touch(a)
    assert s3.isfile(a)
    assert not s3.isfile(b)
    assert not s3.isfile(b + "/")
    # mkdir on a key path creates no object, so isfile stays False
    s3.mkdir(b)
    assert not s3.isfile(b)
    assert not s3.isfile(b + "/")
    assert not s3.isfile(c)
    assert not s3.isfile(c + "/")
    s3.mkdir(c + "/")
    assert not s3.isfile(c)
    assert not s3.isfile(c + "/")
def test_isdir(s3):
    """isdir() is true for root, buckets and non-empty prefixes only."""
    assert s3.isdir("")
    assert s3.isdir("/")
    assert s3.isdir(test_bucket_name)
    assert s3.isdir(test_bucket_name + "/test")
    assert not s3.isdir(test_bucket_name + "/test/foo")
    assert not s3.isdir(test_bucket_name + "/test/accounts.1.json")
    assert not s3.isdir(test_bucket_name + "/test/accounts.2.json")
    assert not s3.isdir(a)
    # a plain object is a file, not a directory
    s3.touch(a)
    assert not s3.isdir(a)
    assert not s3.isdir(b)
    assert not s3.isdir(b + "/")
    assert not s3.isdir(c)
    assert not s3.isdir(c + "/")
    # test cache
    s3.invalidate_cache()
    assert not s3.dircache
    s3.ls(test_bucket_name + "/nested")
    assert test_bucket_name + "/nested" in s3.dircache
    assert not s3.isdir(test_bucket_name + "/nested/file1")
    assert not s3.isdir(test_bucket_name + "/nested/file2")
    assert s3.isdir(test_bucket_name + "/nested/nested2")
    assert s3.isdir(test_bucket_name + "/nested/nested2/")
def test_rm(s3):
    """rm() removes objects, whole prefixes, and entire buckets recursively."""
    assert not s3.exists(a)
    s3.touch(a)
    assert s3.exists(a)
    s3.rm(a)
    assert not s3.exists(a)
    # the API is OK with deleting non-files; maybe this is an effect of using bulk
    # with pytest.raises(FileNotFoundError):
    #    s3.rm(test_bucket_name + '/nonexistent')
    with pytest.raises(FileNotFoundError):
        s3.rm("nonexistent")
    out = s3.rm(test_bucket_name + "/nested", recursive=True)
    assert test_bucket_name + "/nested/nested2/file1" in out
    assert not s3.exists(test_bucket_name + "/nested/nested2/file1")
    # whole bucket
    out = s3.rm(test_bucket_name, recursive=True)
    assert test_bucket_name + "/2014-01-01.csv" in out
    assert not s3.exists(test_bucket_name + "/2014-01-01.csv")
    assert not s3.exists(test_bucket_name)
def test_rmdir(s3):
    """rmdir() removes empty buckets and errors appropriately otherwise."""
    bucket = "test1_bucket"
    s3.mkdir(bucket)
    s3.rmdir(bucket)
    assert bucket not in s3.ls("/")
    # Issue 689, s3fs rmdir command returns error when given a valid s3 path.
    dir = test_bucket_name + "/dir"
    assert not s3.exists(dir)
    with pytest.raises(FileNotFoundError):
        s3.rmdir(dir)
    s3.touch(dir + "/file")
    assert s3.exists(dir)
    assert s3.exists(dir + "/file")
    with pytest.raises(FileExistsError):
        s3.rmdir(dir)
    with pytest.raises(OSError):
        s3.rmdir(test_bucket_name)
def test_mkdir(s3):
    """mkdir() on a bucket name creates the bucket."""
    bucket = "test1_bucket"
    s3.mkdir(bucket)
    assert bucket in s3.ls("/")
def test_mkdir_existing_bucket(s3):
    """A second mkdir() on the same bucket raises FileExistsError."""
    # mkdir called on existing bucket should be no-op and not calling create_bucket
    # creating a s3 bucket
    bucket = "test1_bucket"
    s3.mkdir(bucket)
    assert bucket in s3.ls("/")
    # a second call.
    with pytest.raises(FileExistsError):
        s3.mkdir(bucket)
def test_mkdir_bucket_and_key_1(s3):
    """mkdir with create_parents=True creates the missing bucket."""
    bucket = "test1_bucket"
    file = bucket + "/a/b/c"
    s3.mkdir(file, create_parents=True)
    assert bucket in s3.ls("/")
def test_mkdir_bucket_and_key_2(s3):
    """mkdir with create_parents=False fails when the bucket is missing."""
    bucket = "test1_bucket"
    file = bucket + "/a/b/c"
    with pytest.raises(FileNotFoundError):
        s3.mkdir(file, create_parents=False)
    assert bucket not in s3.ls("/")
def test_mkdir_region_name(s3):
    """mkdir accepts a region_name keyword for bucket creation."""
    bucket = "test2_bucket"
    s3.mkdir(bucket, region_name="eu-central-1")
    assert bucket in s3.ls("/")
def test_mkdir_client_region_name(s3):
    """A region set via client_kwargs is used for bucket creation."""
    bucket = "test3_bucket"
    s3 = S3FileSystem(
        anon=False,
        client_kwargs={"region_name": "eu-central-1", "endpoint_url": endpoint_uri},
    )
    s3.mkdir(bucket)
    assert bucket in s3.ls("/")
def test_makedirs(s3):
    """makedirs() creates the bucket implied by a deep path."""
    bucket = "test_makedirs_bucket"
    test_file = bucket + "/a/b/c/file"
    s3.makedirs(test_file)
    assert bucket in s3.ls("/")
def test_makedirs_existing_bucket(s3):
    """makedirs() under an existing bucket is a silent no-op."""
    bucket = "test_makedirs_bucket"
    s3.mkdir(bucket)
    assert bucket in s3.ls("/")
    test_file = bucket + "/a/b/c/file"
    # no-op, and no error.
    s3.makedirs(test_file)
def test_makedirs_pure_bucket_exist_ok(s3):
    """makedirs(exist_ok=True) tolerates an already-existing bucket."""
    bucket = "test1_bucket"
    s3.mkdir(bucket)
    s3.makedirs(bucket, exist_ok=True)
def test_makedirs_pure_bucket_error_on_exist(s3):
    """makedirs(exist_ok=False) raises for an already-existing bucket."""
    bucket = "test1_bucket"
    s3.mkdir(bucket)
    with pytest.raises(FileExistsError):
        s3.makedirs(bucket, exist_ok=False)
def test_bulk_delete(s3):
    """rm() with a list of paths deletes them all in one bulk call."""
    with pytest.raises(FileNotFoundError):
        s3.rm(["nonexistent/file"])
    filelist = s3.find(test_bucket_name + "/nested")
    s3.rm(filelist)
    assert not s3.exists(test_bucket_name + "/nested/nested2/file1")
@pytest.mark.xfail(reason="anon user is still privileged on moto")
def test_anonymous_access(s3):
    """Anonymous access can list but must not be allowed to create buckets."""
    with ignoring(NoCredentialsError):
        s3 = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri})
        assert s3.ls("") == []
        # TODO: public bucket doesn't work through moto
    with pytest.raises(PermissionError):
        s3.mkdir("newbucket")
def test_s3_file_access(s3):
    """cat/head/tail return the expected byte slices of an object."""
    fn = test_bucket_name + "/nested/file1"
    data = b"hello\n"
    assert s3.cat(fn) == data
    assert s3.head(fn, 3) == data[:3]
    assert s3.tail(fn, 3) == data[-3:]
    assert s3.tail(fn, 10000) == data  # tail longer than file returns all
def test_s3_file_info(s3):
    """find/exists/info agree about a known object and a missing one."""
    fn = test_bucket_name + "/nested/file1"
    data = b"hello\n"
    assert fn in s3.find(test_bucket_name)
    assert s3.exists(fn)
    assert not s3.exists(fn + "another")
    assert s3.info(fn)["Size"] == len(data)
    with pytest.raises(FileNotFoundError):
        s3.info(fn + "another")
def test_content_type_is_set(s3, tmpdir):
    """ContentType is inferred from the file extension on upload."""
    test_file = str(tmpdir) + "/test.json"
    destination = test_bucket_name + "/test.json"
    # use a context manager so the handle is closed before the upload
    with open(test_file, "w") as f:
        f.write("text")
    s3.put(test_file, destination)
    assert s3.info(destination)["ContentType"] == "application/json"
def test_content_type_is_not_overrided(s3, tmpdir):
    """An explicit ContentType kwarg wins over extension-based inference."""
    test_file = os.path.join(str(tmpdir), "test.json")
    destination = os.path.join(test_bucket_name, "test.json")
    with open(test_file, "w") as f:
        f.write("text")
    s3.put(test_file, destination, ContentType="text/css")
    assert s3.info(destination)["ContentType"] == "text/css"
def test_bucket_exists(s3):
    """exists() works on bucket names, authenticated and anonymous."""
    assert s3.exists(test_bucket_name)
    assert not s3.exists(test_bucket_name + "x")
    s3 = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri})
    assert s3.exists(test_bucket_name)
    assert not s3.exists(test_bucket_name + "x")
def test_du(s3):
    """du() reports per-object and total sizes, and 0 for missing paths."""
    d = s3.du(test_bucket_name, total=False)
    assert all(isinstance(v, int) and v >= 0 for v in d.values())
    assert test_bucket_name + "/nested/file1" in d
    assert s3.du(test_bucket_name + "/test/", total=True) == sum(
        map(len, files.values())
    )
    assert s3.du(test_bucket_name) == s3.du("s3://" + test_bucket_name)
    # Issue 450, s3.du of non-existent directory
    dir = test_bucket_name + "/does-not-exist"
    assert not s3.exists(dir)
    assert s3.du(dir) == 0
    assert s3.du(dir + "/") == 0
def test_s3_ls(s3):
    """ls() is non-recursive and treats trailing slashes and s3:// alike."""
    fn = test_bucket_name + "/nested/file1"
    assert fn not in s3.ls(test_bucket_name + "/")
    assert fn in s3.ls(test_bucket_name + "/nested/")
    assert fn in s3.ls(test_bucket_name + "/nested")
    assert s3.ls("s3://" + test_bucket_name + "/nested/") == s3.ls(
        test_bucket_name + "/nested"
    )
def test_s3_big_ls(s3):
    """find/rm handle prefixes with more than one listing page (1000+ keys)."""
    for x in range(1200):
        s3.touch(test_bucket_name + "/thousand/%i.part" % x)
    assert len(s3.find(test_bucket_name)) > 1200
    s3.rm(test_bucket_name + "/thousand/", recursive=True)
    assert len(s3.find(test_bucket_name + "/thousand/")) == 0
def test_s3_ls_detail(s3):
    """ls(detail=True) yields dict entries."""
    L = s3.ls(test_bucket_name + "/nested", detail=True)
    assert all(isinstance(item, dict) for item in L)
def test_s3_glob(s3):
    """glob() matches wildcards per level and treats '.' literally."""
    fn = test_bucket_name + "/nested/file1"
    assert fn not in s3.glob(test_bucket_name + "/")
    assert fn not in s3.glob(test_bucket_name + "/*")
    assert fn not in s3.glob(test_bucket_name + "/nested")
    assert fn in s3.glob(test_bucket_name + "/nested/*")
    assert fn in s3.glob(test_bucket_name + "/nested/file*")
    assert fn in s3.glob(test_bucket_name + "/*/*")
    # every glob result is a real object or a prefix of real objects
    assert all(
        any(p.startswith(f + "/") or p == f for p in s3.find(test_bucket_name))
        for f in s3.glob(test_bucket_name + "/nested/*")
    )
    assert [test_bucket_name + "/nested/nested2"] == s3.glob(
        test_bucket_name + "/nested/nested2"
    )
    out = s3.glob(test_bucket_name + "/nested/nested2/*")
    assert {"test/nested/nested2/file1", "test/nested/nested2/file2"} == set(out)
    # globbing across buckets is not allowed
    with pytest.raises(ValueError):
        s3.glob("*")
    # Make sure glob() deals with the dot character (.) correctly.
    assert test_bucket_name + "/file.dat" in s3.glob(test_bucket_name + "/file.*")
    assert test_bucket_name + "/filexdat" not in s3.glob(test_bucket_name + "/file.*")
def test_get_list_of_summary_objects(s3):
    """ls() under the fixture prefix returns exactly the seeded keys."""
    L = s3.ls(test_bucket_name + "/test")
    assert len(L) == 2
    # str.removeprefix strips the literal leading "test/"; the original
    # lstrip(test_bucket_name) stripped a *character set* and only worked
    # here by accident.
    assert [l.removeprefix(test_bucket_name + "/") for l in sorted(L)] == sorted(
        list(files)
    )
    L2 = s3.ls("s3://" + test_bucket_name + "/test")
    assert L == L2
def test_read_keys_from_bucket(s3):
    """cat() returns the uploaded bytes for plain and s3:// paths."""
    for k, data in files.items():
        file_contents = s3.cat("/".join([test_bucket_name, k]))
        assert file_contents == data
        assert s3.cat("/".join([test_bucket_name, k])) == s3.cat(
            "s3://" + "/".join([test_bucket_name, k])
        )
def test_url(s3):
    """url() produces a presigned HTTP URL with the requested expiry."""
    fn = test_bucket_name + "/nested/file1"
    url = s3.url(fn, expires=100)
    assert "http" in url
    import urllib.parse
    components = urllib.parse.urlparse(url)
    query = urllib.parse.parse_qs(components.query)
    exp = int(query["Expires"][0])
    # expiry timestamp should be ~100s in the future (allowing clock skew)
    delta = abs(exp - time.time() - 100)
    assert delta < 5
    with s3.open(fn) as f:
        assert "http" in f.url()
def test_seek(s3):
    """seek() honours all three whence modes and rejects invalid targets."""
    with s3.open(a, "wb") as f:
        f.write(b"123")
    with s3.open(a) as f:
        assert f.read() == b"123"
    with s3.open(a) as f:
        # seeking beyond EOF is allowed; negative/invalid seeks are not
        f.seek(1000)
        with pytest.raises(ValueError):
            f.seek(-1)
        with pytest.raises(ValueError):
            f.seek(-5, 2)
        with pytest.raises(ValueError):
            f.seek(0, 10)
        f.seek(0)
        assert f.read(1) == b"1"
        f.seek(0)
        assert f.read(1) == b"1"
        f.seek(3)
        assert f.read(1) == b""
        f.seek(-1, 2)
        assert f.read(1) == b"3"
        f.seek(-1, 1)
        f.seek(-1, 1)
        assert f.read(1) == b"2"
        for i in range(4):
            assert f.seek(i) == i
def test_bad_open(s3):
    """Opening an empty path raises ValueError."""
    with pytest.raises(ValueError):
        s3.open("")
def test_copy(s3):
    """copy() duplicates an object's contents."""
    fn = test_bucket_name + "/test/accounts.1.json"
    s3.copy(fn, fn + "2")
    assert s3.cat(fn) == s3.cat(fn + "2")
def test_copy_managed(s3):
    """_copy_managed does a multipart copy and validates its block size."""
    data = b"abc" * 12 * 2**20
    fn = test_bucket_name + "/test/biggerfile"
    with s3.open(fn, "wb") as f:
        f.write(data)
    sync(s3.loop, s3._copy_managed, fn, fn + "2", size=len(data), block=5 * 2**20)
    assert s3.cat(fn) == s3.cat(fn + "2")
    # block sizes outside the permitted multipart range are rejected
    with pytest.raises(ValueError):
        sync(s3.loop, s3._copy_managed, fn, fn + "3", size=len(data), block=4 * 2**20)
    with pytest.raises(ValueError):
        sync(s3.loop, s3._copy_managed, fn, fn + "3", size=len(data), block=6 * 2**30)
@pytest.mark.parametrize("recursive", [True, False])
def test_move(s3, recursive):
    """mv() copies the data to the target and removes the source."""
    fn = test_bucket_name + "/test/accounts.1.json"
    data = s3.cat(fn)
    s3.mv(fn, fn + "2", recursive=recursive)
    assert s3.cat(fn + "2") == data
    assert not s3.exists(fn)
def test_get_put(s3, tmpdir):
    """Round-trip a small file through get() and put()."""
    test_file = str(tmpdir.join("test.json"))
    s3.get(test_bucket_name + "/test/accounts.1.json", test_file)
    data = files["test/accounts.1.json"]
    # context managers close the local handles (the originals leaked them)
    with open(test_file, "rb") as f:
        assert f.read() == data
    s3.put(test_file, test_bucket_name + "/temp")
    assert s3.du(test_bucket_name + "/temp", total=False)[
        test_bucket_name + "/temp"
    ] == len(data)
    assert s3.cat(test_bucket_name + "/temp") == data
def test_get_put_big(s3, tmpdir):
    """Round-trip a multi-megabyte file through put() and get()."""
    test_file = str(tmpdir.join("test"))
    data = b"1234567890A" * 2**20
    with open(test_file, "wb") as f:
        f.write(data)
    s3.put(test_file, test_bucket_name + "/bigfile")
    test_file = str(tmpdir.join("test2"))
    s3.get(test_bucket_name + "/bigfile", test_file)
    with open(test_file, "rb") as f:
        assert f.read() == data
def test_get_put_with_callback(s3, tmpdir):
    """A branching callback sees one unit of work for a single-file transfer."""
    test_file = str(tmpdir.join("test.json"))
    class BranchingCallback(Callback):
        def branch(self, path_1, path_2, kwargs):
            # each child transfer gets its own callback instance
            kwargs["callback"] = BranchingCallback()
    cb = BranchingCallback()
    s3.get(test_bucket_name + "/test/accounts.1.json", test_file, callback=cb)
    assert cb.size == 1
    assert cb.value == 1
    cb = BranchingCallback()
    s3.put(test_file, test_bucket_name + "/temp", callback=cb)
    assert cb.size == 1
    assert cb.value == 1
def test_get_file_with_callback(s3, tmpdir):
    """get_file() reports byte-level progress through the callback."""
    test_file = str(tmpdir.join("test.json"))
    cb = Callback()
    s3.get_file(test_bucket_name + "/test/accounts.1.json", test_file, callback=cb)
    assert cb.size == os.stat(test_file).st_size
    assert cb.value == cb.size
def test_get_file_with_kwargs(s3, tmpdir):
    """get_file() tolerates unknown keyword arguments."""
    test_file = str(tmpdir.join("test.json"))
    # deliberately bogus kwargs (note the misspelled "max_concurency");
    # presumably this checks they are ignored rather than raising - verify
    get_file_kwargs = {"max_concurency": 1, "random_kwarg": "value"}
    s3.get_file(
        test_bucket_name + "/test/accounts.1.json", test_file, **get_file_kwargs
    )
@pytest.mark.parametrize("size", [2**10, 10 * 2**20])
def test_put_file_with_callback(s3, tmpdir, size):
    """put_file() reports byte-level progress via the callback."""
    test_file = str(tmpdir.join("test.json"))
    with open(test_file, "wb") as f:
        f.write(b"1234567890A" * size)

    cb = Callback()
    s3.put_file(test_file, test_bucket_name + "/temp", callback=cb)
    assert cb.size == os.stat(test_file).st_size
    assert cb.value == cb.size
    assert s3.size(test_bucket_name + "/temp") == 11 * size


@pytest.mark.parametrize("factor", [1, 5, 6])
def test_put_file_does_not_truncate(s3, tmpdir, factor):
    """Concurrent multipart put_file() uploads the whole file."""
    test_file = str(tmpdir.join("test.json"))
    chunksize = 5 * 2**20
    block = b"x" * chunksize
    with open(test_file, "wb") as f:
        f.write(block * factor)
    s3.put_file(
        test_file, test_bucket_name + "/temp", max_concurrency=5, chunksize=chunksize
    )
    assert s3.size(test_bucket_name + "/temp") == factor * chunksize
@pytest.mark.parametrize("size", [2**10, 2**20, 10 * 2**20])
def test_pipe_cat_big(s3, size):
    """pipe() followed by cat() round-trips payloads of various sizes."""
    target = test_bucket_name + "/bigfile"
    payload = b"1234567890A" * size
    s3.pipe(target, payload)
    assert s3.cat(target) == payload
def test_errors(s3):
    """Invalid operations raise the expected exception types."""
    with pytest.raises(FileNotFoundError):
        s3.open(test_bucket_name + "/tmp/test/shfoshf", "rb")

    # This is fine, no need for interleaving directories on S3
    # with pytest.raises((IOError, OSError)):
    #     s3.touch('tmp/test/shfoshf/x')

    # Deleting nonexistent or zero paths is allowed for now
    # with pytest.raises(FileNotFoundError):
    #     s3.rm(test_bucket_name + '/tmp/test/shfoshf/x')

    with pytest.raises(FileNotFoundError):
        s3.mv(test_bucket_name + "/tmp/test/shfoshf/x", "tmp/test/shfoshf/y")

    # Path without a bucket component is invalid.
    with pytest.raises(ValueError):
        s3.open("x", "rb")

    with pytest.raises(FileNotFoundError):
        s3.rm("unknown")

    # Reading from a write-mode file is invalid.
    with pytest.raises(ValueError):
        with s3.open(test_bucket_name + "/temp", "wb") as f:
            f.read()

    # Reading from a closed file is invalid.
    with pytest.raises(ValueError):
        f = s3.open(test_bucket_name + "/temp", "rb")
        f.close()
        f.read()

    with pytest.raises(ValueError):
        s3.mkdir("/")

    with pytest.raises(ValueError):
        s3.find("")

    with pytest.raises(ValueError):
        s3.find("s3://")


def test_errors_cause_preservings(monkeypatch, s3):
    """Translated exceptions keep (or intentionally drop) __cause__."""
    # We translate the error, and preserve the original one
    with pytest.raises(FileNotFoundError) as exc:
        s3.rm("unknown")

    assert type(exc.value.__cause__).__name__ == "NoSuchBucket"

    async def head_object(*args, **kwargs):
        raise NoCredentialsError

    monkeypatch.setattr(type(s3.s3), "head_object", head_object)

    # Since the error is not translate, the __cause__ would
    # be None
    with pytest.raises(NoCredentialsError) as exc:
        s3.info("test/a.txt")

    assert exc.value.__cause__ is None
def test_read_small(s3):
    """A tiny block_size forces many reads and exercises cache eviction."""
    fn = test_bucket_name + "/2014-01-01.csv"
    with s3.open(fn, "rb", block_size=10, cache_type="bytes") as f:
        out = []
        while True:
            data = f.read(3)
            if data == b"":
                break
            out.append(data)
        assert s3.cat(fn) == b"".join(out)
        # cache drop
        assert len(f.cache) < len(out)


def test_read_s3_block(s3):
    """read_block() honours delimiters and clips ranges at EOF."""
    data = files["test/accounts.1.json"]
    lines = io.BytesIO(data).readlines()
    path = test_bucket_name + "/test/accounts.1.json"
    assert s3.read_block(path, 1, 35, b"\n") == lines[1]
    assert s3.read_block(path, 0, 30, b"\n") == lines[0]
    assert s3.read_block(path, 0, 35, b"\n") == lines[0] + lines[1]
    assert s3.read_block(path, 0, 5000, b"\n") == data
    assert len(s3.read_block(path, 0, 5)) == 5
    assert len(s3.read_block(path, 4, 5000)) == len(data) - 4
    # Range entirely past EOF yields empty bytes.
    assert s3.read_block(path, 5000, 5010) == b""
    # length=None means "to end of file".
    assert s3.read_block(path, 5, None) == s3.read_block(path, 5, 1000)
def test_new_bucket(s3):
    """mkdir/rmdir at the top level create and remove buckets."""
    assert not s3.exists("new")
    s3.mkdir("new")
    assert s3.exists("new")

    with s3.open("new/temp", "wb") as f:
        f.write(b"hello")
    # A non-empty bucket cannot be removed.
    with pytest.raises(OSError):
        s3.rmdir("new")

    s3.rm("new/temp")
    s3.rmdir("new")
    assert "new" not in s3.ls("")
    assert not s3.exists("new")
    with pytest.raises(FileNotFoundError):
        s3.ls("new")


def test_new_bucket_auto(s3):
    """create_parents controls implicit bucket creation."""
    assert not s3.exists("new")
    with pytest.raises(Exception):
        s3.mkdir("new/other", create_parents=False)
    s3.mkdir("new/other", create_parents=True)
    assert s3.exists("new")
    s3.touch("new/afile")
    # A bucket with contents refuses non-recursive removal.
    with pytest.raises(Exception):
        s3.rm("new")
    with pytest.raises(Exception):
        s3.rmdir("new")
    s3.rm("new", recursive=True)
    assert not s3.exists("new")


def test_dynamic_add_rm(s3):
    """Keys appear and disappear as they are created and removed."""
    s3.mkdir("one")
    s3.mkdir("one/two")
    assert s3.exists("one")
    s3.ls("one")
    s3.touch("one/two/file_a")
    assert s3.exists("one/two/file_a")
    s3.rm("one", recursive=True)
    assert not s3.exists("one")
def test_write_small(s3):
    """Small writes below the multipart threshold round-trip correctly."""
    with s3.open(test_bucket_name + "/test", "wb") as f:
        f.write(b"hello")
    assert s3.cat(test_bucket_name + "/test") == b"hello"
    # Opening for write and closing immediately truncates to empty.
    s3.open(test_bucket_name + "/test", "wb").close()
    assert s3.info(test_bucket_name + "/test")["size"] == 0


def test_write_small_with_acl(s3):
    """The acl= parameter to open() is applied to the written object."""
    bucket, key = (test_bucket_name, "test-acl")
    filename = bucket + "/" + key
    body = b"hello"
    # Expected grant entry for a canned "public-read" ACL.
    public_read_acl = {
        "Permission": "READ",
        "Grantee": {
            "URI": "http://acs.amazonaws.com/groups/global/AllUsers",
            "Type": "Group",
        },
    }

    with s3.open(filename, "wb", acl="public-read") as f:
        f.write(body)

    assert s3.cat(filename) == body
    assert (
        public_read_acl
        in sync(s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key)["Grants"]
    )
def test_write_large(s3):
    "flush() chunks buffer when processing large singular payload"
    mb = 2**20
    payload_size = int(2.5 * 5 * mb)
    payload = b"0" * payload_size

    with s3.open(test_bucket_name + "/test", "wb") as fd:
        fd.write(payload)

    assert s3.cat(test_bucket_name + "/test") == payload
    assert s3.info(test_bucket_name + "/test")["size"] == payload_size


def test_write_limit(s3):
    "flush() respects part_max when processing large singular payload"
    mb = 2**20
    block_size = 15 * mb
    payload_size = 44 * mb
    payload = b"0" * payload_size

    with s3.open(test_bucket_name + "/test", "wb", blocksize=block_size) as fd:
        fd.write(payload)

    assert s3.cat(test_bucket_name + "/test") == payload
    assert s3.info(test_bucket_name + "/test")["size"] == payload_size


def test_write_small_secure(s3):
    """Server-side-encryption kwargs are accepted for single-part writes."""
    # Build our own s3fs with the relevant additional kwarg.
    s3 = S3FileSystem(
        s3_additional_kwargs={"ServerSideEncryption": "aws:kms"},
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    s3.mkdir("mybucket")
    with s3.open("mybucket/test", "wb") as f:
        f.write(b"hello")
    assert s3.cat("mybucket/test") == b"hello"
    sync(s3.loop, s3.s3.head_object, Bucket="mybucket", Key="test")


def test_write_large_secure(s3):
    """Server-side-encryption kwargs are accepted for multipart writes."""
    # build our own s3fs with the relevant additional kwarg
    s3 = S3FileSystem(
        s3_additional_kwargs={"ServerSideEncryption": "AES256"},
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    s3.mkdir("mybucket")

    with s3.open("mybucket/myfile", "wb") as f:
        f.write(b"hello hello" * 10**6)

    assert s3.cat("mybucket/myfile") == b"hello hello" * 10**6
def test_write_fails(s3):
    """Writing to read-only or closed files raises the expected errors."""
    with pytest.raises(ValueError):
        s3.touch(test_bucket_name + "/temp")
        s3.open(test_bucket_name + "/temp", "rb").write(b"hello")
    # block_size below the S3 multipart minimum is rejected.
    with pytest.raises(ValueError):
        s3.open(test_bucket_name + "/temp", "wb", block_size=10)
    f = s3.open(test_bucket_name + "/temp", "wb")
    f.close()
    with pytest.raises(ValueError):
        f.write(b"hello")
    with pytest.raises(FileNotFoundError):
        s3.open("nonexistentbucket/temp", "wb").close()


def test_write_blocks(s3):
    """Buffered writes only start a multipart upload past block_size."""
    with s3.open(test_bucket_name + "/temp", "wb", block_size=5 * 2**20) as f:
        f.write(b"a" * 2 * 2**20)
        assert f.buffer.tell() == 2 * 2**20
        assert not (f.parts)
        # flush() below block_size is a no-op: data stays buffered.
        f.flush()
        assert f.buffer.tell() == 2 * 2**20
        assert not (f.parts)
        f.write(b"a" * 2 * 2**20)
        f.write(b"a" * 2 * 2**20)
        # Crossing block_size initiates the multipart upload.
        assert f.mpu
        assert f.parts
    assert s3.info(test_bucket_name + "/temp")["size"] == 6 * 2**20
    with s3.open(test_bucket_name + "/temp2", "wb", block_size=10 * 2**20) as f:
        f.write(b"a" * 15 * 2**20)
        # Buffer was flushed to a part once it exceeded block_size.
        assert f.buffer.tell() == 0
    assert s3.info(test_bucket_name + "/temp2")["size"] == 15 * 2**20
def test_readline(s3):
    """readline() returns the first line for every sample file."""
    all_items = chain.from_iterable(
        [files.items(), csv_files.items(), text_files.items()]
    )
    for k, data in all_items:
        with s3.open("/".join([test_bucket_name, k]), "rb") as f:
            result = f.readline()
            # Files without a newline return the whole content, no trailing \n.
            expected = data.split(b"\n")[0] + (b"\n" if data.count(b"\n") else b"")
            assert result == expected


def test_readline_empty(s3):
    """readline() on an empty object returns b''."""
    data = b""
    with s3.open(a, "wb") as f:
        f.write(data)
    with s3.open(a, "rb") as f:
        result = f.readline()
        assert result == data


def test_readline_blocksize(s3):
    """readline() works when a single line spans many blocks."""
    data = b"ab\n" + b"a" * (10 * 2**20) + b"\nab"
    with s3.open(a, "wb") as f:
        f.write(data)
    with s3.open(a, "rb") as f:
        result = f.readline()
        expected = b"ab\n"
        assert result == expected

        result = f.readline()
        expected = b"a" * (10 * 2**20) + b"\n"
        assert result == expected

        result = f.readline()
        expected = b"ab"
        assert result == expected


def test_next(s3):
    """Open files support the iterator protocol via next()."""
    expected = csv_files["2014-01-01.csv"].split(b"\n")[0] + b"\n"
    with s3.open(test_bucket_name + "/2014-01-01.csv") as f:
        result = next(f)
        assert result == expected
def test_iterable(s3):
    """Iteration, readline(), readlines() and seek() agree with BytesIO."""
    data = b"abc\n123"
    with s3.open(a, "wb") as f:
        f.write(data)
    with s3.open(a) as f, io.BytesIO(data) as g:
        for froms3, fromio in zip(f, g):
            assert froms3 == fromio
        f.seek(0)
        assert f.readline() == b"abc\n"
        assert f.readline() == b"123"
        f.seek(1)
        assert f.readline() == b"bc\n"

    with s3.open(a) as f:
        out = list(f)
    with s3.open(a) as f:
        out2 = f.readlines()
    assert out == out2
    assert b"".join(out) == data


def test_readable(s3):
    """readable() reflects the open mode."""
    with s3.open(a, "wb") as f:
        assert not f.readable()

    with s3.open(a, "rb") as f:
        assert f.readable()


def test_seekable(s3):
    """seekable() reflects the open mode."""
    with s3.open(a, "wb") as f:
        assert not f.seekable()

    with s3.open(a, "rb") as f:
        assert f.seekable()


def test_writable(s3):
    """writable() reflects the open mode."""
    with s3.open(a, "wb") as f:
        assert f.writable()

    with s3.open(a, "rb") as f:
        assert not f.writable()
def test_merge(s3):
    """merge() concatenates existing objects into a new key."""
    with s3.open(a, "wb") as f:
        f.write(b"a" * 10 * 2**20)

    with s3.open(b, "wb") as f:
        f.write(b"a" * 10 * 2**20)

    s3.merge(test_bucket_name + "/joined", [a, b])
    assert s3.info(test_bucket_name + "/joined")["size"] == 2 * 10 * 2**20
def test_append(s3):
    """Append mode preserves existing content for small and large files,
    and preserves head metadata across appends."""
    data = text_files["nested/file1"]
    with s3.open(test_bucket_name + "/nested/file1", "ab") as f:
        assert f.tell() == len(data)  # append, no write, small file
    assert s3.cat(test_bucket_name + "/nested/file1") == data

    with s3.open(test_bucket_name + "/nested/file1", "ab") as f:
        f.write(b"extra")  # append, write, small file
    assert s3.cat(test_bucket_name + "/nested/file1") == data + b"extra"

    with s3.open(a, "wb") as f:
        f.write(b"a" * 10 * 2**20)
    with s3.open(a, "ab") as f:
        pass  # append, no write, big file
    data = s3.cat(a)
    assert len(data) == 10 * 2**20 and set(data) == set(b"a")

    with s3.open(a, "ab") as f:
        assert f.parts is None
        f._initiate_upload()
        assert f.parts
        assert f.tell() == 10 * 2**20
        f.write(b"extra")  # append, small write, big file
    data = s3.cat(a)
    assert len(data) == 10 * 2**20 + len(b"extra")
    assert data[-5:] == b"extra"

    with s3.open(a, "ab") as f:
        assert f.tell() == 10 * 2**20 + 5
        f.write(b"b" * 10 * 2**20)  # append, big write, big file
        assert f.tell() == 20 * 2**20 + 5
    data = s3.cat(a)
    assert len(data) == 10 * 2**20 + len(b"extra") + 10 * 2**20
    assert data[10 * 2**20 : 10 * 2**20 + 5] == b"extra"
    assert set(data[-10 * 2**20 :]) == set(b"b")

    # Keep Head Metadata
    head = dict(
        CacheControl="public",
        ContentDisposition="string",
        ContentEncoding="gzip",
        ContentLanguage="ru-RU",
        ContentType="text/csv",
        Expires=datetime.datetime(2015, 1, 1, 0, 0, tzinfo=tzutc()),
        Metadata={"string": "string"},
        ServerSideEncryption="AES256",
        StorageClass="REDUCED_REDUNDANCY",
        WebsiteRedirectLocation="https://www.example.com/",
    )
    with s3.open(a, "wb", **head) as f:
        f.write(b"data")

    with s3.open(a, "ab") as f:
        f.write(b"other")

    with s3.open(a) as f:
        filehead = {
            k: v
            for k, v in f._call_s3(
                "head_object", f.kwargs, Bucket=f.bucket, Key=f.key
            ).items()
            if k in head
        }
        assert filehead == head
def test_bigger_than_block_read(s3):
    """read() larger than block_size stitches together multiple blocks."""
    with s3.open(test_bucket_name + "/2014-01-01.csv", "rb", block_size=3) as f:
        out = []
        while True:
            data = f.read(20)
            out.append(data)
            if len(data) == 0:
                break
    assert b"".join(out) == csv_files["2014-01-01.csv"]


def test_current(s3):
    """current() returns the most recently instantiated filesystem."""
    s3._cache.clear()
    s3 = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri})
    assert s3.current() is s3
    assert S3FileSystem.current() is s3


def test_array(s3):
    """Writing an array('B') buffer stores its raw bytes."""
    from array import array

    data = array("B", [65] * 1000)

    with s3.open(a, "wb") as f:
        f.write(data)

    with s3.open(a, "rb") as f:
        out = f.read()
        assert out == b"A" * 1000


def _get_s3_id(s3):
    # Helper executed in a subprocess: identity of the underlying client.
    return id(s3.s3)
@pytest.mark.parametrize(
    "method",
    [
        "spawn",
        pytest.param(
            "forkserver",
            marks=pytest.mark.skipif(
                sys.platform.startswith("win"),
                reason="'forkserver' not available on windows",
            ),
        ),
    ],
)
def test_no_connection_sharing_among_processes(s3, method):
    """Each worker process must create its own S3 client object."""
    import multiprocessing as mp

    ctx = mp.get_context(method)
    executor = ProcessPoolExecutor(mp_context=ctx)
    conn_id = executor.submit(_get_s3_id, s3).result()
    assert id(s3.connect()) != conn_id, "Processes should not share S3 connections."


@pytest.mark.xfail()
def test_public_file(s3):
    # works on real s3, not on moto
    test_bucket_name = "s3fs_public_test"
    other_bucket_name = "s3fs_private_test"

    s3.touch(test_bucket_name)
    s3.touch(test_bucket_name + "/afile")
    s3.touch(other_bucket_name, acl="public-read")
    s3.touch(other_bucket_name + "/afile", acl="public-read")

    s = S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri})
    with pytest.raises(PermissionError):
        s.ls(test_bucket_name)
    s.ls(other_bucket_name)

    s3.chmod(test_bucket_name, acl="public-read")
    s3.chmod(other_bucket_name, acl="private")
    with pytest.raises(PermissionError):
        s.ls(other_bucket_name, refresh=True)
    assert s.ls(test_bucket_name, refresh=True)

    # public file in private bucket
    with s3.open(other_bucket_name + "/see_me", "wb", acl="public-read") as f:
        f.write(b"hello")
    assert s.cat(other_bucket_name + "/see_me") == b"hello"
def test_upload_with_s3fs_prefix(s3):
    """Paths may carry the s3:// protocol prefix for write and append."""
    path = "s3://test/prefix/key"

    with s3.open(path, "wb") as f:
        f.write(b"a" * (10 * 2**20))

    with s3.open(path, "ab") as f:
        f.write(b"b" * (10 * 2**20))


def test_multipart_upload_blocksize(s3):
    """Each full block becomes exactly one multipart part."""
    blocksize = 5 * (2**20)
    expected_parts = 3

    s3f = s3.open(a, "wb", block_size=blocksize)
    for _ in range(3):
        data = b"b" * blocksize
        s3f.write(data)

    # Ensure that the multipart upload consists of only 3 parts
    assert len(s3f.parts) == expected_parts
    s3f.close()


def test_default_pars(s3):
    """Instance-level defaults apply unless overridden per open()."""
    s3 = S3FileSystem(
        default_block_size=20,
        default_fill_cache=False,
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    fn = test_bucket_name + "/" + list(files)[0]
    with s3.open(fn) as f:
        assert f.blocksize == 20
        assert f.fill_cache is False
    with s3.open(fn, block_size=40, fill_cache=True) as f:
        assert f.blocksize == 40
        assert f.fill_cache is True
def test_tags(s3):
    """put_tags/get_tags round-trip, including merge mode."""
    tagset = {"tag1": "value1", "tag2": "value2"}
    fname = list(files)[0]
    s3.touch(fname)
    s3.put_tags(fname, tagset)
    assert s3.get_tags(fname) == tagset

    # Ensure merge mode updates value of existing key and adds new one
    new_tagset = {"tag2": "updatedvalue2", "tag3": "value3"}
    s3.put_tags(fname, new_tagset, mode="m")
    tagset.update(new_tagset)
    assert s3.get_tags(fname) == tagset


@pytest.mark.parametrize("prefix", ["", "/dir", "/dir/subdir"])
def test_versions(s3, prefix):
    """A version-aware filesystem can list and read specific versions."""
    parent = versioned_bucket_name + prefix
    versioned_file = parent + "/versioned_file"
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"1")
    # version_id is only populated once the upload has been committed.
    first_version = fo.version_id
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"2")
    second_version = fo.version_id
    assert s3.isfile(versioned_file)
    versions = s3.object_version_info(versioned_file)
    assert len(versions) == 2
    assert {version["VersionId"] for version in versions} == {
        first_version,
        second_version,
    }

    with s3.open(versioned_file) as fo:
        assert fo.version_id == second_version
        assert fo.read() == b"2"

    with s3.open(versioned_file, version_id=first_version) as fo:
        assert fo.version_id == first_version
        assert fo.read() == b"1"

    versioned_file_v1 = f"{versioned_file}?versionId={first_version}"
    versioned_file_v2 = f"{versioned_file}?versionId={second_version}"
    assert s3.ls(parent) == [versioned_file]
    assert set(s3.ls(parent, versions=True)) == {versioned_file_v1, versioned_file_v2}
    assert s3.exists(versioned_file_v1)
    assert s3.info(versioned_file_v1)
    assert s3.exists(versioned_file_v2)
    assert s3.info(versioned_file_v2)
def test_list_versions_many(s3):
    """Listing versions paginates past 1000 entries."""
    # moto doesn't actually behave in the same way that s3 does here so this doesn't test
    # anything really in moto 1.2
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )
    versioned_file = versioned_bucket_name + "/versioned_file2"
    for i in range(1200):
        with s3.open(versioned_file, "wb") as fo:
            fo.write(b"1")
    versions = s3.object_version_info(versioned_file)
    assert len(versions) == 1200


def test_fsspec_versions_multiple(s3):
    """Test that the standard fsspec.core.get_fs_token_paths behaves as expected for versionId urls"""
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )
    versioned_file = versioned_bucket_name + "/versioned_file3"
    version_lookup = {}
    for i in range(20):
        contents = str(i).encode()
        with s3.open(versioned_file, "wb") as fo:
            fo.write(contents)
        version_lookup[fo.version_id] = contents
    urls = [
        f"s3://{versioned_file}?versionId={version}"
        for version in version_lookup.keys()
    ]
    fs, token, paths = fsspec.core.get_fs_token_paths(
        urls, storage_options=dict(client_kwargs={"endpoint_url": endpoint_uri})
    )
    assert isinstance(fs, S3FileSystem)
    # versionId in any URL should force a version-aware filesystem.
    assert fs.version_aware
    for path in paths:
        with fs.open(path, "rb") as fo:
            contents = fo.read()
            assert contents == version_lookup[fo.version_id]
def test_versioned_file_fullpath(s3):
    """A "path?versionId=..." URL opens that specific version."""
    versioned_file = versioned_bucket_name + "/versioned_file_fullpath"
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"1")
    # moto doesn't correctly return a versionId for a multipart upload. So we resort to this.
    # version_id = fo.version_id
    versions = s3.object_version_info(versioned_file)
    version_ids = [version["VersionId"] for version in versions]
    version_id = version_ids[0]

    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"2")

    file_with_version = f"{versioned_file}?versionId={version_id}"

    with s3.open(file_with_version, "rb") as fo:
        assert fo.version_id == version_id
        assert fo.read() == b"1"

    versions = s3.object_version_info(versioned_file)
    version_ids = [version["VersionId"] for version in versions]
    assert set(s3.ls(versioned_bucket_name, versions=True)) == {
        f"{versioned_file}?versionId={vid}" for vid in version_ids
    }


def test_versions_unaware(s3):
    """Without version_aware, only the latest version is visible."""
    versioned_file = versioned_bucket_name + "/versioned_file3"
    s3 = S3FileSystem(
        anon=False, version_aware=False, client_kwargs={"endpoint_url": endpoint_uri}
    )
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"1")
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"2")

    with s3.open(versioned_file) as fo:
        assert fo.version_id is None
        assert fo.read() == b"2"

    # Requesting a version on a non-version-aware filesystem is an error.
    with pytest.raises(ValueError):
        with s3.open(versioned_file, version_id="0"):
            fo.read()


def test_versions_dircached(s3):
    """info() for versions agrees with the directory-listing cache."""
    versioned_file = versioned_bucket_name + "/dir/versioned_file"
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"1")
    first_version = fo.version_id
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"2")
    second_version = fo.version_id
    s3.find(versioned_bucket_name)
    cached = s3.dircache[versioned_bucket_name + "/dir"][0]

    # The cache holds the latest version.
    assert cached.get("VersionId") == second_version

    assert s3.info(versioned_file) == cached
    assert (
        s3.info(versioned_file, version_id=first_version).get("VersionId")
        == first_version
    )
    assert (
        s3.info(versioned_file, version_id=second_version).get("VersionId")
        == second_version
    )
def test_text_io__stream_wrapper_works(s3):
    """Ensure using TextIOWrapper works."""
    s3.mkdir("bucket")

    with s3.open("bucket/file.txt", "wb") as fd:
        fd.write("\u00af\\_(\u30c4)_/\u00af".encode("utf-16-le"))

    with s3.open("bucket/file.txt", "rb") as fd:
        with io.TextIOWrapper(fd, "utf-16-le") as stream:
            assert stream.readline() == "\u00af\\_(\u30c4)_/\u00af"


def test_text_io__basic(s3):
    """Text mode is now allowed."""
    s3.mkdir("bucket")

    with s3.open("bucket/file.txt", "w", encoding="utf-8") as fd:
        fd.write("\u00af\\_(\u30c4)_/\u00af")

    with s3.open("bucket/file.txt", "r", encoding="utf-8") as fd:
        assert fd.read() == "\u00af\\_(\u30c4)_/\u00af"


def test_text_io__override_encoding(s3):
    """Allow overriding the default text encoding."""
    s3.mkdir("bucket")

    with s3.open("bucket/file.txt", "w", encoding="ibm500") as fd:
        fd.write("Hello, World!")

    with s3.open("bucket/file.txt", "r", encoding="ibm500") as fd:
        assert fd.read() == "Hello, World!"


def test_readinto(s3):
    """readinto() fills a caller-supplied buffer and returns the count."""
    s3.mkdir("bucket")

    with s3.open("bucket/file.txt", "wb") as fd:
        fd.write(b"Hello, World!")

    contents = bytearray(15)

    with s3.open("bucket/file.txt", "rb") as fd:
        assert fd.readinto(contents) == 13

    assert contents.startswith(b"Hello, World!")
def test_change_defaults_only_subsequent():
    """Test for Issue #135

    Ensure that changing the default block size doesn't affect existing file
    systems that were created using that default. It should only affect file
    systems created after the change.
    """
    try:
        S3FileSystem.cachable = False  # don't reuse instances with same pars

        fs_default = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri})
        assert fs_default.default_block_size == 50 * (1024**2)

        fs_overridden = S3FileSystem(
            default_block_size=64 * (1024**2),
            client_kwargs={"endpoint_url": endpoint_uri},
        )
        assert fs_overridden.default_block_size == 64 * (1024**2)

        # Suppose I want all subsequent file systems to have a block size of 1 GiB
        # instead of 5 MiB:
        S3FileSystem.default_block_size = 1024**3

        fs_big = S3FileSystem(client_kwargs={"endpoint_url": endpoint_uri})
        assert fs_big.default_block_size == 1024**3

        # Test the other file systems created to see if their block sizes changed
        assert fs_overridden.default_block_size == 64 * (1024**2)
        assert fs_default.default_block_size == 50 * (1024**2)
    finally:
        # Restore class-level defaults so other tests are unaffected.
        S3FileSystem.default_block_size = 5 * (1024**2)
        S3FileSystem.cachable = True


def test_cache_after_copy(s3):
    """cp() must update the listings cache for the target."""
    # https://github.com/dask/dask/issues/5134
    s3.touch("test/afile")
    assert "test/afile" in s3.ls("s3://test", False)
    s3.cp("test/afile", "test/bfile")
    assert "test/bfile" in s3.ls("s3://test", False)
def test_autocommit(s3):
    """autocommit=False defers the write until commit(); discard() aborts."""
    auto_file = test_bucket_name + "/auto_file"
    committed_file = test_bucket_name + "/commit_file"
    aborted_file = test_bucket_name + "/aborted_file"
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )

    def write_and_flush(path, autocommit):
        # Returns the (closed) file object so the caller can commit/discard.
        with s3.open(path, "wb", autocommit=autocommit) as fo:
            fo.write(b"1")
        return fo

    # regular behavior
    fo = write_and_flush(auto_file, autocommit=True)
    assert fo.autocommit
    assert s3.exists(auto_file)

    fo = write_and_flush(committed_file, autocommit=False)
    assert not fo.autocommit
    assert not s3.exists(committed_file)
    fo.commit()
    assert s3.exists(committed_file)

    fo = write_and_flush(aborted_file, autocommit=False)
    assert not s3.exists(aborted_file)
    fo.discard()
    assert not s3.exists(aborted_file)
    # Cannot commit a file that was discarded
    with pytest.raises(Exception):
        fo.commit()


def test_autocommit_mpu(s3):
    """When not autocommitting we always want to use multipart uploads"""
    path = test_bucket_name + "/auto_commit_with_mpu"
    with s3.open(path, "wb", autocommit=False) as fo:
        fo.write(b"1")
    assert fo.mpu is not None
    assert len(fo.parts) == 1
def test_touch(s3):
    """touch() creates empty keys and honours the truncate flag."""
    # create
    fn = test_bucket_name + "/touched"
    assert not s3.exists(fn)
    s3.touch(fn)
    assert s3.exists(fn)
    assert s3.size(fn) == 0

    # truncates
    with s3.open(fn, "wb") as f:
        f.write(b"data")
    assert s3.size(fn) == 4
    s3.touch(fn, truncate=True)
    assert s3.size(fn) == 0

    # exists error
    with s3.open(fn, "wb") as f:
        f.write(b"data")
    assert s3.size(fn) == 4
    with pytest.raises(ValueError):
        s3.touch(fn, truncate=False)
    assert s3.size(fn) == 4


def test_touch_versions(s3):
    """Zero-byte writes create distinct versions in a versioned bucket."""
    versioned_file = versioned_bucket_name + "/versioned_file"
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"1")
    first_version = fo.version_id
    with s3.open(versioned_file, "wb") as fo:
        fo.write(b"")
    second_version = fo.version_id
    assert s3.isfile(versioned_file)
    versions = s3.object_version_info(versioned_file)
    assert len(versions) == 2
    assert {version["VersionId"] for version in versions} == {
        first_version,
        second_version,
    }

    with s3.open(versioned_file) as fo:
        assert fo.version_id == second_version
        assert fo.read() == b""

    with s3.open(versioned_file, version_id=first_version) as fo:
        assert fo.version_id == first_version
        assert fo.read() == b"1"
def test_cat_missing(s3):
    """cat() on_error modes: raise, omit the key, or return the exception."""
    fn0 = test_bucket_name + "/file0"
    fn1 = test_bucket_name + "/file1"
    s3.touch(fn0)
    with pytest.raises(FileNotFoundError):
        s3.cat([fn0, fn1], on_error="raise")
    out = s3.cat([fn0, fn1], on_error="omit")
    assert list(out) == [fn0]
    out = s3.cat([fn0, fn1], on_error="return")
    assert fn1 in out
    assert isinstance(out[fn1], FileNotFoundError)


def test_get_directories(s3, tmpdir):
    """Recursive get() maps trailing-slash semantics to local directories."""
    s3.touch(test_bucket_name + "/dir/dirkey/key0")
    s3.touch(test_bucket_name + "/dir/dirkey/key1")
    s3.touch(test_bucket_name + "/dir/dirkey")
    s3.touch(test_bucket_name + "/dir/dir/key")
    d = str(tmpdir)

    # Target directory with trailing slash
    s3.get(test_bucket_name + "/dir/", d, recursive=True)
    assert {"dirkey", "dir"} == set(os.listdir(d))
    assert ["key"] == os.listdir(os.path.join(d, "dir"))
    assert {"key0", "key1"} == set(os.listdir(os.path.join(d, "dirkey")))

    local_fs = fsspec.filesystem("file")
    local_fs.rm(os.path.join(d, "dir"), recursive=True)
    local_fs.rm(os.path.join(d, "dirkey"), recursive=True)

    # Target directory without trailing slash
    s3.get(test_bucket_name + "/dir", d, recursive=True)
    assert ["dir"] == os.listdir(d)
    assert {"dirkey", "dir"} == set(os.listdir(os.path.join(d, "dir")))
    assert {"key0", "key1"} == set(os.listdir(os.path.join(d, "dir", "dirkey")))
def test_seek_reads(s3):
    """Long-range seeks with a tiny blocksize return the requested lengths."""
    fn = test_bucket_name + "/myfile"
    with s3.open(fn, "wb") as f:
        f.write(b"a" * 175627146)
    with s3.open(fn, "rb", blocksize=100) as f:
        f.seek(175561610)
        d1 = f.read(65536)

        f.seek(4)
        size = 17562198
        d2 = f.read(size)
        assert len(d2) == size

        f.seek(17562288)
        size = 17562187
        d3 = f.read(size)
        assert len(d3) == size


def test_connect_many(s3):
    """Many threads can each construct and use their own filesystem."""
    from multiprocessing.pool import ThreadPool

    def task(i):
        S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri}).ls("")
        return True

    pool = ThreadPool(processes=20)
    out = pool.map(task, range(40))
    assert all(out)
    pool.close()
    pool.join()


def test_requester_pays(s3):
    """requester_pays=True adds RequestPayer to filesystem and file calls."""
    fn = test_bucket_name + "/myfile"
    s3 = S3FileSystem(requester_pays=True, client_kwargs={"endpoint_url": endpoint_uri})
    assert s3.req_kw["RequestPayer"] == "requester"
    s3.touch(fn)
    with s3.open(fn, "rb") as f:
        assert f.req_kw["RequestPayer"] == "requester"
def test_credentials():
    """key/secret kwargs and client_kwargs credentials combine correctly."""
    s3 = S3FileSystem(
        key="foo", secret="foo", client_kwargs={"endpoint_url": endpoint_uri}
    )
    assert s3.s3._request_signer._credentials.access_key == "foo"
    assert s3.s3._request_signer._credentials.secret_key == "foo"
    s3 = S3FileSystem(
        client_kwargs={
            "aws_access_key_id": "bar",
            "aws_secret_access_key": "bar",
            "endpoint_url": endpoint_uri,
        }
    )
    assert s3.s3._request_signer._credentials.access_key == "bar"
    assert s3.s3._request_signer._credentials.secret_key == "bar"
    # Mixed sources: key= and client_kwargs each supply one half.
    s3 = S3FileSystem(
        key="foo",
        client_kwargs={"aws_secret_access_key": "bar", "endpoint_url": endpoint_uri},
    )
    assert s3.s3._request_signer._credentials.access_key == "foo"
    assert s3.s3._request_signer._credentials.secret_key == "bar"
    s3 = S3FileSystem(
        key="foobar",
        secret="foobar",
        client_kwargs={
            "aws_access_key_id": "foobar",
            "aws_secret_access_key": "foobar",
            "endpoint_url": endpoint_uri,
        },
    )
    assert s3.s3._request_signer._credentials.access_key == "foobar"
    assert s3.s3._request_signer._credentials.secret_key == "foobar"
    with pytest.raises((TypeError, KeyError)):
        # should be TypeError: arg passed twice; but in moto can be KeyError
        S3FileSystem(
            key="foo",
            secret="foo",
            client_kwargs={
                "aws_access_key_id": "bar",
                "aws_secret_access_key": "bar",
                "endpoint_url": endpoint_uri,
            },
        ).s3


def test_modified(s3):
    """modified() works for files and raises for directories and buckets."""
    dir_path = test_bucket_name + "/modified"
    file_path = dir_path + "/file"

    # Test file
    s3.touch(file_path)
    modified = s3.modified(path=file_path)
    assert isinstance(modified, datetime.datetime)
    assert modified.tzinfo is not None

    # Test directory
    with pytest.raises(IsADirectoryError):
        modified = s3.modified(path=dir_path)

    # Test bucket
    with pytest.raises(IsADirectoryError):
        s3.modified(path=test_bucket_name)
def test_async_s3(s3):
    """Exercise the async API directly from a fresh event loop."""

    async def _():
        s3 = S3FileSystem(
            anon=False,
            asynchronous=True,
            loop=asyncio.get_running_loop(),
            client_kwargs={"region_name": "eu-central-1", "endpoint_url": endpoint_uri},
        )

        fn = test_bucket_name + "/nested/file1"
        data = b"hello\n"

        # Is good with or without connect()
        await s3._cat_file(fn)

        session = await s3.set_session()  # creates client

        assert await s3._cat_file(fn) == data
        assert await s3._cat_file(fn, start=0, end=3) == data[:3]

        # TODO: file IO is *not* async
        # with s3.open(fn, "rb") as f:
        #     assert f.read() == data

        try:
            await session.close()
        except AttributeError:
            # bug in aiobotocore 1.4.1
            await session._endpoint.http_session._session.close()

    asyncio.run(_())
def test_cat_ranges(s3):
    """cat_file() honours start/end offsets, including negative indices."""
    payload = b"a string to select from"
    path = test_bucket_name + "/parts"
    s3.pipe(path, payload)
    cases = [
        (dict(), payload),
        (dict(start=5), payload[5:]),
        (dict(end=5), payload[:5]),
        (dict(start=1, end=-1), payload[1:-1]),
        (dict(start=-5), payload[-5:]),
    ]
    for kwargs, expected in cases:
        assert s3.cat_file(path, **kwargs) == expected
def test_async_s3_old(s3):
    """The deprecated _connect() entry point still works."""

    async def _():
        s3 = S3FileSystem(
            anon=False,
            asynchronous=True,
            loop=asyncio.get_running_loop(),
            client_kwargs={"region_name": "eu-central-1", "endpoint_url": endpoint_uri},
        )

        fn = test_bucket_name + "/nested/file1"
        data = b"hello\n"

        # Check old API
        session = await s3._connect()
        assert await s3._cat_file(fn, start=0, end=3) == data[:3]
        try:
            await session.close()
        except AttributeError:
            # bug in aiobotocore 1.4.1
            await session._endpoint.http_session._session.close()

    asyncio.run(_())


def test_via_fsspec(s3):
    """s3:// URLs work through the generic fsspec.open entry point."""
    import fsspec

    s3.mkdir("mine")
    with fsspec.open(
        "s3://mine/oi", "wb", client_kwargs={"endpoint_url": endpoint_uri}
    ) as f:
        f.write(b"hello")
    with fsspec.open(
        "s3://mine/oi", "rb", client_kwargs={"endpoint_url": endpoint_uri}
    ) as f:
        assert f.read() == b"hello"


@pytest.mark.parametrize(
    ["raw_url", "expected_url", "expected_version_aware"],
    [
        (
            "s3://arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg",
            "arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg",
            False,
        ),
        (
            "s3://arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg?versionId=some_version_id",
            "arn:aws:s3:us-west-2:123456789012:accesspoint/abc/123.jpg?versionId=some_version_id",
            True,
        ),
        (
            "s3://xyz/abc/123.jpg",
            "xyz/abc/123.jpg",
            False,
        ),
        (
            "s3://xyz/abc/123.jpg?versionId=some_version_id",
            "xyz/abc/123.jpg?versionId=some_version_id",
            True,
        ),
    ],
)
def test_fsspec_url_to_fs_compatability(
    s3, raw_url, expected_url, expected_version_aware
):
    """url_to_fs strips the protocol and detects versionId query strings."""
    import fsspec

    fs, url = fsspec.url_to_fs(raw_url)
    assert isinstance(fs, type(s3))
    assert fs.version_aware is expected_version_aware
    assert url == expected_url
def test_repeat_exists(s3):
    """exists() is stable across repeated calls on the same key."""
    path = "s3://" + test_bucket_name + "/file1"
    s3.touch(path)
    for _ in range(2):
        assert s3.exists(path)
def test_with_xzarr(s3):
    """An xarray dataset can be written through a zarr mapper on S3."""
    da = pytest.importorskip("dask.array")
    xr = pytest.importorskip("xarray")
    name = "sample"

    nana = xr.DataArray(da.random.random((1024, 1024, 10, 9, 1)))

    s3_path = f"{test_bucket_name}/{name}"
    s3store = s3.get_mapper(s3_path)

    s3.ls("")
    nana.to_dataset().to_zarr(store=s3store, mode="w", consolidated=True, compute=True)
def test_async_close():
    """Concurrent failed downloads must not prevent closing the client."""

    async def _():
        # get_running_loop(): calling get_event_loop() inside a coroutine is
        # deprecated; this also matches the other async tests in this file.
        loop = asyncio.get_running_loop()
        s3 = S3FileSystem(anon=False, asynchronous=True, loop=loop)
        await s3._connect()
        fn = test_bucket_name + "/afile"

        async def async_wrapper():
            # Three simultaneous downloads to an unwritable local path.
            coros = [
                asyncio.ensure_future(s3._get_file(fn, "/nonexistent/a/b/c"), loop=loop)
                for _ in range(3)
            ]
            completed, pending = await asyncio.wait(coros)
            for future in completed:
                with pytest.raises(OSError):
                    future.result()

        await asyncio.gather(*[async_wrapper() for __ in range(2)])

        try:
            await s3._s3.close()
        except AttributeError:
            # bug in aiobotocore 1.4.1
            await s3._s3._endpoint.http_session._session.close()

    asyncio.run(_())
def test_put_single(s3, tmpdir):
    """``put`` of a local directory.

    With a trailing slash and no ``recursive`` the call is a no-op; with
    ``recursive=True`` the slash controls whether the directory name itself
    becomes part of the remote path.
    """
    fn = os.path.join(str(tmpdir), "dir")
    os.mkdir(fn)
    # use a context manager so the local file handle is closed deterministically
    # (the original bare open(...).write(...) leaked the handle)
    with open(os.path.join(fn, "abc"), "w") as f:
        f.write("text")
    # Put with trailing slash
    s3.put(fn + "/", test_bucket_name)  # no-op, no files
    assert not s3.exists(test_bucket_name + "/abc")
    assert not s3.exists(test_bucket_name + "/dir")
    s3.put(fn + "/", test_bucket_name, recursive=True)
    assert s3.cat(test_bucket_name + "/abc") == b"text"
    # Put without trailing slash
    s3.put(fn, test_bucket_name, recursive=True)
    assert s3.cat(test_bucket_name + "/dir/abc") == b"text"
def test_shallow_find(s3):
    """Test that find method respects maxdepth.

    Verify that the ``find`` method respects the ``maxdepth`` parameter. With
    ``maxdepth=1``, the results of ``find`` should be the same as those of
    ``ls``, without returning subdirectories. See also issue 378.
    """
    listing = s3.ls(test_bucket_name)
    found = s3.find(test_bucket_name, maxdepth=1, withdirs=True)
    assert found == sorted(listing + [test_bucket_name])
    assert s3.glob(test_bucket_name + "/*") == listing
def test_multi_find(s3):
    """Repeated ``find`` calls return identical, stable results (cache safety)."""
    s3.mkdir("bucket/test")
    s3.mkdir("bucket/test/sub")
    s3.write_text("bucket/test/file.txt", "some_text")
    s3.write_text("bucket/test/sub/file.txt", "some_text")
    # with directories included
    out1 = s3.find("bucket", withdirs=True)
    out2 = s3.find("bucket", withdirs=True)
    assert (
        out1
        == out2
        == [
            "bucket/test",
            "bucket/test/file.txt",
            "bucket/test/sub",
            "bucket/test/sub/file.txt",
        ]
    )
    # files only
    out1 = s3.find("bucket", withdirs=False)
    out2 = s3.find("bucket", withdirs=False)
    assert out1 == out2 == ["bucket/test/file.txt", "bucket/test/sub/file.txt"]
def test_version_sizes(s3):
    """Each object version must be read using its own size information.

    Protects against caching of incorrect version details: decompression
    fails if a stale size from another version is used.
    """
    # fresh version-aware instance; deliberately shadows the fixture
    s3 = S3FileSystem(
        anon=False, version_aware=True, client_kwargs={"endpoint_url": endpoint_uri}
    )
    import gzip

    path = f"s3://{versioned_bucket_name}/test.txt.gz"
    versions = [
        s3.pipe_file(path, gzip.compress(text))
        for text in (
            b"good morning!",
            b"hello!",
            b"hi!",
            b"hello!",
        )
    ]
    for version in versions:
        version_id = version["VersionId"]
        with s3.open(path, version_id=version_id) as f:
            with gzip.GzipFile(fileobj=f) as zfp:
                # read would raise if the cached size did not match this version
                zfp.read()
def test_find_no_side_effect(s3):
    """A maxdepth=None find must not corrupt cached maxdepth=1 results."""
    kwargs = dict(withdirs=True, detail=True)
    shallow_before = s3.find(test_bucket_name, maxdepth=1, **kwargs)
    s3.find(test_bucket_name, maxdepth=None, **kwargs)
    shallow_after = s3.find(test_bucket_name, maxdepth=1, **kwargs)
    assert shallow_before.keys() == shallow_after.keys()
def test_get_file_info_with_selector(s3):
    """``find(..., withdirs=True, detail=True)`` classifies files vs directories."""
    fs = s3
    base_dir = "selector-dir/"
    file_a = "selector-dir/test_file_a"
    file_b = "selector-dir/test_file_b"
    dir_a = "selector-dir/test_dir_a"
    file_c = "selector-dir/test_dir_a/test_file_c"
    try:
        fs.mkdir(base_dir)
        with fs.open(file_a, mode="wb"):
            pass
        with fs.open(file_b, mode="wb"):
            pass
        fs.mkdir(dir_a)
        with fs.open(file_c, mode="wb"):
            pass
        infos = fs.find(base_dir, maxdepth=None, withdirs=True, detail=True)
        assert len(infos) == 4  # includes base_dir directory
        for info in infos.values():
            if info["name"].endswith(file_a):
                assert info["type"] == "file"
            elif info["name"].endswith(file_b):
                assert info["type"] == "file"
            elif info["name"].endswith(file_c):
                assert info["type"] == "file"
            elif info["name"].rstrip("/").endswith(dir_a):
                assert info["type"] == "directory"
    finally:
        # always clean up the fixture bucket contents
        fs.rm(base_dir, recursive=True)
@pytest.mark.xfail(
    condition=version.parse(moto.__version__) <= version.parse("1.3.16"),
    reason="Moto 1.3.16 is not supporting pre-conditions.",
)
def test_raise_exception_when_file_has_changed_during_reading(s3):
    """Reading a file whose remote content changed after open must raise EBUSY."""
    test_file_name = "file1"
    test_file = "s3://" + test_bucket_name + "/" + test_file_name
    content1 = b"123"
    content2 = b"ABCDEFG"
    boto3_client = get_boto3_client()

    def create_file(content: bytes):
        # write directly via boto3, bypassing s3fs caches on purpose
        boto3_client.put_object(
            Bucket=test_bucket_name, Key=test_file_name, Body=content
        )

    # use the named constant rather than repeating the b"123" literal
    create_file(content1)
    with s3.open(test_file, "rb") as f:
        content = f.read()
        assert content == content1
    with s3.open(test_file, "rb") as f:
        # mutate the object behind the open file handle
        create_file(content2)
        with expect_errno(errno.EBUSY):
            f.read()
def test_s3fs_etag_preserving_multipart_copy(monkeypatch, s3):
    """``copy(..., preserve_etag=True)`` reproduces the source's multipart ETag."""
    # Set this to a lower value so that we can actually
    # test this without creating giant objects in memory
    monkeypatch.setattr(s3fs.core, "MANAGED_COPY_THRESHOLD", 5 * 2**20)
    test_file1 = test_bucket_name + "/test/multipart-upload.txt"
    test_file2 = test_bucket_name + "/test/multipart-upload-copy.txt"
    # write five parts of slightly-varying size to force a multipart upload
    with s3.open(test_file1, "wb", block_size=5 * 2**21) as stream:
        for _ in range(5):
            stream.write(b"b" * (stream.blocksize + random.randrange(200)))
    file_1 = s3.info(test_file1)
    s3.copy(test_file1, test_file2)
    file_2 = s3.info(test_file2)
    s3.rm(test_file2)
    # normal copy() uses a block size of 5GB
    assert file_1["ETag"] != file_2["ETag"]
    s3.copy(test_file1, test_file2, preserve_etag=True)
    file_2 = s3.info(test_file2)
    s3.rm(test_file2)
    # etag preserving copy() determines each part size for the destination
    # by checking out the matching part's size on the source
    assert file_1["ETag"] == file_2["ETag"]
    s3.rm(test_file1)
def test_sync_from_wihin_async(s3):
    """Synchronous S3FileSystem use from inside a running event loop.

    If treating as sync but within an event loop (e.g. calling from Jupyter),
    IO happens on a dedicated thread.  NOTE(review): the typo in the test
    name ("wihin") is kept, since renaming would change the pytest test id.
    """
    async def f():
        S3FileSystem.clear_instance_cache()
        s3 = S3FileSystem(anon=False, client_kwargs={"endpoint_url": endpoint_uri})
        assert s3.ls(test_bucket_name)
    asyncio.run(f())
def test_token_paths(s3):
    """``get_fs_token_paths`` expands the glob into a non-empty file list."""
    glob_url = "s3://" + test_bucket_name + "/*.csv"
    _fs, _token, files = fsspec.get_fs_token_paths(
        glob_url,
        storage_options={"client_kwargs": {"endpoint_url": endpoint_uri}},
    )
    assert files
def test_same_name_but_no_exact(s3):
    """``exists`` must not match a key that is only a prefix of real keys.

    E.g. with keys prefix1/prefix2 present, the bare "prefix" does not exist;
    conversely an exact key must not imply existence of longer variants.
    """
    s3.touch(test_bucket_name + "/very/similar/prefix1")
    s3.touch(test_bucket_name + "/very/similar/prefix2")
    s3.touch(test_bucket_name + "/very/similar/prefix3/something")
    # pure prefixes of real keys do not exist
    assert not s3.exists(test_bucket_name + "/very/similar/prefix")
    assert not s3.exists(test_bucket_name + "/very/similar/prefi")
    assert not s3.exists(test_bucket_name + "/very/similar/pref")
    # real keys and implied directories do
    assert s3.exists(test_bucket_name + "/very/similar/")
    assert s3.exists(test_bucket_name + "/very/similar/prefix1")
    assert s3.exists(test_bucket_name + "/very/similar/prefix2")
    assert s3.exists(test_bucket_name + "/very/similar/prefix3")
    assert s3.exists(test_bucket_name + "/very/similar/prefix3/")
    assert s3.exists(test_bucket_name + "/very/similar/prefix3/something")
    assert not s3.exists(test_bucket_name + "/very/similar/prefix3/some")
    # the converse: an exact key does not imply longer keys exist
    s3.touch(test_bucket_name + "/starting/very/similar/prefix")
    assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix1")
    assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix2")
    assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix3")
    assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix3/")
    assert not s3.exists(test_bucket_name + "/starting/very/similar/prefix3/something")
    assert s3.exists(test_bucket_name + "/starting/very/similar/prefix")
    assert s3.exists(test_bucket_name + "/starting/very/similar/prefix/")
def test_leading_forward_slash(s3):
    """Paths with and without the s3:// scheme refer to the same object."""
    path = test_bucket_name + "/some/file"
    s3.touch(path)
    assert s3.ls(test_bucket_name + "/some/")
    for scheme in ("", "s3://"):
        assert s3.exists(scheme + path)
def test_lsdir(s3):
    """Regression test for https://github.com/fsspec/s3fs/issues/475."""
    # a prior find() must not hide the directory from a subsequent ls()
    s3.find(test_bucket_name)
    subdir = test_bucket_name + "/test"
    assert subdir in s3.ls(test_bucket_name)
def test_rm_recursive_folder(s3):
    """Recursive rm removes files, directory placeholders, and whole buckets."""
    # plain implied directory
    s3.touch(test_bucket_name + "/sub/file")
    s3.rm(test_bucket_name + "/sub", recursive=True)
    assert not s3.exists(test_bucket_name + "/sub/file")
    assert not s3.exists(test_bucket_name + "/sub")
    # directory with an explicit zero-byte placeholder key
    s3.touch(test_bucket_name + "/sub/file")
    s3.touch(test_bucket_name + "/sub/")  # placeholder
    s3.rm(test_bucket_name + "/sub", recursive=True)
    assert not s3.exists(test_bucket_name + "/sub/file")
    assert not s3.exists(test_bucket_name + "/sub")
    # recursive rm of the whole bucket removes the bucket itself too
    s3.touch(test_bucket_name + "/sub/file")
    s3.rm(test_bucket_name, recursive=True)
    assert not s3.exists(test_bucket_name + "/sub/file")
    assert not s3.exists(test_bucket_name + "/sub")
    assert not s3.exists(test_bucket_name)
def test_copy_file_without_etag(s3, monkeypatch):
    """``cp_file`` works even when the cached info entry lacks an ETag."""
    s3.touch(test_bucket_name + "/copy_tests/file")
    s3.ls(test_bucket_name + "/copy_tests/")
    # grab the cached entry and strip its ETag to simulate stale/partial info
    [file] = s3.dircache[test_bucket_name + "/copy_tests"]
    assert file["name"] == test_bucket_name + "/copy_tests/file"
    file.pop("ETag")
    assert s3.info(file["name"]).get("ETag", None) is None
    s3.cp_file(file["name"], test_bucket_name + "/copy_tests/file2")
    assert s3.info(test_bucket_name + "/copy_tests/file2")["ETag"] is not None
def test_find_with_prefix(s3):
    """``find`` supports key-name prefix filtering via the ``prefix`` argument."""
    for cursor in range(100):
        s3.touch(test_bucket_name + f"/prefixes/test_{cursor}")
    s3.touch(test_bucket_name + "/prefixes2")
    assert len(s3.find(test_bucket_name + "/prefixes")) == 100
    # prefix relative to the bucket also matches the sibling "prefixes2" key
    assert len(s3.find(test_bucket_name, prefix="prefixes")) == 101
    assert len(s3.find(test_bucket_name + "/prefixes", prefix="test2_")) == 0
    # a bare path that is only a key prefix matches nothing without prefix=
    assert len(s3.find(test_bucket_name + "/prefixes/test_")) == 0
    assert len(s3.find(test_bucket_name + "/prefixes", prefix="test_")) == 100
    assert len(s3.find(test_bucket_name + "/prefixes/", prefix="test_")) == 100
    test_1s = s3.find(test_bucket_name + "/prefixes/test_1")
    assert len(test_1s) == 1
    assert test_1s[0] == test_bucket_name + "/prefixes/test_1"
    # prefix matching is string-based: test_1 plus test_10..test_19
    test_1s = s3.find(test_bucket_name + "/prefixes/", prefix="test_1")
    assert len(test_1s) == 11
    assert test_1s == [test_bucket_name + "/prefixes/test_1"] + [
        test_bucket_name + f"/prefixes/test_{cursor}" for cursor in range(10, 20)
    ]
    # prefix=None behaves like no prefix at all
    assert s3.find(test_bucket_name + "/prefixes/") == s3.find(
        test_bucket_name + "/prefixes/", prefix=None
    )
def test_list_after_find(s3):
    """find() on a single key must not clobber the cached bucket listing."""
    before = s3.ls("s3://test")
    s3.invalidate_cache("s3://test/2014-01-01.csv")
    s3.find("s3://test/2014-01-01.csv")
    assert s3.ls("s3://test") == before
def test_upload_recursive_to_bucket(s3, tmpdir):
    """Recursive put of a nested local tree into a bucket root (GH#491)."""
    folders = [os.path.join(tmpdir, d) for d in ["outer", "outer/inner"]]
    files = [os.path.join(tmpdir, f) for f in ["outer/afile", "outer/inner/bfile"]]
    for d in folders:
        os.mkdir(d)
    for f in files:
        # context manager instead of the leaking bare open(...).write(...)
        with open(f, "w") as fh:
            fh.write("hello")
    s3.put(folders[0], "newbucket", recursive=True)
def test_rm_file(s3):
    """``rm_file`` removes the object and its now-empty pseudo-directory."""
    target = test_bucket_name + "/to_be_removed/file"
    s3.touch(target)
    s3.rm_file(target)
    for path in (target, test_bucket_name + "/to_be_removed"):
        assert not s3.exists(path)
def test_exists_isdir(s3):
    """A nonexistent bucket is neither an existing path nor a directory."""
    bad_path = "s3://nyc-tlc-asdfasdf/trip data/"
    for predicate in (s3.exists, s3.isdir):
        assert not predicate(bad_path)
def test_list_del_multipart(s3):
    """In-flight multipart uploads can be listed and aborted bucket-wide."""
    path = test_bucket_name + "/afile"
    # open a write handle and push enough data to start a multipart upload;
    # deliberately NOT closed yet so the MPU stays in flight
    f = s3.open(path, "wb")
    f.write(b"0" * 6 * 2**20)
    out = s3.list_multipart_uploads(test_bucket_name)
    assert [_ for _ in out if _["Key"] == "afile"]
    s3.clear_multipart_uploads(test_bucket_name)
    out = s3.list_multipart_uploads(test_bucket_name)
    assert not [_ for _ in out if _["Key"] == "afile"]
    try:
        f.close()  # may error
    except Exception:
        pass
def test_split_path(s3):
    """``split_path`` handles plain bucket names and every supported ARN form."""
    buckets = [
        "my-test-bucket",
        "arn:aws:s3:region:123456789012:accesspoint/my-access-point-name",
        "arn:aws:s3-outposts:region:123456789012:outpost/outpost-id/bucket/my-test-bucket",
        "arn:aws:s3-outposts:region:123456789012:outpost/outpost-id/accesspoint/my-accesspoint-name",
        "arn:aws:s3-object-lambda:region:123456789012:accesspoint/my-lambda-object-name",
    ]
    test_key = "my/test/path"
    for expected_bucket in buckets:
        bucket, key, _ = s3.split_path("s3://" + expected_bucket + "/" + test_key)
        assert (bucket, key) == (expected_bucket, test_key)
def test_cp_directory_recursive(s3):
    """Recursive cp: trailing slash controls whether the source dir nests.

    Without a trailing slash, copying into an existing target nests the
    source directory name; with a slash, only the contents are copied.
    """
    src = test_bucket_name + "/src"
    src_file = src + "/file"
    s3.mkdir(src)
    s3.touch(src_file)
    target = test_bucket_name + "/target"
    # cp without slash
    assert not s3.exists(target)
    for loop in range(2):
        s3.cp(src, target, recursive=True)
        assert s3.isdir(target)
        if loop == 0:
            correct = [target + "/file"]
            assert s3.find(target) == correct
        else:
            # second copy into the now-existing target nests "src"
            correct = [target + "/file", target + "/src/file"]
            assert sorted(s3.find(target)) == correct
    s3.rm(target, recursive=True)
    # cp with slash
    assert not s3.exists(target)
    for loop in range(2):
        s3.cp(src + "/", target, recursive=True)
        assert s3.isdir(target)
        correct = [target + "/file"]
        assert s3.find(target) == correct
def test_get_directory_recursive(s3, tmpdir):
    """Recursive get: trailing slash controls whether the source dir nests.

    Mirrors test_cp_directory_recursive for the remote->local direction.
    """
    src = test_bucket_name + "/src"
    src_file = src + "/file"
    s3.mkdir(src)
    s3.touch(src_file)
    target = os.path.join(tmpdir, "target")
    target_fs = fsspec.filesystem("file")
    # get without slash
    assert not target_fs.exists(target)
    for loop in range(2):
        s3.get(src, target, recursive=True)
        assert target_fs.isdir(target)
        if loop == 0:
            assert target_fs.find(target) == [os.path.join(target, "file")]
        else:
            # second get into the existing target nests "src"
            assert sorted(target_fs.find(target)) == [
                os.path.join(target, "file"),
                os.path.join(target, "src", "file"),
            ]
    target_fs.rm(target, recursive=True)
    # get with slash
    assert not target_fs.exists(target)
    for loop in range(2):
        s3.get(src + "/", target, recursive=True)
        assert target_fs.isdir(target)
        assert target_fs.find(target) == [os.path.join(target, "file")]
def test_put_directory_recursive(s3, tmpdir):
    """Recursive put: trailing slash controls whether the source dir nests.

    Mirrors test_cp_directory_recursive for the local->remote direction.
    """
    src = os.path.join(tmpdir, "src")
    src_file = os.path.join(src, "file")
    source_fs = fsspec.filesystem("file")
    source_fs.mkdir(src)
    source_fs.touch(src_file)
    target = test_bucket_name + "/target"
    # put without slash
    assert not s3.exists(target)
    for loop in range(2):
        s3.put(src, target, recursive=True)
        assert s3.isdir(target)
        if loop == 0:
            assert s3.find(target) == [target + "/file"]
        else:
            # second put into the existing target nests "src"
            assert sorted(s3.find(target)) == [target + "/file", target + "/src/file"]
    s3.rm(target, recursive=True)
    # put with slash
    assert not s3.exists(target)
    for loop in range(2):
        s3.put(src + "/", target, recursive=True)
        assert s3.isdir(target)
        assert s3.find(target) == [target + "/file"]
def test_cp_two_files(s3):
    """Copying a list of files creates a target directory holding both."""
    src = test_bucket_name + "/src"
    file0 = src + "/file0"
    file1 = src + "/file1"
    s3.mkdir(src)
    for path in (file0, file1):
        s3.touch(path)
    target = test_bucket_name + "/target"
    assert not s3.exists(target)
    s3.cp([file0, file1], target)
    assert s3.isdir(target)
    expected = [target + "/file0", target + "/file1"]
    assert sorted(s3.find(target)) == expected
def test_async_stream(s3_base):
    """``open_async`` streams a file in fixed-size chunks until exhaustion."""
    fn = test_bucket_name + "/target"
    data = b"hello world" * 1000
    out = []
    async def read_stream():
        fs = S3FileSystem(
            anon=False,
            client_kwargs={"endpoint_url": endpoint_uri},
            skip_instance_cache=True,
        )
        await fs._mkdir(test_bucket_name)
        await fs._pipe(fn, data)
        f = await fs.open_async(fn, mode="rb", block_size=1000)
        while True:
            got = await f.read(1000)
            # size and position are tracked while streaming
            assert f.size == len(data)
            assert f.tell()
            if not got:
                break
            out.append(got)
    asyncio.run(read_stream())
    # all chunks concatenated reproduce the original payload
    assert b"".join(out) == data
def test_rm_invalidates_cache(s3):
    """Issue 761: both rm and rm_file must invalidate the listing cache."""
    cases = [
        (test_bucket_name + "/2014-01-01.csv", s3.rm),
        (test_bucket_name + "/2014-01-02.csv", s3.rm_file),
    ]
    for fn, remover in cases:
        assert s3.exists(fn)
        assert fn in s3.ls(test_bucket_name)
        remover(fn)
        assert not s3.exists(fn)
        assert fn not in s3.ls(test_bucket_name)
def test_cache_handles_find_with_maxdepth(s3):
    # Issue 773: invalidate_cache should not be needed when find is called with different maxdepth
    base_name = test_bucket_name + "/main"
    dir = base_name + "/dir1/fileB"
    file = base_name + "/fileA"
    s3.touch(dir)
    s3.touch(file)
    # Find with maxdepth=None
    f = s3.find(base_name, maxdepth=None, withdirs=False)
    assert base_name + "/fileA" in f
    assert base_name + "/dir1" not in f
    assert base_name + "/dir1/fileB" in f
    # Find with maxdepth=1.
    # Performed twice with cache invalidated between them which should give same result
    for _ in range(2):
        f = s3.find(base_name, maxdepth=1, withdirs=True)
        assert base_name + "/fileA" in f
        assert base_name + "/dir1" in f
        # fileB is below maxdepth and must not leak into the result
        assert base_name + "/dir1/fileB" not in f
        s3.invalidate_cache()
def test_bucket_versioning(s3):
    """Bucket versioning can be enabled and then disabled again."""
    bucket = "maybe_versioned"
    s3.mkdir(bucket)
    assert not s3.is_bucket_versioned(bucket)
    s3.make_bucket_versioned(bucket)
    assert s3.is_bucket_versioned(bucket)
    s3.make_bucket_versioned(bucket, False)
    assert not s3.is_bucket_versioned(bucket)
@pytest.fixture()
def s3_fixed_upload_size(s3):
    """Filesystem fixture with ``fixed_upload_size=True`` (constant part size).

    Depends on the ``s3`` fixture so the moto server and buckets exist.
    """
    s3_fixed = S3FileSystem(
        anon=False,
        client_kwargs={"endpoint_url": endpoint_uri},
        fixed_upload_size=True,
    )
    s3_fixed.invalidate_cache()
    yield s3_fixed
def test_upload_parts(s3_fixed_upload_size):
    """With fixed_upload_size, exactly block_size bytes are sent per part.

    NOTE(review): ``a`` is a module-level test path defined earlier in this
    file (outside this chunk).
    """
    with s3_fixed_upload_size.open(a, "wb", block_size=6_000_000) as f:
        f.write(b" " * 6_001_000)
        # only the 1000 bytes beyond the fixed part size remain buffered
        assert len(f.buffer.getbuffer()) == 1000
        # check we are at the right position
        assert f.tell() == 6_001_000
        # offset is introduced in fsspec.core, but never used.
        # apparently it should keep offset for part that is already uploaded
        assert f.offset == 6_000_000
        f.write(b" " * 6_001_000)
        assert len(f.buffer.getbuffer()) == 2000
        assert f.tell() == 2 * 6_001_000
        assert f.offset == 2 * 6_000_000
    with s3_fixed_upload_size.open(a, "r") as f:
        # total written bytes survive the part-boundary bookkeeping
        assert len(f.read()) == 6_001_000 * 2
def test_upload_part_with_prime_pads(s3_fixed_upload_size):
    """Fixed-size parts with pad sizes chosen to avoid accidental divisibility.

    NOTE(review): ``a`` is a module-level test path defined earlier in this
    file (outside this chunk).
    """
    block = 6_000_000
    pad1, pad2 = 1013, 1019  # prime pad sizes to exclude divisibility
    with s3_fixed_upload_size.open(a, "wb", block_size=block) as f:
        f.write(b" " * (block + pad1))
        # the pad beyond one full part stays in the buffer
        assert len(f.buffer.getbuffer()) == pad1
        # check we are at the right position
        assert f.tell() == block + pad1
        assert f.offset == block
        f.write(b" " * (block + pad2))
        assert len(f.buffer.getbuffer()) == pad1 + pad2
        assert f.tell() == 2 * block + pad1 + pad2
        assert f.offset == 2 * block
    with s3_fixed_upload_size.open(a, "r") as f:
        assert len(f.read()) == 2 * block + pad1 + pad2
@pytest.mark.asyncio
async def test_invalidate_cache(s3: s3fs.S3FileSystem) -> None:
    """A direct put_object then _pipe_file must both appear in _ls output."""
    # write behind s3fs's back via the raw API call
    await s3._call_s3("put_object", Bucket=test_bucket_name, Key="a/b.txt", Body=b"abc")
    before = await s3._ls(f"{test_bucket_name}/a/")
    assert sorted(before) == ["test/a/b.txt"]
    # a pipe through s3fs must invalidate the cached listing
    await s3._pipe_file(f"{test_bucket_name}/a/c.txt", data=b"abc")
    after = await s3._ls(f"{test_bucket_name}/a/")
    assert sorted(after) == ["test/a/b.txt", "test/a/c.txt"]
def test_exist_after_delete(s3):
    """``exists`` reports False for a directory right after recursive rm."""
    checkpoint = f"{test_bucket_name}/test/checkpoint_dir"
    s3.touch(f"{checkpoint}/file.txt")
    assert s3.exists(checkpoint)
    s3.rm(checkpoint, recursive=True)
    assert not s3.exists(checkpoint)
# condition: True if running on botocore < 1.36.0
# The exclusive-write ("create" mode) tests below are expected to fail on
# older botocore versions — presumably because IfNoneMatch support arrived
# in 1.36.0 (matches the version pinned here); confirm against the changelog.
old_botocore = version.parse(botocore.__version__) < version.parse("1.36.0")
@pytest.mark.xfail(
    reason="moto doesn't support IfNoneMatch for MPU when object created via MPU"
)
def test_pipe_exclusive_big(s3):
    """mode="create" on a multipart pipe must fail when the key already exists."""
    chunksize = 5 * 2**20  # minimum allowed
    data = b"x" * chunksize * 3
    s3.pipe(f"{test_bucket_name}/afile", data, mode="overwrite", chunksize=chunksize)
    s3.pipe(f"{test_bucket_name}/afile", data, mode="overwrite", chunksize=chunksize)
    with pytest.raises(FileExistsError):
        s3.pipe(f"{test_bucket_name}/afile", data, mode="create", chunksize=chunksize)
    # the aborted conditional upload must not leave a dangling MPU behind
    assert not s3.list_multipart_uploads(test_bucket_name)
@pytest.mark.xfail(
    # reason now matches the old_botocore condition above (< 1.36.0);
    # it previously claimed 1.33.0
    old_botocore, reason="botocore<1.36.0 lacks IfNoneMatch support", strict=True
)
def test_pipe_exclusive_big_after_small(s3):
    """Test conditional MPU after creating object via put_object

    This test is required because moto's implementation of IfNoneMatch for MPU
    only works when the object is initially created via put_object and not via
    MPU.
    """
    chunksize = 5 * 2**20  # minimum allowed
    # First, create object via put_object (small upload)
    s3.pipe(f"{test_bucket_name}/afile", b"small", mode="overwrite")
    # Now try multipart upload with mode="create" (should fail)
    with pytest.raises(FileExistsError):
        s3.pipe(
            f"{test_bucket_name}/afile",
            b"c" * chunksize * 3,
            mode="create",
            chunksize=chunksize,
        )
    # the aborted conditional upload must not leave a dangling MPU behind
    assert not s3.list_multipart_uploads(test_bucket_name)
@pytest.mark.xfail(
    reason="moto doesn't support IfNoneMatch for MPU when object created via MPU"
)
def test_put_exclusive_big(s3, tmpdir):
    """mode="create" on a multipart put must fail when the key already exists."""
    chunksize = 5 * 2**20  # minimum allowed
    fn = f"{tmpdir}/afile"
    with open(fn, "wb") as f:
        f.write(b"x" * chunksize * 3)
    s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite", chunksize=chunksize)
    s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite", chunksize=chunksize)
    with pytest.raises(FileExistsError):
        s3.put(fn, f"{test_bucket_name}/afile", mode="create", chunksize=chunksize)
    # the aborted conditional upload must not leave a dangling MPU behind
    assert not s3.list_multipart_uploads(test_bucket_name)
@pytest.mark.xfail(
    # reason now matches the old_botocore condition above (< 1.36.0);
    # it previously claimed 1.33.0
    old_botocore, reason="botocore<1.36.0 lacks IfNoneMatch support", strict=True
)
def test_put_exclusive_big_after_small(s3, tmpdir):
    """Test conditional MPU after creating object via put_object.

    This test is required because moto's implementation of IfNoneMatch for MPU
    only works when the object is initially created via put_object and not via
    MPU.
    """
    chunksize = 5 * 2**20  # minimum allowed
    fn = str(tmpdir.join("afile"))
    with open(fn, "wb") as f:
        f.write(b"x" * chunksize * 3)
    # First, create object via put_object (small upload)
    s3.pipe(f"{test_bucket_name}/afile", b"small", mode="overwrite")
    # Now try multipart upload with mode="create" (should fail)
    with pytest.raises(FileExistsError):
        s3.put(fn, f"{test_bucket_name}/afile", mode="create", chunksize=chunksize)
    # the aborted conditional upload must not leave a dangling MPU behind
    assert not s3.list_multipart_uploads(test_bucket_name)
@pytest.mark.xfail(
    # reason now matches the old_botocore condition above (< 1.36.0);
    # it previously claimed 1.33.0
    old_botocore, reason="botocore<1.36.0 lacks IfNoneMatch support", strict=True
)
def test_put_exclusive_small(s3, tmpdir):
    """mode="create" on a single-part put must fail when the key exists."""
    fn = f"{tmpdir}/afile"
    with open(fn, "wb") as f:
        f.write(b"x")
    s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite")
    s3.put(fn, f"{test_bucket_name}/afile", mode="overwrite")
    with pytest.raises(FileExistsError):
        s3.put(fn, f"{test_bucket_name}/afile", mode="create")
    # no multipart uploads should have been started at all for a small put
    assert not s3.list_multipart_uploads(test_bucket_name)
def test_bucket_info(s3):
    """``info`` on a bucket yields a directory entry named after the bucket."""
    bucket_info = s3.info(test_bucket_name)
    assert "VersionId" in bucket_info
    expected = {"type": "directory", "name": test_bucket_name}
    assert {k: bucket_info[k] for k in expected} == expected
# Byte-size constants used by the chunksize tests below.
MB = 2**20
GB = 2**30
TB = 2**40
@pytest.mark.parametrize(
    ["filesize", "chunksize", "expected"],
    [
        # small file, use default chunksize
        (1000, None, 50 * MB),
        # exact boundary, use default chunksize
        (50 * MB * MAX_UPLOAD_PARTS, None, 50 * MB),
        # file requiring increased chunksize
        (50 * MB * (MAX_UPLOAD_PARTS + 1), None, 52_434_043),
        # very large files, expect increased chunksize
        (1 * TB, None, 109_951_163),
        (5 * TB, None, 549_755_814),
        # respect explicit chunksize
        (5 * GB, 10 * MB, 10 * MB),
    ],
)
def test_calculate_chunksize(filesize, chunksize, expected):
    """Chunk size grows as needed so uploads fit within MAX_UPLOAD_PARTS."""
    assert calculate_chunksize(filesize, chunksize) == expected
def test_find_ls_fail(s3):
    """find() must not corrupt ls() results for tricky key layouts.

    Because of https://github.com/fsspec/s3fs/pull/989
    """
    client = get_boto3_client()
    files = {
        f"{test_bucket_name}/find/a/a": b"data",
        f"{test_bucket_name}/find/a/b": b"data",
        f"{test_bucket_name}/find/a": b"",  # duplicate of dir, without "/"
        f"{test_bucket_name}/find/b": b"",  # empty file without "/" and no children
        f"{test_bucket_name}/find/c/c": b"data",  # directory with no placeholder
        f"{test_bucket_name}/find/d/d": b"data",  # dir will acquire placeholder with "/"
    }
    client.put_object(Bucket=test_bucket_name, Key="find/d/", Body=b"")
    client.put_object(
        Bucket=test_bucket_name, Key="find/e/", Body=b""
    )  # placeholder only
    s3.pipe(files)
    out0 = s3.ls(f"{test_bucket_name}/find", detail=True)
    # a find over the whole bucket must leave the cached listing intact
    s3.find(test_bucket_name, detail=False)
    out = s3.ls(f"{test_bucket_name}/find", detail=True)
    assert out == out0
    s3.invalidate_cache()
    # same with a find rooted at the directory itself
    s3.find(f"{test_bucket_name}/find", detail=False)
    out = s3.ls(f"{test_bucket_name}/find", detail=True)
    assert out == out0
def test_find_missing_ls(s3):
    """Cached and uncached ls must agree after a find().

    See https://github.com/fsspec/s3fs/issues/988#issuecomment-3436727753
    """
    BUCKET = test_bucket_name
    BASE_PREFIX = "disappearing-folders/"
    BASE = f"s3://{BUCKET}/{BASE_PREFIX}"
    s3_with_cache = S3FileSystem(
        anon=False,
        use_listings_cache=True,
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    s3_no_cache = S3FileSystem(
        anon=False,
        use_listings_cache=False,
        client_kwargs={"endpoint_url": endpoint_uri},
    )
    s3_with_cache.pipe({f"{BASE}folder/foo/1.txt": b"", f"{BASE}bar.txt": b""})
    # the find populates the listings cache on the caching instance
    s3_with_cache.find(BASE)
    listed_cached = s3_with_cache.ls(BASE, detail=False)
    listed_no_cache = s3_no_cache.ls(BASE, detail=False)
    assert set(listed_cached) == set(listed_no_cache)
def test_session_close():
    """A session can be opened, used, and closed twice in a row.

    NOTE(review): this test lists a public AWS open-data bucket, so it
    requires network access.
    """
    async def run_program(run):
        s3 = s3fs.S3FileSystem(anon=True, asynchronous=True)
        session = await s3.set_session()
        files = await s3._ls(
            "s3://noaa-hrrr-bdp-pds/hrrr.20140730/conus/"
        )  # Random open data store
        print(f"Number of files {len(files)}")
        await session.close()
        import aiobotocore.httpsession

        # bare attribute access only -- presumably a smoke check that the
        # class is still importable after close; confirm intent with authors
        aiobotocore.httpsession.AIOHTTPSession
    asyncio.run(run_program(True))
    asyncio.run(run_program(False))
def test_rm_recursive_prfix(s3):
    """Recursive rm of a bare zero-byte directory placeholder key.

    NOTE(review): the typo in the test name ("prfix") is kept, since renaming
    would change the pytest test id.
    """
    prefix = "logs/"  # must end with "/"
    # Create empty "directory" in S3
    client = get_boto3_client()
    client.put_object(Bucket=test_bucket_name, Key=prefix, Body=b"")
    logs_path = f"s3://{test_bucket_name}/{prefix}"
    s3.rm(logs_path, recursive=True)
    assert not s3.isdir(logs_path)
s3fs-2026.2.0/s3fs/tests/test_utils.py 0000664 0000000 0000000 00000000562 15141211055 0017410 0 ustar 00root root 0000000 0000000 import s3fs.utils as utils
def test_get_brange():
    """_get_brange yields inclusive (start, end) byte ranges covering size."""
    # final range is truncated when block does not divide size evenly
    assert list(utils._get_brange(100, 24)) == [
        (0, 23),
        (24, 47),
        (48, 71),
        (72, 95),
        (96, 99),
    ]
    # exact division: no truncated tail
    assert list(utils._get_brange(100, 25)) == [(0, 24), (25, 49), (50, 74), (75, 99)]
    assert list(utils._get_brange(100, 26)) == [(0, 25), (26, 51), (52, 77), (78, 99)]
s3fs-2026.2.0/s3fs/utils.py 0000664 0000000 0000000 00000012131 15141211055 0015202 0 ustar 00root root 0000000 0000000 import errno
import logging
from contextlib import contextmanager, AsyncExitStack
from botocore.exceptions import ClientError
logger = logging.getLogger("s3fs")
@contextmanager
def ignoring(*exceptions):
    """Context manager that silently swallows the given exception types."""
    try:
        yield
    except exceptions:
        pass
class S3BucketRegionCache:
    """Cache of per-region S3 clients, resolved per bucket.

    See https://github.com/aio-libs/aiobotocore/issues/866 for details.
    """

    def __init__(self, session, **client_kwargs):
        # aiobotocore session used to create clients
        self._session = session
        # owns the lifetimes of all clients created here
        self._stack = AsyncExitStack()
        # lazily-created region-less ("general") client
        self._client = None
        self._client_kwargs = client_kwargs
        # bucket name -> client (points into _regions)
        self._buckets = {}
        # region name -> client
        self._regions = {}

    async def get_bucket_client(self, bucket_name=None):
        """Return a client bound to *bucket_name*'s region.

        Falls back to the general client when no bucket is given or the
        region cannot be determined from a HEAD_BUCKET response.
        """
        if bucket_name in self._buckets:
            return self._buckets[bucket_name]
        general_client = await self.get_client()
        if bucket_name is None:
            return general_client
        try:
            response = await general_client.head_bucket(Bucket=bucket_name)
        except ClientError as e:
            # even error responses may carry the x-amz-bucket-region header
            logger.debug("RC: HEAD_BUCKET call for %r has failed", bucket_name)
            response = e.response
        region = (
            response["ResponseMetadata"]
            .get("HTTPHeaders", {})
            .get("x-amz-bucket-region")
        )
        if not region:
            logger.debug(
                "RC: No region in HEAD_BUCKET call response for %r, returning the general client",
                bucket_name,
            )
            return general_client
        if region not in self._regions:
            logger.debug(
                "RC: Creating a new regional client for %r on the region %r",
                bucket_name,
                region,
            )
            self._regions[region] = await self._stack.enter_async_context(
                self._session.create_client(
                    "s3", region_name=region, **self._client_kwargs
                )
            )
        client = self._buckets[bucket_name] = self._regions[region]
        return client

    async def get_client(self):
        """Return (creating on first use) the region-less general client."""
        if not self._client:
            self._client = await self._stack.enter_async_context(
                self._session.create_client("s3", **self._client_kwargs)
            )
        return self._client

    async def clear(self):
        """Close every client created here and drop all cached entries."""
        logger.debug("RC: discarding all clients")
        self._buckets.clear()
        self._regions.clear()
        self._client = None
        await self._stack.aclose()

    async def __aenter__(self):
        return self

    async def __aexit__(self, *exc_args):
        await self.clear()
class FileExpired(IOError):
    """
    Is raised, when the file content has been changed from a different process after
    opening the file. Reading the file would lead to invalid or inconsistent output.
    This can also be triggered by outdated file-information inside the directory cache.
    In this case ``S3FileSystem.invalidate_cache`` can be used to force an update of
    the file-information when opening the file.
    """

    def __init__(self, filename: str, e_tag: str):
        # IOError(errno, strerror): report EBUSY with a descriptive message
        message = (
            "The remote file corresponding to filename %s and Etag %s no longer exists."
            % (filename, e_tag)
        )
        super().__init__(errno.EBUSY, message)
def title_case(string):
    """
    TitleCases a given string.

    Parameters
    ----------
    string : underscore separated string
    """
    parts = string.split("_")
    return "".join(map(str.capitalize, parts))
class ParamKwargsHelper:
    """
    Utility class to help extract the subset of keys that an s3 method is
    actually using

    Parameters
    ----------
    s3 : boto S3FileSystem
    """

    # NOTE: class-level, so the operation-model key cache is shared between
    # all instances
    _kwarg_cache = {}

    def __init__(self, s3):
        self.s3 = s3

    def _get_valid_keys(self, model_name):
        """Return (and cache) the set of input keys for one operation model."""
        if model_name not in self._kwarg_cache:
            model = self.s3.meta.service_model.operation_model(model_name)
            valid_keys = (
                set(model.input_shape.members.keys())
                if model.input_shape is not None
                else set()
            )
            self._kwarg_cache[model_name] = valid_keys
        return self._kwarg_cache[model_name]

    def filter_dict(self, method_name, d):
        """Drop entries of ``d`` that ``method_name`` does not accept.

        ``method_name`` is the snake_case client method; ``d`` may also be an
        SSEParams instance, which is expanded via ``to_kwargs`` first.
        """
        model_name = title_case(method_name)
        valid_keys = self._get_valid_keys(model_name)
        if isinstance(d, SSEParams):
            d = d.to_kwargs()
        return {k: v for k, v in d.items() if k in valid_keys}
class SSEParams:
    """Holder for S3 server-side-encryption request parameters.

    Attribute names match the corresponding boto3 keyword arguments.
    """

    def __init__(
        self,
        server_side_encryption=None,
        sse_customer_algorithm=None,
        sse_customer_key=None,
        sse_kms_key_id=None,
    ):
        self.ServerSideEncryption = server_side_encryption
        self.SSECustomerAlgorithm = sse_customer_algorithm
        self.SSECustomerKey = sse_customer_key
        self.SSEKMSKeyId = sse_kms_key_id

    def to_kwargs(self):
        """Return only the parameters that were actually set, as a dict."""
        kwargs = {}
        for name, value in self.__dict__.items():
            if value is not None:
                kwargs[name] = value
        return kwargs
def _get_brange(size, block):
"""
Chunk up a file into zero-based byte ranges
Parameters
----------
size : file size
block : block size
"""
for offset in range(0, size, block):
yield offset, min(offset + block - 1, size - 1)
s3fs-2026.2.0/setup.cfg 0000664 0000000 0000000 00000002360 15141211055 0014436 0 ustar 00root root 0000000 0000000 [metadata]
long_description: file: README.rst
[versioneer]
VCS = git
style = pep440
versionfile_source = s3fs/_version.py
versionfile_build = s3fs/_version.py
tag_prefix = ""
[flake8]
exclude = __init__.py,versioneer.py,s3fs/tests/
max-line-length = 95
ignore =
# Extra space in brackets
E20,
# Multiple spaces around ","
E231,E241,
# Comments
E26,
# Import formatting
E4,
# Comparing types instead of isinstance
E721,
# Assigning lambda expression
E731,
# continuation line under-indented for hanging indent
E121,
# continuation line over-indented for hanging indent
E126,
# continuation line over-indented for visual indent
E127,
# E128 continuation line under-indented for visual indent
E128,
# multiple statements on one line (semicolon)
E702,
# line break before binary operator
W503,
# visually indented line with same indent as next logical line
E129,
# unexpected indentation
E116,
# redefinition of unused 'loop' from line 10
F811,
# local variable is assigned to but never used
F841,
# Ambiguous variable names
E741
# line break after binary operator
W504,
# line too long (leave it to black!)
E501,
s3fs-2026.2.0/setup.py 0000775 0000000 0000000 00000002100 15141211055 0014322 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
from setuptools import setup
import versioneer
# Package metadata and build configuration for s3fs.
setup(
    name="s3fs",
    version=versioneer.get_version(),
    cmdclass=versioneer.get_cmdclass(),
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Programming Language :: Python :: 3.14",
    ],
    description="Convenient Filesystem interface over S3",
    url="http://github.com/fsspec/s3fs/",
    maintainer="Martin Durant",
    maintainer_email="mdurant@continuum.io",
    license="BSD",
    keywords="s3, boto",
    packages=["s3fs"],
    python_requires=">= 3.10",
    # BUG FIX: previously the requirement list was wrapped in an extra pair
    # of brackets ([open(...).read().strip().split("\n")]), producing a
    # nested list-of-lists instead of a flat list of requirement strings.
    install_requires=open("requirements.txt").read().strip().split("\n"),
    # NOTE(review): this passes the literal string "README.md" rather than
    # the file's contents, and setup.cfg points at README.rst instead —
    # the rendered long description is likely wrong; confirm intent.
    long_description="README.md",
    long_description_content_type="text/markdown",
    zip_safe=False,
)
s3fs-2026.2.0/test_requirements.txt 0000664 0000000 0000000 00000000117 15141211055 0017136 0 ustar 00root root 0000000 0000000 mock; python_version < '3.3'
moto>=4
flask
flask_cors
pytest>=4.2.0
pytest-env
s3fs-2026.2.0/versioneer.py 0000664 0000000 0000000 00000251506 15141211055 0015360 0 ustar 00root root 0000000 0000000 # Version: 0.29
"""The Versioneer - like a rocketeer, but for versions.
The Versioneer
==============
* like a rocketeer, but for versions!
* https://github.com/python-versioneer/python-versioneer
* Brian Warner
* License: Public Domain (Unlicense)
* Compatible with: Python 3.7, 3.8, 3.9, 3.10, 3.11 and pypy3
* [![Latest Version][pypi-image]][pypi-url]
* [![Build Status][travis-image]][travis-url]
This is a tool for managing a recorded version number in setuptools-based
python projects. The goal is to remove the tedious and error-prone "update
the embedded version string" step from your release process. Making a new
release should be as easy as recording a new tag in your version-control
system, and maybe making new tarballs.
## Quick Install
Versioneer provides two installation modes. The "classic" vendored mode installs
a copy of versioneer into your repository. The experimental build-time dependency mode
is intended to allow you to skip this step and simplify the process of upgrading.
### Vendored mode
* `pip install versioneer` to somewhere in your $PATH
* A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is
available, so you can also use `conda install -c conda-forge versioneer`
* add a `[tool.versioneer]` section to your `pyproject.toml` or a
`[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md))
* Note that you will need to add `tomli; python_version < "3.11"` to your
build-time dependencies if you use `pyproject.toml`
* run `versioneer install --vendor` in your source tree, commit the results
* verify version information with `python setup.py version`
### Build-time dependency mode
* `pip install versioneer` to somewhere in your $PATH
* A [conda-forge recipe](https://github.com/conda-forge/versioneer-feedstock) is
available, so you can also use `conda install -c conda-forge versioneer`
* add a `[tool.versioneer]` section to your `pyproject.toml` or a
`[versioneer]` section to your `setup.cfg` (see [Install](INSTALL.md))
* add `versioneer` (with `[toml]` extra, if configuring in `pyproject.toml`)
to the `requires` key of the `build-system` table in `pyproject.toml`:
```toml
[build-system]
requires = ["setuptools", "versioneer[toml]"]
build-backend = "setuptools.build_meta"
```
* run `versioneer install --no-vendor` in your source tree, commit the results
* verify version information with `python setup.py version`
## Version Identifiers
Source trees come from a variety of places:
* a version-control system checkout (mostly used by developers)
* a nightly tarball, produced by build automation
* a snapshot tarball, produced by a web-based VCS browser, like github's
"tarball from tag" feature
* a release tarball, produced by "setup.py sdist", distributed through PyPI
Within each source tree, the version identifier (either a string or a number,
this tool is format-agnostic) can come from a variety of places:
* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows
about recent "tags" and an absolute revision-id
* the name of the directory into which the tarball was unpacked
* an expanded VCS keyword ($Id$, etc)
* a `_version.py` created by some earlier build step
For released software, the version identifier is closely related to a VCS
tag. Some projects use tag names that include more than just the version
string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool
needs to strip the tag prefix to extract the version identifier. For
unreleased software (between tags), the version identifier should provide
enough information to help developers recreate the same tree, while also
giving them an idea of roughly how old the tree is (after version 1.2, before
version 1.3). Many VCS systems can report a description that captures this,
for example `git describe --tags --dirty --always` reports things like
"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the
0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has
uncommitted changes).
The version identifier is used for multiple purposes:
* to allow the module to self-identify its version: `myproject.__version__`
* to choose a name and prefix for a 'setup.py sdist' tarball
## Theory of Operation
Versioneer works by adding a special `_version.py` file into your source
tree, where your `__init__.py` can import it. This `_version.py` knows how to
dynamically ask the VCS tool for version information at import time.
`_version.py` also contains `$Revision$` markers, and the installation
process marks `_version.py` to have this marker rewritten with a tag name
during the `git archive` command. As a result, generated tarballs will
contain enough information to get the proper version.
To allow `setup.py` to compute a version too, a `versioneer.py` is added to
the top level of your source tree, next to `setup.py` and the `setup.cfg`
that configures it. This overrides several distutils/setuptools commands to
compute the version when invoked, and changes `setup.py build` and `setup.py
sdist` to replace `_version.py` with a small static file that contains just
the generated version data.
## Installation
See [INSTALL.md](./INSTALL.md) for detailed installation instructions.
## Version-String Flavors
Code which uses Versioneer can learn about its version string at runtime by
importing `_version` from your main `__init__.py` file and running the
`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can
import the top-level `versioneer.py` and run `get_versions()`.
Both functions return a dictionary with different flavors of version
information:
* `['version']`: A condensed version string, rendered using the selected
style. This is the most commonly used value for the project's version
string. The default "pep440" style yields strings like `0.11`,
`0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section
below for alternative styles.
* `['full-revisionid']`: detailed revision identifier. For Git, this is the
full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac".
* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the
commit date in ISO 8601 format. This will be None if the date is not
available.
* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that
this is only accurate if run in a VCS checkout, otherwise it is likely to
be False or None
* `['error']`: if the version string could not be computed, this will be set
to a string describing the problem, otherwise it will be None. It may be
useful to throw an exception in setup.py if this is set, to avoid e.g.
creating tarballs with a version string of "unknown".
Some variants are more useful than others. Including `full-revisionid` in a
bug report should allow developers to reconstruct the exact code being tested
(or indicate the presence of local changes that should be shared with the
developers). `version` is suitable for display in an "about" box or a CLI
`--version` output: it can be easily compared against release notes and lists
of bugs fixed in various releases.
The installer adds the following text to your `__init__.py` to place a basic
version in `YOURPROJECT.__version__`:
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
## Styles
The setup.cfg `style=` configuration controls how the VCS information is
rendered into a version string.
The default style, "pep440", produces a PEP440-compliant string, equal to the
un-prefixed tag name for actual releases, and containing an additional "local
version" section with more detail for in-between builds. For Git, this is
TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags
--dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the
tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and
that this commit is two revisions ("+2") beyond the "0.11" tag. For released
software (exactly equal to a known tag), the identifier will only contain the
stripped tag, e.g. "0.11".
Other styles are available. See [details.md](details.md) in the Versioneer
source tree for descriptions.
## Debugging
Versioneer tries to avoid fatal errors: if something goes wrong, it will tend
to return a version of "0+unknown". To investigate the problem, run `setup.py
version`, which will run the version-lookup code in a verbose mode, and will
display the full contents of `get_versions()` (including the `error` string,
which may help identify what went wrong).
## Known Limitations
Some situations are known to cause problems for Versioneer. This details the
most significant ones. More can be found on Github
[issues page](https://github.com/python-versioneer/python-versioneer/issues).
### Subprojects
Versioneer has limited support for source trees in which `setup.py` is not in
the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are
two common reasons why `setup.py` might not be in the root:
* Source trees which contain multiple subprojects, such as
[Buildbot](https://github.com/buildbot/buildbot), which contains both
"master" and "slave" subprojects, each with their own `setup.py`,
`setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI
distributions (and upload multiple independently-installable tarballs).
* Source trees whose main purpose is to contain a C library, but which also
provide bindings to Python (and perhaps other languages) in subdirectories.
Versioneer will look for `.git` in parent directories, and most operations
should get the right version string. However `pip` and `setuptools` have bugs
and implementation details which frequently cause `pip install .` from a
subproject directory to fail to find a correct version string (so it usually
defaults to `0+unknown`).
`pip install --editable .` should work correctly. `setup.py install` might
work too.
Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in
some later version.
[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking
this issue. The discussion in
[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the
issue from the Versioneer side in more detail.
[pip PR#3176](https://github.com/pypa/pip/pull/3176) and
[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve
pip to let Versioneer work correctly.
Versioneer-0.16 and earlier only looked for a `.git` directory next to the
`setup.cfg`, so subprojects were completely unsupported with those releases.
### Editable installs with setuptools <= 18.5
`setup.py develop` and `pip install --editable .` allow you to install a
project into a virtualenv once, then continue editing the source code (and
test) without re-installing after every change.
"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a
convenient way to specify executable scripts that should be installed along
with the python package.
These both work as expected when using modern setuptools. When using
setuptools-18.5 or earlier, however, certain operations will cause
`pkg_resources.DistributionNotFound` errors when running the entrypoint
script, which must be resolved by re-installing the package. This happens
when the install happens with one version, then the egg_info data is
regenerated while a different version is checked out. Many setup.py commands
cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into
a different virtualenv), so this can be surprising.
[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes
this one, but upgrading to a newer version of setuptools should probably
resolve it.
## Updating Versioneer
To upgrade your project to a new release of Versioneer, do the following:
* install the new Versioneer (`pip install -U versioneer` or equivalent)
* edit `setup.cfg` and `pyproject.toml`, if necessary,
to include any new configuration settings indicated by the release notes.
See [UPGRADING](./UPGRADING.md) for details.
* re-run `versioneer install --[no-]vendor` in your source tree, to replace
`SRC/_version.py`
* commit any changed files
## Future Directions
This tool is designed to make it easily extended to other version-control
systems: all VCS-specific components are in separate directories like
src/git/ . The top-level `versioneer.py` script is assembled from these
components by running make-versioneer.py . In the future, make-versioneer.py
will take a VCS name as an argument, and will construct a version of
`versioneer.py` that is specific to the given VCS. It might also take the
configuration arguments that are currently provided manually during
installation by editing setup.py . Alternatively, it might go the other
direction and include code from all supported VCS systems, reducing the
number of intermediate scripts.
## Similar projects
* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time
dependency
* [minver](https://github.com/jbweston/miniver) - a lightweight reimplementation of
versioneer
* [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools
plugin
## License
To make Versioneer easier to embed, all its code is dedicated to the public
domain. The `_version.py` that it creates is also in the public domain.
Specifically, both are released under the "Unlicense", as described in
https://unlicense.org/.
[pypi-image]: https://img.shields.io/pypi/v/versioneer.svg
[pypi-url]: https://pypi.python.org/pypi/versioneer/
[travis-image]:
https://img.shields.io/travis/com/python-versioneer/python-versioneer.svg
[travis-url]: https://travis-ci.com/github/python-versioneer/python-versioneer
"""
# pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring
# pylint:disable=missing-class-docstring,too-many-branches,too-many-statements
# pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error
# pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with
# pylint:disable=attribute-defined-outside-init,too-many-arguments
import configparser
import errno
import json
import os
import re
import subprocess
import sys
from pathlib import Path
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
from typing import NoReturn
import functools
# Optional TOML support: tomllib is in the stdlib on Python >= 3.11; on
# older interpreters fall back to the third-party "tomli" backport. When
# neither is available, have_tomllib is False and get_config_from_root()
# skips pyproject.toml and reads setup.cfg instead.
have_tomllib = True
if sys.version_info >= (3, 11):
    import tomllib
else:
    try:
        import tomli as tomllib
    except ImportError:
        have_tomllib = False
class VersioneerConfig:
    """Container for Versioneer configuration parameters.

    Populated by get_config_from_root() from either the [tool.versioneer]
    table of pyproject.toml or the [versioneer] section of setup.cfg.
    """

    VCS: str  # name of the version-control system backend (e.g. "git")
    style: str  # version-string rendering style, e.g. "pep440"
    tag_prefix: str  # prefix stripped from VCS tags to obtain the version
    versionfile_source: str  # path to _version.py within the source tree
    versionfile_build: Optional[str]  # path to _version.py in the build tree
    parentdir_prefix: Optional[str]  # directory-name prefix for tarball fallback
    verbose: Optional[bool]  # emit extra diagnostics when True
def get_root() -> str:
    """Get the project root directory.

    We require that all commands are run from the project root, i.e. the
    directory that contains setup.py, setup.cfg, and versioneer.py .

    Returns the absolute path of the directory containing setup.py,
    pyproject.toml, or versioneer.py — first checking the current working
    directory, then the directory of sys.argv[0].

    Raises VersioneerBadRootError if neither location contains any of the
    marker files.
    """
    root = os.path.realpath(os.path.abspath(os.getcwd()))
    setup_py = os.path.join(root, "setup.py")
    pyproject_toml = os.path.join(root, "pyproject.toml")
    versioneer_py = os.path.join(root, "versioneer.py")
    if not (
        os.path.exists(setup_py)
        or os.path.exists(pyproject_toml)
        or os.path.exists(versioneer_py)
    ):
        # allow 'python path/to/setup.py COMMAND'
        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
        setup_py = os.path.join(root, "setup.py")
        pyproject_toml = os.path.join(root, "pyproject.toml")
        versioneer_py = os.path.join(root, "versioneer.py")
    if not (
        os.path.exists(setup_py)
        or os.path.exists(pyproject_toml)
        or os.path.exists(versioneer_py)
    ):
        err = (
            "Versioneer was unable to run the project root directory. "
            "Versioneer requires setup.py to be executed from "
            "its immediate directory (like 'python setup.py COMMAND'), "
            "or in a way that lets it use sys.argv[0] to find the root "
            "(like 'python path/to/setup.py COMMAND')."
        )
        raise VersioneerBadRootError(err)
    try:
        # Certain runtime workflows (setup.py install/develop in a setuptools
        # tree) execute all dependencies in a single python process, so
        # "versioneer" may be imported multiple times, and python's shared
        # module-import table will cache the first one. So we can't use
        # os.path.dirname(__file__), as that will find whichever
        # versioneer.py was first imported, even in later projects.
        my_path = os.path.realpath(os.path.abspath(__file__))
        me_dir = os.path.normcase(os.path.splitext(my_path)[0])
        vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0])
        # Warn (don't fail) when the imported versioneer.py is not the one
        # that lives in this project's root.
        if me_dir != vsr_dir and "VERSIONEER_PEP518" not in globals():
            print(
                "Warning: build in %s is using versioneer.py from %s"
                % (os.path.dirname(my_path), versioneer_py)
            )
    except NameError:
        # __file__ is undefined in some embedding scenarios; skip the check.
        pass
    return root
def get_config_from_root(root: str) -> VersioneerConfig:
    """Read the project setup.cfg file to determine Versioneer config.

    Prefers the [tool.versioneer] table of pyproject.toml when a TOML
    parser is available, falling back to the [versioneer] section of
    setup.cfg.

    Parameters
    ----------
    root : str
        Project root directory (as returned by get_root()).

    Returns
    -------
    VersioneerConfig
    """
    # This might raise OSError (if setup.cfg is missing), or
    # configparser.NoSectionError (if it lacks a [versioneer] section), or
    # configparser.NoOptionError (if it lacks "VCS="). See the docstring at
    # the top of versioneer.py for instructions on writing your setup.cfg .
    root_pth = Path(root)
    pyproject_toml = root_pth / "pyproject.toml"
    setup_cfg = root_pth / "setup.cfg"
    section: Union[Dict[str, Any], configparser.SectionProxy, None] = None
    if pyproject_toml.exists() and have_tomllib:
        try:
            with open(pyproject_toml, "rb") as fobj:
                pp = tomllib.load(fobj)
            section = pp["tool"]["versioneer"]
        except (tomllib.TOMLDecodeError, KeyError) as e:
            # A broken or incomplete pyproject.toml is not fatal;
            # fall through to setup.cfg below.
            print(f"Failed to load config from {pyproject_toml}: {e}")
            print("Try to load it from setup.cfg")
    if not section:
        parser = configparser.ConfigParser()
        with open(setup_cfg) as cfg_file:
            parser.read_file(cfg_file)
        parser.get("versioneer", "VCS")  # raise error if missing
        section = parser["versioneer"]
    # `cast`` really shouldn't be used, but its simplest for the
    # common VersioneerConfig users at the moment. We verify against
    # `None` values elsewhere where it matters
    cfg = VersioneerConfig()
    cfg.VCS = section["VCS"]
    cfg.style = section.get("style", "")
    cfg.versionfile_source = cast(str, section.get("versionfile_source"))
    cfg.versionfile_build = section.get("versionfile_build")
    cfg.tag_prefix = cast(str, section.get("tag_prefix"))
    # A literally-quoted empty string in the config means "no prefix".
    if cfg.tag_prefix in ("''", '""', None):
        cfg.tag_prefix = ""
    cfg.parentdir_prefix = section.get("parentdir_prefix")
    if isinstance(section, configparser.SectionProxy):
        # Make sure configparser translates to bool
        cfg.verbose = section.getboolean("verbose")
    else:
        cfg.verbose = section.get("verbose")
    return cfg
class NotThisMethod(Exception):
    """Exception raised if a method is not valid for the current scenario.

    Presumably used to signal that one version-discovery strategy failed
    so the caller can try the next one — confirm against the call sites.
    """
# these dictionaries contain VCS-specific tools
LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}


def register_vcs_handler(vcs: str, method: str) -> Callable:  # decorator
    """Build a decorator that files a function away as HANDLERS[vcs][method]."""

    def decorate(f: Callable) -> Callable:
        """Record f in the handler table and hand it back unchanged."""
        per_vcs = HANDLERS.setdefault(vcs, {})
        per_vcs[method] = f
        return f

    return decorate
def run_command(
    commands: List[str],
    args: List[str],
    cwd: Optional[str] = None,
    verbose: bool = False,
    hide_stderr: bool = False,
    env: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[str], Optional[int]]:
    """Run the first available command from *commands* with *args*.

    Returns (stdout, returncode); (None, None) when no command could be
    started, and (None, returncode) when the command exited non-zero.
    """
    assert isinstance(commands, list)
    extra_kwargs: Dict[str, Any] = {}
    if sys.platform == "win32":
        # This hides the console window if pythonw.exe is used
        info = subprocess.STARTUPINFO()
        info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        extra_kwargs["startupinfo"] = info

    process = None
    for candidate in commands:
        dispcmd = str([candidate] + args)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            process = subprocess.Popen(
                [candidate] + args,
                cwd=cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=(subprocess.PIPE if hide_stderr else None),
                **extra_kwargs,
            )
        except OSError as e:
            if e.errno == errno.ENOENT:
                # Command not installed; try the next candidate.
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(e)
            return None, None
        break
    else:
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None

    stdout = process.communicate()[0].strip().decode()
    if process.returncode != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, process.returncode
    return stdout, process.returncode
LONG_VERSION_PY[
"git"
] = r'''
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
# directories (produced by setup.py build) will contain a much shorter file
# that just contains the computed version number.
# This file is released into the public domain.
# Generated by versioneer-0.29
# https://github.com/python-versioneer/python-versioneer
"""Git implementation of _version.py."""
import errno
import os
import re
import subprocess
import sys
from typing import Any, Callable, Dict, List, Optional, Tuple
import functools
def get_keywords() -> Dict[str, str]:
"""Get the keywords needed to look up the version information."""
# these strings will be replaced by git during git-archive.
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s"
git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s"
git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
class VersioneerConfig:
"""Container for Versioneer configuration parameters."""
VCS: str
style: str
tag_prefix: str
parentdir_prefix: str
versionfile_source: str
verbose: bool
def get_config() -> VersioneerConfig:
"""Create, populate and return the VersioneerConfig() object."""
# these strings are filled in when 'setup.py versioneer' creates
# _version.py
cfg = VersioneerConfig()
cfg.VCS = "git"
cfg.style = "%(STYLE)s"
cfg.tag_prefix = "%(TAG_PREFIX)s"
cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s"
cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s"
cfg.verbose = False
return cfg
class NotThisMethod(Exception):
"""Exception raised if a method is not valid for the current scenario."""
LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}
def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator
"""Create decorator to mark a method as the handler of a VCS."""
def decorate(f: Callable) -> Callable:
"""Store f in HANDLERS[vcs][method]."""
if vcs not in HANDLERS:
HANDLERS[vcs] = {}
HANDLERS[vcs][method] = f
return f
return decorate
def run_command(
commands: List[str],
args: List[str],
cwd: Optional[str] = None,
verbose: bool = False,
hide_stderr: bool = False,
env: Optional[Dict[str, str]] = None,
) -> Tuple[Optional[str], Optional[int]]:
"""Call the given command(s)."""
assert isinstance(commands, list)
process = None
popen_kwargs: Dict[str, Any] = {}
if sys.platform == "win32":
# This hides the console window if pythonw.exe is used
startupinfo = subprocess.STARTUPINFO()
startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
popen_kwargs["startupinfo"] = startupinfo
for command in commands:
try:
dispcmd = str([command] + args)
# remember shell=False, so use git.cmd on windows, not just git
process = subprocess.Popen([command] + args, cwd=cwd, env=env,
stdout=subprocess.PIPE,
stderr=(subprocess.PIPE if hide_stderr
else None), **popen_kwargs)
break
except OSError as e:
if e.errno == errno.ENOENT:
continue
if verbose:
print("unable to run %%s" %% dispcmd)
print(e)
return None, None
else:
if verbose:
print("unable to find command, tried %%s" %% (commands,))
return None, None
stdout = process.communicate()[0].strip().decode()
if process.returncode != 0:
if verbose:
print("unable to run %%s (error)" %% dispcmd)
print("stdout was %%s" %% stdout)
return None, process.returncode
return stdout, process.returncode
def versions_from_parentdir(
parentdir_prefix: str,
root: str,
verbose: bool,
) -> Dict[str, Any]:
"""Try to determine the version from the parent directory name.
Source tarballs conventionally unpack into a directory that includes both
the project name and a version string. We will also support searching up
two directory levels for an appropriately named parent directory
"""
rootdirs = []
for _ in range(3):
dirname = os.path.basename(root)
if dirname.startswith(parentdir_prefix):
return {"version": dirname[len(parentdir_prefix):],
"full-revisionid": None,
"dirty": False, "error": None, "date": None}
rootdirs.append(root)
root = os.path.dirname(root) # up a level
if verbose:
print("Tried directories %%s but none started with prefix %%s" %%
(str(rootdirs), parentdir_prefix))
raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
"""Extract version information from the given file."""
# the code embedded in _version.py can just fetch the value of these
# keywords. When used from setup.py, we don't want to import _version.py,
# so we do it with a regexp instead. This function is not used from
# _version.py.
keywords: Dict[str, str] = {}
try:
with open(versionfile_abs, "r") as fobj:
for line in fobj:
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["refnames"] = mo.group(1)
if line.strip().startswith("git_full ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["full"] = mo.group(1)
if line.strip().startswith("git_date ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["date"] = mo.group(1)
except OSError:
pass
return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(
keywords: Dict[str, str],
tag_prefix: str,
verbose: bool,
) -> Dict[str, Any]:
"""Get version information from git keywords."""
if "refnames" not in keywords:
raise NotThisMethod("Short version file found")
date = keywords.get("date")
if date is not None:
# Use only the last line. Previous lines may contain GPG signature
# information.
date = date.splitlines()[-1]
# git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant
# datestamp. However we prefer "%%ci" (which expands to an "ISO-8601
# -like" string, which we must then edit to make compliant), because
# it's been around since git-1.5.3, and it's too difficult to
# discover which version we're using, or to work around using an
# older one.
date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
refnames = keywords["refnames"].strip()
if refnames.startswith("$Format"):
if verbose:
print("keywords are unexpanded, not using")
raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
refs = {r.strip() for r in refnames.strip("()").split(",")}
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
tags = {r[len(TAG):] for r in refs if r.startswith(TAG)}
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %%d
# expansion behaves like git log --decorate=short and strips out the
# refs/heads/ and refs/tags/ prefixes that would let us distinguish
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
tags = {r for r in refs if re.search(r'\d', r)}
if verbose:
print("discarding '%%s', no digits" %% ",".join(refs - tags))
if verbose:
print("likely tags: %%s" %% ",".join(sorted(tags)))
for ref in sorted(tags):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
r = ref[len(tag_prefix):]
# Filter out refs that exactly match prefix or that don't start
# with a number once the prefix is stripped (mostly a concern
# when prefix is '')
if not re.match(r'\d', r):
continue
if verbose:
print("picking %%s" %% r)
return {"version": r,
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": None,
"date": date}
# no suitable tags, so version is "0+unknown", but full hex is still there
if verbose:
print("no suitable tags, using unknown + full revision id")
return {"version": "0+unknown",
"full-revisionid": keywords["full"].strip(),
"dirty": False, "error": "no suitable tags", "date": None}
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(
tag_prefix: str,
root: str,
verbose: bool,
runner: Callable = run_command
) -> Dict[str, Any]:
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
expanded, and _version.py hasn't already been rewritten with a short
version string, meaning we're inside a checked out source tree.
"""
GITS = ["git"]
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
# GIT_DIR can interfere with correct operation of Versioneer.
# It may be intended to be passed to the Versioneer-versioned project,
# but that should not change where we get our version from.
env = os.environ.copy()
env.pop("GIT_DIR", None)
runner = functools.partial(runner, env=env)
_, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root,
hide_stderr=not verbose)
if rc != 0:
if verbose:
print("Directory %%s not under git control" %% root)
raise NotThisMethod("'git rev-parse --git-dir' returned error")
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
describe_out, rc = runner(GITS, [
"describe", "--tags", "--dirty", "--always", "--long",
"--match", f"{tag_prefix}[[:digit:]]*"
], cwd=root)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
describe_out = describe_out.strip()
full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
if full_out is None:
raise NotThisMethod("'git rev-parse' failed")
full_out = full_out.strip()
pieces: Dict[str, Any] = {}
pieces["long"] = full_out
pieces["short"] = full_out[:7] # maybe improved later
pieces["error"] = None
branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"],
cwd=root)
# --abbrev-ref was added in git-1.6.3
if rc != 0 or branch_name is None:
raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
branch_name = branch_name.strip()
if branch_name == "HEAD":
# If we aren't exactly on a branch, pick a branch which represents
# the current commit. If all else fails, we are on a branchless
# commit.
branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
# --contains was added in git-1.5.4
if rc != 0 or branches is None:
raise NotThisMethod("'git branch --contains' returned error")
branches = branches.split("\n")
# Remove the first line if we're running detached
if "(" in branches[0]:
branches.pop(0)
# Strip off the leading "* " from the list of branches.
branches = [branch[2:] for branch in branches]
if "master" in branches:
branch_name = "master"
elif not branches:
branch_name = None
else:
# Pick the first branch that is returned. Good or bad.
branch_name = branches[0]
pieces["branch"] = branch_name
# parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
# TAG might have hyphens.
git_describe = describe_out
# look for -dirty suffix
dirty = git_describe.endswith("-dirty")
pieces["dirty"] = dirty
if dirty:
git_describe = git_describe[:git_describe.rindex("-dirty")]
# now we have TAG-NUM-gHEX or HEX
if "-" in git_describe:
# TAG-NUM-gHEX
mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe)
if not mo:
# unparsable. Maybe git-describe is misbehaving?
pieces["error"] = ("unable to parse git-describe output: '%%s'"
%% describe_out)
return pieces
# tag
full_tag = mo.group(1)
if not full_tag.startswith(tag_prefix):
if verbose:
fmt = "tag '%%s' doesn't start with prefix '%%s'"
print(fmt %% (full_tag, tag_prefix))
pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'"
%% (full_tag, tag_prefix))
return pieces
pieces["closest-tag"] = full_tag[len(tag_prefix):]
# distance: number of commits since tag
pieces["distance"] = int(mo.group(2))
# commit: short hex revision ID
pieces["short"] = mo.group(3)
else:
# HEX: no tags
pieces["closest-tag"] = None
out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
pieces["distance"] = len(out.split()) # total number of commits
# commit date: see ISO-8601 comment in git_versions_from_keywords()
date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip()
# Use only the last line. Previous lines may contain GPG signature
# information.
date = date.splitlines()[-1]
pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
return pieces
def plus_or_dot(pieces: Dict[str, Any]) -> str:
"""Return a + if we don't already have one, else return a ."""
if "+" in pieces.get("closest-tag", ""):
return "."
return "+"
def render_pep440(pieces: Dict[str, Any]) -> str:
"""Build up version string, with post-release "local version identifier".
Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty
Exceptions:
1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += plus_or_dot(pieces)
rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"],
pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
def render_pep440_branch(pieces: Dict[str, Any]) -> str:
"""TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .
The ".dev0" means not master branch. Note that .dev0 sorts backwards
(a feature branch will appear "older" than the master branch).
Exceptions:
1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0"
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += "+untagged.%%d.g%%s" %% (pieces["distance"],
pieces["short"])
if pieces["dirty"]:
rendered += ".dirty"
return rendered
def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
"""Split pep440 version string at the post-release segment.
Returns the release segments before the post-release and the
post-release version number (or -1 if no post-release segment is present).
"""
vc = str.split(ver, ".post")
return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
def render_pep440_pre(pieces: Dict[str, Any]) -> str:
"""TAG[.postN.devDISTANCE] -- No -dirty.
Exceptions:
1: no tags. 0.post0.devDISTANCE
"""
if pieces["closest-tag"]:
if pieces["distance"]:
# update the post release segment
tag_version, post_version = pep440_split_post(pieces["closest-tag"])
rendered = tag_version
if post_version is not None:
rendered += ".post%%d.dev%%d" %% (post_version + 1, pieces["distance"])
else:
rendered += ".post0.dev%%d" %% (pieces["distance"])
else:
# no commits, use the tag as the version
rendered = pieces["closest-tag"]
else:
# exception #1
rendered = "0.post0.dev%%d" %% pieces["distance"]
return rendered
def render_pep440_post(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX] .
The ".dev0" means dirty. Note that .dev0 sorts backwards
(a dirty tree will appear "older" than the corresponding clean one),
but you shouldn't be releasing software with -dirty anyways.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%%d" %% pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "g%%s" %% pieces["short"]
else:
# exception #1
rendered = "0.post%%d" %% pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
rendered += "+g%%s" %% pieces["short"]
return rendered
def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .
The ".dev0" means not master branch.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%%d" %% pieces["distance"]
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += plus_or_dot(pieces)
rendered += "g%%s" %% pieces["short"]
if pieces["dirty"]:
rendered += ".dirty"
else:
# exception #1
rendered = "0.post%%d" %% pieces["distance"]
if pieces["branch"] != "master":
rendered += ".dev0"
rendered += "+g%%s" %% pieces["short"]
if pieces["dirty"]:
rendered += ".dirty"
return rendered
def render_pep440_old(pieces: Dict[str, Any]) -> str:
"""TAG[.postDISTANCE[.dev0]] .
The ".dev0" means dirty.
Exceptions:
1: no tags. 0.postDISTANCE[.dev0]
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"] or pieces["dirty"]:
rendered += ".post%%d" %% pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
else:
# exception #1
rendered = "0.post%%d" %% pieces["distance"]
if pieces["dirty"]:
rendered += ".dev0"
return rendered
def render_git_describe(pieces: Dict[str, Any]) -> str:
"""TAG[-DISTANCE-gHEX][-dirty].
Like 'git describe --tags --dirty --always'.
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
if pieces["distance"]:
rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
else:
# exception #1
rendered = pieces["short"]
if pieces["dirty"]:
rendered += "-dirty"
return rendered
def render_git_describe_long(pieces: Dict[str, Any]) -> str:
"""TAG-DISTANCE-gHEX[-dirty].
Like 'git describe --tags --dirty --always -long'.
The distance/hash is unconditional.
Exceptions:
1: no tags. HEX[-dirty] (note: no 'g' prefix)
"""
if pieces["closest-tag"]:
rendered = pieces["closest-tag"]
rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"])
else:
# exception #1
rendered = pieces["short"]
if pieces["dirty"]:
rendered += "-dirty"
return rendered
def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
"""Render the given version pieces into the requested style."""
if pieces["error"]:
return {"version": "unknown",
"full-revisionid": pieces.get("long"),
"dirty": None,
"error": pieces["error"],
"date": None}
if not style or style == "default":
style = "pep440" # the default
if style == "pep440":
rendered = render_pep440(pieces)
elif style == "pep440-branch":
rendered = render_pep440_branch(pieces)
elif style == "pep440-pre":
rendered = render_pep440_pre(pieces)
elif style == "pep440-post":
rendered = render_pep440_post(pieces)
elif style == "pep440-post-branch":
rendered = render_pep440_post_branch(pieces)
elif style == "pep440-old":
rendered = render_pep440_old(pieces)
elif style == "git-describe":
rendered = render_git_describe(pieces)
elif style == "git-describe-long":
rendered = render_git_describe_long(pieces)
else:
raise ValueError("unknown style '%%s'" %% style)
return {"version": rendered, "full-revisionid": pieces["long"],
"dirty": pieces["dirty"], "error": None,
"date": pieces.get("date")}
def get_versions() -> Dict[str, Any]:
"""Get version information or return default if unable to do so."""
# I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have
# __file__, we can work backwards from there to the root. Some
# py2exe/bbfreeze/non-CPython implementations don't do __file__, in which
# case we can only use expanded keywords.
cfg = get_config()
verbose = cfg.verbose
try:
return git_versions_from_keywords(get_keywords(), cfg.tag_prefix,
verbose)
except NotThisMethod:
pass
try:
root = os.path.realpath(__file__)
# versionfile_source is the relative path from the top of the source
# tree (where the .git directory might live) to this file. Invert
# this to find the root from __file__.
for _ in cfg.versionfile_source.split('/'):
root = os.path.dirname(root)
except NameError:
return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to find root of source tree",
"date": None}
try:
pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose)
return render(pieces, cfg.style)
except NotThisMethod:
pass
try:
if cfg.parentdir_prefix:
return versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
except NotThisMethod:
pass
return {"version": "0+unknown", "full-revisionid": None,
"dirty": None,
"error": "unable to compute version", "date": None}
'''
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs: str) -> Dict[str, str]:
"""Extract version information from the given file."""
# the code embedded in _version.py can just fetch the value of these
# keywords. When used from setup.py, we don't want to import _version.py,
# so we do it with a regexp instead. This function is not used from
# _version.py.
keywords: Dict[str, str] = {}
try:
with open(versionfile_abs, "r") as fobj:
for line in fobj:
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["refnames"] = mo.group(1)
if line.strip().startswith("git_full ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["full"] = mo.group(1)
if line.strip().startswith("git_date ="):
mo = re.search(r'=\s*"(.*)"', line)
if mo:
keywords["date"] = mo.group(1)
except OSError:
pass
return keywords
@register_vcs_handler("git", "keywords")
def git_versions_from_keywords(
keywords: Dict[str, str],
tag_prefix: str,
verbose: bool,
) -> Dict[str, Any]:
"""Get version information from git keywords."""
if "refnames" not in keywords:
raise NotThisMethod("Short version file found")
date = keywords.get("date")
if date is not None:
# Use only the last line. Previous lines may contain GPG signature
# information.
date = date.splitlines()[-1]
# git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant
# datestamp. However we prefer "%ci" (which expands to an "ISO-8601
# -like" string, which we must then edit to make compliant), because
# it's been around since git-1.5.3, and it's too difficult to
# discover which version we're using, or to work around using an
# older one.
date = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
refnames = keywords["refnames"].strip()
if refnames.startswith("$Format"):
if verbose:
print("keywords are unexpanded, not using")
raise NotThisMethod("unexpanded keywords, not a git-archive tarball")
refs = {r.strip() for r in refnames.strip("()").split(",")}
# starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
# just "foo-1.0". If we see a "tag: " prefix, prefer those.
TAG = "tag: "
tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)}
if not tags:
# Either we're using git < 1.8.3, or there really are no tags. We use
# a heuristic: assume all version tags have a digit. The old git %d
# expansion behaves like git log --decorate=short and strips out the
# refs/heads/ and refs/tags/ prefixes that would let us distinguish
# between branches and tags. By ignoring refnames without digits, we
# filter out many common branch names like "release" and
# "stabilization", as well as "HEAD" and "master".
tags = {r for r in refs if re.search(r"\d", r)}
if verbose:
print("discarding '%s', no digits" % ",".join(refs - tags))
if verbose:
print("likely tags: %s" % ",".join(sorted(tags)))
for ref in sorted(tags):
# sorting will prefer e.g. "2.0" over "2.0rc1"
if ref.startswith(tag_prefix):
r = ref[len(tag_prefix) :]
# Filter out refs that exactly match prefix or that don't start
# with a number once the prefix is stripped (mostly a concern
# when prefix is '')
if not re.match(r"\d", r):
continue
if verbose:
print("picking %s" % r)
return {
"version": r,
"full-revisionid": keywords["full"].strip(),
"dirty": False,
"error": None,
"date": date,
}
# no suitable tags, so version is "0+unknown", but full hex is still there
if verbose:
print("no suitable tags, using unknown + full revision id")
return {
"version": "0+unknown",
"full-revisionid": keywords["full"].strip(),
"dirty": False,
"error": "no suitable tags",
"date": None,
}
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(
tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command
) -> Dict[str, Any]:
"""Get version from 'git describe' in the root of the source tree.
This only gets called if the git-archive 'subst' keywords were *not*
expanded, and _version.py hasn't already been rewritten with a short
version string, meaning we're inside a checked out source tree.
"""
GITS = ["git"]
if sys.platform == "win32":
GITS = ["git.cmd", "git.exe"]
# GIT_DIR can interfere with correct operation of Versioneer.
# It may be intended to be passed to the Versioneer-versioned project,
# but that should not change where we get our version from.
env = os.environ.copy()
env.pop("GIT_DIR", None)
runner = functools.partial(runner, env=env)
_, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose)
if rc != 0:
if verbose:
print("Directory %s not under git control" % root)
raise NotThisMethod("'git rev-parse --git-dir' returned error")
# if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
# if there isn't one, this yields HEX[-dirty] (no NUM)
describe_out, rc = runner(
GITS,
[
"describe",
"--tags",
"--dirty",
"--always",
"--long",
"--match",
f"{tag_prefix}[[:digit:]]*",
],
cwd=root,
)
# --long was added in git-1.5.5
if describe_out is None:
raise NotThisMethod("'git describe' failed")
describe_out = describe_out.strip()
full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
if full_out is None:
raise NotThisMethod("'git rev-parse' failed")
full_out = full_out.strip()
pieces: Dict[str, Any] = {}
pieces["long"] = full_out
pieces["short"] = full_out[:7] # maybe improved later
pieces["error"] = None
branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root)
# --abbrev-ref was added in git-1.6.3
if rc != 0 or branch_name is None:
raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
branch_name = branch_name.strip()
if branch_name == "HEAD":
# If we aren't exactly on a branch, pick a branch which represents
# the current commit. If all else fails, we are on a branchless
# commit.
branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
# --contains was added in git-1.5.4
if rc != 0 or branches is None:
raise NotThisMethod("'git branch --contains' returned error")
branches = branches.split("\n")
# Remove the first line if we're running detached
if "(" in branches[0]:
branches.pop(0)
# Strip off the leading "* " from the list of branches.
branches = [branch[2:] for branch in branches]
if "master" in branches:
branch_name = "master"
elif not branches:
branch_name = None
else:
# Pick the first branch that is returned. Good or bad.
branch_name = branches[0]
pieces["branch"] = branch_name
# parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
# TAG might have hyphens.
git_describe = describe_out
# look for -dirty suffix
dirty = git_describe.endswith("-dirty")
pieces["dirty"] = dirty
if dirty:
git_describe = git_describe[: git_describe.rindex("-dirty")]
# now we have TAG-NUM-gHEX or HEX
if "-" in git_describe:
# TAG-NUM-gHEX
mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
if not mo:
# unparsable. Maybe git-describe is misbehaving?
pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
return pieces
# tag
full_tag = mo.group(1)
if not full_tag.startswith(tag_prefix):
if verbose:
fmt = "tag '%s' doesn't start with prefix '%s'"
print(fmt % (full_tag, tag_prefix))
pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
full_tag,
tag_prefix,
)
return pieces
pieces["closest-tag"] = full_tag[len(tag_prefix) :]
# distance: number of commits since tag
pieces["distance"] = int(mo.group(2))
# commit: short hex revision ID
pieces["short"] = mo.group(3)
else:
# HEX: no tags
pieces["closest-tag"] = None
out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root)
pieces["distance"] = len(out.split()) # total number of commits
# commit date: see ISO-8601 comment in git_versions_from_keywords()
date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip()
# Use only the last line. Previous lines may contain GPG signature
# information.
date = date.splitlines()[-1]
pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
return pieces
def do_vcs_install(versionfile_source: str, ipy: Optional[str]) -> None:
    """Git-specific installation logic for Versioneer.
    For Git, this means creating/changing .gitattributes to mark _version.py
    for export-subst keyword substitution.
    `versionfile_source` (and `ipy`, if given) are staged with `git add`
    along with versioneer.py itself and any .gitattributes we touch.
    """
    GITS = ["git"]
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
    files = [versionfile_source]
    if ipy:
        files.append(ipy)
    # NOTE(review): VERSIONEER_PEP518 is presumably injected into globals()
    # during PEP 518 builds, where versioneer.py isn't part of the project
    # tree and so shouldn't be staged — confirm against the full file.
    if "VERSIONEER_PEP518" not in globals():
        try:
            my_path = __file__
            if my_path.endswith((".pyc", ".pyo")):
                # Map a compiled-module path back to its .py source file.
                my_path = os.path.splitext(my_path)[0] + ".py"
            versioneer_file = os.path.relpath(my_path)
        except NameError:
            # Some frozen/embedded interpreters don't define __file__.
            versioneer_file = "versioneer.py"
        files.append(versioneer_file)
    # Check whether .gitattributes already marks the version file export-subst.
    present = False
    try:
        with open(".gitattributes", "r") as fobj:
            for line in fobj:
                if line.strip().startswith(versionfile_source):
                    if "export-subst" in line.strip().split()[1:]:
                        present = True
                        break
    except OSError:
        # No .gitattributes yet; it will be created below.
        pass
    if not present:
        with open(".gitattributes", "a+") as fobj:
            fobj.write(f"{versionfile_source} export-subst\n")
        files.append(".gitattributes")
    run_command(GITS, ["add", "--"] + files)
def versions_from_parentdir(
    parentdir_prefix: str,
    root: str,
    verbose: bool,
) -> Dict[str, Any]:
    """Try to determine the version from the parent directory name.

    Source tarballs conventionally unpack into a directory that includes both
    the project name and a version string. We will also support searching up
    two directory levels for an appropriately named parent directory
    """
    tried = []
    candidate = root
    # Examine the directory itself, then walk up to two parents.
    for _ in range(3):
        basename = os.path.basename(candidate)
        if basename.startswith(parentdir_prefix):
            return {
                "version": basename[len(parentdir_prefix):],
                "full-revisionid": None,
                "dirty": False,
                "error": None,
                "date": None,
            }
        tried.append(candidate)
        candidate = os.path.dirname(candidate)  # up a level
    if verbose:
        print(
            "Tried directories %s but none started with prefix %s"
            % (str(tried), parentdir_prefix)
        )
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")
SHORT_VERSION_PY = """
# This file was generated by 'versioneer.py' (0.29) from
# revision-control system data, or from the parent directory name of an
# unpacked source archive. Distribution tarballs contain a pre-generated copy
# of this file.
import json
version_json = '''
%s
''' # END VERSION_JSON
def get_versions():
return json.loads(version_json)
"""
def versions_from_file(filename: str) -> Dict[str, Any]:
    """Try to determine the version from _version.py if present."""
    try:
        with open(filename) as f:
            contents = f.read()
    except OSError:
        raise NotThisMethod("unable to read _version.py")
    # The file may have been written with Unix or Windows line endings;
    # try both after the opening triple-quote.
    for newline in ("\n", "\r\n"):
        mo = re.search(
            "version_json = '''" + newline + r"(.*)''' # END VERSION_JSON",
            contents,
            re.M | re.S,
        )
        if mo:
            return json.loads(mo.group(1))
    raise NotThisMethod("no version_json in _version.py")
def write_to_version_file(filename: str, versions: Dict[str, Any]) -> None:
    """Write the given version number to the given _version.py file."""
    # Stable key order and fixed separators so regenerated files diff cleanly.
    payload = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": "))
    with open(filename, "w") as fobj:
        fobj.write(SHORT_VERSION_PY % payload)
    print("set %s to '%s'" % (filename, versions["version"]))
def plus_or_dot(pieces: Dict[str, Any]) -> str:
    """Return a + if we don't already have one, else return a .

    ``pieces["closest-tag"]`` may be missing *or* explicitly ``None`` (the
    no-tags case); both mean no local-version segment has been started yet,
    so the separator that starts one is "+".
    """
    # `or ""` guards the explicit-None case, which `.get(..., "")` alone
    # does not catch (`"+" in None` would raise TypeError).
    if "+" in (pieces.get("closest-tag") or ""):
        return "."
    return "+"
def render_pep440(pieces: Dict[str, Any]) -> str:
    """Build up version string, with post-release "local version identifier".

    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty

    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1: never tagged -> synthesize a "0+untagged" version
        parts = ["0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])]
        if pieces["dirty"]:
            parts.append(".dirty")
        return "".join(parts)
    parts = [tag]
    if pieces["distance"] or pieces["dirty"]:
        parts.append(plus_or_dot(pieces))
        parts.append("%d.g%s" % (pieces["distance"], pieces["short"]))
        if pieces["dirty"]:
            parts.append(".dirty")
    return "".join(parts)
def render_pep440_branch(pieces: Dict[str, Any]) -> str:
    """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .

    The ".dev0" means not master branch. Note that .dev0 sorts backwards
    (a feature branch will appear "older" than the master branch).

    Exceptions:
    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
    """
    on_master = pieces["branch"] == "master"
    dirty_suffix = ".dirty" if pieces["dirty"] else ""
    tag = pieces["closest-tag"]
    if tag:
        if not (pieces["distance"] or pieces["dirty"]):
            # Sitting exactly on a clean tag: the tag alone is the version.
            return tag
        rendered = tag
        if not on_master:
            rendered += ".dev0"
        rendered += plus_or_dot(pieces)
        rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
        return rendered + dirty_suffix
    # exception #1: never tagged
    rendered = "0" if on_master else "0.dev0"
    rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
    return rendered + dirty_suffix
def pep440_split_post(ver: str) -> Tuple[str, Optional[int]]:
    """Split pep440 version string at the post-release segment.

    Returns the release segments before the post-release and the
    post-release version number, or ``None`` when no post-release segment
    is present (a bare ``.post`` suffix yields 0).  The original docstring
    claimed -1 for the missing case, which did not match the code.
    """
    # Use the instance method rather than the unbound `str.split` form.
    vc = ver.split(".post")
    return vc[0], int(vc[1] or 0) if len(vc) == 2 else None
def render_pep440_pre(pieces: Dict[str, Any]) -> str:
    """TAG[.postN.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post0.devDISTANCE
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1
        return "0.post0.dev%d" % pieces["distance"]
    if not pieces["distance"]:
        # No commits past the tag: the tag itself is the version.
        return tag
    # Bump (or start) the post-release segment and record the distance.
    base, post = pep440_split_post(tag)
    if post is None:
        return base + ".post0.dev%d" % pieces["distance"]
    return base + ".post%d.dev%d" % (post + 1, pieces["distance"])
def render_pep440_post(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if tag:
        if not (pieces["distance"] or pieces["dirty"]):
            return tag
        parts = [tag, ".post%d" % pieces["distance"]]
        if pieces["dirty"]:
            parts.append(".dev0")
        parts.append(plus_or_dot(pieces))
        parts.append("g%s" % pieces["short"])
    else:
        # exception #1
        parts = ["0.post%d" % pieces["distance"]]
        if pieces["dirty"]:
            parts.append(".dev0")
        parts.append("+g%s" % pieces["short"])
    return "".join(parts)
def render_pep440_post_branch(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .

    The ".dev0" means not master branch.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
    """
    tag = pieces["closest-tag"]
    dirty_suffix = ".dirty" if pieces["dirty"] else ""
    if tag:
        if not (pieces["distance"] or pieces["dirty"]):
            # Exactly on a clean tag: the tag alone is the version.
            return tag
        rendered = tag + ".post%d" % pieces["distance"]
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += plus_or_dot(pieces) + "g%s" % pieces["short"]
        return rendered + dirty_suffix
    # exception #1: never tagged
    rendered = "0.post%d" % pieces["distance"]
    if pieces["branch"] != "master":
        rendered += ".dev0"
    return rendered + "+g%s" % pieces["short"] + dirty_suffix
def render_pep440_old(pieces: Dict[str, Any]) -> str:
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    tag = pieces["closest-tag"]
    if tag and not (pieces["distance"] or pieces["dirty"]):
        # Exactly on a clean tag: nothing to append.
        return tag
    rendered = (tag if tag else "0") + ".post%d" % pieces["distance"]
    if pieces["dirty"]:
        rendered += ".dev0"
    return rendered
def render_git_describe(pieces: Dict[str, Any]) -> str:
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if not tag:
        # exception #1: fall back to the bare short hash
        rendered = pieces["short"]
    elif pieces["distance"]:
        rendered = "%s-%d-g%s" % (tag, pieces["distance"], pieces["short"])
    else:
        rendered = tag
    return rendered + ("-dirty" if pieces["dirty"] else "")
def render_git_describe_long(pieces: Dict[str, Any]) -> str:
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always -long'.
    The distance/hash is unconditional.

    Exceptions:
    1: no tags. HEX[-dirty] (note: no 'g' prefix)
    """
    tag = pieces["closest-tag"]
    if tag:
        rendered = "%s-%d-g%s" % (tag, pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    return rendered + ("-dirty" if pieces["dirty"] else "")
def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]:
    """Render the given version pieces into the requested style."""
    if pieces["error"]:
        # An upstream step already failed; report that instead of a version.
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": pieces["error"],
            "date": None,
        }
    if not style or style == "default":
        style = "pep440"  # the default
    # Dispatch table mapping style names to renderer functions.
    renderers = {
        "pep440": render_pep440,
        "pep440-branch": render_pep440_branch,
        "pep440-pre": render_pep440_pre,
        "pep440-post": render_pep440_post,
        "pep440-post-branch": render_pep440_post_branch,
        "pep440-old": render_pep440_old,
        "git-describe": render_git_describe,
        "git-describe-long": render_git_describe_long,
    }
    if style not in renderers:
        raise ValueError("unknown style '%s'" % style)
    return {
        "version": renderers[style](pieces),
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }
# NOTE(review): presumably raised by the root-discovery logic (get_root,
# defined elsewhere in this file) when the expected project layout is
# missing — confirm against the full file.
class VersioneerBadRootError(Exception):
    """The project root directory is unknown or missing key files."""
def get_versions(verbose: bool = False) -> Dict[str, Any]:
    """Get the project version from whatever source is available.
    Tries, in order: expanded VCS keywords, a previously written
    _version.py, the VCS itself, and finally the parent directory name.
    Returns a dict with keys 'version', 'full-revisionid', 'dirty',
    'error' and 'date'.
    """
    if "versioneer" in sys.modules:
        # see the discussion in cmdclass.py:get_cmdclass()
        del sys.modules["versioneer"]
    root = get_root()
    cfg = get_config_from_root(root)
    assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
    handlers = HANDLERS.get(cfg.VCS)
    assert handlers, "unrecognized VCS '%s'" % cfg.VCS
    verbose = verbose or bool(cfg.verbose)  # `bool()` used to avoid `None`
    assert (
        cfg.versionfile_source is not None
    ), "please set versioneer.versionfile_source"
    assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"
    versionfile_abs = os.path.join(root, cfg.versionfile_source)
    # extract version from first of: _version.py, VCS command (e.g. 'git
    # describe'), parentdir. This is meant to work for developers using a
    # source checkout, for users of a tarball created by 'setup.py sdist',
    # and for users of a tarball/zipball created by 'git archive' or github's
    # download-from-tag feature or the equivalent in other VCSes.
    get_keywords_f = handlers.get("get_keywords")
    from_keywords_f = handlers.get("keywords")
    if get_keywords_f and from_keywords_f:
        try:
            keywords = get_keywords_f(versionfile_abs)
            ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
            if verbose:
                print("got version from expanded keyword %s" % ver)
            return ver
        except NotThisMethod:
            # Keywords absent or unexpanded; fall through to the next method.
            pass
    try:
        ver = versions_from_file(versionfile_abs)
        if verbose:
            print("got version from file %s %s" % (versionfile_abs, ver))
        return ver
    except NotThisMethod:
        pass
    from_vcs_f = handlers.get("pieces_from_vcs")
    if from_vcs_f:
        try:
            pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
            ver = render(pieces, cfg.style)
            if verbose:
                print("got version from VCS %s" % ver)
            return ver
        except NotThisMethod:
            pass
    try:
        if cfg.parentdir_prefix:
            ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
            if verbose:
                print("got version from parentdir %s" % ver)
            return ver
    except NotThisMethod:
        pass
    if verbose:
        print("unable to compute version")
    # Every strategy failed: report a sentinel version rather than raising.
    return {
        "version": "0+unknown",
        "full-revisionid": None,
        "dirty": None,
        "error": "unable to compute version",
        "date": None,
    }
def get_version() -> str:
    """Get the short version string for this project."""
    versions = get_versions()
    return versions["version"]
def get_cmdclass(cmdclass: Optional[Dict[str, Any]] = None):
"""Get the custom setuptools subclasses used by Versioneer.
If the package uses a different cmdclass (e.g. one from numpy), it
should be provide as an argument.
"""
if "versioneer" in sys.modules:
del sys.modules["versioneer"]
# this fixes the "python setup.py develop" case (also 'install' and
# 'easy_install .'), in which subdependencies of the main project are
# built (using setup.py bdist_egg) in the same python process. Assume
# a main project A and a dependency B, which use different versions
# of Versioneer. A's setup.py imports A's Versioneer, leaving it in
# sys.modules by the time B's setup.py is executed, causing B to run
# with the wrong versioneer. Setuptools wraps the sub-dep builds in a
# sandbox that restores sys.modules to it's pre-build state, so the
# parent is protected against the child's "import versioneer". By
# removing ourselves from sys.modules here, before the child build
# happens, we protect the child from the parent's versioneer too.
# Also see https://github.com/python-versioneer/python-versioneer/issues/52
cmds = {} if cmdclass is None else cmdclass.copy()
# we add "version" to setuptools
from setuptools import Command
class cmd_version(Command):
description = "report generated version string"
user_options: List[Tuple[str, str, str]] = []
boolean_options: List[str] = []
def initialize_options(self) -> None:
pass
def finalize_options(self) -> None:
pass
def run(self) -> None:
vers = get_versions(verbose=True)
print("Version: %s" % vers["version"])
print(" full-revisionid: %s" % vers.get("full-revisionid"))
print(" dirty: %s" % vers.get("dirty"))
print(" date: %s" % vers.get("date"))
if vers["error"]:
print(" error: %s" % vers["error"])
cmds["version"] = cmd_version
# we override "build_py" in setuptools
#
# most invocation pathways end up running build_py:
# distutils/build -> build_py
# distutils/install -> distutils/build ->..
# setuptools/bdist_wheel -> distutils/install ->..
# setuptools/bdist_egg -> distutils/install_lib -> build_py
# setuptools/install -> bdist_egg ->..
# setuptools/develop -> ?
# pip install:
# copies source tree to a tempdir before running egg_info/etc
# if .git isn't copied too, 'git describe' will fail
# then does setup.py bdist_wheel, or sometimes setup.py install
# setup.py egg_info -> ?
# pip install -e . and setuptool/editable_wheel will invoke build_py
# but the build_py command is not expected to copy any files.
# we override different "build_py" commands for both environments
if "build_py" in cmds:
_build_py: Any = cmds["build_py"]
else:
from setuptools.command.build_py import build_py as _build_py
class cmd_build_py(_build_py):
def run(self) -> None:
root = get_root()
cfg = get_config_from_root(root)
versions = get_versions()
_build_py.run(self)
if getattr(self, "editable_mode", False):
# During editable installs `.py` and data files are
# not copied to build_lib
return
# now locate _version.py in the new build/ directory and replace
# it with an updated value
if cfg.versionfile_build:
target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
print("UPDATING %s" % target_versionfile)
write_to_version_file(target_versionfile, versions)
cmds["build_py"] = cmd_build_py
if "build_ext" in cmds:
_build_ext: Any = cmds["build_ext"]
else:
from setuptools.command.build_ext import build_ext as _build_ext
class cmd_build_ext(_build_ext):
def run(self) -> None:
root = get_root()
cfg = get_config_from_root(root)
versions = get_versions()
_build_ext.run(self)
if self.inplace:
# build_ext --inplace will only build extensions in
# build/lib<..> dir with no _version.py to write to.
# As in place builds will already have a _version.py
# in the module dir, we do not need to write one.
return
# now locate _version.py in the new build/ directory and replace
# it with an updated value
if not cfg.versionfile_build:
return
target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build)
if not os.path.exists(target_versionfile):
print(
f"Warning: {target_versionfile} does not exist, skipping "
"version update. This can happen if you are running build_ext "
"without first running build_py."
)
return
print("UPDATING %s" % target_versionfile)
write_to_version_file(target_versionfile, versions)
cmds["build_ext"] = cmd_build_ext
if "cx_Freeze" in sys.modules: # cx_freeze enabled?
from cx_Freeze.dist import build_exe as _build_exe # type: ignore
# nczeczulin reports that py2exe won't like the pep440-style string
# as FILEVERSION, but it can be used for PRODUCTVERSION, e.g.
# setup(console=[{
# "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION
# "product_version": versioneer.get_version(),
# ...
class cmd_build_exe(_build_exe):
    """cx_Freeze build_exe wrapper: temporarily hard-codes the computed
    version into the source tree's _version.py for freezing, then restores
    the VCS-aware template afterwards."""

    def run(self) -> None:
        root = get_root()
        cfg = get_config_from_root(root)
        versions = get_versions()
        target_versionfile = cfg.versionfile_source
        print("UPDATING %s" % target_versionfile)
        # The freeze reads straight from the source tree, so the resolved
        # version must be written there before building.
        write_to_version_file(target_versionfile, versions)
        _build_exe.run(self)
        os.unlink(target_versionfile)
        # Re-create the original template so the working tree is left
        # in its pre-build state.
        with open(cfg.versionfile_source, "w") as f:
            LONG = LONG_VERSION_PY[cfg.VCS]
            f.write(
                LONG
                % {
                    "DOLLAR": "$",
                    "STYLE": cfg.style,
                    "TAG_PREFIX": cfg.tag_prefix,
                    "PARENTDIR_PREFIX": cfg.parentdir_prefix,
                    "VERSIONFILE_SOURCE": cfg.versionfile_source,
                }
            )
cmds["build_exe"] = cmd_build_exe
del cmds["build_py"]
if "py2exe" in sys.modules: # py2exe enabled?
try:
from py2exe.setuptools_buildexe import py2exe as _py2exe # type: ignore
except ImportError:
from py2exe.distutils_buildexe import py2exe as _py2exe # type: ignore
class cmd_py2exe(_py2exe):
    """py2exe wrapper: temporarily hard-codes the computed version into the
    source tree's _version.py for the exe build, then restores the
    VCS-aware template afterwards."""

    def run(self) -> None:
        root = get_root()
        cfg = get_config_from_root(root)
        versions = get_versions()
        target_versionfile = cfg.versionfile_source
        print("UPDATING %s" % target_versionfile)
        # py2exe reads straight from the source tree, so the resolved
        # version must be written there before building.
        write_to_version_file(target_versionfile, versions)
        _py2exe.run(self)
        os.unlink(target_versionfile)
        # Re-create the original template so the working tree is left
        # in its pre-build state.
        with open(cfg.versionfile_source, "w") as f:
            LONG = LONG_VERSION_PY[cfg.VCS]
            f.write(
                LONG
                % {
                    "DOLLAR": "$",
                    "STYLE": cfg.style,
                    "TAG_PREFIX": cfg.tag_prefix,
                    "PARENTDIR_PREFIX": cfg.parentdir_prefix,
                    "VERSIONFILE_SOURCE": cfg.versionfile_source,
                }
            )
cmds["py2exe"] = cmd_py2exe
# sdist farms its file list building out to egg_info
if "egg_info" in cmds:
_egg_info: Any = cmds["egg_info"]
else:
from setuptools.command.egg_info import egg_info as _egg_info
class cmd_egg_info(_egg_info):
    """egg_info wrapper that forces versioneer.py and the configured
    version file into the generated SOURCES.txt manifest."""

    def find_sources(self) -> None:
        # egg_info.find_sources builds the manifest list and writes it
        # in one shot
        super().find_sources()
        # Modify the filelist and normalize it
        root = get_root()
        cfg = get_config_from_root(root)
        self.filelist.append("versioneer.py")
        if cfg.versionfile_source:
            # There are rare cases where versionfile_source might not be
            # included by default, so we must be explicit
            self.filelist.append(cfg.versionfile_source)
        self.filelist.sort()
        self.filelist.remove_duplicates()
        # The write method is hidden in the manifest_maker instance that
        # generated the filelist and was thrown away
        # We will instead replicate their final normalization (to unicode,
        # and POSIX-style paths)
        from setuptools import unicode_utils
        normalized = [
            unicode_utils.filesys_decode(f).replace(os.sep, "/")
            for f in self.filelist.files
        ]
        # Rewrite SOURCES.txt with the augmented, normalized list.
        manifest_filename = os.path.join(self.egg_info, "SOURCES.txt")
        with open(manifest_filename, "w") as fobj:
            fobj.write("\n".join(normalized))
cmds["egg_info"] = cmd_egg_info
# we override different "sdist" commands for both environments
if "sdist" in cmds:
_sdist: Any = cmds["sdist"]
else:
from setuptools.command.sdist import sdist as _sdist
class cmd_sdist(_sdist):
    """sdist wrapper that pins the computed version into the distribution
    metadata and into the _version.py copied into the release tree."""

    def run(self) -> None:
        versions = get_versions()
        # Stash for make_release_tree, which runs later in this command.
        self._versioneer_generated_versions = versions
        # unless we update this, the command will keep using the old
        # version
        self.distribution.metadata.version = versions["version"]
        return _sdist.run(self)

    def make_release_tree(self, base_dir: str, files: List[str]) -> None:
        root = get_root()
        cfg = get_config_from_root(root)
        _sdist.make_release_tree(self, base_dir, files)
        # now locate _version.py in the new base_dir directory
        # (remembering that it may be a hardlink) and replace it with an
        # updated value
        target_versionfile = os.path.join(base_dir, cfg.versionfile_source)
        print("UPDATING %s" % target_versionfile)
        write_to_version_file(
            target_versionfile, self._versioneer_generated_versions
        )
cmds["sdist"] = cmd_sdist
return cmds
CONFIG_ERROR = """
setup.cfg is missing the necessary Versioneer configuration. You need
a section like:
[versioneer]
VCS = git
style = pep440
versionfile_source = src/myproject/_version.py
versionfile_build = myproject/_version.py
tag_prefix =
parentdir_prefix = myproject-
You will also need to edit your setup.py to use the results:
import versioneer
setup(version=versioneer.get_version(),
cmdclass=versioneer.get_cmdclass(), ...)
Please read the docstring in ./versioneer.py for configuration instructions,
edit setup.cfg, and re-run the installer or 'python versioneer.py setup'.
"""
SAMPLE_CONFIG = """
# See the docstring in versioneer.py for instructions. Note that you must
# re-run 'versioneer.py setup' after changing this section, and commit the
# resulting files.
[versioneer]
#VCS = git
#style = pep440
#versionfile_source =
#versionfile_build =
#tag_prefix =
#parentdir_prefix =
"""
OLD_SNIPPET = """
from ._version import get_versions
__version__ = get_versions()['version']
del get_versions
"""
INIT_PY_SNIPPET = """
from . import {0}
__version__ = {0}.get_versions()['version']
"""
def do_setup() -> int:
    """Do main VCS-independent setup function for installing Versioneer.

    Creates the configured version file from the template, wires the
    package ``__init__.py`` boilerplate, and applies any VCS-specific
    setup.  Returns 0 on success, 1 when setup.cfg lacks a usable
    [versioneer] configuration.
    """
    root = get_root()
    try:
        cfg = get_config_from_root(root)
    except (OSError, configparser.NoSectionError, configparser.NoOptionError) as exc:
        if isinstance(exc, (OSError, configparser.NoSectionError)):
            # No config at all: seed setup.cfg with a commented sample.
            print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
            with open(os.path.join(root, "setup.cfg"), "a") as cfg_file:
                cfg_file.write(SAMPLE_CONFIG)
        print(CONFIG_ERROR, file=sys.stderr)
        return 1

    # Render the _version.py template with this project's settings.
    print(" creating %s" % cfg.versionfile_source)
    rendered = LONG_VERSION_PY[cfg.VCS] % {
        "DOLLAR": "$",
        "STYLE": cfg.style,
        "TAG_PREFIX": cfg.tag_prefix,
        "PARENTDIR_PREFIX": cfg.parentdir_prefix,
        "VERSIONFILE_SOURCE": cfg.versionfile_source,
    }
    with open(cfg.versionfile_source, "w") as version_file:
        version_file.write(rendered)

    # Install (or upgrade) the __version__ boilerplate in the package's
    # __init__.py, next to the version file.
    ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
    maybe_ipy: Optional[str] = ipy
    if not os.path.exists(ipy):
        print(" %s doesn't exist, ok" % ipy)
        maybe_ipy = None
    else:
        try:
            with open(ipy, "r") as init_file:
                existing = init_file.read()
        except OSError:
            existing = ""
        module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0]
        snippet = INIT_PY_SNIPPET.format(module)
        if OLD_SNIPPET in existing:
            # Upgrade the legacy boilerplate in place.
            print(" replacing boilerplate in %s" % ipy)
            with open(ipy, "w") as init_file:
                init_file.write(existing.replace(OLD_SNIPPET, snippet))
        elif snippet not in existing:
            print(" appending to %s" % ipy)
            with open(ipy, "a") as init_file:
                init_file.write(snippet)
        else:
            print(" %s unmodified" % ipy)

    # Make VCS-specific changes. For git, this means creating/changing
    # .gitattributes to mark _version.py for export-subst keyword
    # substitution.
    do_vcs_install(cfg.versionfile_source, maybe_ipy)
    return 0
def scan_setup_py() -> int:
    """Validate the contents of setup.py against Versioneer's expectations.

    Scans ./setup.py (in the current directory) for the required
    ``import versioneer``, ``versioneer.get_cmdclass()`` and
    ``versioneer.get_version()`` usages, and for obsolete
    ``versioneer.VCS = ...`` style setters that now belong in setup.cfg.
    Prints guidance for anything that looks wrong.

    Returns the number of problems found (0 when setup.py looks correct).
    Raises OSError if setup.py cannot be opened.
    """
    found = set()
    setters = False
    errors = 0
    with open("setup.py", "r") as f:
        # Stream line by line; no need to materialize readlines().
        for line in f:
            if "import versioneer" in line:
                found.add("import")
            if "versioneer.get_cmdclass()" in line:
                found.add("cmdclass")
            if "versioneer.get_version()" in line:
                found.add("get_version")
            # Old-style module-attribute configuration is obsolete.
            if "versioneer.VCS" in line:
                setters = True
            if "versioneer.versionfile_source" in line:
                setters = True
    if len(found) != 3:
        # At least one of import/cmdclass/get_version is missing.
        print("")
        print("Your setup.py appears to be missing some important items")
        print("(but I might be wrong). Please make sure it has something")
        print("roughly like the following:")
        print("")
        print(" import versioneer")
        print(" setup( version=versioneer.get_version(),")
        print(" cmdclass=versioneer.get_cmdclass(), ...)")
        print("")
        errors += 1
    if setters:
        print("You should remove lines like 'versioneer.VCS = ' and")
        print("'versioneer.versionfile_source = ' . This configuration")
        print("now lives in setup.cfg, and should be removed from setup.py")
        print("")
        errors += 1
    return errors
def setup_command() -> NoReturn:
    """Set up Versioneer and exit with appropriate error code.

    Runs do_setup() followed by scan_setup_py(); exits with status 1 if
    either reported problems, 0 otherwise.
    """
    problem_count = do_setup() + scan_setup_py()
    sys.exit(1 if problem_count else 0)
# Command-line interface: `python versioneer.py setup` installs the
# Versioneer boilerplate into the current project.
if __name__ == "__main__":
    # NOTE(review): bare sys.argv[1] raises IndexError when run with no
    # argument; any other argument is silently ignored.
    cmd = sys.argv[1]
    if cmd == "setup":
        setup_command()