.*)$')
def as_str(self):
return '{self.transport}::{self.RI}'.format(self=self)
def _split_colon(s, maxsplit=1):
"""Split on unescaped colon"""
return re.compile(r'(?^()[]{}$\'" '
# TODO: RF using re.sub
def escape_ssh_path(path):
"""Escape all special characters present in the path"""
for c in _SSH_ESCAPED_CHARACTERS:
if c in path:
path = path.replace(c, '\\' + c)
return path
def unescape_ssh_path(path):
"""Un-escape all special characters present in the path"""
for c in _SSH_ESCAPED_CHARACTERS[::-1]:
if c in path:
path = path.replace('\\' + c, c)
return path
def parse_url_opts(url):
"""Given a string with url-style query, split into content before # and options as dict"""
url = URL(url)
# we need to filter out query and fragment to get the base url
fields = url.fields
fields.pop('query')
fields.pop('fragment')
opts = url.query_dict
return str(URL(**fields)), opts
# TODO: should we just define URL.good_for_git or smth like that? ;)
# although git also understands regular paths
def is_url(ri):
"""Returns whether argument is a resource identifier what datalad should treat as a URL
This includes ssh "urls" which git understands.
Parameters
----------
ri : str or RI
The resource identifier (as a string or RI) to "analyze"
"""
if not isinstance(ri, RI):
try:
ri = RI(ri)
except: # MIH: MemoryError?
return False
return isinstance(ri, (URL, SSHRI))
# TODO: RF to remove duplication
def is_datalad_compat_ri(ri):
"""Returns whether argument is a resource identifier what datalad should treat as a URL
including its own DataLadRI
"""
if not isinstance(ri, RI):
try:
ri = RI(ri)
except: # MIH: MemoryError?
return False
return isinstance(ri, (URL, SSHRI, DataLadRI))
# TODO: better name? additionally may be move to SSHRI.is_valid() or sth.
def is_ssh(ri):
"""helper to determine, whether `ri` requires an SSH connection
Parameters
----------
ri: str or RI
Returns
-------
bool
"""
# not exactly fitting the doc, but we actually can deal not necessarily with
# string or RI only, but with everything RI itself can deal with:
_ri = RI(ri) if not isinstance(ri, RI) else ri
return isinstance(_ri, SSHRI) \
or (isinstance(_ri, URL) and _ri.scheme == 'ssh')
def get_local_file_url(fname: str,
compatibility: str = 'git-annex',
allow_relative_path: bool = True
) -> str:
"""Return OS specific URL pointing to a local file
Parameters
----------
fname : string
Filename. If not absolute, abspath is used
compatibility : str, optional
This parameter is only interpreted on Windows systems. If set to
anything else than 'git', the anchor, e.g. `C:` of `fname` will be put
into the `file-auth` part, i.e. network location, defined in RFC 8089.
This option is mainly used to support git-annex specific encoding of
Windows paths.
allow_relative_path: bool, optional
Allow `fname` to be a relative path. The path will be converted to an
absolute path, by using the current directory as path prefix.
"""
url_path = local_path2url_path(fname, allow_relative_path=allow_relative_path)
if on_windows and compatibility != "git":
# Work around the way in which git-annex interprets file URLs on
# Windows. This code path will put the path anchor, e.g. `C:` of `fname`
# into the network location component of the resulting URL.
return "file:/" + url_path
result = "file://" + url_path
return result
def get_url_cache_filename(url, name=None):
"""Return a filename where to cache online doc from a url"""
if not name:
name = "misc"
cache_dir = opj(cfg.obtain('datalad.locations.cache'), name)
doc_fname = opj(
cache_dir,
'{}-{}.p{}'.format(
urlsplit(url).netloc,
md5(url.encode('utf-8')).hexdigest(),
pickle.HIGHEST_PROTOCOL)
)
return doc_fname
def get_cached_url_content(url, name=None, fetcher=None, maxage=None):
"""Loader of a document from a url, which caches loaded instance on disk
Doesn't do anything smart about http headers etc which could provide
information for cache/proxy servers for how long to retain etc
TODO: theoretically it is not network specific at all -- and just a memoize
pattern, but may be some time we would make it treat headers etc correctly.
And ATM would support any URL we support via providers/downloaders
Parameters
----------
fetcher: callable, optional
Function to call with url if needed to be refetched
maxage: float, optional
Age in days to retain valid for. <0 - would retain forever. If None -
would consult the config, 0 - would force to reload
"""
doc_fname = get_url_cache_filename(url, name)
if maxage is None:
maxage = float(cfg.get('datalad.locations.cache-maxage'))
doc = None
if os.path.exists(doc_fname) and maxage != 0:
fage = (time.time() - os.stat(doc_fname).st_mtime)/(24. * 3600)
if maxage < 0 or fage < maxage:
try:
lgr.debug("use cached request result to '%s' from %s", url, doc_fname)
doc = pickle.load(open(doc_fname, 'rb'))
except Exception as e: # it is OK to ignore any error and fall back on the true source
lgr.warning(
"cannot load cache from '%s', fall back to download: %s",
doc_fname, CapturedException(e))
if doc is None:
if fetcher is None:
from datalad.downloaders.providers import Providers
providers = Providers.from_config_files()
fetcher = providers.fetch
doc = fetcher(url)
ensure_dir(dirname(doc_fname))
# use pickle to store the entire request result dict
pickle.dump(doc, open(doc_fname, 'wb'))
lgr.debug("stored result of request to '%s' in %s", url, doc_fname)
return doc
def download_url(url, dest=None, overwrite=False):
"""Download a file from a URL
Supports and honors any DataLad "downloader/provider" configuration.
Parameters
----------
url: str
Source URL to download from.
dest: Path-like or None
Destination file name (file must not exist), or name of a target
directory (must exists, and filename must be derivable from `url`).
If None, the downloaded content will be returned as a string.
overwrite: bool
Force overwriting an existing destination file.
Returns
-------
str
Path of the downloaded file, or URL content if `dest` is None.
Raises
------
DownloadError
If `dest` already exists and is a file, or if `dest` is a directory
and no filename could be determined from `url`, or if no file was
found at the given `url`.
"""
from datalad.downloaders.providers import Providers
providers = Providers.from_config_files()
if dest:
return providers.download(url, path=str(dest), overwrite=overwrite)
else:
return providers.fetch(url)
def local_path2url_path(local_path: str,
allow_relative_path: bool = False
) -> str:
"""Convert a local path into an URL path component"""
local_path = Path(local_path)
if not local_path.is_absolute() and allow_relative_path:
local_path = local_path.absolute()
url = urlparse(Path(local_path).as_uri())
if url.netloc:
raise ValueError(
f"cannot convert remote path to an URL path: {local_path}")
return url.path
def url_path2local_path(url_path: str | PurePosixPath) -> str | Path:
if isinstance(url_path, PurePosixPath):
return_path = True
url_path = str(url_path)
else:
return_path = False
if not url_path or not url_path.startswith("/"):
# We expect a 'path-absolute' as defined in RFC 3986, therefore the
# path must begin with a slash.
raise ValueError(
f"url path does not start with '/': {url_path}, and is therefore "
f"not an absolute-path as defined in RFC 8089")
if url_path.startswith("//"):
# We expect a 'path-absolute' as defined in RFC 3986, therefore the
# first segment must not be empty, i.e. the path must not start with
# two or more slashes.
raise ValueError(
f"url path has empty first segment: {url_path}, and is therefore "
f"not an absolute-path as defined in RFC 8089")
return (
Path(url2pathname(url_path))
if return_path
else url2pathname(url_path)
)
def quote_path(path: str, safe: str = "/") -> str:
"""quote the path component of a URL, takes OS specifics into account
On Windows-like system a path-prefix consisting of a slash, a single letter,
a colon, and a slash, i.e. '/c:/Windows', the colon will not be quoted.
All characters after the colon will be quoted by `urllib.parse.quote`.
On Unix-like systems the complete path component will be quoted by
'urllib.parse.quote'.
Parameters
----------
path: str
The path that should be quoted
safe: str (default '/')
Characters that should not be quoted, passed
on to the save-parameter of `urllib.parse.quote`.
Returns
-------
str
The quoted path component
"""
if on_windows:
if re.match("^/[a-zA-Z]:/", path):
return path[:3] + quote(path[3:], safe=safe)
return quote(path, safe=safe)
lgr.log(5, "Done importing support.network")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/parallel.py 0000644 0001751 0001751 00000060723 15137634221 020044 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Helpers for parallel execution
"""
__docformat__ = 'restructuredtext'
import concurrent.futures
import inspect
import logging
import sys
import time
import uuid
from collections import defaultdict
from queue import (
Empty,
Queue,
)
from threading import Thread
from datalad.support.exceptions import CapturedException
from ..log import log_progress
from ..utils import path_is_subpath
from . import ansi_colors as colors
lgr = logging.getLogger('datalad.parallel')
def _count_str(count, verb, omg=False):
if count:
msg = "{:d} {}".format(count, verb)
if omg:
msg = colors.color_word(msg, colors.RED)
return msg
#
# safe_to_consume helpers
#
def no_parentds_in_futures(futures, path, skip=tuple()):
"""Return True if no path in futures keys is parentds for provided path
Assumes that the future's key is the path.
Parameters
----------
skip: iterable
Do not consider futures with paths in skip. E.g. it could be top level
dataset which we know it exists already, and it is ok to start with child
process before it
"""
# TODO: OPT. Could benefit from smarter than linear time if not one at a time?
# or may be we should only go through active futures (still linear!)?
return all(not path_is_subpath(path, p) or p in skip for p in futures)
def no_subds_in_futures(futures, path, skip=tuple()):
"""Return True if no path in futures keys is a subdataset for provided path
See `no_parentds_in_futures` for more info
"""
return all(not path_is_subpath(p, path) or p in skip for p in futures)
class ProducerConsumer:
"""Producer/Consumer implementation to (possibly) parallelize execution.
It is an iterable providing a multi-threaded producer/consumer implementation,
where there could be multiple consumers for items produced by a producer. Since
in DataLad majority of time is done in IO interactions with outside git and git-annex
processes, and since we typically operate across multiple datasets, multi-threading
across datasets operations already provides a significant performance benefit.
All results from consumers are all yielded as soon as they are produced by consumers.
Because this implementation is based on threads, `producer` and `consumer` could
be some "closures" within code, thus having lean interface and accessing
data from shared "outer scope".
Notes
-----
- with jobs > 1, results are yielded as soon as available, so order
might not match the one provided by "producer".
- `producer` must produce unique entries. AssertionError might be raised if
the same entry is to be consumed.
- `consumer` can add to the queue of items produced by producer via
`.add_to_producer_queue`. This allows for continuous reuse of the same
instance in recursive operations (see `get` use of ProducerConsumer).
- if producer or consumer raise an exception, we will try to "fail gracefully",
unless subsequent Ctrl-C is pressed, we will let already running jobs to
finish first.
Examples
--------
A simple and somewhat boring example to count lines in '*.py'
>>> from glob import glob
>>> from pprint import pprint
>>> from datalad.support.parallel import ProducerConsumer
>>> def count_lines(fname):
... with open(fname) as f:
... return fname, len(f.readlines())
>>> pprint(dict(ProducerConsumer(glob("*.py"), count_lines))) # doctest: +SKIP
{'setup.py': 182, 'versioneer.py': 2136}
More usage examples could be found in `test_parallel.py` and around the
codebase `addurls.py`, `get.py`, `save.py`, etc.
"""
# Users should not specify -J100 and then just come complaining without
# being informed that they are out of luck
_alerted_already = False
def __init__(self,
producer, consumer,
*,
jobs=None,
safe_to_consume=None,
producer_future_key=None,
reraise_immediately=False,
agg=None,
):
"""
Parameters
----------
producer: iterable
Provides items to feed a consumer with
consumer: callable
Is provided with items produced by producer. Multiple consumers might
operate in parallel threads if jobs > 1
jobs: int, optional
If None or "auto", 'datalad.runtime.max-jobs' configuration variable is
consulted. With jobs=0 there is no threaded execution whatsoever. With
jobs=1 there is a separate thread for the producer, so in effect with jobs=1
some parallelization between producer (if it is a generator) and consumer
could be achieved, while there is only a single thread available for consumers.
safe_to_consume: callable, optional
A callable which gets a dict of all known futures and current item from producer.
It should return `True` if executor can proceed with current value from producer.
If not (unsafe to consume) - we will wait.
WARNING: outside code should make sure about provider and `safe_to_consume` to
play nicely or a very suboptimal behavior or possibly even a deadlock can happen.
producer_future_key: callable, optional
A key function for a value from producer which will be used as a key in futures
dictionary and output of which is passed to safe_to_consume.
reraise_immediately: bool, optional
If True, it would stop producer yielding values as soon as it detects that some
exception has occurred (although there might still be values in the queue to be yielded
which were collected before the exception was raised).
agg: callable, optional
Should be a callable with two arguments: (item, prior total) and return a new total
which will get assigned to .total of this object. If not specified, .total is
just a number of items produced by the producer.
"""
self.producer = producer
self.consumer = consumer
self.jobs = jobs
self.safe_to_consume = safe_to_consume
self.producer_future_key = producer_future_key
self.reraise_immediately = reraise_immediately
self.agg = agg
self.total = None if self.agg else 0
self._jobs = None # actual "parallel" jobs used
# Relevant only for _iter_threads
self._producer_finished = None
self._producer_queue = None
self._producer_exception = None
self._producer_interrupt = None
# so we could interrupt more or less gracefully
self._producer_thread = None
self._executor = None
self._futures = {}
self._interrupted = False
@property
def interrupted(self):
return self._interrupted
def __del__(self):
# if we are killed while executing, we should ask executor to shutdown
shutdown = getattr(self, "shutdown", None)
if shutdown:
shutdown(force=True)
def shutdown(self, force=False, exception=None):
if self._producer_thread and self._producer_thread.is_alive():
# we will try to let the worker to finish "gracefully"
self._producer_interrupt = f"shutdown due to {exception}"
# purge producer queue
if self._producer_queue:
while not self._producer_queue.empty():
self._producer_queue.get()
lgr.debug("Shutting down %s with %d futures. Reason: %s",
self._executor, len(self._futures), exception)
if not force:
# pop not yet running or done futures.
# Those would still have a chance to yield results and finish gracefully
# or their exceptions to be bubbled up FWIW.
ntotal = len(self._futures)
ncanceled = 0
nrunning = 0
# Do in reverse order so if any job still manages
# to sneak in, it would be the earlier submitted one.
for k, future in list(self._futures.items())[::-1]:
running = future.running()
nrunning += int(running)
if not (running or future.done()):
if self._futures.pop(k).cancel():
ncanceled += 1
lgr.info("Canceled %d out of %d jobs. %d left running.",
ncanceled, ntotal, nrunning)
else:
# just pop all entirely
for k in list(self._futures)[::-1]:
self._futures.pop(k).cancel()
if self._executor:
self._executor.shutdown()
self._executor = None
if exception:
raise exception
lgr.debug("Finished shutdown with force=%s due to exception=%r", force, exception)
def _update_total(self, value):
if self.agg:
self.total = (
self.agg(value, self.total) if self.total is not None else self.agg(value)
)
else:
self.total += 1
@classmethod
def get_effective_jobs(cls, jobs):
"""Return actual number of jobs to be used.
It will account for configuration variable ('datalad.runtime.max-jobs') and possible
other requirements (such as version of Python).
"""
if jobs in (None, "auto"):
from datalad import cfg
# ATM there is no "auto" for this operation, so in both auto and None
# just consult max-jobs which can only be an int ATM.
# "auto" could be for some auto-scaling based on a single future time
# to complete, scaling up/down. Ten config variable could accept "auto" as well
jobs = cfg.obtain('datalad.runtime.max-jobs')
return jobs
def __iter__(self):
self._jobs = self.get_effective_jobs(self.jobs)
if self._jobs == 0:
yield from self._iter_serial()
else:
yield from self._iter_threads(self._jobs)
def _iter_serial(self):
# depchecker is not consulted, serial execution
# reraise_immediately is also "always False by design"
# To allow consumer to add to the queue
self._producer_queue = producer_queue = Queue()
def produce():
# First consume all coming directly from producer and then go through all which
# consumer might have added to the producer queue
for args in self._producer_iter:
self._update_total(args)
yield args
# consumer could have added to the queue while we were still
# producing
while not producer_queue.empty():
yield producer_queue.get()
for args in produce():
res = self.consumer(args)
if inspect.isgenerator(res):
lgr.debug("Got consumer worker which returned a generator %s", res)
yield from res
else:
lgr.debug("Got straight result %s, not a generator", res)
yield res
@property
def _producer_iter(self):
"""A little helper to also support generator functions"""
return self.producer() if inspect.isgeneratorfunction(self.producer) else self.producer
def _iter_threads(self, jobs):
self._interrupted = False
self._producer_finished = False
self._producer_exception = None
self._producer_interrupt = None
# To allow feeding producer queue with more entries, possibly from consumer!
self._producer_queue = producer_queue = Queue()
consumer_queue = Queue()
def producer_worker():
"""That is the one which interrogates producer and updates .total"""
try:
for value in self._producer_iter:
if self._producer_interrupt:
raise InterruptedError("Producer thread was interrupted due to %s" % self._producer_interrupt)
self.add_to_producer_queue(value)
except InterruptedError:
pass # There is some outside exception which will be raised
except BaseException as e:
self._producer_exception = e
finally:
self._producer_finished = True
def consumer_worker(callable, *args, **kwargs):
"""Since jobs could return a generator and we cannot really "inspect" for that
"""
res = callable(*args, **kwargs)
if inspect.isgenerator(res):
lgr.debug("Got consumer worker which returned a generator %s", res)
didgood = False
for r in res:
didgood = True
lgr.debug("Adding %s to queue", r)
consumer_queue.put(r)
if not didgood:
lgr.error("Nothing was obtained from %s :-(", res)
else:
lgr.debug("Got straight result %s, not a generator", res)
consumer_queue.put(res)
self._producer_thread = Thread(target=producer_worker)
self._producer_thread.start()
self._futures = futures = {}
lgr.debug("Initiating ThreadPoolExecutor with %d jobs", jobs)
# we will increase sleep_time when doing nothing useful
sleeper = Sleeper()
interrupted_by_exception = None
with concurrent.futures.ThreadPoolExecutor(jobs) as executor:
self._executor = executor
# yield from the producer_queue (.total and .finished could be accessed meanwhile)
while True:
try:
done_useful = False
if self.reraise_immediately and self._producer_exception and not interrupted_by_exception:
# so we have a chance to exit gracefully
# No point to reraise if there is already an exception which was raised
# which might have even been this one
lgr.debug("Reraising an exception from producer as soon as we found it")
raise self._producer_exception
if (self._producer_finished and
not futures and
consumer_queue.empty() and
producer_queue.empty()):
# This will let us not "escape" the while loop and reraise any possible exception
# within the loop if we have any.
# Otherwise we might see "RuntimeError: generator ignored GeneratorExit"
# when e.g. we did continue upon interrupted_by_exception, and then
# no other subsequent exception was raised and we left the loop
raise _FinalShutdown()
# important! We are using threads, so worker threads will be sharing CPU time
# with this master thread. For it to become efficient, we should consume as much
# as possible from producer asap and push it to executor. So drain the queue
while not (producer_queue.empty() or interrupted_by_exception):
done_useful = True
try:
job_args = producer_queue.get() # timeout=0.001)
job_key = self.producer_future_key(job_args) if self.producer_future_key else job_args
if self.safe_to_consume:
# Sleep a little if we are not yet ready
# TODO: add some .debug level reporting based on elapsed time
# IIRC I did smth like growing exponentially delays somewhere (dandi?)
while not self.safe_to_consume(futures, job_key):
self._pop_done_futures(lgr) or sleeper()
# Current implementation, to provide depchecking, relies on unique
# args for the job
assert job_key not in futures
lgr.debug("Submitting worker future for %s", job_args)
futures[job_key] = executor.submit(consumer_worker, self.consumer, job_args)
except Empty:
pass
# check active futures
if not consumer_queue.empty():
done_useful = True
# ATM we do not bother of some "in order" reporting
# Just report as soon as any new record arrives
res = consumer_queue.get()
lgr.debug("Got %s from consumer_queue", res)
yield res
done_useful |= self._pop_done_futures(lgr)
if not done_useful: # you need some rest
# TODO: same here -- progressive logging
lgr.log(5,
"Did nothing useful, sleeping. Have "
"producer_finished=%s producer_queue.empty=%s futures=%s consumer_queue.empty=%s",
self._producer_finished,
producer_queue.empty(),
futures,
consumer_queue.empty(),
)
sleeper()
else:
sleeper.reset()
except (_FinalShutdown, GeneratorExit):
self.shutdown(force=True, exception=self._producer_exception or interrupted_by_exception)
break # if there were no exception to raise
except BaseException as exc:
ce = CapturedException(exc)
self._interrupted = True
if interrupted_by_exception:
# so we are here again but now it depends why we are here
if isinstance(exc, KeyboardInterrupt):
lgr.warning("Interrupted via Ctrl-C. Forcing the exit")
self.shutdown(force=True, exception=exc)
else:
lgr.warning(
"One more exception was received while "
"trying to finish gracefully: %s",
ce)
# and we go back into the loop until we finish or there is Ctrl-C
else:
interrupted_by_exception = exc
lgr.warning(
"Received an exception %s. Canceling not-yet "
"running jobs and waiting for completion of "
"running. You can force earlier forceful exit "
"by Ctrl-C.", ce)
self.shutdown(force=False, exception=exc)
def add_to_producer_queue(self, value):
self._producer_queue.put(value)
self._update_total(value)
def _pop_done_futures(self, lgr):
"""Removes .done from provided futures.
Returns
-------
bool
True if any future was removed
"""
done_useful = False
# remove futures which are done
for args, future in list(self._futures.items()):
if future.done():
done_useful = True
future_ = self._futures.pop(args)
exception = future_.exception()
if exception:
lgr.debug("Future for %r raised %s. Re-raising to trigger graceful shutdown etc", args, exception)
raise exception
lgr.debug("Future for %r is done", args)
return done_useful
class Sleeper():
def __init__(self):
self.min_sleep_time = 0.001
# but no more than to this max
self.max_sleep_time = 0.1
self.sleep_time = self.min_sleep_time
def __call__(self):
time.sleep(self.sleep_time)
self.sleep_time = min(self.max_sleep_time, self.sleep_time * 2)
def reset(self):
self.sleep_time = self.min_sleep_time
class ProducerConsumerProgressLog(ProducerConsumer):
"""ProducerConsumer wrapper with log_progress reporting.
It is to be used around a `consumer` which returns or yields result records.
If that is not the case -- use regular `ProducerConsumer`.
It will update `.total` of the `log_progress` each time it changes (i.e. whenever
producer produced new values to be consumed).
"""
def __init__(self,
producer, consumer,
*,
log_filter=None,
label="Total", unit="items",
lgr=None,
**kwargs
):
"""
Parameters
----------
producer, consumer, **kwargs
Passed into ProducerConsumer. Most likely kwargs must not include 'agg' or
if provided, it must return an 'int' value.
log_filter: callable, optional
If defined, only result records for which callable evaluates to True will be
passed to log_progress
label, unit: str, optional
Provided to log_progress
lgr: logger, optional
Provided to log_progress. Local one is used if not provided
"""
super().__init__(producer, consumer, **kwargs)
self.log_filter = log_filter
self.label = label
self.unit = unit
self.lgr = lgr
def __iter__(self):
pid = str(uuid.uuid4()) # could be based on PID and time may be to be informative?
lgr_ = self.lgr
label = self.label
if lgr_ is None:
lgr_ = lgr
log_progress(lgr_.info, pid,
"%s: starting", self.label,
# will become known only later total=len(items),
label=self.label, unit=" " + self.unit,
noninteractive_level=5)
counts = defaultdict(int)
total_announced = None # self.total
for res in super().__iter__():
if self.total and total_announced != self.total:
# update total with new information
log_progress(
lgr_.info,
pid,
"", # None flips python 3.6.7 in conda if nose ran without -s
# I do not think there is something
# valuable to announce
total=self.total,
# unfortunately of no effect, so we cannot inform that more items to come
# unit=("+" if not it.finished else "") + " " + unit,
update=0, # not None, so it does not stop
noninteractive_level=5
)
total_announced = self.total
if not (self.log_filter and not self.log_filter(res)):
counts[res["status"]] += 1
count_strs = [_count_str(*args)
for args in [(counts["notneeded"], "skipped", False),
(counts["error"], "failed", True)]]
if counts["notneeded"] or counts["error"] or self.interrupted:
strs = count_strs
if self.interrupted:
strs.append("exiting!")
label = "{} ({})".format(
self.label,
", ".join(filter(None, count_strs)))
log_progress(
lgr_.error if res["status"] == "error" else lgr_.info,
pid,
"%s: processed result%s", self.label,
" for " + res["path"] if "path" in res else "",
label=label, update=1, increment=True,
noninteractive_level=5)
yield res
log_progress(lgr_.info, pid, "%s: done", self.label,
noninteractive_level=5)
class _FinalShutdown(Exception):
"""Used internally for the final forceful shutdown if any exception did happen"""
pass
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/param.py 0000644 0001751 0001751 00000012271 15137634221 017343 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##g
"""Parameter representation"""
__docformat__ = 'restructuredtext'
import argparse
import re
import textwrap
from datalad.utils import getargspec
from .constraints import expand_constraint_spec
_whitespace_re = re.compile(r'\n\s+|^\s+')
class Parameter(object):
"""This class shall serve as a representation of a parameter.
"""
# Known keyword arguments which we want to allow to pass over into
# argparser.add_argument . Mentioned explicitly, since otherwise
# are not verified while working in Python-only API
# include_kwonlyargs=True is future-proofing since ATM in 3.9 there is no
# *, in Action.__init__ but could be added later, and semantically it
# makes sense to include those among _KNOWN_ARGS
_KNOWN_ARGS = getargspec(
argparse.Action.__init__, include_kwonlyargs=True
).args + ['action']
def __init__(self, constraints=None, doc=None, args=None, **kwargs):
"""Add constraints (validator) specifications and a docstring for
a parameter.
Parameters
----------
constraints : callable
A functor that takes any input value, performs checks or type
conversions and finally returns a value that is appropriate for a
parameter or raises an exception. This will also be used to set up
the ``type`` functionality of argparse.add_argument.
doc : str
Documentation about the purpose of this parameter.
args : tuple or None
Any additional positional args for argparser.add_argument. This is
most useful for assigned multiple alternative argument names or
create positional arguments.
**kwargs :
Any additional keyword args for argparser.add_argument.
Examples
--------
Ensure a parameter is a float
>>> from datalad.support.param import Parameter
>>> from datalad.support.constraints import (EnsureFloat, EnsureRange,
... AltConstraints, Constraints)
>>> C = Parameter(constraints=EnsureFloat())
Ensure a parameter is of type float or None:
>>> C = Parameter(constraints=AltConstraints(EnsureFloat(), None))
Ensure a parameter is None or of type float and lies in the inclusive
range (7.0,44.0):
>>> C = Parameter(
... AltConstraints(
... Constraints(EnsureFloat(),
... EnsureRange(min=7.0, max=44.0)),
... None))
"""
self.constraints = expand_constraint_spec(constraints)
self._doc = doc
self.cmd_args = args
# Verify that no mistyped kwargs present
unknown_args = set(kwargs).difference(self._KNOWN_ARGS)
if unknown_args:
raise ValueError(
"Detected unknown argument(s) for the Parameter: %s. Known are: %s"
% (', '.join(unknown_args), ', '.join(self._KNOWN_ARGS))
)
self.cmd_kwargs = kwargs
def get_autodoc(self, name, indent=" ", width=70, default=None, has_default=False):
"""Docstring for the parameter to be used in lists of parameters
Returns
-------
string or list of strings (if indent is None)
"""
paramsdoc = '%s' % name
sdoc = None
if self.constraints is not None:
sdoc = self.constraints.short_description()
elif 'action' in self.cmd_kwargs \
and self.cmd_kwargs['action'] in ("store_true", "store_false"):
sdoc = 'bool'
if sdoc is not None:
if sdoc[0] == '(' and sdoc[-1] == ')':
sdoc = sdoc[1:-1]
nargs = self.cmd_kwargs.get('nargs', '')
if isinstance(nargs, int):
sdoc = '{}-item sequence of {}'.format(nargs, sdoc)
elif nargs == '+':
sdoc = 'non-empty sequence of {}'.format(sdoc)
elif nargs == '*':
sdoc = 'sequence of {}'.format(sdoc)
if self.cmd_kwargs.get('action', None) == 'append':
sdoc = 'list of {}'.format(sdoc)
paramsdoc += " : %s" % sdoc
if has_default:
paramsdoc += ", optional"
paramsdoc = [paramsdoc]
doc = self._doc
if doc is None:
doc = ''
doc = doc.strip()
if len(doc) and not doc.endswith('.'):
doc += '.'
if has_default:
doc += " [Default: %r]" % (default,)
# Explicitly deal with multiple spaces, for some reason
# replace_whitespace is non-effective
doc = _whitespace_re.sub(' ', doc)
paramsdoc += [indent + x
for x in textwrap.wrap(doc, width=width - len(indent),
replace_whitespace=True)]
return '\n'.join(paramsdoc)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/path.py 0000644 0001751 0001751 00000024713 15137634221 017203 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Helper functionality and overloads for paths treatment
One of the reasons is also to robustify operation with unicode filenames
"""
from __future__ import annotations
# TODO: RF and move all paths related functions from datalad.utils in here
import os
import os.path as op
# to not pollute API importing as _
from collections import defaultdict as _defaultdict
from collections.abc import (
Iterable,
Iterator,
)
from functools import wraps
from itertools import dropwhile
from pathlib import (
Path,
PurePosixPath,
)
from ..utils import (
ensure_bytes,
getpwd,
)
def _get_unicode_robust_version(f):
@wraps(f)
def wrapped(*args, **kwargs):
try:
return f(*args, **kwargs)
except UnicodeEncodeError:
return f(ensure_bytes(*args, **kwargs))
doc = getattr(f, '__doc__', None)
# adjust only if __doc__ is not completely absent (None)
if doc is not None:
wrapped.__doc__ = doc + \
"\n\nThis wrapper around original function would encode forcefully " \
"to utf-8 if initial invocation fails"
return wrapped
abspath = op.abspath
basename = op.basename
curdir = op.curdir
dirname = op.dirname
exists = _get_unicode_robust_version(op.exists)
isdir = _get_unicode_robust_version(op.isdir)
isabs = _get_unicode_robust_version(op.isabs)
join = op.join
lexists = _get_unicode_robust_version(op.lexists)
normpath = op.normpath
pardir = op.pardir
pathsep = op.pathsep
relpath = op.relpath
realpath = _get_unicode_robust_version(op.realpath)
sep = op.sep
def robust_abspath(p: str | Path) -> str:
"""A helper which would not fail if p is relative and we are in non-existing directory
It will rely on getpwd, which would rely on $PWD env variable to report
the path. Desired for improved resilience during e.g. reporting as in
https://github.com/datalad/datalad/issues/2787
"""
try:
return abspath(p)
except OSError:
if not isabs(p):
try:
os.getcwd()
except Exception:
return normpath(join(getpwd(), p))
# if no exception raised it was not the reason, raise original
raise
def split_ext(filename: str) -> tuple[str, str]:
"""Use git-annex's splitShortExtensions rule for splitting extensions.
Parameters
----------
filename : str
Returns
-------
A tuple with (root, extension)
Examples
--------
>>> from datalad.local.addurls import split_ext
>>> split_ext("filename.py")
('filename', '.py')
>>> split_ext("filename.tar.gz")
('filename', '.tar.gz')
>>> split_ext("filename.above4chars.ext")
('filename.above4chars', '.ext')
"""
parts = filename.split(".")
if len(parts) == 1:
return filename, ""
tail = list(dropwhile(lambda x: len(x) < 5,
reversed(parts[1:])))
file_parts = parts[:1] + tail[::-1]
ext_parts = parts[1+len(tail):]
return ".".join(file_parts), "." + ".".join(ext_parts)
def get_parent_paths(paths: list[str], parents: list[str], only_with_parents: bool = False, *, sep: str = '/') -> list[str]:
"""Given a list of children paths, return their parent paths among parents
or their own path if there is no known parent. A path is also considered its
own parent (haven't you watched Predestination?) ;)
All paths should be relative, not pointing outside (not starting
with ../), and normalized (no // or dir/../dir and alike). Only minimal
sanity checking of values is done. By default paths are considered to be
POSIX. Use 'sep' kwarg to set to `os.sep` to provide OS specific handling.
Accent is made on performance to avoid O(len(paths) * len(parents))
runtime. ATM should be typically less than O(len(paths) * len(log(parents)))
Initial intended use - for a list of paths in the repository
to provide their paths as files/submodules known to that repository, to
overcome difference in ls-tree and ls-files, where ls-files outputs nothing
for paths within submodules.
It is coded, so it could later be applied even whenever there are nested
parents, e.g. parents = ['sub', 'sub/sub'] and then the "deepest" parent
is selected
Parameters
----------
parents: list of str
paths: list of str
only_with_parents: bool, optional
If set to True, return a list of only parent paths where that path had
a parent
sep: str, optional
Path separator. By default - '/' and thus treating paths as POSIX.
If you are processing OS-specific paths (for both `parents` and `paths`),
specify `sep=os.sep`.
Returns
-------
A list of paths (without duplicates), where some entries replaced with
their "parents" without duplicates. So for 'a/b' and 'a/c' with a being
among parents, there will be a single 'a'
"""
# Let's do an early check even though then we would skip the checks on paths
# being relative etc
if not parents:
return [] if only_with_parents else paths
# We will create a lookup for known parent lengths
parent_set = set(parents) # O(log(len(parents))) lookup
# Will be used in sanity checking that we got consistently used separators, i.e.
# not mixing non-POSIX paths and POSIX parents
asep = {'/': '\\', '\\': '/'}[sep]
# rely on path[:n] be quick, and len(parent_lengths) << len(parent_set)
# when len(parent_set) is large. We will also bail checking any parent of
# the length if at that length path has no directory boundary ('/').
#
# Create mapping for each length of
# parent path to list of parents with that length
parent_lengths_map: dict[int, set[str]] = _defaultdict(set)
for parent in parent_set:
_get_parent_paths_check(parent, sep, asep)
parent_lengths_map[len(parent)].add(parent)
# Make it ordered in the descending order so we select the deepest/longest parent
# and store them as sets for faster lookup.
# Could be an ordered dict but no need
parent_lengths = [(l, parent_lengths_map[l]) for l in sorted(parent_lengths_map, reverse=True)]
res = []
seen = set()
for path in paths: # O(len(paths)) - unavoidable but could be parallelized!
# Sanity check -- should not be too expensive
_get_parent_paths_check(path, sep, asep)
for parent_length, parents_ in parent_lengths: # O(len(parent_lengths))
if (len(path) < parent_length) or (len(path) > parent_length and path[parent_length] != sep):
continue # no directory deep enough
candidate_parent = path[:parent_length]
if candidate_parent in parents_: # O(log(len(parent_set))) but expected one less due to per length handling
if candidate_parent not in seen:
res.append(candidate_parent)
seen.add(candidate_parent)
break # it is!
else: # no hits
if not only_with_parents:
if path not in seen:
res.append(path)
seen.add(path)
return res
def get_filtered_paths_(paths: Iterable[str|Path], filter_paths: Iterable[str | Path],
*, include_within_path: bool = False) \
-> Iterator[str]:
"""Among paths (or Path objects) select the ones within filter_paths.
All `paths` and `filter_paths` must be relative and POSIX.
In case of `include_with_path=True`, if a `filter_path` points to some path
under a `path` within `paths`, that path would be returned as well, e.g.
`path` 'submod' would be returned if there is a `filter_path` 'submod/subsub/file'.
Complexity is O(N*log(N)), where N is the largest of the lengths of `paths`
or `filter_paths`.
Yields
------
paths, sorted (so order is not preserved), which reside under 'filter_paths' or
path within 'filter_paths' is under that path.
"""
# do conversion and sanity checks, O(N)
def _harmonize_paths(l: Iterable[str | Path]) -> list[tuple[str, ...]]:
ps = []
for p in l:
pp = PurePosixPath(p)
if pp.is_absolute():
raise ValueError(f"Got absolute path {p}, expected relative")
if pp.parts and pp.parts[0] == '..':
raise ValueError(f"Path {p} leads outside")
ps.append(pp.parts) # store parts
return sorted(ps) # O(N * log(N))
paths_parts = _harmonize_paths(paths)
filter_paths_parts = _harmonize_paths(filter_paths)
# we will pretty much "scroll" through sorted paths and filter_paths at the same time
for path_parts in paths_parts:
while filter_paths_parts:
filter_path_parts = filter_paths_parts[0]
l = min(len(path_parts), len(filter_path_parts))
# if common part is "greater" in the path -- we can go to the next "filter"
if filter_path_parts[:l] < path_parts[:l]:
# get to the next one
filter_paths_parts = filter_paths_parts[1:]
else:
break # otherwise -- consider this one!
else:
# no filter path left - the other paths cannot be the selected ones
break
if include_within_path:
# if one identical or subpath of another one -- their parts match in the beginning
# and we will just reuse that 'l'
pass
else:
# if all components of the filter match, for that we also add len(path_parts) check below
l = len(filter_path_parts)
if len(path_parts) >= l and (path_parts[:l] == filter_path_parts[:l]):
yield '/'.join(path_parts)
def _get_parent_paths_check(path: str, sep: str, asep: str) -> None:
"""A little helper for get_parent_paths"""
if isabs(path) or path.startswith(pardir + sep) or path.startswith(curdir + sep):
raise ValueError("Expected relative within directory paths, got %r" % path)
if sep+sep in path:
raise ValueError(f"Expected normalized paths, got {path} containing '{sep+sep}'")
if asep in path:
raise ValueError(f"Expected paths with {sep} as separator, got {path} containing '{asep}'")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/repodates.py 0000644 0001751 0001751 00000024612 15137634221 020233 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Utilities for checking repository dates.
"""
import logging
import operator
import re
import time
from datalad.log import log_progress
from datalad.support.exceptions import CommandError
from datalad.support.gitrepo import GitRepo
lgr = logging.getLogger('datalad.repodates')
def _cat_blob(repo, obj, bad_ok=False):
"""Call `git cat-file blob OBJ`.
Parameters
----------
repo : GitRepo
obj : str
Blob object.
bad_ok : boolean, optional
Don't fail if `obj` doesn't name a known blob.
Returns
-------
Blob's content (str) or None if `obj` is not and `bad_ok` is true.
"""
if bad_ok:
kwds = {"expect_fail": True, "expect_stderr": True}
else:
kwds = {}
try:
out_cat = repo.call_git(["cat-file", "blob", obj], read_only=True,
**kwds)
except CommandError as exc:
if bad_ok and "bad file" in exc.stderr:
out_cat = None
else:
raise
return out_cat
def branch_blobs(repo, branch):
"""Get all blobs for `branch`.
Parameters
----------
repo : GitRepo
branch : str
Returns
-------
A generator object that returns (hexsha, content, file name) for each blob
in `branch`. Note: By design a blob isn't tied to a particular file name;
the returned file name matches what is returned by 'git rev-list'.
"""
# Note: This might be nicer with rev-list's --filter and
# --filter-print-omitted, but those aren't available until Git v2.16.
lines = repo.call_git_items_(["rev-list", "--objects"] + [branch],
read_only=True)
# Trees and blobs have an associated path printed.
objects = (ln.split() for ln in lines)
blob_trees = [obj for obj in objects if len(obj) == 2]
num_objects = len(blob_trees)
log_progress(lgr.info, "repodates_branch_blobs",
"Checking %d objects", num_objects,
label="Checking objects", total=num_objects, unit=" objects")
# This is inefficient. It makes a git call for each object, some of which
# aren't even blobs. We could instead use 'git cat-file --batch'.
for obj, fname in blob_trees:
log_progress(lgr.info, "repodates_branch_blobs",
"Checking %s", obj,
increment=True, update=1)
content = _cat_blob(repo, obj, bad_ok=True)
if content:
yield obj, content, fname
log_progress(lgr.info, "repodates_branch_blobs",
"Finished checking %d objects", num_objects)
def branch_blobs_in_tree(repo, branch):
"""Get all blobs for the current tree of `branch`.
Parameters
----------
repo : GitRepo
branch : str, optional
Returns
-------
A generator object that returns (hexsha, content, file name) for each blob.
Note: If there are multiple files in the tree that point to the blob, only
the first file name that is reported by 'git ls-tree' is used (i.e., one
entry per blob is yielded).
"""
seen_blobs = set()
lines = list(repo.call_git_items_(["ls-tree", "-z", "-r", branch],
sep="\0", read_only=True))
if lines:
num_lines = len(lines)
log_progress(lgr.info,
"repodates_blobs_in_tree",
"Checking %d objects in git-annex tree", num_lines,
label="Checking objects", total=num_lines,
unit=" objects")
for line in lines:
if not line:
continue
_, obj_type, obj, fname = line.split()
log_progress(lgr.info, "repodates_blobs_in_tree",
"Checking %s", obj,
increment=True, update=1)
if obj_type == "blob" and obj not in seen_blobs:
yield obj, _cat_blob(repo, obj), fname
seen_blobs.add(obj)
log_progress(lgr.info, "repodates_blobs_in_tree",
"Finished checking %d blobs", num_lines)
# In uuid.log, timestamps look like "timestamp=1523283745.683191724s" and occur
# at the end of the line. In the *.log and *.log.meta files that are
# associated with annexed files, the timestamps occur at beginning of the line
# and don't have the "timestamp=" prefix.
ANNEX_DATE_RE = re.compile(r"^(?:[^\n]+timestamp=)?([0-9]+)(?:\.[0-9]+)?s",
re.MULTILINE)
def search_annex_timestamps(text):
"""Extract unix timestamps content of the git-annex branch.
Parameters
----------
text : str
Content from the git-annex branch (e.g., the content of the "uuid.log"
file).
Returns
-------
A generator object that returns a unix timestamp (without fractional any
seconds) for each timestamp found in `text`.
"""
for match in ANNEX_DATE_RE.finditer(text):
yield int(match.group(1))
def annex_dates(repo, all_objects=True):
"""Get git-annex branch blobs containing dates.
Parameters
----------
repo : GitRepo
all_objects : bool, optional
Instead for searching the content of all blobs in the git-annex branch,
search only the blobs that are in the tree of the tip of the git-annex
branch.
Returns
-------
A generator object that returns a tuple with the blob hexsha, a generator
with the blob's timestamps, and an associated file name.
"""
blob_fn = branch_blobs if all_objects else branch_blobs_in_tree
for hexsha, content, fname in blob_fn(repo, "git-annex"):
yield hexsha, search_annex_timestamps(content), fname
def tag_dates(repo, pattern=""):
"""Get timestamps for annotated tags.
Parameters
----------
repo : GitRepo
pattern : str
Limit the tags by this pattern. It will be appended to 'refs/tags'
argument passed to `git for-each-ref`.
Returns
-------
A generator object that returns a tuple with the tag hexsha and timestamp.
"""
for rec in repo.for_each_ref_(
fields=['objectname', 'taggerdate:raw'],
pattern='refs/tags/' + pattern):
if not rec['taggerdate:raw']:
# There's not a tagger date. It's not an annotated tag.
continue
yield rec['objectname'], int(rec['taggerdate:raw'].split()[0])
def log_dates(repo, revs=None):
"""Get log timestamps.
Parameters
----------
repo : GitRepo
revs : list, optional
Extract timestamps from commit objects that are reachable from these
revisions.
Returns
-------
A generator object that returns a tuple with the commit hexsha, author
timestamp, and committer timestamp.
"""
opts = [] if revs else ["--branches"]
try:
for line in repo.get_revisions(revs, fmt="%H %at %ct", options=opts):
hexsha, author_timestamp, committer_timestamp = line.split()
yield hexsha, int(author_timestamp), int(committer_timestamp)
except CommandError as e:
# With some Git versions, calling `git log --{all,branches,remotes}` in
# a repo with no commits may signal an error.
if "does not have any commits yet" not in e.stderr:
raise e
def check_dates(repo, timestamp=None, which="newer", revs=None,
annex=True, tags=True):
"""Search for dates in `repo` that are newer than `timestamp`.
This examines commit logs of local branches and the content of blobs in the
git-annex branch.
Parameters
----------
repo : GitRepo or str
If a str is passed, it is taken as the path to a GitRepo.
timestamp : int, optional
Unix timestamp. It defaults to a day before now.
which : {"newer", "older"}
Whether to return timestamps that are newer or older than `timestamp`.
revs : list, optional
Search for commit timestamps in commits that are area reachable from
these revisions. Any revision-specification allowed by `git log` can be
used, including things like `--all`. Defaults to all local branches.
annex : {True, "tree", False}, optional
If True, search the content of all blobs in the git-annex branch. If
"tree", search only the blobs that are in the tree of the tip of the
git-annex branch. If False, do not search git-annex blobs.
tags : bool, optional
Whether to check dates the dates of annotated tags.
Returns
-------
A dict that reports newer timestamps.
"""
if isinstance(repo, str):
repo = GitRepo(repo, create=False)
if timestamp is None:
timestamp = int(time.time()) - 60 * 60 * 24
if which == "newer":
cmp_fn = operator.gt
elif which == "older":
cmp_fn = operator.lt
else:
raise ValueError("unrecognized value for `which`: {}".format(which))
results = {}
lgr.debug("Checking dates in logs")
for hexsha, a_timestamp, c_timestamp in log_dates(repo, revs=revs):
if cmp_fn(a_timestamp, timestamp) or cmp_fn(c_timestamp, timestamp):
results[hexsha] = {"type": "commit",
"author-timestamp": a_timestamp,
"committer-timestamp": c_timestamp}
if tags:
lgr.debug("Checking dates of annotated tags")
for hexsha, tag_timestamp in tag_dates(repo):
if cmp_fn(tag_timestamp, timestamp):
results[hexsha] = {"type": "tag",
"timestamp": tag_timestamp}
if annex and "git-annex" in repo.get_branches():
all_objects = annex != "tree"
lgr.debug("Checking dates in blobs of git-annex branch%s",
"" if all_objects else "'s tip")
for hexsha, timestamps, fname in annex_dates(repo, all_objects):
hits = [ts for ts in timestamps if cmp_fn(ts, timestamp)]
if hits:
results[hexsha] = {"type": "annex-blob",
"timestamps": hits,
"filename": fname}
return {"reference-timestamp": timestamp,
"which": which,
"objects": results}
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/s3.py 0000644 0001751 0001751 00000036356 15137634221 016602 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil; coding: utf-8 -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##g
"""Variety of helpers to deal with AWS S3
Use as a script to generate test buckets via e.g.
python -m datalad.support.s3 generate test1_dirs
"""
__docformat__ = 'restructuredtext'
import logging
import mimetypes
import re
from pathlib import PurePath
from urllib.request import (
Request,
urlopen,
)
import boto3
from botocore.exceptions import ClientError
import datalad.log # Just to have lgr setup happen this one used a script
from datalad.support.network import URL
lgr = logging.getLogger('datalad.s3')
# TODO: should become a config option and managed along with the rest
S3_ADMIN_CREDENTIAL = "datalad-datalad-admin-s3"
S3_TEST_CREDENTIAL = "datalad-datalad-test-s3"
def _get_s3_resource(credname=None):
"""Creates a boto3 s3 resource
If credential name is given, DataLad credentials are retrieved or
entered; otherwise boto3 credential discovery mechanism is used
(~/.aws/config or env vars). Resources are a higher-level
abstraction than clients, and seem more fitting for use in this
module.
"""
if credname is not None:
from datalad.downloaders.credentials import AWS_S3
credential = AWS_S3(credname, None)
if not credential.is_known:
credential.enter_new()
creds = credential()
session = boto3.session.Session(
aws_access_key_id=creds["key_id"],
aws_secret_access_key=creds["secret_id"],
)
else:
session = boto3.session.Session()
return session.resource("s3")
class VersionedFilesPool(object):
"""Just a helper which would help to create versioned files in the bucket"""
def __init__(self, bucket):
self._versions = {}
self._bucket = bucket
@property
def bucket(self):
return self._bucket
def __call__(self, filename, prefix='', load=None):
self._versions[filename] = version = self._versions.get(filename, 0) + 1
version_str = f"version{version}"
mtype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
if load is None:
load = prefix
if PurePath(filename).suffix == 'html':
load += f'{version_str}'
else:
load += version_str
object = self.bucket.Object(key=filename)
object.put(
Body=load.encode(),
ContentType=mtype,
)
return object
def reset_version(self, filename):
self._versions[filename] = 0
def prune_and_delete_bucket(bucket):
"""Deletes all the content and then deletes bucket
Should be used with care -- no confirmation requested
"""
bucket.object_versions.delete()
bucket.delete()
lgr.info("Bucket %s was removed", bucket.name)
def set_bucket_public_access_policy(bucket):
# we need to enable permissions for making content available
bucket.Policy().put(Policy="""{
"Version":"2012-10-17",
"Statement":[{
"Sid":"AddPerm",
"Effect":"Allow",
"Principal": "*",
"Action":["s3:GetObject", "s3:GetObjectVersion", "s3:GetObjectTorrent", "s3:GetObjectVersionTorrent"],
"Resource":["arn:aws:s3:::%s/*"]
}
]
}""" % bucket.name)
def gen_test_bucket(bucket_name):
s3 = _get_s3_resource(S3_ADMIN_CREDENTIAL)
region = s3.meta.client.meta.region_name
bucket = s3.Bucket(bucket_name)
# assure we have none
exists = True
try:
s3.meta.client.head_bucket(Bucket=bucket_name)
except ClientError as e:
error_code = e.response["Error"]["Code"]
if error_code == "404":
exists = False
else:
# likely 403
raise e
if exists:
lgr.info("Deleting existing bucket %s", bucket.name)
prune_and_delete_bucket(bucket)
# by default, bucket is created in us-east, leading to constraint exception
# if user has different (endpoint) region in config - read & use the latter
bucket.create(CreateBucketConfiguration={"LocationConstraint": region})
return bucket
def _gen_bucket_test0(bucket_name="datalad-test0", versioned=True):
bucket = gen_test_bucket(bucket_name)
# Enable web access to that bucket to everyone
bucket.Website().put(
WebsiteConfiguration={"IndexDocument": {"Suffix": "index.html"}}
)
set_bucket_public_access_policy(bucket)
files = VersionedFilesPool(bucket)
files("1version-nonversioned1.txt")
files("2versions-nonversioned1.txt")
if versioned:
# make bucket versioned AFTER we uploaded one file already
bucket.Versioning().enable()
files("2versions-nonversioned1.txt")
files("2versions-nonversioned1.txt_sameprefix")
for v in range(3):
files("3versions-allversioned.txt")
files("3versions-allversioned.txt_sameprefix") # to test possible problems
# File which was created and then removed
#bucket.delete_key(files("1version-removed.txt"))
files("1version-removed.txt").delete()
# File which was created/removed/recreated (with new content)
files("2versions-removed-recreated.txt").delete()
files("2versions-removed-recreated.txt")
files("2versions-removed-recreated.txt_sameprefix")
# File which was created/removed/recreated (with new content)
f = "1version-removed-recreated.txt"
files(f).delete()
files.reset_version(f)
files(f)
lgr.info("Bucket %s was generated and populated", bucket_name)
return bucket
def gen_bucket_test0_versioned():
return _gen_bucket_test0('datalad-test0-versioned', versioned=True)
def gen_bucket_test0_nonversioned():
return _gen_bucket_test0('datalad-test0-nonversioned', versioned=False)
def gen_bucket_test1_dirs():
bucket_name = 'datalad-test1-dirs-versioned'
bucket = gen_test_bucket(bucket_name)
bucket.Versioning().enable()
# Enable web access to that bucket to everyone
bucket.Website().put(
WebsiteConfiguration={"IndexDocument": {"Suffix": "index.html"}}
)
set_bucket_public_access_policy(bucket)
files = VersionedFilesPool(bucket)
files("d1", load="") # creating an empty file
# then we would like to remove that d1 as a file and make a directory out of it
files("d1/file1.txt")
# and then delete it and place it back
files("d1", load="smth")
def gen_bucket_test2_obscurenames_versioned():
# in principle bucket name could also contain ., but boto doesn't digest it
# well
bucket_name = 'datalad-test2-obscurenames-versioned'
bucket = gen_test_bucket(bucket_name)
bucket.Versioning().enable()
# Enable web access to that bucket to everyone
bucket.Website().put(
WebsiteConfiguration={"IndexDocument": {"Suffix": "index.html"}}
)
set_bucket_public_access_policy(bucket)
files = VersionedFilesPool(bucket)
# http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html
files("f 1", load="")
files("f [1][2]")
# Need to grow up for this .... TODO
#files(u"ΡΠ½ΠΈΠΊΠΎΠ΄")
#files(u"ΡΠ½ΠΈ/ΠΊΠΎΠ΄")
# all fancy ones at once
files("f!-_.*'( )")
# the super-fancy which aren't guaranteed to be good idea (as well as [] above)
files("f &$=@:+,?;")
def gen_bucket_test1_manydirs():
# to test crawling with flexible subdatasets making decisions
bucket_name = 'datalad-test1-manydirs-versioned'
bucket = gen_test_bucket(bucket_name)
bucket.Versioning().enable()
# Enable web access to that bucket to everyone
bucket.Website().put(
WebsiteConfiguration={"IndexDocument": {"Suffix": "index.html"}}
)
set_bucket_public_access_policy(bucket)
files = VersionedFilesPool(bucket)
files("d1", load="") # creating an empty file
# then we would like to remove that d1 as a file and make a directory out of it
files("d1/file1.txt")
files("d1/sd1/file1.txt")
files("d1/sd2/file3.txt", load="a")
files("d1/sd2/ssd1/file4.txt")
files("d2/file1.txt")
files("d2/sd1/file1.txt")
files("d2/sd1/ssd/sssd/file1.txt")
def add_version_to_url(url, version, replace=False):
"""Add a version ID to `url`.
Parameters
----------
url : datalad.support.network.URL
A URL.
version : str
The value of 'versionId='.
replace : boolean, optional
If a versionID is already present in `url`, replace it.
Returns
-------
A versioned URL (str)
"""
version_id = "versionId={}".format(version)
if not url.query:
query = version_id
else:
ver_match = re.match("(?P.*&)?"
"(?PversionId=[^&]+)"
"(?P&.*)?",
url.query)
if ver_match:
if replace:
query = "".join([ver_match.group("pre") or "",
version_id,
ver_match.group("post") or ""])
else:
query = url.query
else:
query = url.query + "&" + version_id
return URL(**dict(url.fields, query=query)).as_str()
def get_versioned_url(url, guarantee_versioned=False, return_all=False, verify=False,
s3client=None, update=False):
"""Given a url return a versioned URL
Originally targeting AWS S3 buckets with versioning enabled
Parameters
----------
url : string
guarantee_versioned : bool, optional
Would fail if buckets is determined to have no versioning enabled.
It will not fail if we fail to determine if bucket is versioned or
not
return_all: bool, optional
If True, would return a list with URLs for all the versions of this
file, sorted chronologically with latest first (when possible, e.g.
for S3). Remove markers get ignored
verify: bool, optional
Verify that URL is accessible. As discovered some versioned keys might
be denied access to
s3client: botocore.client.S3, optional
A boto3 client instance that will be used to interact with AWS; if None,
a new one will be created.
update : bool, optional
If the URL already contains a version ID, update it to the latest version
ID. This option has no effect if return_all is true.
Returns
-------
string or list of string
"""
url_rec = URL(url)
s3_bucket, fpath = None, url_rec.path.lstrip('/')
was_versioned = False
all_versions = []
# hostname regex match allowing optional region code
# bucket-name.s3.region-code.amazonaws.com
match_virtual_hosted_style = re.match(
r"^(.+)(\.s3)(?:[.-][a-z0-9-]+){0,1}(\.amazonaws\.com)$", url_rec.hostname
)
# s3.region-code.amazonaws.com/bucket-name/key-name
match_path_style = re.match(
r"^s3(?:\.[a-z0-9-]+){0,1}(\.amazonaws\.com)$", url_rec.hostname
)
if match_virtual_hosted_style is not None:
s3_bucket = match_virtual_hosted_style.group(1)
elif match_path_style is not None:
if url_rec.scheme not in ('http', 'https'):
raise ValueError("Do not know how to handle %s scheme" % url_rec.scheme)
# url is s3.amazonaws.com/bucket/PATH
s3_bucket, fpath = fpath.split('/', 1)
elif url_rec.scheme == 's3':
s3_bucket = url_rec.hostname # must be
if url_rec.query and 'versionId=' in url_rec.query:
was_versioned = True
all_versions.append(url)
else:
# and for now implement magical conversion to URL
# TODO: wouldn't work if needs special permissions etc
# actually for now
raise NotImplementedError
if s3_bucket:
# TODO: cache
if s3client is None:
# we need to reuse our providers
from ..downloaders.providers import Providers
providers = Providers.from_config_files()
s3url = "s3://%s/" % s3_bucket
s3provider = providers.get_provider(s3url)
authenticator = s3provider.authenticator
if not authenticator:
# we will use the default one
from ..downloaders.s3 import S3Authenticator
authenticator = S3Authenticator()
if authenticator.client is not None:
# we have established connection before, so let's just reuse
s3client = authenticator.client
else:
s3client = authenticator.authenticate(s3_bucket, s3provider.credential)
supports_versioning = True # assume that it does
try:
# Status can be "Enabled" | "Suspended", or missing altogether
response = s3client.get_bucket_versioning(Bucket=s3_bucket)
supports_versioning = response.get("Status") == "Enabled"
except ClientError as e:
# might be forbidden, i.e. "403 Forbidden" so we try then anyways
supports_versioning = 'maybe'
if supports_versioning:
response = s3client.list_object_versions(
Bucket=s3_bucket,
Prefix=fpath,
)
all_keys = response.get("Versions", [])
# Filter and sort them so the newest one on top
all_keys = [
x
for x in sorted(
all_keys,
key=lambda x: (x["LastModified"], x["IsLatest"]),
reverse=True,
)
if ((x["Key"] == fpath)) # match exact name, not just prefix
]
# our current assumptions
assert all_keys[0]["IsLatest"]
# boto compatibility note: boto3 response should separate
# Versions & DeleteMarkers, no action needed
for key in all_keys:
url_versioned = add_version_to_url(
url_rec, key["VersionId"], replace=update and not return_all)
all_versions.append(url_versioned)
if verify:
# it would throw HTTPError exception if not accessible
_ = urlopen(Request(url_versioned))
was_versioned = True
if not return_all:
break
if guarantee_versioned and not was_versioned:
raise RuntimeError("Could not version %s" % url)
if not all_versions:
# we didn't get a chance
all_versions = [url_rec.as_str()]
if return_all:
return all_versions
else:
return all_versions[0]
if __name__ == '__main__':
import sys
lgr.setLevel(logging.INFO)
# TODO: proper cmdline
if len(sys.argv) > 1 and sys.argv[1] == "generate":
if len(sys.argv) < 3:
raise ValueError("Say 'all' to regenerate all, or give a generators name")
name = sys.argv[2]
if name.lower() == 'all':
gb = [k for k in locals().keys() if k.startswith('gen_bucket')]
for func_name in gb:
locals()[func_name]()
else:
locals()['gen_bucket_%s' % name]()
else:
print("nothing to do")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/sshconnector.py 0000644 0001751 0001751 00000075617 15137634221 020770 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Interface to an ssh connection.
Allows for connecting via ssh and keeping the connection open
(by using a controlmaster), in order to perform several ssh commands or
git calls to a ssh remote without the need to reauthenticate.
"""
import logging
import os
import tempfile
import threading
from hashlib import md5
from socket import gethostname
from subprocess import Popen
import fasteners
from datalad.cmd import (
NoCapture,
StdOutErrCapture,
WitlessRunner,
)
from datalad.support.exceptions import (
CapturedException,
CommandError,
ConnectionOpenFailedError,
)
from datalad.support.external_versions import external_versions
# importing the quote function here so it can always be imported from this
# module
# this used to be shlex.quote(), but is now a cross-platform helper
from datalad.utils import (
Path,
auto_repr,
ensure_list,
on_windows,
)
from datalad.utils import quote_cmdlinearg as sh_quote
# !!! Do not import network here -- delay import, allows to shave off 50ms or so
# on initial import datalad time
# from datalad.support.network import RI, is_ssh
lgr = logging.getLogger('datalad.support.sshconnector')
def get_connection_hash(hostname, port='', username='', identity_file='',
bundled=None, force_ip=False):
"""Generate a hash based on SSH connection properties
This can be used for generating filenames that are unique
to a connection from and to a particular machine (with
port and login username). The hash also contains the local
host name.
Identity file corresponds to a file that will be passed via ssh's -i
option.
All parameters correspond to the respective properties of an SSH
connection, except for `bundled`, which is unused.
.. deprecated:: 0.16
The ``bundled`` argument is ignored.
"""
if bundled is not None:
import warnings
warnings.warn(
"The `bundled` argument of `get_connection_hash()` is ignored. "
"It will be removed in a future release.",
DeprecationWarning)
# returning only first 8 characters to minimize our chance
# of hitting a limit on the max path length for the Unix socket.
# Collisions would be very unlikely even if we used less than 8.
# References:
# https://github.com/ansible/ansible/issues/11536#issuecomment-153030743
# https://github.com/datalad/datalad/pull/1377
# The "# nosec" below skips insecure hash checks by 'codeclimate'. The hash
# is not security critical, since it is only used as an "abbreviation" of
# the unique connection property string.
return md5( # nosec
'{lhost}{rhost}{port}{identity_file}{username}{force_ip}'.format(
lhost=gethostname(),
rhost=hostname,
port=port,
identity_file=identity_file,
username=username,
force_ip=force_ip or ''
).encode('utf-8')).hexdigest()[:8]
@auto_repr
class BaseSSHConnection(object):
"""Representation of an SSH connection.
"""
def __init__(self, sshri, identity_file=None,
use_remote_annex_bundle=None, force_ip=False):
"""Create a connection handler
The actual opening of the connection is performed on-demand.
Parameters
----------
sshri: SSHRI
SSH resource identifier (contains all connection-relevant info),
or another resource identifier that can be converted into an SSHRI.
identity_file : str or None
Value to pass to ssh's -i option.
use_remote_annex_bundle : bool, optional
If enabled, look for a git-annex installation on the remote and
prefer its Git binaries in the search path (i.e. prefer a bundled
Git over a system package). See also the configuration setting
datalad.ssh.try-use-annex-bundled-git
force_ip : {False, 4, 6}
Force the use of IPv4 or IPv6 addresses with -4 or -6.
.. versionchanged:: 0.16
The default for `use_remote_annex_bundle` changed from `True`
to `None`. Instead of attempting to use a potentially available
git-annex bundle on the remote host by default, this behavior
is now conditional on the `datalad.ssh.try-use-annex-bundled-git`
(off by default).
"""
self._runner = None
self._ssh_executable = None
self._ssh_version = None
from datalad.support.network import (
SSHRI,
is_ssh,
)
if not is_ssh(sshri):
raise ValueError(
"Non-SSH resource identifiers are not supported for SSH "
"connections: {}".format(sshri))
self.sshri = SSHRI(**{k: v for k, v in sshri.fields.items()
if k in ('username', 'hostname', 'port')})
# arguments only used for opening a connection
self._ssh_open_args = []
# arguments for annex ssh invocation
self._ssh_args = []
self._ssh_open_args.extend(
['-p', '{}'.format(self.sshri.port)] if self.sshri.port else [])
if force_ip:
self._ssh_open_args.append("-{}".format(force_ip))
if identity_file:
self._ssh_open_args.extend(["-i", identity_file])
self._use_remote_annex_bundle = use_remote_annex_bundle
# essential properties of the remote system
self._remote_props = {}
def __call__(self, cmd, options=None, stdin=None, log_output=True):
"""Executes a command on the remote.
It is the callers responsibility to properly quote commands
for remote execution (e.g. filename with spaces of other special
characters).
Parameters
----------
cmd: str
command to run on the remote
options : list of str, optional
Additional options to pass to the `-o` flag of `ssh`. Note: Many
(probably most) of the available configuration options should not be
set here because they can critically change the properties of the
connection. This exists to allow options like SendEnv to be set.
log_output: bool
Whether to capture and return stdout+stderr.
Returns
-------
tuple of str
stdout, stderr of the command run, if `log_output` was `True`
"""
raise NotImplementedError
def open(self):
"""Opens the connection.
Returns
-------
bool
To return True if connection establishes a control socket successfully.
Return False otherwise
"""
raise NotImplementedError
def close(self):
"""Closes the connection.
"""
raise NotImplementedError
@property
def ssh_executable(self):
"""determine which ssh client executable should be used.
"""
if not self._ssh_executable:
from datalad import cfg
self._ssh_executable = cfg.obtain("datalad.ssh.executable")
return self._ssh_executable
@property
def runner(self):
if self._runner is None:
self._runner = WitlessRunner()
return self._runner
@property
def ssh_version(self):
if self._ssh_version is None:
ssh_version = external_versions["cmd:ssh"]
self._ssh_version = ssh_version.version if ssh_version else None
return self._ssh_version
def _adjust_cmd_for_bundle_execution(self, cmd):
from datalad import cfg
# locate annex and set the bundled vs. system Git machinery in motion
if self._use_remote_annex_bundle \
or cfg.obtain('datalad.ssh.try-use-annex-bundled-git'):
remote_annex_installdir = self.get_annex_installdir()
if remote_annex_installdir:
# make sure to use the bundled git version if any exists
cmd = '{}; {}'.format(
'export "PATH={}:$PATH"'.format(remote_annex_installdir),
cmd)
return cmd
def _exec_ssh(self, ssh_cmd, cmd, options=None, stdin=None, log_output=True):
cmd = self._adjust_cmd_for_bundle_execution(cmd)
for opt in options or []:
ssh_cmd.extend(["-o", opt])
# build SSH call, feed remote command as a single last argument
# whatever it contains will go to the remote machine for execution
# we cannot perform any sort of escaping, because it will limit
# what we can do on the remote, e.g. concatenate commands with '&&'
ssh_cmd += [self.sshri.as_str()] + [cmd]
lgr.debug("%s is used to run %s", self, ssh_cmd)
# TODO: pass expect parameters from above?
# Hard to explain to toplevel users ... So for now, just set True
out = self.runner.run(
ssh_cmd,
protocol=StdOutErrCapture if log_output else NoCapture,
stdin=stdin)
return out['stdout'], out['stderr']
def _get_scp_command_spec(self, recursive, preserve_attrs):
"""Internal helper for SCP interface methods"""
# Convert ssh's port flag (-p) to scp's (-P).
scp_options = ["-P" if x == "-p" else x for x in self._ssh_args]
# add recursive, preserve_attributes flag if recursive, preserve_attrs set and create scp command
scp_options += ["-r"] if recursive else []
scp_options += ["-p"] if preserve_attrs else []
return ["scp"] + scp_options
def _quote_filename(self, filename):
if self.ssh_version and self.ssh_version[0] < 9:
return _quote_filename_for_scp(filename)
# no filename quoting for OpenSSH version 9 and above
return filename
def put(self, source, destination, recursive=False, preserve_attrs=False):
"""Copies source file/folder to destination on the remote.
Note: this method performs escaping of filenames to an extent that
moderately weird ones should work (spaces, quotes, pipes, other
characters with special shell meaning), but more complicated cases
might require appropriate external preprocessing of filenames.
Parameters
----------
source : str or list
file/folder path(s) to copy from on local
destination : str
file/folder path to copy to on remote
recursive : bool
flag to enable recursive copying of given sources
preserve_attrs : bool
preserve modification times, access times, and modes from the
original file
Returns
-------
str
stdout, stderr of the copy operation.
"""
# make sure we have an open connection, will test if action is needed
# by itself
self.open()
scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
# add source filepath(s) to scp command
scp_cmd += ensure_list(source)
# add destination path
scp_cmd += ['%s:%s' % (
self.sshri.hostname,
self._quote_filename(destination),
)]
out = self.runner.run(scp_cmd, protocol=StdOutErrCapture)
return out['stdout'], out['stderr']
def get(self, source, destination, recursive=False, preserve_attrs=False):
"""Copies source file/folder from remote to a local destination.
Note: this method performs escaping of filenames to an extent that
moderately weird ones should work (spaces, quotes, pipes, other
characters with special shell meaning), but more complicated cases
might require appropriate external preprocessing of filenames.
Parameters
----------
source : str or list
file/folder path(s) to copy from the remote host
destination : str
file/folder path to copy to on the local host
recursive : bool
flag to enable recursive copying of given sources
preserve_attrs : bool
preserve modification times, access times, and modes from the
original file
Returns
-------
str
stdout, stderr of the copy operation.
"""
# make sure we have an open connection, will test if action is needed
# by itself
self.open()
scp_cmd = self._get_scp_command_spec(recursive, preserve_attrs)
# add source filepath(s) to scp command, prefixed with the remote host
scp_cmd += ["%s:%s" % (self.sshri.hostname, self._quote_filename(s))
for s in ensure_list(source)]
# add destination path
scp_cmd += [destination]
out = self.runner.run(scp_cmd, protocol=StdOutErrCapture)
return out['stdout'], out['stderr']
def get_annex_installdir(self):
key = 'installdir:annex'
if key in self._remote_props:
return self._remote_props[key]
annex_install_dir = None
# already set here to avoid any sort of recursion until we know
# more
self._remote_props[key] = annex_install_dir
try:
with tempfile.TemporaryFile() as tempf:
# TODO does not work on windows
annex_install_dir = self(
# use sh -e to be able to fail at each stage of the process
"sh -e -c 'dirname $(readlink -f $(which git-annex-shell))'"
, stdin=tempf
)[0].strip()
except CommandError as e:
lgr.debug('Failed to locate remote git-annex installation: %s',
CapturedException(e))
self._remote_props[key] = annex_install_dir
return annex_install_dir
def get_annex_version(self):
key = 'cmd:annex'
if key in self._remote_props:
return self._remote_props[key]
try:
# modern annex versions
version = self('git annex version --raw')[0]
except CommandError:
# either no annex, or old version
try:
# fall back on method that could work with older installations
out, err = self('git annex version')
version = out.split('\n')[0].split(':')[1].strip()
except CommandError as e:
lgr.debug('Failed to determine remote git-annex version: %s',
CapturedException(e))
version = None
self._remote_props[key] = version
return version
def get_git_version(self):
key = 'cmd:git'
if key in self._remote_props:
return self._remote_props[key]
git_version = None
try:
git_version = self('git version')[0].split()[2]
except CommandError as e:
lgr.debug('Failed to determine Git version: %s',
CapturedException(e))
self._remote_props[key] = git_version
return git_version
@auto_repr
class NoMultiplexSSHConnection(BaseSSHConnection):
"""Representation of an SSH connection.
The connection is opened for execution of a single process, and closed
as soon as the process end.
"""
def __call__(self, cmd, options=None, stdin=None, log_output=True):
# there is no dedicated "open" step, put all args together
ssh_cmd = [self.ssh_executable] + self._ssh_open_args + self._ssh_args
return self._exec_ssh(
ssh_cmd,
cmd,
options=options,
stdin=stdin,
log_output=log_output)
def is_open(self):
return False
def open(self):
return False
def close(self):
# we perform blocking execution, we should not return from __call__ until
# the connection is already closed
pass
@auto_repr
class MultiplexSSHConnection(BaseSSHConnection):
"""Representation of a (shared) ssh connection.
"""
def __init__(self, ctrl_path, sshri, **kwargs):
"""Create a connection handler
The actual opening of the connection is performed on-demand.
Parameters
----------
ctrl_path: str
path to SSH controlmaster
sshri: SSHRI
SSH resource identifier (contains all connection-relevant info),
or another resource identifier that can be converted into an SSHRI.
**kwargs
Pass on to BaseSSHConnection
"""
super().__init__(sshri, **kwargs)
# on windows cmd args lists are always converted into a string using appropriate
# quoting rules, on other platforms args lists are passed directly and we need
# to take care of quoting ourselves
ctrlpath_arg = "ControlPath={}".format(ctrl_path if on_windows else sh_quote(str(ctrl_path)))
self._ssh_args += ["-o", ctrlpath_arg]
self._ssh_open_args += [
"-fN",
"-o", "ControlMaster=auto",
"-o", "ControlPersist=15m",
]
self.ctrl_path = Path(ctrl_path)
self._opened_by_us = False
# used by @fasteners.locked
self._lock = [
threading.Lock(),
fasteners.process_lock.InterProcessLock(self.ctrl_path.with_suffix('.lck'))
]
def __call__(self, cmd, options=None, stdin=None, log_output=True):
# XXX: check for open socket once
# and provide roll back if fails to run and was not explicitly
# checked first
# MIH: this would mean that we would have to distinguish failure
# of a payload command from failure of SSH itself. SSH however,
# only distinguishes success and failure of the entire operation
# Increase in fragility from introspection makes a potential
# performance benefit a questionable improvement.
# make sure we have an open connection, will test if action is needed
# by itself
self.open()
ssh_cmd = [self.ssh_executable] + self._ssh_args
return self._exec_ssh(
ssh_cmd,
cmd,
options=options,
stdin=stdin,
log_output=log_output)
def _assemble_multiplex_ssh_cmd(self, additional_arguments):
return [self.ssh_executable] \
+ additional_arguments \
+ self._ssh_args \
+ [self.sshri.as_str()]
def is_open(self):
if not self.ctrl_path.exists():
lgr.log(
5,
"Not opening %s for checking since %s does not exist",
self, self.ctrl_path
)
return False
# check whether controlmaster is still running:
cmd = self._assemble_multiplex_ssh_cmd(["-O", "check"])
lgr.debug("Checking %s by calling %s", self, cmd)
try:
# expect_stderr since ssh would announce to stderr
# "Master is running" and that is normal, not worthy warning about
# etc -- we are doing the check here for successful operation
with tempfile.TemporaryFile() as tempf:
self.runner.run(
cmd,
# do not leak output
protocol=StdOutErrCapture,
stdin=tempf)
res = True
except CommandError as e:
if e.code != 255:
# this is not a normal SSH error, whine ...
raise e
# SSH died and left socket behind, or server closed connection
self.close()
res = False
lgr.debug(
"Check of %s has %s",
self,
{True: 'succeeded', False: 'failed'}[res])
return res
@fasteners.locked
def open(self):
"""Opens the connection.
In other words: Creates the SSH ControlMaster to be used by this
connection, if it is not there already.
Returns
-------
bool
True when SSH reports success opening the connection, False when
a ControlMaster for an open connection already exists.
Raises
------
ConnectionOpenFailedError
When starting the SSH ControlMaster process failed.
"""
# the socket should vanish almost instantly when the connection closes
# sending explicit 'check' commands to the control master is expensive
# (needs tempfile to shield stdin, Runner overhead, etc...)
# as we do not use any advanced features (forwarding, stop[ing the
# master without exiting) it should be relatively safe to just perform
# the much cheaper check of an existing control path
if self.ctrl_path.exists():
return False
# create ssh control master command
cmd = self._assemble_multiplex_ssh_cmd(self._ssh_open_args)
# start control master:
lgr.debug("Opening %s by calling %s", self, cmd)
# The following call is exempt from bandit's security checks because
# we/the user control the content of 'cmd'.
proc = Popen(cmd) # nosec
stdout, stderr = proc.communicate(input="\n") # why the f.. this is necessary?
# wait till the command exits, connection is conclusively
# open or not at this point
exit_code = proc.wait()
if exit_code != 0:
raise ConnectionOpenFailedError(
cmd,
'Failed to open SSH connection (could not start ControlMaster process)',
exit_code,
stdout,
stderr,
)
self._opened_by_us = True
return True
def close(self):
if not self._opened_by_us:
lgr.debug("Not closing %s since was not opened by itself", self)
return
# stop controlmaster:
cmd = self._assemble_multiplex_ssh_cmd(["-O", "stop"])
lgr.debug("Closing %s by calling %s", self, cmd)
try:
self.runner.run(cmd, protocol=StdOutErrCapture)
except CommandError as e:
lgr.debug("Failed to run close command")
if self.ctrl_path.exists():
lgr.debug("Removing existing control path %s", self.ctrl_path)
# socket need to go in any case
self.ctrl_path.unlink()
if e.code != 255:
# not a "normal" SSH error
raise e
@auto_repr
class BaseSSHManager(object):
"""Interface for an SSHManager
"""
def ensure_initialized(self):
"""Ensures that manager is initialized"""
pass
assure_initialized = ensure_initialized
def get_connection(self, url, use_remote_annex_bundle=None, force_ip=False):
"""Get an SSH connection handler
Parameters
----------
url: str
ssh url
use_remote_annex_bundle : bool, optional
If enabled, look for a git-annex installation on the remote and
prefer its Git binaries in the search path (i.e. prefer a bundled
Git over a system package). See also the configuration setting
datalad.ssh.try-use-annex-bundled-git
force_ip : {False, 4, 6}
Force the use of IPv4 or IPv6 addresses.
Returns
-------
BaseSSHConnection
.. versionchanged:: 0.16
The default for `use_remote_annex_bundle` changed from `True`
to `None`. Instead of attempting to use a potentially available
git-annex bundle on the remote host by default, this behavior
is now conditional on the `datalad.ssh.try-use-annex-bundled-git`
(off by default).
"""
raise NotImplementedError
def _prep_connection_args(self, url):
# parse url:
from datalad.support.network import (
RI,
is_ssh,
)
if isinstance(url, RI):
sshri = url
else:
if ':' not in url and '/' not in url:
# it is just a hostname
lgr.debug("Assuming %r is just a hostname for ssh connection",
url)
url += ':'
sshri = RI(url)
if not is_ssh(sshri):
raise ValueError("Unsupported SSH URL: '{0}', use "
"ssh://host/path or host:path syntax".format(url))
from datalad import cfg
identity_file = cfg.get("datalad.ssh.identityfile")
return sshri, identity_file
def close(self, allow_fail=True):
"""Closes all connections, known to this instance.
Parameters
----------
allow_fail: bool, optional
If True, swallow exceptions which might be thrown during
connection.close, and just log them at DEBUG level
"""
pass
@auto_repr
class NoMultiplexSSHManager(BaseSSHManager):
"""Does not "manage" and just returns a new connection
"""
def get_connection(self, url, use_remote_annex_bundle=None, force_ip=False):
sshri, identity_file = self._prep_connection_args(url)
return NoMultiplexSSHConnection(
sshri,
identity_file=identity_file,
use_remote_annex_bundle=use_remote_annex_bundle,
force_ip=force_ip,
)
@auto_repr
class MultiplexSSHManager(BaseSSHManager):
"""Keeps ssh connections to share. Serves singleton representation
per connection.
A custom identity file can be specified via `datalad.ssh.identityfile`.
Callers are responsible for reloading `datalad.cfg` if they have changed
this value since loading datalad.
"""
def __init__(self):
super().__init__()
self._socket_dir = None
self._connections = dict()
# Initialization of prev_connections is happening during initial
# handling of socket_dir, so we do not define them here explicitly
# to an empty list to fail if logic is violated
self._prev_connections = None
# and no explicit initialization in the constructor
# self.ensure_initialized()
@property
def socket_dir(self):
"""Return socket_dir, and if was not defined before,
and also pick up all previous connections (if any)
"""
self.ensure_initialized()
return self._socket_dir
def ensure_initialized(self):
"""Assures that manager is initialized - knows socket_dir, previous connections
"""
if self._socket_dir is not None:
return
from datalad import cfg
self._socket_dir = Path(cfg.obtain('datalad.locations.sockets'))
self._socket_dir.mkdir(exist_ok=True, parents=True)
try:
os.chmod(str(self._socket_dir), 0o700)
except OSError as exc:
lgr.warning(
"Failed to (re)set permissions on the %s. "
"Most likely future communications would be impaired or fail. "
"Original exception: %s",
self._socket_dir, CapturedException(exc)
)
try:
self._prev_connections = [p
for p in self.socket_dir.iterdir()
if not p.is_dir()]
except OSError as exc:
self._prev_connections = []
lgr.warning(
"Failed to list %s for existing sockets. "
"Most likely future communications would be impaired or fail. "
"Original exception: %s",
self._socket_dir, CapturedException(exc)
)
lgr.log(5,
"Found %d previous connections",
len(self._prev_connections))
assure_initialized = ensure_initialized
def get_connection(self, url, use_remote_annex_bundle=None, force_ip=False):
sshri, identity_file = self._prep_connection_args(url)
conhash = get_connection_hash(
sshri.hostname,
port=sshri.port,
identity_file=identity_file or "",
username=sshri.username,
force_ip=force_ip,
)
# determine control master:
ctrl_path = self.socket_dir / conhash
# do we know it already?
if ctrl_path in self._connections:
return self._connections[ctrl_path]
else:
c = MultiplexSSHConnection(
ctrl_path, sshri, identity_file=identity_file,
use_remote_annex_bundle=use_remote_annex_bundle,
force_ip=force_ip)
self._connections[ctrl_path] = c
return c
def close(self, allow_fail=True, ctrl_path=None):
"""Closes all connections, known to this instance.
Parameters
----------
allow_fail: bool, optional
If True, swallow exceptions which might be thrown during
connection.close, and just log them at DEBUG level
ctrl_path: str, Path, or list of str or Path, optional
If specified, only the path(s) provided would be considered
"""
if self._connections:
ctrl_paths = [Path(p) for p in ensure_list(ctrl_path)]
to_close = [c for c in self._connections
# don't close if connection wasn't opened by SSHManager
if self._connections[c].ctrl_path
not in self._prev_connections and
self._connections[c].ctrl_path.exists()
and (not ctrl_paths
or self._connections[c].ctrl_path in ctrl_paths)]
if to_close:
lgr.debug("Closing %d SSH connections...", len(to_close))
for cnct in to_close:
f = self._connections[cnct].close
if allow_fail:
f()
else:
try:
f()
except Exception as exc:
ce = CapturedException(exc)
lgr.debug("Failed to close a connection: "
"%s", ce.message)
self._connections = dict()
# retain backward compat with 0.13.4 and earlier
# should be ok since cfg already defined by the time this one is imported
from .. import cfg
if cfg.obtain('datalad.ssh.multiplex-connections'):
SSHManager = MultiplexSSHManager
SSHConnection = MultiplexSSHConnection
else:
SSHManager = NoMultiplexSSHManager
SSHConnection = NoMultiplexSSHConnection
def _quote_filename_for_scp(name):
"""Manually escape shell goodies in a file name.
Why manual? Because the author couldn't find a better way, and
simply quoting the entire filename does not work with SCP's overly
strict file matching criteria (likely a bug on their side).
Hence this beauty:
"""
for s, t in (
(' ', '\\ '),
('"', '\\"'),
("'", "\\'"),
("&", "\\&"),
("|", "\\|"),
(">", "\\>"),
("<", "\\<"),
(";", "\\;")):
name = name.replace(s, t)
return name
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/sshrun.py 0000644 0001751 0001751 00000010515 15137634221 017564 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""SSH command to expose datalad's connection management to 3rd-party tools
Primary use case is to be used with git as core.sshCommand
"""
__docformat__ = 'restructuredtext'
import logging
import os
import sys
import tempfile
from datalad import ssh_manager
from datalad.interface.base import (
Interface,
build_doc,
)
from datalad.support.param import Parameter
from datalad.utils import split_cmdline
lgr = logging.getLogger('datalad.sshrun')
@build_doc
class SSHRun(Interface):
"""Run command on remote machines via SSH.
This is a replacement for a small part of the functionality of SSH.
In addition to SSH alone, this command can make use of datalad's SSH
connection management. Its primary use case is to be used with Git
as 'core.sshCommand' or via "GIT_SSH_COMMAND".
Configure `datalad.ssh.identityfile` to pass a file to the ssh's -i option.
"""
_params_ = dict(
login=Parameter(
args=("login",),
doc="[user@]hostname"),
cmd=Parameter(
args=("cmd",),
doc="command for remote execution"),
port=Parameter(
args=("-p", '--port'),
doc="port to connect to on the remote host"),
ipv4=Parameter(
args=("-4",),
dest="ipv4",
doc="use IPv4 addresses only",
action="store_true"),
ipv6=Parameter(
args=("-6",),
dest="ipv6",
doc="use IPv6 addresses only",
action="store_true"),
options=Parameter(
args=("-o",),
metavar="OPTION",
dest="options",
doc="configuration option passed to SSH",
action="append"),
no_stdin=Parameter(
args=("-n",),
action="store_true",
dest="no_stdin",
doc="Do not connect stdin to the process"),
)
@staticmethod
def __call__(login, cmd,
*,
port=None, ipv4=False, ipv6=False, options=None,
no_stdin=False):
lgr.debug("sshrun invoked: login=%r, cmd=%r, port=%r, options=%r, "
"ipv4=%r, ipv6=%r, no_stdin=%r",
login, cmd, port, options, ipv4, ipv6, no_stdin)
# Perspective workarounds for git-annex invocation, see
# https://github.com/datalad/datalad/issues/1456#issuecomment-292641319
if cmd.startswith("'") and cmd.endswith("'"):
lgr.debug(
"Detected additional level of quotations in %r so performing "
"command line splitting", cmd
)
# there is an additional layer of quotes
# Let's strip them off by splitting the command
cmd_ = split_cmdline(cmd)
if len(cmd_) != 1:
raise RuntimeError(
"Obtained more or less than a single argument after "
"command line splitting: %s" % repr(cmd_))
cmd = cmd_[0]
sshurl = 'ssh://{}{}'.format(
login,
':{}'.format(port) if port else '')
if ipv4 and ipv6:
raise ValueError("Cannot force both IPv4 and IPv6")
elif ipv4:
force_ip = 4
elif ipv6:
force_ip = 6
else:
force_ip = None
ssh = ssh_manager.get_connection(sshurl, force_ip=force_ip)
# use an empty temp file as stdin if none shall be connected
stdin_ = tempfile.TemporaryFile() if no_stdin else sys.stdin
try:
# We pipe the SSH process' stdout/stderr by means of
# `log_output=False`. That's necessary to let callers - for example
# git-clone - communicate with the SSH process. Hence, we expect no
# output being returned from this call:
out, err = ssh(cmd, stdin=stdin_, log_output=False, options=options)
assert not out
assert not err
finally:
if no_stdin:
stdin_.close()
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/stats.py 0000644 0001751 0001751 00000014054 15137634221 017402 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""A helper for collecting stats on carried out actions
"""
__docformat__ = 'restructuredtext'
# TODO: we have already smth in progressbar... check
import humanize
_COUNTS = (
'files', 'urls',
'add_git', 'add_annex', 'dropped',
'skipped', 'overwritten', 'renamed', 'removed',
'downloaded', 'downloaded_size', 'downloaded_time',
'datasets_crawled',
'datasets_crawl_failed',
)
_LISTS = (
'merges', # merges which were carried out (from -> to)
'versions', # versions encountered. Latest would be used for tagging
)
_FORMATTERS = {
# TODO:
'downloaded_size': humanize.naturalsize,
'merges': lambda merges: ", ".join('->'.join(merge) for merge in merges),
'versions': lambda versions: ', '.join(versions)
}
# @auto_repr
class ActivityStats(object):
"""Helper to collect/pass statistics on carried out actions
It also keeps track of total counts, which do not get reset by
reset() call, and "total" stat could be obtained by .get_total()
Could be done so many other ways
"""
__metrics__ = _COUNTS + _LISTS
__slots__ = __metrics__ + ('_current', '_total')
def __init__(self, **vals):
self._current = {}
self._total = {}
self.reset(full=True, vals=vals)
def __repr__(self):
# since auto_repr doesn't support "non-0" values atm
return "%s(%s)" \
% (self.__class__.__name__,
", ".join(["%s=%s" % (k, v) for k, v in self._current.items() if v]))
# Comparisons operate solely on _current
def __eq__(self, other):
return (self._current == other._current) # and (self._total == other._total)
def __ne__(self, other):
return (self._current != other._current) # or (self._total != other._total)
def __iadd__(self, other):
for m in other.__metrics__:
# not inplace for increased paranoia for bloody lists, and dummy implementation of *add
self._current[m] = self._current[m] + other._current[m]
self._total[m] = self._total[m] + other._total[m]
return self
def __add__(self, other):
# crashed
# out = deepcopy(self)
# so doing ugly way
out = ActivityStats(**self._current)
out._total = self._total.copy()
out += other
return out
def __setattr__(self, key, value):
if key in self.__metrics__:
self._current[key] = value
else:
return super(ActivityStats, self).__setattr__(key, value)
def __getattribute__(self, key):
if (not key.startswith('_')) and key in self.__metrics__:
return self._current[key]
else:
return super(ActivityStats, self).__getattribute__(key)
def _get_updated_total(self):
"""Return _total updated with _current
"""
out = self._total.copy()
for k, v in self._current.items():
# not inplace + so we could create copies of lists
out[k] = out[k] + v
return out
def increment(self, k, v=1):
"""Helper for incrementing counters"""
self._current[k] += v
def _reset_values(self, d, vals):
for c in _COUNTS:
d[c] = vals.get(c, 0)
for l in _LISTS:
d[l] = vals.get(l, [])
def reset(self, full=False, vals=None):
# Initialize
if vals is None:
vals = {}
if not full:
self._total = self._get_updated_total()
self._reset_values(self._current, vals=vals)
if full:
self._reset_values(self._total, vals=vals)
def get_total(self):
"""Return a copy of total stats (for convenience)"""
return self.__class__(**self._get_updated_total())
def as_dict(self):
return self._current.copy()
def as_str(self, mode='full'):
"""
Parameters
----------
mode : {'full', 'line'}
"""
# Example
#"""
#URLs processed: {urls}
# downloaded: {downloaded}
# downloaded size: {downloaded_size}
#Files processed: {files}
# skipped: {skipped}
# renamed: {renamed}
# removed: {removed}
# added to git: {add_git}
# added to annex: {add_annex}
# overwritten: {overwritten}
#Branches merged:
# upstream -> master
#"""
# TODO: improve
entries = self.as_dict()
entries.update({
k: (_FORMATTERS[k](entries[k]) if entries[k] else '')
for k in _FORMATTERS
})
out_formats = [
("URLs processed", "urls"),
(" downloaded", "downloaded"),
(" size", "downloaded_size"),
("Files processed", "files"),
(" skipped", "skipped"),
(" renamed", "renamed"),
(" removed", "removed"),
(" overwritten", "overwritten"),
(" +git", "add_git"),
(" +annex", "add_annex"),
("Branches merged", "merges"),
("Datasets crawled", "datasets_crawled"),
(" failed", "datasets_crawl_failed"),
]
# Filter out empty/0 ones
out = ["%s: " % s + str(entries[m]) for s, m in out_formats if entries[m]]
if mode == 'full':
return '\n'.join(out)
elif mode == 'line':
for i, o in enumerate(out):
if o[0] != ' ':
out[i] = ' ' + o
return ','.join(out).lstrip()
#return "{files} files (git/annex: {add_git}/{add_annex}), " \
# "{skipped} skipped, {renamed} renamed, {overwritten} overwritten".format(
# **entries)
else:
raise ValueError("Unknown mode %s" % mode)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/status.py 0000644 0001751 0001751 00000004513 15137634221 017566 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""(comparable) descriptors of the file status
"""
__docformat__ = 'restructuredtext'
from ..utils import auto_repr
@auto_repr
class FileStatus(object):
"""Description of the file status to e.g. check if newer version is available
"""
def __init__(self, size=None, mtime=None, filename=None):
self.size = size
self.mtime = mtime
# TODO: actually not sure if filename should be here!
self.filename = filename
def __eq__(self, other):
# If other is still None, we must be different
if not other:
return False
# Disallow comparison of empty ones
if self.size is None and self.mtime is None: # and self.filename is None:
return NotImplemented
if other.size is None and other.mtime is None: # and other.filename is None:
return NotImplemented
same = \
self.size == other.size # and \
#self.filename == other.filename
if not same:
return False
# now deal with time.
# TODO: provide a config option for mtime comparison precision
# we might want to claim times equal up to a second precision
# since e.g. some file systems do not even store sub-sec timing
# TODO: config crawl.mtime_delta
# if any of them int and another float -- we need to trim float to int
if self.mtime == other.mtime:
return True
elif self.mtime is None or other.mtime is None:
return False
# none is None if here and not equal exactly
if isinstance(self.mtime, int) or isinstance(other.mtime, int):
return int(self.mtime) == int(other.mtime)
return False
def __ne__(self, other):
out = self == other
if isinstance(out, bool):
return not out
elif out is NotImplemented:
return out
else:
raise RuntimeError("Unknown return %r" % (out,))
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/strings.py 0000644 0001751 0001751 00000004366 15137634221 017742 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##g
"""Variety of helpers to deal with strings"""
from __future__ import annotations
__docformat__ = 'restructuredtext'
import re
from typing import AnyStr
def get_replacement_dict(rules: AnyStr | list[AnyStr | list[AnyStr] | tuple[AnyStr, AnyStr]]) -> dict[AnyStr, AnyStr]:
"""Given a string with replacement rules, produces a dict of from: to"""
if isinstance(rules, (bytes, str)):
rules = [rules]
pairs = dict()
for rule in rules:
if isinstance(rule, (list, tuple)):
if len(rule) == 2:
pairs[rule[0]] = rule[1]
else:
raise ValueError("Got a rule %s which is not a string or a pair of values (from, to)"
% repr(rule))
elif len(rule) <= 2:
raise ValueError("")
else:
rule_split = rule[1:].split(rule[0:1])
if len(rule_split) != 2:
raise ValueError(
"Rename string must be of format '/pat1/replacement', "
"where / is an arbitrary character to decide replacement. "
"Got %r when trying to separate %r" % (rule_split, rule)
)
pairs[rule_split[0]] = rule_split[1]
return pairs
def apply_replacement_rules(rules: AnyStr | list[AnyStr | list[AnyStr] | tuple[AnyStr, AnyStr]], s: AnyStr) -> AnyStr:
r"""Apply replacement rules specified as a single string
Examples
--------
>>> apply_replacement_rules(r'/my_(.*)\.dat/your_\1.dat.gz', 'd/my_pony.dat')
'd/your_pony.dat.gz'
Parameters
----------
rules : str, list of str
Rules of the format '/pat1/replacement', where / is an arbitrary
character to decide replacement.
Returns
-------
str
"""
for regexp, replacement in get_replacement_dict(rules).items():
s = re.sub(regexp, replacement, s)
return s
././@PaxHeader 0000000 0000000 0000000 00000000033 00000000000 010211 x ustar 00 27 mtime=1769945274.887061
datalad-1.3.1/datalad/support/tests/ 0000755 0001751 0001751 00000000000 15137634273 017037 5 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/__init__.py 0000644 0001751 0001751 00000000723 15137634221 021143 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Tests for support submodules. Majority still resides under datalad.tests
"""
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_annexrepo.py 0000644 0001751 0001751 00000277727 15137634221 022466 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test implementation of class AnnexRepo
"""
import gc
import json
import logging
import os
import re
import sys
import unittest.mock
from functools import partial
from glob import glob
from os import mkdir
from os.path import (
basename,
curdir,
exists,
)
from os.path import join as opj
from os.path import (
pardir,
relpath,
)
from queue import Queue
from shutil import copyfile
from unittest.mock import patch
from urllib.parse import (
urljoin,
urlsplit,
)
import pytest
from datalad import cfg as dl_cfg
from datalad.api import clone
from datalad.cmd import GitWitlessRunner
from datalad.cmd import WitlessRunner as Runner
from datalad.consts import (
DATALAD_SPECIAL_REMOTE,
DATALAD_SPECIAL_REMOTES_UUIDS,
WEB_SPECIAL_REMOTE_UUID,
)
from datalad.distribution.dataset import Dataset
from datalad.runner.gitrunner import GitWitlessRunner
from datalad.support import path as op
# imports from same module:
from datalad.support.annexrepo import (
AnnexJsonProtocol,
AnnexRepo,
GeneratorAnnexJsonNoStderrProtocol,
GeneratorAnnexJsonProtocol,
)
from datalad.support.exceptions import (
AnnexBatchCommandError,
CommandError,
FileInGitError,
FileNotInAnnexError,
FileNotInRepositoryError,
IncompleteResultsError,
InsufficientArgumentsError,
MissingExternalDependency,
OutdatedExternalDependency,
OutOfSpaceError,
RemoteNotAvailableError,
)
from datalad.support.external_versions import external_versions
from datalad.support.gitrepo import GitRepo
from datalad.support.sshconnector import get_connection_hash
from datalad.tests.utils_pytest import (
DEFAULT_BRANCH,
DEFAULT_REMOTE,
OBSCURE_FILENAME,
SkipTest,
assert_cwd_unchanged,
)
from datalad.tests.utils_pytest import assert_dict_equal as deq_
from datalad.tests.utils_pytest import (
assert_equal,
assert_false,
assert_in,
assert_is_instance,
assert_not_equal,
assert_not_in,
assert_not_is_instance,
assert_raises,
assert_re_in,
assert_repo_status,
assert_result_count,
assert_true,
create_tree,
eq_,
find_files,
get_most_obscure_supported_name,
known_failure_githubci_win,
known_failure_windows,
maybe_adjust_repo,
ok_,
ok_annex_get,
ok_file_has_content,
ok_file_under_git,
ok_git_config_not_empty,
on_github,
on_nfs,
on_travis,
serve_path_via_http,
set_annex_version,
skip_if,
skip_if_adjusted_branch,
skip_if_on_windows,
skip_if_root,
skip_nomultiplex_ssh,
slow,
swallow_logs,
swallow_outputs,
with_parametric_batch,
with_sameas_remote,
with_tempfile,
with_tree,
xfail_buggy_annex_info,
)
from datalad.utils import (
Path,
chpwd,
get_linux_distribution,
on_windows,
quote_cmdlinearg,
rmtree,
unlink,
)
_GIT_ANNEX_VERSIONS_INFO = AnnexRepo.check_repository_versions()
@assert_cwd_unchanged
@with_tempfile
@with_tempfile
def test_AnnexRepo_instance_from_clone(src=None, dst=None):
origin = AnnexRepo(src, create=True)
ar = AnnexRepo.clone(src, dst)
assert_is_instance(ar, AnnexRepo, "AnnexRepo was not created.")
ok_(os.path.exists(os.path.join(dst, '.git', 'annex')))
# do it again should raise ValueError since git will notice
# there's already a git-repo at that path and therefore can't clone to `dst`
with swallow_logs(new_level=logging.WARN) as cm:
assert_raises(ValueError, AnnexRepo.clone, src, dst)
@assert_cwd_unchanged
@with_tempfile
def test_AnnexRepo_instance_from_existing(path=None):
AnnexRepo(path, create=True)
ar = AnnexRepo(path)
assert_is_instance(ar, AnnexRepo, "AnnexRepo was not created.")
ok_(os.path.exists(os.path.join(path, '.git')))
@assert_cwd_unchanged
@with_tempfile
def test_AnnexRepo_instance_brand_new(path=None):
GitRepo(path)
assert_raises(RuntimeError, AnnexRepo, path, create=False)
ar = AnnexRepo(path)
assert_is_instance(ar, AnnexRepo, "AnnexRepo was not created.")
ok_(os.path.exists(os.path.join(path, '.git')))
@assert_cwd_unchanged
@with_tempfile
def test_AnnexRepo_crippled_filesystem(dst=None):
ar = AnnexRepo(dst)
# fake git-annex entries in .git/config:
ar.config.set(
"annex.crippledfilesystem",
'true',
scope='local')
ok_(ar.is_crippled_fs())
ar.config.set(
"annex.crippledfilesystem",
'false',
scope='local')
assert_false(ar.is_crippled_fs())
# since we can't remove the entry, just rename it to fake its absence:
ar.config.rename_section("annex", "removed", scope='local')
ar.config.set("annex.something", "value", scope='local')
assert_false(ar.is_crippled_fs())
@known_failure_githubci_win
@with_tempfile
@assert_cwd_unchanged
def test_AnnexRepo_is_direct_mode(path=None):
ar = AnnexRepo(path)
eq_(ar.config.getbool("annex", "direct", False),
ar.is_direct_mode())
@known_failure_githubci_win
@with_tempfile()
def test_AnnexRepo_is_direct_mode_gitrepo(path=None):
repo = GitRepo(path, create=True)
# artificially make .git/annex so no annex section gets initialized
# in .git/config. We did manage somehow to make this happen (via publish)
# but didn't reproduce yet, so just creating manually
mkdir(opj(repo.path, '.git', 'annex'))
ar = AnnexRepo(path, init=False, create=False)
# It is unlikely though that annex would be in direct mode (requires explicit)
# annex magic, without having annex section under .git/config
dm = ar.is_direct_mode()
# no direct mode, ever
assert_false(dm)
# ignore warning since we are testing that function here. Remove upon full deprecation
@pytest.mark.filterwarnings(r"ignore: AnnexRepo.get_file_key\(\) is deprecated")
@assert_cwd_unchanged
@with_tempfile
def test_AnnexRepo_get_file_key(annex_path=None):
ar = AnnexRepo(annex_path)
(ar.pathobj / 'test.dat').write_text('123\n')
ar.save('test.dat', git=True)
(ar.pathobj / 'test-annex.dat').write_text(
"content to be annex-addurl'd")
ar.save('some')
# test-annex.dat should return the correct key:
test_annex_key = \
'SHA256E-s28' \
'--2795fb26981c5a687b9bf44930cc220029223f472cea0f0b17274f4473181e7b.dat'
eq_(ar.get_file_key("test-annex.dat"), test_annex_key)
# and should take a list with an empty string as result, if a file wasn't
# in annex:
eq_(
ar.get_file_key(["filenotpresent.wtf", "test-annex.dat"]),
['', test_annex_key]
)
# test.dat is actually in git
# should raise Exception; also test for polymorphism
assert_raises(IOError, ar.get_file_key, "test.dat")
assert_raises(FileNotInAnnexError, ar.get_file_key, "test.dat")
assert_raises(FileInGitError, ar.get_file_key, "test.dat")
# filenotpresent.wtf doesn't even exist
assert_raises(IOError, ar.get_file_key, "filenotpresent.wtf")
# if we force batch mode, no failure for not present or not annexed files
eq_(ar.get_file_key("filenotpresent.wtf", batch=True), '')
eq_(ar.get_file_key("test.dat", batch=True), '')
eq_(ar.get_file_key("test-annex.dat", batch=True), test_annex_key)
@with_tempfile(mkdir=True)
def test_AnnexRepo_get_outofspace(annex_path=None):
ar = AnnexRepo(annex_path, create=True)
def raise_cmderror(*args, **kwargs):
raise CommandError(
cmd="whatever",
stderr="junk around not enough free space, need 905.6 MB more and after"
)
with patch.object(GitWitlessRunner, 'run_on_filelist_chunks', raise_cmderror) as cma, \
assert_raises(OutOfSpaceError) as cme:
ar.get("file")
exc = cme.value
eq_(exc.sizemore_msg, '905.6 MB')
assert_re_in(".*annex.*(find|get).*needs 905.6 MB more", str(exc), re.DOTALL)
@with_tempfile
@with_tempfile
def test_AnnexRepo_get_remote_na(src=None, path=None):
origin = AnnexRepo(src, create=True)
(origin.pathobj / 'test-annex.dat').write_text("content")
origin.save()
ar = AnnexRepo.clone(src, path)
with assert_raises(RemoteNotAvailableError) as cme:
ar.get('test-annex.dat', options=["--from=NotExistingRemote"])
eq_(cme.value.remote, "NotExistingRemote")
# and similar one whenever invoking with remote parameter
with assert_raises(RemoteNotAvailableError) as cme:
ar.get('test-annex.dat', remote="NotExistingRemote")
eq_(cme.value.remote, "NotExistingRemote")
@with_sameas_remote
def test_annex_repo_sameas_special(repo=None):
remotes = repo.get_special_remotes()
eq_(len(remotes), 2)
rsync_info = [v for v in remotes.values()
if v.get("sameas-name") == "r_rsync"]
eq_(len(rsync_info), 1)
# r_rsync is a sameas remote that points to r_dir. Its sameas-name value
# has been copied under "name".
eq_(rsync_info[0]["name"], rsync_info[0]["sameas-name"])
# 1 is enough to test file_has_content
@with_parametric_batch
@with_tempfile
@with_tempfile
def test_AnnexRepo_file_has_content(src=None, annex_path=None, *, batch):
origin = AnnexRepo(src)
(origin.pathobj / 'test.dat').write_text('123\n')
origin.save('test.dat', git=True)
(origin.pathobj / 'test-annex.dat').write_text("content")
origin.save('some')
ar = AnnexRepo.clone(src, annex_path)
testfiles = ["test-annex.dat", "test.dat"]
eq_(ar.file_has_content(testfiles), [False, False])
ok_annex_get(ar, "test-annex.dat")
eq_(ar.file_has_content(testfiles, batch=batch), [True, False])
eq_(ar.file_has_content(testfiles[:1], batch=batch), [True])
eq_(ar.file_has_content(testfiles + ["bogus.txt"], batch=batch),
[True, False, False])
assert_false(ar.file_has_content("bogus.txt", batch=batch))
ok_(ar.file_has_content("test-annex.dat", batch=batch))
ar.unlock(["test-annex.dat"])
eq_(ar.file_has_content(["test-annex.dat"], batch=batch),
[True])
with open(opj(annex_path, "test-annex.dat"), "a") as ofh:
ofh.write("more")
eq_(ar.file_has_content(["test-annex.dat"], batch=batch),
[False])
# 1 is enough to test
@xfail_buggy_annex_info
@with_parametric_batch
@with_tempfile
@with_tempfile
def test_AnnexRepo_is_under_annex(src=None, annex_path=None, *, batch):
origin = AnnexRepo(src)
(origin.pathobj / 'test-annex.dat').write_text("content")
origin.save('some')
ar = AnnexRepo.clone(src, annex_path)
with open(opj(annex_path, 'not-committed.txt'), 'w') as f:
f.write("aaa")
testfiles = ["test-annex.dat", "not-committed.txt", "INFO.txt"]
# wouldn't change
target_value = [True, False, False]
eq_(ar.is_under_annex(testfiles, batch=batch), target_value)
ok_annex_get(ar, "test-annex.dat")
eq_(ar.is_under_annex(testfiles, batch=batch), target_value)
eq_(ar.is_under_annex(testfiles[:1], batch=batch), target_value[:1])
eq_(ar.is_under_annex(testfiles[1:], batch=batch), target_value[1:])
eq_(ar.is_under_annex(testfiles + ["bogus.txt"], batch=batch),
target_value + [False])
assert_false(ar.is_under_annex("bogus.txt", batch=batch))
ok_(ar.is_under_annex("test-annex.dat", batch=batch))
ar.unlock(["test-annex.dat"])
eq_(ar.is_under_annex(["test-annex.dat"], batch=batch),
[True])
with open(opj(annex_path, "test-annex.dat"), "a") as ofh:
ofh.write("more")
eq_(ar.is_under_annex(["test-annex.dat"], batch=batch),
[False])
@xfail_buggy_annex_info
@with_tree(tree=(('about.txt', 'Lots of abouts'),
('about2.txt', 'more abouts'),
('d', {'sub.txt': 'more stuff'})))
@serve_path_via_http()
@with_tempfile
def test_AnnexRepo_web_remote(sitepath=None, siteurl=None, dst=None):
ar = AnnexRepo(dst, create=True)
testurl = urljoin(siteurl, 'about.txt')
testurl2 = urljoin(siteurl, 'about2.txt')
testurl3 = urljoin(siteurl, 'd/sub.txt')
url_file_prefix = urlsplit(testurl).netloc.split(':')[0]
testfile = '%s_about.txt' % url_file_prefix
testfile2 = '%s_about2.txt' % url_file_prefix
testfile3 = opj('d', 'sub.txt')
# get the file from remote
with swallow_outputs() as cmo:
ar.add_url_to_file(testfile, testurl)
l = ar.whereis(testfile)
assert_in(WEB_SPECIAL_REMOTE_UUID, l)
eq_(len(l), 2)
ok_(ar.file_has_content(testfile))
# output='full'
lfull = ar.whereis(testfile, output='full')
eq_(set(lfull), set(l)) # the same entries
non_web_remote = l[1 - l.index(WEB_SPECIAL_REMOTE_UUID)]
assert_in('urls', lfull[non_web_remote])
eq_(lfull[non_web_remote]['urls'], [])
assert_not_in('uuid', lfull[WEB_SPECIAL_REMOTE_UUID]) # no uuid in the records
eq_(lfull[WEB_SPECIAL_REMOTE_UUID]['urls'], [testurl])
assert_equal(lfull[WEB_SPECIAL_REMOTE_UUID]['description'], 'web')
# --all and --key are incompatible
assert_raises(CommandError, ar.whereis, [testfile], options='--all', output='full', key=True)
# output='descriptions'
ldesc = ar.whereis(testfile, output='descriptions')
eq_(set(ldesc), set([v['description'] for v in lfull.values()]))
# info w/ and w/o fast mode
for fast in [True, False]:
info = ar.info(testfile, fast=fast)
eq_(info['size'], 14)
assert(info['key']) # that it is there
info_batched = ar.info(testfile, batch=True, fast=fast)
eq_(info, info_batched)
# while at it ;)
with swallow_outputs() as cmo:
eq_(ar.info('nonexistent', batch=False), None)
eq_(ar.info('nonexistent-batch', batch=True), None)
eq_(cmo.out, '')
eq_(cmo.err, '')
ar.precommit() # to stop all the batched processes for swallow_outputs
# annex repo info
repo_info = ar.repo_info(fast=False)
eq_(repo_info['local annex size'], 14)
eq_(repo_info['backend usage'], {'SHA256E': 1})
# annex repo info in fast mode
repo_info_fast = ar.repo_info(fast=True)
# doesn't give much testable info, so just comparing a subset for match with repo_info info
eq_(repo_info_fast['semitrusted repositories'], repo_info['semitrusted repositories'])
#import pprint; pprint.pprint(repo_info)
# remove the remote
ar.rm_url(testfile, testurl)
l = ar.whereis(testfile)
assert_not_in(WEB_SPECIAL_REMOTE_UUID, l)
eq_(len(l), 1)
# now only 1 copy; drop should fail
try:
res = ar.drop(testfile)
except CommandError as e:
# there should be at least one result that was captured
# TODO think about a more standard way of accessing such
# records in a CommandError, maybe having a more specialized
# exception derived from CommandError
res = e.kwargs['stdout_json'][0]
eq_(res['command'], 'drop')
eq_(res['success'], False)
assert_in('adjust numcopies', res['note'])
# read the url using different method
ar.add_url_to_file(testfile, testurl)
l = ar.whereis(testfile)
assert_in(WEB_SPECIAL_REMOTE_UUID, l)
eq_(len(l), 2)
ok_(ar.file_has_content(testfile))
# 2 known copies now; drop should succeed
ar.drop(testfile)
l = ar.whereis(testfile)
assert_in(WEB_SPECIAL_REMOTE_UUID, l)
eq_(len(l), 1)
assert_false(ar.file_has_content(testfile))
lfull = ar.whereis(testfile, output='full')
assert_not_in(non_web_remote, lfull) # not present -- so not even listed
# multiple files/urls
# get the file from remote
with swallow_outputs() as cmo:
ar.add_url_to_file(testfile2, testurl2)
# TODO: if we ask for whereis on all files, we should get for all files
lall = ar.whereis('.')
eq_(len(lall), 2)
for e in lall:
assert(isinstance(e, list))
# but we don't know which one for which file. need a 'full' one for that
lall_full = ar.whereis('.', output='full')
ok_(ar.file_has_content(testfile2))
ok_(lall_full[testfile2][non_web_remote]['here'])
eq_(set(lall_full), {testfile, testfile2})
# add a bogus 2nd url to testfile
someurl = "http://example.com/someurl"
ar.add_url_to_file(testfile, someurl, options=['--relaxed'])
lfull = ar.whereis(testfile, output='full')
eq_(set(lfull[WEB_SPECIAL_REMOTE_UUID]['urls']), {testurl, someurl})
# and now test with a file in subdirectory
subdir = opj(dst, 'd')
os.mkdir(subdir)
with swallow_outputs() as cmo:
ar.add_url_to_file(testfile3, url=testurl3)
ok_file_has_content(opj(dst, testfile3), 'more stuff')
eq_(set(ar.whereis(testfile3)), {WEB_SPECIAL_REMOTE_UUID, non_web_remote})
eq_(set(ar.whereis(testfile3, output='full').keys()), {WEB_SPECIAL_REMOTE_UUID, non_web_remote})
# and if we ask for both files
info2 = ar.info([testfile, testfile3])
eq_(set(info2), {testfile, testfile3})
eq_(info2[testfile3]['size'], 10)
full = ar.whereis([], options='--all', output='full')
eq_(len(full.keys()), 3) # we asked for all files -- got 3 keys
assert_in(WEB_SPECIAL_REMOTE_UUID, full['SHA256E-s10--a978713ea759207f7a6f9ebc9eaebd1b40a69ae408410ddf544463f6d33a30e1.txt'])
# which would work even if we cd to that subdir, but then we should use explicit curdir
with chpwd(subdir):
cur_subfile = opj(curdir, 'sub.txt')
eq_(set(ar.whereis(cur_subfile)), {WEB_SPECIAL_REMOTE_UUID, non_web_remote})
eq_(set(ar.whereis(cur_subfile, output='full').keys()), {WEB_SPECIAL_REMOTE_UUID, non_web_remote})
testfiles = [cur_subfile, opj(pardir, testfile)]
info2_ = ar.info(testfiles)
# Should maintain original relative file names
eq_(set(info2_), set(testfiles))
eq_(info2_[cur_subfile]['size'], 10)
@with_tree(tree={"a.txt": "a",
"b": "b",
OBSCURE_FILENAME: "c",
"subdir": {"d": "d", "e": "e"}})
def test_find_batch_equivalence(path=None):
ar = AnnexRepo(path)
files = ["a.txt", "b", OBSCURE_FILENAME]
ar.add(files + ["subdir"])
ar.commit("add files")
query = ["not-there"] + files
expected = {f: f for f in files}
expected.update({"not-there": ""})
eq_(expected, ar.find(query, batch=True))
eq_(expected, ar.find(query))
# If we give a subdirectory, we split that output.
eq_(set(ar.find(["subdir"])["subdir"]), {"subdir/d", "subdir/e"})
eq_(ar.find(["subdir"]), ar.find(["subdir"], batch=True))
# manually ensure that no annex batch processes are around anymore
# that make the test cleanup break on windows.
# story at https://github.com/datalad/datalad/issues/4190
# even an explicit `del ar` does not get it done
ar._batched.close()
@with_tempfile(mkdir=True)
def test_repo_info(path=None):
repo = AnnexRepo(path)
info = repo.repo_info() # works in empty repo without crashing
eq_(info['local annex size'], 0)
eq_(info['size of annexed files in working tree'], 0)
def get_custom(custom={}):
"""Need a helper since repo_info modifies in place so we should generate
new each time
"""
custom_json = {
'available local disk space': 'unknown',
'size of annexed files in working tree': "0",
'success': True,
'command': 'info',
}
if custom:
custom_json.update(custom)
return [custom_json]
with patch.object(
repo, '_call_annex_records',
return_value=get_custom()):
info = repo.repo_info()
eq_(info['available local disk space'], None)
with patch.object(
repo, '_call_annex_records',
return_value=get_custom({
"available local disk space": "19193986496 (+100000 reserved)"})):
info = repo.repo_info()
eq_(info['available local disk space'], 19193986496)
@with_tempfile
@with_tempfile
def test_AnnexRepo_migrating_backends(src=None, dst=None):
origin = AnnexRepo(src)
(origin.pathobj / 'test-annex.dat').write_text("content")
origin.save('some')
ar = AnnexRepo.clone(src, dst, backend='MD5')
eq_(ar.default_backends, ['MD5'])
# GitPython has a bug which causes .git/config being wiped out
# under Python3, triggered by collecting its config instance I guess
gc.collect()
ok_git_config_not_empty(ar) # Must not blow, see https://github.com/gitpython-developers/GitPython/issues/333
filename = get_most_obscure_supported_name()
filename_abs = os.path.join(dst, filename)
f = open(filename_abs, 'w')
f.write("What to write?")
f.close()
ar.add(filename, backend='MD5')
eq_(ar.get_file_backend(filename), 'MD5')
eq_(ar.get_file_backend('test-annex.dat'), 'SHA256E')
# migrating will only do, if file is present
ok_annex_get(ar, 'test-annex.dat')
eq_(ar.get_file_backend('test-annex.dat'), 'SHA256E')
ar.migrate_backend('test-annex.dat')
eq_(ar.get_file_backend('test-annex.dat'), 'MD5')
ar.migrate_backend('', backend='SHA1')
eq_(ar.get_file_backend(filename), 'SHA1')
eq_(ar.get_file_backend('test-annex.dat'), 'SHA1')
tree1args = dict(
tree=(
('firstfile', 'whatever'),
('secondfile', 'something else'),
('remotefile', 'pretends to be remote'),
('faraway', 'incredibly remote')),
)
# keys for files if above tree is generated and added to annex with MD5E backend
tree1_md5e_keys = {
'firstfile': 'MD5E-s8--008c5926ca861023c1d2a36653fd88e2',
'faraway': 'MD5E-s17--5b849ed02f914d3bbb5038fe4e3fead9',
'secondfile': 'MD5E-s14--6c7ba9c5a141421e1c03cb9807c97c74',
'remotefile': 'MD5E-s21--bf7654b3de20d5926d407ea7d913deb0'
}
# this code is only here for documentation purposes
# @with_tree(**tree1args)
# def __test_get_md5s(path):
# # was used just to generate above dict
# annex = AnnexRepo(path, init=True, backend='MD5E')
# files = [basename(f) for f in find_files('.*', path)]
# annex.add(files)
# annex.commit()
# print({f: p['key'] for f, p in annex.get_content_annexinfo(files)})
@with_parametric_batch
@with_tree(**tree1args)
def test_dropkey(path=None, *, batch):
kw = {'batch': batch}
annex = AnnexRepo(path, init=True, backend='MD5E')
files = list(tree1_md5e_keys)
annex.add(files)
annex.commit()
# drop one key
annex.drop_key(tree1_md5e_keys[files[0]], **kw)
# drop multiple
annex.drop_key([tree1_md5e_keys[f] for f in files[1:3]], **kw)
# drop already dropped -- should work as well atm
# https://git-annex.branchable.com/bugs/dropkey_--batch_--json_--force_is_always_succesfull
annex.drop_key(tree1_md5e_keys[files[0]], **kw)
# and a mix with already dropped or not
annex.drop_key(list(tree1_md5e_keys.values()), **kw)
# AnnexRepo is not able to guarantee that all batched processes are
# terminated when test cleanup code runs, avoid a crash (i.e. resource busy)
annex._batched.close()
@with_tree(**tree1args)
@serve_path_via_http()
def test_AnnexRepo_backend_option(path=None, url=None):
ar = AnnexRepo(path, backend='MD5')
# backend recorded in .gitattributes
eq_(ar.get_gitattributes('.')['.']['annex.backend'], 'MD5')
ar.add('firstfile', backend='SHA1')
ar.add('secondfile')
eq_(ar.get_file_backend('firstfile'), 'SHA1')
eq_(ar.get_file_backend('secondfile'), 'MD5')
with swallow_outputs() as cmo:
# must be added under different name since annex 20160114
ar.add_url_to_file('remotefile2', url + 'remotefile', backend='SHA1')
eq_(ar.get_file_backend('remotefile2'), 'SHA1')
with swallow_outputs() as cmo:
ar.add_url_to_file('from_faraway', url + 'faraway', backend='SHA1')
eq_(ar.get_file_backend('from_faraway'), 'SHA1')
@with_tempfile
@with_tempfile
def test_AnnexRepo_get_file_backend(src=None, dst=None):
origin = AnnexRepo(src, create=True)
(origin.pathobj / 'test-annex.dat').write_text("content")
origin.save()
ar = AnnexRepo.clone(src, dst)
eq_(ar.get_file_backend('test-annex.dat'), 'SHA256E')
# no migration
ok_annex_get(ar, 'test-annex.dat', network=False)
ar.migrate_backend('test-annex.dat', backend='SHA1')
eq_(ar.get_file_backend('test-annex.dat'), 'SHA1')
@skip_if_adjusted_branch
@with_tempfile
def test_AnnexRepo_always_commit(path=None):
repo = AnnexRepo(path)
def get_annex_commit_counts():
return len(repo.get_revisions("git-annex"))
n_annex_commits_initial = get_annex_commit_counts()
file1 = get_most_obscure_supported_name() + "_1"
file2 = get_most_obscure_supported_name() + "_2"
with open(opj(path, file1), 'w') as f:
f.write("First file.")
with open(opj(path, file2), 'w') as f:
f.write("Second file.")
# always_commit == True is expected to be default
repo.add(file1)
# Now git-annex log should show the addition:
out_list = list(repo.call_annex_items_(['log']))
eq_(len(out_list), 1)
quote = lambda s: s.replace('"', r'\"')
def assert_in_out(filename, out):
filename_quoted = quote(filename)
if repo._check_version_kludges('quotepath-respected') == "no":
assert_in(filename, out)
elif repo._check_version_kludges('quotepath-respected') == "maybe":
assert filename in out or filename_quoted in out
else:
assert_in(filename_quoted, out)
assert_in_out(file1, out_list[0])
# check git log of git-annex branch:
# expected: initial creation, update (by annex add) and another
# update (by annex log)
eq_(get_annex_commit_counts(), n_annex_commits_initial + 1)
with patch.object(repo, "always_commit", False):
repo.add(file2)
# No additional git commit:
eq_(get_annex_commit_counts(), n_annex_commits_initial + 1)
out = repo.call_annex(['log'])
# And we see only the file before always_commit was set to false:
assert_in_out(file1, out)
assert_not_in(file2, out)
assert_not_in(quote(file2), out)
# With always_commit back to True, do something that will trigger a commit
# on the annex branches.
repo.call_annex(['sync'])
out = repo.call_annex(['log'])
assert_in_out(file1, out)
assert_in_out(file2, out)
# Now git knows as well:
eq_(get_annex_commit_counts(), n_annex_commits_initial + 2)
@with_tempfile
@with_tempfile
def test_AnnexRepo_on_uninited_annex(src=None, path=None):
origin = AnnexRepo(src, create=True)
(origin.pathobj / 'test-annex.dat').write_text("content")
origin.save()
# "Manually" clone to avoid initialization:
runner = Runner()
runner.run(["git", "clone", origin.path, path])
assert_false(exists(opj(path, '.git', 'annex'))) # must not be there for this test to be valid
annex = AnnexRepo(path, create=False, init=False) # so we can initialize without
# and still can get our things
assert_false(annex.file_has_content('test-annex.dat'))
annex.get('test-annex.dat')
ok_(annex.file_has_content('test-annex.dat'))
@assert_cwd_unchanged
@with_tempfile
def test_AnnexRepo_commit(path=None):
ds = AnnexRepo(path, create=True)
filename = opj(path, get_most_obscure_supported_name())
with open(filename, 'w') as f:
f.write("File to add to git")
ds.add(filename, git=True)
assert_raises(AssertionError, assert_repo_status, path, annex=True)
ds.commit("test _commit")
assert_repo_status(path, annex=True)
# nothing to commit doesn't raise by default:
ds.commit()
# but does with careless=False:
assert_raises(CommandError, ds.commit, careless=False)
# committing untracked file raises:
with open(opj(path, "untracked"), "w") as f:
f.write("some")
assert_raises(FileNotInRepositoryError, ds.commit, files="untracked")
# not existing file as well:
assert_raises(FileNotInRepositoryError, ds.commit, files="not-existing")
@with_tempfile
def test_AnnexRepo_add_to_annex(path=None):
repo = AnnexRepo(path)
assert_repo_status(repo, annex=True)
filename = get_most_obscure_supported_name()
filename_abs = opj(repo.path, filename)
with open(filename_abs, "w") as f:
f.write("some")
out_json = repo.add(filename)
# file is known to annex:
ok_(repo.is_under_annex(filename_abs),
"Annexed file is not a link.")
assert_in('key', out_json)
key = repo.get_file_annexinfo(filename)['key']
assert_false(key == '')
assert_equal(key, out_json['key'])
ok_(repo.file_has_content(filename))
# uncommitted:
ok_(repo.dirty)
repo.commit("Added file to annex.")
assert_repo_status(repo, annex=True)
# now using commit/msg options:
filename = "another.txt"
with open(opj(repo.path, filename), "w") as f:
f.write("something else")
repo.add(filename)
repo.commit(msg="Added another file to annex.")
# known to annex:
fileprops = repo.get_file_annexinfo(filename, eval_availability=True)
ok_(fileprops['key'])
ok_(fileprops['has_content'])
# and committed:
assert_repo_status(repo, annex=True)
@with_tempfile
def test_AnnexRepo_add_to_git(path=None):
repo = AnnexRepo(path)
assert_repo_status(repo, annex=True)
filename = get_most_obscure_supported_name()
with open(opj(repo.path, filename), "w") as f:
f.write("some")
repo.add(filename, git=True)
# not in annex, but in git:
eq_(repo.get_file_annexinfo(filename), {})
# uncommitted:
ok_(repo.dirty)
repo.commit("Added file to annex.")
assert_repo_status(repo, annex=True)
# now using commit/msg options:
filename = "another.txt"
with open(opj(repo.path, filename), "w") as f:
f.write("something else")
repo.add(filename, git=True)
repo.commit(msg="Added another file to annex.")
# not in annex, but in git:
eq_(repo.get_file_annexinfo(filename), {})
# and committed:
assert_repo_status(repo, annex=True)
@with_tempfile
@with_tempfile
def test_AnnexRepo_get(src=None, dst=None):
ar = AnnexRepo(src)
(ar.pathobj / 'test-annex.dat').write_text(
"content to be annex-addurl'd")
ar.save('some')
annex = AnnexRepo.clone(src, dst)
assert_is_instance(annex, AnnexRepo, "AnnexRepo was not created.")
testfile = 'test-annex.dat'
testfile_abs = opj(dst, testfile)
assert_false(annex.file_has_content("test-annex.dat"))
with swallow_outputs():
annex.get(testfile)
ok_(annex.file_has_content("test-annex.dat"))
ok_file_has_content(testfile_abs, "content to be annex-addurl'd", strip=True)
called = []
# for some reason yoh failed mock to properly just call original func
orig_run = annex._git_runner.run_on_filelist_chunks
def check_run(cmd, files, **kwargs):
cmd_name = cmd[cmd.index('annex') + 1]
called.append(cmd_name)
if cmd_name == 'find':
assert_not_in('-J5', cmd)
elif cmd_name == 'get':
assert_in('-J5', cmd)
else:
raise AssertionError(
"no other commands so far should be ran. Got %s" % cmd
)
return orig_run(cmd, files, **kwargs)
annex.drop(testfile)
with patch.object(GitWitlessRunner, 'run_on_filelist_chunks',
side_effect=check_run), \
swallow_outputs():
annex.get(testfile, jobs=5)
eq_(called, ['find', 'get'])
ok_file_has_content(testfile_abs, "content to be annex-addurl'd", strip=True)
@with_tree(tree={'file.dat': 'content'})
@with_tempfile
def test_v7_detached_get(opath=None, path=None):
# http://git-annex.branchable.com/bugs/get_fails_to_place_v7_unlocked_file_content_into_the_file_tree_in_v7_in_repo_with_detached_HEAD/
origin = AnnexRepo(opath, create=True, version=7)
GitRepo.add(origin, 'file.dat') # force direct `git add` invocation
origin.commit('added')
AnnexRepo.clone(opath, path)
repo = AnnexRepo(path)
# test getting in a detached HEAD
repo.checkout('HEAD^{}')
repo.call_annex(['upgrade']) # TODO: .upgrade ?
repo.get('file.dat')
ok_file_has_content(op.join(repo.path, 'file.dat'), "content")
# TODO:
#def init_remote(self, name, options):
#def enable_remote(self, name):
@pytest.mark.parametrize("batch", [False, True])
@with_tempfile
@with_tempfile
@with_tempfile
def test_AnnexRepo_get_contentlocation(src=None, path=None, work_dir_outside=None, *, batch):
ar = AnnexRepo(src)
(ar.pathobj / 'test-annex.dat').write_text(
"content to be annex-addurl'd")
ar.save('some')
annex = AnnexRepo.clone(src, path)
fname = 'test-annex.dat'
key = annex.get_file_annexinfo(fname)['key']
# MIH at this point the whole test and get_contentlocation() itself
# is somewhat moot. The above call already has properties like
# 'hashdirmixed', 'hashdirlower', and 'key' from which the location
# could be built.
# with eval_availability=True, it also has 'objloc' with a absolute
# path to a verified annex key location
# TODO: see if we can avoid this or specify custom exception
eq_(annex.get_contentlocation(key, batch=batch), '')
with swallow_outputs() as cmo:
annex.get(fname)
key_location = annex.get_contentlocation(key, batch=batch)
assert(key_location)
if annex.is_managed_branch():
# the rest of the test assumes annexed files being symlinks
return
# they both should point to the same location eventually
eq_((annex.pathobj / fname).resolve(),
(annex.pathobj / key_location).resolve())
# test how it would look if done under a subdir of the annex:
with chpwd(opj(annex.path, 'subdir'), mkdir=True):
key_location = annex.get_contentlocation(key, batch=batch)
# they both should point to the same location eventually
eq_((annex.pathobj / fname).resolve(),
(annex.pathobj / key_location).resolve())
# test how it would look if done under a dir outside of the annex:
with chpwd(work_dir_outside, mkdir=True):
key_location = annex.get_contentlocation(key, batch=batch)
# they both should point to the same location eventually
eq_((annex.pathobj / fname).resolve(),
(annex.pathobj / key_location).resolve())
@known_failure_windows
@with_tree(tree=(('about.txt', 'Lots of abouts'),
('about2.txt', 'more abouts'),
('about2_.txt', 'more abouts_'),
('d', {'sub.txt': 'more stuff'})))
@serve_path_via_http()
@with_tempfile
def test_AnnexRepo_addurl_to_file_batched(sitepath=None, siteurl=None, dst=None):
if dl_cfg.get('datalad.fake-dates'):
raise SkipTest(
"Faked dates are enabled; skipping batched addurl tests")
ar = AnnexRepo(dst, create=True)
testurl = urljoin(siteurl, 'about.txt')
testurl2 = urljoin(siteurl, 'about2.txt')
testurl2_ = urljoin(siteurl, 'about2_.txt')
testurl3 = urljoin(siteurl, 'd/sub.txt')
url_file_prefix = urlsplit(testurl).netloc.split(':')[0]
testfile = 'about.txt'
testfile2 = 'about2.txt'
testfile2_ = 'about2_.txt'
testfile3 = opj('d', 'sub.txt')
# add to an existing but not committed file
# TODO: __call__ of the BatchedAnnex must be checked to be called
copyfile(opj(sitepath, 'about.txt'), opj(dst, testfile))
# must crash sensibly since file exists, we shouldn't addurl to non-annexed files
with assert_raises(AnnexBatchCommandError):
ar.add_url_to_file(testfile, testurl, batch=True)
# Remove it and re-add
unlink(opj(dst, testfile))
ar.add_url_to_file(testfile, testurl, batch=True)
info = ar.info(testfile)
eq_(info['size'], 14)
assert(info['key'])
# not even added to index yet since we this repo is with default batch_size
assert_not_in(WEB_SPECIAL_REMOTE_UUID, ar.whereis(testfile))
# TODO: none of the below should re-initiate the batch process
# add to an existing and staged annex file
copyfile(opj(sitepath, 'about2.txt'), opj(dst, testfile2))
ar.add(testfile2)
ar.add_url_to_file(testfile2, testurl2, batch=True)
assert(ar.info(testfile2))
# not committed yet
# assert_in(WEB_SPECIAL_REMOTE_UUID, ar.whereis(testfile2))
# add to an existing and committed annex file
copyfile(opj(sitepath, 'about2_.txt'), opj(dst, testfile2_))
ar.add(testfile2_)
if ar.is_direct_mode():
assert_in(WEB_SPECIAL_REMOTE_UUID, ar.whereis(testfile))
else:
assert_not_in(WEB_SPECIAL_REMOTE_UUID, ar.whereis(testfile))
ar.commit("added about2_.txt and there was about2.txt lingering around")
# commit causes closing all batched annexes, so testfile gets committed
assert_in(WEB_SPECIAL_REMOTE_UUID, ar.whereis(testfile))
assert(not ar.dirty)
ar.add_url_to_file(testfile2_, testurl2_, batch=True)
assert(ar.info(testfile2_))
assert_in(WEB_SPECIAL_REMOTE_UUID, ar.whereis(testfile2_))
# add into a new file
# filename = 'newfile.dat'
filename = get_most_obscure_supported_name()
# Note: The following line was necessary, since the test setup just
# doesn't work with singletons
# TODO: Singleton mechanic needs a general solution for this
AnnexRepo._unique_instances.clear()
ar2 = AnnexRepo(dst, batch_size=1)
with swallow_outputs():
eq_(len(ar2._batched), 0)
ar2.add_url_to_file(filename, testurl, batch=True)
eq_(len(ar2._batched), 1) # we added one more with batch_size=1
ar2.precommit() # to possibly stop batch process occupying the stdout
ar2.commit("added new file") # would do nothing ATM, but also doesn't fail
assert_in(filename, ar2.get_files())
assert_in(WEB_SPECIAL_REMOTE_UUID, ar2.whereis(filename))
ar.commit("actually committing new files")
assert_in(filename, ar.get_files())
assert_in(WEB_SPECIAL_REMOTE_UUID, ar.whereis(filename))
# this poor bugger still wasn't added since we used default batch_size=0 on him
# and closing the pipes now shouldn't anyhow affect things
eq_(len(ar._batched), 1)
ar._batched.close()
eq_(len(ar._batched), 1) # doesn't remove them, just closes
assert(not ar.dirty)
ar._batched.clear()
eq_(len(ar._batched), 0) # .clear also removes
raise SkipTest("TODO: more, e.g. add with a custom backend")
# TODO: also with different modes (relaxed, fast)
# TODO: verify that file is added with that backend and that we got a new batched process
@with_tree(tree={"foo": "foo content"})
@serve_path_via_http()
@with_tree(tree={"bar": "bar content"})
def test_annexrepo_fake_dates_disables_batched(sitepath=None, siteurl=None, dst=None):
ar = AnnexRepo(dst, create=True, fake_dates=True)
with swallow_logs(new_level=logging.DEBUG) as cml:
ar.add_url_to_file("foo-dst", urljoin(siteurl, "foo"), batch=True)
cml.assert_logged(
msg="Not batching addurl call because fake dates are enabled",
level="DEBUG",
regex=False)
ar.add("bar")
ar.commit("add bar")
key = ar.get_content_annexinfo(["bar"]).popitem()[1]['key']
with swallow_logs(new_level=logging.DEBUG) as cml:
ar.drop_key(key, batch=True)
cml.assert_logged(
msg="Not batching drop_key call because fake dates are enabled",
level="DEBUG",
regex=False)
@with_tempfile(mkdir=True)
def test_annex_backends(path=None):
path = Path(path)
repo_default = AnnexRepo(path / "r_default")
eq_(repo_default.default_backends, None)
repo_kw = AnnexRepo(path / "repo_kw", backend='MD5E')
eq_(repo_kw.default_backends, ['MD5E'])
# persists
repo_kw = AnnexRepo(path / "repo_kw")
eq_(repo_kw.default_backends, ['MD5E'])
repo_config = AnnexRepo(path / "repo_config")
repo_config.config.set("annex.backend", "MD5E", reload=True)
eq_(repo_config.default_backends, ["MD5E"])
repo_compat = AnnexRepo(path / "repo_compat")
repo_compat.config.set("annex.backends", "MD5E WORM", reload=True)
eq_(repo_compat.default_backends, ["MD5E", "WORM"])
# ignore deprecation warnings since here we should not use high level
# interface like push
@pytest.mark.filterwarnings(r"ignore: AnnexRepo.copy_to\(\) is deprecated")
@skip_nomultiplex_ssh # too much of "multiplex" testing
@with_tempfile(mkdir=True)
def test_annex_ssh(topdir=None):
topdir = Path(topdir)
rm1 = AnnexRepo(topdir / "remote1", create=True)
rm2 = AnnexRepo.clone(rm1.path, str(topdir / "remote2"))
rm2.remove_remote(DEFAULT_REMOTE)
main_tmp = AnnexRepo.clone(rm1.path, str(topdir / "main"))
main_tmp.remove_remote(DEFAULT_REMOTE)
repo_path = main_tmp.path
del main_tmp
remote_1_path = rm1.path
remote_2_path = rm2.path
from datalad import ssh_manager
# check whether we are the first to use these sockets:
hash_1 = get_connection_hash('datalad-test')
socket_1 = opj(str(ssh_manager.socket_dir), hash_1)
hash_2 = get_connection_hash('datalad-test2')
socket_2 = opj(str(ssh_manager.socket_dir), hash_2)
datalad_test_was_open = exists(socket_1)
datalad_test2_was_open = exists(socket_2)
# repo to test:AnnexRepo(repo_path)
# At first, directly use git to add the remote, which should be recognized
# by AnnexRepo's constructor
gr = GitRepo(repo_path, create=True)
gr.add_remote("ssh-remote-1", "ssh://datalad-test" + remote_1_path)
ar = AnnexRepo(repo_path, create=False)
# socket was not touched:
if datalad_test_was_open:
ok_(exists(socket_1))
else:
ok_(not exists(socket_1))
# remote interaction causes socket to be created:
(ar.pathobj / "foo").write_text("foo")
(ar.pathobj / "bar").write_text("bar")
ar.add("foo")
ar.add("bar")
ar.commit("add files")
ar.copy_to(["foo"], remote="ssh-remote-1")
# copy_to() opens it if needed.
#
# Note: This isn't racy because datalad-sshrun should not close this itself
# because the connection was either already open before this test or
# copy_to(), not the underlying git-annex/datalad-sshrun call, opens it.
ok_(exists(socket_1))
# add another remote:
ar.add_remote('ssh-remote-2', "ssh://datalad-test2" + remote_2_path)
# socket was not touched:
if datalad_test2_was_open:
# FIXME: occasionally(?) fails in V6:
# ok_(exists(socket_2))
pass
else:
ok_(not exists(socket_2))
# copy to the new remote:
#
# Same racy note as the copy_to() call above.
ar.copy_to(["foo"], remote="ssh-remote-2")
if not exists(socket_2): # pragma: no cover
# @known_failure (marked for grep)
raise SkipTest("test_annex_ssh hit known failure (gh-4781)")
# Check that git-annex is actually using datalad-sshrun.
fail_cmd = quote_cmdlinearg(sys.executable) + "-c 'assert 0'"
with patch.dict('os.environ', {'GIT_SSH_COMMAND': fail_cmd}):
with assert_raises(CommandError):
ar.copy_to(["bar"], remote="ssh-remote-2")
ar.copy_to(["bar"], remote="ssh-remote-2")
ssh_manager.close(ctrl_path=[socket_1, socket_2])
@with_tempfile
def test_annex_remove(path=None):
ar = AnnexRepo(path)
(ar.pathobj / 'test-annex.dat').write_text(
"content to be annex-addurl'd")
ar.save('some')
repo = AnnexRepo(path, create=False)
file_list = list(repo.get_content_annexinfo(init=None))
assert len(file_list) >= 1
# remove a single file
out = repo.remove(str(file_list[0]))
assert_not_in(file_list[0], repo.get_content_annexinfo(init=None))
eq_(out[0], str(file_list[0].relative_to(repo.pathobj)))
with open(opj(repo.path, "rm-test.dat"), "w") as f:
f.write("whatever")
# add it
repo.add("rm-test.dat")
# remove without '--force' should fail, due to staged changes:
assert_raises(CommandError, repo.remove, "rm-test.dat")
assert_in("rm-test.dat", repo.get_annexed_files())
# now force:
out = repo.remove("rm-test.dat", force=True)
assert_not_in("rm-test.dat", repo.get_annexed_files())
eq_(out[0], "rm-test.dat")
@with_tempfile
@with_tempfile
@with_tempfile
def test_repo_version_upgrade(path1=None, path2=None, path3=None):
with swallow_logs(new_level=logging.INFO) as cm:
# Since git-annex 7.20181031, v6 repos upgrade to v7.
# Future proofing: We will test on v6 as long as it is upgradeable,
# but would switch to first upgradeable after
Uversion = 6 if 6 in _GIT_ANNEX_VERSIONS_INFO["upgradable"] \
else _GIT_ANNEX_VERSIONS_INFO["upgradeable"][0]
v_first_supported = next(i for i in _GIT_ANNEX_VERSIONS_INFO["supported"] if i >= Uversion)
annex = AnnexRepo(path1, create=True, version=Uversion)
assert_repo_status(path1, annex=True)
v_upgraded_to = int(annex.config.get('annex.version'))
if external_versions['cmd:annex'] <= '10.20220724':
eq_(v_upgraded_to, v_first_supported)
assert_in("will be upgraded to 8", cm.out)
else:
# 10.20220724-5-g63cef2ae0 started to auto-upgrade to 10, although 8 was the
# lowest supported. In general we can only assert that we upgrade into one
# of the supported
assert_in(v_upgraded_to, _GIT_ANNEX_VERSIONS_INFO["supported"])
assert_in("will be upgraded to %s or later version" % v_first_supported, cm.out)
# default from config item (via env var):
with patch.dict('os.environ', {'DATALAD_REPO_VERSION': str(Uversion)}):
# and check consistency of upgrading to the default version:
annex = AnnexRepo(path2, create=True)
version = int(annex.config.get('annex.version'))
eq_(version, v_upgraded_to)
@pytest.mark.parametrize("version", _GIT_ANNEX_VERSIONS_INFO["supported"])
def test_repo_version_supported(version, tmp_path):
# default from config item (via env var):
Uversion = _GIT_ANNEX_VERSIONS_INFO["upgradable"][0]
with patch.dict('os.environ', {'DATALAD_REPO_VERSION': str(Uversion)}):
# ...parameter `version` still has priority over default config:
annex = AnnexRepo(str(tmp_path), create=True, version=version)
annex_version = int(annex.config.get('annex.version'))
if not annex.is_managed_branch():
# There is no "upgrade" for any of the supported versions.
# if we are not in adjusted branch
eq_(annex_version, version)
else:
print("HERE")
# some annex command might have ran to trigger the update
assert annex_version in {v for v in _GIT_ANNEX_VERSIONS_INFO["supported"] if v >= version}
# ignore deprecation warnings since that is the test testing that functionality
@pytest.mark.filterwarnings(r"ignore: AnnexRepo.copy_to\(\) is deprecated")
@with_tempfile
@with_tempfile
@with_tempfile
def test_annex_copy_to(src=None, origin=None, clone=None):
ar = AnnexRepo(src)
(ar.pathobj / 'test.dat').write_text("123\n")
ar.save('some', git=True)
(ar.pathobj / 'test-annex.dat').write_text("content")
ar.save('some')
repo = AnnexRepo.clone(src, origin)
remote = AnnexRepo.clone(origin, clone)
repo.add_remote("target", clone)
assert_raises(IOError, repo.copy_to, "doesnt_exist.dat", "target")
assert_raises(FileInGitError, repo.copy_to, "test.dat", "target")
assert_raises(ValueError, repo.copy_to, "test-annex.dat", "invalid_target")
# see #3102
# "copying" a dir shouldn't do anything and not raise.
os.mkdir(opj(repo.path, "subdir"))
repo.copy_to("subdir", "target")
# test-annex.dat has no content to copy yet:
eq_(repo.copy_to("test-annex.dat", "target"), [])
repo.get("test-annex.dat")
# now it has:
eq_(repo.copy_to("test-annex.dat", "target"), ["test-annex.dat"])
# and will not be copied again since it was already copied
eq_(repo.copy_to(["test.dat", "test-annex.dat"], "target"), [])
# Test that if we pass a list of items and annex processes them nicely,
# we would obtain a list back. To not stress our tests even more -- let's mock
def ok_copy(command, **kwargs):
# Check that we do pass to annex call only the list of files which we
# asked to be copied
assert_in('copied1', kwargs['files'])
assert_in('copied2', kwargs['files'])
assert_in('existed', kwargs['files'])
return [
{"command":"copy","note":"to target ...", "success":True,
"key":"akey1", "file":"copied1"},
{"command":"copy","note":"to target ...", "success":True,
"key":"akey2", "file":"copied2"},
{"command":"copy","note":"checking target ...", "success":True,
"key":"akey3", "file":"existed"},
]
# Note that we patch _call_annex_records,
# which is in turn invoked first by copy_to for "find" operation.
# TODO: provide a dedicated handling within above ok_copy for 'find' command
with patch.object(repo, '_call_annex_records', ok_copy):
eq_(repo.copy_to(["copied2", "copied1", "existed"], "target"),
["copied1", "copied2"])
# now let's test that we are correctly raising the exception in case if
# git-annex execution fails
orig_run = repo._call_annex
# Kinda a bit off the reality since no nonex* would not be returned/handled
# by _get_expected_files, so in real life -- wouldn't get report about Incomplete!?
def fail_to_copy(command, **kwargs):
if command[0] == 'copy':
# That is not how annex behaves
# http://git-annex.branchable.com/bugs/copy_does_not_reflect_some_failed_copies_in_--json_output/
# for non-existing files output goes into stderr
#
# stderr output depends on config+version of annex, though:
if not dl_cfg.getbool(
section="annex", option="skipunknown",
# git-annex switched default for this config:
default=bool(
external_versions['cmd:annex'] < '10.20220222')):
stderr = "error: pathspec 'nonex1' did not match any file(s) " \
"known to git\n" \
"error: pathspec 'nonex2' did not match any file(s) " \
"known to git\n"
else:
stderr = "git-annex: nonex1 not found\n" \
"git-annex: nonex2 not found\n"
raise CommandError(
"Failed to run ...",
stdout_json=[
{"command":"copy","note":"to target ...", "success":True,
"key":"akey1", "file":"copied"},
{"command":"copy","note":"checking target ...",
"success":True, "key":"akey2", "file":"existed"},
],
stderr=stderr
)
else:
return orig_run(command, **kwargs)
def fail_to_copy_get_expected(files, expr):
assert files == ["copied", "existed", "nonex1", "nonex2"]
return {'akey1': 10}, ["copied"]
with patch.object(repo, '_call_annex', fail_to_copy), \
patch.object(repo, '_get_expected_files', fail_to_copy_get_expected):
with assert_raises(IncompleteResultsError) as cme:
repo.copy_to(["copied", "existed", "nonex1", "nonex2"], "target")
eq_(cme.value.results, ["copied"])
eq_(cme.value.failed, ['nonex1', 'nonex2'])
@with_tempfile
@with_tempfile
def test_annex_drop(src=None, dst=None):
ar = AnnexRepo(src)
(ar.pathobj / 'test-annex.dat').write_text("content")
ar.save('some')
ar = AnnexRepo.clone(src, dst)
testfile = 'test-annex.dat'
assert_false(ar.file_has_content(testfile))
ar.get(testfile)
ok_(ar.file_has_content(testfile))
eq_(len([f for f in ar.fsck(fast=True) if f['file'] == testfile]), 1)
# drop file by name:
result = ar.drop([testfile])
assert_false(ar.file_has_content(testfile))
ok_(isinstance(result, list))
eq_(len(result), 1)
eq_(result[0]['command'], 'drop')
eq_(result[0]['success'], True)
eq_(result[0]['file'], testfile)
ar.get(testfile)
# drop file by key:
testkey = ar.get_file_annexinfo(testfile)['key']
result = ar.drop([testkey], key=True)
assert_false(ar.file_has_content(testfile))
ok_(isinstance(result, list))
eq_(len(result), 1)
eq_(result[0]['command'], 'drop')
eq_(result[0]['success'], True)
eq_(result[0]['key'], testkey)
# insufficient arguments:
assert_raises(TypeError, ar.drop)
assert_raises(InsufficientArgumentsError, ar.drop, [], options=["--jobs=5"])
assert_raises(InsufficientArgumentsError, ar.drop, [])
# too much arguments:
assert_raises(CommandError, ar.drop, ['.'], options=['--all'])
(ar.pathobj / 'somefile.txt').write_text('this')
ar.save()
with assert_raises(CommandError) as e:
ar.drop('somefile.txt')
# CommandError has to pull the errors from the JSON record 'note'
assert_in('necessary cop', str(e.value))
with assert_raises(CommandError) as e:
ar._call_annex_records(['fsck', '-N', '3'])
# CommandError has to pull the errors from the JSON record 'error-messages'
assert_in('1 of 3 trustworthy copies', str(e.value))
@with_tree({"a.txt": "a", "b.txt": "b", "c.py": "c", "d": "d"})
def test_annex_get_annexed_files(path=None):
repo = AnnexRepo(path)
repo.add(".")
repo.commit()
eq_(set(repo.get_annexed_files()), {"a.txt", "b.txt", "c.py", "d"})
repo.drop("a.txt", options=["--force"])
eq_(set(repo.get_annexed_files()), {"a.txt", "b.txt", "c.py", "d"})
eq_(set(repo.get_annexed_files(with_content_only=True)),
{"b.txt", "c.py", "d"})
eq_(set(repo.get_annexed_files(patterns=["*.txt"])),
{"a.txt", "b.txt"})
eq_(set(repo.get_annexed_files(with_content_only=True,
patterns=["*.txt"])),
{"b.txt"})
eq_(set(repo.get_annexed_files(patterns=["*.txt", "*.py"])),
{"a.txt", "b.txt", "c.py"})
eq_(set(repo.get_annexed_files()),
set(repo.get_annexed_files(patterns=["*"])))
eq_(set(repo.get_annexed_files(with_content_only=True)),
set(repo.get_annexed_files(with_content_only=True, patterns=["*"])))
@pytest.mark.parametrize("batch", [True, False])
@with_tree(tree={"test-annex.dat": "content"})
@serve_path_via_http()
@with_tempfile()
@with_tempfile()
def test_is_available(_=None, content_url=None, origpath=None, path=None, *,
batch):
fname = "test-annex.dat"
content_url += "/" + fname
origds = Dataset(origpath).create()
origds.repo.add_url_to_file(fname, content_url)
origds.save()
origds.drop(fname)
annex = clone(origpath, path).repo
# bkw = {'batch': batch}
if batch:
is_available = partial(annex.is_available, batch=batch)
else:
is_available = annex.is_available
key = annex.get_content_annexinfo([fname]).popitem()[1]['key']
# explicit is to verify data type etc
assert is_available(key, key=True) is True
assert is_available(fname) is True
# known remote but doesn't have it
assert is_available(fname, remote=DEFAULT_REMOTE) is False
# If the 'datalad' special remote is present, it will claim fname's URL.
if DATALAD_SPECIAL_REMOTE in annex.get_remotes():
remote = DATALAD_SPECIAL_REMOTE
uuid = DATALAD_SPECIAL_REMOTES_UUIDS[DATALAD_SPECIAL_REMOTE]
else:
remote = "web"
uuid = WEB_SPECIAL_REMOTE_UUID
# it is on the 'web'
assert is_available(fname, remote=remote) is True
# not effective somehow :-/ may be the process already running or smth
# with swallow_logs(), swallow_outputs(): # it will complain!
assert is_available(fname, remote='unknown') is False
assert_false(is_available("boguskey", key=True))
# remove url
urls = annex.whereis(fname, output="full").get(uuid, {}).get("urls", [])
assert(len(urls) == 1)
eq_(urls,
annex.whereis(key, key=True, output="full")
.get(uuid, {}).get("urls"))
annex.rm_url(fname, urls[0])
assert is_available(key, key=True) is False
assert is_available(fname) is False
assert is_available(fname, remote=remote) is False
@with_tempfile(mkdir=True)
def test_get_urls_none(path=None):
ar = AnnexRepo(path, create=True)
with open(opj(ar.path, "afile"), "w") as f:
f.write("content")
eq_(ar.get_urls("afile"), [])
@xfail_buggy_annex_info
@with_tempfile(mkdir=True)
def test_annex_add_no_dotfiles(path=None):
ar = AnnexRepo(path, create=True)
print(ar.path)
assert_true(os.path.exists(ar.path))
assert_false(ar.dirty)
os.makedirs(opj(ar.path, '.datalad'))
# we don't care about empty directories
assert_false(ar.dirty)
with open(opj(ar.path, '.datalad', 'somefile'), 'w') as f:
f.write('some content')
# make sure the repo is considered dirty now
assert_true(ar.dirty) # TODO: has been more detailed assertion (untracked file)
# now add to git, and it should work
ar.add('.', git=True)
# all in index
assert_true(ar.dirty)
# TODO: has been more specific:
# assert_false(ar.repo.is_dirty(
# index=False, working_tree=True, untracked_files=True, submodules=True))
ar.commit(msg="some")
# all committed
assert_false(ar.dirty)
# not known to annex
assert_false(ar.is_under_annex(opj(ar.path, '.datalad', 'somefile')))
@with_tempfile
def test_annex_version_handling_at_min_version(path=None):
with set_annex_version(AnnexRepo.GIT_ANNEX_MIN_VERSION):
po = patch.object(AnnexRepo, '_check_git_annex_version',
side_effect=AnnexRepo._check_git_annex_version)
with po as cmpc:
eq_(AnnexRepo.git_annex_version, None)
ar1 = AnnexRepo(path, create=True)
assert(ar1)
eq_(AnnexRepo.git_annex_version, AnnexRepo.GIT_ANNEX_MIN_VERSION)
eq_(cmpc.call_count, 1)
# 2nd time must not be called
try:
# Note: Remove to cause creation of a new instance
rmtree(path)
except OSError:
pass
ar2 = AnnexRepo(path)
assert(ar2)
eq_(AnnexRepo.git_annex_version, AnnexRepo.GIT_ANNEX_MIN_VERSION)
eq_(cmpc.call_count, 1)
@with_tempfile
def test_annex_version_handling_bad_git_annex(path=None):
with set_annex_version(None):
eq_(AnnexRepo.git_annex_version, None)
with assert_raises(MissingExternalDependency) as cme:
AnnexRepo(path)
linux_distribution_name = get_linux_distribution()[0]
if linux_distribution_name == 'debian':
assert_in("handbook.datalad.org", str(cme.value))
eq_(AnnexRepo.git_annex_version, None)
with set_annex_version('6.20160505'):
eq_(AnnexRepo.git_annex_version, None)
try:
# Note: Remove to cause creation of a new instance
rmtree(path)
except OSError:
pass
assert_raises(OutdatedExternalDependency, AnnexRepo, path)
# and we don't assign it
eq_(AnnexRepo.git_annex_version, None)
# so we could still fail
try:
# Note: Remove to cause creation of a new instance
rmtree(path)
except OSError:
pass
assert_raises(OutdatedExternalDependency, AnnexRepo, path)
@with_tempfile
@with_tempfile
def test_get_description(path1=None, path2=None):
annex1 = AnnexRepo(path1, create=True)
# some content for git-annex branch
create_tree(path1, {'1.dat': 'content'})
annex1.add('1.dat', git=False)
annex1.commit("msg")
annex1_description = annex1.get_description()
assert_not_equal(annex1_description, path1)
annex2 = AnnexRepo(path2, create=True, description='custom 2')
eq_(annex2.get_description(), 'custom 2')
# not yet known
eq_(annex2.get_description(uuid=annex1.uuid), None)
annex2.add_remote('annex1', path1)
annex2.fetch('annex1')
# it will match the remote name
eq_(annex2.get_description(uuid=annex1.uuid),
annex1_description + ' [annex1]')
# add a little probe file to make sure it stays untracked
create_tree(path1, {'probe': 'probe'})
assert_not_in('probe', annex2.get_indexed_files())
annex2.localsync('annex1')
assert_not_in('probe', annex2.get_indexed_files())
# but let's remove the remote
annex2.remove_remote('annex1')
eq_(annex2.get_description(uuid=annex1.uuid), annex1_description)
@with_tempfile(mkdir=True)
@with_tempfile(mkdir=True)
def test_AnnexRepo_flyweight(path1=None, path2=None):
import sys
repo1 = AnnexRepo(path1, create=True)
assert_is_instance(repo1, AnnexRepo)
# Due to issue 4862, we currently still require gc.collect() under unclear
# circumstances to get rid of an exception traceback when creating in an
# existing directory. That traceback references the respective function
# frames which in turn reference the repo instance (they are methods).
# Doesn't happen on all systems, though. Eventually we need to figure that
# out.
# However, still test for the refcount after gc.collect() to ensure we don't
# introduce new circular references and make the issue worse!
gc.collect()
# As long as we don't reintroduce any circular references or produce
# garbage during instantiation that isn't picked up immediately, `repo1`
# should be the only counted reference to this instance.
# Note, that sys.getrefcount reports its own argument and therefore one
# reference too much.
# Python 3.14+ changed internal reference handling - the interpreter now
# "borrows" references when loading objects onto the operand stack instead
# of incrementing refcount, leading to different sys.getrefcount() values.
# Per Python docs: "do not rely on the returned value to be accurate,
# other than a value of 0 or 1". The actual test for circular references
# is whether the object gets garbage collected below (lines ~1805-1825) -
# if circular refs existed, the finalizer wouldn't be called.
if sys.version_info < (3, 14):
assert_equal(1, sys.getrefcount(repo1) - 1)
# instantiate again:
repo2 = AnnexRepo(path1, create=False)
assert_is_instance(repo2, AnnexRepo)
# the very same object:
ok_(repo1 is repo2)
# reference the same in an different way:
with chpwd(path1):
repo3 = AnnexRepo(relpath(path1, start=path2), create=False)
assert_is_instance(repo3, AnnexRepo)
# it's the same object:
ok_(repo1 is repo3)
# but path attribute is absolute, so they are still equal:
ok_(repo1 == repo3)
# Now, let's try to get a GitRepo instance from a path, we already have an
# AnnexRepo of
repo4 = GitRepo(path1)
assert_is_instance(repo4, GitRepo)
assert_not_is_instance(repo4, AnnexRepo)
orig_id = id(repo1)
# Be sure we have exactly one object in memory:
assert_equal(1, len([o for o in gc.get_objects()
if isinstance(o, AnnexRepo) and o.path == path1]))
# But we have two GitRepos in memory (the AnnexRepo and repo4):
assert_equal(2, len([o for o in gc.get_objects()
if isinstance(o, GitRepo) and o.path == path1]))
# deleting one reference doesn't change anything - we still get the same
# thing:
del repo1
gc.collect() # TODO: see first comment above
ok_(repo2 is not None)
ok_(repo2 is repo3)
ok_(repo2 == repo3)
repo1 = AnnexRepo(path1)
eq_(orig_id, id(repo1))
del repo1
del repo2
# for testing that destroying the object calls close() on BatchedAnnex:
class Dummy:
def __init__(self, *args, **kwargs):
self.close_called = False
def close(self):
self.close_called = True
fake_batch = Dummy()
# Killing last reference will lead to garbage collection which will call
# AnnexRepo's finalizer:
with patch.object(repo3._batched, 'close', fake_batch.close):
with swallow_logs(new_level=1) as cml:
del repo3
gc.collect() # TODO: see first comment above
cml.assert_logged(msg="Finalizer called on: AnnexRepo(%s)" % path1,
level="Level 1",
regex=False)
# finalizer called close() on BatchedAnnex:
assert_true(fake_batch.close_called)
# Flyweight is gone:
assert_not_in(path1, AnnexRepo._unique_instances.keys())
# gc doesn't know any instance anymore:
assert_equal([], [o for o in gc.get_objects()
if isinstance(o, AnnexRepo) and o.path == path1])
# GitRepo is unaffected:
assert_equal(1, len([o for o in gc.get_objects()
if isinstance(o, GitRepo) and o.path == path1]))
# new object is created on re-request:
repo1 = AnnexRepo(path1)
assert_equal(1, len([o for o in gc.get_objects()
if isinstance(o, AnnexRepo) and o.path == path1]))
@with_tempfile
@with_tempfile(mkdir=True)
@with_tempfile
def test_AnnexRepo_get_toppath(repo=None, tempdir=None, repo2=None):
AnnexRepo(repo, create=True)
reporeal = str(Path(repo).resolve())
eq_(AnnexRepo.get_toppath(repo, follow_up=False), reporeal)
eq_(AnnexRepo.get_toppath(repo), repo)
# Generate some nested directory
AnnexRepo(repo2, create=True)
repo2real = str(Path(repo2).resolve())
nested = opj(repo2, "d1", "d2")
os.makedirs(nested)
eq_(AnnexRepo.get_toppath(nested, follow_up=False), repo2real)
eq_(AnnexRepo.get_toppath(nested), repo2)
# and if not under git, should return None
eq_(AnnexRepo.get_toppath(tempdir), None)
def test_AnnexRepo_get_submodules():
raise SkipTest("TODO")
@with_tempfile(mkdir=True)
def test_AnnexRepo_dirty(path=None):
repo = AnnexRepo(path, create=True)
ok_(not repo.dirty)
# pure git operations:
# untracked file
with open(opj(path, 'file1.txt'), 'w') as f:
f.write('whatever')
ok_(repo.dirty)
# staged file
repo.add('file1.txt', git=True)
ok_(repo.dirty)
# clean again
repo.commit("file1.txt added")
ok_(not repo.dirty)
# modify to be the same
with open(opj(path, 'file1.txt'), 'w') as f:
f.write('whatever')
ok_(not repo.dirty)
# modified file
with open(opj(path, 'file1.txt'), 'w') as f:
f.write('something else')
ok_(repo.dirty)
# clean again
repo.add('file1.txt', git=True)
repo.commit("file1.txt modified")
ok_(not repo.dirty)
# annex operations:
# untracked file
with open(opj(path, 'file2.txt'), 'w') as f:
f.write('different content')
ok_(repo.dirty)
# annexed file
repo.add('file2.txt', git=False)
ok_(repo.dirty)
# commit
repo.commit("file2.txt annexed")
ok_(not repo.dirty)
repo.unlock("file2.txt")
# Unlocking the file is seen as a modification when we're not already in an
# adjusted branch (for this test, that would be the case if we're on a
# crippled filesystem).
ok_(repo.dirty ^ repo.is_managed_branch())
repo.save()
ok_(not repo.dirty)
subm = AnnexRepo(repo.pathobj / "subm", create=True)
(subm.pathobj / "foo").write_text("foo")
subm.save()
ok_(repo.dirty)
repo.save()
assert_false(repo.dirty)
maybe_adjust_repo(subm)
assert_false(repo.dirty)
@with_tempfile(mkdir=True)
def test_AnnexRepo_set_remote_url(path=None):
ar = AnnexRepo(path, create=True)
ar.add_remote('some', 'http://example.com/.git')
assert_equal(ar.config['remote.some.url'],
'http://example.com/.git')
assert_not_in('remote.some.annexurl', ar.config.keys())
# change url:
ar.set_remote_url('some', 'http://believe.it')
assert_equal(ar.config['remote.some.url'],
'http://believe.it')
assert_not_in('remote.some.annexurl', ar.config.keys())
# set push url:
ar.set_remote_url('some', 'ssh://whatever.ru', push=True)
assert_equal(ar.config['remote.some.pushurl'],
'ssh://whatever.ru')
assert_in('remote.some.annexurl', ar.config.keys())
assert_equal(ar.config['remote.some.annexurl'],
'ssh://whatever.ru')
@with_tempfile(mkdir=True)
def test_wanted(path=None):
ar = AnnexRepo(path, create=True)
eq_(ar.get_preferred_content('wanted'), None)
# test samples with increasing "trickiness"
for v in ("standard",
"include=*.nii.gz or include=*.nii",
"exclude=archive/* and (include=*.dat or smallerthan=2b)"
):
ar.set_preferred_content('wanted', expr=v)
eq_(ar.get_preferred_content('wanted'), v)
# give it some file so clone/checkout works without hiccups
create_tree(ar.path, {'1.dat': 'content'})
ar.add('1.dat')
ar.commit(msg="blah")
# make a clone and see if all cool there
# intentionally clone as pure Git and do not annex init so to see if we
# are ignoring crummy log msgs
ar1_path = ar.path + '_1'
GitRepo.clone(ar.path, ar1_path)
ar1 = AnnexRepo(ar1_path, init=False)
eq_(ar1.get_preferred_content('wanted'), None)
eq_(ar1.get_preferred_content('wanted', DEFAULT_REMOTE), v)
ar1.set_preferred_content('wanted', expr='standard')
eq_(ar1.get_preferred_content('wanted'), 'standard')
@with_tempfile(mkdir=True)
def test_AnnexRepo_metadata(path=None):
# prelude
obscure_name = get_most_obscure_supported_name()
ar = AnnexRepo(path, create=True)
create_tree(
path,
{
'up.dat': 'content',
obscure_name: {
obscure_name + '.dat': 'lowcontent'
}
})
ar.add('.', git=False)
ar.commit('content')
assert_repo_status(path)
# fugue
# doesn't do anything if there is nothing to do
ar.set_metadata('up.dat')
eq_([], list(ar.get_metadata(None)))
eq_([], list(ar.get_metadata('')))
eq_([], list(ar.get_metadata([])))
eq_({'up.dat': {}}, dict(ar.get_metadata('up.dat')))
# basic invocation
eq_(1, len(ar.set_metadata(
'up.dat',
reset={'mike': 'awesome'},
add={'tag': 'awesome'},
remove={'tag': 'awesome'}, # cancels prev, just to use it
init={'virgin': 'true'},
purge=['nothere'])))
# no timestamps by default
md = dict(ar.get_metadata('up.dat'))
deq_({'up.dat': {
'virgin': ['true'],
'mike': ['awesome']}},
md)
# matching timestamp entries for all keys
md_ts = dict(ar.get_metadata('up.dat', timestamps=True))
for k in md['up.dat']:
assert_in('{}-lastchanged'.format(k), md_ts['up.dat'])
assert_in('lastchanged', md_ts['up.dat'])
# recursive needs a flag
assert_raises(CommandError, ar.set_metadata, '.', purge=['virgin'])
ar.set_metadata('.', purge=['virgin'], recursive=True)
deq_({'up.dat': {
'mike': ['awesome']}},
dict(ar.get_metadata('up.dat')))
# Use trickier tags (spaces, =)
ar.set_metadata('.', reset={'tag': 'one and= '}, purge=['mike'], recursive=True)
playfile = opj(obscure_name, obscure_name + '.dat')
target = {
'up.dat': {
'tag': ['one and= ']},
playfile: {
'tag': ['one and= ']}}
deq_(target, dict(ar.get_metadata('.')))
for batch in (True, False):
# no difference in reporting between modes
deq_(target, dict(ar.get_metadata(['up.dat', playfile], batch=batch)))
# incremental work like a set
ar.set_metadata(playfile, add={'tag': 'one and= '})
deq_(target, dict(ar.get_metadata('.')))
ar.set_metadata(playfile, add={'tag': ' two'})
# returned values are sorted
eq_([' two', 'one and= '], dict(ar.get_metadata(playfile))[playfile]['tag'])
# init honor prior values
ar.set_metadata(playfile, init={'tag': 'three'})
eq_([' two', 'one and= '], dict(ar.get_metadata(playfile))[playfile]['tag'])
ar.set_metadata(playfile, remove={'tag': ' two'})
deq_(target, dict(ar.get_metadata('.')))
# remove non-existing doesn't error and doesn't change anything
ar.set_metadata(playfile, remove={'ether': 'best'})
deq_(target, dict(ar.get_metadata('.')))
# add works without prior existence
ar.set_metadata(playfile, add={'novel': 'best'})
eq_(['best'], dict(ar.get_metadata(playfile))[playfile]['novel'])
@with_tree(tree={'file.txt': 'content'})
@serve_path_via_http()
@with_tempfile
def test_AnnexRepo_addurl_batched_and_set_metadata(path=None, url=None, dest=None):
ar = AnnexRepo(dest, create=True)
fname = "file.txt"
ar.add_url_to_file(fname, urljoin(url, fname), batch=True)
ar.set_metadata(fname, init={"number": "one"})
eq_(["one"], dict(ar.get_metadata(fname))[fname]["number"])
@with_tempfile(mkdir=True)
def test_change_description(path=None):
# prelude
ar = AnnexRepo(path, create=True, description='some')
eq_(ar.get_description(), 'some')
# try change it
ar = AnnexRepo(path, create=False, init=True, description='someother')
# this doesn't cut the mustard, still old
eq_(ar.get_description(), 'some')
# need to resort to "internal" helper
ar._init(description='someother')
eq_(ar.get_description(), 'someother')
@with_tempfile
@with_tempfile
def test_AnnexRepo_get_corresponding_branch(src_path=None, path=None):
src = AnnexRepo(src_path, create=True)
(src.pathobj / 'test-annex.dat').write_text("content")
src.save('some')
ar = AnnexRepo.clone(src_path, path)
# we should be on the default branch.
eq_(DEFAULT_BRANCH,
ar.get_corresponding_branch() or ar.get_active_branch())
# special case v6 adjusted branch is not provided by a dedicated build:
ar.adjust()
# as above, we still want to get the default branch, while being on
# 'adjusted/(unlocked)'
eq_('adjusted/{}(unlocked)'.format(DEFAULT_BRANCH),
ar.get_active_branch())
eq_(DEFAULT_BRANCH, ar.get_corresponding_branch())
@with_tempfile
@with_tempfile
def test_AnnexRepo_get_tracking_branch(src_path=None, path=None):
src = AnnexRepo(src_path, create=True)
(src.pathobj / 'test-annex.dat').write_text("content")
src.save('some')
ar = AnnexRepo.clone(src_path, path)
# we want the relation to original branch, e.g. in v6+ adjusted branch
eq_((DEFAULT_REMOTE, 'refs/heads/' + DEFAULT_BRANCH),
ar.get_tracking_branch())
@skip_if_adjusted_branch
@with_tempfile
def test_AnnexRepo_is_managed_branch(path=None):
ar = AnnexRepo(path, create=True)
(ar.pathobj / 'test-annex.dat').write_text("content")
ar.save('some')
ar.adjust()
ok_(ar.is_managed_branch())
@with_tempfile(mkdir=True)
def test_fake_is_not_special(path=None):
ar = AnnexRepo(path, create=True)
# doesn't exist -- we fail by default
assert_raises(RemoteNotAvailableError, ar.is_special_annex_remote, "fake")
assert_false(ar.is_special_annex_remote("fake", check_if_known=False))
@with_tree(tree={"remote": {}, "main": {}, "special": {}})
def test_is_special(path=None):
rem = AnnexRepo(op.join(path, "remote"), create=True)
dir_arg = "directory={}".format(op.join(path, "special"))
rem.init_remote("imspecial",
["type=directory", "encryption=none", dir_arg])
ok_(rem.is_special_annex_remote("imspecial"))
ar = AnnexRepo.clone(rem.path, op.join(path, "main"))
assert_false(ar.is_special_annex_remote(DEFAULT_REMOTE))
assert_false(ar.is_special_annex_remote("imspecial",
check_if_known=False))
ar.enable_remote("imspecial", options=[dir_arg])
ok_(ar.is_special_annex_remote("imspecial"))
# With a mis-configured remote, give warning and return false.
ar.config.unset(f"remote.{DEFAULT_REMOTE}.url", scope="local")
with swallow_logs(new_level=logging.WARNING) as cml:
assert_false(ar.is_special_annex_remote(DEFAULT_REMOTE))
cml.assert_logged(msg=".*no URL.*", level="WARNING", regex=True)
@with_tempfile(mkdir=True)
def test_fake_dates(path=None):
ar = AnnexRepo(path, create=True, fake_dates=True)
timestamp = ar.config.obtain("datalad.fake-dates-start") + 1
# Commits from the "git annex init" call are one second ahead.
for commit in ar.get_branch_commits_("git-annex"):
eq_(timestamp, int(ar.format_commit('%ct', commit)))
assert_in("timestamp={}s".format(timestamp),
ar.call_git(["cat-file", "blob", "git-annex:uuid.log"], read_only=True))
# to prevent regression
# http://git-annex.branchable.com/bugs/v6_-_under_subdir__58___git_add___34__whines__34____44___git_commit___34__blows__34__/
# It is disabled because is not per se relevant to DataLad since we do not
# Since we invoke from the top of the repo, we do not hit it,
# but thought to leave it around if we want to enforce/test system-wide git being
# compatible with annex for v6 mode
@with_tempfile(mkdir=True)
def _test_add_under_subdir(path):
ar = AnnexRepo(path, create=True, version=6)
gr = GitRepo(path) # "Git" view over the repository, so we force "git add"
subdir = opj(path, 'sub')
subfile = opj('sub', 'empty')
# os.mkdir(subdir)
create_tree(subdir, {'empty': ''})
runner = Runner(cwd=subdir)
with chpwd(subdir):
runner.run(['git', 'add', 'empty']) # should add successfully
# gr.commit('important') #
runner.run(['git', 'commit', '-m', 'important'])
ar.is_under_annex(subfile)
# https://github.com/datalad/datalad/issues/2892
@with_tempfile(mkdir=True)
def test_error_reporting(path=None):
ar = AnnexRepo(path, create=True)
res = ar.call_annex_records(['add'], files='gl\\orious BS')
target = {
'command': 'add',
# whole thing, despite space, properly quotes backslash
'file': 'gl\\orious BS',
'note': 'not found',
'success': False
}
assert len(res) >= 1
if 'message-id' in res[0]:
# new since ~ 10.20230407-99-gbe36e208c2
target['message-id'] = 'FileNotFound'
target['input'] = ['gl\\orious BS']
target['error-messages'] = ['git-annex: gl\\orious BS not found']
else:
# our own produced record
target['error-messages'] = ['File unknown to git']
eq_(res, [target])
@with_tree(tree={
'file1': "content1",
'dir1': {'file2': 'content2'},
})
def test_annexjson_protocol(path=None):
ar = AnnexRepo(path, create=True)
ar.save()
assert_repo_status(path)
# first an orderly execution
res = ar._call_annex(
['find', '.', '--json'],
protocol=AnnexJsonProtocol)
for k in ('stdout', 'stdout_json', 'stderr'):
assert_in(k, res)
orig_j = res['stdout_json']
eq_(len(orig_j), 2)
# not meant as an exhaustive check for output structure,
# just some assurance that it is not totally alien
ok_(all(j['file'] for j in orig_j))
# no complaints, unless git-annex is triggered to run in debug mode
if logging.getLogger('datalad.annex').getEffectiveLevel() > 8:
eq_(res['stderr'], '')
# Note: git-annex-find does not error with all annex
# versions. Fixed in annex commit
# ce91f10132805d11448896304821b0aa9c6d9845 (Feb 28, 2022).
if '10.20220222' < external_versions['cmd:annex'] < '10.20220322':
raise SkipTest("zero-exit annex-find bug")
# now the same, but with a forced error
with assert_raises(CommandError) as e:
ar._call_annex(['find', '.', 'error', '--json'],
protocol=AnnexJsonProtocol)
# normal operation is not impaired
eq_(e.value.kwargs['stdout_json'], orig_j)
# we get a clue what went wrong,
# but reporting depends on config + version (default changed):
msg = "pathspec 'error' did not match" if not dl_cfg.getbool(
section="annex", option="skipunknown",
# git-annex switched default for this config:
default=bool(external_versions['cmd:annex'] < '10.20220222')) else \
"error not found"
assert_in(msg, e.value.stderr)
# there should be no errors reported in an individual records
# hence also no pointless statement in the str()
assert_not_in('errors from JSON records', str(e.value))
@with_tempfile
def test_annexjson_protocol_long(path=None, *, caplog):
records = [
{"k": "v" * 20},
# Value based off of
# Lib.asyncio.unix_events._UnixReadPipeTransport.max_size.
{"k": "v" * 256 * 1024},
# and tiny ones in between should not be lost
{"k": "v"},
# even a much larger one - we should handle as well
{"k": "v" * 256 * 1024 * 5},
]
with open(path, 'w') as f:
for record in records:
print("print(%r);" % json.dumps(record), file=f)
runner = GitWitlessRunner()
with caplog.at_level(logging.ERROR), \
swallow_logs(new_level=logging.ERROR):
res = runner.run(
[sys.executable, path],
protocol=AnnexJsonProtocol
)
eq_(res['stdout'], '')
eq_(res['stderr'], '')
eq_(res['stdout_json'], records)
@pytest.mark.parametrize("print_opt", ['', ', end=""'])
@with_tempfile
def test_annexjson_protocol_incorrect(path=None, *, print_opt, caplog):
# Test that we still log some incorrectly formed JSON record
bad_json = '{"I": "am wrong,}'
with open(path, 'w') as f:
print("print(%r%s);" % (bad_json, print_opt), file=f)
runner = GitWitlessRunner()
# caplog only to not cause memory error in case of heavy debugging
# Unfortunately it lacks similar .assert_logged with a regex matching
# to be just used instead
with caplog.at_level(logging.ERROR), \
swallow_logs(new_level=logging.ERROR) as cml:
res = runner.run(
[sys.executable, path],
protocol=AnnexJsonProtocol
)
cml.assert_logged(
msg=".*[rR]eceived undecodable JSON output",
level="ERROR",
regex=True)
# only error logged and nothing returned
eq_(res['stdout'], '')
eq_(res['stderr'], '')
eq_(res['stdout_json'], [])
# see https://github.com/datalad/datalad/pull/5400 for troubleshooting
# for stalling with unlock=False, and then with unlock=True it took >= 300 sec
# https://github.com/datalad/datalad/pull/5433#issuecomment-784470028
@skip_if((on_github or on_travis) and on_nfs) # TODO. stalled on travis, fails on github
# http://git-annex.branchable.com/bugs/cannot_commit___34__annex_add__34__ed_modified_file_which_switched_its_largefile_status_to_be_committed_to_git_now/#comment-bf70dd0071de1bfdae9fd4f736fd1ec
# https://github.com/datalad/datalad/issues/1651
@known_failure_githubci_win
@pytest.mark.parametrize("unlock", [True, False])
@with_tree(tree={
'.gitattributes': "** annex.largefiles=(largerthan=4b)",
'alwaysbig': 'a'*10,
'willgetshort': 'b'*10,
'tobechanged-git': 'a',
'tobechanged-annex': 'a'*10,
})
def test_commit_annex_commit_changed(path=None, *, unlock):
# Here we test commit working correctly if file was just removed
# (not unlocked), edited and committed back
# TODO: an additional possible interaction to check/solidify - if files
# first get unannexed (after being optionally unlocked first)
unannex = False
ar = AnnexRepo(path, create=True)
ar.save(paths=[".gitattributes"], git=True)
ar.save("initial commit")
assert_repo_status(path)
# Now let's change all but commit only some
files = [op.basename(p) for p in glob(op.join(path, '*'))]
if unlock:
ar.unlock(files)
if unannex:
ar.unannex(files)
create_tree(
path
, {
'alwaysbig': 'a'*11,
'willgetshort': 'b',
'tobechanged-git': 'aa',
'tobechanged-annex': 'a'*11,
'untracked': 'unique'
}
, remove_existing=True
)
assert_repo_status(
path
, modified=files if not unannex else ['tobechanged-git']
, untracked=['untracked'] if not unannex else
# all but the one in git now
['alwaysbig', 'tobechanged-annex', 'untracked', 'willgetshort']
)
ar.save("message", paths=['alwaysbig', 'willgetshort'])
assert_repo_status(
path
, modified=['tobechanged-git', 'tobechanged-annex']
, untracked=['untracked']
)
ok_file_under_git(path, 'alwaysbig', annexed=True)
ok_file_under_git(path, 'willgetshort', annexed=False)
ar.save("message2", untracked='no') # commit all changed
assert_repo_status(
path
, untracked=['untracked']
)
ok_file_under_git(path, 'tobechanged-git', annexed=False)
ok_file_under_git(path, 'tobechanged-annex', annexed=True)
_test_unannex_tree = {
OBSCURE_FILENAME: 'content1',
OBSCURE_FILENAME + ".dat": 'content2',
}
if not on_windows and (
external_versions['cmd:annex'] <= '10.20230407' or external_versions['cmd:annex'] >= '10.20230408'
):
# Only whenever we are not within the development versions of the 10.20230407
# where we cannot do version comparison relibalye,
# the case where we have entire filename within ""
_test_unannex_tree[f'"{OBSCURE_FILENAME}"'] = 'content3'
@with_tree(tree=_test_unannex_tree)
def test_unannex_etc(path=None):
# Primarily to test if quote/unquote/not-quote'ing work for tricky
# filenames. Ref: https://github.com/datalad/datalad/pull/7372
repo = AnnexRepo(path)
files = list(_test_unannex_tree)
# here it is through json so kinda guaranteed to work but let's check too
assert files == [x['file'] for x in repo.add(files)]
assert sorted(files) == sorted(repo.get_annexed_files())
assert files == repo.unannex(files)
@slow # 15 + 17sec on travis
@pytest.mark.parametrize("cls", [GitRepo, AnnexRepo])
@with_tempfile(mkdir=True)
def test_files_split_exc(topdir=None, *, cls):
r = cls(topdir)
# absent files -- should not crash with "too long" but some other more
# meaningful exception
files = ["f" * 100 + "%04d" % f for f in range(100000)]
if isinstance(r, AnnexRepo):
# Annex'es add first checks for what is being added and does not fail
# for non existing files either ATM :-/ TODO: make consistent etc
r.add(files)
else:
with assert_raises(Exception) as ecm:
r.add(files)
assert_not_in('too long', str(ecm.value))
assert_not_in('too many', str(ecm.value))
# with 204 (/ + (98+3)*2 + /) chars guaranteed, we hit "filename too long" quickly on windows
# so we are doomed to shorten the filepath for testing on windows. Since the limits are smaller
# on windows (16k vs e.g. 1m on linux in CMD_MAX_ARG), it would already be a "struggle" for it,
# we also reduce number of dirs/files
_ht_len, _ht_n = (48, 20) if on_windows else (98, 100)
_HEAVY_TREE = {
# might already run into 'filename too long' on windows probably
"d" * _ht_len + '%03d' % d: {
# populate with not entirely unique but still not all identical (empty) keys.
# With content unique to that filename we would still get 100 identical
# files for each key, thus possibly hitting regressions in annex like
# https://git-annex.branchable.com/bugs/significant_performance_regression_impacting_datal/
# but also would not hit filesystem as hard as if we had all the keys unique.
'f' * _ht_len + '%03d' % f: str(f)
for f in range(_ht_n)
}
for d in range(_ht_n)
}
# @known_failure_windows # might fail with some older annex `cp` failing to set permissions
@slow # 313s well -- if errors out - only 3 sec
@pytest.mark.parametrize("cls", [GitRepo, AnnexRepo])
@with_tree(tree=_HEAVY_TREE)
def test_files_split(topdir=None, *, cls):
from glob import glob
r = cls(topdir)
dirs = glob(op.join(topdir, '*'))
files = glob(op.join(topdir, '*', '*'))
r.add(files)
r.commit(files=files)
# Let's modify and do dl.add for even a heavier test
# Now do for real on some heavy directory
import datalad.api as dl
for f in files:
os.unlink(f)
with open(f, 'w') as f:
f.write('1')
dl.save(dataset=r.path, path=dirs, result_renderer="disabled")
@skip_if_on_windows
@skip_if_root
@with_tree({
'repo': {
'file1': 'file1',
'file2': 'file2'
}
})
def test_ro_operations(path=None):
# This test would function only if there is a way to run sudo
# non-interactively, e.g. on Travis or on your local (watchout!) system
# after you ran sudo command recently.
run = Runner().run
sudochown = lambda cmd: run(['sudo', '-n', 'chown'] + cmd)
repo = AnnexRepo(op.join(path, 'repo'), init=True)
repo.add('file1')
repo.commit()
# make a clone
repo2 = repo.clone(repo.path, op.join(path, 'clone'))
repo2.get('file1')
# progress forward original repo and fetch (but nothing else) it into repo2
repo.add('file2')
repo.commit()
repo2.fetch(DEFAULT_REMOTE)
# Assure that regardless of umask everyone could read it all
run(['chmod', '-R', 'a+rX', repo2.path])
try:
# To assure that git/git-annex really cannot acquire a lock and do
# any changes (e.g. merge git-annex branch), we make this repo owned by root
sudochown(['-R', 'root', repo2.path])
except Exception as exc:
# Exception could be CommandError or IOError when there is no sudo
raise SkipTest("Cannot run sudo chown non-interactively: %s" % exc)
# recent git would refuse to run git status in repository owned by someone else
# which could lead to odd git-annex errors before 10.20220504-55-gaf0d85446 AKA 10.20220525~13
# see https://github.com/datalad/datalad/issues/5665 and after an informative error
# https://github.com/datalad/datalad/issues/6708
# To overcome - explicitly add the path into allowed
dl_cfg.add('safe.directory', repo2.path, scope='global')
try:
assert not repo2.get('file1') # should work since file is here already
repo2.status() # should be Ok as well
# and we should get info on the file just fine
assert repo2.info('file1')
# The tricky part is the repo_info which might need to update
# remotes UUID -- by default it should fail!
# Oh well -- not raised on travis... whatever for now
#with assert_raises(CommandError):
# repo2.repo_info()
# but should succeed if we disallow merges
repo2.repo_info(merge_annex_branches=False)
# and ultimately the ls which uses it
try:
from datalad.api import ls
ls(repo2.path, all_=True, long_=True)
except ImportError:
raise SkipTest(
"No `ls` command available (provided by -deprecated extension)")
finally:
sudochown(['-R', str(os.geteuid()), repo2.path])
# just check that all is good again
repo2.repo_info()
@skip_if_on_windows
@skip_if_root
@with_tree({
'file1': 'file1',
})
def test_save_noperms(path=None):
# check that we do report annex error messages
# This test would function only if there is a way to run sudo
# non-interactively, e.g. on Travis or on your local (watchout!) system
# after you ran sudo command recently.
repo = AnnexRepo(path, init=True)
run = Runner().run
sudochown = lambda cmd: run(['sudo', '-n', 'chown'] + cmd)
try:
# To assure that git/git-annex really cannot acquire a lock and do
# any changes (e.g. merge git-annex branch), we make this repo owned by root
sudochown(['-R', 'root:root', str(repo.pathobj / 'file1')])
except Exception as exc:
# Exception could be CommandError or IOError when there is no sudo
raise SkipTest("Cannot run sudo chown non-interactively: %s" % exc)
try:
repo.save(paths=['file1'])
except CommandError as exc:
res = exc.kwargs["stdout_json"]
assert_result_count(res, 1)
assert_result_count(res, 1, file='file1',
command='add', success=False)
assert_in('permission denied', res[0]['error-messages'][0])
finally:
sudochown(['-R', str(os.geteuid()), repo.path])
def test_get_size_from_key():
# see https://git-annex.branchable.com/internals/key_format/
# BACKEND[-sNNNN][-mNNNN][-SNNNN-CNNNN]--NAME
test_keys = {"ANYBACKEND--NAME": None,
"ANYBACKEND-s123-m1234--NAME-WITH-DASHES.ext": 123,
"MD5E-s100-S10-C1--somen.ame": 10,
"SHA256-s99-S10-C10--name": 9,
"SHA256E-sNaN--name": None, # debatable: None or raise?
}
invalid = ["ANYBACKEND-S10-C30--missing-total",
"s99-S10-C10--NOBACKEND",
"MD5-s100-S5--no-chunk-number"]
for key in invalid:
assert_raises(ValueError, AnnexRepo.get_size_from_key, key)
for key, value in test_keys.items():
eq_(AnnexRepo.get_size_from_key(key), value)
@with_tempfile(mkdir=True)
def test_call_annex(path=None):
ar = AnnexRepo(path, create=True)
# we raise on mistakes
with assert_raises(CommandError):
ar._call_annex(['not-an-annex-command'])
# and we get to know why
try:
ar._call_annex(['not-an-annex-command'])
except CommandError as e:
assert_in('Invalid argument', e.stderr)
@with_tempfile
def test_whereis_zero_copies(path=None):
repo = AnnexRepo(path, create=True)
(repo.pathobj / "foo").write_text("foo")
repo.save()
repo.drop(["foo"], options=["--force"])
for output in "full", "uuids", "descriptions":
res = repo.whereis(files=["foo"], output=output)
if output == "full":
assert_equal(res["foo"], {})
else:
assert_equal(res, [[]])
@with_tempfile(mkdir=True)
def test_whereis_batch_eqv(path=None):
path = Path(path)
repo_a = AnnexRepo(path / "a", create=True)
(repo_a.pathobj / "foo").write_text("foo")
(repo_a.pathobj / "bar").write_text("bar")
(repo_a.pathobj / "baz").write_text("baz")
repo_a.save()
repo_b = repo_a.clone(repo_a.path, str(path / "b"))
repo_b.drop(["bar"])
repo_b.drop(["baz"])
repo_b.drop(["baz"], options=["--from=" + DEFAULT_REMOTE, "--force"])
files = ["foo", "bar", "baz"]
info = repo_b.get_content_annexinfo(files)
keys = [info[repo_b.pathobj / f]['key'] for f in files]
for output in "full", "uuids", "descriptions":
out_non_batch = repo_b.whereis(files=files, batch=False, output=output)
assert_equal(out_non_batch,
repo_b.whereis(files=files, batch=True, output=output))
out_non_batch_keys = repo_b.whereis(files=keys, batch=False, key=True, output=output)
# should be identical
if output == 'full':
# we need to map files to keys though
assert_equal(out_non_batch_keys,
{k: out_non_batch[f] for f, k in zip(files, keys)})
else:
assert_equal(out_non_batch, out_non_batch_keys)
# --batch-keys support was introduced
assert_equal(out_non_batch_keys,
repo_b.whereis(files=keys, batch=True, key=True, output=output))
def test_done_deprecation():
with unittest.mock.patch("datalad.cmd.warnings.warn") as warn_mock:
_ = AnnexJsonProtocol("done")
warn_mock.assert_called_once()
with unittest.mock.patch("datalad.cmd.warnings.warn") as warn_mock:
_ = AnnexJsonProtocol()
warn_mock.assert_not_called()
def test_generator_annex_json_protocol():
runner = Runner()
stdin_queue = Queue()
def json_object(count: int):
json_template = '{{"id": "some-id", "count": {count}}}'
return json_template.format(count=count).encode()
count = 123
stdin_queue.put(json_object(count=count))
for result in runner.run(cmd="cat", protocol=GeneratorAnnexJsonProtocol, stdin=stdin_queue):
assert_equal(
result,
{
"id": "some-id",
"count": count,
}
)
if count == 133:
break
count += 1
stdin_queue.put(json_object(count=count))
def test_captured_exception():
class RaiseMock:
def add_(self, *args, **kwargs):
raise CommandError("RaiseMock.add_")
with patch("datalad.support.annexrepo.super") as repl_super:
repl_super.return_value = RaiseMock()
gen = AnnexRepo.add_(object(), [])
assert_raises(CommandError, gen.send, None)
@skip_if_on_windows
def test_stderr_rejecting_protocol_trigger():
result_generator = GitWitlessRunner().run(
"echo ssss >&2",
protocol=GeneratorAnnexJsonNoStderrProtocol)
try:
tuple(result_generator)
except CommandError as e:
assert_in("ssss", e.stderr)
return
assert_true(False)
@skip_if_on_windows
def test_stderr_rejecting_protocol_ignore():
result_generator = GitWitlessRunner().run(
['echo', '{"status": "ok"}'],
protocol=GeneratorAnnexJsonNoStderrProtocol)
assert_equal(tuple(result_generator), ({"status": "ok"},))
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_ansi_colors.py 0000644 0001751 0001751 00000010067 15137634221 022760 0 ustar 00runner runner # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test ANSI color tools """
import os
from unittest.mock import patch
from datalad.support import ansi_colors as colors
from datalad.tests.utils_pytest import (
assert_equal,
patch_config,
)
def test_color_enabled():
# In the absence of NO_COLOR, follow ui.color, or ui.is_interactive if 'auto'
with patch.dict(os.environ), \
patch('datalad.support.ansi_colors.ui'):
os.environ.pop('NO_COLOR', None)
for is_interactive in (True, False):
colors.ui.is_interactive = is_interactive
with patch_config({'datalad.ui.color': 'off'}):
assert_equal(colors.color_enabled(), False)
with patch_config({'datalad.ui.color': 'on'}):
assert_equal(colors.color_enabled(), True)
with patch_config({'datalad.ui.color': 'auto'}):
assert_equal(colors.color_enabled(), is_interactive)
# In the presence of NO_COLOR, default to disable, unless ui.color is "on"
# The value of NO_COLOR should have no effect, so try true-ish and false-ish values
for NO_COLOR in ("", "1", "0"):
with patch.dict(os.environ, {'NO_COLOR': NO_COLOR}), \
patch('datalad.support.ansi_colors.ui'):
for is_interactive in (True, False):
colors.ui.is_interactive = is_interactive
with patch_config({'datalad.ui.color': 'on'}):
assert_equal(colors.color_enabled(), True)
for ui_color in ('off', 'auto'):
with patch_config({'datalad.ui.color': ui_color}):
assert_equal(colors.color_enabled(), False)
#
# In all other tests, just patch color_enabled
#
def test_format_msg():
fmt = r'a$BOLDb$RESETc$BOLDd$RESETe'
for enabled in (True, False):
with patch('datalad.support.ansi_colors.color_enabled', lambda: enabled):
assert_equal(colors.format_msg(fmt), 'abcde')
assert_equal(colors.format_msg(fmt, use_color=False), 'abcde')
with patch('datalad.support.ansi_colors.color_enabled', lambda: False):
for use_color in (True, False):
assert_equal(colors.format_msg(fmt), 'abcde')
assert_equal(colors.format_msg(fmt, use_color=use_color), 'abcde')
with patch('datalad.support.ansi_colors.color_enabled', lambda: True):
assert_equal(colors.format_msg(fmt, use_color=True), 'a\033[1mb\033[0mc\033[1md\033[0me')
def test_color_word():
s = 'word'
green_s = '\033[1;32mword\033[0m'
for enabled in (True, False):
with patch('datalad.support.ansi_colors.color_enabled', lambda: enabled):
assert_equal(colors.color_word(s, colors.GREEN, force=True), green_s)
with patch('datalad.support.ansi_colors.color_enabled', lambda: True):
assert_equal(colors.color_word(s, colors.GREEN), green_s)
assert_equal(colors.color_word(s, colors.GREEN, force=False), green_s)
with patch('datalad.support.ansi_colors.color_enabled', lambda: False):
assert_equal(colors.color_word(s, colors.GREEN), s)
assert_equal(colors.color_word(s, colors.GREEN, force=False), s)
def test_color_status():
# status -> (plain, colored)
statuses = {
'ok': ('ok', '\033[1;32mok\033[0m'),
'notneeded': ('notneeded', '\033[1;32mnotneeded\033[0m'),
'impossible': ('impossible', '\033[1;33mimpossible\033[0m'),
'error': ('error', '\033[1;31merror\033[0m'),
'invalid': ('invalid', 'invalid'),
}
for enabled in (True, False):
with patch('datalad.support.ansi_colors.color_enabled', lambda: enabled):
for status, retopts in statuses.items():
assert_equal(colors.color_status(status), retopts[enabled])
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_cache.py 0000644 0001751 0001751 00000001234 15137634221 021504 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from ...tests.utils_pytest import assert_equal
from ..cache import DictCache
def test_DictCache():
d = DictCache(size_limit=2)
assert_equal(d, {})
d['a'] = 2
d['b'] = 1
assert_equal(d, {'a': 2, 'b': 1})
d['c'] = 2
assert_equal(d, {'c': 2, 'b': 1})
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_captured_exception.py 0000644 0001751 0001751 00000010434 15137634221 024330 0 ustar 00runner runner import sys
from unittest.mock import patch
from datalad import cfg
from datalad.support.exceptions import (
CapturedException,
format_exception_with_cause,
)
from datalad.tests.utils_pytest import (
assert_equal,
assert_re_in,
assert_true,
)
def test_CapturedException():
try:
raise Exception("BOOM")
except Exception as e:
captured_exc = CapturedException(e)
assert_re_in(r"BOOM \[test_captured_exception.py:test_CapturedException:[0-9]+\]", captured_exc.format_oneline_tb())
assert_re_in(r"^\[.*\]", captured_exc.format_oneline_tb(include_str=False)) # only traceback
try:
raise NotImplementedError
except Exception as e:
captured_exc = CapturedException(e)
assert_re_in(r"NotImplementedError \[test_captured_exception.py:test_CapturedException:[0-9]+\]", captured_exc.format_oneline_tb())
def f():
def f2():
raise Exception("my bad again")
try:
f2()
except Exception as e:
# exception chain
raise RuntimeError("new message") from e
try:
f()
except Exception as e:
captured_exc = CapturedException(e)
# default limit: one level:
estr1 = captured_exc.format_oneline_tb(limit=1)
estr2 = captured_exc.format_oneline_tb(limit=2)
# and we can control it via environ/config by default
try:
with patch.dict('os.environ', {'DATALAD_EXC_STR_TBLIMIT': '3'}):
cfg.reload()
estr3 = captured_exc.format_oneline_tb()
with patch.dict('os.environ', {}, clear=True):
cfg.reload()
estr_ = captured_exc.format_oneline_tb()
finally:
cfg.reload() # make sure we don't have a side effect on other tests
estr_full = captured_exc.format_oneline_tb(10)
assert_re_in(r"new message -caused by- my bad again \[test_captured_exception.py:test_CapturedException:[0-9]+,test_captured_exception.py:f:[0-9]+,test_captured_exception.py:f:[0-9]+,test_captured_exception.py:f2:[0-9]+\]", estr_full)
assert_re_in(r"new message -caused by- my bad again \[test_captured_exception.py:f:[0-9]+,test_captured_exception.py:f:[0-9]+,test_captured_exception.py:f2:[0-9]+\]", estr3)
assert_re_in(r"new message -caused by- my bad again \[test_captured_exception.py:f:[0-9]+,test_captured_exception.py:f2:[0-9]+\]", estr2)
assert_re_in(r"new message -caused by- my bad again \[test_captured_exception.py:f2:[0-9]+\]", estr1)
# default: no limit:
assert_equal(estr_, estr_full)
# standard output
full_display = captured_exc.format_standard().splitlines()
assert_equal(full_display[0], "Traceback (most recent call last):")
# points in f and f2 for first exception with two lines each
# (where is the line and what reads the line):
assert full_display[1].lstrip().startswith("File")
assert full_display[2].strip() == "f2()"
inc = int(sys.version_info >= (3, 13))
assert full_display[3 + inc].lstrip().startswith("File")
assert full_display[4 + inc].strip() == "raise Exception(\"my bad again\")"
assert full_display[5 + inc].strip() == "Exception: my bad again"
assert full_display[7 + inc].strip() == "The above exception was the direct cause of the following exception:"
assert full_display[9 + inc] == "Traceback (most recent call last):"
# ...
assert full_display[-1].strip() == "RuntimeError: new message"
# CapturedException.__repr__:
assert_re_in(r".*test_captured_exception.py:f2:[0-9]+\]$",
captured_exc.__repr__())
def makeitraise():
def raise_valueerror():
try:
raise_runtimeerror()
except Exception as e:
raise ValueError from e
def raise_runtimeerror():
raise RuntimeError("Mike")
try:
raise_valueerror()
except Exception as e:
raise RuntimeError from e
def test_format_exception_with_cause():
try:
makeitraise()
except Exception as e:
assert_equal(
format_exception_with_cause(e),
'RuntimeError -caused by- ValueError -caused by- Mike')
# make sure it also works with TracebackException/CapturedException:
ce = CapturedException(e)
assert_equal(
ce.format_with_cause(),
'RuntimeError -caused by- ValueError -caused by- Mike')
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_cookies.py 0000644 0001751 0001751 00000003667 15137634221 022111 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from datalad.support import path as op
from datalad.tests.utils_pytest import (
assert_equal,
known_failure_githubci_win,
with_tempfile,
)
from datalad.utils import rmtree
from ..cookies import CookiesDB
@known_failure_githubci_win
@with_tempfile(mkdir=True)
def test_no_blows(cookiesdir=None):
cookies = CookiesDB(op.join(cookiesdir, 'mycookies'))
# set the cookie
cookies['best'] = 'mine'
assert_equal(cookies['best'], 'mine')
"""
Somehow this manages to trigger on conda but not on debian for me
File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/shelve.py", line 125, in __setitem__
self.dict[key.encode(self.keyencoding)] = f.getvalue()
File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 216, in __setitem__
self._index[key] = self._setval(pos, val)
File "/home/yoh/anaconda-2018.12-3.7/envs/test-gitpython/lib/python3.7/dbm/dumb.py", line 178, in _setval
with _io.open(self._datfile, 'rb+') as f:
FileNotFoundError: [Errno 2] No such file or directory: '/home/yoh/.tmp/datalad_temp_test_no_blowsalnsw_wk/mycookies.dat'
on Debian (python 3.7.3~rc1-1) I just get a warning: BDB3028 /home/yoh/.tmp/datalad_temp_test_no_blows58tdg67s/mycookies.db: unable to flush: No such file or directory
"""
try:
rmtree(cookiesdir)
except OSError:
# on NFS directory might still be open, so .nfs* lock file would prevent
# removal, but it shouldn't matter and .close should succeed
pass
cookies.close()
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_digests.py 0000644 0001751 0001751 00000003755 15137634221 022115 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from os.path import join as opj
from datalad.tests.utils_pytest import (
assert_equal,
with_tree,
)
from ..digests import Digester
@with_tree(tree={'sample.txt': '123',
'0': chr(0),
'long.txt': '123abz\n'*1000000})
def test_digester(path=None):
digester = Digester()
assert_equal(
digester(opj(path, 'sample.txt')),
{
'md5': '202cb962ac59075b964b07152d234b70',
'sha1': '40bd001563085fc35165329ea1ff5c5ecbdbbeef',
'sha256': 'a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3',
'sha512': '3c9909afec25354d551dae21590bb26e38d53f2173b8d3dc3eee4c047e7ab1c1eb8b85103e3be7ba613b31bb5c9c36214dc9f14a42fd7a2fdb84856bca5c44c2'
})
assert_equal(
digester(opj(path, '0')),
{
'md5': '93b885adfe0da089cdf634904fd59f71',
'sha1': '5ba93c9db0cff93f52b521d7420e43f6eda2784f',
'sha256': '6e340b9cffb37a989ca544e6bb780a2c78901d3fb33738768511a30617afa01d',
'sha512': 'b8244d028981d693af7b456af8efa4cad63d282e19ff14942c246e50d9351d22704a802a71c3580b6370de4ceb293c324a8423342557d4e5c38438f0e36910ee',
})
assert_equal(
digester(opj(path, 'long.txt')),
{
'md5': '81b196e3d8a1db4dd2e89faa39614396',
'sha1': '5273ac6247322c3c7b4735a6d19fd4a5366e812f',
'sha256': '80028815b3557e30d7cbef1d8dbc30af0ec0858eff34b960d2839fd88ad08871',
'sha512': '684d23393eee455f44c13ab00d062980937a5d040259d69c6b291c983bf635e1d405ff1dc2763e433d69b8f299b3f4da500663b813ce176a43e29ffcc31b0159'
})
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_due_utils.py 0000644 0001751 0001751 00000004754 15137634221 022450 0 ustar 00runner runner import logging
from unittest.mock import patch
from ...distribution import dataset as dataset_mod
from ...distribution.dataset import Dataset
from ...tests.utils_pytest import (
swallow_logs,
with_tempfile,
)
from ..due import (
Doi,
Text,
due,
)
from ..due_utils import duecredit_dataset
from ..external_versions import external_versions
@with_tempfile(mkdir=True)
def test_duecredit_dataset(path=None):
dataset = Dataset(path)
# Verify that we do not call duecredit_dataset if due is not enabled
# Seems can't patch.object.enabled so we will just test differently
# depending on either enabled or not
if not due.active:
with patch.object(dataset_mod, 'duecredit_dataset') as cmdc:
dataset.create()
cmdc.assert_not_called()
else:
with patch.object(dataset_mod, 'duecredit_dataset') as cmdc:
dataset.create()
cmdc.assert_called_once_with(dataset)
# note: doesn't crash even if we call it incorrectly (needs dataset)
duecredit_dataset()
# No metadata -- no citation ATM.
# TODO: possibly reconsider - may be our catch-all should be used there
# as well
with patch.object(due, 'cite') as mcite:
with swallow_logs(new_level=logging.DEBUG) as cml:
duecredit_dataset(dataset) # should not crash or anything
# since no metadata - we issue warning and return without citing
# anything
cml.assert_logged(
regex='.*Failed to obtain metadata.*Will not provide duecredit.*'
)
mcite.assert_not_called()
# Below we will rely on duecredit Entries being comparable, so if
# duecredit is available and does not provide __cmp__ we make it for now
# Whenever https://github.com/duecredit/duecredit/pull/148 is merged, and
# probably 0.7.1 released - we will eventually remove this monkey patching.
# Checking if __eq__ was actually provided seems tricky on py2, so decided
# to just do version comparison
try:
if external_versions['duecredit'] < '0.7.1':
from duecredit.entries import DueCreditEntry
def _entry_eq(self, other):
return (
(self._rawentry == other._rawentry) and
(self._key == other._key)
)
DueCreditEntry.__eq__ = _entry_eq
except:
# assume that not present so donothing stubs would be used, and
# we will just compare Nones
pass
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_extensions.py 0000644 0001751 0001751 00000003337 15137634221 022646 0 ustar 00runner runner from datalad.tests.utils_pytest import (
assert_in,
assert_raises,
eq_,
nok_,
ok_,
)
from ..extensions import (
has_config,
register_config,
)
def test_register_config():
nok_(has_config('datalad.testdummies.invalid'))
assert_raises(
ValueError,
register_config,
'datalad.testdummies.invalid',
title=None,
dialog='yesno')
nok_(has_config('datalad.testdummies.invalid'))
cfgkey = 'datalad.testdummies.try1'
nok_(has_config(cfgkey))
register_config(
cfgkey,
'This is what happens, when you do not listen to mama!',
default_fn=lambda: 5,
description='Try on-access default "computation"',
type=int,
dialog='question',
scope='global',
)
from datalad.interface.common_cfg import definitions
assert_in(cfgkey, definitions)
# same thing, other part of the API
assert_in(cfgkey, definitions.keys())
# and yet another
assert_in(cfgkey, [k for k, v in definitions.items()])
# one more still
assert_in(cfgkey, [k for k in definitions])
# more smoke testing, we must have at least this one
ok_(len(definitions))
df = definitions[cfgkey]
# on access default computation
eq_(df['default'], 5)
# we could set any novel property
df['novel'] = 'unexpected'
eq_(df.get('novel'), 'unexpected')
eq_(df.get('toonovel'), None)
# smoke test str/repr
assert_in('mama', str(df))
assert_in('mama', repr(df))
# internal data structure for UI was assembled
assert_in('ui', df)
# more smoke
assert_in('ui', df.keys())
assert_in('ui', [k for k in df])
nkeys = len(df)
df.update(funky='seven')
eq_(len(df), nkeys + 1)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_external_versions.py 0000644 0001751 0001751 00000024103 15137634221 024213 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import logging
from os import linesep
import pytest
from datalad import __version__
from datalad.cmd import (
StdOutErrCapture,
WitlessRunner,
)
from datalad.support.annexrepo import AnnexRepo
from datalad.support.exceptions import (
CommandError,
MissingExternalDependency,
OutdatedExternalDependency,
)
from datalad.tests.utils_pytest import (
SkipTest,
assert_equal,
assert_false,
assert_greater,
assert_greater_equal,
assert_in,
assert_not_in,
assert_raises,
assert_true,
create_tree,
patch,
set_annex_version,
swallow_logs,
with_tempfile,
)
from ..external_versions import (
ExternalVersions,
LooseVersion,
)
# just to ease testing
def cmp(a, b):
return (a > b) - (a < b)
def test_external_versions_basic():
ev = ExternalVersions()
our_module = 'datalad'
assert_equal(ev.versions, {})
assert_equal(ev[our_module], __version__)
# and it could be compared
assert_greater_equal(ev[our_module], __version__)
# We got some odd failure in this test not long are after switching to versionner
# https://github.com/datalad/datalad/issues/5785. Verify that we do get expected
# data types
our_version = ev[our_module].version
assert isinstance(our_version, (str, list)), f"Got {our_version!r} of type {type(our_version)}"
assert_greater(ev[our_module], '0.1')
assert_equal(list(ev.keys()), [our_module])
assert_true(our_module in ev)
assert_false('unknown' in ev)
# all are LooseVersions now
assert_true(isinstance(ev[our_module], LooseVersion))
version_str = __version__
assert_equal(ev.dumps(), "Versions: %s=%s" % (our_module, version_str))
# For non-existing one we get None
assert_equal(ev['custom__nonexisting'], None)
# and nothing gets added to _versions for nonexisting
assert_equal(set(ev.versions.keys()), {our_module})
# but if it is a module without version, we get it set to UNKNOWN
assert_equal(ev['os'], ev.UNKNOWN)
# And get a record on that inside
assert_equal(ev.versions.get('os'), ev.UNKNOWN)
# And that thing is "True", i.e. present
assert(ev['os'])
# but not comparable with anything besides itself (was above)
assert_raises(TypeError, cmp, ev['os'], '0')
assert_raises(TypeError, assert_greater, ev['os'], '0')
return
## Code below is from original duecredit, and we don't care about
## testing this one
## And we can get versions based on modules themselves
#from datalad.tests import mod
#assert_equal(ev[mod], mod.__version__)
## Check that we can get a copy of the versions
#versions_dict = ev.versions
#versions_dict[our_module] = "0.0.1"
#assert_equal(versions_dict[our_module], "0.0.1")
#assert_equal(ev[our_module], __version__)
def test_external_version_contains():
ev = ExternalVersions()
assert_true("datalad" in ev)
assert_false("does not exist" in ev)
def test_external_versions_unknown():
assert_equal(str(ExternalVersions.UNKNOWN), 'UNKNOWN')
def _test_external(ev, modname):
try:
exec("import %s" % modname, globals(), locals())
except ImportError:
raise SkipTest("External %s not present" % modname)
except Exception as e:
raise SkipTest("External %s fails to import" % modname) from e
assert (ev[modname] is not ev.UNKNOWN)
assert_greater(ev[modname], '0.0.1')
assert_greater('1000000.0', ev[modname]) # unlikely in our lifetimes
def test_external_versions_popular_packages():
ev = ExternalVersions()
for modname in ('scipy', 'numpy', 'mvpa2', 'sklearn', 'statsmodels', 'pandas',
'matplotlib', 'psychopy', 'github'):
_test_external(ev, modname)
# more of a smoke test
assert_false(linesep in ev.dumps())
assert_true(ev.dumps(indent=True).endswith(linesep))
@with_tempfile(mkdir=True)
def test_external_versions_rogue_module(topd=None):
ev = ExternalVersions()
# if module throws some other non-ImportError exception upon import
# we must not crash, but issue a warning
modname = 'verycustomrogue__'
create_tree(topd, {modname + '.py': 'raise Exception("pickaboo")'})
with patch('sys.path', [topd]), \
swallow_logs(new_level=logging.WARNING) as cml:
assert ev[modname] is None
assert_true(ev.dumps(indent=True).endswith(linesep))
assert_in('pickaboo', cml.out)
def test_custom_versions():
ev = ExternalVersions()
assert(ev['cmd:annex'] > '6.20160101') # annex must be present and recentish
assert_equal(set(ev.versions.keys()), {'cmd:annex'})
# some older git version don't support files to be passed to
# `commit` call under some conditions and this will lead to diverse
# errors
assert(ev['cmd:git'] > '2.0') # git must be present and recentish
assert(isinstance(ev['cmd:git'], LooseVersion))
assert_equal(set(ev.versions.keys()), {'cmd:annex', 'cmd:git'})
# and there is also a version of system-wide installed git, which might
# differ from cmd:git but should be at least good old 1.7
assert(ev['cmd:system-git'] > '1.7')
ev.CUSTOM = {'bogus': lambda: 1 / 0}
assert_equal(ev['bogus'], None)
assert_equal(set(ev.versions), {'cmd:annex', 'cmd:git', 'cmd:system-git'})
def test_ancient_annex():
class _runner(object):
def run(self, cmd, *args, **kwargs):
if '--raw' in cmd:
raise CommandError
return dict(stdout="git-annex version: 0.1", stderr="")
ev = ExternalVersions()
with patch('datalad.support.external_versions._runner', _runner()):
assert_equal(ev['cmd:annex'], '0.1')
def _test_annex_version_comparison(v, cmp_):
class _runner(object):
def run(self, cmd, *args, **kwargs):
return dict(stdout=v, stderr="")
ev = ExternalVersions()
with set_annex_version(None), \
patch('datalad.support.external_versions._runner', _runner()), \
patch('datalad.support.annexrepo.external_versions',
ExternalVersions()):
ev['cmd:annex'] < AnnexRepo.GIT_ANNEX_MIN_VERSION
if cmp_ in (1, 0):
AnnexRepo._check_git_annex_version()
if cmp_ == 0:
assert_equal(AnnexRepo.git_annex_version, v)
elif cmp == -1:
with assert_raises(OutdatedExternalDependency):
ev.check('cmd:annex', min_version=AnnexRepo.GIT_ANNEX_MIN_VERSION)
with assert_raises(OutdatedExternalDependency):
AnnexRepo._check_git_annex_version()
def test_annex_version_comparison():
# see https://github.com/datalad/datalad/issues/1128
for cmp_, base in [(-1, '6.2011'), (1, "2100.0")]:
# there could be differing versions of a version
# release, snapshot, neurodebian build of a snapshot
for v in base, base + '-g0a34f08', base + '+gitg9f179ae-1~ndall+1':
# they all must be comparable to our specification of min version
_test_annex_version_comparison(v, cmp_)
_test_annex_version_comparison(str(AnnexRepo.GIT_ANNEX_MIN_VERSION), 0)
def _test_list_tuple(thing):
version = ExternalVersions._deduce_version(thing)
assert_greater(version, '0.0.1')
assert_greater('0.2', version)
assert_equal('0.1', version)
assert_equal(version, '0.1')
def test_list_tuple():
class thing_with_tuple_version:
__version__ = (0, 1)
class thing_with_list_version:
__version__ = [0, 1]
for v in thing_with_list_version, thing_with_tuple_version, '0.1', (0, 1), [0, 1]:
_test_list_tuple(v)
def test_system_ssh_version():
try:
WitlessRunner().run(['ssh', '-V'], protocol=StdOutErrCapture)
except FileNotFoundError as exc:
pytest.skip(f"no ssh binary available: {exc}")
ev = ExternalVersions()
assert ev['cmd:system-ssh'] # usually we have some available at boxes we test
def test_ssh_versions():
for s, v in [
('OpenSSH_7.4p1 Debian-6, OpenSSL 1.0.2k 26 Jan 2017', '7.4p1'),
('OpenSSH_8.1p1, LibreSSL 2.7.3', '8.1p1'),
('OpenSSH_for_Windows_8.1p1, LibreSSL 3.0.2', '8.1p1'),
]:
ev = ExternalVersions()
# TODO: figure out leaner way
class _runner(object):
def run(self, cmd, *args, **kwargs):
return dict(stdout="", stderr=s)
with patch('datalad.support.external_versions._runner', _runner()):
assert_equal(ev['cmd:system-ssh'], v)
def test_humanize():
# doesn't provide __version__
assert ExternalVersions()['humanize']
def test_check():
ev = ExternalVersions()
# should be all good
ev.check('datalad')
ev.check('datalad', min_version=__version__)
with assert_raises(MissingExternalDependency):
ev.check('dataladkukaracha')
with assert_raises(MissingExternalDependency) as cme:
ev.check('dataladkukaracha', min_version="buga", msg="duga")
assert_in("duga", str(cme.value))
with assert_raises(OutdatedExternalDependency):
ev.check('datalad', min_version="10000000") # we will never get there!
def test_add():
ev = ExternalVersions()
ev.add('custom1', lambda: "0.1.0")
assert_in("custom1=0.1.0", ev.dumps(query=True))
assert_not_in("numpy", ev.INTERESTING) # we do not have it by default yet
assert_not_in("numpy=", ev.dumps(query=True))
ev.add('numpy')
try:
import numpy
except ImportError:
# no numpy, we do not have some bogus entry
assert_not_in("numpy=", ev.dumps(query=True))
else:
assert_in("numpy=%s" % numpy.__version__, ev.dumps(query=True))
assert_in("custom1=0.1.0", ev.dumps(query=True)) # we still have that one
# override with a new function will work
ev.add('custom1', lambda: "0.2.0")
assert_in("custom1=0.2.0", ev.dumps(query=True))
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_fileinfo.py 0000644 0001751 0001751 00000035424 15137634221 022244 0 ustar 00runner runner # ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test file info getters"""
import os.path as op
from pathlib import Path
import datalad.utils as ut
from datalad.distribution.dataset import Dataset
from datalad.support.exceptions import NoSuchPathError
from datalad.support.gitrepo import GitRepo
from datalad.tests.utils_pytest import (
assert_dict_equal,
assert_equal,
assert_false,
assert_in,
assert_not_in,
assert_raises,
assert_repo_status,
get_annexstatus,
get_convoluted_situation,
known_failure_githubci_win,
on_nfs,
on_travis,
skip_if,
slow,
with_tempfile,
with_tree,
)
@slow # 10sec on travis
@known_failure_githubci_win
@with_tempfile
def test_get_content_info(path=None):
repo = GitRepo(path)
assert_equal(repo.get_content_info(), {})
# an invalid reference causes an exception
assert_raises(ValueError, repo.get_content_info, ref='HEAD')
ds = get_convoluted_situation(path)
repopath = ds.repo.pathobj
assert_equal(ds.repo.pathobj, repopath)
assert_equal(ds.pathobj, ut.Path(path))
# verify general rules on fused info records that are incrementally
# assembled: for git content info, amended with annex info on 'HEAD'
# (to get the last committed stage and with it possibly vanished
# content), and lastly annex info wrt to the present worktree, to
# also get info on added/staged content
# this fuses the info reported from
# - git ls-files
# - git annex findref HEAD
# - git annex find --include '*'
for f, r in get_annexstatus(ds.repo).items():
if f.match('*_untracked'):
assert(r.get('gitshasum', None) is None)
if f.match('*_deleted'):
assert(not f.exists() and not f.is_symlink() is None)
if f.match('subds_*'):
assert(r['type'] == 'dataset' if r.get('gitshasum', None) else 'directory')
if f.match('file_*'):
# which one exactly depends on many things
assert_in(r['type'], ('file', 'symlink'))
if f.match('file_ingit*'):
assert(r['type'] == 'file')
elif '.datalad' not in f.parts and not f.match('.git*') and \
r.get('gitshasum', None) and not f.match('subds*'):
# this should be known to annex, one way or another
# regardless of whether things add deleted or staged
# or anything in between
assert_in('key', r, f)
assert_in('keyname', r, f)
assert_in('backend', r, f)
assert_in('bytesize', r, f)
# no duplication with path
assert_not_in('file', r, f)
# query full untracked report
res = ds.repo.get_content_info()
assert_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
assert_not_in(repopath.joinpath('dir_untracked'), res)
# query for compact untracked report
res = ds.repo.get_content_info(untracked='normal')
assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
assert_in(repopath.joinpath('dir_untracked'), res)
# query no untracked report
res = ds.repo.get_content_info(untracked='no')
assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res)
assert_not_in(repopath.joinpath('dir_untracked'), res)
# git status integrity
status = ds.repo.status()
for t in ('subds', 'file'):
for s in ('untracked', 'added', 'deleted', 'clean',
'ingit_clean', 'dropped_clean', 'modified',
'ingit_modified'):
for l in ('', ut.PurePosixPath('subdir', '')):
if t == 'subds' and 'ingit' in s or 'dropped' in s:
# invalid combination
continue
if t == 'subds' and s == 'deleted':
# same as subds_unavailable -> clean
continue
p = repopath.joinpath(l, '{}_{}'.format(t, s))
assert p.match('*_{}'.format(status[p]['state'])), p
if t == 'subds':
assert_in(status[p]['type'], ('dataset', 'directory'), p)
else:
assert_in(status[p]['type'], ('file', 'symlink'), p)
# git annex status integrity
annexstatus = get_annexstatus(ds.repo)
for t in ('file',):
for s in ('untracked', 'added', 'deleted', 'clean',
'ingit_clean', 'dropped_clean', 'modified',
'ingit_modified'):
for l in ('', ut.PurePosixPath('subdir', '')):
p = repopath.joinpath(l, '{}_{}'.format(t, s))
if s in ('untracked', 'ingit_clean', 'ingit_modified'):
# annex knows nothing about these things
assert_not_in('key', annexstatus[p])
continue
assert_in('key', annexstatus[p])
# dear future,
# if the next one fails, git-annex might have changed the
# nature of the path that are being reported by
# `annex find --json`
# when this was written `hashir*` was a native path, but
# `file` was a POSIX path
assert_equal(annexstatus[p]['has_content'], 'dropped' not in s)
# check the different subds evaluation modes
someds = Dataset(ds.pathobj / 'subds_modified' / 'someds')
dirtyds_path = someds.pathobj / 'dirtyds'
assert_not_in(
'state',
someds.repo.status(eval_submodule_state='no')[dirtyds_path]
)
assert_equal(
'clean',
someds.repo.status(eval_submodule_state='commit')[dirtyds_path]['state']
)
assert_equal(
'modified',
someds.repo.status(eval_submodule_state='full')[dirtyds_path]['state']
)
@with_tempfile
def test_compare_content_info(path=None):
# TODO remove when `create` is RF to return the new Dataset
ds = Dataset(path).create()
assert_repo_status(path)
# for a clean repo HEAD and worktree query should yield identical results
# minus a 'bytesize' report that is readily available for HEAD, but would
# not a stat call per file for the worktree, and is not done ATM
wt = ds.repo.get_content_info(ref=None)
assert_dict_equal(
wt,
{f: {k: v for k, v in p.items() if k != 'bytesize'}
for f, p in ds.repo.get_content_info(ref='HEAD').items()}
)
@with_tempfile
def test_subds_path(path=None):
# a dataset with a subdataset with a file, all neatly tracked
ds = Dataset(path).create()
subds = ds.create('sub')
assert_repo_status(path)
with (subds.pathobj / 'some.txt').open('w') as f:
f.write(u'test')
ds.save(recursive=True)
assert_repo_status(path)
# querying the toplevel dataset repo for a subdspath should
# report the subdataset record in the dataset
# (unlike `git status`, which is silent for subdataset paths),
# but definitely not report the subdataset as deleted
# https://github.com/datalad/datalad-revolution/issues/17
stat = ds.repo.status(paths=[op.join('sub', 'some.txt')])
assert_equal(list(stat.keys()), [subds.repo.pathobj])
assert_equal(stat[subds.repo.pathobj]['state'], 'clean')
@skip_if(on_travis and on_nfs) # TODO. stalls https://github.com/datalad/datalad/pull/7372
@with_tempfile
def test_report_absent_keys(path=None):
ds = Dataset(path).create()
# create an annexed file
testfile = ds.pathobj / 'dummy'
testfile.write_text(u'nothing')
ds.save()
# present in a full report and in a partial report
# based on worktree of HEAD ref
for ai in (
ds.repo.get_content_annexinfo(eval_availability=True),
ds.repo.get_content_annexinfo(
paths=['dummy'],
eval_availability=True),
ds.repo.get_content_annexinfo(
ref='HEAD',
eval_availability=True),
ds.repo.get_content_annexinfo(
ref='HEAD',
paths=['dummy'],
eval_availability=True)):
assert_in(testfile, ai)
assert_equal(ai[testfile]['has_content'], True)
# drop the key, not available anywhere else
ds.drop('dummy', reckless='kill')
# does not change a thing, except the key is gone
for ai in (
ds.repo.get_content_annexinfo(eval_availability=True),
ds.repo.get_content_annexinfo(
paths=['dummy'],
eval_availability=True),
ds.repo.get_content_annexinfo(
ref='HEAD',
eval_availability=True),
ds.repo.get_content_annexinfo(
ref='HEAD',
paths=['dummy'],
eval_availability=True)):
assert_in(testfile, ai)
assert_equal(ai[testfile]['has_content'], False)
# make sure files with URL keys are correctly reported:
from datalad.conftest import test_http_server
remote_file_name = 'imaremotefile.dat'
local_file_name = 'mehasurlkey'
(Path(test_http_server.path) / remote_file_name).write_text("weee")
remote_file_url = f'{test_http_server.url}/{remote_file_name}'
# we need to get a file with a URL key and check its local availability
ds.repo.call_annex(['addurl', '--relaxed', remote_file_url, '--file',
local_file_name])
ds.save("URL keys!")
# should not be there
res = ds.repo.get_file_annexinfo(local_file_name, eval_availability=True)
assert_equal(res['has_content'], False)
ds.get(local_file_name)
# should be there
res = ds.repo.get_file_annexinfo(local_file_name, eval_availability=True)
assert_equal(res['has_content'], True)
@with_tempfile
def test_annexinfo_init(path=None):
ds = Dataset(path).create()
foo = ds.pathobj / "foo"
foo_cont = b"foo content"
foo.write_bytes(foo_cont)
bar = ds.pathobj / "bar"
bar.write_text(u"bar content")
ds.save()
# Custom init limits report, with original dict getting updated.
cinfo_custom_init = ds.repo.get_content_annexinfo(
init={foo: {"bytesize": 0,
"this-is-surely-only-here": "right?"}})
assert_not_in(bar, cinfo_custom_init)
assert_in(foo, cinfo_custom_init)
assert_equal(cinfo_custom_init[foo]["bytesize"], len(foo_cont))
assert_equal(cinfo_custom_init[foo]["this-is-surely-only-here"],
"right?")
# "git" injects get_content_info() values.
cinfo_init_git = ds.repo.get_content_annexinfo(init="git")
assert_in("gitshasum", cinfo_init_git[foo])
# init=None, on the other hand, does not.
cinfo_init_none = ds.repo.get_content_annexinfo(init=None)
assert_in(foo, cinfo_init_none)
assert_in(bar, cinfo_init_none)
assert_not_in("gitshasum", cinfo_init_none[foo])
@with_tempfile
def test_info_path_inside_submodule(path=None):
ds = Dataset(path).create()
subds = ds.create("submod")
foo = (subds.pathobj / "foo")
foo.write_text("foo")
ds.save(recursive=True)
cinfo = ds.repo.get_content_info(
ref="HEAD", paths=[foo.relative_to(ds.pathobj)])
assert_in("gitshasum", cinfo[subds.pathobj])
@with_tempfile
def test_get_content_info_dotgit(path=None):
ds = Dataset(path).create()
# Files in .git/ won't be reported, though this takes a kludge on our side
# before Git 2.25.
assert_false(ds.repo.get_content_info(paths=[op.join(".git", "config")]))
@with_tempfile
def test_get_content_info_paths_empty_list(path=None):
ds = Dataset(path).create()
# Unlike None, passing any empty list as paths to get_content_info() does
# not report on all content.
assert_false(ds.repo.get_content_info(paths=[]))
assert_false(ds.repo.get_content_info(paths=[], ref="HEAD"))
# Add annex content to make sure its not reported.
(ds.pathobj / "foo").write_text("foo")
ds.save()
# Same for get_content_annexinfo()...
assert_false(ds.repo.get_content_annexinfo(paths=[]))
assert_false(ds.repo.get_content_annexinfo(paths=[], init=None))
assert_false(ds.repo.get_content_annexinfo(paths=[], ref="HEAD"))
assert_false(
ds.repo.get_content_annexinfo(paths=[], ref="HEAD", init=None))
# ... where whatever was passed for init will be returned as is.
assert_equal(
ds.repo.get_content_annexinfo(
paths=[], ref="HEAD", init={"random": {"entry": "a"}}),
{"random": {"entry": "a"}})
@with_tempfile
def test_status_paths_empty_list(path=None):
ds = Dataset(path).create()
assert_equal(ds.repo.status(paths=[]), {})
@with_tree(tree=(('ingit.txt', 'ingit'),
('inannex.txt', 'inannex'),
('dir1', {'dropped': 'dropped'}),
('dir2', {'d21': 'd21', 'd22': 'd22'})))
def test_get_file_annexinfo(path=None):
ds = Dataset(path).create(force=True)
ds.save('ingit.txt', to_git=True)
ds.save()
# have some content-less component for testing
ds.drop(ds.pathobj / 'dir1', reckless='kill')
repo = ds.repo
# only handles a single file at a time
assert_raises(ValueError, repo.get_file_annexinfo, repo.pathobj / 'dir2')
# however, it only functionally matters that there is only a single file to
# report on not that the exact query path matches, the matching path is in
# the report
assert_equal(
repo.pathobj / 'dir1' / 'dropped',
repo.get_file_annexinfo(repo.pathobj / 'dir1')['path'])
# does not raise on a non-annex file, instead it returns no properties
assert_equal(repo.get_file_annexinfo('ingit.txt'), {})
# but does raise on path that doesn exist
assert_raises(NoSuchPathError, repo.get_file_annexinfo, 'nothere')
# check return properties for utility
props = repo.get_file_annexinfo('inannex.txt')
# to replace get_file_backend()
assert_equal(props['backend'], 'MD5E')
# to replace get_file_key()
assert_equal(props['key'], 'MD5E-s7--3b158c5b0a18c247ebad28c09fc3e180.txt')
# for size reporting
assert_equal(props['bytesize'], 7)
# all records have a pathobj
assert_equal(props['path'], repo.pathobj / 'inannex.txt')
# test if `eval_availability` has desired effect
assert_not_in('has_content', props)
# extended set of properties, after more expensive availability check
props = repo.get_file_annexinfo('inannex.txt', eval_availability=True)
# to replace file_has_content()
assert_equal(props['has_content'], True)
# to replace get_contentlocation()
assert_equal(
Path(props['objloc']).read_text(),
'inannex')
# make sure has_content is not always True
props = repo.get_file_annexinfo(
ds.pathobj / 'dir1' / 'dropped', eval_availability=True)
assert_equal(props['has_content'], False)
assert_not_in('objloc', props)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_gitrepo.py 0000644 0001751 0001751 00000155667 15137634221 022136 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test implementation of class GitRepo
"""
import logging
import os
import os.path as op
import sys
import pytest
from datalad import get_encoding_info
from datalad.cmd import (
StdOutCapture,
StdOutErrCapture,
WitlessRunner,
)
from datalad.support.exceptions import (
CommandError,
FileNotInRepositoryError,
InvalidGitRepositoryError,
NoSuchPathError,
PathKnownToRepositoryError,
)
from datalad.support.external_versions import external_versions
from datalad.support.gitrepo import (
GitRepo,
_normalize_path,
normalize_paths,
to_options,
)
from datalad.support.sshconnector import get_connection_hash
from datalad.tests.utils_pytest import (
DEFAULT_BRANCH,
DEFAULT_REMOTE,
SkipTest,
assert_cwd_unchanged,
assert_equal,
assert_false,
assert_in,
assert_in_results,
assert_is_instance,
assert_not_equal,
assert_not_in,
assert_raises,
assert_repo_status,
assert_true,
create_tree,
eq_,
get_most_obscure_supported_name,
integration,
neq_,
ok_,
skip_if_no_network,
skip_if_on_windows,
skip_nomultiplex_ssh,
slow,
swallow_logs,
with_tempfile,
with_tree,
xfail_buggy_annex_info,
)
from datalad.utils import (
Path,
chpwd,
getpwd,
on_windows,
rmtree,
)
@with_tempfile(mkdir=True)
def test_GitRepo_invalid_path(path=None):
with chpwd(path):
assert_raises(ValueError, GitRepo, path="git://some/url", create=True)
ok_(not op.exists(op.join(path, "git:")))
assert_raises(ValueError, GitRepo, path="file://some_location/path/at/location", create=True)
ok_(not op.exists(op.join(path, "file:")))
@assert_cwd_unchanged
@with_tempfile
@with_tempfile
def test_GitRepo_instance_from_clone(src=None, dst=None):
origin = GitRepo(src, create=True)
gr = GitRepo.clone(src, dst)
assert_is_instance(gr, GitRepo, "GitRepo was not created.")
ok_(op.exists(op.join(dst, '.git')))
# do it again should raise ValueError since git will notice there's
# already a git-repo at that path and therefore can't clone to `dst`
# Note: Since GitRepo is now a WeakSingletonRepo, this is prevented from
# happening atm. Disabling for now:
# raise SkipTest("Disabled for RF: WeakSingletonRepo")
with swallow_logs() as logs:
assert_raises(ValueError, GitRepo.clone, src, dst)
@assert_cwd_unchanged
@with_tempfile
def test_GitRepo_instance_from_existing(path=None):
GitRepo(path, create=True)
gr = GitRepo(path)
assert_is_instance(gr, GitRepo, "GitRepo was not created.")
ok_(op.exists(op.join(path, '.git')))
@assert_cwd_unchanged
@with_tempfile
@with_tempfile
def test_GitRepo_instance_from_not_existing(path=None, path2=None):
# 1. create=False and path doesn't exist:
assert_raises(NoSuchPathError, GitRepo, path, create=False)
assert_false(op.exists(path))
# 2. create=False, path exists, but no git repo:
os.mkdir(path)
ok_(op.exists(path))
assert_raises(InvalidGitRepositoryError, GitRepo, path, create=False)
assert_false(op.exists(op.join(path, '.git')))
# 3. create=True, path doesn't exist:
gr = GitRepo(path2, create=True)
assert_is_instance(gr, GitRepo, "GitRepo was not created.")
ok_(op.exists(op.join(path2, '.git')))
assert_repo_status(path2, annex=False)
# 4. create=True, path exists, but no git repo:
gr = GitRepo(path, create=True)
assert_is_instance(gr, GitRepo, "GitRepo was not created.")
ok_(op.exists(op.join(path, '.git')))
assert_repo_status(path, annex=False)
@with_tempfile
def test_GitRepo_init_options(path=None):
# passing an option, not explicitly defined in GitRepo class:
gr = GitRepo(path, create=True, bare=True)
ok_(gr.config.getbool(section="core", option="bare"))
@with_tempfile
@with_tempfile(mkdir=True)
@with_tree(tree={'somefile': 'content', 'config': 'not a git config'})
@with_tree(tree={'afile': 'other',
'.git': {}})
@with_tempfile
@with_tempfile
def test_GitRepo_bare(path=None, empty_dir=None, non_empty_dir=None, empty_dot_git=None, non_bare=None,
clone_path=None):
import gc
# create a bare repo:
gr = GitRepo(path, create=True, bare=True)
assert_equal(gr.dot_git, gr.pathobj)
assert_true(gr.bare)
assert_true(gr.config.getbool("core", "bare"))
assert_false((gr.pathobj / '.git').exists())
assert_false(gr.call_git_success(['status'], expect_stderr=True))
# kill the object and try to get a new instance on an existing bare repo:
del gr
gc.collect()
gr = GitRepo(path, create=False)
assert_equal(gr.dot_git, gr.pathobj)
assert_true(gr.bare)
assert_true(gr.config.getbool("core", "bare"))
assert_false((gr.pathobj / '.git').exists())
assert_false(gr.call_git_success(['status'], expect_stderr=True))
# an empty dir is not a bare repo:
assert_raises(InvalidGitRepositoryError, GitRepo, empty_dir,
create=False)
# an arbitrary dir is not a bare repo:
assert_raises(InvalidGitRepositoryError, GitRepo, non_empty_dir,
create=False)
# nor is a path with an empty .git:
assert_raises(InvalidGitRepositoryError, GitRepo, empty_dot_git,
create=False)
# a regular repo is not bare
non_bare_repo = GitRepo(non_bare, create=True)
assert_false(non_bare_repo.bare)
# we can have a bare clone
clone = GitRepo.clone(non_bare, clone_path, clone_options={'bare': True})
assert_true(clone.bare)
@with_tree(
tree={
'subds': {
'file_name': ''
}
}
)
def test_init_fail_under_known_subdir(path=None):
repo = GitRepo(path, create=True)
repo.add(op.join('subds', 'file_name'))
# Should fail even if we do not commit but only add to index:
with assert_raises(PathKnownToRepositoryError) as cme:
GitRepo(op.join(path, 'subds'), create=True)
assert_in("file_name", str(cme.value)) # we provide a list of offenders
# and after we commit - the same story
repo.commit("added file")
with assert_raises(PathKnownToRepositoryError) as cme:
GitRepo(op.join(path, 'subds'), create=True)
# But it would succeed if we disable the checks
GitRepo(op.join(path, 'subds'), create=True, create_sanity_checks=False)
@with_tempfile
@with_tempfile
def test_GitRepo_equals(path1=None, path2=None):
repo1 = GitRepo(path1)
repo2 = GitRepo(path1)
ok_(repo1 == repo2)
eq_(repo1, repo2)
repo2 = GitRepo(path2)
neq_(repo1, repo2)
ok_(repo1 != repo2)
@assert_cwd_unchanged
@with_tempfile
@with_tempfile
def test_GitRepo_add(src=None, path=None):
gr = GitRepo(path)
filename = get_most_obscure_supported_name()
with open(op.join(path, filename), 'w') as f:
f.write("File to add to git")
added = gr.add(filename)
eq_(added, {'success': True, 'file': filename})
assert_in(filename, gr.get_indexed_files(),
"%s not successfully added to %s" % (filename, path))
# uncommitted:
ok_(gr.dirty)
filename = "another.txt"
with open(op.join(path, filename), 'w') as f:
f.write("Another file to add to git")
# include committing:
added2 = gr.add(filename)
gr.commit(msg="Add two files.")
eq_(added2, {'success': True, 'file': filename})
assert_in(filename, gr.get_indexed_files(),
"%s not successfully added to %s" % (filename, path))
assert_repo_status(path)
@assert_cwd_unchanged
@with_tree(tree={
'd': {'f1': 'content1',
'f2': 'content2'},
'file': 'content3',
'd2': {'f1': 'content1',
'f2': 'content2'},
'file2': 'content3'
})
def test_GitRepo_remove(path=None):
gr = GitRepo(path, create=True)
gr.add('*')
gr.commit("committing all the files")
eq_(gr.remove('file'), ['file'])
eq_(set(gr.remove('d', r=True, f=True)), {'d/f1', 'd/f2'})
eq_(set(gr.remove('*', r=True, f=True)), {'file2', 'd2/f1', 'd2/f2'})
@assert_cwd_unchanged
@with_tempfile
def test_GitRepo_commit(path=None):
gr = GitRepo(path)
filename = get_most_obscure_supported_name()
with open(op.join(path, filename), 'w') as f:
f.write("File to add to git")
gr.add(filename)
gr.commit("Testing GitRepo.commit().")
assert_repo_status(gr)
eq_("Testing GitRepo.commit().",
gr.format_commit("%B").strip())
with open(op.join(path, filename), 'w') as f:
f.write("changed content")
gr.add(filename)
gr.commit("commit with options", options=to_options(dry_run=True))
# wasn't actually committed:
ok_(gr.dirty)
# commit with empty message:
gr.commit()
assert_repo_status(gr)
assert_equal(gr.format_commit("%B").strip(), "[DATALAD] Recorded changes")
# amend commit:
assert_equal(len(list(gr.get_branch_commits_())), 2)
last_sha = gr.get_hexsha()
with open(op.join(path, filename), 'w') as f:
f.write("changed again")
gr.add(filename)
gr.commit("amend message", options=to_options(amend=True))
assert_repo_status(gr)
assert_equal(gr.format_commit("%B").strip(), "amend message")
assert_not_equal(last_sha, gr.get_hexsha())
assert_equal(len(list(gr.get_branch_commits_())), 2)
# amend w/o message maintains previous one:
gr.commit(options=to_options(amend=True))
assert_repo_status(gr)
assert_equal(len(list(gr.get_branch_commits_())), 2)
assert_equal(gr.format_commit("%B").strip(), "amend message")
# nothing to commit doesn't raise by default:
gr.commit()
# but does with careless=False:
assert_raises(CommandError, gr.commit, careless=False)
# committing untracked file raises:
with open(op.join(path, "untracked"), "w") as f:
f.write("some")
assert_raises(FileNotInRepositoryError, gr.commit, files="untracked")
# not existing file as well:
assert_raises(FileNotInRepositoryError, gr.commit, files="not-existing")
@with_tempfile
def test_GitRepo_get_indexed_files(path=None):
gr = GitRepo(path)
for filename in ('some1.txt', 'some2.dat'):
with open(op.join(path, filename), 'w') as f:
f.write(filename)
gr.add(filename)
gr.commit('Some files')
idx_list = gr.get_indexed_files()
runner = WitlessRunner(cwd=path)
out = runner.run(['git', 'ls-files'], protocol=StdOutCapture)
out_list = list(filter(bool, out['stdout'].split('\n')))
for item in idx_list:
assert_in(item, out_list, "%s not found in output of git ls-files in %s" % (item, path))
for item in out_list:
assert_in(item, idx_list, "%s not found in output of get_indexed_files in %s" % (item, path))
@with_tree([
('empty', ''),
('d1', (
('empty', ''),
('d2',
(('empty', ''),
)),
)),
])
@assert_cwd_unchanged(ok_to_chdir=True)
def test_normalize_path(git_path=None):
gr = GitRepo(git_path)
# cwd is currently outside the repo, so any relative path
# should be interpreted as relative to `annex_path`
assert_raises(FileNotInRepositoryError, _normalize_path, gr.path, getpwd())
result = _normalize_path(gr.path, "testfile")
eq_(result, "testfile", "_normalize_path() returned %s" % result)
# result = _normalize_path(gr.path, op.join('.', 'testfile'))
# eq_(result, "testfile", "_normalize_path() returned %s" % result)
#
# result = _normalize_path(gr.path, op.join('testdir', '..', 'testfile'))
# eq_(result, "testfile", "_normalize_path() returned %s" % result)
# Note: By now, normpath within normalize_paths() is disabled, therefore
# disable these tests.
result = _normalize_path(gr.path, op.join('testdir', 'testfile'))
eq_(result, op.join("testdir", "testfile"), "_normalize_path() returned %s" % result)
result = _normalize_path(gr.path, op.join(git_path, "testfile"))
eq_(result, "testfile", "_normalize_path() returned %s" % result)
# now we are inside, so
# OLD PHILOSOPHY: relative paths are relative to cwd and have
# to be converted to be relative to annex_path
# NEW PHILOSOPHY: still relative to repo! unless starts with . (curdir) or .. (pardir)
with chpwd(op.join(git_path, 'd1', 'd2')):
result = _normalize_path(gr.path, "testfile")
eq_(result, 'testfile', "_normalize_path() returned %s" % result)
# if not joined as directory name but just a prefix to the filename, should
# behave correctly
for d in (op.curdir, op.pardir):
result = _normalize_path(gr.path, d + "testfile")
eq_(result, d + 'testfile', "_normalize_path() returned %s" % result)
result = _normalize_path(gr.path, op.join(op.curdir, "testfile"))
eq_(result, op.join('d1', 'd2', 'testfile'), "_normalize_path() returned %s" % result)
result = _normalize_path(gr.path, op.join(op.pardir, 'testfile'))
eq_(result, op.join('d1', 'testfile'), "_normalize_path() returned %s" % result)
assert_raises(FileNotInRepositoryError, _normalize_path, gr.path, op.join(git_path, '..', 'outside'))
result = _normalize_path(gr.path, op.join(git_path, 'd1', 'testfile'))
eq_(result, op.join('d1', 'testfile'), "_normalize_path() returned %s" % result)
def test_GitRepo_files_decorator():
class testclass(object):
def __init__(self):
self.path = op.join('some', 'where')
# TODO
# yoh: logic is alien to me below why to have two since both look identical!
@normalize_paths
def decorated_many(self, files):
return files
@normalize_paths
def decorated_one(self, file_):
return file_
test_instance = testclass()
# When a single file passed -- single path returned
obscure_filename = get_most_obscure_supported_name()
file_to_test = op.join(test_instance.path, 'deep', obscure_filename)
# file doesn't exist
eq_(test_instance.decorated_one(file_to_test),
_normalize_path(test_instance.path, file_to_test))
eq_(test_instance.decorated_one(file_to_test),
_normalize_path(test_instance.path, file_to_test))
file_to_test = obscure_filename
eq_(test_instance.decorated_many(file_to_test),
_normalize_path(test_instance.path, file_to_test))
eq_(test_instance.decorated_one(file_to_test),
_normalize_path(test_instance.path, file_to_test))
file_to_test = op.join(obscure_filename, 'beyond', 'obscure')
eq_(test_instance.decorated_many(file_to_test),
_normalize_path(test_instance.path, file_to_test))
file_to_test = op.join(getpwd(), 'somewhere', 'else', obscure_filename)
assert_raises(FileNotInRepositoryError, test_instance.decorated_many,
file_to_test)
# If a list passed -- list returned
files_to_test = ['now', op.join('a list', 'of'), 'paths']
expect = []
for item in files_to_test:
expect.append(_normalize_path(test_instance.path, item))
eq_(test_instance.decorated_many(files_to_test), expect)
eq_(test_instance.decorated_many(''), [])
assert_raises(ValueError, test_instance.decorated_many, 1)
assert_raises(ValueError, test_instance.decorated_one, 1)
@skip_if_no_network
@with_tempfile
def test_GitRepo_remote_add(path=None):
gr = GitRepo(path)
gr.add_remote('github', 'https://github.com/datalad/testrepo--basic--r1')
out = gr.get_remotes()
assert_in('github', out)
eq_(len(out), 1)
eq_('https://github.com/datalad/testrepo--basic--r1', gr.config['remote.github.url'])
@with_tempfile
def test_GitRepo_remote_remove(path=None):
gr = GitRepo(path)
gr.add_remote('github', 'https://github.com/datalad/testrepo--basic--r1')
out = gr.get_remotes()
eq_(len(out), 1)
gr.remove_remote('github')
out = gr.get_remotes()
eq_(len(out), 0)
@with_tempfile
def test_GitRepo_get_remote_url(path=None):
gr = GitRepo(path)
gr.add_remote('github', 'https://github.com/datalad/testrepo--basic--r1')
eq_(gr.get_remote_url('github'),
'https://github.com/datalad/testrepo--basic--r1')
@with_tempfile
@with_tempfile
def test_GitRepo_fetch(orig_path=None, clone_path=None):
origin = GitRepo(orig_path)
with open(op.join(orig_path, 'some.txt'), 'w') as f:
f.write("New text file.")
origin.add('some.txt')
origin.commit("new file added.")
clone = GitRepo.clone(orig_path, clone_path)
filename = get_most_obscure_supported_name()
origin.checkout("new_branch", ['-b'])
with open(op.join(orig_path, filename), 'w') as f:
f.write("New file.")
origin.add(filename)
origin.commit("new file added.")
fetched = clone.fetch(remote=DEFAULT_REMOTE)
# test FetchInfo list returned by fetch
eq_([DEFAULT_REMOTE + '/' + clone.get_active_branch(),
DEFAULT_REMOTE + '/new_branch'],
[commit['ref'] for commit in fetched])
assert_repo_status(clone.path, annex=False)
assert_in(DEFAULT_REMOTE + "/new_branch", clone.get_remote_branches())
assert_in(filename, clone.get_files(DEFAULT_REMOTE + "/new_branch"))
assert_false(op.exists(op.join(clone_path, filename))) # not checked out
# create a remote without an URL:
origin.add_remote('not-available', 'git://example.com/not/existing')
origin.config.unset('remote.not-available.url', scope='local')
# fetch without provided URL
assert_raises(CommandError, origin.fetch, 'not-available')
def _path2localsshurl(path):
"""Helper to build valid localhost SSH urls on Windows too"""
path = op.abspath(path)
p = Path(path)
if p.drive:
path = '/'.join(('/{}'.format(p.drive[0]),) + p.parts[1:])
url = "ssh://datalad-test{}".format(path)
return url
@skip_nomultiplex_ssh
@with_tempfile
@with_tempfile
def test_GitRepo_ssh_fetch(remote_path=None, repo_path=None):
from datalad import ssh_manager
remote_repo = GitRepo(remote_path)
with open(op.join(remote_path, 'some.txt'), 'w') as f:
f.write("New text file.")
remote_repo.add('some.txt')
remote_repo.commit("new file added.")
url = _path2localsshurl(remote_path)
socket_path = op.join(str(ssh_manager.socket_dir),
get_connection_hash('datalad-test'))
repo = GitRepo(repo_path, create=True)
repo.add_remote("ssh-remote", url)
# we don't know any branches of the remote:
eq_([], repo.get_remote_branches())
fetched = repo.fetch(remote="ssh-remote")
assert_in('ssh-remote/' + DEFAULT_BRANCH,
[commit['ref'] for commit in fetched])
assert_repo_status(repo)
# the connection is known to the SSH manager, since fetch() requested it:
assert_in(socket_path, list(map(str, ssh_manager._connections)))
# and socket was created:
ok_(op.exists(socket_path))
# we actually fetched it:
assert_in('ssh-remote/' + DEFAULT_BRANCH,
repo.get_remote_branches())
@skip_nomultiplex_ssh
@with_tempfile
@with_tempfile
def test_GitRepo_ssh_push(repo_path=None, remote_path=None):
from datalad import ssh_manager
remote_repo = GitRepo(remote_path, create=True)
url = _path2localsshurl(remote_path)
socket_path = op.join(str(ssh_manager.socket_dir),
get_connection_hash('datalad-test'))
repo = GitRepo(repo_path, create=True)
repo.add_remote("ssh-remote", url)
# modify local repo:
repo.checkout("ssh-test", ['-b'])
with open(op.join(repo.path, "ssh_testfile.dat"), "w") as f:
f.write("whatever")
repo.add("ssh_testfile.dat")
repo.commit("ssh_testfile.dat added.")
# file is not known to the remote yet:
assert_not_in("ssh_testfile.dat", remote_repo.get_indexed_files())
# push changes:
pushed = list(repo.push(remote="ssh-remote", refspec="ssh-test"))
# test PushInfo
assert_in("refs/heads/ssh-test", [p['from_ref'] for p in pushed])
assert_in("refs/heads/ssh-test", [p['to_ref'] for p in pushed])
# the connection is known to the SSH manager, since fetch() requested it:
assert_in(socket_path, list(map(str, ssh_manager._connections)))
# and socket was created:
ok_(op.exists(socket_path))
# remote now knows the changes:
assert_in("ssh-test", remote_repo.get_branches())
assert_in("ssh_testfile.dat", remote_repo.get_files("ssh-test"))
# amend to make it require "--force":
repo.commit("amended", options=['--amend'])
# push without --force should yield an error:
res = repo.push(remote="ssh-remote", refspec="ssh-test")
assert_in_results(
res,
from_ref='refs/heads/ssh-test',
to_ref='refs/heads/ssh-test',
operations=['rejected', 'error'],
note='[rejected] (non-fast-forward)',
remote='ssh-remote',
)
# now push using force:
repo.push(remote="ssh-remote", refspec="ssh-test", force=True)
# correct commit message in remote:
assert_in("amended",
remote_repo.format_commit(
'%s',
list(remote_repo.get_branch_commits_('ssh-test'))[-1]
))
@with_tempfile
@with_tempfile
def test_GitRepo_push_n_checkout(orig_path=None, clone_path=None):
origin = GitRepo(orig_path)
clone = GitRepo.clone(orig_path, clone_path)
filename = get_most_obscure_supported_name()
with open(op.join(clone_path, filename), 'w') as f:
f.write("New file.")
clone.add(filename)
clone.commit("new file added.")
# TODO: need checkout first:
clone.push(DEFAULT_REMOTE, '+{}:new-branch'.format(DEFAULT_BRANCH))
origin.checkout('new-branch')
ok_(op.exists(op.join(orig_path, filename)))
@with_tempfile
@with_tempfile
@with_tempfile
def test_GitRepo_remote_update(path1=None, path2=None, path3=None):
git1 = GitRepo(path1)
git2 = GitRepo(path2)
git3 = GitRepo(path3)
git1.add_remote('git2', path2)
git1.add_remote('git3', path3)
# Setting up remote 'git2'
with open(op.join(path2, 'masterfile'), 'w') as f:
f.write("git2 in master")
git2.add('masterfile')
git2.commit("Add something to master.")
git2.checkout('branch2', ['-b'])
with open(op.join(path2, 'branch2file'), 'w') as f:
f.write("git2 in branch2")
git2.add('branch2file')
git2.commit("Add something to branch2.")
# Setting up remote 'git3'
with open(op.join(path3, 'masterfile'), 'w') as f:
f.write("git3 in master")
git3.add('masterfile')
git3.commit("Add something to master.")
git3.checkout('branch3', ['-b'])
with open(op.join(path3, 'branch3file'), 'w') as f:
f.write("git3 in branch3")
git3.add('branch3file')
git3.commit("Add something to branch3.")
git1.update_remote()
# checkouts are 'tests' themselves, since they'll raise CommandError
# if something went wrong
git1.checkout('branch2')
git1.checkout('branch3')
branches1 = git1.get_branches()
eq_({'branch2', 'branch3'}, set(branches1))
@with_tempfile
@with_tempfile
def test_GitRepo_get_files(src_path=None, path=None):
src = GitRepo(src_path)
for filename in ('some1.txt', 'some2.dat'):
with open(op.join(src_path, filename), 'w') as f:
f.write(filename)
src.add(filename)
src.commit('Some files')
gr = GitRepo.clone(src.path, path)
# get the expected files via os for comparison:
os_files = set()
for (dirpath, dirnames, filenames) in os.walk(path):
rel_dir = os.path.relpath(dirpath, start=path)
if rel_dir.startswith(".git"):
continue
for file_ in filenames:
file_path = os.path.normpath(op.join(rel_dir, file_))
os_files.add(file_path)
# get the files via GitRepo:
local_files = set(gr.get_files())
remote_files = set(gr.get_files(
branch=f"{DEFAULT_REMOTE}/{DEFAULT_BRANCH}"))
eq_(local_files, set(gr.get_indexed_files()))
eq_(local_files, remote_files)
eq_(local_files, os_files)
# create a different branch:
gr.checkout('new_branch', ['-b'])
filename = 'another_file.dat'
with open(op.join(path, filename), 'w') as f:
f.write("something")
gr.add(filename)
gr.commit("Added.")
# now get the files again:
local_files = set(gr.get_files())
eq_(local_files, os_files.union({filename}))
# retrieve remote branch again, which should not have changed:
remote_files = set(gr.get_files(
branch=f"{DEFAULT_REMOTE}/{DEFAULT_BRANCH}"))
eq_(remote_files, os_files)
eq_(set([filename]), local_files.difference(remote_files))
# switch back and query non-active branch:
gr.checkout(DEFAULT_BRANCH)
local_files = set(gr.get_files())
branch_files = set(gr.get_files(branch="new_branch"))
eq_(set([filename]), branch_files.difference(local_files))
@with_tempfile
@with_tempfile(mkdir=True)
@with_tempfile
def test_GitRepo_get_toppath(repo=None, tempdir=None, repo2=None):
GitRepo(repo, create=True)
reporeal = str(Path(repo).resolve())
eq_(GitRepo.get_toppath(repo, follow_up=False), reporeal)
eq_(GitRepo.get_toppath(repo), repo)
# Generate some nested directory
GitRepo(repo2, create=True)
repo2real = str(Path(repo2).resolve())
nested = op.join(repo2, "d1", "d2")
os.makedirs(nested)
eq_(GitRepo.get_toppath(nested, follow_up=False), repo2real)
eq_(GitRepo.get_toppath(nested), repo2)
# and if not under git, should return None
eq_(GitRepo.get_toppath(tempdir), None)
@with_tempfile(mkdir=True)
def test_GitRepo_dirty(path=None):
repo = GitRepo(path, create=True)
ok_(not repo.dirty)
# untracked file
with open(op.join(path, 'file1.txt'), 'w') as f:
f.write('whatever')
ok_(repo.dirty)
# staged file
repo.add('file1.txt')
ok_(repo.dirty)
# clean again
repo.commit("file1.txt added")
ok_(not repo.dirty)
# modify to be the same
with open(op.join(path, 'file1.txt'), 'w') as f:
f.write('whatever')
ok_(not repo.dirty)
# modified file
with open(op.join(path, 'file1.txt'), 'w') as f:
f.write('something else')
ok_(repo.dirty)
# clean again
repo.add('file1.txt')
repo.commit("file1.txt modified")
ok_(not repo.dirty)
# An empty directory doesn't count as dirty.
os.mkdir(op.join(path, "empty"))
ok_(not repo.dirty)
# Neither does an empty directory with an otherwise empty directory.
os.mkdir(op.join(path, "empty", "empty-again"))
ok_(not repo.dirty)
subm = GitRepo(repo.pathobj / "subm", create=True)
(subm.pathobj / "subfile").write_text(u"")
subm.save()
repo.save()
ok_(not repo.dirty)
(subm.pathobj / "subfile").write_text(u"changed")
ok_(repo.dirty)
# User configuration doesn't affect .dirty's answer.
repo.config.set("diff.ignoreSubmodules", "all", scope="local")
ok_(repo.dirty)
# GitRepo.commit currently can't handle this setting, so remove it for the
# save() calls below.
repo.config.unset("diff.ignoreSubmodules", scope="local")
subm.save()
repo.save()
ok_(not repo.dirty)
repo.config.set("status.showUntrackedFiles", "no", scope="local")
create_tree(repo.path, {"untracked_dir": {"a": "a"}})
ok_(repo.dirty)
@with_tempfile(mkdir=True)
def test_GitRepo_get_merge_base(src=None):
repo = GitRepo(src, create=True)
with open(op.join(src, 'file.txt'), 'w') as f:
f.write('load')
repo.add('*')
repo.commit('committing')
assert_raises(ValueError, repo.get_merge_base, [])
branch1 = repo.get_active_branch()
branch1_hexsha = repo.get_hexsha()
eq_(len(branch1_hexsha), 40)
eq_(repo.get_merge_base(branch1), branch1_hexsha)
# Let's create a detached branch
branch2 = "_detach_"
repo.checkout(branch2, options=["--orphan"])
# it will have all the files
# Must not do: https://github.com/gitpython-developers/GitPython/issues/375
# repo.git_add('.')
repo.add('*')
# NOTE: fun part is that we should have at least a different commit message
# so it results in a different checksum ;)
repo.commit("committing again")
assert(repo.get_indexed_files()) # we did commit
assert(repo.get_merge_base(branch1) is None)
assert(repo.get_merge_base([branch2, branch1]) is None)
# Let's merge them up -- then merge base should match the master
repo.merge(branch1, allow_unrelated=True)
eq_(repo.get_merge_base(branch1), branch1_hexsha)
# if points to some empty/non-existing branch - should also be None
assert(repo.get_merge_base(['nonexistent', branch2]) is None)
@with_tempfile(mkdir=True)
def test_GitRepo_git_get_branch_commits_(src=None):
repo = GitRepo(src, create=True)
with open(op.join(src, 'file.txt'), 'w') as f:
f.write('load')
repo.add('*')
repo.commit('committing')
# go in a branch with a name that matches the file to require
# proper disambiguation
repo.call_git(['checkout', '-b', 'file.txt'])
commits_default = list(repo.get_branch_commits_())
commits = list(repo.get_branch_commits_(DEFAULT_BRANCH))
eq_(commits, commits_default)
eq_(len(commits), 1)
@with_tempfile
@with_tempfile
def test_get_tracking_branch(o_path=None, c_path=None):
src = GitRepo(o_path)
for filename in ('some1.txt', 'some2.dat'):
with open(op.join(o_path, filename), 'w') as f:
f.write(filename)
src.add(filename)
src.commit('Some files')
clone = GitRepo.clone(o_path, c_path)
# Note, that the default branch might differ even if it is always 'master'.
# For direct mode annex repositories it would then be "annex/direct/master"
# for example. Therefore use whatever branch is checked out by default:
master_branch = clone.get_active_branch()
ok_(master_branch)
eq_((DEFAULT_REMOTE, 'refs/heads/' + master_branch),
clone.get_tracking_branch())
clone.checkout('new_branch', ['-b'])
eq_((None, None), clone.get_tracking_branch())
eq_((DEFAULT_REMOTE, 'refs/heads/' + master_branch),
clone.get_tracking_branch(master_branch))
clone.checkout(master_branch, options=["--track", "-btopic"])
eq_(('.', 'refs/heads/' + master_branch),
clone.get_tracking_branch())
eq_((None, None),
clone.get_tracking_branch(remote_only=True))
@with_tempfile
def test_GitRepo_get_submodules(path=None):
repo = GitRepo(path, create=True)
s_abc = GitRepo(op.join(path, "s_abc"), create=True)
s_abc.commit(msg="c s_abc", options=["--allow-empty"])
repo.save(path="s_abc")
s_xyz = GitRepo(op.join(path, "s_xyz"), create=True)
s_xyz.commit(msg="c s_xyz", options=["--allow-empty"])
repo.save(path="s_xyz")
eq_([s["gitmodule_name"]
for s in repo.get_submodules(sorted_=True)],
["s_abc", "s_xyz"])
# Limit by path
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=["s_abc"])],
["s_abc"])
# Pointing to a path within submodule should include it too
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=["s_abc/unrelated"])],
["s_abc"])
# top level should list all submodules
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=[repo.path])],
["s_abc", "s_xyz"])
# Limit by non-existing/non-matching path
eq_([s["gitmodule_name"]
for s in repo.get_submodules(paths=["s_unknown"])],
[])
@with_tempfile
def test_get_submodules_parent_on_unborn_branch(path=None):
repo = GitRepo(path, create=True)
subrepo = GitRepo(op.join(path, "sub"), create=True)
subrepo.commit(msg="s", options=["--allow-empty"])
repo.save(path="sub")
eq_([s["gitmodule_name"] for s in repo.get_submodules_()],
["sub"])
def test_to_options():
class Some(object):
def cmd_func(self, git_options=None, annex_options=None, options=None):
git_options = git_options[:] if git_options else []
annex_options = annex_options[:] if annex_options else []
options = options[:] if options else []
faked_cmd_call = ['git'] + git_options + ['annex'] + \
annex_options + ['my_cmd'] + options
return faked_cmd_call
eq_(Some().cmd_func(options=to_options(m="bla", force=True)),
['git', 'annex', 'my_cmd', '--force', '-m', 'bla'])
eq_(Some().cmd_func(git_options=to_options(C="/some/where"),
annex_options=to_options(JSON=True),
options=to_options(unused=True)),
['git', '-C', '/some/where', 'annex', '--JSON', 'my_cmd', '--unused'])
eq_(Some().cmd_func(git_options=to_options(C="/some/where", split_single_char_options=False),
annex_options=to_options(JSON=True),
options=to_options(unused=True)),
['git', '-C/some/where', 'annex', '--JSON', 'my_cmd', '--unused'])
def test_to_options_from_gitpython():
"""Imported from GitPython and modified.
Original copyright:
Copyright (C) 2008, 2009 Michael Trier and contributors
Original license:
BSD 3-Clause "New" or "Revised" License
"""
eq_(["-s"], to_options(**{'s': True}))
eq_(["-s", "5"], to_options(**{'s': 5}))
eq_([], to_options(**{'s': None}))
eq_(["--max-count"], to_options(**{'max_count': True}))
eq_(["--max-count=5"], to_options(**{'max_count': 5}))
eq_(["--max-count=0"], to_options(**{'max_count': 0}))
eq_([], to_options(**{'max_count': None}))
# Multiple args are supported by using lists/tuples
eq_(["-L", "1-3", "-L", "12-18"], to_options(**{'L': ('1-3', '12-18')}))
eq_(["-C", "-C"], to_options(**{'C': [True, True, None, False]}))
# order is undefined
res = to_options(**{'s': True, 't': True})
eq_({'-s', '-t'}, set(res))
@with_tempfile
def test_GitRepo_count_objects(repo_path=None):
repo = GitRepo(repo_path, create=True)
# test if dictionary returned
eq_(isinstance(repo.count_objects, dict), True)
# test if dictionary contains keys and values we expect
empty_count = {'count': 0, 'garbage': 0, 'in-pack': 0, 'packs': 0, 'prune-packable': 0,
'size': 0, 'size-garbage': 0, 'size-pack': 0}
eq_(empty_count, repo.count_objects)
# this is simply broken on win, but less important
# https://github.com/datalad/datalad/issues/3639
@skip_if_on_windows
@with_tempfile
def test_optimized_cloning(path=None):
# make test repo with one file and one commit
originpath = op.join(path, 'origin')
repo = GitRepo(originpath, create=True)
with open(op.join(originpath, 'test'), 'w') as f:
f.write('some')
repo.add('test')
repo.commit('init')
assert_repo_status(originpath, annex=False)
from glob import glob
def _get_inodes(repo):
return dict(
[(os.path.join(*o.split(os.sep)[-2:]),
os.stat(o).st_ino)
for o in glob(os.path.join(repo.path,
repo.get_git_dir(repo),
'objects', '*', '*'))])
origin_inodes = _get_inodes(repo)
# now clone it in different ways and see what happens to the object storage
from datalad.support.network import get_local_file_url
clonepath = op.join(path, 'clone')
for src in (originpath, get_local_file_url(originpath, compatibility='git')):
clone = GitRepo.clone(url=src, path=clonepath, create=True)
clone_inodes = _get_inodes(clone)
eq_(origin_inodes, clone_inodes, msg='with src={}'.format(src))
rmtree(clonepath)
# del clone
# gc.collect()
# Note: del needed, since otherwise WeakSingletonRepo would just
# return the original object in second run
@with_tempfile(mkdir=True)
@with_tempfile(mkdir=True)
def test_GitRepo_flyweight(path1=None, path2=None):
import gc
repo1 = GitRepo(path1, create=True)
assert_is_instance(repo1, GitRepo)
# Due to issue 4862, we currently still require gc.collect() under unclear
# circumstances to get rid of an exception traceback when creating in an
# existing directory. That traceback references the respective function
# frames which in turn reference the repo instance (they are methods).
# Doesn't happen on all systems, though. Eventually we need to figure that
# out.
# However, still test for the refcount after gc.collect() to ensure we don't
# introduce new circular references and make the issue worse!
gc.collect()
# As long as we don't reintroduce any circular references or produce
# garbage during instantiation that isn't picked up immediately, `repo1`
# should be the only counted reference to this instance.
# Note, that sys.getrefcount reports its own argument and therefore one
# reference too much.
# Python 3.14+ changed internal reference handling - the interpreter now
# "borrows" references when loading objects onto the operand stack instead
# of incrementing refcount, leading to different sys.getrefcount() values.
# Per Python docs: "do not rely on the returned value to be accurate,
# other than a value of 0 or 1". The actual test for circular references
# is whether the object gets garbage collected below (lines ~1165-1185) -
# if circular refs existed, the finalizer wouldn't be called.
if sys.version_info < (3, 14):
assert_equal(1, sys.getrefcount(repo1) - 1)
# instantiate again:
repo2 = GitRepo(path1, create=False)
assert_is_instance(repo2, GitRepo)
# the very same object:
ok_(repo1 is repo2)
# reference the same in a different way:
with chpwd(path1):
repo3 = GitRepo(op.relpath(path1, start=path2), create=False)
# it's the same object:
ok_(repo1 is repo3)
# and realpath attribute is the same, so they are still equal:
ok_(repo1 == repo3)
orig_id = id(repo1)
# Be sure we have exactly one object in memory:
assert_equal(1, len([o for o in gc.get_objects()
if isinstance(o, GitRepo) and o.path == path1]))
# deleting one reference doesn't change anything - we still get the same
# thing:
gc.collect() # TODO: see first comment above
del repo1
ok_(repo2 is not None)
ok_(repo2 is repo3)
ok_(repo2 == repo3)
# re-requesting still delivers the same thing:
repo1 = GitRepo(path1)
assert_equal(orig_id, id(repo1))
# killing all references should result in the instance being gc'd and
# re-request yields a new object:
del repo1
del repo2
# Killing last reference will lead to garbage collection which will call
# GitRepo's finalizer:
with swallow_logs(new_level=1) as cml:
del repo3
gc.collect() # TODO: see first comment above
cml.assert_logged(msg="Finalizer called on: GitRepo(%s)" % path1,
level="Level 1",
regex=False)
# Flyweight is gone:
assert_not_in(path1, GitRepo._unique_instances.keys())
# gc doesn't know any instance anymore:
assert_equal([], [o for o in gc.get_objects()
if isinstance(o, GitRepo) and o.path == path1])
# new object is created on re-request:
repo1 = GitRepo(path1)
assert_equal(1, len([o for o in gc.get_objects()
if isinstance(o, GitRepo) and o.path == path1]))
@with_tree(tree={'ignore-sub.me': {'a_file.txt': 'some content'},
'ignore.me': 'ignored content',
'dontigno.re': 'other content'})
def test_GitRepo_gitignore(path=None):
gr = GitRepo(path, create=True)
sub = GitRepo(op.join(path, 'ignore-sub.me'))
# we need to commit something, otherwise add_submodule
# will already refuse the submodule for having no commit
sub.add('a_file.txt')
sub.commit()
from ..exceptions import GitIgnoreError
with open(op.join(path, '.gitignore'), "w") as f:
f.write("*.me")
with assert_raises(GitIgnoreError) as cme:
gr.add('ignore.me')
eq_(cme.value.paths, ['ignore.me'])
with assert_raises(GitIgnoreError) as cme:
gr.add(['ignore.me', 'dontigno.re', op.join('ignore-sub.me', 'a_file.txt')])
eq_(set(cme.value.paths), {'ignore.me', 'ignore-sub.me'})
eq_(gr.get_gitattributes('.')['.'], {}) # nothing is recorded within .gitattributes
@with_tempfile(mkdir=True)
def test_GitRepo_set_remote_url(path=None):
gr = GitRepo(path, create=True)
gr.add_remote('some', 'http://example.com/.git')
eq_(gr.config['remote.some.url'],
'http://example.com/.git')
# change url:
gr.set_remote_url('some', 'http://believe.it')
eq_(gr.config['remote.some.url'],
'http://believe.it')
# set push url:
gr.set_remote_url('some', 'ssh://whatever.ru', push=True)
eq_(gr.config['remote.some.pushurl'],
'ssh://whatever.ru')
# add remote without url
url2 = 'http://repo2.example.com/.git'
gr.add_remote('some-without-url', url2)
eq_(gr.config['remote.some-without-url.url'], url2)
# "remove" it
gr.config.unset('remote.some-without-url.url', scope='local')
with assert_raises(KeyError):
gr.config['remote.some-without-url.url']
eq_(set(gr.get_remotes()), {'some', 'some-without-url'})
eq_(set(gr.get_remotes(with_urls_only=True)), {'some'})
@with_tempfile(mkdir=True)
def test_gitattributes(path=None):
gr = GitRepo(path, create=True)
# starts without any attributes file
ok_(not op.exists(op.join(gr.path, '.gitattributes')))
eq_(gr.get_gitattributes('.')['.'], {})
# bool is a tag or unsets, anything else is key/value
gr.set_gitattributes([('*', {'tag': True}), ('*', {'sec.key': 'val'})])
ok_(op.exists(op.join(gr.path, '.gitattributes')))
eq_(gr.get_gitattributes('.')['.'], {'tag': True, 'sec.key': 'val'})
# unset by amending the record, but does not remove notion of the
# tag entirely
gr.set_gitattributes([('*', {'tag': False})])
eq_(gr.get_gitattributes('.')['.'], {'tag': False, 'sec.key': 'val'})
# attributes file is not added or committed, we can ignore such
# attributes
eq_(gr.get_gitattributes('.', index_only=True)['.'], {})
# we can send absolute path patterns and write to any file, and
# the patterns will be translated relative to the target file
gr.set_gitattributes([
(op.join(gr.path, 'relative', 'ikethemike/**'), {'bang': True})],
attrfile=op.join('relative', '.gitattributes'))
# directory and file get created
ok_(op.exists(op.join(gr.path, 'relative', '.gitattributes')))
eq_(gr.get_gitattributes(
op.join(gr.path, 'relative', 'ikethemike', 'probe')),
# always comes out relative to the repo root, even if abs goes in
{op.join('relative', 'ikethemike', 'probe'):
{'tag': False, 'sec.key': 'val', 'bang': True}})
if get_encoding_info()['default'] != 'ascii' and not on_windows:
# do not perform this on obscure systems without anything like UTF
# it is not relevant whether a path actually exists, and paths
# with spaces and other funky stuff are just fine
funky = u'{} {}'.format(
get_most_obscure_supported_name(),
get_most_obscure_supported_name())
gr.set_gitattributes([(funky, {'this': 'that'})])
eq_(gr.get_gitattributes(funky)[funky], {
'this': 'that',
'tag': False,
'sec.key': 'val',
})
# mode='w' should replace the entire file:
gr.set_gitattributes([('**', {'some': 'nonsense'})], mode='w')
eq_(gr.get_gitattributes('.')['.'], {'some': 'nonsense'})
# mode='a' appends additional key/value
gr.set_gitattributes([('*', {'king': 'kong'})], mode='a')
eq_(gr.get_gitattributes('.')['.'], {'some': 'nonsense', 'king': 'kong'})
# handle files without trailing newline
with open(op.join(gr.path, '.gitattributes'), 'r+') as f:
s = f.read()
f.seek(0)
f.write(s.rstrip())
f.truncate()
gr.set_gitattributes([('*', {'ding': 'dong'})], mode='a')
eq_(gr.get_gitattributes('.')['.'],
{'some': 'nonsense', 'king': 'kong', 'ding': 'dong'})
@with_tempfile(mkdir=True)
def test_get_hexsha_tag(path=None):
gr = GitRepo(path, create=True)
gr.commit(msg="msg", options=["--allow-empty"])
gr.tag("atag", message="atag msg")
# get_hexsha() dereferences a tag to a commit.
eq_(gr.get_hexsha("atag"), gr.get_hexsha())
@with_tempfile(mkdir=True)
def test_get_tags(path=None):
from unittest.mock import patch
gr = GitRepo(path, create=True)
eq_(gr.get_tags(), [])
eq_(gr.describe(), None)
# Explicitly override the committer date because tests may set it to a
# fixed value, but we want to check that the returned tags are sorted by
# the date the tag (for annotaged tags) or commit (for lightweight tags)
# was created.
with patch.dict("os.environ", {"GIT_COMMITTER_DATE":
"Thu, 07 Apr 2005 22:13:13 +0200"}):
create_tree(gr.path, {'file': ""})
gr.add('file')
gr.commit(msg="msg")
eq_(gr.get_tags(), [])
eq_(gr.describe(), None)
gr.tag("nonannotated")
tags1 = [{'name': 'nonannotated', 'hexsha': gr.get_hexsha()}]
eq_(gr.get_tags(), tags1)
eq_(gr.describe(), None)
eq_(gr.describe(tags=True), tags1[0]['name'])
first_commit = gr.get_hexsha()
with patch.dict("os.environ", {"GIT_COMMITTER_DATE":
"Fri, 08 Apr 2005 22:13:13 +0200"}):
create_tree(gr.path, {'file': "123"})
gr.add('file')
gr.commit(msg="changed")
with patch.dict("os.environ", {"GIT_COMMITTER_DATE":
"Fri, 09 Apr 2005 22:13:13 +0200"}):
gr.tag("annotated", message="annotation")
# The annotated tag happened later, so it comes last.
tags2 = tags1 + [{'name': 'annotated', 'hexsha': gr.get_hexsha()}]
eq_(gr.get_tags(), tags2)
eq_(gr.describe(), tags2[1]['name'])
# compare prev commit
eq_(gr.describe(commitish=first_commit), None)
eq_(gr.describe(commitish=first_commit, tags=True), tags1[0]['name'])
gr.tag('specific', commit='HEAD~1')
eq_(gr.get_hexsha('specific'), gr.get_hexsha('HEAD~1'))
assert_in('specific', gr.get_tags(output='name'))
# retag a different commit
assert_raises(CommandError, gr.tag, 'specific', commit='HEAD')
# force it
gr.tag('specific', commit='HEAD', options=['-f'])
eq_(gr.get_hexsha('specific'), gr.get_hexsha('HEAD'))
# delete
gr.call_git(['tag', '-d', 'specific'])
eq_(gr.get_tags(), tags2)
# more than one
gr.tag('one')
gr.tag('two')
gr.call_git(['tag', '-d', 'one', 'two'])
eq_(gr.get_tags(), tags2)
@with_tree(tree={'1': ""})
def test_get_commit_date(path=None):
gr = GitRepo(path, create=True)
eq_(gr.get_commit_date(), None)
# Let's make a commit with a custom date
DATE = "Wed Mar 14 03:47:30 2018 -0000"
DATE_EPOCH = 1520999250
gr.add('1')
gr.commit("committed", date=DATE)
gr = GitRepo(path, create=True)
date = gr.get_commit_date()
neq_(date, None)
eq_(date, DATE_EPOCH)
eq_(date, gr.get_commit_date(DEFAULT_BRANCH))
# and even if we get into a detached head
gr.checkout(gr.get_hexsha())
eq_(gr.get_active_branch(), None)
eq_(date, gr.get_commit_date(DEFAULT_BRANCH))
@with_tree(tree={"foo": "foo content",
"bar": "bar content"})
def test_fake_dates(path=None):
gr = GitRepo(path, create=True, fake_dates=True)
gr.add("foo")
gr.commit("commit foo")
seconds_initial = gr.config.obtain("datalad.fake-dates-start")
# First commit is incremented by 1 second.
eq_(seconds_initial + 1, gr.get_commit_date())
# The second commit by 2.
gr.add("bar")
gr.commit("commit bar")
eq_(seconds_initial + 2, gr.get_commit_date())
# If we checkout another branch, its time is still based on the latest
# timestamp in any local branch.
gr.checkout("other", options=["--orphan"])
with open(op.join(path, "baz"), "w") as ofh:
ofh.write("baz content")
gr.add("baz")
gr.commit("commit baz")
eq_(gr.get_active_branch(), "other")
eq_(seconds_initial + 3, gr.get_commit_date())
@slow # 15sec on Yarik's laptop and tripped Travis CI
@with_tempfile(mkdir=True)
def test_duecredit(path=None):
# Just to check that no obvious side-effects
run = WitlessRunner(cwd=path).run
cmd = [
sys.executable, "-c",
"from datalad.support.gitrepo import GitRepo; GitRepo(%r, create=True)" % path
]
env = os.environ.copy()
# Test with duecredit not enabled for sure
env.pop('DUECREDIT_ENABLE', None)
# Alternative workaround for what to be fixed by
# https://github.com/datalad/datalad/pull/3215
# where underlying datalad process might issue a warning since our temp
# cwd is not matching possibly present PWD env variable
env.pop('PWD', None)
out = run(cmd, env=env, protocol=StdOutErrCapture)
outs = ''.join(out.values()) # Let's not depend on where duecredit decides to spit out
# All quiet
test_string = 'Data management and distribution platform'
assert_not_in(test_string, outs)
# and now enable DUECREDIT - output could come to stderr
env['DUECREDIT_ENABLE'] = '1'
out = run(cmd, env=env, protocol=StdOutErrCapture)
outs = ''.join(out.values())
if external_versions['duecredit']:
assert_in(test_string, outs)
else:
assert_not_in(test_string, outs)
@with_tempfile(mkdir=True)
def test_GitRepo_get_revisions(path=None):
gr = GitRepo(path, create=True)
def commit(msg):
gr.commit(msg=msg, options=["--allow-empty"])
# We catch the error and return empty if the current branch doesn't have a
# commit checked out.
eq_(gr.get_revisions(), [])
# But will raise if on a bad ref name, including an unborn branch.
with assert_raises(CommandError):
gr.get_revisions(DEFAULT_BRANCH)
# By default, we query HEAD.
commit("1")
eq_(len(gr.get_revisions()), 1)
gr.checkout("other", options=["-b"])
commit("2")
# We can also query branch by name.
eq_(len(gr.get_revisions(DEFAULT_BRANCH)), 1)
eq_(len(gr.get_revisions("other")), 2)
# "name" is sugar for ["name"].
eq_(gr.get_revisions(DEFAULT_BRANCH),
gr.get_revisions([DEFAULT_BRANCH]))
gr.checkout(DEFAULT_BRANCH)
commit("3")
eq_(len(gr.get_revisions(DEFAULT_BRANCH)), 2)
# We can pass multiple revisions...
eq_(len(gr.get_revisions([DEFAULT_BRANCH, "other"])), 3)
# ... or options like --all and --branches
eq_(gr.get_revisions([DEFAULT_BRANCH, "other"]),
gr.get_revisions(options=["--all"]))
# Ranges are supported.
eq_(gr.get_revisions(DEFAULT_BRANCH + ".."), [])
@xfail_buggy_annex_info
@with_tree({"foo": "foo",
".gitattributes": "* annex.largefiles=anything"})
def test_gitrepo_add_to_git_with_annex_v7(path=None):
from datalad.support.annexrepo import AnnexRepo
ar = AnnexRepo(path, create=True, version=7)
gr = GitRepo(path)
gr.add("foo")
gr.commit(msg="c1")
assert_false(ar.is_under_annex("foo"))
@with_tree({"foo": "foo", "bar": "bar"})
def test_gitrepo_call_git_methods(path=None):
gr = GitRepo(path)
gr.add(["foo", "bar"])
gr.commit(msg="foobar")
gr.call_git(["mv"], files=["foo", "foo.txt"])
ok_(op.exists(op.join(gr.path, 'foo.txt')))
for expect_fail, check in [(False, assert_in),
(True, assert_not_in)]:
with swallow_logs(new_level=logging.DEBUG) as cml:
with assert_raises(CommandError):
gr.call_git(["mv"], files=["notthere", "dest"],
expect_fail=expect_fail)
check("fatal: bad source", cml.out)
eq_(list(gr.call_git_items_(["ls-files"], read_only=True)),
["bar", "foo.txt"])
eq_(list(gr.call_git_items_(["ls-files", "-z"], sep="\0", read_only=True)),
["bar", "foo.txt"])
with assert_raises(AssertionError):
gr.call_git_oneline(["ls-files"], read_only=True)
eq_(gr.call_git_oneline(["ls-files"], files=["bar"], read_only=True),
"bar")
ok_(gr.call_git_success(["rev-parse", "HEAD^{commit}"], read_only=True))
with swallow_logs(new_level=logging.DEBUG) as cml:
assert_false(gr.call_git_success(["rev-parse", "HEAD^{blob}"],
read_only=True))
assert_not_in("expected blob type", cml.out)
@integration
# http is well tested already
# 'git' is not longer supported
@pytest.mark.parametrize("proto", ["https"])
@skip_if_no_network
@with_tempfile
def test_protocols(destdir=None, *, proto):
# git-annex-standalone build can get git bundle which would fail to
# download via https, resulting in messages such as
# fatal: unable to find remote helper for 'https'
# which happened with git-annex-standalone 7.20191017+git2-g7b13db551-1~ndall+1
GitRepo.clone('%s://github.com/datalad-tester/testtt' % proto, destdir)
@with_tempfile
def test_gitrepo_push_default_first_kludge(path=None):
path = Path(path)
repo_a = GitRepo(path / "a", bare=True)
repo_b = GitRepo.clone(repo_a.path, str(path / "b"))
(repo_b.pathobj / "foo").write_text("foo")
repo_b.save()
# push() usually pushes all refspecs in one go.
with swallow_logs(new_level=logging.DEBUG) as cml:
res_oneshot = repo_b.push(remote=DEFAULT_REMOTE,
refspec=[DEFAULT_BRANCH + ":b-oneshot",
DEFAULT_BRANCH + ":a-oneshot",
DEFAULT_BRANCH + ":c-oneshot"])
cmds_oneshot = [ln for ln in cml.out.splitlines()
if "Run" in ln and "push" in ln and DEFAULT_BRANCH in ln]
eq_(len(cmds_oneshot), 1)
assert_in(":a-oneshot", cmds_oneshot[0])
assert_in(":b-oneshot", cmds_oneshot[0])
assert_in(":c-oneshot", cmds_oneshot[0])
eq_(len(res_oneshot), 3)
# But if datalad-push-default-first is set...
cfg_var = f"remote.{DEFAULT_REMOTE}.datalad-push-default-first"
repo_b.config.set(cfg_var, "true", scope="local")
with swallow_logs(new_level=logging.DEBUG) as cml:
res_twoshot = repo_b.push(remote=DEFAULT_REMOTE,
refspec=[DEFAULT_BRANCH + ":b-twoshot",
DEFAULT_BRANCH + ":a-twoshot",
DEFAULT_BRANCH + ":c-twoshot"])
cmds_twoshot = [ln for ln in cml.out.splitlines()
if "Run" in ln and "push" in ln and DEFAULT_BRANCH in ln]
# ... there are instead two git-push calls.
eq_(len(cmds_twoshot), 2)
# The first is for the first item of the refspec.
assert_in(":b-twoshot", cmds_twoshot[0])
assert_not_in(":b-twoshot", cmds_twoshot[1])
# The remaining items are in the second call.
assert_in(":a-twoshot", cmds_twoshot[1])
assert_in(":c-twoshot", cmds_twoshot[1])
assert_not_in(":c-twoshot", cmds_twoshot[0])
assert_not_in(":a-twoshot", cmds_twoshot[0])
# The result returned by push() has the same number of records, though.
eq_(len(res_twoshot), 3)
# The configuration variable is removed afterward.
assert_false(repo_b.config.get(cfg_var))
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_globbedpaths.py 0000644 0001751 0001751 00000017716 15137634221 023113 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-; coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""test GlobbedPaths
"""
__docformat__ = 'restructuredtext'
import logging
import os.path as op
from itertools import product
from unittest.mock import patch
from datalad.tests.utils_pytest import (
OBSCURE_FILENAME,
assert_in,
eq_,
swallow_logs,
with_tree,
)
from ..globbedpaths import GlobbedPaths
def test_globbedpaths_get_sub_patterns():
gp = GlobbedPaths([], "doesn't matter")
for pat, expected in [
# If there are no patterns in the directory component, we get no
# sub-patterns.
("", []),
("nodir", []),
(op.join("nomagic", "path"), []),
(op.join("nomagic", "path*"), []),
# Create sub-patterns from leading path, successively dropping the
# right-most component.
(op.join("s*", "path"), ["s*" + op.sep]),
(op.join("s", "ss*", "path"), [op.join("s", "ss*") + op.sep]),
(op.join("s", "ss*", "path*"), [op.join("s", "ss*") + op.sep]),
(op.join("s", "ss*" + op.sep), []),
(op.join("s*", "ss", "path*"),
[op.join("s*", "ss") + op.sep,
"s*" + op.sep]),
(op.join("s?", "ss", "sss*", "path*"),
[op.join("s?", "ss", "sss*") + op.sep,
op.join("s?", "ss") + op.sep,
"s?" + op.sep])]:
eq_(gp._get_sub_patterns(pat), expected)
bOBSCURE_FILENAME = f"b{OBSCURE_FILENAME}.dat"
@with_tree(tree={"1.txt": "",
"2.dat": "",
"3.txt": "",
bOBSCURE_FILENAME: "",
"subdir": {"1.txt": "", "2.txt": "", "subsub": {"3.dat": ""}}})
def test_globbedpaths(path=None):
dotdir = op.curdir + op.sep
for patterns, expected in [
(["1.txt", "2.dat"], {"1.txt", "2.dat"}),
([dotdir + "1.txt", "2.dat"], {dotdir + "1.txt", "2.dat"}),
(["*.txt", "*.dat"], {"1.txt", "2.dat", bOBSCURE_FILENAME, "3.txt"}),
([dotdir + "*.txt", "*.dat"],
{dotdir + "1.txt", "2.dat", bOBSCURE_FILENAME, dotdir + "3.txt"}),
([op.join("subdir", "*.txt")],
{op.join("subdir", "1.txt"), op.join("subdir", "2.txt")}),
(["subdir" + op.sep], {"subdir" + op.sep}),
([dotdir + op.join("subdir", "*.txt")],
{dotdir + op.join(*ps)
for ps in [("subdir", "1.txt"), ("subdir", "2.txt")]}),
(["*.txt"], {"1.txt", "3.txt"}),
([op.join("subdir", "**")],
{op.join(*ps)
for ps in [("subdir" + op.sep,), ("subdir", "subsub"),
("subdir", "1.txt"), ("subdir", "2.txt"),
("subdir", "subsub", "3.dat")]}),
([dotdir + op.join("**", "*.dat")],
{dotdir + op.join("2.dat"), dotdir + bOBSCURE_FILENAME,
dotdir + op.join("subdir", "subsub", "3.dat")})]:
gp = GlobbedPaths(patterns, pwd=path)
eq_(set(gp.expand()), expected)
eq_(set(gp.expand(full=True)),
{op.join(path, p) for p in expected})
pardir = op.pardir + op.sep
subdir_path = op.join(path, "subdir")
for patterns, expected in [
(["*.txt"], {"1.txt", "2.txt"}),
([dotdir + "*.txt"], {dotdir + p for p in ["1.txt", "2.txt"]}),
([pardir + "*.txt"], {pardir + p for p in ["1.txt", "3.txt"]}),
([dotdir + pardir + "*.txt"],
{dotdir + pardir + p for p in ["1.txt", "3.txt"]}),
# Patterns that don't match are retained by default.
(["amiss"], {"amiss"})]:
gp = GlobbedPaths(patterns, pwd=subdir_path)
eq_(set(gp.expand()), expected)
eq_(set(gp.expand(full=True)),
{op.join(subdir_path, p) for p in expected})
# Full patterns still get returned as relative to pwd.
gp = GlobbedPaths([op.join(path, "*.dat")], pwd=path)
eq_(gp.expand(), ["2.dat", bOBSCURE_FILENAME])
# "." gets special treatment.
gp = GlobbedPaths([".", "*.dat"], pwd=path)
eq_(set(gp.expand()), {"2.dat", bOBSCURE_FILENAME, "."})
eq_(gp.expand(dot=False), ["2.dat", bOBSCURE_FILENAME])
gp = GlobbedPaths(["."], pwd=path, expand=False)
eq_(gp.expand(), ["."])
eq_(gp.paths, ["."])
# We can the glob outputs.
glob_results = {"z": "z",
"a": ["x", "d", "b"]}
with patch('glob.glob', lambda k, **kwargs: glob_results[k]):
gp = GlobbedPaths(["z", "a"])
eq_(gp.expand(), ["z", "b", "d", "x"])
# glob expansion for paths property is determined by expand argument.
for expand, expected in [(True, ["2.dat", bOBSCURE_FILENAME]),
(False, ["*.dat"])]:
gp = GlobbedPaths(["*.dat"], pwd=path, expand=expand)
eq_(gp.paths, expected)
with swallow_logs(new_level=logging.DEBUG) as cml:
GlobbedPaths(["not here"], pwd=path).expand()
assert_in("No matching files found for 'not here'", cml.out)
@with_tree(tree={"1.txt": "", "2.dat": "", "3.txt": ""})
def test_globbedpaths_misses(path=None):
gp = GlobbedPaths(["amiss"], pwd=path)
eq_(gp.expand_strict(), [])
eq_(gp.misses, ["amiss"])
eq_(gp.expand(include_misses=True), ["amiss"])
# miss at beginning
gp = GlobbedPaths(["amiss", "*.txt", "*.dat"], pwd=path)
eq_(gp.expand_strict(), ["1.txt", "3.txt", "2.dat"])
eq_(gp.expand(include_misses=True),
["amiss", "1.txt", "3.txt", "2.dat"])
# miss in middle
gp = GlobbedPaths(["*.txt", "amiss", "*.dat"], pwd=path)
eq_(gp.expand_strict(), ["1.txt", "3.txt", "2.dat"])
eq_(gp.misses, ["amiss"])
eq_(gp.expand(include_misses=True),
["1.txt", "3.txt", "amiss", "2.dat"])
# miss at end
gp = GlobbedPaths(["*.txt", "*.dat", "amiss"], pwd=path)
eq_(gp.expand_strict(), ["1.txt", "3.txt", "2.dat"])
eq_(gp.misses, ["amiss"])
eq_(gp.expand(include_misses=True),
["1.txt", "3.txt", "2.dat", "amiss"])
# miss at beginning, middle, and end
gp = GlobbedPaths(["amiss1", "amiss2", "*.txt", "amiss3", "*.dat",
"amiss4"],
pwd=path)
eq_(gp.expand_strict(), ["1.txt", "3.txt", "2.dat"])
eq_(gp.misses, ["amiss1", "amiss2", "amiss3", "amiss4"])
eq_(gp.expand(include_misses=True),
["amiss1", "amiss2", "1.txt", "3.txt", "amiss3", "2.dat", "amiss4"])
# Property expands if needed.
gp = GlobbedPaths(["amiss"], pwd=path)
eq_(gp.misses, ["amiss"])
@with_tree(tree={"adir": {},
"bdir": {},
"other": {},
"1.txt": "", "2.dat": "", "3.txt": ""})
def test_globbedpaths_partial_matches(path=None):
gp = GlobbedPaths([op.join("?dir", "*.txt"), "*.txt"], pwd=path)
eq_(gp.expand_strict(), ["1.txt", "3.txt"])
expected_partial = ["adir" + op.sep, "bdir" + op.sep]
eq_(gp.partial_hits, expected_partial)
eq_(gp.expand(include_partial=True),
expected_partial + ["1.txt", "3.txt"])
# Property expands if needed.
gp = GlobbedPaths([op.join("?dir", "*.txt")], pwd=path)
eq_(gp.partial_hits, expected_partial)
@with_tree(tree={"1.txt": "",
"2.dat": "",
"3.txt": "",
"foo.dat": ""})
def test_globbedpaths_cached(path=None):
# Smoke test to trigger cache handling.
gp = GlobbedPaths([op.join("?", ".dat"), "*.txt"], pwd=path)
for full, partial, misses in product([False, True], repeat=3):
eq_(gp.expand(full=full,
include_misses=misses,
include_partial=partial),
gp.expand(full=full,
include_misses=misses,
include_partial=partial))
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_json_py.py 0000644 0001751 0001751 00000004731 15137634221 022127 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-; coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import logging
import os.path as op
from json import JSONDecodeError
from datalad.support.json_py import (
dump,
dump2stream,
load,
load_stream,
loads,
)
from datalad.tests.utils_pytest import (
assert_in,
assert_raises,
eq_,
swallow_logs,
with_tempfile,
)
@with_tempfile(content=b'{"Authors": ["A1"\xc2\xa0, "A2"]}')
def test_load_screwy_unicode(fname=None):
# test that we can tollerate some screwy unicode embeddings within json
assert_raises(JSONDecodeError, load, fname, fixup=False)
with swallow_logs(new_level=logging.WARNING) as cml:
eq_(load(fname), {'Authors': ['A1', 'A2']})
assert_in('Failed to decode content', cml.out)
@with_tempfile(content=u"""\
{"key0": "aβ¨b"}
{"key1": "plain"}""".encode("utf-8"))
def test_load_unicode_line_separator(fname=None):
# See gh-3523.
result = list(load_stream(fname))
eq_(len(result), 2)
eq_(result[0]["key0"], u"aβ¨b")
eq_(result[1]["key1"], u"plain")
def test_loads():
eq_(loads('{"a": 2}'), {'a': 2})
with assert_raises(JSONDecodeError),\
swallow_logs(new_level=logging.WARNING) as cml:
loads('{"a": 2}x')
assert_in('Failed to load content from', cml.out)
@with_tempfile(mkdir=True)
def test_compression(path=None):
fname = op.join(path, 'test.json.xz')
content = 'dummy'
# dump compressed
dump(content, fname, compressed=True)
# filename extension match auto-enabled compression "detection"
eq_(load(fname), content)
# but was it actually compressed?
# we don't care how exactly it blows up (UnicodeDecodeError, etc),
# but it has to blow
assert_raises(Exception, load, fname, compressed=False)
@with_tempfile
def test_dump(path=None):
assert(not op.exists(path))
# dump is nice and create the target directory
dump('some', op.join(path, 'file.json'))
assert(op.exists(path))
# at least a smoke test
@with_tempfile
def test_dump2stream(path=None):
stream = [dict(a=5), dict(b=4)]
dump2stream([dict(a=5), dict(b=4)], path)
eq_(list(load_stream(path)), stream)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_locking.py 0000644 0001751 0001751 00000015530 15137634221 022073 0 ustar 00runner runner #emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
#ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import os
import os.path as op
import sys
from pathlib import Path
from time import time
from fasteners import InterProcessLock
from datalad.tests.utils_pytest import (
assert_false,
assert_greater,
assert_in,
assert_not_in,
assert_raises,
assert_true,
eq_,
ok_,
ok_exists,
on_osx,
with_tempfile,
)
from ...cmd import (
CommandError,
StdOutErrCapture,
WitlessRunner,
)
from ...utils import ensure_unicode
from ..locking import (
lock_if_check_fails,
try_lock_informatively,
)
class Subproc:
# By implementing this closure as a class instead of as a nested function,
# it becomes possible to pickle it.
def __init__(self, tempfile):
self.tempfile = tempfile
def __call__(self, q):
with lock_if_check_fails(False, self.tempfile, blocking=False, _return_acquired=True)\
as (_, lock2, acquired):
# we used to check for .acquired here but it was removed from
# fasteners API: https://github.com/harlowja/fasteners/issues/71
q.put(acquired)
@with_tempfile
def test_lock_if_check_fails(tempfile=None):
# basic test, should never try to lock so filename is not important
with lock_if_check_fails(True, None) as (check, lock):
assert check is True
assert lock is None
assert check # still available outside
# and with a callable
with lock_if_check_fails(lambda: "valuable", None) as (check, lock):
eq_(check, "valuable")
assert lock is None
eq_(check, "valuable")
# basic test, should never try to lock so filename is not important
with lock_if_check_fails(False, tempfile) as (check, lock):
ok_(lock)
ok_exists(tempfile + '.lck')
assert not op.exists(tempfile + '.lck') # and it gets removed after
# the same with providing operation
# basic test, should never try to lock so filename is not important
with lock_if_check_fails(False, tempfile, operation='get') as (check, lock):
ok_(lock)
ok_exists(tempfile + '.get-lck')
assert not op.exists(tempfile + '.get-lck') # and it gets removed after
from multiprocessing import (
Process,
Queue,
)
q = Queue()
p = Process(target=Subproc(tempfile), args=(q,))
# now we need somehow to actually check the bloody lock functioning
with lock_if_check_fails((op.exists, (tempfile,)), tempfile, _return_acquired=True) as (check, lock, acquired):
eq_(check, False)
ok_(lock)
ok_(acquired)
# but now we will try to lock again, but we need to do it in another
# process
p.start()
assert q.get() is False
p.join()
with open(tempfile, 'w') as f:
pass
ok_exists(tempfile)
ok_exists(tempfile)
# and we redo -- it will acquire it
p = Process(target=Subproc(tempfile), args=(q,))
p.start()
ok_(q.get())
p.join()
@with_tempfile
def test_try_lock_informatively(tempfile=None):
lock = InterProcessLock(tempfile + '.lck')
lock_path = ensure_unicode(lock.path) # can be bytes, complicates string formattingetc
t0 = time()
with try_lock_informatively(lock, purpose="happy life") as acquired:
assert_true(lock.acquired)
assert_true(acquired)
assert_greater(2, time() - t0) # should not take any notable time, we cannot be blocking
"""
# InterProcessLock is not re-entrant so nesting should not be used, will result
# in exception on release
with try_lock_informatively(lock, timeouts=[dt, dt*2], proceed_unlocked=True) as acquired:
assert_true(lock.acquired) # due to outer cm
assert_true(acquired) # lock is reentrant apparently
"""
# Let's try in a completely different subprocess
runner = WitlessRunner(env=dict(os.environ, DATALAD_LOG_LEVEL='info', DATALAD_LOG_TARGET='stderr'))
script1 = Path(tempfile + "-script1.py")
script1_fmt = f"""
from fasteners import InterProcessLock
from time import time
from datalad.support.locking import try_lock_informatively
lock = InterProcessLock({lock_path!r})
with try_lock_informatively(lock, timeouts=[0.05, 0.15], proceed_unlocked={{proceed_unlocked}}) as acquired:
print("Lock acquired=%s" % acquired)
"""
script1.write_text(script1_fmt.format(proceed_unlocked=True))
t0 = time()
res = runner.run([sys.executable, str(script1)], protocol=StdOutErrCapture)
assert_in('Lock acquired=False', res['stdout'])
assert_in(f'Failed to acquire lock at {lock_path} in 0.05', res['stderr'])
assert_in(f'Failed to acquire lock at {lock_path} in 0.15', res['stderr'])
assert_in('proceed without locking', res['stderr'])
assert_greater(time() - t0, 0.19999) # should wait for at least 0.2
try:
import psutil
# PID does not correspond
assert_in('Check following process: PID=', res['stderr'])
assert_in(f'CWD={os.getcwd()} CMDLINE=', res['stderr'])
except ImportError:
pass # psutil was not installed, cannot get list of files
except AssertionError:
# we must have had the other one then
assert_in('failed to determine one', res['stderr'])
if not on_osx:
# so far we had only OSX reporting failing to get PIDs information
# but if it is something else -- re-raise original exception
raise
# in 2nd case, lets try without proceeding unlocked
script1.write_text(script1_fmt.format(proceed_unlocked=False))
t0 = time()
with assert_raises(CommandError) as cme:
runner.run([sys.executable, str(script1)], protocol=StdOutErrCapture)
assert_in(f"Failed to acquire lock at {lock_path} in 2 attempts.", str(cme.value))
assert_in(f"RuntimeError", str(cme.value))
assert_false(cme.value.stdout) # nothing there since print should not happen
assert_in(f'Failed to acquire lock at {lock_path} in 0.05', cme.value.stderr)
assert_in(f'Failed to acquire lock at {lock_path} in 0.15', cme.value.stderr)
assert_greater(time() - t0, 0.19999) # should wait for at least 0.2
# now that we left context, should work out just fine
res = runner.run([sys.executable, str(script1)], protocol=StdOutErrCapture)
assert_in('Lock acquired=True', res['stdout'])
assert_not_in(f'Failed to acquire lock', res['stderr'])
assert_not_in('PID', res['stderr'])
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_network.py 0000644 0001751 0001751 00000062664 15137634221 022150 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import logging
import os
import tempfile
from os.path import isabs
from os.path import join as opj
import pytest
import datalad.support.network
from datalad.distribution.dataset import Dataset
from datalad.support.network import (
RI,
SSHRI,
URL,
DataLadRI,
GitTransportRI,
PathRI,
_split_colon,
dlurljoin,
get_local_file_url,
get_response_disposition_filename,
get_tld,
get_url_straight_filename,
is_datalad_compat_ri,
is_ssh,
is_url,
iso8601_to_epoch,
local_path2url_path,
local_path_representation,
local_url_path_representation,
parse_url_opts,
quote_path,
same_website,
url_path2local_path,
urlquote,
)
from datalad.tests.utils_pytest import (
OBSCURE_FILENAME,
SkipTest,
assert_in,
assert_raises,
assert_status,
eq_,
get_most_obscure_supported_name,
known_failure_githubci_win,
neq_,
nok_,
ok_,
skip_if,
swallow_logs,
with_tempfile,
)
from datalad.utils import (
Path,
PurePosixPath,
on_windows,
)
def test_same_website():
ok_(same_website("http://a.b", "http://a.b/2014/01/xxx/"))
ok_(same_website("http://a.b/page/2/", "http://a.b/2014/01/xxx/"))
ok_(same_website("https://a.b/page/2/", "http://a.b/2014/01/xxx/"))
ok_(same_website("http://a.b/page/2/", "https://a.b/2014/01/xxx/"))
def test_get_tld():
eq_(get_tld('http://example.com'), 'example.com')
eq_(get_tld('http://example.com/1'), 'example.com')
eq_(get_tld('http://example.com/1/2'), 'example.com')
eq_(get_tld('example.com/1/2'), 'example.com')
eq_(get_tld('s3://example.com/1/2'), 'example.com')
assert_raises(ValueError, get_tld, "")
assert_raises(ValueError, get_tld, "s3://")
assert_raises(ValueError, get_tld, "http://")
def test_dlurljoin():
eq_(dlurljoin('http://a.b/', 'f'), 'http://a.b/f')
eq_(dlurljoin('http://a.b/page', 'f'), 'http://a.b/f')
eq_(dlurljoin('http://a.b/dir/', 'f'), 'http://a.b/dir/f')
eq_(dlurljoin('http://a.b/dir/', 'http://url'), 'http://url')
eq_(dlurljoin('http://a.b/dir/', '/'), 'http://a.b/')
eq_(dlurljoin('http://a.b/dir/', '/x/y'), 'http://a.b/x/y')
@pytest.mark.parametrize("suf", [
'',
'#',
'#tag',
'#tag/obscure',
'?param=1',
'?param=1&another=/',
])
def test_get_url_straight_filename(suf):
eq_(get_url_straight_filename('http://a.b/' + suf), '')
eq_(get_url_straight_filename('http://a.b/p1' + suf), 'p1')
eq_(get_url_straight_filename('http://a.b/p1/' + suf), '')
eq_(get_url_straight_filename('http://a.b/p1/' + suf, allowdir=True), 'p1')
eq_(get_url_straight_filename('http://a.b/p1/p2' + suf), 'p2')
eq_(get_url_straight_filename('http://a.b/p1/p2/' + suf), '')
eq_(get_url_straight_filename('http://a.b/p1/p2/' + suf, allowdir=True), 'p2')
eq_(get_url_straight_filename('http://a.b/p1/p2/' + suf, allowdir=True, strip=('p2', 'xxx')), 'p1')
eq_(get_url_straight_filename('http://a.b/p1/p2/' + suf, strip=('p2', 'xxx')), '')
from ..network import rfc2822_to_epoch
def test_rfc2822_to_epoch():
eq_(rfc2822_to_epoch("Thu, 16 Oct 2014 01:16:17 EDT"), 1413436577)
def test_get_response_disposition_filename():
eq_(get_response_disposition_filename('attachment;filename="Part1-Subjects1-99.tar"'), "Part1-Subjects1-99.tar")
eq_(get_response_disposition_filename('attachment'), None)
def test_parse_url_opts():
url = 'http://map.org/api/download/?id=157'
output = parse_url_opts(url)
eq_(output, ('http://map.org/api/download/', {'id': '157'}))
url = 's3://bucket/save/?key=891'
output = parse_url_opts(url)
eq_(output, ('s3://bucket/save/', {'key': '891'}))
url = 'http://map.org/api/download/?id=98&code=13'
output = parse_url_opts(url)
eq_(output, ('http://map.org/api/download/', {'id': '98', 'code': '13'}))
def test_split_colon():
eq_(_split_colon('a:b'), ['a', 'b'])
eq_(_split_colon('a:b:c'), ['a', 'b:c'])
eq_(_split_colon('a:b:c', 2), ['a', 'b', 'c'])
eq_(_split_colon('ab'), ['ab'])
eq_(_split_colon(r'a\:b'), [r'a\:b'])
def test_url_eq():
assert URL() == URL()
# doesn't make sense to ask what kind of a url it is an empty URL
assert URL() != URL(hostname='x')
# Different types aren't equal even if have the same fields values
assert URL(path='x') != PathRI(path='x')
assert URL(hostname='x') != SSHRI(hostname='x')
assert str(URL(hostname='x')) != str(SSHRI(hostname='x'))
def _check_ri(ri, cls, exact_str=True, localpath=None, **fields):
"""just a helper to carry out few checks on urls"""
with swallow_logs(new_level=logging.DEBUG) as cml:
ri_ = cls(**fields)
murl = RI(ri)
assert murl.__class__ == cls # not just a subclass
assert murl == ri_
if isinstance(ri, str):
assert str(RI(ri)) == ri
assert eval(repr(ri_)) == ri # repr leads back to identical ri_
assert ri == ri_ # just in case ;) above should fail first if smth is wrong
if not exact_str:
assert_in('Parsed version of', cml.out)
if exact_str:
assert str(ri) == str(ri_)
else:
assert str(ri) != str(ri_)
# and that we have access to all those fields
nok_(set(fields).difference(set(cls._FIELDS)))
for f, v in fields.items():
assert getattr(ri_, f) == v
if localpath:
if cls == URL:
local_representation = local_url_path_representation(localpath)
else:
local_representation = local_path_representation(localpath)
assert ri_.localpath == local_representation
old_localpath = ri_.localpath # for a test below
else:
# if not given -- must be a remote url, should raise exception on
# non-Windows systems. But not on Windows systems because we allow UNCs
# to be encoded in URLs
if not on_windows:
with assert_raises(ValueError):
ri_.localpath
# This one does not have a path. TODO: either proxy path from its .RI or adjust
# hierarchy of classes to make it more explicit
if cls == GitTransportRI:
return
# do changes in the path persist?
old_str = str(ri_)
ri_.path = newpath = opj(ri_.path, 'sub')
assert ri_.path == newpath
assert str(ri_) != old_str
if localpath:
assert ri_.localpath == local_path_representation(opj(old_localpath, 'sub'))
def test_url_base():
# Basic checks
assert_raises(ValueError, URL, "http://example.com", hostname='example.com')
url = URL("http://example.com")
eq_(url.hostname, 'example.com')
eq_(url.scheme, 'http')
eq_(url.port, '') # not specified -- empty strings
eq_(url.username, '') # not specified -- empty strings
eq_(repr(url), "URL(hostname='example.com', netloc='example.com', scheme='http')")
eq_(url, "http://example.com") # automagic coercion in __eq__
neq_(URL(), URL(hostname='x'))
smth = URL('smth')
eq_(smth.hostname, '')
ok_(bool(smth))
nok_(bool(URL()))
assert_raises(ValueError, url._set_from_fields, unknown='1')
with swallow_logs(new_level=logging.WARNING) as cml:
# we don't "care" about params ATM so there is a warning if there are any
purl = URL("http://example.com/;param")
eq_(str(purl), 'http://example.com/;param') # but we do maintain original string
assert_in('ParseResults contains params', cml.out)
eq_(purl.as_str(), 'http://example.com/')
@with_tempfile
def test_pathri_guessing(filename=None):
# Complaining about ;param only at DEBUG level
# see https://github.com/datalad/datalad/issues/6872
with swallow_logs(new_level=logging.DEBUG) as cml:
# we don't "care" about params ATM so there is a warning if there are any
ri = RI(f"{filename};param")
assert isinstance(ri, PathRI)
if not on_windows:
# Does not happen on Windows since paths with \ instead of / do not
# look like possible URLs
assert_in('ParseResults contains params', cml.out)
@skip_if(not on_windows)
def test_pathri_windows_anchor():
assert RI('file:///c:/Windows').localpath == 'C:\\Windows'
@known_failure_githubci_win
def test_url_samples():
_check_ri("http://example.com", URL, scheme='http', hostname="example.com", netloc='example.com')
# "complete" one for classical http
_check_ri("http://user:pw@example.com:8080/p/sp?p1=v1&p2=v2#frag", URL,
scheme='http', netloc='user:pw@example.com:8080',
hostname="example.com", port=8080, username='user', password='pw',
path='/p/sp', query='p1=v1&p2=v2', fragment='frag')
# sample one for ssh with specifying the scheme
# XXX? might be useful? https://github.com/FriendCode/giturlparse.py
_check_ri("ssh://host/path/sp1", URL, scheme='ssh', hostname='host',
netloc='host', path='/path/sp1')
_check_ri("user@host:path/sp1", SSHRI,
hostname='host', path='path/sp1', username='user')
_check_ri("host:path/sp1", SSHRI, hostname='host', path='path/sp1')
_check_ri("host:path", SSHRI, hostname='host', path='path')
_check_ri("host:/path", SSHRI, hostname='host', path='/path')
_check_ri("user@host", SSHRI, hostname='host', username='user')
# TODO!!! should this be a legit URL like this?
# _check_ri("host", SSHRI, hostname='host'))
eq_(repr(RI("host:path")), "SSHRI(hostname='host', path='path')")
# And now perspective 'datalad', implicit=True urls pointing to the canonical center location
_check_ri("///", DataLadRI)
_check_ri("///p/s1", DataLadRI, path='p/s1')
# could be considered by someone as "URI reference" relative to scheme
_check_ri("//a/", DataLadRI, remote='a')
_check_ri("//a/data", DataLadRI, path='data', remote='a')
# here we will do custom magic allowing only schemes with + in them, such as dl+archive
# or not so custom as
_check_ri("hg+https://host/user/proj", URL, scheme="hg+https",
netloc='host', hostname='host', path='/user/proj')
# "old" style
_check_ri("dl+archive:KEY/path/sp1#size=123", URL,
scheme='dl+archive', path='KEY/path/sp1', fragment='size=123')
# "new" style
_check_ri("dl+archive:KEY#path=path/sp1&size=123", URL,
scheme='dl+archive', path='KEY', fragment='path=path/sp1&size=123')
# actually above one is probably wrong since we need to encode the path
_check_ri("dl+archive:KEY#path=path%2Fbsp1&size=123", URL,
scheme='dl+archive', path='KEY', fragment='path=path%2Fbsp1&size=123')
#https://en.wikipedia.org/wiki/File_URI_scheme
_check_ri("file://host", URL, scheme='file', netloc='host', hostname='host')
_check_ri("file://host/path/sp1", URL, scheme='file', netloc='host',
hostname='host', path='/path/sp1')
# stock libraries of Python aren't quite ready for ipv6
ipv6address = '2001:db8:85a3::8a2e:370:7334'
_check_ri("file://%s/path/sp1" % ipv6address, URL,
scheme='file', netloc=ipv6address, hostname=ipv6address,
path='/path/sp1')
for lh in ('localhost', '::1', '', '127.3.4.155'):
if on_windows:
url = RI(f"file://{lh}/path/sp1")
assert url.localpath == f'\\\\{lh}\\path\\sp1' if lh else '\\path\\sp1'
else:
_check_ri("file://%s/path/sp1" % lh, URL, localpath='/path/sp1',
scheme='file', netloc=lh, hostname=lh, path='/path/sp1')
_check_ri('http://[1fff:0:a88:85a3::ac1f]:8001/index.html', URL,
scheme='http', netloc='[1fff:0:a88:85a3::ac1f]:8001',
hostname='1fff:0:a88:85a3::ac1f', port=8001, path='/index.html')
_check_ri("file:///path/sp1", URL, localpath='/path/sp1', scheme='file', path='/path/sp1')
# we don't do any magical comprehension for home paths/drives for windows
# of file:// urls, thus leaving /~ and /c: for now:
_check_ri("file:///~/path/sp1", URL, localpath='/~/path/sp1', scheme='file', path='/~/path/sp1')
_check_ri("file:///%7E/path/sp1", URL, localpath='/~/path/sp1', scheme='file', path='/~/path/sp1', exact_str=False)
# not sure but let's check
if on_windows:
_check_ri("file:///C:/path/sp1", URL, localpath='C:/path/sp1', scheme='file', path='/C:/path/sp1', exact_str=False)
_check_ri("file:/C:/path/sp1", URL, localpath='C:/path/sp1', scheme='file', path='/C:/path/sp1', exact_str=False)
# git-annex style drive-letter encoding
_check_ri("file://C:/path/sp1", URL, netloc="C:", hostname="c", localpath='C:/path/sp1', scheme='file', path='/path/sp1', exact_str=False)
else:
_check_ri("file:///C:/path/sp1", URL, localpath='/C:/path/sp1', scheme='file', path='/C:/path/sp1', exact_str=False)
_check_ri("file:/C:/path/sp1", URL, localpath='/C:/path/sp1', scheme='file', path='/C:/path/sp1', exact_str=False)
# and now implicit paths or actually they are also "URI references"
_check_ri("f", PathRI, localpath='f', path='f')
_check_ri("f/s1", PathRI, localpath='f/s1', path='f/s1')
_check_ri(PurePosixPath("f"), PathRI, localpath='f', path='f')
_check_ri(PurePosixPath("f/s1"), PathRI, localpath='f/s1', path='f/s1')
# colons are problematic and might cause confusion into SSHRI
_check_ri("f/s:1", PathRI, localpath='f/s:1', path='f/s:1')
_check_ri("f/s:", PathRI, localpath='f/s:', path='f/s:')
_check_ri("/f", PathRI, localpath='/f', path='/f')
_check_ri("/f/s1", PathRI, localpath='/f/s1', path='/f/s1')
# some github ones, just to make sure
_check_ri("git://host/user/proj", URL, scheme="git", netloc="host",
hostname="host", path="/user/proj")
_check_ri("git@host:user/proj", SSHRI, hostname="host", path="user/proj", username='git')
_check_ri('weird:/', SSHRI, hostname='weird', path='/')
# since schema is not allowing some symbols so we need to add additional check
_check_ri('weird_url:/', SSHRI, hostname='weird_url', path='/')
_check_ri('example.com:/', SSHRI, hostname='example.com', path='/')
_check_ri('example.com:path/sp1', SSHRI, hostname='example.com', path='path/sp1')
_check_ri('example.com/path/sp1\\:fname', PathRI, localpath='example.com/path/sp1\\:fname',
path='example.com/path/sp1\\:fname')
# ssh is as stupid as us, so we will stay "Consistently" dumb
"""
$> ssh example.com/path/sp1:fname
ssh: Could not resolve hostname example.com/path/sp1:fname: Name or service not known
edit 20190516 yoh: but this looks like a perfectly valid path.
SSH knows that it is not a path but its SSHRI so it can stay dumb.
We are trying to be smart and choose between RIs (even when we know that
it is e.g. a file).
"""
_check_ri('e.com/p/sp:f', PathRI, localpath='e.com/p/sp:f', path='e.com/p/sp:f')
_check_ri('user@someho.st/mydir', PathRI, localpath='user@someho.st/mydir', path='user@someho.st/mydir')
# SSHRIs have .port, but it is empty
eq_(SSHRI(hostname='example.com').port, '')
# check that we are getting a warning logged when url can't be reconstructed
# precisely
# actually failed to come up with one -- becomes late here
#_check_ri("http://host///..//p", scheme='http', path='/..//p')
# actually this one is good enough to trigger a warning and I still don't know
# what it should exactly be!?
with swallow_logs(new_level=logging.DEBUG) as cml:
weird_str = 'weird://'
weird_url = RI(weird_str)
repr(weird_url)
cml.assert_logged(
'Parsed version of SSHRI .weird:/. '
'differs from original .weird://.'
)
# but we store original str
eq_(str(weird_url), weird_str)
neq_(weird_url.as_str(), weird_str)
raise SkipTest("TODO: file://::1/some does complain about parsed version dropping ::1")
def test_git_transport_ri():
_check_ri("gcrypt::http://somewhere", GitTransportRI, RI='http://somewhere', transport='gcrypt')
# man git-push says
# ::
# where may be a path, a server and path, or an arbitrary URL-like string...
# so full path to my.com/... should be ok?
_check_ri("http::/my.com/some/path", GitTransportRI, RI='/my.com/some/path', transport='http')
# some ssh server. And we allow for some additional chars in transport.
# Git doesn't define since it does not care! we will then be flexible too
_check_ri("trans-port::server:path", GitTransportRI, RI='server:path', transport='trans-port')
@pytest.mark.parametrize("cls,clskwargs,target_url", [
(SSHRI, {}, r'example.com:/ "' + r"';a&b&cd `| "),
(URL, {'scheme': "http"}, 'http://example.com/%20%22%27%3Ba%26b%26cd%20%60%7C%20'),
(PathRI, {}, r'/ "' + r"';a&b&cd `| "), # nothing is done to file:implicit
])
def test_url_quote_path(cls, clskwargs, target_url):
path = '/ "\';a&b&cd `| '
if not (cls is PathRI):
clskwargs['hostname'] = hostname = 'example.com'
url = cls(path=path, **clskwargs)
eq_(url.path, path)
if 'hostname' in clskwargs:
eq_(url.hostname, hostname)
# all nasty symbols should be quoted
url_str = str(url)
eq_(url_str, target_url)
# no side-effects:
eq_(url.path, path)
if 'hostname' in clskwargs:
eq_(url.hostname, hostname)
# and figured out and unquoted
url_ = RI(url_str)
ok_(isinstance(url_, cls))
eq_(url_.path, path)
if 'hostname' in clskwargs:
eq_(url.hostname, hostname)
def test_url_compose_archive_one():
url = URL(scheme='dl+archive', path='KEY',
fragment=dict((('path', 'f/p/ s+'), ('size', 30))))
# funny - space is encoded as + but + is %2B
eq_(str(url), 'dl+archive:KEY#path=f/p/+s%2B&size=30')
eq_(url.fragment_dict, {'path': 'f/p/ s+', 'size': '30'})
def test_url_fragments_and_query():
url = URL(hostname="host", query=dict((('a', 'x/b'), ('b', 'y'))))
eq_(str(url), '//host?a=x%2Fb&b=y')
eq_(url.query, 'a=x%2Fb&b=y')
eq_(url.query_dict, {'a': 'x/b', 'b': 'y'})
url = URL(hostname="host", fragment=dict((('b', 'x/b'), ('a', 'y'))))
eq_(str(url), '//host#b=x/b&a=y')
eq_(url.fragment, 'b=x/b&a=y')
eq_(url.fragment_dict, {'a': 'y', 'b': 'x/b'})
fname = get_most_obscure_supported_name()
url = URL(hostname="host", fragment={'a': fname})
eq_(url.fragment_dict, {'a': fname})
def test_url_dicts():
eq_(URL("http://host").query_dict, {})
def test_get_url_path_on_fileurls():
assert URL('file:///a').path == '/a'
assert URL('file:///a/b').path == '/a/b'
assert URL('file:///a/b').localpath == local_path_representation('/a/b')
assert URL('file:///a/b#id').path == '/a/b'
assert URL('file:///a/b?whatever').path == '/a/b'
def test_is_url():
ok_(is_url('file://localhost/some'))
ok_(is_url('http://localhost'))
ok_(is_url('ssh://me@localhost'))
# in current understanding it is indeed a url but an 'ssh', implicit=True, not just
# a useless scheme=weird with a hope to point to a netloc
with swallow_logs():
ok_(is_url('weird://'))
nok_(is_url('relative'))
nok_(is_url('/absolute'))
ok_(is_url('like@sshlogin')) # actually we do allow ssh:implicit urls ATM
nok_(is_url(''))
nok_(is_url(' '))
nok_(is_url(123)) # stuff of other types wouldn't be considered a URL
# we can pass RI instance directly
ok_(is_url(RI('file://localhost/some')))
nok_(is_url(RI('relative')))
# TODO: RF with test_is_url to avoid duplication
def test_is_datalad_compat_ri():
ok_(is_datalad_compat_ri('ssh://user:passw@host/path'))
ok_(is_datalad_compat_ri('http://example.com'))
ok_(is_datalad_compat_ri('file://localhost/some'))
ok_(is_datalad_compat_ri('///localhost/some'))
nok_(is_datalad_compat_ri('relative'))
nok_(is_datalad_compat_ri('.///localhost/some'))
nok_(is_datalad_compat_ri(123))
def test_get_local_file_url():
compat_annex = 'git-annex'
compat_git = 'git'
for path, url, compatibility in (
# relpaths are special-cased below
('test.txt', 'test.txt', compat_annex),
(OBSCURE_FILENAME, urlquote(OBSCURE_FILENAME), compat_annex),
) + ((
('C:\\Windows\\notepad.exe', 'file://C:/Windows/notepad.exe', compat_annex),
('C:\\Windows\\notepad.exe', 'file:///C:/Windows/notepad.exe', compat_git),
) if on_windows else (
('/a', 'file:///a', compat_annex),
('/a/b/c', 'file:///a/b/c', compat_annex),
('/a~', 'file:///a~', compat_annex),
# there are no files with trailing slashes in the name
#('/a b/', 'file:///a%20b/'),
('/a b/name', 'file:///a%20b/name', compat_annex),
)):
# Yarik found no better way to trigger. .decode() isn't enough
print("D: %s" % path)
if isabs(path):
assert get_local_file_url(path, compatibility=compatibility) == url
abs_path = path
else:
assert get_local_file_url(path, allow_relative_path=True, compatibility=compatibility) \
== '/'.join((get_local_file_url(os.getcwd(), compatibility=compatibility), url))
abs_path = opj(os.getcwd(), path)
if compatibility == compat_git:
assert get_local_file_url(abs_path, compatibility=compatibility) == Path(abs_path).as_uri()
@with_tempfile(mkdir=True)
def test_get_local_file_url_compatibility(path=None):
# smoke test for file:// URL compatibility with other datalad/git/annex
# pieces
path = Path(path)
ds1 = Dataset(path / 'ds1').create()
ds2 = Dataset(path / 'ds2').create()
testfile = path / 'testfile.txt'
testfile.write_text('some')
# compat with annex addurl
ds1.repo.add_url_to_file(
'test.txt',
get_local_file_url(str(testfile), compatibility='git-annex'))
# compat with git clone/submodule
assert_status(
'ok',
ds1.clone(get_local_file_url(ds2.path, compatibility='git'),
result_xfm=None, return_type='generator'))
def test_is_ssh():
ssh_locators = ["ssh://host",
"ssh://host/some/where",
"user@host:path/sp1",
"user@host:/absolute/path/sp1",
"host:path/sp1",
"host:/absolute/path/sp1",
"user@host"]
for ri in ssh_locators:
ok_(is_ssh(ri), "not considered ssh (string): %s" % ri)
ok_(is_ssh(RI(ri)), "not considered ssh (RI): %s" % ri)
non_ssh_locators = ["file://path/to",
"/abs/path",
"../rel/path",
"http://example.com",
"git://host/user/proj",
"s3://bucket/save/?key=891"]
for ri in non_ssh_locators:
ok_(not is_ssh(ri), "considered ssh (string): %s" % ri)
ok_(not is_ssh(RI(ri)), "considered ssh (RI): %s" % ri)
def test_iso8601_to_epoch():
epoch = 1467901515
eq_(iso8601_to_epoch('2016-07-07T14:25:15+00:00'), epoch)
eq_(iso8601_to_epoch('2016-07-07T14:25:15+11:00'),
epoch - 11 * 60 * 60)
eq_(iso8601_to_epoch('2016-07-07T14:25:15Z'), epoch)
eq_(iso8601_to_epoch('2016-07-07T14:25:15'), epoch)
eq_(iso8601_to_epoch('2016-07-07T14:25:14'), epoch-1)
def test_mapping_identity():
from datalad.tests.utils_pytest import OBSCURE_FILENAME
absolute_obscure_path = str(Path('/').absolute() / OBSCURE_FILENAME)
temp_dir = tempfile.gettempdir()
print(f"temp_dir: {temp_dir}")
for name in (temp_dir, opj(temp_dir, "x.txt"), absolute_obscure_path):
# On some platforms, e.g. MacOS, `temp_dir` might contain trailing
# slashes. Since the conversion and its inverse normalize paths, we
# compare the result to the normalized path
normalized_name = str(Path(name))
assert url_path2local_path(local_path2url_path(name)) == normalized_name
prefix = "/C:" if on_windows else ""
for name in map(quote_path, (prefix + "/window", prefix + "/d", prefix + "/" + OBSCURE_FILENAME)):
assert local_path2url_path(url_path2local_path(name)) == name
def test_auto_resolve_path():
relative_path = str(Path("a/b"))
with pytest.raises(ValueError):
local_path2url_path(relative_path)
local_path2url_path("", allow_relative_path=True)
@skip_if(not on_windows)
def test_hostname_detection():
with pytest.raises(ValueError):
local_path2url_path("\\\\server\\share\\path")
def test_url_path2local_path_excceptions():
with pytest.raises(ValueError):
url_path2local_path('')
with pytest.raises(ValueError):
url_path2local_path(None)
with pytest.raises(ValueError):
url_path2local_path('a/b')
with pytest.raises(ValueError):
url_path2local_path(PurePosixPath('a/b'))
with pytest.raises(ValueError):
url_path2local_path(PurePosixPath('//a/b'))
def test_quote_path(monkeypatch):
with monkeypatch.context() as ctx:
ctx.setattr(datalad.support.network, 'on_windows', True)
assert quote_path("/c:/win:xxx") == "/c:/win%3Axxx"
assert quote_path("/C:/win:xxx") == "/C:/win%3Axxx"
ctx.setattr(datalad.support.network, 'on_windows', False)
assert quote_path("/c:/win:xxx") == "/c%3A/win%3Axxx"
assert quote_path("/C:/win:xxx") == "/C%3A/win%3Axxx"
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_parallel.py 0000644 0001751 0001751 00000027173 15137634221 022247 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import logging
import sys
from functools import partial
from time import (
sleep,
time,
)
import pytest
# logging effects threading and causes some 'weak' tests to fail,
# so we will just skip those (well, if happens again -- disable altogether)
from datalad import lgr
from datalad.support import path as op
from datalad.support.exceptions import IncompleteResultsError
# absolute import only to be able to run test without `nose` so to see progress bar
from datalad.support.parallel import (
ProducerConsumer,
ProducerConsumerProgressLog,
no_parentds_in_futures,
)
from datalad.tests.utils_pytest import (
assert_equal,
assert_greater,
assert_greater_equal,
assert_raises,
assert_repo_status,
rmtree,
slow,
with_tempfile,
)
from datalad.utils import on_windows
info_log_level = lgr.getEffectiveLevel() >= logging.INFO
@pytest.fixture(params=["auto", None, 1, 10])
def jobs(request):
"""Fixture to automagically sweep over a sample of "jobs" values
"""
return request.param
@pytest.mark.parametrize("PC", [ProducerConsumer, ProducerConsumerProgressLog])
def test_ProducerConsumer_PC(PC, jobs):
def slowprod(n, secs=0.001):
for i in range(n):
yield i
sleep(secs)
def slowcons(i):
# so takes longer to consume than to produce and progress bar will appear
# after slowprod is done producing
sleep(0.002)
yield from fastcons(i)
def fastcons(i):
# we should still work correctly if consumer is fast!
yield {
"i": i, "status": "ok" if i % 2 else "error"
}
for cons in fastcons, slowcons:
# sorted since order of completion is not guaranteed
assert_equal(
sorted(PC(
slowprod(10),
cons,
jobs=jobs),
key=lambda r: r['i']),
[{"i": i, "status": "ok" if i % 2 else "error"} for i in range(10)])
def test_producing_consumer(jobs):
def producer():
yield from range(3)
def consumer(i):
yield i
if isinstance(i, int):
pc.add_to_producer_queue(str(i**2))
# we auto-detect generator function producer
pc = ProducerConsumer(producer, consumer, jobs=jobs)
assert_equal(set(pc), {0, 1, 2, "0", "1", "4"})
def test_producer_future_key(jobs):
if sys.version_info >= (3, 13) and jobs == 10:
pytest.xfail("Known issue with Python 3.13 and jobs=10")
if on_windows and jobs == 10:
pytest.xfail("Known issue on Windows with jobs=10")
def producer():
for i in range(3):
yield i, {"k": i**2} # dict is mutable, will need a key
def consumer(args):
i, d = args
yield i
pc = ProducerConsumer(producer(), consumer, producer_future_key=lambda r: r[0], jobs=jobs)
assert_equal(list(pc), [0, 1, 2])
@slow # 12sec on Yarik's laptop
@with_tempfile(mkdir=True)
def test_creatsubdatasets(topds_path=None, n=2):
from datalad.api import create
from datalad.distribution.dataset import Dataset
ds = Dataset(topds_path).create()
paths = [op.join(topds_path, "subds%d" % i) for i in range(n)]
paths.extend(op.join(topds_path, "subds%d" % i, "subsub%d" %k) for i in range(n) for k in range(2))
# To allow for parallel execution without hitting the problem of
# a lock in the super dataset, we create all subdatasets, and then
# save them all within their superdataset
create_ = partial(create, # cfg_proc="yoda",
result_xfm=None, return_type='generator')
# if we flip the paths so to go from the end, create without --force should fail
# and we should get the exception (the first one encountered!)
# Note: reraise_immediately is of "concern" only for producer. since we typically
# rely on outside code to do the killing!
assert_raises(IncompleteResultsError, list, ProducerConsumer(paths[::-1], create_, jobs=5))
# we are in a dirty state, let's just remove all those for a clean run
rmtree(topds_path)
# and this one followed by save should be good IFF we provide our dependency checker
ds = Dataset(topds_path).create()
list(ProducerConsumer(paths, create_, safe_to_consume=no_parentds_in_futures, jobs=5))
ds.save(paths)
assert_repo_status(ds.repo)
def test_gracefull_death():
def assert_provides_and_raises(pc, exception, target=None):
"""Helper to get all results before exception is raised"""
results = []
with assert_raises(exception):
for r in pc:
results.append(r)
# results should be sorted since we do not guarantee order
results = sorted(results)
if target is not None:
assert_equal(results, target)
return results
def interrupted_producer():
yield 1
raise ValueError()
def consumer(i):
sleep(0.001)
yield i
assert_provides_and_raises(
ProducerConsumer(interrupted_producer(), consumer, jobs=3), ValueError, [1])
def faulty_consumer(i):
sleep(0.001)
if i == 1:
raise ValueError()
return i
# so we do not get failed, but other parallel ones finish their job
results = assert_provides_and_raises(
ProducerConsumer(range(1000), faulty_consumer, jobs=5), ValueError)
# and analysis of futures to raise an exception can take some time etc, so
# we could get more, but for sure we should not get all 999 and not even a 100.
# But some times we get some excursions above 100, so limiting to 300
if info_log_level:
assert_greater(300, len(results))
assert_equal(results[:4], [0, 2, 3, 4])
def producer():
for i in range(10):
sleep(0.0003)
yield i
raise ValueError()
# by default we do not stop upon producer failing
assert_provides_and_raises(
ProducerConsumer(producer(), consumer, jobs=2), ValueError, list(range(10)))
# if producer produces more than we can as quickly consume but then fails
# ATM we do not proceed to consume other items, but fail when we finish
# consuming until the time point when producer has failed
# by default we do not stop upon producer failing
results = assert_provides_and_raises(
ProducerConsumer(producer(), consumer, reraise_immediately=True, jobs=2),
ValueError)
# we will get some results, seems around 4 and they should be "sequential"
assert_equal(results, list(range(len(results))))
try:
assert_greater_equal(len(results), 2)
except AssertionError:
# Possible TODO: if tests below would start failing too, move xfail to the level
# of the entire test
pytest.xfail(f"Rarely but happens. Got only {len(results)} instead of at least 2")
# This test relies too much on threads scheduling to not hog up on handling
# consumers, but if it happens so - they might actually consume all results
# before producer decides to finally raise an exception. As such it remains
# flaky and thus not ran, but could be useful to test locally while
# changing that logic.
#
# if info_log_level and not (on_windows or on_osx):
# # consumers should not be able to consume all produced items.
# # production of 10 should take 3 unites, while consumers 10/2 (jobs)
# # 5 units, so some should not have a chance.
# assert_greater_equal(8, len(results))
# Simulate situation close to what we have when outside code consumes
# some yielded results and then "looses interest" (on_failure="error").
# In this case we should still exit gracefully (no GeneratorExit warnings),
# not over-produce, and also do not kill already running consumers
consumed = []
def inner():
def consumer(i):
sleep(0.01)
consumed.append(i)
return i
pc = iter(ProducerConsumer(range(1000), consumer, jobs=2))
yield next(pc)
yield next(pc)
# typically it should be [0, 1] but it does happen some times that
# one other worker gets ahead and we get [0, 2]. As it is not per se the
# purpose of this test to ensure absence of such race, we just allow for any
# two from first 3 possible.
assert len(set(inner()).intersection({0, 1, 2})) == 2
consumed = sorted(consumed)
assert_equal(consumed, list(range(len(consumed))))
assert_greater_equal(len(consumed), 4) # we should wait for that 2nd batch to finish
if info_log_level:
assert_greater_equal(20, len(consumed))
# `test_stalling` is a speculative test that is intended to detect stalled
# subprocess execution by assuming an upper limit for the execution time of the
# subprocess. Due to the nature of non-realtime process scheduling, this
# assumption is necessarily incorrect and might be validated in a perfectly
# working system. In other words, the test has the potential to create false
# positives.
# By raising the assumed maximum execution time, we try to reduce the number of
# false positives.
#
# The test exists because an earlier version of `WitlessRunner` was based on
# event loops and there was at least one stalling condition that manifested
# itself in python 3.7 (see:
# https://github.com/datalad/datalad/pull/5022#issuecomment-708716290). As of
# datalad version 0.16, event loops are no longer used in `WitlessRunner` and
# this test is a shot in the dark.
def test_stalling(kill=False):
import concurrent.futures
from datalad.runner.coreprotocols import StdOutErrCapture
from datalad.runner.runner import WitlessRunner
def worker():
return WitlessRunner().run(["echo", "1"], StdOutErrCapture)
t0 = time()
result1 = worker()
dt1 = time() - t0
t0 = time()
with concurrent.futures.ThreadPoolExecutor(1) as executor:
future = executor.submit(worker)
dt2_limit = max((5, dt1 * 100))
while not future.done():
sleep(dt1/3)
if time() - t0 > dt2_limit:
# does not even shutdown
# executor.shutdown(wait=False)
if kill:
# raising an exception isn't enough!
print("exceeded")
import os
import signal
os.kill(os.getpid(), signal.SIGTERM)
raise AssertionError(f"Future has not finished in {dt2_limit}s")
result2 = future.result()
assert result1 == result2
@with_tempfile(mkdir=True)
def test_parallel_flyweights(topd=None):
from datalad.support.gitrepo import GitRepo
# ProducerConsumer relies on unique args to consumer so we will provide 2nd different arg
def create_GitRepo(args):
return GitRepo(args[0])
# let's really hunt down race condition
for batch in range(10):
repopath = op.join(topd, str(batch))
# should succeed and be the same thing
# An example of errored run: https://github.com/datalad/datalad/issues/6598
repos = list(
ProducerConsumer(
((repopath, i) for i in range(10)),
create_GitRepo,
jobs=10
)
)
assert op.exists(repopath)
instances = set(map(id, repos))
assert len(instances) == 1
if __name__ == '__main__':
test_ProducerConsumer()
# test_creatsubdatasets()
# test_stalling(kill=True)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_path.py 0000644 0001751 0001751 00000011416 15137634221 021400 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import os
from pathlib import PurePosixPath
import pytest
from ...tests.utils_pytest import (
SkipTest,
assert_raises,
eq_,
with_tempfile,
)
from ...utils import (
chpwd,
on_windows,
rmtree,
)
from ..path import (
abspath,
curdir,
get_filtered_paths_,
get_parent_paths,
robust_abspath,
split_ext,
)
@with_tempfile(mkdir=True)
def test_robust_abspath(tdir=None):
with chpwd(tdir):
eq_(robust_abspath(curdir), tdir)
try:
if os.environ.get('DATALAD_ASSERT_NO_OPEN_FILES'):
raise Exception("cannot test under such pressure")
rmtree(tdir)
except Exception as exc:
# probably windows or above exception
raise SkipTest(
"Cannot test in current environment") from exc
assert_raises(OSError, abspath, curdir)
eq_(robust_abspath(curdir), tdir)
def test_split_ext():
eq_(split_ext("file"), ("file", ""))
eq_(split_ext("file.py"), ("file", ".py"))
eq_(split_ext("file.tar.gz"), ("file", ".tar.gz"))
eq_(split_ext("file.toolong.gz"), ("file.toolong", ".gz"))
eq_(split_ext("file.a.b.c.d"), ("file", ".a.b.c.d"))
eq_(split_ext("file.a.b.cccc.d"), ("file", ".a.b.cccc.d"))
eq_(split_ext("file.a.b.ccccc.d"), ("file.a.b.ccccc", ".d"))
eq_(split_ext("file.a.b..c"), ("file", ".a.b..c"))
@pytest.mark.parametrize("sep", [None, '/', '\\'])
def test_get_parent_paths(sep):
if sep is None:
gpp = get_parent_paths
else:
from functools import partial
gpp = partial(get_parent_paths, sep=sep)
# sanity/border checks
eq_(gpp([], []), [])
eq_(gpp([], ['a']), [])
eq_(gpp(['a'], ['a']), ['a'])
# Helper to provide testing across different seps and platforms while
# specifying only POSIX paths here in the test
def _p(path):
if sep is None:
return path
else:
return path.replace('/', sep)
_pp = lambda paths: list(map(_p, paths))
# no absolute paths anywhere
if on_windows:
assert_raises(ValueError, gpp, 'C:\\a', ['a'])
assert_raises(ValueError, gpp, ['a'], 'C:\\a')
elif sep != '\\': # \ does not make it absolute
assert_raises(ValueError, gpp, _p('/a'), ['a'])
assert_raises(ValueError, gpp, ['a'], [_p('/a')])
assert_raises(ValueError, gpp, [_p('a//a')], ['a'])
# dups the actual code but there is no other way AFAIK
asep = {'/': '\\', None: '\\', '\\': '/'}[sep]
assert_raises(ValueError, gpp, [f'a{asep}a'], ['a'])
paths = _pp(['a', 'a/b', 'a/b/file', 'c', 'd/sub/123'])
eq_(gpp(paths, []), paths)
eq_(gpp(paths, [], True), [])
# actually a tricky one! we should check in descending lengths etc
eq_(gpp(paths, paths), paths)
# every path is also its own parent
eq_(gpp(paths, paths, True), paths)
# subdatasets not for every path -- multiple paths hitting the same parent,
# and we will be getting only a single entry
# to mimic how git ls-tree operates
eq_(gpp(paths, ['a']), ['a', 'c', _p('d/sub/123')])
eq_(gpp(paths, ['a'], True), ['a'])
# and we get the deepest parent
eq_(gpp(_pp(['a/b/file', 'a/b/file2']), _pp(['a', 'a/b'])), _pp(['a/b']))
def test_get_filtered_paths_():
# just to avoid typing all the same
def gfp(*args, **kwargs):
return list(get_filtered_paths_(*args, **kwargs))
assert gfp(['a', 'b'], ['a', 'c']) == ['a']
assert gfp(['a', 'b'], ['b']) == ['b']
assert gfp(['a', 'b'], ['c']) == []
assert gfp(['a', 'b'], ['a/b', 'c']) == [] # a is not subpath of a/b
assert gfp(['a', 'b'], ['a/b', 'c'], include_within_path=True) == ['a'] # a is not subpath of a/b
# all paths returned due to '.', and order is sorted
paths = ['a', 'b', '1/2/3', 'abc']
paths_sorted = sorted(paths)
assert gfp(paths, ['.']) == paths_sorted
assert gfp(paths, paths_sorted) == paths_sorted
assert gfp(paths, paths_sorted, include_within_path=True) == paths_sorted
# we can take a mix of str and Path
assert gfp([PurePosixPath(paths[0])] + paths[1:], ['.']) == paths_sorted
# nothing within empty "filter_paths" matches -- so no paths yielded
assert gfp(paths, []) == []
assert_raises(ValueError, gfp, ['/a'], [])
assert_raises(ValueError, gfp, [PurePosixPath('/a')], [])
assert_raises(ValueError, gfp, ['a'], ['/a'])
assert_raises(ValueError, gfp, ['../a'], ['a'])
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_repo_save.py 0000644 0001751 0001751 00000014315 15137634221 022430 0 ustar 00runner runner # ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test saveds function"""
import shutil
from datalad.api import create
from datalad.distribution.dataset import Dataset
from datalad.support.annexrepo import AnnexRepo
from datalad.support.gitrepo import GitRepo
from datalad.tests.utils_pytest import (
assert_in,
assert_in_results,
assert_not_in,
assert_repo_status,
create_tree,
eq_,
get_annexstatus,
get_convoluted_situation,
known_failure_windows,
slow,
with_tempfile,
)
from datalad.utils import (
on_windows,
rmtree,
)
@with_tempfile
def test_save_basics(path=None):
ds = Dataset(path).create(result_renderer='disabled')
# nothing happens
eq_(list(ds.repo.save(paths=[], _status={})),
[])
# dataset is clean, so nothing happens with all on default
eq_(list(ds.repo.save()),
[])
def _test_save_all(path, repocls):
ds = get_convoluted_situation(path, repocls)
orig_status = ds.repo.status(untracked='all')
# TODO test the results when the are crafted
res = ds.repo.save()
# make sure we get a 'delete' result for each deleted file
eq_(
set(r['path'] for r in res if r['action'] == 'delete'),
{str(k) for k, v in orig_status.items()
if k.name in ('file_deleted', 'file_staged_deleted')}
)
saved_status = ds.repo.status(untracked='all')
# we still have an entry for everything that did not get deleted
# intentionally
eq_(
len([f for f, p in orig_status.items()
if not f.match('*_deleted')]),
len(saved_status))
# everything but subdataset entries that contain untracked content,
# or modified subsubdatasets is now clean, a repo simply doesn touch
# other repos' private parts
for f, p in saved_status.items():
if p.get('state', None) != 'clean':
assert f.match('subds_modified'), f
# Since we already have rich filetree, now save at dataset level
# recursively and introspect some known gotchas
resr = ds.save(recursive=True)
# File within subdataset got committed to git-annex, which was not the
# case for GitRepo parent https://github.com/datalad/datalad/issues/7351
assert_in_results(
resr,
status='ok',
path=str(ds.pathobj / 'subds_modified' / 'someds' / 'dirtyds' / 'file_untracked'),
# if key is None -- was committed to git which should have not happened!
key="MD5E-s14--2c320e0c56ed653384a926292647f226")
return ds
@slow # 11sec on travis
@known_failure_windows # see gh-5462
@with_tempfile
def test_gitrepo_save_all(path=None):
_test_save_all(path, GitRepo)
@slow # 11sec on travis
@known_failure_windows # see gh-5462
@with_tempfile
def test_annexrepo_save_all(path=None):
_test_save_all(path, AnnexRepo)
@with_tempfile
def test_save_typechange(path=None):
ckwa = dict(result_renderer='disabled')
ds = Dataset(path).create(**ckwa)
foo = ds.pathobj / 'foo'
# save a file
foo.write_text('some')
ds.save(**ckwa)
# now delete the file and replace with a directory and a file in it
foo.unlink()
foo.mkdir()
bar = foo / 'bar'
bar.write_text('foobar')
res = ds.save(**ckwa)
assert_in_results(res, path=str(bar), action='add', status='ok')
assert_repo_status(ds.repo)
if not on_windows:
# now replace file with subdataset
# (this is https://github.com/datalad/datalad/issues/5418)
bar.unlink()
Dataset(ds.pathobj / 'tmp').create(**ckwa)
shutil.move(ds.pathobj / 'tmp', bar)
res = ds.save(**ckwa)
assert_repo_status(ds.repo)
assert len(ds.subdatasets(**ckwa)) == 1
# now replace directory with subdataset
rmtree(foo)
Dataset(ds.pathobj / 'tmp').create(**ckwa)
shutil.move(ds.pathobj / 'tmp', foo)
# right now a first save() will save the subdataset removal only
ds.save(**ckwa)
# subdataset is gone
assert len(ds.subdatasets(**ckwa)) == 0
# but it takes a second save() run to get a valid status report
# to understand that there is a new subdataset on a higher level
ds.save(**ckwa)
assert_repo_status(ds.repo)
assert len(ds.subdatasets(**ckwa)) == 1
# now replace subdataset with a file
rmtree(foo)
foo.write_text('some')
ds.save(**ckwa)
assert_repo_status(ds.repo)
@with_tempfile
def test_save_to_git(path=None):
ds = Dataset(path).create(result_renderer='disabled')
create_tree(
ds.path,
{
'file_ingit': 'file_ingit',
'file_inannex': 'file_inannex',
}
)
ds.repo.save(paths=['file_ingit'], git=True)
ds.repo.save(paths=['file_inannex'])
assert_repo_status(ds.repo)
for f, p in get_annexstatus(ds.repo).items():
eq_(p['state'], 'clean')
if f.match('*ingit'):
assert_not_in('key', p, f)
elif f.match('*inannex'):
assert_in('key', p, f)
@with_tempfile
def test_save_subds_change(path=None):
ckwa = dict(result_renderer='disabled')
ds = Dataset(path).create(**ckwa)
subds = ds.create('sub', **ckwa)
assert_repo_status(ds.repo)
rmtree(subds.path)
res = ds.save(**ckwa)
assert_repo_status(ds.repo)
# updated .gitmodules, deleted subds, saved superds
assert len(res) == 3
assert_in_results(
res, type='dataset', path=ds.path, action='save')
assert_in_results(
res, type='dataset', path=subds.path, action='delete')
assert_in_results(
res, type='file', path=str(ds.pathobj / '.gitmodules'), action='add')
# now add one via save
subds2 = create(ds.pathobj / 'sub2', **ckwa)
res = ds.save(**ckwa)
# updated .gitmodules, added subds, saved superds
assert len(res) == 3
assert_repo_status(ds.repo)
assert_in_results(
res, type='dataset', path=ds.path, action='save')
assert_in_results(
res, type='dataset', path=subds2.path, action='add')
assert_in_results(
res, type='file', path=str(ds.pathobj / '.gitmodules'), action='add')
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_repodates.py 0000644 0001751 0001751 00000007443 15137634221 022437 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from unittest.mock import patch
from datalad.support.annexrepo import AnnexRepo
from datalad.support.gitrepo import GitRepo
from datalad.support.repodates import check_dates
from datalad.tests.utils_pytest import (
assert_equal,
assert_false,
assert_in,
assert_not_in,
assert_raises,
eq_,
ok_,
set_date,
with_tempfile,
with_tree,
)
@with_tempfile(mkdir=True)
def test_check_dates_empty_repo(path=None):
assert_false(check_dates(GitRepo(path, create=True))["objects"])
@with_tree(tree={"foo": "foo content",
"bar": "bar content"})
def test_check_dates(path=None):
refdate = 1218182889
with set_date(refdate - 1):
ar = AnnexRepo(path, create=True)
def tag_object(tag):
"""Return object for tag. Do not dereference it.
"""
# We can't use ar.get_tags because that returns the commit's hexsha,
# not the tag's, and ar.get_hexsha is limited to commit objects.
return ar.call_git_oneline(
["rev-parse", "refs/tags/{}".format(tag)], read_only=True)
ar.add("foo")
ar.commit("add foo")
foo_commit = ar.get_hexsha()
ar.commit("add foo")
ar.tag("foo-tag", "tag before refdate")
foo_tag = tag_object("foo-tag")
# Make a lightweight tag to make sure `tag_dates` doesn't choke on it.
ar.tag("light")
with set_date(refdate + 1):
ar.add("bar")
ar.commit("add bar")
bar_commit = ar.get_hexsha()
ar.tag("bar-tag", "tag after refdate")
bar_tag = tag_object("bar-tag")
with set_date(refdate + 2):
# Drop an annexed file so that we have more blobs in the git-annex
# branch than its current tree.
ar.drop("bar", options=["--force"])
results = {}
for which in ["older", "newer"]:
result = check_dates(ar, refdate, which=which)["objects"]
ok_(result)
if which == "newer":
assert_in(bar_commit, result)
assert_not_in(foo_commit, result)
assert_in(bar_tag, result)
elif which == "older":
assert_in(foo_commit, result)
assert_not_in(bar_commit, result)
assert_in(foo_tag, result)
results[which] = result
ok_(any(x.get("filename") == "uuid.log"
for x in results["older"].values()))
newer_tree = check_dates(ar, refdate, annex="tree")["objects"]
def is_annex_log_blob(entry):
return (entry["type"] == "annex-blob"
and entry["filename"].endswith(".log"))
def num_logs(entries):
return sum(map(is_annex_log_blob, entries.values()))
# Because we dropped bar above, we should have one more blob in the
# git-annex branch than in the current tree of the git-annex branch.
eq_(num_logs(results["newer"]) - num_logs(newer_tree), 1)
# Act like today is one day from the reference timestamp to check that we
# get the same results with the one-day-back default.
seconds_in_day = 60 * 60 * 24
with patch('time.time', return_value=refdate + seconds_in_day):
assert_equal(check_dates(ar, annex="tree")["objects"],
newer_tree)
# We can give a path (str) instead of a GitRepo object.
assert_equal(check_dates(path, refdate, annex="tree")["objects"],
newer_tree)
with assert_raises(ValueError):
check_dates(ar, refdate, which="unrecognized")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_sshconnector.py 0000644 0001751 0001751 00000026623 15137634221 023162 0 ustar 00runner runner # ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test classes SSHConnection and SSHManager
"""
import logging
import os.path as op
from os.path import (
exists,
getmtime,
isdir,
)
from os.path import join as opj
from datalad.tests.utils_pytest import (
SkipTest,
assert_false,
assert_in,
assert_is_instance,
assert_raises,
eq_,
get_most_obscure_supported_name,
get_ssh_port,
ok_,
patch_config,
skip_if_on_windows,
skip_nomultiplex_ssh,
skip_ssh,
swallow_logs,
with_tempfile,
with_tree,
)
from datalad.utils import Path
from ..sshconnector import (
MultiplexSSHConnection,
MultiplexSSHManager,
NoMultiplexSSHConnection,
SSHConnection,
SSHManager,
get_connection_hash,
sh_quote,
)
# Some tests test the internals and assumptions of multiplex connections
_ssh_manager_is_multiplex = SSHManager is MultiplexSSHManager
@skip_ssh
def test_ssh_get_connection():
manager = SSHManager()
if _ssh_manager_is_multiplex:
assert manager._socket_dir is None, \
"Should be unset upon initialization. Got %s" % str(manager._socket_dir)
c1 = manager.get_connection('ssh://datalad-test')
if _ssh_manager_is_multiplex:
assert manager._socket_dir, "Should be set after interactions with the manager"
assert_is_instance(c1, MultiplexSSHConnection)
# subsequent call returns the very same instance:
ok_(manager.get_connection('ssh://datalad-test') is c1)
else:
assert_is_instance(c1, NoMultiplexSSHConnection)
# fail on malformed URls (meaning: our fancy URL parser can't correctly
# deal with them):
#assert_raises(ValueError, manager.get_connection, 'localhost')
# we now allow those simple specifications of host to get_connection
c2 = manager.get_connection('datalad-test')
assert_is_instance(c2, SSHConnection)
# but should fail if it looks like something else
assert_raises(ValueError, manager.get_connection, 'datalad-test/')
assert_raises(ValueError, manager.get_connection, ':datalad-test')
# we can do what urlparse cannot
# assert_raises(ValueError, manager.get_connection, 'someone@localhost')
# next one is considered a proper url by urlparse (netloc:'',
# path='/localhost), but eventually gets turned into SSHRI(hostname='ssh',
# path='/localhost') -- which is fair IMHO -> invalid test
# assert_raises(ValueError, manager.get_connection, 'ssh:/localhost')
manager.close()
@skip_if_on_windows
@skip_ssh
@with_tree(tree={'f0': 'f0', 'f1': 'f1'})
@with_tempfile(suffix=get_most_obscure_supported_name(),
content="1")
def test_ssh_open_close(tmp_path=None, tfile1=None):
manager = SSHManager()
socket_path = None
if _ssh_manager_is_multiplex:
socket_path = opj(str(manager.socket_dir),
get_connection_hash('datalad-test'))
# TODO: facilitate the test when it didn't exist
existed_before = exists(socket_path)
c1 = manager.get_connection('ssh://datalad-test')
c1.open()
if socket_path:
# control master exists for sure now
ok_(exists(socket_path))
# use connection to execute remote command:
# we list explicitly local HOME since we override it in module_setup
#
# Note: Use realpath() below because we know that the resolved temporary
# test directory exists on the target (many tests rely on that), but it
# doesn't necessarily have the unresolved variant.
out, err = c1('ls -a {}'.format(sh_quote(op.realpath(tmp_path))))
remote_ls = [entry for entry in out.splitlines()
if entry != '.' and entry != '..']
eq_(set(remote_ls), {"f0", "f1"})
if socket_path:
ok_(exists(socket_path))
# now test for arguments containing spaces and other pleasant symbols
out, err = c1('ls -l {}'.format(sh_quote(tfile1)))
assert_in(tfile1, out)
# on a crippled FS it will actually say something like
# Control socket connect(...6258b3a7): Connection refused\r\n'
# but still work.
#eq_(err, '')
c1.close()
if socket_path:
# control master doesn't exist anymore:
ok_(exists(socket_path) == existed_before)
@skip_nomultiplex_ssh
def test_ssh_manager_close():
manager = SSHManager()
# check for previously existing sockets:
existed_before_1 = exists(opj(str(manager.socket_dir),
get_connection_hash('datalad-test')))
existed_before_2 = exists(opj(str(manager.socket_dir),
get_connection_hash('datalad-test2')))
manager.get_connection('ssh://datalad-test').open()
manager.get_connection('ssh://datalad-test2').open()
if existed_before_1 and existed_before_2:
# we need one connection to be closed and therefore being opened
# by `manager`
manager.get_connection('ssh://datalad-test').close()
manager.get_connection('ssh://datalad-test').open()
ok_(exists(opj(str(manager.socket_dir),
get_connection_hash('datalad-test'))))
ok_(exists(opj(str(manager.socket_dir),
get_connection_hash('datalad-test2'))))
manager.close()
still_exists_1 = exists(opj(str(manager.socket_dir),
get_connection_hash('datalad-test')))
still_exists_2 = exists(opj(str(manager.socket_dir),
get_connection_hash('datalad-test2')))
eq_(existed_before_1, still_exists_1)
eq_(existed_before_2, still_exists_2)
@with_tempfile
def test_ssh_manager_close_no_throw(bogus_socket=None):
manager = MultiplexSSHManager()
class bogus:
def close(self):
raise Exception("oh I am so bad")
@property
def ctrl_path(self):
with open(bogus_socket, "w") as f:
f.write("whatever")
return Path(bogus_socket)
# since we are digging into protected area - should also set _prev_connections
manager._prev_connections = {}
manager._connections['bogus'] = bogus()
assert_raises(Exception, manager.close)
assert_raises(Exception, manager.close)
# but should proceed just fine if allow_fail=False
with swallow_logs(new_level=logging.DEBUG) as cml:
manager.close(allow_fail=False)
assert_in('Failed to close a connection: oh I am so bad', cml.out)
@skip_if_on_windows
@skip_ssh
@with_tempfile(mkdir=True)
@with_tempfile(content='one')
@with_tempfile(content='two')
def test_ssh_copy(sourcedir=None, sourcefile1=None, sourcefile2=None):
port = get_ssh_port('datalad-test')
remote_url = 'ssh://datalad-test:{}'.format(port)
manager = SSHManager()
ssh = manager.get_connection(remote_url)
# copy content of sourcefile3 to an obscurely named file in sourcedir
obscure_file = get_most_obscure_supported_name()
obscure_path = opj(sourcedir, obscure_file)
with open(obscure_path, 'w') as f:
f.write('three')
# copy first two temp files to remote_url:sourcedir
sourcefiles = [sourcefile1, sourcefile2]
ssh.put(sourcefiles, opj(remote_url, sourcedir))
# copy obscure file to remote_url:sourcedir/'.c opy'
# we copy to a different name because the test setup maps local dir and
# remote dir to the same directory on the test machine. That means the file
# is copied onto itself. With ssh version 9 this leads to an empty file.
# We perform copy instead of just writing the content to the destination
# file, because ww want to ensure that the source file is picked up by
# 'ssh.put()'.
ssh.put([obscure_path], opj(remote_url, sourcedir, obscure_file + '.c opy'))
# docs promise that connection is auto-opened in case of multiplex
if _ssh_manager_is_multiplex:
ok_(ssh.is_open())
# recursive copy tempdir to remote_url:targetdir
targetdir = sourcedir + '.c opy'
ssh.put(sourcedir, opj(remote_url, targetdir),
recursive=True, preserve_attrs=True)
# check if sourcedir copied to remote_url:targetdir
ok_(isdir(targetdir))
# check if scp preserved source directory attributes
# if source_mtime=1.12s, scp -p sets target_mtime = 1.0s, test that
eq_(getmtime(targetdir), int(getmtime(sourcedir)) + 0.0)
# check if targetfiles(and its content) exist in remote_url:targetdir,
# this implies file(s) and recursive directory copying pass
for targetfile, content in zip(sourcefiles + [obscure_file + '.c opy'],
['one', 'two', 'three']):
targetpath = opj(targetdir, targetfile)
ok_(exists(targetpath))
with open(targetpath, 'r') as fp:
eq_(content, fp.read())
# and now a quick smoke test for get
# but simplify the most obscure filename slightly to not trip `scp` itself
togetfile = Path(targetdir) / (obscure_file.replace('`', '') + '2')
togetfile.write_text(str('something'))
ssh.get(opj(remote_url, str(togetfile)), sourcedir)
ok_((Path(sourcedir) / togetfile.name).exists())
ssh.close()
@skip_if_on_windows
@skip_ssh
def test_ssh_compound_cmds():
ssh = SSHManager().get_connection('ssh://datalad-test')
out, err = ssh('[ 1 = 2 ] && echo no || echo success')
eq_(out.strip(), 'success')
ssh.close() # so we get rid of the possibly lingering connections
@skip_if_on_windows
@skip_ssh
def test_ssh_custom_identity_file():
ifile = "/tmp/dl-test-ssh-id" # Travis
if not op.exists(ifile):
raise SkipTest("Travis-specific '{}' identity file does not exist"
.format(ifile))
with patch_config({"datalad.ssh.identityfile": ifile}):
with swallow_logs(new_level=logging.DEBUG) as cml:
manager = SSHManager()
ssh = manager.get_connection('ssh://datalad-test')
cmd_out, _ = ssh("echo blah")
if _ssh_manager_is_multiplex:
expected_socket = op.join(
str(manager.socket_dir),
get_connection_hash("datalad-test", identity_file=ifile))
ok_(exists(expected_socket))
manager.close()
assert_in("-i", cml.out)
assert_in(ifile, cml.out)
@skip_if_on_windows
@skip_ssh
def test_ssh_git_props():
remote_url = 'ssh://datalad-test'
manager = SSHManager()
ssh = manager.get_connection(remote_url)
# Note: Avoid comparing these versions directly to the versions in
# external_versions because the ssh://localhost versions detected might
# differ depending on how git-annex is installed.
ok_(ssh.get_annex_version())
ok_(ssh.get_git_version())
manager.close() # close possibly still present connections
# situation on our test windows boxes is complicated
# login shell is a POSIX one, path handling and equivalence between
# local and "remote" needs more research
@skip_if_on_windows
@skip_ssh
@with_tempfile(mkdir=True)
def test_bundle_invariance(path=None):
remote_url = 'ssh://datalad-test'
manager = SSHManager()
testfile = Path(path) / 'dummy'
for flag in (True, False):
assert_false(testfile.exists())
ssh = manager.get_connection(remote_url, use_remote_annex_bundle=flag)
ssh('cd .>{}'.format(str(testfile)))
ok_(testfile.exists())
testfile.unlink()
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_sshrun.py 0000644 0001751 0001751 00000007253 15137634221 021772 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import sys
from io import (
StringIO,
UnsupportedOperation,
)
from unittest.mock import patch
import pytest
from datalad.api import sshrun
from datalad.cli.main import main
from datalad.cmd import (
StdOutCapture,
WitlessRunner,
)
from datalad.tests.utils_pytest import (
SkipTest,
assert_equal,
assert_raises,
skip_if_on_windows,
skip_ssh,
swallow_outputs,
with_tempfile,
)
@pytest.mark.xfail(reason="under pytest for some reason gets 1 not 42")
@skip_if_on_windows
@skip_ssh
def test_exit_code():
# will relay actual exit code on CommandError
cmd = ['datalad', 'sshrun', 'datalad-test', 'exit 42']
with assert_raises(SystemExit) as cme:
# running nosetests without -s
if isinstance(sys.stdout, StringIO): # pragma: no cover
with swallow_outputs(): # need to give smth with .fileno ;)
main(cmd)
else:
# to test both scenarios
main(cmd)
assert_equal(cme.value.code, 42)
@skip_if_on_windows
@skip_ssh
@with_tempfile(content="123magic")
def test_no_stdin_swallow(fname=None):
# will relay actual exit code on CommandError
cmd = ['datalad', 'sshrun', 'datalad-test', 'cat']
out = WitlessRunner().run(
cmd, stdin=open(fname), protocol=StdOutCapture)
assert_equal(out['stdout'].rstrip(), '123magic')
# test with -n switch now, which we could place even at the end
out = WitlessRunner().run(
cmd + ['-n'], stdin=open(fname), protocol=StdOutCapture)
assert_equal(out['stdout'], '')
@skip_if_on_windows
@skip_ssh
@with_tempfile(suffix="1 space", content="magic")
def test_fancy_quotes(f=None):
cmd = ['datalad', 'sshrun', 'datalad-test', """'cat '"'"'%s'"'"''""" % f]
out = WitlessRunner().run(cmd, protocol=StdOutCapture)
assert_equal(out['stdout'], 'magic')
@skip_if_on_windows
@skip_ssh
def test_ssh_option():
# This test is hacky in that detecting the sent value depends on systems
# commonly configuring `AcceptEnv LC_*` in their sshd_config. If we get
# back an empty value, assume that isn't configured, and skip the test.
with patch.dict('os.environ', {"LC_DATALAD_HACK": 'hackbert'}):
with swallow_outputs() as cmo:
with assert_raises(SystemExit):
main(["datalad", "sshrun", "-oSendEnv=LC_DATALAD_HACK",
"datalad-test", "echo $LC_DATALAD_HACK"])
out = cmo.out.strip()
if not out:
raise SkipTest(
"SSH target probably does not accept LC_* variables. "
"Skipping")
assert_equal(out, "hackbert")
@skip_if_on_windows
@skip_ssh
def test_ssh_ipv4_6_incompatible():
with assert_raises(SystemExit):
main(["datalad", "sshrun", "-4", "-6", "datalad-test", "true"])
@skip_if_on_windows
@skip_ssh
def test_ssh_ipv4_6():
# This should fail with a RuntimeError if a version is not supported (we're
# not bothering to check what datalad-test supports), but if the processing
# fails, it should be something else.
for kwds in [{"ipv4": True}, {"ipv6": True}]:
try:
sshrun("datalad-test", "true", **kwds)
except RuntimeError:
pass
except UnsupportedOperation as exc:
pytest.skip(f"stdin is swallowed by pytest: {exc}")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_stats.py 0000644 0001751 0001751 00000006111 15137634221 021576 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from ...tests.utils_pytest import (
assert_equal,
assert_in,
assert_not_equal,
assert_raises,
)
from ..stats import (
_COUNTS,
ActivityStats,
)
def test_ActivityStats_basic():
stats = ActivityStats()
assert_raises(AttributeError, setattr, stats, "unknown_attribute", 1)
for c in _COUNTS:
assert_equal(getattr(stats, c), 0)
stats.files += 1
assert_equal(stats.files, 1)
stats.increment('files')
assert_equal(stats.files, 2)
assert_equal(stats.as_dict()['files'], 2)
# smoke tests
assert_equal(stats.as_str(), stats.as_str(mode='full'))
assert_equal(len(stats.as_str(mode='line').split('\n')), 1)
assert_in('files=2', repr(stats))
stats.reset()
for c in _COUNTS:
assert_equal(getattr(stats, c), 0)
# Check a copy of stats
stats_total = stats.get_total()
assert_equal(stats_total.files, 2)
stats.files += 1
assert_equal(stats.files, 1)
assert_equal(stats_total.files, 2) # shouldn't change -- a copy!
# Let's add some merges
stats.merges.append(('upstream', 'master'))
stats_total = stats.get_total()
assert_equal(stats_total.merges, stats.merges)
assert_equal(stats.as_str(), """Files processed: 1
Branches merged: upstream->master""")
assert_equal(stats.as_str(mode='line'), "Files processed: 1, Branches merged: upstream->master")
stats.urls += 2
stats.downloaded += 1
stats.downloaded_size += 123456789 # will invoke formatter
assert_in("size: 123.5 MB", stats.as_str())
def test_ActivityStats_comparisons():
stats1 = ActivityStats()
stats2 = ActivityStats()
assert_equal(stats1, stats2)
stats1.files += 1
assert_not_equal(stats1, stats2)
# if we reset -- should get back the same although totals should be different
stats1.reset()
assert_equal(stats1.as_str(), stats2.as_str())
assert_equal(stats1, stats2)
assert_not_equal(stats1.get_total(), stats2.get_total())
#stats1.reset(full=True)
#assert_equal(stats1, stats2)
def test_add():
stats1 = ActivityStats()
stats2 = ActivityStats()
stats1.files += 1
stats2.files += 1
stats2.urls += 1
assert_equal(stats1, ActivityStats(files=1))
assert_equal(stats2, ActivityStats(files=1, urls=1))
stats1 += stats2
assert_equal(stats1, ActivityStats(files=2, urls=1))
assert_equal(stats1.get_total(), ActivityStats(files=2, urls=1))
stats3 = stats1 + stats2
# no changes to stats1 or stats2
assert_equal(stats1, ActivityStats(files=2, urls=1))
assert_equal(stats1.get_total(), ActivityStats(files=2, urls=1))
assert_equal(stats2, ActivityStats(files=1, urls=1))
assert_equal(stats3.get_total(), ActivityStats(files=3, urls=2))
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_status.py 0000644 0001751 0001751 00000002731 15137634221 021767 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from datalad.tests.utils_pytest import (
assert_equal,
assert_not_equal,
)
from ..status import FileStatus
def test_FileStatus_basic():
assert_equal(FileStatus(size=0), FileStatus(size=0))
assert_not_equal(FileStatus(size=0), FileStatus(size=1))
# mtimes allow trimming if one is int
assert_equal(FileStatus(mtime=0), FileStatus(mtime=0.9999))
assert_equal(FileStatus(mtime=0), FileStatus(mtime=0.0001))
assert_not_equal(FileStatus(mtime=0.2), FileStatus(mtime=0.1))
assert_not_equal(FileStatus(mtime=0.2), FileStatus(mtime=None))
assert_not_equal(FileStatus(mtime=1), FileStatus(mtime=None))
# And with None should be False
assert_not_equal(FileStatus(mtime=1), None)
assert_not_equal(None, FileStatus(mtime=1))
# adding more information would result in not-equal
assert_not_equal(FileStatus(size=0), FileStatus(size=0, mtime=123))
# empty ones can't be compared
# TODO: actually not clear why that NotImplemented singleton is not returned
assert_not_equal(FileStatus(), FileStatus())
#assert_false(FileStatus() != FileStatus())
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/test_vcr_.py 0000644 0001751 0001751 00000002054 15137634221 021373 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Tests for vcr adapter"""
from ...tests.utils_pytest import (
SkipTest,
eq_,
)
from ..vcr_ import use_cassette
def test_use_cassette_if_no_vcr():
# just test that our do nothing decorator does the right thing if vcr is not present
skip = False
try:
import vcr
skip = True
except ImportError:
pass
except:
# if anything else goes wrong with importing vcr, we still should be able to
# run use_cassette
pass
if skip:
raise SkipTest("vcr is present, can't test behavior with vcr presence ATM")
@use_cassette("some_path")
def checker(x):
return x + 1
eq_(checker(1), 2)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/tests/utils.py 0000644 0001751 0001751 00000000675 15137634221 020552 0 ustar 00runner runner # ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Utils for testing support module
"""
from datalad.support.external_versions import external_versions
from datalad.tests.utils_pytest import *
././@PaxHeader 0000000 0000000 0000000 00000000033 00000000000 010211 x ustar 00 27 mtime=1769945274.887061
datalad-1.3.1/datalad/support/third/ 0000755 0001751 0001751 00000000000 15137634273 017007 5 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/third/__init__.py 0000644 0001751 0001751 00000000000 15137634221 021077 0 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/third/loris_token_generator.py 0000644 0001751 0001751 00000002646 15137634221 023760 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### #
import json
import sys
from urllib.error import HTTPError
from urllib.request import (
Request,
urlopen,
)
from datalad.support.exceptions import AccessDeniedError
from datalad.utils import ensure_unicode
class LORISTokenGenerator(object):
"""
Generate a LORIS API token by making a request to the
LORIS login API endpoint with the given username
and password.
url is the complete URL of the $LORIS/api/$VERSION/login
endpoint.
"""
def __init__(self, url=None):
assert(url is not None)
self.url = url
def generate_token(self, user=None, password=None):
data = {'username': user, 'password' : password}
encoded_data = json.dumps(data).encode('utf-8')
request = Request(self.url, encoded_data)
try:
response = urlopen(request)
except HTTPError:
raise AccessDeniedError("Could not authenticate into LORIS")
str_response = ensure_unicode(response.read())
data = json.loads(str_response)
return data["token"]
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/third/nda_aws_token_generator.py 0000644 0001751 0001751 00000007221 15137634221 024236 0 ustar 00runner runner ## NDA AWS Token Generator
## Author: NIMH Data Archives
## http://ndar.nih.gov
## License: MIT
## https://opensource.org/licenses/MIT
import binascii
import hashlib
import logging
import xml.etree.ElementTree as etree
from urllib import request as urllib_request
class NDATokenGenerator(object):
__schemas = {
'soap': 'http://schemas.xmlsoap.org/soap/envelope/',
'data': 'http://gov/nih/ndar/ws/datamanager/server/bean/jaxb'
}
def __init__(self, url):
assert url is not None
self.url = url
logging.debug('constructed with url %s', url)
def generate_token(self, username, password):
logging.info('request to generate AWS token')
encoded_password = self.__encode_password(password)
request_xml = self.__construct_request_xml(username, encoded_password)
return self.__make_request(request_xml)
def __encode_password(self, password):
logging.debug('encoding password')
hasher = hashlib.sha1()
hasher.update(password.encode('utf-8'))
digest_bytes = hasher.digest()
byte_string = binascii.hexlify(digest_bytes)
output = byte_string.decode('utf-8')
logging.debug('encoded password hash: %s', output)
return output
def __construct_request_xml(self, username, encoded_password):
logging.debug('constructing request with %s - %s', username, encoded_password)
soap_schema = self.__schemas['soap']
datamanager_schema = self.__schemas['data']
element = etree.Element('{%s}Envelope' % soap_schema)
body = etree.SubElement(element, '{%s}Body' % soap_schema)
userelement = etree.SubElement(body, '{%s}UserElement' % datamanager_schema)
user = etree.SubElement(userelement, "user")
uid = etree.SubElement(user, "id")
uid.text = '0'
uid = etree.SubElement(user, "name")
uid.text = username
uid = etree.SubElement(user, "password")
uid.text = encoded_password
uid = etree.SubElement(user, "threshold")
uid.text = '0'
logging.debug(etree.tostring(element))
return etree.tostring(element)
def __make_request(self, request_message):
logging.debug('making post request to %s', self.url)
headers = {
'SOAPAction': '"generateToken"',
'Content-Type': 'text/xml; charset=utf-8'
}
request = urllib_request.Request(self.url, data=request_message, headers=headers)
logging.debug(request)
response = urllib_request.urlopen(request)
return self.__parse_response(response.read())
def __parse_response(self, response):
logging.debug('parsing response')
tree = etree.fromstring(response)
error = tree.find('.//errorMessage')
if error is not None:
error_msg = error.text
logging.error('response had error message: %s', error_msg)
raise Exception(error_msg)
generated_token = tree[0][0]
token_elements = [e.text for e in generated_token[0:4]]
token = Token(*token_elements)
return token
class Token:
def __init__(self, access_key, secret_key, session, expiration):
logging.debug('constructing token')
self._access_key = access_key
self._secret_key = secret_key
self._session = session
self._expiration = expiration
@property
def access_key(self):
return self._access_key
@property
def secret_key(self):
return self._secret_key
@property
def session(self):
return self._session
@property
def expiration(self):
return self._expiration
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/support/vcr_.py 0000644 0001751 0001751 00000010117 15137634221 017171 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Adapters and decorators for vcr
"""
import logging
from contextlib import contextmanager
from functools import wraps
from os.path import isabs
from datalad.support.exceptions import CapturedException
from datalad.utils import Path
lgr = logging.getLogger("datalad.support.vcr")
def _get_cassette_path(path):
"""Return a path to the cassette within our unified 'storage'"""
if not isabs(path): # so it was given as a name
return "fixtures/vcr_cassettes/%s.yaml" % path
return path
try:
# TEMP: Just to overcome problem with testing on jessie with older requests
# https://github.com/kevin1024/vcrpy/issues/215
import requests as _
import vcr.patch as _vcrp
try:
from requests.packages.urllib3.connectionpool import \
HTTPConnection as _a
from requests.packages.urllib3.connectionpool import \
VerifiedHTTPSConnection as _b
except ImportError:
def returnnothing(*args, **kwargs):
return()
_vcrp.CassettePatcherBuilder._requests = returnnothing
from vcr import VCR as _VCR
from vcr import use_cassette as _use_cassette
def use_cassette(path, return_body=None, skip_if_no_vcr=False, **kwargs):
"""Adapter so we could create/use custom use_cassette with custom parameters
Parameters
----------
path : str
If not absolute path, treated as a name for a cassette under fixtures/vcr_cassettes/
skip_if_no_vcr : bool
Rather than running without VCR it would throw unittest.SkipTest
exception. Of effect only if vcr import fails (so not in this
implementation but the one below)
"""
path = _get_cassette_path(path)
lgr.debug("Using cassette %s", path)
if return_body is not None:
my_vcr = _VCR(
before_record_response=lambda r: dict(r, body={'string': return_body.encode()}))
return my_vcr.use_cassette(path, **kwargs) # with a custom response
else:
return _use_cassette(path, **kwargs) # just a straight one
# shush vcr
vcr_lgr = logging.getLogger('vcr')
if lgr.getEffectiveLevel() > logging.DEBUG:
vcr_lgr.setLevel(logging.WARN)
except Exception as exc:
if not isinstance(exc, ImportError):
# something else went hairy (e.g. vcr failed to import boto due to some syntax error)
lgr.warning("Failed to import vcr, no cassettes will be available: %s",
CapturedException(exc))
# If there is no vcr.py -- provide a do nothing decorator for use_cassette
def use_cassette(path, return_body=None, skip_if_no_vcr=False, **kwargs):
if skip_if_no_vcr:
def skip_decorator(t):
@wraps(t)
def wrapper(*args, **kwargs):
from unittest import SkipTest
raise SkipTest("No vcr")
return wrapper
return skip_decorator
else:
def do_nothing_decorator(t):
@wraps(t)
def wrapper(*args, **kwargs):
lgr.debug("Not using vcr cassette")
return t(*args, **kwargs)
return wrapper
return do_nothing_decorator
@contextmanager
def externals_use_cassette(name):
"""Helper to pass instruction via env variables to use specified cassette
For instance whenever we are testing custom special remotes invoked by the annex
but want to minimize their network traffic by using vcr.py
"""
from unittest.mock import patch
cassette_path = str(Path(_get_cassette_path(name)).resolve()) # realpath OK
with patch.dict('os.environ', {'DATALAD_TESTS_USECASSETTE': cassette_path}):
yield
././@PaxHeader 0000000 0000000 0000000 00000000034 00000000000 010212 x ustar 00 28 mtime=1769945274.8920612
datalad-1.3.1/datalad/tests/ 0000755 0001751 0001751 00000000000 15137634273 015323 5 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/__init__.py 0000644 0001751 0001751 00000001450 15137634221 017425 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import os
import shutil
import tempfile
from logging import getLogger
lgr = getLogger("datalad.tests")
# We will delay generation of some test files/directories until they are
# actually used but then would remove them here
_TEMP_PATHS_GENERATED = []
# Give a custom template so we could hunt them down easily
tempfile.template = os.path.join(tempfile.gettempdir(),
'tmp-page2annex')
././@PaxHeader 0000000 0000000 0000000 00000000034 00000000000 010212 x ustar 00 28 mtime=1769945274.8930612
datalad-1.3.1/datalad/tests/ca/ 0000755 0001751 0001751 00000000000 15137634273 015706 5 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/README 0000644 0001751 0001751 00000002451 15137634221 016561 0 ustar 00runner runner # Provenance info for the complete root CA/cert setup in this directory
password=OoTh9oM9
# make CA authority, needs password for ca-key.pem
openssl genrsa -aes256 -out ca-key.pem 2048
# make root certificate, needs password for ca-key.pem
openssl req -x509 -new -subj "/C=XX/CN=datalad-ci" -nodes -extensions v3_ca -key ca-key.pem -days 3650 -out ca-root.pem -sha512
# deploy certificate on Debian
sudo mkdir -p /usr/local/share/ca-certificates/
sudo cp ca-root.pem /usr/local/share/ca-certificates/datalad-root.crt
sudo update-ca-certificates
# generate a private key for certificates
openssl genrsa -out certificate-key.pem 2048
# generate certificate signing request for a specific machine
openssl req -new -subj "/C=XX/CN=localhost" -key certificate-key.pem -out certificate.csr -sha512 -batch
# generate 10 year certificate, needs password for ca-key.pem
openssl x509 -req -in certificate.csr -CA ca-root.pem -CAkey ca-key.pem -CAcreateserial -out certificate-pub.pem -days 3650 -sha512
# whenever root CA and public certificate cannot be provisioned separately,
# they could be combined
cat ca-root.pem certificate-pub.pem > ca_bundle.pem
# check if certificate is considered valid
openssl verify -trusted ca_bundle.pem certificate-pub.pem
# potentially
#export REQUESTS_CA_BUNDLE=/path/to/ca_bundle.pem
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/ca-key.pem 0000644 0001751 0001751 00000003346 15137634221 017561 0 ustar 00runner runner -----BEGIN RSA PRIVATE KEY-----
Proc-Type: 4,ENCRYPTED
DEK-Info: AES-256-CBC,C35B75517DFEB3957C55AD52F9AA1A15
vzYQVDau0Cl+H+pe9xmMNGvXpLSWUJ/Yd5Gw7675yz8/LCySyL2Q/4hpXxad9vI1
QfIIuUMaGG6UiKgDNgWg7i3FIDGsyJny+NIrYuCdsDNAArzpmESe5xxPckntQXpV
Ca1RdqlDhfYuoavFnBQNo6Ln4vm2HgEmsD1FWiPCtN7N9SBXJq5nwUW7LRco9HCn
WxpG2+/9oYZ+f1+Q+HcREHgqspEDUH6F28jPfP6r/SgrcTiXEmRhRNhs6wXW6xhS
O8JZptitMru3z99MQgH83+AWSdTZysGCvKrZi/yVfOjv+kyfkm1w4Sip/MQ2fh2j
wK3dINqlfLWyBTGDwP6ud4sWFKBAtgzC2Qq0FHWOiTb62Fuo+wZZNVfFjQrv/3bb
qnxl8/iebk1oYuovidhqGPOk4koGn4uSec3IAqEhf13eQ8Y9K3f3wew1f8HDUVn7
clexFqm8r4YLDWxNOZTRC9BSw/y143E8QJ1pRBt1SVEPSYioOArr9WyhR/SUt7zG
Chs5GzLtQZpppyoh0lmwqzbQ8TwkaK3NIuHBeHWWoQCCJCLm0DQJiG0VE1HmdGsk
Kor5m6c8Y5URqDGEVD/Wk8rzzGk5DJ/zt4RRSFkIHAcclpZ6P+Xgyeu3i0rR/mEq
/9Xy7bDop1I1cpABWsrlcvNywO7xHNT8ZdnIrRd107GoF9uSmo4xw+SP3I2bTvVf
9R5VoxzdstLKPn806Y+Lv0lQ+3aw5+sUSQSeckvJ9uG+jl7quCssiF5WkkPdN9oh
nZETEUH0pPaAMPSPj18BoK9smKNC29xoq41p/U2EUsBL7XPYF313r2uOor20zcAb
ghy3SYPJKQylPgjdqoxcQQPJUDtTOIdtzRhIKaI5iB3ExhxUgQaBcJr/1sGwbVu8
SzFm8IsAFZisJ8FEpQp9vrFqEOCMnQOEFezhzBENYec2TXr6uKsD7Ew4WDl8acp+
ZJTJIcfm9yQREfYSBeCSxcPUHB25uLJQZebq5GICXFMzyH/az+w7T/CkipfKWo19
lzshl0/+T0zk4zxkkrfyn0HykHy2DD/gxwg/s+XYjj99g3UHc9iQX5osVA4KKQdz
1glfo0sXaICvGTl9qZ2T85N6bBDdM5j81+kTgwcPtwWumMvEluXmVNOVbkcoxVM5
DU2v4lV37cJhhP5wo2WhuNogfEKu2pmm36Q04dc2fT4u0bveLXiGw29F4Mk6/h/P
c+ERGQWW7S6GkL1l7XybSIFTFr2I2l95C3dBUpQbMZQG8CjRQJ5prgXQ0yJN1/C0
vteJHe/q7iKQG2M1WT4RVkS4trx01bmTQLLMQzydTusLw9zy+Yd+mchdjOX9l2Of
Y3k1gZrdZ5GNYcKuZ0nD4fzFxk+r7Q72AcjcfaKCRY0jFZBIv84x2qssVveLabB+
s6tuokFAgaP3e036dlrvXs6+GDr2kS5ZNMWGBc+bIzMcxkD6W8xA043/fO5iv3wn
gu11q8OJe7owJVvhAq+bGtv2tzvlOxR47+LyShoAU8LjDojqalQKDjDVAUXdVDjF
IftSLbLMRYqIhghmutbO7pmiNBttY1PCbbBvmM9dV7hbSM5vMF/Dw2ekmo67G7lQ
-----END RSA PRIVATE KEY-----
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/ca-root.pem 0000644 0001751 0001751 00000002177 15137634221 017755 0 ustar 00runner runner -----BEGIN CERTIFICATE-----
MIIDJTCCAg2gAwIBAgIUCjAucz9dliVpJXcDcs/qM2ARvWAwDQYJKoZIhvcNAQEN
BQAwIjELMAkGA1UEBhMCWFgxEzARBgNVBAMMCmRhdGFsYWQtY2kwHhcNMjExMTA4
MTMzODA5WhcNMzExMTA2MTMzODA5WjAiMQswCQYDVQQGEwJYWDETMBEGA1UEAwwK
ZGF0YWxhZC1jaTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAJvL4zjP
GwOLcJ/l5m+u0LJ6zF1VeLLwA8Cll0teAWDakiQFpoKvFpYc28gwetrn8iYA9K0+
yPQzd+4DO6A2H4coE6nN2xoNgIWbbmupcAEZipNrsrYXqjMcEZHurFWYlCOHejSs
c6Xnhq9cDr8wHdE7AsGgPkZbeVeW64V5JEfSLXuOqEscPPmUAWuqDDYDRHoK8pgD
7wRhFJYmNt0HxIzdsZSZw1Gq7YRb0E9K25W1w6wIzpsjuCFzIEhN5bDYhnbFVSM+
15I51fHkXxro/F7WxqnPlqavy6ORTR5oLWGepaoNtsI804FJ8AjkqhCXUCjZ9p17
mR+O3R2LVjjbHHcCAwEAAaNTMFEwHQYDVR0OBBYEFMZ82ji8X5tBjW7BbgHgWibT
lF+AMB8GA1UdIwQYMBaAFMZ82ji8X5tBjW7BbgHgWibTlF+AMA8GA1UdEwEB/wQF
MAMBAf8wDQYJKoZIhvcNAQENBQADggEBABTOYJoHPw8gaA12l/pVOuF++zExTdwY
63o3JHJaVNkww1mcpnrFUnRknMqElRFWO5Y/i7+jlSUoaCUCfDV6ltx2zW6dtBvq
+2z8A+5bmm3SzUmniFdrG2LTCdF0/bBHjsREKywzjzgymGh/0WIq0VV9MWXvxFO7
0U89y3DRHBsm+Ub3epq1S9FWKN4RiAcmaKIJqF1ZShR5C4v5U+ep1OTPi17xBToo
gl15O3htUNiVst2MhyVc3ETSfnP4AFClx2orXenjtmej7g1skh3LoWnFA0k4/Qch
c0zZWjfzyUF5io43WcXxOK+m+s0O99DNTyUMpM42WguciWb1aoopQKY=
-----END CERTIFICATE-----
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/ca-root.srl 0000644 0001751 0001751 00000000051 15137634221 017761 0 ustar 00runner runner 4340233949FA8655F954BB83BFB9A6D1A4F80E23
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/ca_bundle.pem 0000644 0001751 0001751 00000004204 15137634221 020316 0 ustar 00runner runner -----BEGIN CERTIFICATE-----
MIIDJTCCAg2gAwIBAgIUCjAucz9dliVpJXcDcs/qM2ARvWAwDQYJKoZIhvcNAQEN
BQAwIjELMAkGA1UEBhMCWFgxEzARBgNVBAMMCmRhdGFsYWQtY2kwHhcNMjExMTA4
MTMzODA5WhcNMzExMTA2MTMzODA5WjAiMQswCQYDVQQGEwJYWDETMBEGA1UEAwwK
ZGF0YWxhZC1jaTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAJvL4zjP
GwOLcJ/l5m+u0LJ6zF1VeLLwA8Cll0teAWDakiQFpoKvFpYc28gwetrn8iYA9K0+
yPQzd+4DO6A2H4coE6nN2xoNgIWbbmupcAEZipNrsrYXqjMcEZHurFWYlCOHejSs
c6Xnhq9cDr8wHdE7AsGgPkZbeVeW64V5JEfSLXuOqEscPPmUAWuqDDYDRHoK8pgD
7wRhFJYmNt0HxIzdsZSZw1Gq7YRb0E9K25W1w6wIzpsjuCFzIEhN5bDYhnbFVSM+
15I51fHkXxro/F7WxqnPlqavy6ORTR5oLWGepaoNtsI804FJ8AjkqhCXUCjZ9p17
mR+O3R2LVjjbHHcCAwEAAaNTMFEwHQYDVR0OBBYEFMZ82ji8X5tBjW7BbgHgWibT
lF+AMB8GA1UdIwQYMBaAFMZ82ji8X5tBjW7BbgHgWibTlF+AMA8GA1UdEwEB/wQF
MAMBAf8wDQYJKoZIhvcNAQENBQADggEBABTOYJoHPw8gaA12l/pVOuF++zExTdwY
63o3JHJaVNkww1mcpnrFUnRknMqElRFWO5Y/i7+jlSUoaCUCfDV6ltx2zW6dtBvq
+2z8A+5bmm3SzUmniFdrG2LTCdF0/bBHjsREKywzjzgymGh/0WIq0VV9MWXvxFO7
0U89y3DRHBsm+Ub3epq1S9FWKN4RiAcmaKIJqF1ZShR5C4v5U+ep1OTPi17xBToo
gl15O3htUNiVst2MhyVc3ETSfnP4AFClx2orXenjtmej7g1skh3LoWnFA0k4/Qch
c0zZWjfzyUF5io43WcXxOK+m+s0O99DNTyUMpM42WguciWb1aoopQKY=
-----END CERTIFICATE-----
-----BEGIN CERTIFICATE-----
MIICyjCCAbICFENAIzlJ+oZV+VS7g7+5ptGk+A4jMA0GCSqGSIb3DQEBDQUAMCIx
CzAJBgNVBAYTAlhYMRMwEQYDVQQDDApkYXRhbGFkLWNpMB4XDTIxMTEwODEzMzkx
NFoXDTMxMTEwNjEzMzkxNFowITELMAkGA1UEBhMCWFgxEjAQBgNVBAMMCWxvY2Fs
aG9zdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMGiBxMPD5UxakpZ
xG8jVDi3wB80sy21TRiCNkpNi3nDp9lMdUXPK/sJB2KgyrK5+AHTFJYm0XP3W5qq
ddiyrVRDA8MTgLSa9aC044RuGHcqPajkQkXCyIJdVitLLj2mijVGJyLyV8FP7r95
u+l56l7yx8f6K0H7BePOCosIp5tY0aw/+A/b87y+Ucp5S2y/IzD5bt5IbVD9qSwi
CYm38JjvGOJobW+ozjnBYXNtiH6MPfUiwHjidu3saVeZAYi+Fi3kd1sW/JusVWp2
TEZaAnYNb5qE3XAPlP0fOZKLK0NH0i+7F56ONuGUCnBjPeJ0payoZjoy+wSAxTqM
+HG9oFECAwEAATANBgkqhkiG9w0BAQ0FAAOCAQEAbPfz/+5DMgDXvKkFrA4prruw
rczd9+91b4udcaYzCYN+VTLK34GPFL462LuCebkY55qoYBn6eNP0aIPfRAKWAApm
PcomgSi92gGr1lFNkvrtI4IF4I9NNZVYq5DPxiJv5mVskfwiNJyD0RXkw+wiH3Fn
4rxTVNcN2BK3ax6pDqXXEHjs7mkLCCvERGfnOHCQUSTKwStXI4JU8UV40Xl9lnl+
R5zXgmRd6jhNQ6YFJdf24i98g50wb4jZF4bqwpPOI5slrQ2UY9EOkIyR+00hmesG
k2X8BvEK4GDIVLVIpNO8zFuQiRYsL8FBaY83LIx6EZ5RX1rN45K8SrJekHhwgQ==
-----END CERTIFICATE-----
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/certificate-key.pem 0000644 0001751 0001751 00000003217 15137634221 021455 0 ustar 00runner runner -----BEGIN RSA PRIVATE KEY-----
MIIEpAIBAAKCAQEAwaIHEw8PlTFqSlnEbyNUOLfAHzSzLbVNGII2Sk2LecOn2Ux1
Rc8r+wkHYqDKsrn4AdMUlibRc/dbmqp12LKtVEMDwxOAtJr1oLTjhG4Ydyo9qORC
RcLIgl1WK0suPaaKNUYnIvJXwU/uv3m76XnqXvLHx/orQfsF484Kiwinm1jRrD/4
D9vzvL5RynlLbL8jMPlu3khtUP2pLCIJibfwmO8Y4mhtb6jOOcFhc22Ifow99SLA
eOJ27expV5kBiL4WLeR3Wxb8m6xVanZMRloCdg1vmoTdcA+U/R85kosrQ0fSL7sX
no424ZQKcGM94nSlrKhmOjL7BIDFOoz4cb2gUQIDAQABAoIBAEtBbNx0awsCFJxm
/Rhws6gDinDWa5klMJyI5w7DyqR9bXvf4mhKV36OiMdFVOJXstMKtaKR3a/jgD/B
ZmHdN85a6Vhk+Q24eCSIiTCMFYgi66n7ZOJ2DLXWDspuwr8FPgvsgwNrCr5AEbtr
VaQtVPCoDQtk6HERXb2m86QLrhCKqvSfkXP45vWTPiuhQOxnvC50bDdmQ9f6Bnfc
6GIDLSqGu8m96J5Fa4pExi6wCBMNbC4CJBIvePe63/1yf1UlHcpw2tp71Josjre9
a0Stp/AcApfM7QNpWqo4XtYBg0NoLPBLXp6iN73HT7R+6dO4mNFz2LxZs2TiwR13
gaH7jZ0CgYEA/uWXKENQ+Gu7a6tnFgGKbhSQukLuKnVCIHlvxq+NFCTbfyCE+j1/
/1n2HOx3gZgkAVjb7Juh2cNyGgHEfuAsXzc7n+7M9tWJS+dFobW+2zOYT0LZQ3To
fO+KkzyjZp2rsKVG1bjncHvGteCn9FVin5e3S2lIIv1nMA8P3IK0hzcCgYEAwniP
hgWWOBFbyhTTRBYKRx1jTTKR2+CCB55j52HQvDbExS5y1DFil1ZjczUyZ1Clxkel
onEkZnVOcQ/fauRtJBuhHqDglyV+69cH/cDKJNwfU7QCrA8MnqpmiW0Ws7PMB2eI
+5Kax+KcRz8MgNp3VsIB23cCi7zIsfFRwnpiyLcCgYEA2WFNxv2ZyTJ/VPSJZqAL
wZNyVhcMbFMyAOXvqH9AjqRcOQ+vuIXIain7lXEo/fGVCz0P2bpkjoV7AqjtPdLK
pK1DAUMzs3Iqs/vZS27nBKP8yePVzAyg1SOoh6bYJ4RhzrU1MsGNR0WYErY0JlDH
WhmWLQ7tgUGgxcCio0ko0oMCgYEAvb2xkzsKOUQ+ganWA51Zg89Znu2TQKwJUnOo
2A4pAiqRzC2onf5dSnlgaPzjY8hdJlurcRXhRRJ5aXjqLLoY/tzyVBb1aqOL9YGF
a4FZ+aLH8fw1izjXwEtT5gwI1Wa80BClp4d/+zzP6/fJNENzorOB6mZwlnSogG2K
ryyIC3sCgYBbrgodm0wWlsDfYBDB9aDMMWWuPN6vYnTsXk7SxhaJkOrE7W2IdXXq
NIXU3rX1vTNFEZGZekxzjKwGqqJoaJ0eV+0/oaGIFSbmBNjtCJ9jQ6+O0gmgTzW9
Sc+x63I5wGTcOnthnEO04IP22RxHaXfgaX5HBFuq/yt/ggEyq8YO/w==
-----END RSA PRIVATE KEY-----
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/certificate-pub.pem 0000644 0001751 0001751 00000002005 15137634221 021445 0 ustar 00runner runner -----BEGIN CERTIFICATE-----
MIICyjCCAbICFENAIzlJ+oZV+VS7g7+5ptGk+A4jMA0GCSqGSIb3DQEBDQUAMCIx
CzAJBgNVBAYTAlhYMRMwEQYDVQQDDApkYXRhbGFkLWNpMB4XDTIxMTEwODEzMzkx
NFoXDTMxMTEwNjEzMzkxNFowITELMAkGA1UEBhMCWFgxEjAQBgNVBAMMCWxvY2Fs
aG9zdDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMGiBxMPD5UxakpZ
xG8jVDi3wB80sy21TRiCNkpNi3nDp9lMdUXPK/sJB2KgyrK5+AHTFJYm0XP3W5qq
ddiyrVRDA8MTgLSa9aC044RuGHcqPajkQkXCyIJdVitLLj2mijVGJyLyV8FP7r95
u+l56l7yx8f6K0H7BePOCosIp5tY0aw/+A/b87y+Ucp5S2y/IzD5bt5IbVD9qSwi
CYm38JjvGOJobW+ozjnBYXNtiH6MPfUiwHjidu3saVeZAYi+Fi3kd1sW/JusVWp2
TEZaAnYNb5qE3XAPlP0fOZKLK0NH0i+7F56ONuGUCnBjPeJ0payoZjoy+wSAxTqM
+HG9oFECAwEAATANBgkqhkiG9w0BAQ0FAAOCAQEAbPfz/+5DMgDXvKkFrA4prruw
rczd9+91b4udcaYzCYN+VTLK34GPFL462LuCebkY55qoYBn6eNP0aIPfRAKWAApm
PcomgSi92gGr1lFNkvrtI4IF4I9NNZVYq5DPxiJv5mVskfwiNJyD0RXkw+wiH3Fn
4rxTVNcN2BK3ax6pDqXXEHjs7mkLCCvERGfnOHCQUSTKwStXI4JU8UV40Xl9lnl+
R5zXgmRd6jhNQ6YFJdf24i98g50wb4jZF4bqwpPOI5slrQ2UY9EOkIyR+00hmesG
k2X8BvEK4GDIVLVIpNO8zFuQiRYsL8FBaY83LIx6EZ5RX1rN45K8SrJekHhwgQ==
-----END CERTIFICATE-----
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/ca/certificate.csr 0000644 0001751 0001751 00000001613 15137634221 020673 0 ustar 00runner runner -----BEGIN CERTIFICATE REQUEST-----
MIICZjCCAU4CAQAwITELMAkGA1UEBhMCWFgxEjAQBgNVBAMMCWxvY2FsaG9zdDCC
ASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMGiBxMPD5UxakpZxG8jVDi3
wB80sy21TRiCNkpNi3nDp9lMdUXPK/sJB2KgyrK5+AHTFJYm0XP3W5qqddiyrVRD
A8MTgLSa9aC044RuGHcqPajkQkXCyIJdVitLLj2mijVGJyLyV8FP7r95u+l56l7y
x8f6K0H7BePOCosIp5tY0aw/+A/b87y+Ucp5S2y/IzD5bt5IbVD9qSwiCYm38Jjv
GOJobW+ozjnBYXNtiH6MPfUiwHjidu3saVeZAYi+Fi3kd1sW/JusVWp2TEZaAnYN
b5qE3XAPlP0fOZKLK0NH0i+7F56ONuGUCnBjPeJ0payoZjoy+wSAxTqM+HG9oFEC
AwEAAaAAMA0GCSqGSIb3DQEBDQUAA4IBAQBydmnJP0Pv+ySADKmrwm8czDRJL8OE
pB5dy77pUqJ46PMKNbU9DuNnuSmu48dn0K6T0ozZoeax7kq9tlQNg7E3kPMlAdkK
vmy7WrlGBT8/18umG1Rty5jh+sbqZGCfGxOizJH1NO5PmWGFhhcL0OvRdhxsS+tt
yBrpfmc0PvagRQfEbwYQ20asuJET+KopXtnDjyyWEGRSuD/iTGJF1F3j6qtYGP6G
ASHb7KrewxuL4r3md0yvR5eIi8+BDnTZKAEWSzeyHU9hDw9lor8gSpXUqcop+Nn5
8eAiWPXi+FBjZvqVWGABJftWva1SemEVz4gjhm0S2/DMKRolNPVYUts+
-----END CERTIFICATE REQUEST-----
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/heavyoutput.py 0000644 0001751 0001751 00000000374 15137634221 020267 0 ustar 00runner runner """
Helper to provide heavy load on stdout and stderr
"""
import sys
if __name__ == "__main__":
x = str(list(range(1000))) + '\n'
for i in range(100):
s = "%d " % i + x
sys.stdout.writelines(s)
sys.stderr.writelines(s)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test__main__.py 0000644 0001751 0001751 00000002112 15137634221 020302 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import sys
from io import StringIO
from unittest.mock import patch
from datalad.tests.utils_pytest import (
assert_equal,
assert_raises,
)
from .. import (
__main__,
__version__,
)
@patch('sys.stdout', new_callable=StringIO)
def test_main_help(stdout=None):
assert_raises(SystemExit, __main__.main, ['__main__.py', '--help'])
assert(
stdout.getvalue().startswith(
"Usage: %s -m datalad [OPTIONS] [ARGS]\n" % sys.executable
))
@patch('sys.stdout', new_callable=StringIO)
def test_main_version(stdout=None):
assert_raises(SystemExit, __main__.main, ['__main__.py', '--version'])
assert_equal(stdout.getvalue().rstrip(), "datalad %s" % __version__)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_api.py 0000644 0001751 0001751 00000006445 15137634221 017507 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the DataLad package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
'''Unit tests for Python API functionality.'''
import re
from datalad.tests.utils_pytest import (
SkipTest,
assert_false,
assert_in,
assert_true,
eq_,
)
from datalad.utils import get_sig_param_names
def test_basic_setup():
# the import alone will verify that all default values match their
# constraints
from datalad import api
# random pick of something that should be there
assert_true(hasattr(api, 'install'))
assert_true(hasattr(api, 'create'))
# make sure all helper utilities do not pollute the namespace
# and we end up only with __...__ attributes
assert_false(list(filter(lambda s: s.startswith('_') and not re.match('__.*__', s), dir(api))))
assert_in('Parameters', api.Dataset.install.__doc__)
assert_in('Parameters', api.Dataset.create.__doc__)
def _test_consistent_order_of_args(intf, spec_posargs):
f = getattr(intf, '__call__')
args, kw_only = get_sig_param_names(f, ('pos_any', 'kw_only'))
# now verify that those spec_posargs are first among args
# TODO*: The last odd one left from "plugins" era. Decided to leave alone
if intf.__name__ in ('ExtractMetadata',):
return
# if we had used * to instruct to have keyword only args, then all
# args should actually be matched entirely
if kw_only:
# "special cases/exclusions"
if intf.__name__ == 'CreateSiblingRia':
# -s|--name is a mandatory option (for uniformity), so allowed to be used as posarg #2
eq_(set(args), spec_posargs.union({'name'}))
else:
eq_(set(args), spec_posargs)
else:
# and if no kw_only -- only those which are known to be positional
eq_(set(args[:len(spec_posargs)]), spec_posargs)
if spec_posargs:
# and really -- we should not even get here if there are some spec_posargs --
# new interfaces should use * to separate pos args from kwargs per our now
# accepted design doc:
# http://docs.datalad.org/en/latest/design/pos_vs_kw_parameters.html
assert False
# TODO?: make parametric again instead of invoking
def test_consistent_order_of_args():
from importlib import import_module
from datalad.interface.base import get_interface_groups
for grp_name, grp_descr, interfaces in get_interface_groups():
for intfspec in interfaces:
# turn the interface spec into an instance
mod = import_module(intfspec[0], package='datalad')
intf = getattr(mod, intfspec[1])
spec = getattr(intf, '_params_', dict())
# figure out which of the specs are "positional"
spec_posargs = {
name
for name, param in spec.items()
if param.cmd_args and not param.cmd_args[0].startswith('-')
}
# we have information about positional args
_test_consistent_order_of_args(intf, spec_posargs)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_archives.py 0000644 0001751 0001751 00000022647 15137634221 020544 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import itertools
import os
from unittest.mock import patch
import pytest
from datalad import cfg as dl_cfg
from datalad.support import path as op
from datalad.support.archive_utils_patool import unixify_path
from datalad.support.archives import (
ArchivesCache,
ExtractedArchive,
compress_files,
decompress_file,
)
from datalad.support.exceptions import MissingExternalDependency
from datalad.support.external_versions import external_versions
from datalad.tests.utils_pytest import (
OBSCURE_FILENAME,
SkipTest,
assert_false,
assert_raises,
assert_true,
eq_,
ok_file_has_content,
ok_generator,
on_github,
on_nfs,
on_travis,
on_windows,
skip_if,
swallow_outputs,
with_tempfile,
with_tree,
)
fn_in_archive_obscure = OBSCURE_FILENAME
fn_archive_obscure = fn_in_archive_obscure.replace('a', 'b')
# Debian sid version of python (3.7.5rc1) introduced a bug in mimetypes
# Reported to cPython: https://bugs.python.org/issue38449
import mimetypes
mimedb = mimetypes.MimeTypes(strict=False)
if None in mimedb.guess_type(fn_archive_obscure + '.tar.gz'):
from . import lgr
lgr.warning("Buggy Python mimetypes, replacing ; in archives test filename")
# fails to detect due to ;
fn_archive_obscure = fn_archive_obscure.replace(';', '-')
# verify
assert None not in mimedb.guess_type(fn_archive_obscure + '.tar.gz')
fn_archive_obscure_ext = fn_archive_obscure + '.tar.gz'
tree_simplearchive = dict(
tree=(
(fn_archive_obscure_ext, (
(fn_in_archive_obscure, '2 load'),
('3.txt', '3 load'))),),
prefix='datalad-')
if on_windows:
def test_unixify_path():
from ..tests.utils_pytest import eq_
eq_(unixify_path(r"a"), "a")
eq_(unixify_path(r"c:\buga"), "/c/buga")
eq_(unixify_path(r"c:\buga\duga.dat"), "/c/buga/duga.dat")
eq_(unixify_path(r"buga\duga.dat"), "buga/duga.dat")
@with_tree(**tree_simplearchive)
def check_decompress_file(leading_directories, path=None):
outdir = op.join(path, 'simple-extracted')
with swallow_outputs() as cmo:
decompress_file(op.join(path, fn_archive_obscure_ext), outdir,
leading_directories=leading_directories)
eq_(cmo.out, "")
eq_(cmo.err, "")
path_archive_obscure = op.join(outdir, fn_archive_obscure)
if leading_directories == 'strip':
assert not op.exists(path_archive_obscure)
testpath = outdir
elif leading_directories is None:
assert op.exists(path_archive_obscure)
testpath = path_archive_obscure
else:
raise NotImplementedError("Dunno about this strategy: %s"
% leading_directories)
assert op.exists(op.join(testpath, '3.txt'))
assert op.exists(op.join(testpath, fn_in_archive_obscure))
with open(op.join(testpath, '3.txt')) as f:
eq_(f.read(), '3 load')
@pytest.mark.xfail((on_travis or on_github) and on_nfs, reason="https://github.com/datalad/datalad/issues/4496")
@pytest.mark.parametrize("leading", [None, 'strip'])
def test_decompress_file(leading):
return check_decompress_file(leading)
def test_decompress_file_unknown():
with pytest.raises(NotImplementedError):
check_decompress_file("unknown")
@with_tree((('empty', ''),
('d1', (
('d2', (
('f1', 'f1 load'),
),),
))))
@with_tempfile()
def check_compress_dir(ext, path=None, name=None):
archive = name + ext
compress_files([os.path.basename(path)], archive,
path=os.path.dirname(path))
assert_true(op.exists(archive))
name_extracted = name + "_extracted"
decompress_file(archive, name_extracted, leading_directories='strip')
assert_true(op.exists(op.join(name_extracted, 'empty')))
assert_true(op.exists(op.join(name_extracted, 'd1', 'd2', 'f1')))
@pytest.mark.parametrize(
"ext",
['.tar.xz',
'.tar.gz',
'.tgz',
'.tbz2',
'.tar',
'.zip',
'.7z',
]
)
def test_compress_dir(ext):
return check_compress_dir(ext)
# space in the filename to test for correct quotations etc
_filename = 'fi le.dat'
@skip_if("cmd:7z" not in external_versions,
msg="Known to fail if p7zip is not installed")
@with_tree(((_filename, 'content'),))
@with_tempfile()
def check_compress_file(ext, annex, path=None, name=None):
# we base the archive name on the filename, in order to also
# be able to properly test compressors where the corresponding
# archive format has no capability of storing a filename
# (i.e. where the archive name itself determines the filename
# of the decompressed file, like .xz)
archive = op.join(name, _filename + ext)
compress_files([_filename], archive,
path=path)
assert_true(op.exists(archive))
if annex:
# It should work even when file is annexed and is a symlink to the
# key
from datalad.support.annexrepo import AnnexRepo
repo = AnnexRepo(path, init=True)
repo.add(_filename)
repo.commit(files=[_filename], msg="commit")
dir_extracted = name + "_extracted"
try:
decompress_file(archive, dir_extracted)
except MissingExternalDependency as exc:
raise SkipTest() from exc
_filepath = op.join(dir_extracted, _filename)
ok_file_has_content(_filepath, 'content')
@pytest.mark.parametrize(
"ext,annex",
list(
itertools.product(
['.xz', '.gz', '.zip', '.7z'],
[True, False])
)
)
def test_compress_file(ext, annex):
check_compress_file(ext, annex)
@pytest.mark.xfail(on_github and on_nfs, reason="unknown. TODO: figure out")
@with_tree(**tree_simplearchive)
def test_ExtractedArchive(path=None):
archive = op.join(path, fn_archive_obscure_ext)
earchive = ExtractedArchive(archive)
assert_false(op.exists(earchive.path))
# no longer the case -- just using hash for now
# assert_in(os.path.basename(archive), earchive.path)
fpath = op.join(fn_archive_obscure, # lead directory
fn_in_archive_obscure)
extracted = earchive.get_extracted_filename(fpath)
eq_(extracted, op.join(earchive.path, fpath))
assert_false(op.exists(extracted)) # not yet
extracted_ = earchive.get_extracted_file(fpath)
eq_(extracted, extracted_)
assert_true(op.exists(extracted)) # now it should
extracted_files = earchive.get_extracted_files()
ok_generator(extracted_files)
try:
eq_(sorted(extracted_files),
sorted([
# ['bbc/3.txt', 'bbc/abc']
op.join(fn_archive_obscure, fn_in_archive_obscure),
op.join(fn_archive_obscure, '3.txt')
]))
except AssertionError:
if 'nfsmount' in fpath:
pytest.xfail("Archive was created before NFS startede to behave. "
"https://github.com/datalad/datalad/issues/4101")
raise
earchive.clean()
if not dl_cfg.get('datalad.tests.temp.keep'):
assert_false(op.exists(earchive.path))
def test_ArchivesCache():
# we don't actually need to test archives handling itself
path1 = "/zuba/duba"
path2 = "/zuba/duba2"
# should not be able to create a persistent cache without topdir
assert_raises(ValueError, ArchivesCache, persistent=True)
cache = ArchivesCache() # by default -- non persistent
archive1_path = op.join(path1, fn_archive_obscure_ext)
archive2_path = op.join(path2, fn_archive_obscure_ext)
cached_archive1_path = cache[archive1_path].path
assert_false(cache[archive1_path].path == cache[archive2_path].path)
assert_true(cache[archive1_path] is cache[archive1_path])
cache.clean()
assert_false(op.exists(cached_archive1_path))
assert_false(op.exists(cache.path))
# test del
cache = ArchivesCache() # by default -- non persistent
assert_true(op.exists(cache.path))
cache_path = cache.path
del cache
assert_false(op.exists(cache_path))
@pytest.mark.parametrize(
"return_value,target_value,kwargs",
[
([], None, {}),
(['file.txt'], None, {}),
(['file.txt', op.join('d', 'f')], None, {}),
([op.join('d', 'f'), op.join('d', 'f2')], 'd', {}),
([op.join('d', 'f'), op.join('d', 'f2')], 'd', {'consider': 'd'}),
([op.join('d', 'f'), op.join('d', 'f2')], None, {'consider': 'dd'}),
([op.join('d', 'f'), op.join('d2', 'f2')], None, {}),
([op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], op.join('d', 'd2'), {}),
([op.join('d', 'd2', 'f'), op.join('d', 'd2', 'f2')], 'd', {'depth': 1}),
# with some parasitic files
([op.join('d', 'f'), op.join('._d')], 'd', {'exclude': [r'\._.*']}),
([op.join('d', 'd1', 'f'), op.join('d', '._d'), '._x'], op.join('d', 'd1'), {'exclude': [r'\._.*']}),
]
)
def test_get_leading_directory(return_value, target_value, kwargs):
ea = ExtractedArchive('/some/bogus', '/some/bogus')
with patch.object(ExtractedArchive, 'get_extracted_files', return_value=return_value):
eq_(ea.get_leading_directory(**kwargs), target_value)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_base.py 0000644 0001751 0001751 00000006371 15137634221 017646 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import os
import os.path as op
import sys
from unittest.mock import patch
from datalad.cmd import (
StdOutErrCapture,
WitlessRunner,
)
from datalad.tests.utils_pytest import (
SkipTest,
assert_in,
assert_raises,
chpwd,
get_dataset_root,
ok_file_has_content,
swallow_logs,
with_tree,
)
from datalad.utils import get_home_envvars
# verify that any target platform can deal with forward slashes
# as os.path.sep, regardless of its native preferences
@with_tree(tree={'subdir': {'testfile': 'testcontent'}})
def test_paths_with_forward_slashes(path=None):
# access file with native absolute path spec
print(path)
ok_file_has_content(op.join(path, 'subdir', 'testfile'), 'testcontent')
with chpwd(path):
# native relative path spec
ok_file_has_content(op.join('subdir', 'testfile'), 'testcontent')
# posix relative path spec
ok_file_has_content('subdir/testfile', 'testcontent')
# abspath with forward slash path sep char
ok_file_has_content(
op.join(path, 'subdir', 'testfile').replace(op.sep, '/'),
'testcontent')
#@with_tempfile(mkdir=True)
# with_tempfile dereferences tempdir, so does not trigger the failure
# on Yarik's laptop where TMPDIR=~/.tmp and ~/.tmp -> /tmp.
# with_tree in turn just passes that ~/.tmp/ directory
@with_tree(tree={})
def test_not_under_git(path=None):
from datalad.distribution.dataset import require_dataset
dsroot = get_dataset_root(path)
assert dsroot is None, "There must be no dataset above tmp %s. Got: %s" % (path, dsroot)
with chpwd(path):
# And require_dataset must puke also
assert_raises(
Exception,
require_dataset,
None, check_installed=True, purpose='test'
)
def test_no_empty_http_proxy():
# in __init__ we might prune http_proxy if it is empty, so it must not be
# empty if present
assert os.environ.get('http_proxy', 'default')
assert os.environ.get('https_proxy', 'default')
@with_tree(tree={})
def test_git_config_warning(path=None):
if 'GIT_AUTHOR_NAME' in os.environ:
raise SkipTest("Found existing explicit identity config")
# Note: An easier way to test this, would be to just set GIT_CONFIG_GLOBAL
# to point somewhere else. However, this is not supported by git before
# 2.32. Hence, stick with changed HOME in this test, but be sure to unset a
# possible GIT_CONFIG_GLOBAL in addition.
patched_env = os.environ.copy()
patched_env.pop('GIT_CONFIG_GLOBAL', None)
patched_env.update(get_home_envvars(path))
with chpwd(path), \
patch.dict('os.environ', patched_env, clear=True), \
swallow_logs(new_level=30) as cml:
out = WitlessRunner().run(
[sys.executable, '-c', 'import datalad'],
protocol=StdOutErrCapture)
assert_in("configure Git before", out['stderr'])
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_cmd.py 0000644 0001751 0001751 00000016106 15137634221 017474 0 ustar 00runner runner # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil; coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test WitlessRunner
"""
import sys
import unittest.mock
from subprocess import TimeoutExpired
import pytest
from datalad.cmd import (
BatchedCommand,
BatchedCommandError,
readline_rstripped,
)
from datalad.runner.tests.utils import py2cmd
from datalad.tests.utils_pytest import (
assert_equal,
assert_is_none,
assert_is_not_none,
assert_not_equal,
assert_true,
)
def test_readline_rstripped_deprecation():
with unittest.mock.patch("datalad.cmd.warnings.warn") as warn_mock:
class StdoutMock:
def readline(self):
return "abc\n"
readline_rstripped(StdoutMock())
warn_mock.assert_called_once()
def test_batched_command():
bc = BatchedCommand(cmd=[sys.executable, "-i", "-u", "-q", "-"])
response = bc("print('a')")
assert_equal(response, "a")
response = bc("print(2 + 1)")
assert_equal(response, "3")
stderr = bc.close(return_stderr=True)
assert_is_not_none(stderr)
def test_batched_close_abandon():
# Expect a timeout if the process runs longer than timeout and the config
# for "datalad.runtime.stalled-external" is "abandon".
bc = BatchedCommand(
cmd=[sys.executable, "-i", "-u", "-q", "-"],
timeout=.1)
# Send at least one instruction to start the subprocess
response = bc("import time; print('a')")
assert_equal(response, "a")
bc.stdin_queue.put("time.sleep(2); exit(1)\n".encode())
with unittest.mock.patch("datalad.cmd._cfg_val", "abandon"):
bc.close(return_stderr=False)
assert_true(bc.wait_timed_out is True)
assert_is_none(bc.return_code)
@pytest.mark.filterwarnings("ignore:Exception ignored")
def test_batched_close_timeout_exception():
while True:
try:
# Expect a timeout at BatchedCommand.close() if the process runs
# longer than timeout and the config for
# "datalad.runtime.stalled-external" is "abandon".
# In most cases the next commands until `bc.close()` will execute
# faster than `timeout`. If not we just restart the process
bc = BatchedCommand(
cmd=[sys.executable, "-i", "-u", "-q", "-"],
timeout=.5,
exception_on_timeout=True)
# Send at least one instruction to start the subprocess
response = bc("import time; print('a')")
assert_equal(response, "a")
# Send process to sleep for two seconds to trigger a timeout in
# bc.close().
bc.stdin_queue.put("time.sleep(2); exit(1)\n".encode())
with unittest.mock.patch("datalad.cfg") as cfg_mock:
cfg_mock.obtain.return_value = "abandon"
try:
bc.close()
pytest.fail("bc.close() did not generate a timeout")
except TimeoutExpired:
return
except TimeoutExpired:
# Timeout occurred early, try again
continue
def test_batched_close_wait():
# Expect a long wait and no timeout if the process runs longer than timeout
# and the config for "datalad.runtime.stalled-external" has its default
# value.
bc = BatchedCommand(
cmd=[sys.executable, "-i", "-u", "-q", "-"],
timeout=.5)
# Send at least one instruction to start the subprocess
response = bc("import time; print('a')")
assert_equal(response, "a")
bc.stdin_queue.put("time.sleep(2); exit(2)\n".encode())
bc.close(return_stderr=False)
assert_true(bc.wait_timed_out is False)
assert_equal(bc.return_code, 2)
def test_batched_close_ok():
# Expect a long wait and no timeout if the process runs longer than timeout
# seconds and the config for "datalad.runtime.stalled-external" has its
# default value.
bc = BatchedCommand(
cmd=[sys.executable, "-i", "-u", "-q", "-"],
timeout=2)
# Send at least one instruction to start the subprocess
response = bc("import time; print('a')")
assert_equal(response, "a")
bc.stdin_queue.put("time.sleep(.5); exit(3)\n".encode())
bc.close(return_stderr=False)
assert_true(bc.wait_timed_out is False)
assert_equal(bc.return_code, 3)
def test_tuple_requests():
bc = BatchedCommand(
cmd=py2cmd(
"""
import time
import sys
print(f"{time.time()}:{sys.stdin.readline().strip()}")
"""))
start_time_1, line = bc(("one", "line")).split(":")
assert_equal(line, "one line")
start_time_2, line = bc(("end", "now")).split(":")
assert_not_equal(start_time_1, start_time_2)
assert_equal(line, "end now")
bc.close(return_stderr=False)
def test_batched_restart():
# Expect that the process is restarted after exit.
bc = BatchedCommand(
cmd=py2cmd(
"import os\n"
"import sys\n"
"print(os.getpid(), sys.stdin.readline().strip())\n"))
# Send four lines
lines = [f"line-{i}" for i in range(4)]
responses = [bc(lines[i]).split() for i in range(4)]
pid_set = set([int(r[0]) for r in responses])
assert_equal(len(pid_set), 4)
response_lines = [r[1] for r in responses]
assert_equal(lines, response_lines)
bc.close(return_stderr=False)
def test_command_fail_1():
# Expect that a failing command raises a CommandError in which the return
# code and the last successful request is caught, and that the command is
# restarted when called again
bc = BatchedCommand(
cmd=py2cmd(
"""
print("something")
exit(3)
"""))
# Send something to start the process
first_request = "line one"
result = bc(first_request)
assert bc.return_code is None
assert result == "something"
with pytest.raises(BatchedCommandError) as exception_info:
bc("line two")
assert exception_info.value.code == 3
assert exception_info.value.last_processed_request == first_request
assert bc.return_code == 3
# Check for restart
result = bc(first_request)
assert result == "something"
bc.close(return_stderr=False)
def test_command_fail_2():
# Expect that a failing command raises a BatchedCommandError in which the
# return code and the last successful request is caught. In this case the
# last successful request should be None.
bc = BatchedCommand(
cmd=py2cmd(
"""
print(a*b)
"""))
# Send something to start the process
first_request = "line one"
with pytest.raises(BatchedCommandError) as exception_info:
_ = bc(first_request)
assert exception_info.value.code == 1
assert exception_info.value.last_processed_request is None
assert bc.return_code == 1
assert bc.last_request is None
bc.close(return_stderr=False)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_config.py 0000644 0001751 0001751 00000075072 15137634221 020205 0 ustar 00runner runner # ex: set sts=4 ts=4 sw=4 et:
# -*- coding: utf-8 -*-
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""
"""
import logging
import os
from os.path import exists
from os.path import join as opj
from unittest.mock import patch
import pytest
from datalad import cfg as dl_cfg
from datalad.api import create
from datalad.cmd import CommandError
from datalad.config import (
ConfigManager,
_where_to_scope,
parse_gitconfig_dump,
rewrite_url,
write_config_section,
)
from datalad.distribution.dataset import Dataset
from datalad.support.annexrepo import AnnexRepo
from datalad.support.gitrepo import GitRepo
from datalad.tests.utils_pytest import (
DEFAULT_BRANCH,
DEFAULT_REMOTE,
assert_equal,
assert_false,
assert_in,
assert_not_equal,
assert_not_in,
assert_raises,
assert_true,
chpwd,
ok_file_has_content,
with_tempfile,
with_testsui,
with_tree,
)
from datalad.utils import (
Path,
get_home_envvars,
swallow_logs,
)
# XXX tabs are intentional (part of the format)!
# XXX put back! confuses pep8
_config_file_content = """\
[something]
user = name=Jane Doe
user = email=jd@example.com
novalue
empty =
myint = 3
[onemore "complicated γ beast with.dot"]
findme = 5.0
"""
gitcfg_dump = """\
core.withdot
true\0just.a.key\0annex.version
8\0filter.with2dots.some
long\ntext with\nnewlines\0annex.something
abcdef\0"""
# include a "command line" origin
gitcfg_dump_w_origin = """\
file:.git/config\0core.withdot
true\0file:.git/config\0just.a.key\0file:/home/me/.gitconfig\0annex.version
8\0file:.git/config\0filter.with2dots.some
long\ntext with\nnewlines\0file:.git/config\0command line:\0annex.something
abcdef\0"""
gitcfg_parsetarget = {
'core.withdot': 'true',
'just.a.key': None,
'annex.version': '8',
'filter.with2dots.some': 'long\ntext with\nnewlines',
'annex.something': 'abcdef',
}
_dataset_config_template = {
'ds': {
'.datalad': {
'config': _config_file_content}}}
def test_parse_gitconfig_dump():
# simple case, no origin info, clean output
parsed, files = parse_gitconfig_dump(gitcfg_dump)
assert_equal(files, set())
assert_equal(gitcfg_parsetarget, parsed)
# now with origin information in the dump
parsed, files = parse_gitconfig_dump(gitcfg_dump_w_origin, cwd='ROOT')
assert_equal(
files,
# the 'command line:' origin is ignored
set((Path('ROOT/.git/config'), Path('/home/me/.gitconfig'))))
assert_equal(gitcfg_parsetarget, parsed)
# now contaminate the output with a prepended error message
# https://github.com/datalad/datalad/issues/5502
# must work, but really needs the trailing newline
parsed, files = parse_gitconfig_dump(
"unfortunate stdout\non more lines\n" + gitcfg_dump_w_origin)
assert_equal(gitcfg_parsetarget, parsed)
@pytest.mark.filterwarnings("ignore: 'where=\"dataset\"' is deprecated")
@pytest.mark.filterwarnings("ignore: 'source=\"dataset\"' is deprecated")
@with_tree(tree=_dataset_config_template)
@with_tempfile(mkdir=True)
def test_something(path=None, new_home=None):
# will refuse to work on dataset without a dataset
assert_raises(ValueError, ConfigManager, source='branch')
# now read the example config
cfg = ConfigManager(GitRepo(opj(path, 'ds'), create=True), source='branch')
assert_equal(len(cfg), 5)
assert_in('something.user', cfg)
# multi-value
assert_equal(len(cfg['something.user']), 2)
assert_equal(cfg['something.user'], ('name=Jane Doe', 'email=jd@example.com'))
assert_true(cfg.has_section('something'))
assert_false(cfg.has_section('somethingelse'))
assert_equal(sorted(cfg.sections()),
[u'onemore.complicated γ beast with.dot', 'something'])
assert_true(cfg.has_option('something', 'user'))
assert_false(cfg.has_option('something', 'us?er'))
assert_false(cfg.has_option('some?thing', 'user'))
assert_equal(sorted(cfg.options('something')), ['empty', 'myint', 'novalue', 'user'])
assert_equal(cfg.options(u'onemore.complicated γ beast with.dot'), ['findme'])
assert_equal(
sorted(cfg.items()),
[(u'onemore.complicated γ beast with.dot.findme', '5.0'),
('something.empty', ''),
('something.myint', '3'),
('something.novalue', None),
('something.user', ('name=Jane Doe', 'email=jd@example.com'))])
assert_equal(
sorted(cfg.items('something')),
[('something.empty', ''),
('something.myint', '3'),
('something.novalue', None),
('something.user', ('name=Jane Doe', 'email=jd@example.com'))])
# by default get last value only
assert_equal(
cfg.get('something.user'), 'email=jd@example.com')
# but can get all values
assert_equal(
cfg.get('something.user', get_all=True),
('name=Jane Doe', 'email=jd@example.com'))
assert_raises(KeyError, cfg.__getitem__, 'somedthing.user')
assert_equal(cfg.getfloat(u'onemore.complicated γ beast with.dot', 'findme'), 5.0)
assert_equal(cfg.getint('something', 'myint'), 3)
assert_equal(cfg.getbool('something', 'myint'), True)
# git demands a key without value at all to be used as a flag, thus True
assert_equal(cfg.getbool('something', 'novalue'), True)
assert_equal(cfg.get('something.novalue'), None)
# empty value is False
assert_equal(cfg.getbool('something', 'empty'), False)
assert_equal(cfg.get('something.empty'), '')
assert_equal(cfg.getbool('doesnot', 'exist', default=True), True)
assert_raises(TypeError, cfg.getbool, 'something', 'user')
# gitpython-style access
assert_equal(cfg.get('something.myint'), cfg.get_value('something', 'myint'))
assert_equal(cfg.get_value('doesnot', 'exist', default='oohaaa'), 'oohaaa')
# weird, but that is how it is
assert_raises(KeyError, cfg.get_value, 'doesnot', 'exist', default=None)
# modification follows
cfg.add('something.new', 'γ')
assert_equal(cfg.get('something.new'), u'γ')
# sections are added on demand
cfg.add('unheard.of', 'fame')
assert_true(cfg.has_section('unheard.of'))
comp = cfg.items('something')
cfg.rename_section('something', 'this')
assert_true(cfg.has_section('this'))
assert_false(cfg.has_section('something'))
# direct comparison would fail, because of section prefix
assert_equal(len(cfg.items('this')), len(comp))
# fail if no such section
with swallow_logs():
assert_raises(CommandError, cfg.rename_section, 'nothere', 'irrelevant')
assert_true(cfg.has_option('this', 'myint'))
cfg.unset('this.myint')
assert_false(cfg.has_option('this', 'myint'))
# batch a changes
cfg.add('mike.wants.to', 'know', reload=False)
assert_false('mike.wants.to' in cfg)
cfg.add('mike.wants.to', 'eat')
assert_true('mike.wants.to' in cfg)
assert_equal(len(cfg['mike.wants.to']), 2)
# set a new one:
cfg.set('mike.should.have', 'known')
assert_in('mike.should.have', cfg)
assert_equal(cfg['mike.should.have'], 'known')
# set an existing one:
cfg.set('mike.should.have', 'known better')
assert_equal(cfg['mike.should.have'], 'known better')
# set, while there are several matching ones already:
cfg.add('mike.should.have', 'a meal')
assert_equal(len(cfg['mike.should.have']), 2)
# raises with force=False
assert_raises(CommandError,
cfg.set, 'mike.should.have', 'a beer', force=False)
assert_equal(len(cfg['mike.should.have']), 2)
# replaces all matching ones with force=True
cfg.set('mike.should.have', 'a beer', force=True)
assert_equal(cfg['mike.should.have'], 'a beer')
# test deprecated 'where' interface and old 'dataset' (not 'branch') value
# TODO: remove along with the removal of deprecated 'where'
cfg.set('mike.should.have', 'wasknown', where='dataset')
assert_equal(cfg['mike.should.have'], 'wasknown')
assert_equal(cfg.get_from_source('dataset', 'mike.should.have'), 'wasknown')
# fails unknown location
assert_raises(ValueError, cfg.add, 'somesuch', 'shit', scope='umpalumpa')
# very carefully test non-local config
# so carefully that even in case of bad weather Yarik doesn't find some
# lame datalad unittest sections in his precious ~/.gitconfig
# Note: An easier way to test this, would be to just set GIT_CONFIG_GLOBAL
# to point somewhere else. However, this is not supported by git before
# 2.32. Hence, stick with changed HOME in this test, but be sure to unset a
# possible GIT_CONFIG_GLOBAL in addition.
patched_env = os.environ.copy()
patched_env.pop('GIT_CONFIG_GLOBAL', None)
patched_env.update(get_home_envvars(new_home))
with patch.dict('os.environ',
dict(patched_env, DATALAD_SNEAKY_ADDITION='ignore'),
clear=True):
global_gitconfig = opj(new_home, '.gitconfig')
assert(not exists(global_gitconfig))
globalcfg = ConfigManager()
assert_not_in('datalad.unittest.youcan', globalcfg)
assert_in('datalad.sneaky.addition', globalcfg)
cfg.add('datalad.unittest.youcan', 'removeme', scope='global')
assert(exists(global_gitconfig))
# it did not go into the dataset's config!
assert_not_in('datalad.unittest.youcan', cfg)
# does not monitor additions!
globalcfg.reload(force=True)
assert_in('datalad.unittest.youcan', globalcfg)
with swallow_logs():
assert_raises(
CommandError,
globalcfg.unset,
'datalad.unittest.youcan',
scope='local')
assert(globalcfg.has_section('datalad.unittest'))
globalcfg.unset('datalad.unittest.youcan', scope='global')
# but after we unset the only value -- that section is no longer listed
assert (not globalcfg.has_section('datalad.unittest'))
assert_not_in('datalad.unittest.youcan', globalcfg)
ok_file_has_content(global_gitconfig, "")
cfg = ConfigManager(
Dataset(opj(path, 'ds')),
source='branch',
overrides={'datalad.godgiven': True})
assert_equal(cfg.get('datalad.godgiven'), True)
# setter has no effect
cfg.set('datalad.godgiven', 'false')
assert_equal(cfg.get('datalad.godgiven'), True)
@with_tree(tree={
'.gitconfig': """\
[includeIf "gitdir:**/devbgc/**"]
path = ~/.gitconfig_bgc
[custom "datalad"]
variable = value
"""})
def test_includeif_breaking(new_home=None):
patched_env = os.environ.copy()
patched_env.pop('GIT_CONFIG_GLOBAL', None)
patched_env.update(get_home_envvars(new_home))
with patch.dict('os.environ', patched_env, clear=True):
cfg = ConfigManager()
# just want to make sure we read it and didn't crash
assert cfg.get('custom.datalad.variable') == "value"
@with_tree(tree={
'ds': {
'.datalad': {
'config': """\
[crazy]
fa = !git remote | xargs -r -I REMOTE /bin/bash -c 'echo I: Fetching from REMOTE && git fetch --prune REMOTE && git fetch -t REMOTE' && [ -d .git/svn ] && bash -c 'echo I: Fetching from SVN && git svn fetch' || : && [ -e .gitmodules ] && bash -c 'echo I: Fetching submodules && git submodule foreach git fa' && [ -d .git/sd ] && bash -c 'echo I: Fetching bugs into sd && git-sd pull --all' || :
pa = !git paremotes | tr ' ' '\\n' | xargs -r -l1 git push
pt = !git testremotes | tr ' ' '\\n' | xargs -r -l1 -I R git push -f R master
ptdry = !git testremotes | tr ' ' '\\n' | xargs -r -l1 -I R git push -f --dry-run R master
padry = !git paremotes | tr ' ' '\\n' | xargs -r -l1 git push --dry-run
"""}}})
def test_crazy_cfg(path=None):
cfg = ConfigManager(GitRepo(opj(path, 'ds'), create=True), source='branch')
assert_in('crazy.padry', cfg)
# make sure crazy config is not read when in local mode
cfg = ConfigManager(Dataset(opj(path, 'ds')), source='local')
assert_not_in('crazy.padry', cfg)
# it will make it in in 'any' mode though
cfg = ConfigManager(Dataset(opj(path, 'ds')), source='any')
assert_in('crazy.padry', cfg)
# typos in the source mode arg will not have silent side-effects
assert_raises(
ValueError, ConfigManager, Dataset(opj(path, 'ds')), source='locale')
@with_tempfile
def test_obtain(path=None):
ds = create(path)
cfg = ConfigManager(ds)
dummy = 'datalad.test.dummy'
# we know nothing and we don't know how to ask
assert_raises(RuntimeError, cfg.obtain, dummy)
# can report known ones
cfg.add(dummy, '5.3')
assert_equal(cfg.obtain(dummy), '5.3')
# better type
assert_equal(cfg.obtain(dummy, valtype=float), 5.3)
# don't hide type issues, float doesn't become an int magically
assert_raises(ValueError, cfg.obtain, dummy, valtype=int)
# inject some prior knowledge
from datalad.interface.common_cfg import definitions as cfg_defs
cfg_defs[dummy] = dict(type=float)
# no we don't need to specify a type anymore
assert_equal(cfg.obtain(dummy), 5.3)
# but if we remove the value from the config, all magic is gone
cfg.unset(dummy)
# we know nothing and we don't know how to ask
assert_raises(RuntimeError, cfg.obtain, dummy)
#
# test actual interaction
#
@with_testsui()
def ask():
# fail on unknown dialog type
assert_raises(ValueError, cfg.obtain, dummy, dialog_type='Rorschach_test')
ask()
# ask nicely, and get a value of proper type using the preconfiguration
@with_testsui(responses='5.3')
def ask():
assert_equal(
cfg.obtain(dummy, dialog_type='question', text='Tell me'), 5.3)
ask()
# preconfigure even more, to get the most compact call
cfg_defs[dummy]['ui'] = ('question', dict(text='tell me', title='Gretchen Frage'))
@with_testsui(responses='5.3')
def ask():
assert_equal(cfg.obtain(dummy), 5.3)
ask()
@with_testsui(responses='murks')
def ask():
assert_raises(ValueError, cfg.obtain, dummy)
ask()
# fail to store when destination is not specified, will not even ask
@with_testsui()
def ask():
assert_raises(ValueError, cfg.obtain, dummy, store=True)
ask()
# but we can preconfigure it
cfg_defs[dummy]['destination'] = 'broken'
@with_testsui(responses='5.3')
def ask():
assert_raises(ValueError, cfg.obtain, dummy, store=True)
ask()
# fixup destination
cfg_defs[dummy]['destination'] = 'branch'
@with_testsui(responses='5.3')
def ask():
assert_equal(cfg.obtain(dummy, store=True), 5.3)
ask()
# now it won't have to ask again
@with_testsui()
def ask():
assert_equal(cfg.obtain(dummy), 5.3)
ask()
# wipe it out again
cfg.unset(dummy)
assert_not_in(dummy, cfg)
# XXX cannot figure out how I can simulate a simple
## respond with accepting the default
#@with_testsui(responses=...)
#def ask():
# assert_equal(cfg.obtain(dummy, default=5.3), 5.3)
#ask()
def test_from_env():
cfg = ConfigManager()
assert_not_in('datalad.crazy.cfg', cfg)
with patch.dict('os.environ',
{'DATALAD_CRAZY_CFG': 'impossibletoguess'}):
cfg.reload()
assert_in('datalad.crazy.cfg', cfg)
assert_equal(cfg['datalad.crazy.cfg'], 'impossibletoguess')
# not in dataset-only mode
cfg = ConfigManager(Dataset('nowhere'), source='branch')
assert_not_in('datalad.crazy.cfg', cfg)
# check env trumps override
cfg = ConfigManager()
assert_not_in('datalad.crazy.override', cfg)
cfg.set('datalad.crazy.override', 'fromoverride', scope='override')
cfg.reload()
assert_equal(cfg['datalad.crazy.override'], 'fromoverride')
with patch.dict('os.environ',
{'DATALAD_CRAZY_OVERRIDE': 'fromenv'}):
cfg.reload()
assert_equal(cfg['datalad.crazy.override'], 'fromenv')
def test_from_env_overrides():
cfg = ConfigManager()
assert_not_in("datalad.FoO", cfg)
# Some details, like case and underscores, cannot be handled by the direct
# environment variable mapping.
with patch.dict("os.environ",
{"DATALAD_FOO": "val"}):
cfg.reload()
assert_not_in("datalad.FoO", cfg)
assert_equal(cfg["datalad.foo"], "val")
# But they can be handled via DATALAD_CONFIG_OVERRIDES_JSON.
with patch.dict("os.environ",
{"DATALAD_CONFIG_OVERRIDES_JSON": '{"datalad.FoO": "val"}'}):
cfg.reload()
assert_equal(cfg["datalad.FoO"], "val")
# DATALAD_CONFIG_OVERRIDES_JSON isn't limited to datalad variables.
with patch.dict("os.environ",
{"DATALAD_CONFIG_OVERRIDES_JSON": '{"a.b.c": "val"}'}):
cfg.reload()
assert_equal(cfg["a.b.c"], "val")
# Explicitly provided DATALAD_ variables take precedence over those in
# DATALAD_CONFIG_OVERRIDES_JSON.
with patch.dict("os.environ",
{"DATALAD_CONFIG_OVERRIDES_JSON": '{"datalad.foo": "val"}',
"DATALAD_FOO": "val-direct"}):
cfg.reload()
assert_equal(cfg["datalad.foo"], "val-direct")
# JSON decode errors don't lead to crash.
with patch.dict("os.environ",
{"DATALAD_CONFIG_OVERRIDES_JSON": '{'}):
with swallow_logs(logging.WARNING) as cml:
cfg.reload()
assert_in("Failed to load DATALAD_CONFIG_OVERRIDE", cml.out)
def test_overrides():
cfg = ConfigManager()
# any sensible (and also our CI) test environment(s) should have this
assert_in('user.name', cfg)
# set
cfg.set('user.name', 'myoverride', scope='override')
assert_equal(cfg['user.name'], 'myoverride')
# unset just removes override, not entire config
cfg.unset('user.name', scope='override')
assert_in('user.name', cfg)
assert_not_equal('user.name', 'myoverride')
# add
# there is no initial increment
cfg.add('user.name', 'myoverride', scope='override')
assert_equal(cfg['user.name'], 'myoverride')
# same as with add, not a list
assert_equal(cfg['user.name'], 'myoverride')
# but then there is
cfg.add('user.name', 'myother', scope='override')
assert_equal(cfg['user.name'], ['myoverride', 'myother'])
# rename
assert_not_in('ups.name', cfg)
cfg.rename_section('user', 'ups', scope='override')
# original variable still there
assert_in('user.name', cfg)
# rename of override in effect
assert_equal(cfg['ups.name'], ['myoverride', 'myother'])
# remove entirely by section
cfg.remove_section('ups', scope='override')
from datalad.utils import Path
assert_not_in(
'ups.name', cfg,
(cfg._stores,
cfg.overrides,
))
def test_rewrite_url():
test_cases = (
# no match
('unicorn', 'unicorn'),
# custom label replacement
('example:datalad/datalad.git', 'git@example.com:datalad/datalad.git'),
# protocol enforcement
('git://example.com/some', 'https://example.com/some'),
# multi-match
('mylabel', 'ria+ssh://fully.qualified.com'),
('myotherlabel', 'ria+ssh://fully.qualified.com'),
# conflicts, same label pointing to different URLs
('conflict', 'conflict'),
# also conflicts, but hidden in a multi-value definition
('conflict2', 'conflict2'),
)
cfg_in = {
# label rewrite
'git@example.com:': 'example:',
# protocol change
'https://example': 'git://example',
# multi-value
'ria+ssh://fully.qualified.com': ('mylabel', 'myotherlabel'),
# conflicting definitions
'http://host1': 'conflict',
'http://host2': 'conflict',
# hidden conflict
'http://host3': 'conflict2',
'http://host4': ('someokish', 'conflict2'),
}
cfg = {
'url.{}.insteadof'.format(k): v
for k, v in cfg_in.items()
}
for input, output in test_cases:
with swallow_logs(logging.WARNING) as msg:
assert_equal(rewrite_url(cfg, input), output)
if input.startswith('conflict'):
assert_in("Ignoring URL rewrite", msg.out)
# https://github.com/datalad/datalad/issues/4071
@with_tempfile()
@with_tempfile()
def test_no_leaks(path1=None, path2=None):
ds1 = Dataset(path1).create()
ds1.config.set('i.was.here', 'today', scope='local')
assert_in('i.was.here', ds1.config.keys())
ds1.config.reload()
assert_in('i.was.here', ds1.config.keys())
# now we move into this one repo, and create another
# make sure that no config from ds1 leaks into ds2
with chpwd(path1):
ds2 = Dataset(path2)
assert_not_in('i.was.here', ds2.config.keys())
ds2.config.reload()
assert_not_in('i.was.here', ds2.config.keys())
ds2.create()
assert_not_in('i.was.here', ds2.config.keys())
# and that we do not track the wrong files
assert_not_in(ds1.pathobj / '.git' / 'config',
ds2.config._stores['git']['files'])
assert_not_in(ds1.pathobj / '.datalad' / 'config',
ds2.config._stores['branch']['files'])
# these are the right ones
assert_in(ds2.pathobj / '.git' / 'config',
ds2.config._stores['git']['files'])
assert_in(ds2.pathobj / '.datalad' / 'config',
ds2.config._stores['branch']['files'])
@with_tempfile()
def test_no_local_write_if_no_dataset(path=None):
Dataset(path).create()
with chpwd(path):
cfg = ConfigManager()
with assert_raises(CommandError):
cfg.set('a.b.c', 'd', scope='local')
@with_tempfile
def test_dataset_local_mode(path=None):
ds = create(path)
# any sensible (and also our CI) test environment(s) should have this
assert_in('user.name', ds.config)
# from .datalad/config
assert_in('datalad.dataset.id', ds.config)
# from .git/config
assert_in('annex.version', ds.config)
# now check that dataset-local mode doesn't have the global piece
cfg = ConfigManager(ds, source='branch-local')
assert_not_in('user.name', cfg)
assert_in('datalad.dataset.id', cfg)
assert_in('annex.version', cfg)
# https://github.com/datalad/datalad/issues/4071
@with_tempfile
def test_dataset_systemglobal_mode(path=None):
ds = create(path)
# any sensible (and also our CI) test environment(s) should have this
assert_in('user.name', ds.config)
# from .datalad/config
assert_in('datalad.dataset.id', ds.config)
# from .git/config
assert_in('annex.version', ds.config)
with chpwd(path):
# now check that no config from a random dataset at PWD is picked up
# if not dataset instance was provided
cfg = ConfigManager(dataset=None, source='any')
assert_in('user.name', cfg)
assert_not_in('datalad.dataset.id', cfg)
assert_not_in('annex.version', cfg)
def test_global_config():
# from within tests, global config should be read from faked $HOME (see
# setup_package) or from GIT_CONFIG_GLOBAL
if 'GIT_CONFIG_GLOBAL' in os.environ.keys():
glb_cfg_file = Path(os.environ.get('GIT_CONFIG_GLOBAL'))
else:
glb_cfg_file = Path(os.path.expanduser('~')) / '.gitconfig'
assert any(glb_cfg_file.samefile(Path(p)) for p in dl_cfg._stores['git']['files'])
assert_equal(dl_cfg.get("user.name"), "DataLad Tester")
assert_equal(dl_cfg.get("user.email"), "test@example.com")
@pytest.mark.filterwarnings(r"ignore: status\(report_filetype=\) no longer supported")
@with_tempfile()
@with_tempfile()
def test_bare(src=None, path=None):
# create a proper datalad dataset with all bells and whistles
ds = Dataset(src).create()
dlconfig_sha = ds.repo.call_git(['rev-parse', 'HEAD:.datalad/config']).strip()
# can we handle a bare repo version of it?
gr = AnnexRepo.clone(
src, path, clone_options=['--bare', '-b', DEFAULT_BRANCH])
# we had to specifically checkout the standard branch, because on crippled
# FS, HEAD will point to an adjusted branch by default, and the test logic
# below does not account for this case.
# this should just make sure the bare repo has the expected setup,
# but it should still be bare. Let's check that to be sure
assert_true(gr.bare)
# do we read the correct local config?
assert_in(gr.pathobj / 'config', gr.config._stores['git']['files'])
# do we pick up the default branch config too?
assert_in('blob:HEAD:.datalad/config',
gr.config._stores['branch']['files'])
# and track its reload stamp via its file shasum
assert_equal(
dlconfig_sha,
gr.config._stores['branch']['stats']['blob:HEAD:.datalad/config'])
# check that we can pick up the dsid from the commit branch config
assert_equal(ds.id, gr.config.get('datalad.dataset.id'))
# and it is coming from the correct source
assert_equal(
ds.id,
gr.config.get_from_source('branch', 'datalad.dataset.id'))
assert_equal(
None,
gr.config.get_from_source('local', 'datalad.dataset.id'))
# any sensible (and also our CI) test environment(s) should have this
assert_in('user.name', gr.config)
# not set something that wasn't there
obscure_key = 'sec.reallyobscurename!@@.key'
assert_not_in(obscure_key, gr.config)
# to the local config, which is easily accessible
gr.config.set(obscure_key, 'myvalue', scope='local')
assert_equal(gr.config.get(obscure_key), 'myvalue')
# now make sure the config is where we think it is
assert_in(obscure_key.split('.')[1], (gr.pathobj / 'config').read_text())
# update committed config and check update
old_id = ds.id
ds.config.set('datalad.dataset.id', 'surprise!', scope='branch')
ds.save()
# fetch into default branch (like `update`, but for bare-repos)
gr.call_git([
'fetch', f'{DEFAULT_REMOTE}', f'{DEFAULT_BRANCH}:{DEFAULT_BRANCH}'])
# without a reload, no state change, like with non-bare repos
assert_equal(
old_id,
gr.config.get_from_source('branch', 'datalad.dataset.id'))
# a non-forced reload() must be enough, because state change
# detection kicks in
gr.config.reload()
assert_equal('surprise!', gr.config.get('datalad.dataset.id'))
@with_tempfile()
def test_write_config_section(path=None):
# can we handle a bare repo?
gr = GitRepo(path, create=True, bare=True)
obscure = "ds-; &%b5{}# some % "
# test cases
# first 3 args are write_config_section() parameters
# 4th arg is a list with key/value pairs that should end up in a
# ConfigManager after a reload
testcfg = [
('submodule', 'sub', dict(active='true', url='http://example.com'), [
('submodule.sub.active', 'true'),
('submodule.sub.url', 'http://example.com'),
]),
('submodule', 'sub"quote', {"a-b": '"quoted"', 'c': 'with"quote'}, [
('submodule.sub"quote.a-b', '"quoted"'),
('submodule.sub"quote.c', 'with"quote'),
]),
('short', ' s p a c e ', {"a123": ' space all over '}, [
('short. s p a c e .a123', ' space all over '),
]),
('submodule', obscure, {
'path': obscure,
'url': f"./{obscure}"}, [
(f"submodule.{obscure}.path", obscure),
(f"submodule.{obscure}.url", f"./{obscure}"),
]),
]
for tc in testcfg:
# using append mode to provoke potential interference by
# successive calls
with (gr.pathobj / 'config').open('a') as fobj:
write_config_section(fobj, tc[0], tc[1], tc[2])
gr.config.reload()
for testcase in tc[3]:
assert_in(testcase[0], gr.config)
assert_equal(testcase[1], gr.config[testcase[0]])
@with_tempfile()
def test_external_modification(path=None):
from datalad.cmd import WitlessRunner as Runner
runner = Runner(cwd=path)
repo = GitRepo(path, create=True)
config = repo.config
key = 'sec.sub.key'
assert_not_in(key, config)
config.set(key, '1', scope='local')
assert_equal(config[key], '1')
# we pick up the case where we modified so size changed
runner.run(['git', 'config', '--local', '--replace-all', key, '10'])
# unfortunately we do not react for .get unless reload. But here
# we will test if reload is correctly decides to reload without force
config.reload()
assert_equal(config[key], '10')
# and no size change
runner.run(['git', 'config', '--local', '--replace-all', key, '11'])
config.reload()
assert_equal(config[key], '11')
# TODO: remove test along with the removal of deprecated 'where'
@pytest.mark.filterwarnings("ignore: 'where' is deprecated")
@pytest.mark.filterwarnings("ignore: 'where=\"dataset\"' is deprecated")
def test_where_to_scope():
@_where_to_scope
def f(scope=None):
return scope
# others aren't affected but we map where to scope
assert_equal(f(where='local'), 'local')
assert_equal(f(scope='local'), 'local')
# we do mapping to 'branch' for where
assert_equal(f(where='dataset'), 'branch')
# but not for 'scope' -- since that is the target new name, we pass as is
assert_equal(f(scope='dataset'), 'dataset')
# we do not allow both
assert_raises(ValueError, f, where='local', scope='local')
def test_cross_cfgman_update(tmp_path):
myuniqcfg = 'datalad.tester.unique.updatecfg'
myuniqcfg_value = 'some'
myuniqcfg_value2 = 'someother'
assert myuniqcfg not in dl_cfg
ds = Dataset(tmp_path)
assert not ds.is_installed()
# there is no dataset to write to, it rejects it rightfully
# it is a bit versatile in its exception behavior
# https://github.com/datalad/datalad/issues/7300
with pytest.raises((ValueError, CommandError)):
ds.config.set(myuniqcfg, myuniqcfg_value, scope='local')
# but we can write to global scope
ds.config.set(myuniqcfg, myuniqcfg_value, scope='global')
# it can retrieve the update immediately, because set(reload=)
# defaults to True
assert ds.config.get(myuniqcfg) == myuniqcfg_value
# given that we modified the global scope, we expect this to
# be reflected in the global cfgman too
assert dl_cfg.get(myuniqcfg) == myuniqcfg_value
# now we create a repo
ds.create(result_renderer='disabled')
# we had written to global scope, we expect the probe item
# to stick around, even though the cfgman instance is replaced
assert ds.config.get(myuniqcfg) == myuniqcfg_value
# now we replace the value via this new cfgman
ds.config.set(myuniqcfg, myuniqcfg_value2, scope='global')
# and again expect the global instance to catch up with it
assert dl_cfg.get(myuniqcfg) == myuniqcfg_value2
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_constraints.py 0000644 0001751 0001751 00000016316 15137634221 021303 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the DataLad package for the
# copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
'''Unit tests for basic constraints functionality.'''
from datalad.tests.utils_pytest import (
assert_equal,
assert_raises,
)
from ..support import constraints as ct
def test_int():
c = ct.EnsureInt()
# this should always work
assert_equal(c(7), 7)
assert_equal(c(7.0), 7)
assert_equal(c('7'), 7)
assert_equal(c([7, 3]), [7, 3])
# this should always fail
assert_raises(ValueError, lambda: c('fail'))
assert_raises(ValueError, lambda: c([3, 'fail']))
# this will also fail
assert_raises(ValueError, lambda: c('17.0'))
assert_equal(c.short_description(), 'int')
def test_float():
c = ct.EnsureFloat()
# this should always work
assert_equal(c(7.0), 7.0)
assert_equal(c(7), 7.0)
assert_equal(c('7'), 7.0)
assert_equal(c([7.0, '3.0']), [7.0, 3.0])
# this should always fail
assert_raises(ValueError, lambda: c('fail'))
assert_raises(ValueError, lambda: c([3.0, 'fail']))
def test_bool():
c = ct.EnsureBool()
# this should always work
assert_equal(c(True), True)
assert_equal(c(False), False)
# all that results in True
assert_equal(c('True'), True)
assert_equal(c('true'), True)
assert_equal(c('1'), True)
assert_equal(c('yes'), True)
assert_equal(c('on'), True)
assert_equal(c('enable'), True)
# all that results in False
assert_equal(c('false'), False)
assert_equal(c('False'), False)
assert_equal(c('0'), False)
assert_equal(c('no'), False)
assert_equal(c('off'), False)
assert_equal(c('disable'), False)
# this should always fail
assert_raises(ValueError, c, 0)
assert_raises(ValueError, c, 1)
def test_str():
c = ct.EnsureStr()
# this should always work
assert_equal(c('hello'), 'hello')
assert_equal(c('7.0'), '7.0')
# this should always fail
assert_raises(ValueError, lambda: c(['ab']))
assert_raises(ValueError, lambda: c(['a', 'b']))
assert_raises(ValueError, lambda: c(('a', 'b')))
# no automatic conversion attempted
assert_raises(ValueError, lambda: c(7.0))
assert_equal(c.short_description(), 'str')
def test_str_min_len():
c = ct.EnsureStr(min_len=1)
assert_equal(c('hello'), 'hello')
assert_equal(c('h'), 'h')
assert_raises(ValueError, c, '')
c = ct.EnsureStr(min_len=2)
assert_equal(c('hello'), 'hello')
assert_raises(ValueError, c, 'h')
def test_none():
c = ct.EnsureNone()
# this should always work
assert_equal(c(None), None)
# instance of NoneDeprecated is also None
assert_equal(c(ct.NoneDeprecated), None)
# this should always fail
assert_raises(ValueError, lambda: c('None'))
assert_raises(ValueError, lambda: c([]))
def test_callable():
c = ct.EnsureCallable()
# this should always work
assert_equal(c(range), range)
assert_raises(ValueError, c, 'range')
def test_choice():
c = ct.EnsureChoice('choice1', 'choice2', None)
# this should always work
assert_equal(c('choice1'), 'choice1')
assert_equal(c(None), None)
# this should always fail
assert_raises(ValueError, lambda: c('fail'))
assert_raises(ValueError, lambda: c('None'))
def test_keychoice():
c = ct.EnsureKeyChoice(key='some', values=('choice1', 'choice2', None))
assert_equal(c({'some': 'choice1'}), {'some': 'choice1'})
assert_equal(c({'some': None}), {'some': None})
assert_equal(c({'some': None, 'ign': 'ore'}), {'some': None, 'ign': 'ore'})
assert_raises(ValueError, c, 'fail')
assert_raises(ValueError, c, 'None')
assert_raises(ValueError, c, {'nope': 'None'})
assert_raises(ValueError, c, {'some': 'None'})
assert_raises(ValueError, c, {'some': ('a', 'b')})
def test_range():
c = ct.EnsureRange(min=3, max=7)
# this should always work
assert_equal(c(3.0), 3.0)
# this should always fail
assert_raises(ValueError, lambda: c(2.9999999))
assert_raises(ValueError, lambda: c(77))
assert_raises(TypeError, lambda: c('fail'))
assert_raises(TypeError, lambda: c((3, 4)))
# since no type checks are performed
assert_raises(TypeError, lambda: c('7'))
# Range doesn't have to be numeric
c = ct.EnsureRange(min="e", max="qqq")
assert_equal(c('e'), 'e')
assert_equal(c('fa'), 'fa')
assert_equal(c('qq'), 'qq')
assert_raises(ValueError, c, 'a')
assert_raises(ValueError, c, 'qqqa')
def test_listof():
c = ct.EnsureListOf(str)
assert_equal(c(['a', 'b']), ['a', 'b'])
assert_equal(c(['a1', 'b2']), ['a1', 'b2'])
assert_equal(c('a1 b2'), ['a1 b2'])
def test_tupleof():
c = ct.EnsureTupleOf(str)
assert_equal(c(('a', 'b')), ('a', 'b'))
assert_equal(c(('a1', 'b2')), ('a1', 'b2'))
assert_equal(c('a1 b2'), ('a1 b2',))
def test_constraints():
# this should always work
c = ct.Constraints(ct.EnsureFloat())
assert_equal(c(7.0), 7.0)
c = ct.Constraints(ct.EnsureFloat(), ct.EnsureRange(min=4.0))
assert_equal(c(7.0), 7.0)
# __and__ form
c = ct.EnsureFloat() & ct.EnsureRange(min=4.0)
assert_equal(c(7.0), 7.0)
assert_raises(ValueError, c, 3.9)
c = ct.Constraints(ct.EnsureFloat(), ct.EnsureRange(min=4), ct.EnsureRange(max=9))
assert_equal(c(7.0), 7.0)
assert_raises(ValueError, c, 3.9)
assert_raises(ValueError, c, 9.01)
# __and__ form
c = ct.EnsureFloat() & ct.EnsureRange(min=4) & ct.EnsureRange(max=9)
assert_equal(c(7.0), 7.0)
assert_raises(ValueError, c, 3.99)
assert_raises(ValueError, c, 9.01)
# and reordering should not have any effect
c = ct.Constraints(ct.EnsureRange(max=4), ct.EnsureRange(min=9), ct.EnsureFloat())
assert_raises(ValueError, c, 3.99)
assert_raises(ValueError, c, 9.01)
def test_altconstraints():
# this should always work
c = ct.AltConstraints(ct.EnsureFloat())
assert_equal(c(7.0), 7.0)
c = ct.AltConstraints(ct.EnsureFloat(), ct.EnsureNone())
assert_equal(c.short_description(), '(float or None)')
assert_equal(c(7.0), 7.0)
assert_equal(c(None), None)
# __or__ form
c = ct.EnsureFloat() | ct.EnsureNone()
assert_equal(c(7.0), 7.0)
assert_equal(c(None), None)
# this should always fail
c = ct.Constraints(ct.EnsureRange(min=0, max=4), ct.EnsureRange(min=9, max=11))
assert_raises(ValueError, c, 7.0)
c = ct.EnsureRange(min=0, max=4) | ct.EnsureRange(min=9, max=11)
assert_equal(c(3.0), 3.0)
assert_equal(c(9.0), 9.0)
assert_raises(ValueError, c, 7.0)
assert_raises(ValueError, c, -1.0)
def test_both():
# this should always work
c = ct.AltConstraints(
ct.Constraints(
ct.EnsureFloat(),
ct.EnsureRange(min=7.0, max=44.0)),
ct.EnsureNone())
assert_equal(c(7.0), 7.0)
assert_equal(c(None), None)
# this should always fail
assert_raises(ValueError, lambda: c(77.0))
def test_type_str():
assert_equal(ct._type_str((str,)), 'str')
assert_equal(ct._type_str(str), 'str')
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_direct_mode.py 0000644 0001751 0001751 00000006612 15137634221 021210 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test direct mode mechanic
"""
from unittest.mock import patch
from datalad.support import path as op
from datalad.support.annexrepo import AnnexRepo
from datalad.support.exceptions import (
CommandNotAvailableError,
DirectModeNoLongerSupportedError,
)
from datalad.tests.utils_pytest import (
SkipTest,
assert_in,
assert_raises,
with_tempfile,
)
# if on_windows:
# raise SkipTest("Can't test direct mode switch, "
# "if direct mode is forced by OS anyway.")
#
# repo_version = cfg.get("datalad.repo.version", None)
# if repo_version and int(repo_version) >= 6:
# raise SkipTest("Can't test direct mode switch, "
# "if repository version 6 or later is enforced.")
# originally lifted from AnnexRepo, kept here to simulate a repo
# that is still in direct mode
def _set_direct_mode(self, enable_direct_mode=True):
"""Switch to direct or indirect mode
WARNING! To be used only for internal development purposes.
We no longer support direct mode and thus setting it in a
repository would render it unusable for DataLad
Parameters
----------
enable_direct_mode: bool
True means switch to direct mode,
False switches to indirect mode
Raises
------
CommandNotAvailableError
in case you try to switch to indirect mode on a crippled filesystem
"""
if self.is_crippled_fs() and not enable_direct_mode:
# TODO: ?? DIRECT - should we call git annex upgrade?
raise CommandNotAvailableError(
cmd="git-annex indirect",
msg="Can't switch to indirect mode on that filesystem.")
self.call_annex(['direct' if enable_direct_mode else 'indirect']),
self.config.reload()
# For paranoid we will just re-request
self._direct_mode = None
assert(self.is_direct_mode() == enable_direct_mode)
# All further workarounds were stripped - no direct mode is supported
@with_tempfile
@with_tempfile
def test_direct_cfg(path1=None, path2=None):
# and if repo already exists and we have env var - we fail too
# Adding backend so we get some commit into the repo
ar = AnnexRepo(path1, create=True, backend='MD5E')
del ar; AnnexRepo._unique_instances.clear() # fight flyweight
for path in (path1, path2):
with patch.dict('os.environ', {'DATALAD_REPO_DIRECT': 'True'}):
# try to create annex repo in direct mode as see how it fails
with assert_raises(DirectModeNoLongerSupportedError) as cme:
AnnexRepo(path, create=True)
assert_in("no longer supported by DataLad", str(cme.value)) # we have generic part
assert_in("datalad.repo.direct configuration", str(cme.value)) # situation specific part
# assert not op.exists(path2) # that we didn't create it - we do!
# fixing for that would be too cumbersome since we first call GitRepo.__init__
# with create
ar = AnnexRepo(path1)
# check if we somehow didn't reset the flag
assert not ar.is_direct_mode()
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_dochelpers.py 0000644 0001751 0001751 00000007644 15137634221 021070 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Tests for dochelpers (largely copied from PyMVPA, the same copyright)
"""
from unittest.mock import patch
from datalad.tests.utils_pytest import (
assert_equal,
assert_re_in,
assert_true,
)
from ..dochelpers import (
borrowdoc,
borrowkwargs,
single_or_plural,
)
def test_basic():
assert_equal(single_or_plural('a', 'b', 1), 'a')
assert_equal(single_or_plural('a', 'b', 0), 'b')
assert_equal(single_or_plural('a', 'b', 123), 'b')
assert_equal(single_or_plural('a', 'b', 123, include_count=True), '123 b')
def test_borrow_doc():
class A(object):
def met1(self):
"""met1doc"""
pass # pragma: no cover
def met2(self):
"""met2doc"""
pass # pragma: no cover
class B(object):
@borrowdoc(A)
def met1(self):
pass # pragma: no cover
@borrowdoc(A, 'met1')
def met2(self):
pass # pragma: no cover
assert_equal(B.met1.__doc__, A.met1.__doc__)
assert_equal(B.met2.__doc__, A.met1.__doc__)
def test_borrow_kwargs():
class A(object):
def met1(self, kp1=None, kp2=1):
"""met1 doc
Parameters
----------
kp1 : None or int
keyword parameter 1
kp2 : int, optional
something
"""
pass # pragma: no cover
def met2(self):
"""met2doc"""
pass # pragma: no cover
class B(object):
@borrowkwargs(A)
def met1(self, desc, **kwargs):
"""B.met1 doc
Parameters
----------
desc
description
**kwargs
Same as in A.met1
Some postamble
"""
pass # pragma: no cover
@borrowkwargs(A, 'met1')
def met_nodoc(self, **kwargs):
pass # pragma: no cover
@borrowkwargs(methodname=A.met1)
def met_anothermet(self, **kwargs):
pass # pragma: no cover
@borrowkwargs(A, 'met1')
def met_nodockwargs(self, bogus=None, **kwargs):
"""B.met_nodockwargs
Parameters
----------
bogus
something
"""
pass # pragma: no cover
if True:
# Just so we get different indentation level
@borrowkwargs(A, 'met1', ['kp1'])
def met_excludes(self, boguse=None, **kwargs):
"""B.met_excludes
Parameters
----------
boguse
something
"""
pass # pragma: no cover
assert_true('B.met1 doc' in B.met1.__doc__)
for m in (B.met1,
B.met_nodoc,
B.met_anothermet,
B.met_nodockwargs,
B.met_excludes):
docstring = m.__doc__
assert_true('Parameters' in docstring)
assert_true(not '*kwargs' in docstring,
msg="We shouldn't carry kwargs in docstring now,"
"Got %r for %s" % (docstring, m))
assert_true('kp2 ' in docstring)
assert_true((('kp1 ' in docstring)
^ (m == B.met_excludes)))
# indentation should have been squashed properly
assert_true(not ' ' in docstring)
# some additional checks to see if we are not losing anything
assert_true('Some postamble' in B.met1.__doc__)
assert_true('B.met_nodockwargs' in B.met_nodockwargs.__doc__)
assert_true('boguse' in B.met_excludes.__doc__)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_installed.py 0000644 0001751 0001751 00000003131 15137634221 020702 0 ustar 00runner runner # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test invocation of datalad utilities "as is installed"
"""
import os
from unittest.mock import patch
from datalad.cmd import (
StdOutErrCapture,
WitlessRunner,
)
from datalad.support.exceptions import CommandError
from datalad.tests.utils_pytest import (
assert_cwd_unchanged,
eq_,
ok_startswith,
)
def check_run_and_get_output(cmd):
runner = WitlessRunner()
try:
# suppress log output happen it was set to high values
with patch.dict('os.environ', {'DATALAD_LOG_LEVEL': 'WARN'}):
output = runner.run(
["datalad", "--help"],
protocol=StdOutErrCapture)
except CommandError as e:
raise AssertionError("'datalad --help' failed to start normally. "
"Exited with %d and output %s" % (e.code, (e.stdout, e.stderr)))
return output['stdout'], output['stderr']
@assert_cwd_unchanged
def test_run_datalad_help():
out, err = check_run_and_get_output("datalad --help")
ok_startswith(out, "Usage: ")
# There could be a warning from coverage that no data was collected, should be benign
lines = [l for l in err.split(os.linesep) if ('no-data-collected' not in l) and l]
eq_(lines, [])
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_interface.py 0000644 0001751 0001751 00000010102 15137634221 020657 0 ustar 00runner runner # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test command call wrapper
"""
import re
from datalad.interface.base import (
Interface,
get_api_name,
)
from datalad.support import constraints as cnstr
from datalad.support.param import Parameter
from datalad.tests.utils_pytest import (
assert_equal,
assert_in,
assert_is,
assert_raises,
assert_re_in,
assert_true,
swallow_outputs,
)
class Demo(Interface):
"""I am a demo"""
_params_ = dict(
demoposarg=Parameter(
doc="demoposdoc",
constraints=cnstr.EnsureInt(),
nargs=2),
demooptposarg1=Parameter(
args=('demooptposarg1',),
doc="demooptposdoc1",
constraints=cnstr.EnsureInt(),
nargs='?'),
demooptposarg2=Parameter(
args=('demooptposarg2',),
doc="demooptposdoc2",
constraints=cnstr.EnsureInt(),
nargs='?'),
demoarg=Parameter(
doc="demodoc",
constraints=cnstr.EnsureInt()))
def __call__(self, demoposarg, demooptposarg1=99, demooptposarg2=999, demoarg=100):
return demoarg
def test_param():
# having a parameter with no information is fine
# it doesn't need a name, because it comes from the signature
# of the actual implementation that is described
p = Parameter()
pname = 'testname'
# minimal docstring
assert_equal(pname, p.get_autodoc('testname'))
doc = 'somedoc'
p = Parameter(doc=doc)
assert_equal('%s\n %s.' % (pname, doc), p.get_autodoc('testname'))
# constraints
p = Parameter(doc=doc, constraints=cnstr.EnsureInt() | cnstr.EnsureStr())
autodoc = p.get_autodoc('testname')
assert_true('int or str' in autodoc)
with assert_raises(ValueError) as cmr:
Parameter(unknown_arg=123)
assert_in('Detected unknown argument(s) for the Parameter: unknown_arg',
str(cmr.value))
def test_interface():
di = Demo()
import argparse
parser = argparse.ArgumentParser()
from datalad.cli.parser import setup_parser_for_interface
setup_parser_for_interface(parser, di)
with swallow_outputs() as cmo:
assert_equal(parser.print_help(), None)
assert(cmo.out)
assert_equal(cmo.err, '')
args = parser.parse_args(['42', '11', '1', '2', '--demoarg', '23'])
assert_is(args.demoarg, 23)
assert_equal(args.demoposarg, [42, 11])
assert_equal(args.demooptposarg1, 1)
assert_equal(args.demooptposarg2, 2)
# wrong type
with swallow_outputs() as cmo:
assert_raises(SystemExit, parser.parse_args, ['--demoarg', 'abc'])
# that is what we dump upon folks atm. TODO: improve reporting of illspecified options
assert_re_in(".*invalid constraint:int value:.*",
cmo.err, re.DOTALL)
# missing argument to option
with swallow_outputs() as cmo:
assert_raises(SystemExit, parser.parse_args, ['--demoarg'])
assert_re_in(".*--demoarg: expected one argument", cmo.err, re.DOTALL)
# missing positional argument
with swallow_outputs() as cmo:
assert_raises(SystemExit, parser.parse_args, [''])
# PY2|PY3
assert_re_in(".*error: (too few arguments|the following arguments are required: demoposarg)",
cmo.err, re.DOTALL)
def test_name_generation():
assert_equal(
get_api_name(("some.module", "SomeClass")),
'module')
assert_equal(
get_api_name(("some.module", "SomeClass", "cmdline-override")),
'module')
assert_equal(
get_api_name(("some.module",
"SomeClass",
"cmdline_override",
"api_override-dont-touch")),
"api_override-dont-touch")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_log.py 0000644 0001751 0001751 00000020037 15137634221 017510 0 ustar 00runner runner # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test logging facilities """
import inspect
import logging
import os.path
from logging import makeLogRecord
from os.path import exists
from unittest.mock import patch
from datalad import cfg as dl_cfg
from datalad.log import (
ColorFormatter,
LoggerHelper,
TraceBack,
log_progress,
with_progress,
with_result_progress,
)
from datalad.support import ansi_colors as colors
from datalad.support.constraints import EnsureBool
from datalad.tests.utils_pytest import (
SkipTest,
assert_equal,
assert_in,
assert_no_open_files,
assert_not_in,
assert_re_in,
known_failure_githubci_win,
ok_,
ok_endswith,
ok_generator,
swallow_logs,
with_tempfile,
)
from datalad.utils import on_windows
# pretend we are in interactive mode so we could check if coloring is
# disabled
@patch("datalad.log.is_interactive", lambda: True)
@with_tempfile
def test_logging_to_a_file(dst=None):
ok_(not exists(dst))
lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
ok_(exists(dst)) # nothing was logged -- no file created
msg = "Oh my god, they killed Kenny"
lgr.error(msg)
with open(dst) as f:
lines = f.readlines()
assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
line = lines[0]
ok_(msg in line)
ok_('\033[' not in line,
msg="There should be no color formatting in log files. Got: %s" % line)
# verify that time stamp and level are present in the log line
# do not want to rely on not having race conditions around date/time changes
# so matching just with regexp
# (...)? is added to swallow possible traceback logs
regex = r"\[ERROR\]"
if EnsureBool()(dl_cfg.get('datalad.log.timestamp', False)):
regex = r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
if EnsureBool()(dl_cfg.get('datalad.log.vmem', False)):
regex += r' RSS/VMS: \S+/\S+( \S+)?\s*'
regex += r"(\s+\S+\s*)? " + msg
assert_re_in(regex, line, match=True)
# Python's logger is ok (although not documented as supported) to accept
# non-string messages, which could be str()'ed. We should not puke
msg2 = "Kenny is alive"
lgr.error(RuntimeError(msg2))
with open(dst) as f:
assert_in(msg2, f.read())
# Close all handlers so windows is happy -- apparently not closed fast enough
for handler in lgr.handlers:
handler.close()
assert_no_open_files(dst)
@with_tempfile
def test_logtarget_via_env_variable(dst=None):
with patch.dict('os.environ', {'DATALADTEST_LOG_TARGET': dst}):
ok_(not exists(dst))
lgr = LoggerHelper("dataladtest-2").get_initialized_logger()
ok_(not exists(dst))
# just to see that mocking patch worked
ok_('DATALADTEST_LOG_TARGET' not in os.environ)
@with_tempfile
@with_tempfile
def test_mutliple_targets(dst1=None, dst2=None):
ok_(not exists(dst1))
ok_(not exists(dst2))
lgr = LoggerHelper("dataladtest-3").get_initialized_logger(
logtarget="%s,%s" % (dst1, dst2))
ok_(exists(dst1))
ok_(exists(dst2))
msg = "Oh my god, they killed Kenny"
lgr.error(msg)
for dst in (dst1, dst2):
with open(dst) as f:
lines = f.readlines()
assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
ok_(msg in lines[0])
# Close all handlers so windows is happy -- apparently not closed fast enough
for handler in lgr.handlers:
handler.close()
def check_filters(name):
with swallow_logs(new_level=logging.DEBUG, name=name) as cml:
lgr1 = logging.getLogger(name + '.goodone')
lgr2 = logging.getLogger(name + '.anotherone')
lgr3 = logging.getLogger(name + '.bad')
lgr1.debug('log1')
lgr2.info('log2')
lgr3.info('log3')
assert_in('log1', cml.out)
assert_in('log2', cml.out)
assert_not_in('log3', cml.out)
def test_filters():
def _mock_names(self, v, d=None):
return 'datalad1.goodone,datalad1.anotherone' if v == 'names' else d
with patch.object(LoggerHelper, '_get_config', _mock_names):
LoggerHelper('datalad1').get_initialized_logger()
check_filters('datalad1')
def _mock_namesre(self, v, d=None):
return 'datalad.*one' if v == 'namesre' else d
with patch.object(LoggerHelper, '_get_config', _mock_namesre):
LoggerHelper('datalad2').get_initialized_logger()
check_filters('datalad2')
def test_traceback():
from inspect import (
currentframe,
getframeinfo,
)
# do not move lines below among themselves -- we rely on consistent line numbers ;)
tb_line = getframeinfo(currentframe()).lineno + 2
def rec(tb, n):
return rec(tb, n-1) if n else tb()
tb1 = rec(TraceBack(), 10)
ok_endswith(tb1, ">test_log:%d,%s" % (tb_line + 1, ",".join([str(tb_line)]*10)))
# we limit to the last 100
tb1 = rec(TraceBack(collide=True), 110)
ok_endswith(tb1, "β¦>test_log:%s" % (",".join([str(tb_line)]*100)))
@known_failure_githubci_win
def test_color_formatter():
# want to make sure that coloring doesn't get "stuck"
for use_color in False, True, False:
# we can't reuse the same object since it gets colored etc inplace
rec = makeLogRecord(
dict(msg='very long message',
levelname='DEBUG',
name='some name'))
cf = ColorFormatter(use_color=use_color)
if on_windows:
raise SkipTest('Unclear under which conditions coloring should work')
(assert_in if use_color else assert_not_in)(colors.RESET_SEQ, cf.format(rec))
# TODO: somehow test is stdout/stderr get their stuff
@patch("datalad.log.is_interactive", lambda: False)
def test_log_progress_noninteractive_filter():
name = "dl-test"
lgr = LoggerHelper(name).get_initialized_logger()
pbar_id = "lp_test"
with swallow_logs(new_level=logging.INFO, name=name) as cml:
log_progress(lgr.info, pbar_id, "Start", label="testing", total=3)
log_progress(lgr.info, pbar_id, "THERE0", update=1)
log_progress(lgr.info, pbar_id, "NOT", update=1,
noninteractive_level=logging.DEBUG)
log_progress(lgr.info, pbar_id, "THERE1", update=1,
noninteractive_level=logging.INFO)
log_progress(lgr.info, pbar_id, "Done")
for present in ["Start", "THERE0", "THERE1", "Done"]:
assert_in(present, cml.out)
assert_not_in("NOT", cml.out)
def test_with_result_progress_generator():
# Tests ability for the decorator to decorate a regular function
# or a generator function (then it returns a generator function)
@with_result_progress
def func(l):
return l
generated = []
@with_result_progress
def gen(l):
for i in l:
generated.append(i)
yield i
recs = [{'status': 'ok', 'unrelated': i} for i in range(2)]
# still works for a func and returns provided list
ok_(not inspect.isgeneratorfunction(func))
assert_equal(func(recs), recs)
# generator should still yield and next iteration should only happen
# when requested
ok_(inspect.isgeneratorfunction(gen))
g = gen(recs)
ok_generator(g)
assert_equal(generated, []) # nothing yet
assert_equal(next(g), recs[0])
assert_equal(generated, recs[:1])
assert_equal(next(g), recs[1])
assert_equal(generated, recs)
# just to make sure all good to redo
assert_equal(list(gen(recs)), recs)
def test_with_progress_generator():
# Well, we could also pass an iterable directly now and display
# progress iterative over it
g = with_progress(range(3))
ok_generator(g)
assert_equal(list(g), list(range(3)))
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_misc.py 0000644 0001751 0001751 00000001745 15137634221 017667 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from packaging.version import Version
import datalad
from datalad.support.network import (
get_url_response_stamp,
is_url_quoted,
)
from .utils_pytest import *
def test_is_url_quoted():
ok_(is_url_quoted('%22%27%3ba&b&cd|'))
ok_(not is_url_quoted('a b'))
def test_get_response_stamp():
r = get_url_response_stamp("http://www.example.com/1.dat",
{'Content-length': '101',
'Last-modified': 'Wed, 01 May 2013 03:02:00 GMT'})
eq_(r['size'], 101)
eq_(r['mtime'], 1367377320)
eq_(r['url'], "http://www.example.com/1.dat")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_s3.py 0000644 0001751 0001751 00000011460 15137634221 017254 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test S3 supporting functionality
"""
from datalad.downloaders.tests.utils import get_test_providers
from datalad.support.network import URL
from datalad.support.s3 import (
add_version_to_url,
get_versioned_url,
)
from datalad.tests.utils_pytest import (
assert_raises,
eq_,
ok_startswith,
skip_if_no_network,
use_cassette,
)
def test_add_version_to_url():
base_url = "http://ex.com/f.txt"
base_url_query = "http://ex.com/f.txt?k=v"
for replace in True, False:
eq_(add_version_to_url(URL(base_url), "new.id", replace=replace),
base_url + "?versionId=new.id")
eq_(add_version_to_url(URL(base_url_query),
"new.id", replace=replace),
base_url_query + "&versionId=new.id")
expected = "new.id" if replace else "orig.id"
eq_(add_version_to_url(URL(base_url + "?versionId=orig.id"),
"new.id",
replace=replace),
base_url + "?versionId=" + expected)
eq_(add_version_to_url(URL(base_url_query + "&versionId=orig.id"),
"new.id",
replace=replace),
base_url_query + "&versionId=" + expected)
@skip_if_no_network
@use_cassette('s3_test_version_url')
def test_get_versioned_url():
get_test_providers('s3://openfmri/tarballs') # to verify having credentials to access openfmri via S3
for url_pref in ('http://openfmri.s3.amazonaws.com', 'https://s3.amazonaws.com/openfmri'):
eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz"),
url_pref + "/tarballs/ds001_raw.tgz?versionId=null")
eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz?param=1"),
url_pref + "/tarballs/ds001_raw.tgz?param=1&versionId=null")
# We don't duplicate the version if it already exists.
eq_(get_versioned_url(url_pref + "/tarballs/ds001_raw.tgz?versionId=null"),
url_pref + "/tarballs/ds001_raw.tgz?versionId=null")
# something is wrong there
#print(get_versioned_url("http://openfmri.s3.amazonaws.com/ds001/demographics.txt"))
eq_(get_versioned_url("someurl"), "someurl") # should just return original one
assert_raises(RuntimeError, get_versioned_url, "someurl", guarantee_versioned=True)
# TODO: on a bucket without versioning
url = "http://datalad-test0-nonversioned.s3.amazonaws.com/2versions-removed-recreated.txt"
eq_(get_versioned_url(url), url)
eq_(get_versioned_url(url, return_all=True), [url])
assert_raises(NotImplementedError, get_versioned_url, "s3://buga")
urls = get_versioned_url("http://datalad-test0-versioned.s3.amazonaws.com/2versions-removed-recreated.txt",
return_all=True, verify=True)
eq_(len(set(urls)), len(urls)) # all unique
for url in urls:
# so we didn't grab other files along with the same prefix
ok_startswith(url, 'http://datalad-test0-versioned.s3.amazonaws.com/2versions-removed-recreated.txt?versionId=')
# Update a versioned URL with a newer version tag.
url_3ver = "http://datalad-test0-versioned.s3.amazonaws.com/3versions-allversioned.txt"
url_3ver_input = url_3ver + "?versionId=b.qCuh7Sg58VIYj8TVHzbRS97EvejzEl"
eq_(get_versioned_url(url_3ver_input), url_3ver_input)
eq_(get_versioned_url(url_3ver_input, update=True),
url_3ver + "?versionId=Kvuind11HZh._dCPaDAb0OY9dRrQoTMn")
@skip_if_no_network
@use_cassette('s3_test_version_url_anon')
def test_get_versioned_url_anon():
# The one without any authenticator, was crashing.
# Also it triggered another bug about having . in the bucket name
url_on = "http://dandiarchive.s3.amazonaws.com/ros3test.nwb"
url_on_versioned = get_versioned_url(url_on)
ok_startswith(url_on_versioned, url_on + "?versionId=")
@skip_if_no_network
@use_cassette('s3_test_version_url_deleted')
def test_version_url_deleted():
get_test_providers('s3://datalad-test0-versioned/', reload=True) # to verify having credentials to access
# openfmri via S3
# it existed and then was removed
fpath = "1version-removed.txt"
url = "http://datalad-test0-versioned.s3.amazonaws.com/%s" % fpath
turl = "http://datalad-test0-versioned.s3.amazonaws.com/%s" \
"?versionId=eZ5Hgwo8azfBv3QT7aW9dmm2sbLUY.QP" % fpath
eq_(get_versioned_url(url), turl)
# too heavy for verification!
#eq_(get_versioned_url(url, verify=True), turl)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_strings.py 0000644 0001751 0001751 00000002355 15137634221 020423 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from ..support.strings import apply_replacement_rules
from .utils_pytest import *
def test_apply_replacement_rules():
# replacement rule should be at least 3 char long
assert_raises(ValueError, apply_replacement_rules, '/', 'some')
assert_raises(ValueError, apply_replacement_rules, ['/a/b', '/'], 'some')
# and pattern should have the separator only twice
assert_raises(ValueError, apply_replacement_rules, '/ab', 'some')
assert_raises(ValueError, apply_replacement_rules, '/a/b/', 'some')
eq_(apply_replacement_rules('/a/b', 'abab'), 'bbbb')
eq_(apply_replacement_rules('/a/', 'abab'), 'bb')
eq_(apply_replacement_rules(['/a/b'], 'abab'), 'bbbb')
eq_(apply_replacement_rules(['/a/b', ',b,ab'], 'abab'), 'abababab')
# with regular expression groups
eq_(apply_replacement_rules(r'/st(.*)n(.*)$/\1-\2', 'string'), 'ri-g')
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_tests_utils_pytest.py 0000644 0001751 0001751 00000052200 15137634221 022716 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil; coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import base64
import logging
import os
import platform
import random
import sys
try:
# optional direct dependency we might want to kick out
import bs4
except ImportError: # pragma: no cover
bs4 = None
from glob import glob
from os.path import (
basename,
exists,
)
from os.path import join as opj
from unittest.mock import patch
from urllib.parse import quote as url_quote
from urllib.request import (
Request,
urlopen,
)
import pytest
from _pytest.outcomes import (
Failed,
Skipped,
)
from datalad import cfg as dl_cfg
from datalad.support import path as op
from datalad.support.gitrepo import GitRepo
from datalad.tests.utils_pytest import (
OBSCURE_FILENAMES,
OBSCURE_PREFIX,
assert_cwd_unchanged,
assert_dict_equal,
assert_false,
assert_in,
assert_not_in,
assert_raises,
assert_re_in,
assert_str_equal,
assert_true,
eq_,
get_most_obscure_supported_name,
ignore_nose_capturing_stdout,
known_failure_githubci_win,
known_failure_windows,
local_testrepo_flavors,
nok_startswith,
ok_,
ok_broken_symlink,
ok_file_has_content,
ok_file_under_git,
ok_generator,
ok_good_symlink,
ok_startswith,
ok_symlink,
on_github,
on_nfs,
on_windows,
patch_config,
probe_known_failure,
rmtemp,
run_under_dir,
serve_path_via_http,
skip_if,
skip_if_no_module,
skip_if_no_network,
skip_if_on_windows,
skip_ssh,
skip_wo_symlink_capability,
swallow_logs,
with_tempfile,
with_testsui,
with_tree,
without_http_proxy,
)
from datalad.utils import (
Path,
chpwd,
getpwd,
)
#
# Test with_tempfile, especially nested invocations
#
@with_tempfile
def _with_tempfile_decorated_dummy(path):
return path
def test_with_tempfile_dir_via_env_variable():
target = os.path.join(os.path.expanduser("~"), "dataladtesttmpdir")
assert_false(os.path.exists(target), "directory %s already exists." % target)
with patch_config({'datalad.tests.temp.dir': target}):
filename = _with_tempfile_decorated_dummy()
ok_startswith(filename, target)
@with_tempfile
@with_tempfile
def test_nested_with_tempfile_basic(f1=None, f2=None):
ok_(f1 != f2)
ok_(not os.path.exists(f1))
ok_(not os.path.exists(f2))
# And the most obscure case to test. Generator for the test is
# used as well to verify that every one of those functions adds new argument
# to the end of incoming arguments.
@with_tempfile(prefix="TEST", suffix='big')
@with_tree((('f1.txt', 'load'),))
@with_tempfile(suffix='.cfg')
@with_tempfile(suffix='.cfg.old')
def check_nested_with_tempfile_parametrized_surrounded(
param, f0=None, tree=None, f1=None, f2=None, repo=None):
eq_(param, "param1")
ok_(f0.endswith('big'), msg="got %s" % f0)
ok_(os.path.basename(f0).startswith('TEST'), msg="got %s" % f0)
ok_(os.path.exists(os.path.join(tree, 'f1.txt')))
ok_(f1 != f2)
ok_(f1.endswith('.cfg'), msg="got %s" % f1)
ok_(f2.endswith('.cfg.old'), msg="got %s" % f2)
def test_nested_with_tempfile_parametrized_surrounded():
check_nested_with_tempfile_parametrized_surrounded("param1")
@with_tempfile(content="testtest")
def test_with_tempfile_content(f=None):
ok_file_has_content(f, "testtest")
ok_file_has_content(f, "test*", re_=True)
def test_with_tempfile_content_raises_on_mkdir():
@with_tempfile(content="test", mkdir=True)
def t(): # pragma: no cover
raise AssertionError("must not be run")
with assert_raises(ValueError):
# after this commit, it will check when invoking, not when decorating
t()
def test_get_resolved_values():
from datalad.tests.utils_pytest import _get_resolved_flavors
flavors = ['networkish', 'local']
eq_(([] if dl_cfg.get('datalad.tests.nonetwork') else ['networkish'])
+ ['local'],
_get_resolved_flavors(flavors))
with patch_config({'datalad.tests.nonetwork': '1'}):
eq_(_get_resolved_flavors(flavors), ['local'])
@pytest.mark.xfail(on_github and on_nfs, reason="unknown. TODO: figure out")
def test_with_tempfile_mkdir():
dnames = [] # just to store the name within the decorated function
@with_tempfile(mkdir=True)
def check_mkdir(d1):
ok_(os.path.exists(d1))
ok_(os.path.isdir(d1))
dnames.append(d1)
eq_(glob(os.path.join(d1, '*')), [])
# Create a file to assure we can remove later the temporary load
with open(os.path.join(d1, "test.dat"), "w") as f:
f.write("TEST LOAD")
check_mkdir()
if not dl_cfg.get('datalad.tests.temp.keep'):
ok_(not os.path.exists(dnames[0])) # got removed
@with_tempfile()
def test_with_tempfile_default_prefix(d1=None):
d = basename(d1)
short = 'datalad_temp_'
full = short + \
'test_with_tempfile_default_prefix'
if on_windows:
ok_startswith(d, short)
nok_startswith(d, full)
else:
ok_startswith(d, full)
@with_tempfile(prefix="nodatalad_")
def test_with_tempfile_specified_prefix(d1=None):
ok_startswith(basename(d1), 'nodatalad_')
ok_('test_with_tempfile_specified_prefix' not in d1)
def test_get_most_obscure_supported_name():
n = get_most_obscure_supported_name()
ok_startswith(n, OBSCURE_PREFIX)
ok_(len(OBSCURE_FILENAMES) > 1)
# from more complex to simpler ones
ok_(len(OBSCURE_FILENAMES[0]) > len(OBSCURE_FILENAMES[-1]))
print(repr(n))
def test_keeptemp_via_env_variable():
if dl_cfg.get('datalad.tests.temp.keep'): # pragma: no cover
pytest.skip("We have env variable set to preserve tempfiles")
files = []
@with_tempfile()
def check(f):
open(f, 'w').write("LOAD")
files.append(f)
with patch.dict('os.environ', {}):
check()
with patch.dict('os.environ', {'DATALAD_TESTS_TEMP_KEEP': '1'}):
check()
eq_(len(files), 2)
ok_(not exists(files[0]), msg="File %s still exists" % files[0])
ok_( exists(files[1]), msg="File %s not exists" % files[1])
rmtemp(files[-1])
@skip_wo_symlink_capability
@with_tempfile
def test_ok_symlink_helpers(tmpfile=None):
assert_raises(AssertionError, ok_symlink, tmpfile)
assert_raises(AssertionError, ok_good_symlink, tmpfile)
assert_raises(AssertionError, ok_broken_symlink, tmpfile)
tmpfile_symlink = tmpfile + '_symlink'
Path(tmpfile_symlink).symlink_to(Path(tmpfile))
# broken symlink
ok_symlink(tmpfile_symlink)
ok_broken_symlink(tmpfile_symlink)
assert_raises(AssertionError, ok_good_symlink, tmpfile_symlink)
with open(tmpfile, 'w') as tf:
tf.write('test text')
# tmpfile is still not a symlink here
assert_raises(AssertionError, ok_symlink, tmpfile)
assert_raises(AssertionError, ok_good_symlink, tmpfile)
assert_raises(AssertionError, ok_broken_symlink, tmpfile)
ok_symlink(tmpfile_symlink)
ok_good_symlink(tmpfile_symlink)
assert_raises(AssertionError, ok_broken_symlink, tmpfile_symlink)
def test_ok_startswith():
ok_startswith('abc', 'abc')
ok_startswith('abc', 'a')
ok_startswith('abc', '')
ok_startswith(' abc', ' ')
ok_startswith('abc\r\n', 'a') # no effect from \r\n etc
assert_raises(AssertionError, ok_startswith, 'abc', 'b')
assert_raises(AssertionError, ok_startswith, 'abc', 'abcd')
def test_nok_startswith():
nok_startswith('abc', 'bc')
nok_startswith('abc', 'c')
assert_raises(AssertionError, nok_startswith, 'abc', 'a')
assert_raises(AssertionError, nok_startswith, 'abc', 'abc')
def test_ok_generator():
def func(a, b=1):
return a+b
def gen(a, b=1): # pragma: no cover
yield a+b
# not sure how to determine if xrange is a generator
assert_raises(AssertionError, ok_generator, range(2))
assert_raises(AssertionError, ok_generator, gen)
ok_generator(gen(1))
assert_raises(AssertionError, ok_generator, func)
assert_raises(AssertionError, ok_generator, func(1))
@pytest.mark.parametrize("func", [os.chdir, chpwd])
def test_assert_Xwd_unchanged(func):
orig_cwd = os.getcwd()
orig_pwd = getpwd()
@assert_cwd_unchanged
def do_chdir():
func(os.pardir)
with assert_raises(AssertionError) as cm:
do_chdir()
eq_(orig_cwd, os.getcwd(),
"assert_cwd_unchanged didn't return us back to cwd %s" % orig_cwd)
eq_(orig_pwd, getpwd(),
"assert_cwd_unchanged didn't return us back to pwd %s" % orig_pwd)
@pytest.mark.parametrize("func", [os.chdir, chpwd])
def test_assert_Xwd_unchanged_ok_chdir(func):
# Test that we are not masking out other "more important" exceptions
orig_cwd = os.getcwd()
orig_pwd = getpwd()
@assert_cwd_unchanged(ok_to_chdir=True)
def do_chdir_value_error():
func(os.pardir)
return "a value"
with swallow_logs() as cml:
eq_(do_chdir_value_error(), "a value")
eq_(orig_cwd, os.getcwd(),
"assert_cwd_unchanged didn't return us back to cwd %s" % orig_cwd)
eq_(orig_pwd, getpwd(),
"assert_cwd_unchanged didn't return us back to cwd %s" % orig_pwd)
assert_not_in("Mitigating and changing back", cml.out)
def test_assert_cwd_unchanged_not_masking_exceptions():
# Test that we are not masking out other "more important" exceptions
orig_cwd = os.getcwd()
@assert_cwd_unchanged
def do_chdir_value_error():
os.chdir(os.pardir)
raise ValueError("error exception")
with swallow_logs(new_level=logging.WARN) as cml:
with assert_raises(ValueError) as cm:
do_chdir_value_error()
# retrospect exception
eq_(orig_cwd, os.getcwd(),
"assert_cwd_unchanged didn't return us back to %s" % orig_cwd)
assert_in("Mitigating and changing back", cml.out)
# and again but allowing to chdir
@assert_cwd_unchanged(ok_to_chdir=True)
def do_chdir_value_error():
os.chdir(os.pardir)
raise ValueError("error exception")
with swallow_logs(new_level=logging.WARN) as cml:
assert_raises(ValueError, do_chdir_value_error)
eq_(orig_cwd, os.getcwd(),
"assert_cwd_unchanged didn't return us back to %s" % orig_cwd)
assert_not_in("Mitigating and changing back", cml.out)
@with_tempfile(mkdir=True)
def _test_serve_path_via_http(test_fpath, use_ssl, auth, tmp_dir): # pragma: no cover
tmp_dir = Path(tmp_dir)
test_fpath = Path(test_fpath)
# First verify that filesystem layer can encode this filename
# verify first that we could encode file name in this environment
try:
filesysencoding = sys.getfilesystemencoding()
test_fpath_encoded = str(test_fpath.as_posix()).encode(filesysencoding)
except UnicodeEncodeError: # pragma: no cover
pytest.skip("Environment doesn't support unicode filenames")
if test_fpath_encoded.decode(filesysencoding) != test_fpath.as_posix(): # pragma: no cover
pytest.skip("Can't convert back/forth using %s encoding"
% filesysencoding)
test_fpath_full = tmp_dir / test_fpath
test_fpath_full.parent.mkdir(parents=True, exist_ok=True)
test_fpath_full.write_text(
f'some txt and a randint {random.randint(1, 10)}')
@serve_path_via_http(tmp_dir, use_ssl=use_ssl, auth=auth)
def test_path_and_url(path, url):
def _urlopen(url, auth=None):
req = Request(url)
if auth:
req.add_header(
"Authorization",
b"Basic " + base64.standard_b64encode(
'{0}:{1}'.format(*auth).encode('utf-8')))
return urlopen(req)
# @serve_ should remove http_proxy from the os.environ if was present
if not on_windows:
assert_false('http_proxy' in os.environ)
# get the "dir-view"
dirurl = url + test_fpath.parent.as_posix()
u = _urlopen(dirurl, auth)
assert_true(u.getcode() == 200)
html = u.read()
# get the actual content
file_html = _urlopen(
url + url_quote(test_fpath.as_posix()), auth).read().decode()
# verify we got the right one
eq_(file_html, test_fpath_full.read_text())
if bs4 is None:
return
# MIH is not sure what this part below is supposed to do
# possibly some kind of internal consistency test
soup = bs4.BeautifulSoup(html, "html.parser")
href_links = [txt.get('href') for txt in soup.find_all('a')]
assert_true(len(href_links) == 1)
parsed_url = f"{dirurl}/{href_links[0]}"
u = _urlopen(parsed_url, auth)
html = u.read().decode()
eq_(html, file_html)
test_path_and_url()
@pytest.mark.parametrize("test_fpath", [
'test1.txt',
Path('test_dir', 'test2.txt'),
Path('test_dir', 'd2', 'd3', 'test3.txt'),
'file with space test4',
u'ΠΠΆΡΠΉΡΠΎΠ½',
get_most_obscure_supported_name(),
])
@pytest.mark.parametrize("use_ssl,auth", [
(False, None),
(True, None),
(False, ('ernie', 'bert')),
])
def test_serve_path_via_http(test_fpath, use_ssl, auth):
_test_serve_path_via_http(test_fpath, use_ssl, auth)
def test_serve_path_via_http_local_proxy():
# just with the last one check that we did remove proxy setting
with patch.dict('os.environ', {'http_proxy': 'http://127.0.0.1:9/'}):
_test_serve_path_via_http(get_most_obscure_supported_name(), False, None)
@known_failure_githubci_win
def test_without_http_proxy():
@without_http_proxy
def check(a, kw=False):
assert_false('http_proxy' in os.environ)
assert_false('https_proxy' in os.environ)
assert_in(kw, [False, 'custom'])
check(1)
with patch.dict('os.environ', {'http_proxy': 'http://127.0.0.1:9/'}):
check(1)
check(1, "custom")
with assert_raises(AssertionError):
check(1, "wrong")
with patch.dict('os.environ', {'https_proxy': 'http://127.0.0.1:9/'}):
check(1)
with patch.dict('os.environ', {'http_proxy': 'http://127.0.0.1:9/',
'https_proxy': 'http://127.0.0.1:9/'}):
check(1)
def test_assert_re_in():
assert_re_in(".*", "")
assert_re_in(".*", ["any"])
# should do match not search
assert_re_in("ab", "abc")
assert_raises(AssertionError, assert_re_in, "ab", "cab")
assert_raises(AssertionError, assert_re_in, "ab$", "abc")
# Sufficient to have one entry matching
assert_re_in("ab", ["", "abc", "laskdjf"])
assert_raises(AssertionError, assert_re_in, "ab$", ["ddd", ""])
# Tuples should be ok too
assert_re_in("ab", ("", "abc", "laskdjf"))
assert_raises(AssertionError, assert_re_in, "ab$", ("ddd", ""))
# shouldn't "match" the empty list
assert_raises(AssertionError, assert_re_in, "", [])
def test_skip_if_no_network():
cleaned_env = os.environ.copy()
cleaned_env.pop('DATALAD_TESTS_NONETWORK', None)
# we need to run under cleaned env to make sure we actually test in both conditions
with patch('os.environ', cleaned_env):
@skip_if_no_network
def somefunc(a1):
return a1
#ok_(hasattr(somefunc, "network"))
with patch_config({'datalad.tests.nonetwork': '1'}):
assert_raises(Skipped, somefunc, 1)
with patch.dict('os.environ', {}):
eq_(somefunc(1), 1)
# and now if used as a function, not a decorator
with patch_config({'datalad.tests.nonetwork': '1'}):
assert_raises(Skipped, skip_if_no_network)
with patch.dict('os.environ', {}):
eq_(skip_if_no_network(), None)
def test_skip_if_no_module():
def testish():
skip_if_no_module("nonexistingforsuremodule")
raise ValueError
assert_raises(Skipped, testish)
def testish2():
skip_if_no_module("datalad")
return "magic"
eq_(testish2(), "magic")
def test_skip_if():
with assert_raises(Skipped):
@skip_if(True)
def f(): # pragma: no cover
raise AssertionError("must have not been ran")
f()
@skip_if(False)
def f():
return "magical"
eq_(f(), 'magical')
@assert_cwd_unchanged
@with_tempfile(mkdir=True)
def test_run_under_dir(d=None):
orig_pwd = getpwd()
orig_cwd = os.getcwd()
@run_under_dir(d)
def f(arg, kwarg=None):
eq_(arg, 1)
eq_(kwarg, 2)
eq_(getpwd(), d)
f(1, 2)
eq_(getpwd(), orig_pwd)
eq_(os.getcwd(), orig_cwd)
# and if fails
assert_raises(AssertionError, f, 1, 3)
eq_(getpwd(), orig_pwd)
eq_(os.getcwd(), orig_cwd)
def test_assert_dict_equal():
assert_dict_equal({}, {})
assert_dict_equal({"a": 3}, {"a": 3})
assert_raises(AssertionError, assert_dict_equal, {1: 3}, {1: 4})
assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4})
assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4, 1: 3})
assert_raises(AssertionError, assert_dict_equal, {1: 3}, {2: 4, 1: 'a'})
try:
import numpy as np
except: # pragma: no cover
pytest.skip("need numpy for this tiny one")
# one is scalar another one array
assert_raises(AssertionError, assert_dict_equal, {1: 0}, {1: np.arange(1)})
assert_raises(AssertionError, assert_dict_equal, {1: 0}, {1: np.arange(3)})
def test_assert_str_equal():
assert_str_equal("a", "a")
assert_str_equal("a\n", "a\n")
assert_str_equal("a\nb", "a\nb")
assert_raises(AssertionError, assert_str_equal, "a", "a\n")
assert_raises(AssertionError, assert_str_equal, "a", "b")
assert_raises(AssertionError, assert_str_equal, "ab", "b")
def test_testsui():
# just one for now to test conflicting arguments
with assert_raises(ValueError):
@with_testsui(responses='some', interactive=False)
def some_func(): # pragma: no cover
pass
from datalad.ui import ui
@with_testsui(responses=['yes', "maybe so"])
def func2(x):
assert x == 1
eq_(ui.yesno("title"), True)
eq_(ui.question("title2"), "maybe so")
assert_raises(AssertionError, ui.question, "asking more than we know")
return x*2
eq_(func2(1), 2)
@with_testsui(interactive=False)
def func3(x):
assert_false(ui.is_interactive)
return x*3
eq_(func3(2), 6)
def test_setup():
# just verify that we monkey patched consts correctly
from datalad.consts import DATASETS_TOPURL
eq_(DATASETS_TOPURL, 'https://datasets-tests.datalad.org/')
from datalad.tests.utils_pytest import get_datasets_topdir
eq_(get_datasets_topdir(), 'datasets-tests.datalad.org')
def test_skip_ssh():
with patch_config({'datalad.tests.ssh': False}):
with assert_raises(Skipped):
skip_ssh(lambda: False)()
def test_probe_known_failure():
# should raise assert error if function no longer fails
with patch_config({'datalad.tests.knownfailures.probe': True}):
with assert_raises(Failed):
probe_known_failure(lambda: True)()
with patch_config({'datalad.tests.knownfailures.probe': False}):
ok_(probe_known_failure(lambda: True))
def test_ignore_nose_capturing_stdout():
# Just test the logic, not really a situation under overwritten stdout
def raise_exc():
raise AttributeError('nose causes a message which includes words '
'StringIO and fileno')
with assert_raises(AttributeError):
ignore_nose_capturing_stdout(raise_exc)()
@skip_wo_symlink_capability
@with_tree(tree={'ingit': '', 'staged': 'staged', 'notingit': ''})
def test_ok_file_under_git_symlinks(path=None):
# Test that works correctly under symlinked path
orepo = GitRepo(path)
orepo.add('ingit')
orepo.commit('msg')
orepo.add('staged')
lpath = path + "-symlink" # will also be removed AFAIK by our tempfile handling
Path(lpath).symlink_to(Path(path))
ok_symlink(lpath)
ok_file_under_git(op.join(path, 'ingit'))
ok_file_under_git(op.join(lpath, 'ingit'))
ok_file_under_git(op.join(lpath, 'staged'))
with assert_raises(AssertionError):
ok_file_under_git(op.join(lpath, 'notingit'))
with assert_raises(AssertionError):
ok_file_under_git(op.join(lpath, 'nonexisting'))
def test_assert_raises():
# rudimentary test of assert_raises shim prompted by suspicion in
# https://github.com/datalad/datalad/issues/6846#issuecomment-1363878497
def raise_ValueError():
raise ValueError("exc ValueError")
def raise_TypeError():
raise TypeError("exc TypeError")
# Test both forms of use
with assert_raises(ValueError):
raise_ValueError()
assert_raises(ValueError, raise_ValueError)
# can we specify multiple in a tuple?
with assert_raises((ValueError, TypeError)):
raise_ValueError()
with assert_raises((ValueError, TypeError)):
raise_TypeError()
assert_raises((ValueError, TypeError), raise_TypeError)
assert_raises((ValueError, TypeError), raise_ValueError)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_utils.py 0000644 0001751 0001751 00000131714 15137634221 020074 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# -*- coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Test testing utilities
"""
import inspect
import logging
import os
import os.path as op
import shutil
import stat
import sys
import time
from functools import wraps
from operator import itemgetter
from os.path import (
abspath,
basename,
dirname,
exists,
expanduser,
expandvars,
isabs,
)
from os.path import join as opj
from os.path import (
normpath,
pardir,
)
from unittest.mock import patch
import pytest
from datalad import cfg as dl_cfg
from datalad.support.annexrepo import AnnexRepo
from datalad.utils import (
CMD_MAX_ARG,
Path,
_path_,
any_re_search,
auto_repr,
better_wraps,
chpwd,
create_tree,
disable_logger,
dlabspath,
ensure_write_permission,
expandpath,
file_basename,
find_files,
generate_chunks,
get_dataset_root,
get_open_files,
get_path_prefix,
get_sig_param_names,
get_timestamp_suffix,
get_trace,
getargspec,
getpwd,
import_module_from_file,
import_modules,
is_explicit_path,
is_interactive,
join_cmdline,
knows_annex,
line_profile,
make_tempfile,
map_items,
md5sum,
never_fail,
not_supported_on_windows,
obtain_write_permission,
on_windows,
partition,
path_is_subpath,
path_startswith,
rotree,
split_cmdline,
swallow_logs,
swallow_outputs,
todo_interface_for_extensions,
unique,
unlink,
updated,
)
from .utils_pytest import (
OBSCURE_FILENAME,
SkipTest,
assert_cwd_unchanged,
assert_equal,
assert_false,
assert_greater,
assert_in,
assert_not_in,
assert_raises,
assert_true,
ensure_bool,
ensure_dict_from_str,
ensure_iter,
ensure_list,
ensure_list_from_str,
ensure_unicode,
eq_,
has_symlink_capability,
known_failure,
nok_,
ok_,
ok_file_has_content,
ok_generator,
ok_startswith,
on_travis,
probe_known_failure,
skip_if,
skip_if_no_module,
skip_if_on_windows,
skip_if_root,
skip_known_failure,
skip_wo_symlink_capability,
with_tempfile,
with_tree,
)
def test_better_wraps():
def wraps_decorator(func):
@wraps(func)
def _wrap_wraps_decorator(*args, **kwargs):
return func(*args, **kwargs)
return _wrap_wraps_decorator
def better_decorator(func):
@better_wraps(func)
def _wrap_better_decorator(*args, **kwargs):
return func(*args, **kwargs)
return _wrap_better_decorator
@wraps_decorator
def function1(a, b, c):
return "function1"
@better_decorator
def function2(a, b, c):
return "function2"
eq_("function1", function1(1, 2, 3))
# getargspec shim now can handle @wraps'ed functions just fine
eq_(getargspec(function1)[0], ['a', 'b', 'c'])
eq_("function2", function2(1, 2, 3))
eq_(getargspec(function2)[0], ['a', 'b', 'c'])
# TODO?: make again parametric on eq_argspec invocations?
@pytest.mark.filterwarnings(r"ignore: inspect.getargspec\(\) is deprecated")
def test_getargspec():
def eq_argspec(f, expected, has_kwonlyargs=False):
"""A helper to centralize testing of getargspec on original and wrapped function
has_kwonlyargs is to instruct if function has kwonly args so we do not try to compare
to inspect.get*spec functions, which would barf ValueError if attempted to run on a
function with kwonlys. And also we pass it as include_kwonlyargs to our getargspec
"""
# so we know that our expected is correct
if not has_kwonlyargs:
# if False - we test function with kwonlys - inspect.getargspec would barf
if sys.version_info < (3, 11):
eq_(inspect.getargspec(f), expected)
# and getfullargspec[:4] wouldn't provide a full picture
eq_(inspect.getfullargspec(f)[:4], expected)
else:
if sys.version_info < (3, 11):
assert_raises(ValueError, inspect.getargspec, f)
inspect.getfullargspec(f) # doesn't barf
eq_(getargspec(f, include_kwonlyargs=has_kwonlyargs), expected)
# and lets try on a wrapped one -- only ours can do the right thing
def decorator(f):
@wraps(f)
def wrapper(*args, **kwargs): # pragma: no cover
return f(*args, **kwargs)
return wrapper
fw = decorator(f)
if has_kwonlyargs:
# We barf ValueError similarly to inspect.getargspec, unless explicitly requested
# to include kwonlyargs
assert_raises(ValueError, getargspec, fw)
eq_(getargspec(fw, include_kwonlyargs=has_kwonlyargs), expected)
def f0(): # pragma: no cover
pass
eq_argspec(f0, ([], None, None, None))
def f1(a1, kw1=None, kw0=1): # pragma: no cover
pass
eq_argspec(f1, (['a1', 'kw1', 'kw0'], None, None, (None, 1)))
# Having *a already makes keyword args to be kwonlyargs, in that
# inspect.get*spec would barf
def f1_args(a1, *a, kw1=None, kw0=1, **kw): # pragma: no cover
pass
eq_argspec(f1_args, (['a1', 'kw1', 'kw0'], 'a', 'kw', (None, 1)), True)
def f1_star(a1, *, kw1=None, kw0=1): # pragma: no cover
pass
assert_raises(ValueError, getargspec, f1_star)
eq_argspec(f1_star, (['a1', 'kw1', 'kw0'], None, None, (None, 1)), True)
def test_get_sig_param_names():
def f(a1, kw1=None, *args, kw2=None, **kwargs):
pass # pragma: no cover
# note: `a1` could be used either positionally or via keyword, so is listed in kw_any
assert_equal(get_sig_param_names(f, ('kw_only', 'kw_any')), (['kw2'], ['a1', 'kw1', 'kw2']))
assert_equal(get_sig_param_names(f, ('any',)), (['a1', 'kw1', 'kw2'],))
assert_equal(get_sig_param_names(f, tuple()), ())
assert_raises(ValueError, get_sig_param_names, f, ('mumba',))
@with_tempfile(mkdir=True)
def test_rotree(d=None):
d2 = opj(d, 'd1', 'd2') # deep nested directory
f = opj(d2, 'f1')
os.makedirs(d2)
with open(f, 'w') as f_:
f_.write("LOAD")
with swallow_logs():
ar = AnnexRepo(d2)
rotree(d)
# we shouldn't be able to delete anything UNLESS in "crippled" situation:
# root, or filesystem is FAT etc
# Theoretically annex should declare FS as crippled when ran as root, but
# see http://git-annex.branchable.com/bugs/decides_that_FS_is_crippled_
# under_cowbuilder___40__symlinks_supported_etc__41__/#comment-60c3cbe2710d6865fb9b7d6e247cd7aa
# so explicit 'or'
if not (ar.is_crippled_fs() or (os.getuid() == 0)):
assert_raises(OSError, os.unlink, f) # OK to use os.unlink
assert_raises(OSError, unlink, f) # and even with waiting and trying!
assert_raises(OSError, shutil.rmtree, d)
# but file should still be accessible
with open(f) as f_:
eq_(f_.read(), "LOAD")
# make it RW
rotree(d, False)
unlink(f)
shutil.rmtree(d)
def test_swallow_outputs():
with swallow_outputs() as cm:
eq_(cm.out, '')
sys.stdout.write("out normal")
sys.stderr.write("out error")
eq_(cm.out, 'out normal')
sys.stdout.write(" and more")
eq_(cm.out, 'out normal and more') # incremental
eq_(cm.err, 'out error')
eq_(cm.err, 'out error') # the same value if multiple times
@with_tempfile
def test_swallow_logs(logfile=None):
lgr = logging.getLogger('datalad')
with swallow_logs(new_level=9) as cm:
eq_(cm.out, '')
lgr.log(8, "very heavy debug")
eq_(cm.out, '') # not even visible at level 9
lgr.log(9, "debug1")
eq_(cm.out, '[Level 9] debug1\n') # not even visible at level 9
lgr.info("info")
# not even visible at level 9
eq_(cm.out, '[Level 9] debug1\n[INFO] info\n')
with swallow_logs(new_level=9, file_=logfile) as cm:
eq_(cm.out, '')
lgr.info("next info")
from datalad.tests.utils_pytest import ok_file_has_content
ok_file_has_content(logfile, "[INFO] next info", strip=True)
def test_swallow_logs_assert():
lgr = logging.getLogger('datalad.tests')
with swallow_logs(new_level=9) as cm:
# nothing was logged so should fail
assert_raises(AssertionError, cm.assert_logged)
lgr.info("something")
cm.assert_logged("something")
cm.assert_logged(level="INFO")
cm.assert_logged("something", level="INFO")
# even with regex = False should match above
cm.assert_logged("something", regex=False)
cm.assert_logged(level="INFO", regex=False)
cm.assert_logged("something", level="INFO", regex=False)
# different level
assert_raises(AssertionError,
cm.assert_logged, "something", level="DEBUG")
assert_raises(AssertionError, cm.assert_logged, "else")
cm.assert_logged("some.hing", level="INFO") # regex ;-)
# does match
assert_raises(AssertionError,
cm.assert_logged, "ome.hing", level="INFO")
# but we can change it
cm.assert_logged("some.hing", level="INFO", match=False)
# and we can continue doing checks after we left the cm block
cm.assert_logged("some.hing", level="INFO", match=False)
# and we indeed logged something
cm.assert_logged(match=False)
def test_disable_logger():
# get a logger hierarchy:
lgr_top = logging.getLogger('datalad')
lgr_middle = logging.getLogger('datalad.tests')
lgr_bottom = logging.getLogger('datalad.tests.utils_pytest')
with swallow_logs(new_level=logging.DEBUG) as cml:
with disable_logger(): # default: 'datalad':
lgr_top.debug("log sth at top level")
lgr_middle.debug("log sth at mid level")
lgr_bottom.debug("log sth at bottom level")
# nothing logged:
assert_raises(AssertionError, cml.assert_logged)
# again, but pass in the logger at mid level:
with swallow_logs(new_level=logging.DEBUG) as cml:
with disable_logger(lgr_middle):
lgr_top.debug("log sth at top level")
lgr_middle.debug("log sth at mid level")
lgr_bottom.debug("log sth at bottom level")
# top level unaffected:
cml.assert_logged("log sth at top level", level="DEBUG", regex=False)
# but both of the lower ones don't log anything:
assert_raises(AssertionError, cml.assert_logged, "log sth at mid level")
assert_raises(AssertionError, cml.assert_logged, "log sth at bottom level")
def test_md5sum():
# just a smoke (encoding/decoding) test for md5sum
_ = md5sum(__file__)
@with_tree([('1.tar.gz', (('1 f.txt', '1 f load'),))])
def test_md5sum_archive(d=None):
# just a smoke (encoding/decoding) test for md5sum
_ = md5sum(opj(d, '1.tar.gz'))
def test_updated():
d = {}
eq_(updated(d, {1: 2}), {1: 2})
eq_(d, {})
d = {'a': 'b'}
eq_(updated(d, ((0, 1), (2, 3))), {0: 1, 'a': 'b', 2: 3})
eq_(d, {'a': 'b'})
# and that it would maintain the type
d = dict(((99, 0), ('z', 0), ('a', 0)))
d_ = updated(d, {0: 1})
ok_(isinstance(d_, dict))
eq_(d_, dict(((99, 0), ('z', 0), ('a', 0), (0, 1))))
def test_get_local_file_url_windows():
raise SkipTest("TODO")
@assert_cwd_unchanged
def test_getpwd_basic():
pwd = getpwd()
ok_(isabs(pwd))
eq_(os.getcwd(), abspath(pwd))
# that we do not chdir anywhere if None provided
with patch('os.chdir') as oschdir:
with chpwd(None):
eq_(getpwd(), pwd)
assert_false(oschdir.called)
@with_tempfile(mkdir=True)
@assert_cwd_unchanged(ok_to_chdir=True)
def test_getpwd_change_mode(tdir=None):
from datalad import utils
if utils._pwd_mode != 'PWD':
raise SkipTest("Makes sense to be tested only in PWD mode, "
"but we seems to be beyond that already")
# The evil plain chdir call
os.chdir(tdir)
# Just testing the logic of switching to cwd mode and issuing a warning
with swallow_logs(new_level=logging.DEBUG) as cml:
pwd = getpwd()
eq_(pwd, str(Path(pwd).resolve())) # might have symlinks, thus realpath
assert_in("symlinks in the paths will be resolved", cml.out)
eq_(utils._pwd_mode, 'cwd')
@skip_wo_symlink_capability
@skip_if_on_windows
@with_tempfile(mkdir=True)
@assert_cwd_unchanged
def test_getpwd_symlink(tdir=None):
sdir = opj(tdir, 's1')
pwd_orig = getpwd()
Path(sdir).symlink_to(Path('.'))
s1dir = opj(sdir, 's1')
s2dir = opj(sdir, 's2')
try:
chpwd(sdir)
pwd = getpwd()
eq_(pwd, sdir)
chpwd('s1')
eq_(getpwd(), s1dir)
chpwd('.')
eq_(getpwd(), s1dir)
chpwd('..')
eq_(getpwd(), sdir)
finally:
chpwd(pwd_orig)
# test context handler way of use
with chpwd(s1dir):
eq_(getpwd(), s1dir)
eq_(getpwd(), pwd_orig)
assert_false(exists(s2dir))
with assert_raises(OSError):
with chpwd(s2dir):
pass
with chpwd(s2dir, mkdir=True):
ok_(exists(s2dir))
eq_(getpwd(), s2dir)
@with_tempfile(mkdir=True)
def test_chpwd_obscure_name(topdir=None):
path = op.join(topdir, OBSCURE_FILENAME)
os.mkdir(path)
# Just check that call doesn't fail.
with chpwd(path):
pass
def test_auto_repr():
class WithoutReprClass:
def __init__(self):
self.a = "does not matter"
@auto_repr
class buga:
def __init__(self):
self.a = 1
self.b = list(range(20))
self.c = WithoutReprClass()
self._c = "protect me"
def some(self):
return "some"
@auto_repr(short=False)
class buga_long(object):
def __init__(self):
self.a = 1
self.b = list(range(20))
def some(self):
return "some"
assert_equal(
repr(buga()),
"buga(a=1, b=<<[0, 1, 2, 3, 4++52 chars++ 19]>>, c=)"
)
assert_equal(buga().some(), "some")
assert_equal(
repr(buga_long()),
f"buga_long(a=1, b=[{', '.join(map(str, range(20)))}])"
)
assert_equal(buga_long().some(), "some")
def test_todo_interface_for_extensions():
@todo_interface_for_extensions
def f(i, j):
return i*j
assert_equal(f(2, 3), 6)
def test_assure_iter():
s = {1}
assert ensure_iter(None, set) == set()
assert ensure_iter(1, set) == s
assert ensure_iter(1, list) == [1]
assert ensure_iter(s, set) is s
assert ensure_iter(s, set, copy=True) is not s
def test_assure_list_copy():
l = [1]
assert ensure_list(l) is l
assert ensure_list(l, copy=True) is not l
@pytest.mark.parametrize(
"value,result",
[
('', None),
([], None),
('somestring', ['somestring']),
('some\nmultiline\nstring', ['some', 'multiline', 'string']),
(['something'], ['something']),
(['a', 'listof', 'stuff'], ['a', 'listof', 'stuff']),
]
)
def test_assure_list_from_str(value, result):
assert ensure_list_from_str(value) == result
def test_assure_dict_from_str():
assert_equal(ensure_dict_from_str(''), None)
assert_equal(ensure_dict_from_str({}), None)
target_dict = dict(
__ac_name='{user}', __ac_password='{password}',
cookies_enabled='', submit='Log in'
)
string = '__ac_name={user}\n__ac_password={password}\nsubmit=Log ' \
'in\ncookies_enabled='
assert_equal(ensure_dict_from_str(string), target_dict)
assert_equal(ensure_dict_from_str(
target_dict),
target_dict)
def test_assure_bool():
for values, t in [
(['True', 1, '1', 'yes', 'on'], True),
(['False', 0, '0', 'no', 'off'], False)
]:
for v in values:
eq_(ensure_bool(v), t)
assert_raises(ValueError, ensure_bool, "unknown")
def test_generate_chunks():
ok_generator(generate_chunks([1], 1))
eq_(list(generate_chunks([1], 1)), [[1]])
eq_(list(generate_chunks([1], 2)), [[1]])
eq_(list(generate_chunks([1, 2, 3], 2)), [[1, 2], [3]])
# type is preserved
eq_(list(generate_chunks((1, 2, 3), 2)), [(1, 2), (3,)])
# no hangers
eq_(list(generate_chunks((1, 2, 3, 4), 2)), [(1, 2), (3, 4)])
assert_raises(AssertionError, list, generate_chunks([1], 0))
def test_any_re_search():
assert_true(any_re_search('a', 'a'))
assert_true(any_re_search('a', 'bab'))
assert_false(any_re_search('^a', 'bab'))
assert_true(any_re_search(['b', '.ab'], 'bab'))
assert_false(any_re_search(['^b', 'bab'], 'ab'))
def test_find_files():
tests_dir = dirname(__file__)
proj_dir = normpath(opj(dirname(__file__), pardir))
ff = find_files('.*', proj_dir)
ok_generator(ff)
files = list(ff)
assert(len(files) > 10) # we have more than 10 test files here
assert_in(opj(tests_dir, 'test_utils.py'), files)
# and no directories should be mentioned
assert_not_in(tests_dir, files)
ff2 = find_files('.*', proj_dir, dirs=True)
files2 = list(ff2)
assert_in(opj(tests_dir, 'test_utils.py'), files2)
assert_in(tests_dir, files2)
# now actually matching the path
ff3 = find_files(
r'.*\\test_.*\.py$' if on_windows else r'.*/test_.*\.py$',
proj_dir, dirs=True)
files3 = list(ff3)
assert_in(opj(tests_dir, 'test_utils.py'), files3)
assert_not_in(tests_dir, files3)
for f in files3:
ok_startswith(basename(f), 'test_')
@with_tree(tree={
'.git': {
'1': '2'
},
'd1': {
'.git': 'possibly a link from submodule'
},
'git': 'just a file'
})
def test_find_files_exclude_vcs(repo=None):
ff = find_files('.*', repo, dirs=True)
files = list(ff)
assert_equal({basename(f) for f in files}, {'d1', 'git'})
assert_not_in(opj(repo, '.git'), files)
ff = find_files('.*', repo, dirs=True, exclude_vcs=False)
files = list(ff)
assert_equal({basename(f) for f in files}, {'d1', 'git', '.git', '1'})
assert_in(opj(repo, '.git'), files)
def test_not_supported_on_windows():
with patch('datalad.utils.on_windows', True):
assert_raises(NotImplementedError, not_supported_on_windows)
assert_raises(NotImplementedError, not_supported_on_windows, "msg")
with patch('datalad.utils.on_windows', False):
assert_equal(not_supported_on_windows(), None)
assert_equal(not_supported_on_windows("msg"), None)
def test_file_basename():
eq_(file_basename('1'), '1')
eq_(file_basename('d1/1'), '1')
eq_(file_basename('/d1/1'), '1')
eq_(file_basename('1.'), '1.')
eq_(file_basename('1.tar.gz'), '1')
eq_(file_basename('1.Tar.gz'), '1')
eq_(file_basename('1._bak.gz'), '1')
eq_(file_basename('1.tar.gz', return_ext=True), ('1', 'tar.gz'))
eq_(file_basename('/tmp/1.tar.gz'), '1')
eq_(file_basename('/tmp/1.longish.gz'), '1.longish')
eq_(file_basename('1_R1.1.1.tar.gz'), '1_R1.1.1')
eq_(file_basename('ds202_R1.1.1.tgz'), 'ds202_R1.1.1')
def test_expandpath():
eq_(expandpath("some", False), expanduser('some'))
eq_(expandpath("some", False), expandvars('some'))
assert_true(isabs(expandpath('some')))
# this may have to go because of platform issues
if not on_windows:
# expanduser is not influenced by our HOME setting adjustments
# for the tests on windows
eq_(expandpath("$HOME"), expanduser('~'))
def test_is_explicit_path():
# by default expanded paths are absolute, hence explicit
assert_true(is_explicit_path(expandpath('~')))
assert_false(is_explicit_path("here"))
@with_tempfile
@with_tempfile
def test_knows_annex(here=None, there=None):
from datalad.support.annexrepo import AnnexRepo
from datalad.support.gitrepo import GitRepo
GitRepo(path=here, create=True)
assert_false(knows_annex(here))
AnnexRepo(path=here, create=True)
assert_true(knows_annex(here))
GitRepo.clone(path=there, url=here, create=True)
assert_true(knows_annex(there))
def test_make_tempfile():
# check if mkdir, content conflict caught
with assert_raises(ValueError):
with make_tempfile(content="blah", mkdir=True): # pragma: no cover
pass
def test_unique():
eq_(unique(range(3)), [0, 1, 2])
eq_(unique(range(3), reverse=True), [0, 1, 2])
eq_(unique((1, 0, 1, 3, 2, 0, 1)), [1, 0, 3, 2])
eq_(unique((1, 0, 1, 3, 2, 0, 1), reverse=True), [3, 2, 0, 1])
eq_(unique([]), [])
eq_(unique([], reverse=True), [])
eq_(unique([(1, 2), (1,), (1, 2), (0, 3)]), [(1, 2), (1,), (0, 3)])
eq_(unique([(1, 2), (1,), (1, 2), (0, 3)], reverse=True),
[(1,), (1, 2), (0, 3)])
# with a key now
eq_(unique([(1, 2), (1,), (1, 2), (0, 3)],
key=itemgetter(0)), [(1, 2), (0, 3)])
eq_(unique([(1, 2), (1,), (1, 2), (0, 3)],
key=itemgetter(0), reverse=True), [(1, 2), (0, 3)])
eq_(unique([(1, 2), (1, 3), (1, 2), (0, 3)],
key=itemgetter(1)), [(1, 2), (1, 3)])
eq_(unique([(1, 2), (1, 3), (1, 2), (0, 3)],
key=itemgetter(1), reverse=True), [(1, 2), (0, 3)])
def test_partition():
def fn(*args, **kwargs):
left, right = partition(*args, **kwargs)
return list(left), list(right)
eq_(fn([False, True, False]),
([False, False], [True]))
eq_(fn([1, 5, 4, 10], lambda x: x > 4),
([1, 4], [5, 10]))
eq_(fn([1, 5, 4, 10], lambda x: x < 0),
([1, 5, 4, 10], []))
def test_path_():
eq_(_path_('a'), 'a')
if on_windows:
eq_(_path_('a/b'), r'a\b')
else:
p = 'a/b/c'
assert(_path_(p) is p) # nothing is done to it whatsoever
eq_(_path_(p, 'd'), 'a/b/c/d')
def test_get_timestamp_suffix():
# we need to patch temporarily TZ
with patch.dict('os.environ', {'TZ': 'GMT'}):
# figure out how GMT time zone suffix is represented
# could be +0 or -0, depending on platform
# just use whatever it is, not the subject of this test
tz_suffix = time.strftime('%z', time.gmtime(0))
# skynet DOB
target_ts = '1970-01-01T00:00:00' + tz_suffix
assert_equal(get_timestamp_suffix(0), '-' + target_ts)
assert_equal(get_timestamp_suffix(0, prefix="+"),
'+' + target_ts)
# yoh found no way to mock things out and didn't want to provide
# explicit call to anything to get current time with the timezone,
# so disabling this test for now besides that it should return smth
# sensible ;)
#with patch.object(time, 'localtime', lambda: 1):
# assert_equal(get_timestamp_suffix(),
# '-1970-01-01T00:00:01+0000') # skynet is 1 sec old
assert(get_timestamp_suffix().startswith('-'))
def test_memoized_generator():
called = [0]
def g1(n):
"""a generator"""
called[0] += 1
for i in range(n):
yield i
from ..utils import saved_generator
ok_generator(g1(3))
g1_, g2_ = saved_generator(g1(3))
ok_generator(g1_)
ok_generator(g2_)
target = list(g1(3))
eq_(called[0], 1)
eq_(target, list(g1_))
eq_(called[0], 2)
eq_(target, list(g2_))
eq_(called[0], 2) # no new call to make a generator
# but we can't (ab)use 2nd time
eq_([], list(g2_))
def test_assure_unicode():
ok_(isinstance(ensure_unicode("m"), str))
ok_(isinstance(ensure_unicode('grandchild_Àâüζ±'), str))
ok_(isinstance(ensure_unicode(u'grandchild_Àâüζ±'), str))
eq_(ensure_unicode('grandchild_Àâüζ±'), u'grandchild_Àâüζ±')
# now, non-utf8
# Decoding could be deduced with high confidence when the string is
# really encoded in that codepage
mom_koi8r = u"ΠΌΠ°ΠΌΠ°".encode('koi8-r')
eq_(ensure_unicode(mom_koi8r), u"ΠΌΠ°ΠΌΠ°")
eq_(ensure_unicode(mom_koi8r, confidence=0.9), u"ΠΌΠ°ΠΌΠ°")
mom_iso8859 = u'mamΓ‘'.encode('iso-8859-1')
eq_(ensure_unicode(mom_iso8859), u'mamΓ‘')
eq_(ensure_unicode(mom_iso8859, confidence=0.5), u'mamΓ‘')
# but when we mix, it does still guess something allowing to decode:
mixedin = mom_koi8r + u'ζ±'.encode('iso2022_jp') + u'ζ±'.encode('utf-8')
ok_(isinstance(ensure_unicode(mixedin), str))
# but should fail if we request high confidence result:
with assert_raises(ValueError):
ensure_unicode(mixedin, confidence=0.9)
# For other, non string values, actually just returns original value
# TODO: RF to actually "assure" or fail?? For now hardcoding that assumption
assert ensure_unicode(1) == 1
def test_pathlib_unicode():
eq_(str(Path("a")), u"a")
eq_(str(Path(u"Ξ²")), u"Ξ²")
@with_tempfile(mkdir=True)
def test_path_prefix(path=None):
eq_(get_path_prefix(_p('/d1/d2'), _p('/d1/d2')), _p(''))
# so we are under /d1/d2 so path prefix is ..
eq_(get_path_prefix(_p('/d1/d2'), _p('/d1/d2/d3')), _p('..'))
eq_(get_path_prefix(_p('/d1/d2/d3'), _p('/d1/d2')), _p('d3'))
# but if outside -- full path
eq_(get_path_prefix(_p('/d1/d2'), _p('/d1/d20/d3')), _p('/d1/d2'))
with chpwd(path):
eq_(get_path_prefix('.'), '')
eq_(get_path_prefix('d1'), 'd1')
eq_(get_path_prefix('d1', 'd2'), opj(path, 'd1'))
eq_(get_path_prefix('..'), '..')
def test_get_trace():
assert_raises(ValueError, get_trace, [], 'boom', 'does_not_matter')
eq_(get_trace([('A', 'B')], 'A', 'A'), None)
eq_(get_trace([('A', 'B')], 'A', 'B'), [])
eq_(get_trace([('A', 'B')], 'A', 'C'), None)
eq_(get_trace([('A', 'B'),
('B', 'C')], 'A', 'C'), ['B'])
# order of edges doesn't matter
eq_(get_trace([
('B', 'C'),
('A', 'B')
], 'A', 'C'), ['B'])
# mixed rubbish
eq_(get_trace([
(1, 3),
('B', 'C'),
(None, ('schwak', 7)),
('A', 'B'),
], 'A', 'C'), ['B'])
# long
eq_(get_trace([
('B', 'C'),
('A', 'B'),
('distract', 'me'),
('C', 'D'),
('D', 'E'),
], 'A', 'E'), ['B', 'C', 'D'])
@with_tempfile(mkdir=True)
def test_get_dataset_root(path=None):
eq_(get_dataset_root('/nonexistent'), None)
with chpwd(path):
repo = AnnexRepo(os.curdir, create=True)
subdir = opj('some', 'deep')
fname = opj(subdir, 'dummy')
os.makedirs(subdir)
with open(fname, 'w') as f:
f.write('some')
repo.add(fname)
# we can find this repo
eq_(get_dataset_root(os.curdir), os.curdir)
# and we get the type of path that we fed in
eq_(get_dataset_root(abspath(os.curdir)), abspath(os.curdir))
# subdirs are no issue
eq_(get_dataset_root(subdir), os.curdir)
# even more subdirs are no issue
eq_(get_dataset_root(opj(subdir, subdir)), os.curdir)
# non-dir paths are no issue
eq_(get_dataset_root(fname), os.curdir)
def _p(p: str) -> str:
"""A helper to code paths as POSIX paths in tests below. Would prepend fake drive
C: to absolute paths on Windows"""
if on_windows:
pm = p.replace('/', os.sep)
if p.startswith('/'):
return f"C:{pm}"
else:
return pm
return p
def test_path_startswith():
ok_(path_startswith(_p('/a/b'), _p('/a')))
ok_(path_startswith(_p('/a/b'), _p('/a/b')))
ok_(path_startswith(_p('/a/b'), _p('/a/b/')))
ok_(path_startswith(_p('/a/b/'), _p('/a/b')))
ok_(path_startswith(_p('/a/b'), _p('/')))
ok_(path_startswith(_p('/aaa/b/c'), _p('/aaa')))
nok_(path_startswith(_p('/aaa/b/c'), _p('/aa')))
nok_(path_startswith(_p('/a/b'), _p('/a/c')))
nok_(path_startswith(_p('/a/b/c'), _p('/a/c')))
# must not mix relative and abs
assert_raises(ValueError, path_startswith, _p('a/b'), _p('/a'))
assert_raises(ValueError, path_startswith, _p('/a/b'), _p('a'))
def test_path_is_subpath():
ok_(path_is_subpath(_p('/a/b'), _p('/a')))
ok_(path_is_subpath(_p('/a/b/c'), _p('/a')))
nok_(path_is_subpath(_p('/a/b'), _p('/a/b')))
nok_(path_is_subpath(_p('/a/b'), _p('/a/b/')))
nok_(path_is_subpath(_p('/a/b/'), _p('/a/b')))
ok_(path_is_subpath(_p('/a/b'), _p('/')))
ok_(path_is_subpath(_p('/aaa/b/c'), _p('/aaa')))
nok_(path_is_subpath(_p('/aaa/b/c'), _p('/aa')))
nok_(path_is_subpath(_p('/a/b'), _p('/a/c')))
nok_(path_is_subpath(_p('/a/b/c'), _p('/a/c')))
# must not mix relative and abs
assert_raises(ValueError, path_is_subpath, _p('a/b'), _p('/a'))
assert_raises(ValueError, path_is_subpath, _p('/a/b'), _p('a'))
def test_probe_known_failure():
# Note: we can't test the switch "datalad.tests.knownfailures.probe"
# directly, since it was evaluated in the decorator already. So we need
# to have different assertions in this test based on config and have it
# tested across builds, which use different settings for that switch.
@probe_known_failure
def not_failing():
pass
@probe_known_failure
def failing():
raise AssertionError("Failed")
switch = dl_cfg.obtain("datalad.tests.knownfailures.probe")
if switch:
# if probing is enabled the failing is considered to be expected and
# therefore the decorated function doesn't actually fail:
failing()
# in opposition a function that doesn't fail raises an AssertionError:
assert_raises(AssertionError, not_failing)
else:
# if probing is disabled it should just fail/pass as is:
assert_raises(AssertionError, failing)
not_failing()
def test_skip_if():
def dummy():
raise AssertionError
assert_raises(AssertionError, dummy)
# if cond is False, call the decorated function:
assert_raises(AssertionError, skip_if(cond=False, method='raise')(dummy))
# raises SkipTest if cond is True
assert_raises(SkipTest, skip_if(cond=True, method='raise')(dummy))
# but with method 'pass', there is neither SkipTest nor AssertionError.
# Instead the function call is just skipped:
skip_if(cond=True, method='pass')(dummy)
# But if condition is False, the original function is still called:
assert_raises(AssertionError, skip_if(cond=False, method='pass')(dummy))
def test_skip_known_failure():
# Note: we can't test the switch "datalad.tests.knownfailures.skip"
# directly, since it was evaluated in the decorator already. So we need
# to have different assertions in this test based on config and have it
# tested across builds, which use different settings for that switch.
@skip_known_failure
def failing():
raise AssertionError("Failed")
switch = dl_cfg.obtain("datalad.tests.knownfailures.skip")
if switch:
# if skipping is enabled, we shouldn't see the exception:
failing()
else:
# if it's disabled, failing() is executed and therefore exception
# is raised:
assert_raises(AssertionError, failing)
def test_known_failure():
@known_failure
def failing():
raise AssertionError("Failed")
skip = dl_cfg.obtain("datalad.tests.knownfailures.skip")
probe = dl_cfg.obtain("datalad.tests.knownfailures.probe")
if skip:
# skipping takes precedence over probing
failing()
elif probe:
# if we probe a known failure it's okay to fail:
failing()
else:
# not skipping and not probing results in the original failure:
assert_raises(AssertionError, failing)
from datalad.utils import read_csv_lines
def test_known_failure_direct_mode():
# Decorator is deprecated now and that is what we check
from .utils_pytest import known_failure_direct_mode
x = []
with swallow_logs(new_level=logging.WARNING) as cml:
@known_failure_direct_mode
def failing():
x.append('ok')
raise AssertionError("Failed")
assert_raises(AssertionError, failing) # nothing is swallowed
eq_(x, ['ok']) # everything runs
assert_in("Direct mode support is deprecated", cml.out)
@with_tempfile(content="h1 h2\nv1 2\nv2 3")
def test_read_csv_lines_basic(infile=None):
# Just a basic test, next one with unicode
gen = read_csv_lines(infile)
ok_generator(gen)
eq_(
list(gen),
[
{u'h1': u'v1', u'h2': u'2'},
{u'h1': u'v2', u'h2': u'3'},
]
)
@with_tempfile(content=u"h1\th2\nv1\tΠ΄Π°ΡΠ°".encode('utf-8'))
def test_read_csv_lines_tsv_unicode(infile=None):
# Just a basic test, next one with unicode
gen = read_csv_lines(infile)
ok_generator(gen)
eq_(
list(gen),
[
{u'h1': u'v1', u'h2': u'Π΄Π°ΡΠ°'},
]
)
@with_tempfile(content=u"h1\nv1\nv2")
def test_read_csv_lines_one_column(infile=None):
# Just a basic test, next one with unicode
eq_(
list(read_csv_lines(infile)),
[
{u'h1': u'v1'},
{u'h1': u'v2'},
]
)
def _get_testm_tree(ind):
"""Generate a fake package with submodules
We need to increment index for different tests since otherwise e.g.
import_modules fails to import submodule if first import_module_from_file
imports that one
"""
return {
'dltestm%d' % ind: {
'__init__.py': '',
'dlsub1': {'__init__.py': 'var = 1'},
'dlsub2.py': 'var = 2'}
}
@with_tree(tree=_get_testm_tree(1))
def test_import_modules(topdir=None):
try:
sys.path.append(topdir)
mods = import_modules(['dlsub1', 'bogus'], 'dltestm1')
finally:
sys.path.pop(sys.path.index(topdir))
eq_(len(mods), 1)
eq_(mods[0].__name__, 'dltestm1.dlsub1')
@with_tree(tree=_get_testm_tree(2))
def test_import_module_from_file(topdir=None):
with assert_raises(AssertionError):
# we support only submodule files ending with .py ATM. TODO
import_module_from_file(op.join(topdir, 'dltestm2', 'dlsub1'))
dlsub2_path = op.join(topdir, 'dltestm2', 'dlsub2.py')
mod = import_module_from_file(dlsub2_path)
eq_(mod.__name__, 'dlsub2') # we are not asking to import as submod of the dltestm1
assert_in('dlsub2', sys.modules)
try:
sys.path.append(topdir)
import dltestm2
mod = import_module_from_file(dlsub2_path, pkg=dltestm2)
eq_(mod.__name__, 'dltestm2.dlsub2')
assert_in('dltestm2.dlsub2', sys.modules)
finally:
sys.path.pop(sys.path.index(topdir))
def test_import_modules_fail():
# test that we log failures correctly
failures = []
import_modules(['bogus'], 'datalad', 'Fail {package}.{module}', failures.append)
eq_(len(failures), 1)
ok_startswith(failures[0], "Fail datalad.bogus: No module")
# Should be the last one since as discovered in NICEMAN might screw up coverage
def test_line_profile():
skip_if_no_module('line_profiler')
@line_profile
def f(j):
i = j + 1 # xyz
return i
with swallow_outputs() as cmo:
assert_equal(f(3), 4)
assert_equal(cmo.err, '')
assert_in('i = j + 1 # xyz', cmo.out)
@with_tempfile(mkdir=True)
def test_dlabspath(path=None):
if not has_symlink_capability():
raise SkipTest
# initially ran into on OSX https://github.com/datalad/datalad/issues/2406
opath = opj(path, "origin")
os.makedirs(opath)
lpath = opj(path, "linked")
os.symlink('origin', lpath)
for d in opath, lpath:
# regardless under which directory, all results should not resolve
# anything
eq_(d, dlabspath(d))
# in the root of ds
with chpwd(d):
eq_(dlabspath("path"), opj(d, "path"))
eq_(dlabspath("./path"), opj(d, "./path")) # we do not normpath by default
eq_(dlabspath("./path", norm=True), opj(d, "path"))
@with_tree({'1': 'content', 'd': {'2': 'more'}})
def test_get_open_files(p=None):
pobj = Path(p)
skip_if_no_module('psutil')
eq_(get_open_files(p), {})
f1 = pobj / '1'
subd = pobj / 'd'
with f1.open() as f:
# since lsof does not care about PWD env var etc, paths
# will not contain symlinks, we better realpath them
# all before comparison
eq_(get_open_files(p, log_open=40)[str(f1.resolve())].pid,
os.getpid())
assert not get_open_files(str(subd))
if on_windows:
# the remainder of the test assume a certain performance.
# however, on windows get_open_files() can be very slow
# (e.g. the first invocation in this test (above) can easily
# take 30-50s). It is not worth slowing the tests to
# accommodate this issue, given we have tested proper functioning
# in principle already above).
return
# if we start a process within that directory, should get informed
from subprocess import (
PIPE,
Popen,
)
from time import time
t0 = time()
proc = Popen([sys.executable, '-c',
r'import sys; sys.stdout.write("OK\n"); sys.stdout.flush();'
r'import time; time.sleep(10)'],
stdout=PIPE,
cwd=str(subd))
# Assure that it started and we read the OK
eq_(ensure_unicode(proc.stdout.readline().strip()), u"OK")
assert time() - t0 < 5 # that we were not stuck waiting for process to finish
eq_(get_open_files(p)[str(subd.resolve())].pid, proc.pid)
eq_(get_open_files(subd)[str(subd.resolve())].pid, proc.pid)
proc.terminate()
assert_equal(get_open_files(str(subd)), {})
def test_map_items():
def add10(x):
return x + 10
eq_(map_items(add10, {2: 3}), {12: 13})
class Custom(object):
"""For testing with custom items possibly of varying length etc"""
def __init__(self, items):
self._items = list(items)
def items(self):
return self._items
c = Custom([(1,), (2, 3), (4, 5, 6)])
c_mapped = map_items(add10, c)
assert type(c) is type(c_mapped)
eq_(c_mapped.items(), [(11,), (12, 13), (14, 15, 16)])
def test_CMD_MAX_ARG():
# 100 is arbitrarily large small integer ;)
# if fails -- we are unlikely to be able to work on this system
# and something went really wrong!
assert_greater(CMD_MAX_ARG, 100)
@with_tempfile(mkdir=True)
def test_create_tree(path=None):
content = u"ΠΌΠ°ΠΌΠ° ΠΌΡΠ»Π° ΡΠ°ΠΌΡ"
create_tree(path, dict([
('1', content),
('sd', dict(
[
# right away an obscure case where we have both 1 and 1.gz
('1', content*2),
('1.gz', content*3),
('1.xz', content*4),
('1.lzma', content*5),
]
)),
]))
ok_file_has_content(op.join(path, '1'), content)
ok_file_has_content(op.join(path, 'sd', '1'), content*2)
ok_file_has_content(op.join(path, 'sd', '1.gz'), content*3, decompress=True)
ok_file_has_content(op.join(path, 'sd', '1.xz'), content*4, decompress=True)
ok_file_has_content(op.join(path, 'sd', '1.lzma'), content*5, decompress=True)
def test_never_fail():
@never_fail
def iamok(arg):
return arg
eq_(iamok(1), 1)
@never_fail
def ifail(arg):
raise ValueError
eq_(ifail(1), None)
with patch.dict('os.environ', {'DATALAD_ALLOW_FAIL': '1'}):
# decision to create failing or not failing function
# is done at the time of decoration
@never_fail
def ifail2(arg):
raise ValueError
assert_raises(ValueError, ifail2, 1)
@pytest.mark.xfail(reason="TODO: for some reason fails on Travis")
@with_tempfile
def test_is_interactive(fout=None):
# must not fail if one of the streams is no longer open:
# https://github.com/datalad/datalad/issues/3267
from datalad.cmd import (
KillOutput,
NoCapture,
StdOutErrCapture,
WitlessRunner,
)
from datalad.support.annexrepo import (
AnnexInitOutput,
AnnexJsonProtocol,
)
from datalad.support.gitrepo import GitProgress
bools = ["False", "True"]
def get_interactive(py_pre="", **run_kwargs):
out = WitlessRunner().run(
[sys.executable,
"-c",
py_pre +
'from datalad.utils import is_interactive; '
'f = open(%r, "w"); '
'f.write(str(is_interactive())); '
'f.close()'
% fout
],
**run_kwargs
)
with open(fout) as f:
out = f.read()
assert_in(out, bools)
return bool(bools.index(out))
# verify that NoCapture can make fully interactive execution
# happen, also test the core protocols
# (we can only be interactive in a runner, if the test execution
# itself happens in an interactive environment)
for proto, interactive in ((NoCapture,
# It is unclear why (on travis only) a child
# process can report to be interactive
# whenever the parent process is not.
# Maintain this test exception until
# someone can provide insight. The point of
# this test is to ensure that NoCapture
# in an interactive parent also keeps the
# child interactive, so this oddity is not
# relevant.
True if on_travis else is_interactive()),
(KillOutput, False),
(StdOutErrCapture, False),
(GitProgress, False),
(AnnexInitOutput, False),
(AnnexJsonProtocol, False)):
eq_(get_interactive(protocol=proto),
interactive,
msg='{} -> {}'.format(str(proto), interactive))
# and it must not crash if smth is closed
for o in ('stderr', 'stdin', 'stdout'):
eq_(get_interactive("import sys; sys.%s.close(); " % o), False)
def test_splitjoin_cmdline():
# Do full round trip on a number of tricky samples
for args in (
['cmd', '-o1', 'simple'],
['c o', r'\m', ''],
['c o', ' '],
):
cmdline = join_cmdline(args)
assert isinstance(cmdline, str)
eq_(split_cmdline(cmdline), args)
# assure that there is no needless quoting
if on_windows:
# in quote_cmdlinearg we always quote on Windows
eq_(join_cmdline(['abc', 'def']), '"abc" "def"')
else:
eq_(join_cmdline(['abc', 'def']), 'abc def')
@skip_if_root
@with_tempfile
def test_obtain_write_permission(path=None):
path = Path(path)
# there's nothing at path yet:
assert_raises(FileNotFoundError, obtain_write_permission, path)
# Revoke write permission
path.write_text("something")
path.chmod(path.stat().st_mode & ~stat.S_IWRITE)
assert_raises(PermissionError, path.write_text, "different thing")
# Obtain and try again:
obtain_write_permission(path)
path.write_text("different thing")
# Already having permission is no issue:
obtain_write_permission(path)
path.write_text("yet another thing")
@skip_if_root
@with_tempfile(mkdir=True)
def test_ensure_write_permission(path=None):
# This is testing the usecase of write protected directories needed for
# messing with an annex object tree (as done by the ORA special remote).
# However, that doesn't work on Windows since we can't revoke write
# permissions for the owner of a directory (at least on VFAT - may be
# true for NTFS as well - don't know).
# Hence, on windows/crippledFS only test on a file.
dir_ = Path(path)
if not on_windows and has_symlink_capability:
# set up write-protected dir containing a file
file_ = dir_ / "somefile"
file_.write_text("whatever")
dir_.chmod(dir_.stat().st_mode & ~stat.S_IWRITE)
assert_raises(PermissionError, file_.unlink)
# contextmanager lets us do it and restores permissions afterwards:
mode_before = dir_.stat().st_mode
with ensure_write_permission(dir_):
file_.unlink()
mode_after = dir_.stat().st_mode
assert_equal(mode_before, mode_after)
assert_raises(PermissionError, file_.write_text, "new file can't be "
"written")
assert_raises(FileNotFoundError, ensure_write_permission(dir_ /
"non" / "existent").__enter__)
# deletion within context doesn't let mode restoration fail:
with ensure_write_permission(dir_):
dir_.rmdir()
dir_.mkdir() # recreate, since next block is executed unconditionally
# set up write-protected file:
file2 = dir_ / "protected.txt"
file2.write_text("unchangeable")
file2.chmod(file2.stat().st_mode & ~stat.S_IWRITE)
assert_raises(PermissionError, file2.write_text, "modification")
# within context we can:
with ensure_write_permission(file2):
file2.write_text("modification")
# mode is restored afterwards:
assert_raises(PermissionError, file2.write_text, "modification2")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_utils_cached_dataset.py 0000644 0001751 0001751 00000030060 15137634221 023060 0 ustar 00runner runner """Testing cached test dataset utils"""
from unittest.mock import patch
from datalad.distribution.dataset import Dataset
from datalad.support.annexrepo import AnnexRepo
from datalad.support.gitrepo import GitRepo
from datalad.tests.utils_cached_dataset import (
cached_dataset,
cached_url,
get_cached_dataset,
url2filename,
)
from datalad.tests.utils_pytest import (
DEFAULT_REMOTE,
assert_equal,
assert_false,
assert_in,
assert_is,
assert_is_instance,
assert_not_equal,
assert_not_in,
assert_raises,
assert_result_count,
assert_true,
skip_if_no_network,
with_tempfile,
)
from datalad.utils import (
Path,
ensure_list,
opj,
)
CACHE_PATCH_STR = "datalad.tests.utils_cached_dataset.DATALAD_TESTS_CACHE"
CLONE_PATCH_STR = "datalad.tests.utils_cached_dataset.Clone.__call__"
@skip_if_no_network
@with_tempfile(mkdir=True)
def test_get_cached_dataset(cache_dir=None):
# patch DATALAD_TESTS_CACHE to not use the actual cache with
# the test testing that very cache.
cache_dir = Path(cache_dir)
# store file-based values for testrepo-minimalds for readability:
annexed_file = opj('inannex', 'animated.gif')
annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"
with patch(CACHE_PATCH_STR, new=cache_dir):
# tuples to test (url, version, keys, class):
test_cases = [
# a simple testrepo
("https://github.com/datalad/testrepo--minimalds",
"541cf855d13c2a338ff2803d4488daf0035e568f",
None,
AnnexRepo),
# Same repo, but request paths to be present. This should work
# with a subsequent call, although the first one did not already
# request any:
("https://github.com/datalad/testrepo--minimalds",
"9dd8b56cc706ab56185f2ceb75fbe9de9b606724",
annexed_file_key,
AnnexRepo),
# Same repo again, but invalid version
("https://github.com/datalad/testrepo--minimalds",
"nonexistent",
"irrelevantkey", # invalid version; don't even try to get the key
AnnexRepo),
# same thing with different name should be treated as a new thing:
("https://github.com/datalad/testrepo--minimalds",
"git-annex",
None,
AnnexRepo),
# try a plain git repo to make sure we can deal with that:
# Note, that we first need a test case w/o a `key` parameter to not
# blow up the test when Clone is patched, resulting in a MagicMock
# instead of a Dataset instance within get_cached_dataset. In the
# second case it's already cached then, so the patched Clone is
# never executed.
("https://github.com/datalad/datalad.org",
None,
None,
GitRepo),
("https://github.com/datalad/datalad.org",
"gh-pages",
"ignored-key", # it's a git repo; don't even try to get a key
GitRepo),
]
for url, version, keys, cls in test_cases:
target = cache_dir / url2filename(url)
# assuming it doesn't exist yet - patched cache dir!
in_cache_before = target.exists()
with patch(CLONE_PATCH_STR) as exec_clone:
try:
ds = get_cached_dataset(url, version, keys)
invalid_version = False
except AssertionError:
# should happen only if `version` wasn't found. Implies
# that the dataset exists in cache (although not returned
# due to exception)
assert_true(version)
assert_false(Dataset(target).repo.commit_exists(version))
# mark for later assertions (most of them should still hold
# true)
invalid_version = True
assert_equal(exec_clone.call_count, 0 if in_cache_before else 1)
# Patch prevents actual execution. Now do it for real. Note, that
# this might be necessary for content retrieval even if dataset was
# in cache before.
try:
ds = get_cached_dataset(url, version, keys)
except AssertionError:
# see previous call
assert_true(invalid_version)
assert_is_instance(ds, Dataset)
assert_true(ds.is_installed())
assert_equal(target, ds.pathobj)
assert_is_instance(ds.repo, cls)
if keys and not invalid_version and \
AnnexRepo.is_valid_repo(ds.path):
# Note: it's not supposed to get that content if passed
# `version` wasn't available. get_cached_dataset would then
# raise before and not download anything only to raise
# afterwards.
here = ds.config.get("annex.uuid")
where = ds.repo.whereis(ensure_list(keys), key=True)
assert_true(all(here in remotes for remotes in where))
# version check. Note, that all `get_cached_dataset` is supposed to
# do, is verifying, that specified version exists - NOT check it
# out"
if version and not invalid_version:
assert_true(ds.repo.commit_exists(version))
# re-execution
with patch(CLONE_PATCH_STR) as exec_clone:
try:
ds2 = get_cached_dataset(url, version, keys)
except AssertionError:
assert_true(invalid_version)
exec_clone.assert_not_called()
# returns the same Dataset as before:
assert_is(ds, ds2)
@skip_if_no_network
@with_tempfile(mkdir=True)
def test_cached_dataset(cache_dir=None):
# patch DATALAD_TESTS_CACHE to not use the actual cache with
# the test testing that very cache.
cache_dir = Path(cache_dir)
ds_url = "https://github.com/datalad/testrepo--minimalds"
name_in_cache = url2filename(ds_url)
annexed_file = Path("inannex") / "animated.gif"
with patch(CACHE_PATCH_STR, new=cache_dir):
@cached_dataset(url=ds_url)
def decorated_test1(ds):
# we get a Dataset instance
assert_is_instance(ds, Dataset)
# it's a clone in a temp. location, not within the cache
assert_not_in(cache_dir, ds.pathobj.parents)
assert_result_count(ds.siblings(), 1, type="sibling",
name=DEFAULT_REMOTE,
url=(cache_dir / name_in_cache).as_posix())
here = ds.config.get("annex.uuid")
origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
where = ds.repo.whereis(str(annexed_file))
assert_not_in(here, where)
assert_not_in(origin, where)
return ds.pathobj, ds.repo.pathobj
@cached_dataset(url=ds_url, paths=str(annexed_file))
def decorated_test2(ds):
# we get a Dataset instance
assert_is_instance(ds, Dataset)
# it's a clone in a temp. location, not within the cache
assert_not_in(cache_dir, ds.pathobj.parents)
assert_result_count(ds.siblings(), 1, type="sibling",
name=DEFAULT_REMOTE,
url=(cache_dir / name_in_cache).as_posix())
here = ds.config.get("annex.uuid")
origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
where = ds.repo.whereis(str(annexed_file))
assert_in(here, where)
assert_in(origin, where)
return ds.pathobj, ds.repo.pathobj
@cached_dataset(url=ds_url)
def decorated_test3(ds):
# we get a Dataset instance
assert_is_instance(ds, Dataset)
# it's a clone in a temp. location, not within the cache
assert_not_in(cache_dir, ds.pathobj.parents)
assert_result_count(ds.siblings(), 1, type="sibling",
name=DEFAULT_REMOTE,
url=(cache_dir / name_in_cache).as_posix())
# origin is the same cached dataset, that got this content in
# decorated_test2 before. Should still be there. But "here" we
# didn't request it
here = ds.config.get("annex.uuid")
origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
where = ds.repo.whereis(str(annexed_file))
assert_not_in(here, where)
assert_in(origin, where)
return ds.pathobj, ds.repo.pathobj
@cached_dataset(url=ds_url,
version="541cf855d13c2a338ff2803d4488daf0035e568f")
def decorated_test4(ds):
# we get a Dataset instance
assert_is_instance(ds, Dataset)
# it's a clone in a temp. location, not within the cache
assert_not_in(cache_dir, ds.pathobj.parents)
assert_result_count(ds.siblings(), 1, type="sibling",
name=DEFAULT_REMOTE,
url=(cache_dir / name_in_cache).as_posix())
# origin is the same cached dataset, that got this content in
# decorated_test2 before. Should still be there. But "here" we
# didn't request it
here = ds.config.get("annex.uuid")
origin = ds.config.get(f"remote.{DEFAULT_REMOTE}.annex-uuid")
where = ds.repo.whereis(str(annexed_file))
assert_not_in(here, where)
assert_in(origin, where)
assert_equal(ds.repo.get_hexsha(),
"541cf855d13c2a338ff2803d4488daf0035e568f")
return ds.pathobj, ds.repo.pathobj
first_dspath, first_repopath = decorated_test1()
second_dspath, second_repopath = decorated_test2()
decorated_test3()
decorated_test4()
# first and second are not the same, only their origin is:
assert_not_equal(first_dspath, second_dspath)
assert_not_equal(first_repopath, second_repopath)
@skip_if_no_network
@with_tempfile(mkdir=True)
def test_cached_url(cache_dir=None):
# patch DATALAD_TESTS_CACHE to not use the actual cache with
# the test testing that very cache.
cache_dir = Path(cache_dir)
ds_url = "https://github.com/datalad/testrepo--minimalds"
name_in_cache = url2filename(ds_url)
annexed_file = Path("inannex") / "animated.gif"
annexed_file_key = "MD5E-s144625--4c458c62b7ac8ec8e19c8ff14b2e34ad.gif"
with patch(CACHE_PATCH_STR, new=cache_dir):
@cached_url(url=ds_url)
def decorated_test1(url):
# we expect a file-scheme url to a cached version of `ds_url`
expect_origin_path = cache_dir / name_in_cache
assert_equal(expect_origin_path.as_uri(),
url)
origin = Dataset(expect_origin_path)
assert_true(origin.is_installed())
assert_false(origin.repo.file_has_content(str(annexed_file)))
decorated_test1()
@cached_url(url=ds_url, keys=annexed_file_key)
def decorated_test2(url):
# we expect a file-scheme url to a "different" cached version of
# `ds_url`
expect_origin_path = cache_dir / name_in_cache
assert_equal(expect_origin_path.as_uri(),
url)
origin = Dataset(expect_origin_path)
assert_true(origin.is_installed())
assert_true(origin.repo.file_has_content(str(annexed_file)))
decorated_test2()
# disable caching. Note, that in reality DATALAD_TESTS_CACHE is determined
# on import time of datalad.tests.fixtures based on the config
# "datalad.tests.cache". We patch the result here, not the config itself.
with patch(CACHE_PATCH_STR, new=None):
@cached_url(url=ds_url)
def decorated_test3(url):
# we expect the original url, since caching is disabled
assert_equal(url, ds_url)
decorated_test3()
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_utils_testrepos.py 0000644 0001751 0001751 00000003636 15137634221 022205 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Tests for test repositories
"""
from datalad.tests.utils_pytest import (
assert_repo_status,
ok_,
ok_file_under_git,
skip_if_on_windows,
swallow_outputs,
with_tempfile,
)
from datalad.tests.utils_testrepos import (
BasicAnnexTestRepo,
BasicGitTestRepo,
)
def _test_BasicAnnexTestRepo(repodir):
trepo = BasicAnnexTestRepo(repodir)
trepo.create()
assert_repo_status(trepo.path)
ok_file_under_git(trepo.path, 'test.dat')
ok_file_under_git(trepo.path, 'INFO.txt')
ok_file_under_git(trepo.path, 'test-annex.dat', annexed=True)
ok_(trepo.repo.file_has_content('test-annex.dat') is False)
with swallow_outputs():
trepo.repo.get('test-annex.dat')
ok_(trepo.repo.file_has_content('test-annex.dat'))
# Use of @with_tempfile() apparently is not friendly to test generators yet
# so generating two tests manually
# something is wrong with the implicit tempfile generation on windows
# a bunch of tested assumptions aren't met, and which ones depends on the
# windows version being tested
@skip_if_on_windows
def test_BasicAnnexTestRepo_random_location_generated():
_test_BasicAnnexTestRepo(None) # without explicit path -- must be generated
@with_tempfile()
def test_BasicAnnexTestRepo(path=None):
_test_BasicAnnexTestRepo(path)
@with_tempfile()
def test_BasicGitTestRepo(path=None):
trepo = BasicGitTestRepo(path)
trepo.create()
assert_repo_status(trepo.path, annex=False)
ok_file_under_git(trepo.path, 'test.dat')
ok_file_under_git(trepo.path, 'INFO.txt')
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/test_version.py 0000644 0001751 0001751 00000006106 15137634221 020415 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import re
from packaging.version import Version
from datalad.support import path as op
from datalad.tests.utils_pytest import (
SkipTest,
assert_equal,
assert_greater,
assert_in,
assert_not_in,
ok_startswith,
)
from datalad.utils import ensure_unicode
from .. import __version__
def test__version__():
# in released stage, version in the last CHANGELOG entry
# should correspond to the one in datalad
CHANGELOG_filename = op.join(
op.dirname(__file__), op.pardir, op.pardir, 'CHANGELOG.md')
if not op.exists(CHANGELOG_filename):
raise SkipTest("no %s found" % CHANGELOG_filename)
regex = re.compile(r'^# '
r'(?P[0-9]+\.[0-9.abcrc~]+)\s+'
r'\((?P.*)\)'
)
with open(CHANGELOG_filename, 'rb') as f:
for line in f:
line = line.rstrip()
if not line.startswith(b'# '):
# The first section header we hit, must be our changelog entry
continue
reg = regex.match(ensure_unicode(line))
if not reg: # first one at that level is the one
raise AssertionError(
"Following line must have matched our regex: %r" % line)
regd = reg.groupdict()
changelog_version = regd['version']
lv_changelog_version = Version(changelog_version)
# we might have a suffix - sanitize
san__version__ = __version__.rstrip('.dirty')
lv__version__ = Version(san__version__)
if '???' in regd['date'] and 'will be better than ever' in regd['codename']:
# we only have our template
# we can only assert that its version should be higher than
# the one we have now
assert_greater(lv_changelog_version, lv__version__)
else:
# should be a "release" record
assert_not_in('???', regd['date'])
ok_startswith(__version__, changelog_version)
if lv__version__ != lv_changelog_version:
# It was not tagged yet and Changelog has no new records
# (they are composed by auto upon release)
assert_greater(lv__version__, lv_changelog_version)
assert_in('+', san__version__) # we have build suffix
else:
# all is good, tagged etc
assert_equal(lv_changelog_version, lv__version__)
assert_equal(changelog_version, san__version__)
return
raise AssertionError(
"No log line matching our regex found in %s" % CHANGELOG_filename
)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/utils_cached_dataset.py 0000644 0001751 0001751 00000023360 15137634221 022026 0 ustar 00runner runner """Utils for cached test datasets"""
from datalad import cfg
from datalad.core.distributed.clone import Clone
from datalad.distribution.dataset import Dataset
from datalad.support.annexrepo import AnnexRepo
from datalad.tests.utils_pytest import (
DEFAULT_REMOTE,
with_tempfile,
)
from datalad.utils import (
Path,
better_wraps,
ensure_list,
optional_args,
rmtree,
)
DATALAD_TESTS_CACHE = cfg.obtain("datalad.tests.cache")
def url2filename(url):
"""generate file/directory name from a URL"""
# TODO: Not really important for now, but there should be a more
# sophisticated approach to replace. May be just everything that
# isn't alphanumeric? Or simply hash the URL?
# URL: Will include version eventually. Would need parsing to hash
# w/o any parameters. Having separate clones per requested version
# would defy point of cache, particularly wrt downloading content.
# Depends on usecase, of course, but immediate one is about container
# images -> not cheap.
# make it a Path, too, so pathlib can raise if we are creating an invalid
# path on some system we run the tests on.
return Path(
url.lower().replace("/", "_").replace(":", "_").replace("?", "_"))
def get_cached_dataset(url, version=None, keys=None):
""" Helper to get a cached clone from url
Intended for use from within `cached_dataset` and `cached_url` decorators.
Clones `url` into user's cache under datalad/tests/`name`. If such a clone
already exists, don't clone but return the existing one. So, it's supposed
to cache the original source in order to reduce time and traffic for tests,
by letting subsequent requests clone from a local location directly.
If it's an annex get the content as provided by `keys`, too.
Note, that as a transparent cache replacing the repo at URL from the POV of
a test, we can't address content via paths, since those are valid only with
respect to a particular worktree. If different tests clone from the same
cached dataset, each requesting different versions and different paths
thereof, we run into trouble if the cache itself checks out a particular
requested version.
Verifies that `version` can be checked out, but doesn't actually do it,
since the cached dataset is intended to be used as origin instead of the
original remote at URL by the `cached_dataset` test decorator. Checkout of
a particular version should happen in its clone.
Parameters
----------
url: str
URL to clone from
keys: str or list or None
(list of) annex keys to get content for.
version: str or None
A commit or an object that can be dereferenced to one.
Returns
-------
Dataset
"""
# TODO: What about recursive? Might be complicated. We would need to make
# sure we can recursively clone _from_ here then, potentially
# requiring submodule URL rewrites. Not sure about that ATM.
# TODO: Given that it is supposed to be a cache for the original repo at
# `url`, we prob. should make this a bare repository. We don't need
# a potentially expensive checkout here. Need to double check
# `annex-get --key` in bare repos, though. Plus datalad-clone doesn't
# have --bare yet. But we want all the annex/special-remote/ria magic
# of datalad. So, plain git-clone --bare is not an option.
if not DATALAD_TESTS_CACHE:
raise ValueError("Caching disabled by config")
ds = Dataset(DATALAD_TESTS_CACHE / url2filename(url))
if not ds.is_installed():
ds = Clone()(url, ds.pathobj)
# When/How to update a dataset in cache? If version is a commit SHA and we
# have it, there's no need for an update. Otherwise it gets tricky, because
# this is a cache, not a checkout a test would operate on. It needs to
# behave as if it was the thing at `url` from the point of view of the test
# using it (cloning/getting content from here). We would need to update all
# references, not just fetch them!
#
# Can we even (cheaply) tell whether `version` is an absolute reference
# (actual SHA, not a branch/tag)?
#
# NOTE: - consider git-clone --mirror, but as w/ --bare: not an option for
# datalad-clone yet.
# - --reference[-if-able] might also be worth thinking about for
# the clone @cached_dataset creates wrt clone in cacheq
#
# So, for now fetch, figure whether there actually was something to fetch
# and if so simply invalidate cache and re-clone/get. Don't overcomplicate
# things. It's about datasets used in the tests - they shouldn't change too
# frequently.
elif any('uptodate' not in c['operations']
for c in ds.repo.fetch(DEFAULT_REMOTE)):
rmtree(ds.path)
ds = Clone()(url, ds.pathobj)
if version:
# check whether version is available
assert ds.repo.commit_exists(version)
if keys and AnnexRepo.is_valid_repo(ds.path):
ds.repo.get(keys, key=True)
return ds
@optional_args
def cached_dataset(f, url=None, version=None, paths=None):
"""Test decorator providing a clone of `url` from cache
If config datalad.tests.cache is not set, delivers a clone in a temporary
location of the original `url`. Otherwise that clone is in fact a clone of a
cached dataset (origin being the cache instead of `url`).
This allows to reduce time and network traffic when using a dataset in
different tests.
The clone will checkout `version` and get the content for `paths`.
Parameters
----------
url: str
URL to the to be cloned dataset
version: str
committish to checkout in the clone
paths: str or list
annexed content to get
Returns
-------
Dataset
a clone of the dataset at `url` at a temporary location (cleaned up,
after decorated test is finished - see with_tempfile). If caching is
enabled, it's actually a clone of a clone, 'origin' being the clone in
cache rather than the original repo at `url`.
"""
@better_wraps(f)
@with_tempfile
def _wrap_cached_dataset(*arg, **kw):
if DATALAD_TESTS_CACHE:
# Note: We can't pass keys based on `paths` parameter to
# get_cached_dataset yet, since translation to keys depends on a
# worktree. We'll have the worktree of `version` only after cloning.
ds = get_cached_dataset(url, version=version)
clone_ds = Clone()(ds.pathobj, arg[-1])
else:
clone_ds = Clone()(url, arg[-1])
#save some cycles
clone_repo = clone_ds.repo
if version:
clone_repo.checkout(version)
if paths and AnnexRepo.is_valid_repo(clone_ds.path):
# just assume ds is annex as well. Otherwise `Clone` wouldn't
# work correctly - we don't need to test its implementation here
if DATALAD_TESTS_CACHE:
# cache is enabled; we need to make sure it has the desired
# content, so clone_ds can get it from there. However, we got
# `paths` and potentially a `version` they refer to. We can't
# assume the same (or any) worktree in cache. Hence we need to
# translate to keys.
# MIH Despite the variable names used in this function
# (pathS, keyS) they ultimately are passed to get(..., key=True)
# which means that it can ever only be a single path and a
# single key -- this is very confusing.
# the key determination could hence be done with
# get_file_annexinfo() in a much simpler way, but it seems this
# function wants to be ready for more, sigh
keys = [
p['key']
for p in clone_repo.get_content_annexinfo(
ensure_list(paths), init=None).values()
if 'key' in p
]
if keys:
ds.repo.get(keys, key=True)
clone_repo.fsck(remote=DEFAULT_REMOTE, fast=True)
clone_ds.get(paths)
return f(*(arg[:-1] + (clone_ds,)), **kw)
return _wrap_cached_dataset
@optional_args
def cached_url(f, url=None, keys=None):
"""Test decorator providing a URL to clone from, pointing to cached dataset
If config datalad.tests.cache is not set, delivers the original `url`,
otherwise a file-scheme url to the cached clone thereof.
Notes
-----
While this is similar to `cached_dataset`, there are important differences.
1. As we deliver an URL, `version` parameter is irrelevant. The only
relevant notion of version would need to be included in the URL
2. We cannot request particular paths to be present in cache, since we
a version to refer to by those paths. Therefore keys need to be
specified.
Parameters
----------
url: str
URL to the original dataset
keys: str or list or None
(list of) annex keys to get content for.
Returns
-------
str
URL to the cached dataset or the original URL if caching was disabled
"""
# TODO: See Notes 1.)
# Append fragments/parameters of `url` to what we return -
# depending on how we generally decide to address versioned
# URLs for clone etc.
@better_wraps(f)
def _wrap_cached_url(*arg, **kw):
if DATALAD_TESTS_CACHE:
ds = get_cached_dataset(url, version=None)
if keys:
ds.repo.get(keys, key=True)
new_url = ds.pathobj.as_uri()
else:
new_url = url
return f(*(arg + (new_url,)), **kw)
return _wrap_cached_url
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/utils_pytest.py 0000644 0001751 0001751 00000203125 15137634221 020441 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil; coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Miscellaneous utilities to assist with testing"""
import base64
import lzma
import multiprocessing
import multiprocessing.queues
import ssl
import textwrap
from difflib import unified_diff
from functools import lru_cache
from http.server import (
HTTPServer,
SimpleHTTPRequestHandler,
)
from json import dumps
from typing import Optional
from unittest import SkipTest
from unittest.mock import patch
import pytest
import datalad.utils as ut
from datalad import cfg as dl_cfg
from datalad import utils
from datalad.cmd import (
StdOutErrCapture,
WitlessRunner,
)
from datalad.consts import ARCHIVES_TEMP_DIR
from datalad.dochelpers import borrowkwargs
from datalad.support.external_versions import (
LooseVersion,
external_versions,
)
from datalad.support.keyring_ import MemoryKeyring
from datalad.support.network import RI
from datalad.support.vcr_ import *
# TODO this must go
from datalad.utils import *
# temp paths used by clones
_TEMP_PATHS_CLONES = set()
# Additional indicators
on_travis = bool(os.environ.get('TRAVIS', False))
on_appveyor = bool(os.environ.get('APPVEYOR', False))
on_github = bool(os.environ.get('GITHUB_ACTION', False))
on_nfs = 'nfs' in os.getenv('TMPDIR', '')
if external_versions["cmd:git"] >= "2.28":
# The specific value here doesn't matter, but it should not be the default
# from any Git version to test that we work with custom values.
DEFAULT_BRANCH = "dl-test-branch" # Set by setup_package().
else:
DEFAULT_BRANCH = "master"
if external_versions["cmd:git"] >= "2.30.0":
# The specific value here doesn't matter, but it should not be the default
# from any Git version to test that we work with custom values.
DEFAULT_REMOTE = "dl-test-remote" # Set by setup_package().
else:
DEFAULT_REMOTE = "origin"
def attr(name):
return getattr(pytest.mark, name)
def assert_equal(first, second, msg=None):
if msg is None:
assert first == second
else:
assert first == second, msg
def assert_false(expr, msg=None):
if msg is None:
assert not expr
else:
assert not expr, msg
def assert_greater(first, second, msg=None):
if msg is None:
assert first > second
else:
assert first > second, msg
def assert_greater_equal(first, second, msg=None):
if msg is None:
assert first >= second
else:
assert first >= second, msg
def assert_in(first, second, msg=None):
if msg is None:
assert first in second
else:
assert first in second, msg
in_ = assert_in
def assert_is(first, second, msg=None):
if msg is None:
assert first is second
else:
assert first is second, msg
def assert_is_instance(first, second, msg=None):
if msg is None:
assert isinstance(first, second)
else:
assert isinstance(first, second), msg
def assert_is_none(expr, msg=None):
if msg is None:
assert expr is None
else:
assert expr is None, msg
def assert_is_not(first, second, msg=None):
if msg is None:
assert first is not second
else:
assert first is not second, msg
def assert_is_not_none(expr, msg=None):
if msg is None:
assert expr is not None
else:
assert expr is not None, msg
def assert_not_equal(first, second, msg=None):
if msg is None:
assert first != second
else:
assert first != second, msg
def assert_not_in(first, second, msg=None):
if msg is None:
assert first not in second
else:
assert first not in second, msg
def assert_not_is_instance(first, second, msg=None):
if msg is None:
assert not isinstance(first, second)
else:
assert not isinstance(first, second), msg
assert_raises = pytest.raises
assert_set_equal = assert_equal
def assert_true(expr, msg=None):
if msg is None:
assert expr
else:
assert expr, msg
eq_ = assert_equal
ok_ = assert_true
# additional shortcuts
neq_ = assert_not_equal
nok_ = assert_false
lgr = logging.getLogger("datalad.tests.utils_pytest")
def skip_if_no_module(module):
# Using pytest.importorskip here won't always work, as some imports (e.g.,
# libxmp) can fail with exceptions other than ImportError
try:
imp = __import__(module)
except Exception as exc:
pytest.skip("Module %s fails to load" % module, allow_module_level=True)
def skip_if_scrapy_without_selector():
"""A little helper to skip some tests which require recent scrapy"""
try:
import scrapy
from scrapy.selector import Selector
except ImportError:
pytest.skip(
"scrapy misses Selector (too old? version: %s)"
% getattr(scrapy, '__version__'))
def skip_if_url_is_not_available(url, regex=None):
# verify that dataset is available
from datalad.downloaders.base import DownloadError
from datalad.downloaders.providers import Providers
providers = Providers.from_config_files()
try:
content = providers.fetch(url)
if regex and re.search(regex, content):
pytest.skip("%s matched %r -- skipping the test" % (url, regex))
except DownloadError:
pytest.skip("%s failed to download" % url)
def check_not_generatorfunction(func):
"""Internal helper to verify that we are not decorating generator tests"""
if inspect.isgeneratorfunction(func):
raise RuntimeError("{}: must not be decorated, is a generator test"
.format(func.__name__))
def skip_if_no_network(func=None):
"""Skip test completely in NONETWORK settings
If not used as a decorator, and just a function, could be used at the module level
"""
check_not_generatorfunction(func)
def check_and_raise():
if dl_cfg.get('datalad.tests.nonetwork'):
pytest.skip("Skipping since no network settings", allow_module_level=True)
if func:
@wraps(func)
@attr('network')
@attr('skip_if_no_network')
def _wrap_skip_if_no_network(*args, **kwargs):
check_and_raise()
return func(*args, **kwargs)
return _wrap_skip_if_no_network
else:
check_and_raise()
def skip_if_on_windows(func=None):
"""Skip test completely under Windows
"""
check_not_generatorfunction(func)
def check_and_raise():
if on_windows:
pytest.skip("Skipping on Windows")
if func:
@wraps(func)
@attr('skip_if_on_windows')
def _wrap_skip_if_on_windows(*args, **kwargs):
check_and_raise()
return func(*args, **kwargs)
return _wrap_skip_if_on_windows
else:
check_and_raise()
def skip_if_root(func=None):
"""Skip test if uid == 0.
Note that on Windows (or anywhere else `os.geteuid` is not available) the
test is _not_ skipped.
"""
check_not_generatorfunction(func)
def check_and_raise():
if hasattr(os, "geteuid") and os.geteuid() == 0:
pytest.skip("Skipping: test assumptions fail under root")
if func:
@wraps(func)
@attr('skip_if_root')
def _wrap_skip_if_root(*args, **kwargs):
check_and_raise()
return func(*args, **kwargs)
return _wrap_skip_if_root
else:
check_and_raise()
@optional_args
def skip_if(func, cond=True, msg=None, method='raise'):
"""Skip test for specific condition
Parameters
----------
cond: bool
condition on which to skip
msg: str
message to print if skipping
method: str
either 'raise' or 'pass'. Whether to skip by raising `SkipTest` or by
just proceeding and simply not calling the decorated function.
This is particularly meant to be used, when decorating single assertions
in a test with method='pass' in order to not skip the entire test, but
just that assertion.
"""
check_not_generatorfunction(func)
@wraps(func)
def _wrap_skip_if(*args, **kwargs):
if cond:
if method == 'raise':
pytest.skip(msg if msg else "condition was True")
elif method == 'pass':
print(msg if msg else "condition was True")
return
return func(*args, **kwargs)
return _wrap_skip_if
def skip_ssh(func):
"""Skips SSH tests if on windows or if environment variable
DATALAD_TESTS_SSH was not set
"""
check_not_generatorfunction(func)
@wraps(func)
@attr('skip_ssh')
def _wrap_skip_ssh(*args, **kwargs):
test_ssh = dl_cfg.get("datalad.tests.ssh", '')
if not test_ssh or test_ssh in ('0', 'false', 'no'):
raise SkipTest("Run this test by setting DATALAD_TESTS_SSH")
return func(*args, **kwargs)
return _wrap_skip_ssh
def skip_nomultiplex_ssh(func):
"""Skips SSH tests if default connection/manager does not support multiplexing
e.g. currently on windows or if set via datalad.ssh.multiplex-connections config variable
"""
check_not_generatorfunction(func)
from ..support.sshconnector import (
MultiplexSSHManager,
SSHManager,
)
@wraps(func)
@attr('skip_nomultiplex_ssh')
@skip_ssh
def _wrap_skip_nomultiplex_ssh(*args, **kwargs):
if SSHManager is not MultiplexSSHManager:
pytest.skip("SSH without multiplexing is used")
return func(*args, **kwargs)
return _wrap_skip_nomultiplex_ssh
#
# Addition "checkers"
#
import os
from datalad.distribution.dataset import Dataset
from datalad.support.annexrepo import (
AnnexRepo,
FileNotInAnnexError,
)
from datalad.support.gitrepo import GitRepo
from ..utils import (
chpwd,
getpwd,
)
def ok_clean_git(path, annex=None, index_modified=[], untracked=[]):
"""Obsolete test helper. Use assert_repo_status() instead.
Still maps a few common cases to the new helper, to ease transition
in extensions.
"""
kwargs = {}
if index_modified:
kwargs['modified'] = index_modified
if untracked:
kwargs['untracked'] = untracked
assert_repo_status(
path,
annex=annex,
**kwargs,
)
def ok_file_under_git(path, filename=None, annexed=False):
"""Test if file is present and under git/annex control
If relative path provided, then test from current directory
"""
annex, file_repo_path, filename, path, repo = _prep_file_under_git(path, filename)
assert_in(file_repo_path, repo.get_indexed_files()) # file is known to Git
if annex:
in_annex = 'key' in repo.get_file_annexinfo(file_repo_path)
else:
in_annex = False
assert(annexed == in_annex)
def put_file_under_git(path, filename=None, content=None, annexed=False):
"""Place file under git/annex and return used Repo
"""
annex, file_repo_path, filename, path, repo = _prep_file_under_git(path, filename)
if content is None:
content = ""
with open(opj(repo.path, file_repo_path), 'w') as f_:
f_.write(content)
if annexed:
if not isinstance(repo, AnnexRepo):
repo = AnnexRepo(repo.path)
repo.add(file_repo_path)
else:
repo.add(file_repo_path, git=True)
repo.commit(_datalad_msg=True)
ok_file_under_git(repo.path, file_repo_path, annexed)
return repo
def _prep_file_under_git(path, filename):
"""Get instance of the repository for the given filename
Helper to be used by few functions
"""
path = Path(path)
if filename is None:
# path provides the path and the name
filename = Path(path.name)
path = path.parent
else:
filename = Path(filename)
ds = Dataset(utils.get_dataset_root(path))
return isinstance(ds.repo, AnnexRepo), \
str(path.absolute().relative_to(ds.path) / filename) \
if not filename.is_absolute() \
else str(filename.relative_to(ds.pathobj)), \
filename, \
str(path), \
ds.repo
def get_annexstatus(ds, paths=None):
"""Report a status for annexed contents.
Assembles states for git content info, amended with annex info on 'HEAD'
(to get the last committed stage and with it possibly vanished content),
and lastly annex info wrt to the present worktree, to also get info on
added/staged content this fuses the info reported from
- git ls-files
- git annex findref HEAD
- git annex find --include '*'"""
info = ds.get_content_annexinfo(
paths=paths,
eval_availability=False,
init=ds.get_content_annexinfo(
paths=paths,
ref='HEAD',
eval_availability=False,
init=ds.status(
paths=paths,
eval_submodule_state='full')
)
)
ds._mark_content_availability(info)
return info
#
# Helpers to test symlinks
#
def ok_symlink(path):
"""Checks whether path is either a working or broken symlink"""
link_path = os.path.islink(path)
if not link_path:
raise AssertionError("Path {} seems not to be a symlink".format(path))
def ok_good_symlink(path):
ok_symlink(path)
rpath = Path(path).resolve()
ok_(rpath.exists(),
msg="Path {} seems to be missing. Symlink {} is broken".format(
rpath, path))
def ok_broken_symlink(path):
ok_symlink(path)
rpath = Path(path).resolve()
assert_false(rpath.exists(),
msg="Path {} seems to be present. Symlink {} is not broken".format(
rpath, path))
def ok_startswith(s, prefix):
ok_(s.startswith(prefix),
msg="String %r doesn't start with %r" % (s, prefix))
def ok_endswith(s, suffix):
ok_(s.endswith(suffix),
msg="String %r doesn't end with %r" % (s, suffix))
def nok_startswith(s, prefix):
assert_false(s.startswith(prefix),
msg="String %r starts with %r" % (s, prefix))
def ok_git_config_not_empty(ar):
"""Helper to verify that nothing rewritten the config file"""
# TODO: we don't support bare -- do we?
assert_true(os.stat(opj(ar.path, '.git', 'config')).st_size)
def ok_annex_get(ar, files, network=True):
"""Helper to run .get decorated checking for correct operation
get passes through stderr from the ar to the user, which pollutes
screen while running tests
Note: Currently not true anymore, since usage of --json disables
progressbars
"""
ok_git_config_not_empty(ar) # we should be working in already inited repo etc
with swallow_outputs() as cmo:
ar.get(files)
# verify that load was fetched
ok_git_config_not_empty(ar) # whatever we do shouldn't destroy the config file
has_content = ar.file_has_content(files)
if isinstance(has_content, bool):
ok_(has_content)
else:
ok_(all(has_content))
def ok_generator(gen):
assert_true(inspect.isgenerator(gen), msg="%s is not a generator" % gen)
assert_is_generator = ok_generator # just an alias
def ok_archives_caches(repopath, n=1, persistent=None):
"""Given a path to repository verify number of archives
Parameters
----------
repopath : str
Path to the repository
n : int, optional
Number of archives directories to expect
persistent: bool or None, optional
If None -- both persistent and not count.
"""
# looking into subdirectories
glob_ptn = opj(repopath,
ARCHIVES_TEMP_DIR + {None: '*', True: '', False: '-*'}[persistent],
'*')
dirs = glob.glob(glob_ptn)
n2 = n * 2 # per each directory we should have a .stamp file
assert_equal(len(dirs), n2,
msg="Found following dirs when needed %d of them: %s" % (n2, dirs))
def ok_exists(path):
assert Path(path).exists(), 'path %s does not exist (or dangling symlink)' % path
def ok_file_has_content(path, content, strip=False, re_=False,
decompress=False, **kwargs):
"""Verify that file exists and has expected content"""
path = Path(path)
ok_exists(path)
if decompress:
if path.suffix == '.gz':
open_func = gzip.open
elif path.suffix in ('.xz', '.lzma'):
open_func = lzma.open
else:
raise NotImplementedError("Don't know how to decompress %s" % path)
else:
open_func = open
with open_func(str(path), 'rb') as f:
file_content = f.read()
if isinstance(content, str):
file_content = ensure_unicode(file_content)
if os.linesep != '\n':
# for consistent comparisons etc. Apparently when reading in `b` mode
# on Windows we would also get \r
# https://github.com/datalad/datalad/pull/3049#issuecomment-444128715
file_content = file_content.replace(os.linesep, '\n')
if strip:
file_content = file_content.strip()
if re_:
assert_re_in(content, file_content, **kwargs)
else:
assert_equal(content, file_content, **kwargs)
#
# Decorators
#
@optional_args
def with_tree(t, tree=None, archives_leading_dir=True, delete=True, **tkwargs):
@wraps(t)
def _wrap_with_tree(*arg, **kw):
if 'dir' not in tkwargs.keys():
# if not specified otherwise, respect datalad.tests.temp.dir config
# as this is a test helper
tkwargs['dir'] = dl_cfg.get("datalad.tests.temp.dir")
tkwargs_ = get_tempfile_kwargs(tkwargs, prefix="tree", wrapped=t)
d = tempfile.mkdtemp(**tkwargs_)
create_tree(d, tree, archives_leading_dir=archives_leading_dir)
try:
return t(*(arg + (d,)), **kw)
finally:
if delete:
rmtemp(d)
return _wrap_with_tree
lgr = logging.getLogger('datalad.tests')
class SilentHTTPHandler(SimpleHTTPRequestHandler):
"""A little adapter to silence the handler
"""
def __init__(self, *args, **kwargs):
self._silent = lgr.getEffectiveLevel() > logging.DEBUG
SimpleHTTPRequestHandler.__init__(self, *args, **kwargs)
def log_message(self, format, *args):
if self._silent:
return
lgr.debug("HTTP: " + format, *args)
def _multiproc_serve_path_via_http(
hostname, path_to_serve_from, queue, use_ssl=False, auth=None): # pragma: no cover
handler = SilentHTTPHandler
if auth:
# to-be-expected key for basic auth
auth_test = (b'Basic ' + base64.b64encode(
bytes('%s:%s' % auth, 'utf-8'))).decode('utf-8')
# ad-hoc basic-auth handler
class BasicAuthHandler(SilentHTTPHandler):
def do_HEAD(self, authenticated):
if authenticated:
self.send_response(200)
else:
self.send_response(401)
self.send_header(
'WWW-Authenticate', 'Basic realm=\"Protected\"')
self.send_header('content-type', 'text/html')
self.end_headers()
def do_GET(self):
if self.headers.get('Authorization') == auth_test:
super().do_GET()
else:
self.do_HEAD(False)
self.wfile.write(bytes('Auth failed', 'utf-8'))
handler = BasicAuthHandler
chpwd(path_to_serve_from)
httpd = HTTPServer((hostname, 0), handler)
if use_ssl:
ca_dir = Path(__file__).parent / 'ca'
ssl_key = ca_dir / 'certificate-key.pem'
ssl_cert = ca_dir / 'certificate-pub.pem'
if any(not p.exists for p in (ssl_key, ssl_cert)):
raise RuntimeError(
'SSL requested, but no key/cert file combination can be '
f'located under {ca_dir}')
# turn on SSL
context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
context.load_cert_chain(str(ssl_cert), str(ssl_key))
httpd.socket = context.wrap_socket (
httpd.socket,
server_side=True)
queue.put(httpd.server_port)
httpd.serve_forever()
class HTTPPath(object):
"""Serve the content of a path via an HTTP URL.
This class can be used as a context manager, in which case it returns the
URL.
Alternatively, the `start` and `stop` methods can be called directly.
Parameters
----------
path : str
Directory with content to serve.
use_ssl : bool
auth : tuple
Username, password
"""
def __init__(self, path, use_ssl=False, auth=None):
self.path = path
self.url = None
self._env_patch = None
self._mproc = None
self.use_ssl = use_ssl
self.auth = auth
def __enter__(self):
self.start()
return self.url
def __exit__(self, *args):
self.stop()
def start(self):
"""Start serving `path` via HTTP.
"""
# There is a problem with Haskell on wheezy trying to
# fetch via IPv6 whenever there is a ::1 localhost entry in
# /etc/hosts. Apparently fixing that docker image reliably
# is not that straightforward, although see
# http://jasonincode.com/customizing-hosts-file-in-docker/
# so we just force to use 127.0.0.1 while on wheezy
#hostname = '127.0.0.1' if on_debian_wheezy else 'localhost'
if self.use_ssl:
# we cannot use IPs with SSL certificates
hostname = 'localhost'
else:
hostname = '127.0.0.1'
queue = multiprocessing.Queue()
self._mproc = multiprocessing.Process(
target=_multiproc_serve_path_via_http,
args=(hostname, self.path, queue),
kwargs=dict(use_ssl=self.use_ssl, auth=self.auth))
self._mproc.start()
try:
port = queue.get(timeout=300)
except multiprocessing.queues.Empty as e:
if self.use_ssl:
pytest.skip('No working SSL support')
else:
raise
self.url = 'http{}://{}:{}/'.format(
's' if self.use_ssl else '',
hostname,
port)
lgr.debug("HTTP: serving %s under %s", self.path, self.url)
# Such tests don't require real network so if http_proxy settings were
# provided, we remove them from the env for the duration of this run
env = os.environ.copy()
if self.use_ssl:
env.pop('https_proxy', None)
env['REQUESTS_CA_BUNDLE'] = str(
Path(__file__).parent / 'ca' / 'ca_bundle.pem')
else:
env.pop('http_proxy', None)
self._env_patch = patch.dict('os.environ', env, clear=True)
self._env_patch.start()
if self.use_ssl:
# verify that the SSL/cert setup is functional, if not skip the
# test
# python-requests does its own thing re root CA trust
# if this fails, check datalad/tests/ca/prov.sh for ca_bundle
try:
import requests
from requests.auth import HTTPBasicAuth
r = requests.get(
self.url,
verify=True,
auth=HTTPBasicAuth(*self.auth) if self.auth else None)
r.raise_for_status()
# be robust and skip if anything goes wrong, rather than just a
# particular SSL issue
#except requests.exceptions.SSLError as e:
except Exception as e:
self.stop()
pytest.skip('No working HTTPS setup')
# now verify that the stdlib tooling also works
# if this fails, check datalad/tests/ca/prov.sh
# for info on deploying a datalad-root.crt
from urllib.request import (
Request,
urlopen,
)
try:
req = Request(self.url)
if self.auth:
req.add_header(
"Authorization",
b"Basic " + base64.standard_b64encode(
'{0}:{1}'.format(*self.auth).encode('utf-8')))
urlopen(req)
# be robust and skip if anything goes wrong, rather than just a
# particular SSL issue
#except URLError as e:
except Exception as e:
self.stop()
pytest.skip('No working HTTPS setup')
def stop(self):
"""Stop serving `path`.
"""
lgr.debug("HTTP: stopping server under %s", self.path)
self._env_patch.stop()
self._mproc.terminate()
@optional_args
def serve_path_via_http(tfunc, *targs, use_ssl=False, auth=None):
"""Decorator which serves content of a directory via http url
Parameters
----------
path : str
Directory with content to serve.
use_ssl : bool
Flag whether to set up SSL encryption and return a HTTPS
URL. This require a valid certificate setup (which is tested
for proper function) or it will cause a SkipTest to be raised.
auth : tuple or None
If a (username, password) tuple is given, the server access will
be protected via HTTP basic auth.
"""
@wraps(tfunc)
@attr('serve_path_via_http')
def _wrap_serve_path_via_http(*args, **kwargs):
if targs:
# if a path is passed into serve_path_via_http, then it's in targs
assert len(targs) == 1
path = targs[0]
elif len(args) > 1:
args, path = args[:-1], args[-1]
else:
args, path = (), args[0]
with HTTPPath(path, use_ssl=use_ssl, auth=auth) as url:
return tfunc(*(args + (path, url)), **kwargs)
return _wrap_serve_path_via_http
@optional_args
def with_memory_keyring(t):
"""Decorator to use non-persistent MemoryKeyring instance
"""
@wraps(t)
@attr('with_memory_keyring')
def _wrap_with_memory_keyring(*args, **kwargs):
keyring = MemoryKeyring()
with patch("datalad.downloaders.credentials.keyring_", keyring):
return t(*(args + (keyring,)), **kwargs)
return _wrap_with_memory_keyring
@optional_args
def without_http_proxy(tfunc):
"""Decorator to remove http*_proxy env variables for the duration of the test
"""
@wraps(tfunc)
@attr('without_http_proxy')
def _wrap_without_http_proxy(*args, **kwargs):
if on_windows:
pytest.skip('Unclear why this is not working on windows')
# Such tests don't require real network so if http_proxy settings were
# provided, we remove them from the env for the duration of this run
env = os.environ.copy()
env.pop('http_proxy', None)
env.pop('https_proxy', None)
with patch.dict('os.environ', env, clear=True):
return tfunc(*args, **kwargs)
return _wrap_without_http_proxy
@borrowkwargs(methodname=make_tempfile)
@optional_args
def with_tempfile(t, **tkwargs):
"""Decorator function to provide a temporary file name and remove it at the end
Parameters
----------
To change the used directory without providing keyword argument 'dir' set
DATALAD_TESTS_TEMP_DIR.
Examples
--------
::
@with_tempfile
def test_write(tfile=None):
open(tfile, 'w').write('silly test')
"""
@wraps(t)
def _wrap_with_tempfile(*arg, **kw):
if 'dir' not in tkwargs.keys():
# if not specified otherwise, respect datalad.tests.temp.dir config
# as this is a test helper
tkwargs['dir'] = dl_cfg.get("datalad.tests.temp.dir")
with make_tempfile(wrapped=t, **tkwargs) as filename:
return t(*(arg + (filename,)), **kw)
return _wrap_with_tempfile
# ### ###
# START known failure decorators
# ### ###
def probe_known_failure(func):
"""Test decorator allowing the test to pass when it fails and vice versa
Setting config datalad.tests.knownfailures.probe to True tests, whether or
not the test is still failing. If it's not, an AssertionError is raised in
order to indicate that the reason for failure seems to be gone.
"""
@wraps(func)
@attr('probe_known_failure')
def _wrap_probe_known_failure(*args, **kwargs):
if dl_cfg.obtain("datalad.tests.knownfailures.probe"):
assert_raises(Exception, func, *args, **kwargs) # marked as known failure
# Note: Since assert_raises lacks a `msg` argument, a comment
# in the same line is helpful to determine what's going on whenever
# this assertion fails and we see a trace back. Otherwise that line
# wouldn't be very telling.
else:
return func(*args, **kwargs)
return _wrap_probe_known_failure
@optional_args
def skip_known_failure(func, method='raise'):
"""Test decorator allowing to skip a test that is known to fail
Setting config datalad.tests.knownfailures.skip to a bool enables/disables
skipping.
"""
@skip_if(cond=dl_cfg.obtain("datalad.tests.knownfailures.skip"),
msg="Skip test known to fail",
method=method)
@wraps(func)
@attr('skip_known_failure')
def _wrap_skip_known_failure(*args, **kwargs):
return func(*args, **kwargs)
return _wrap_skip_known_failure
def known_failure(func):
"""Test decorator marking a test as known to fail
This combines `probe_known_failure` and `skip_known_failure` giving the
skipping precedence over the probing.
"""
@skip_known_failure
@probe_known_failure
@wraps(func)
@attr('known_failure')
def _wrap_known_failure(*args, **kwargs):
return func(*args, **kwargs)
return _wrap_known_failure
def known_failure_direct_mode(func):
"""DEPRECATED. Stop using. Does nothing
Test decorator marking a test as known to fail in a direct mode test run
If datalad.repo.direct is set to True behaves like `known_failure`.
Otherwise the original (undecorated) function is returned.
"""
# TODO: consider adopting nibabel/deprecated.py nibabel/deprecator.py
# mechanism to consistently deprecate functionality and ensure they are
# displayed.
# Since 2.7 Deprecation warnings aren't displayed by default
# and thus kinda pointless to issue a warning here, so we will just log
msg = "Direct mode support is deprecated, so no point in using " \
"@known_failure_direct_mode for %r since glorious future " \
"DataLad 0.12" % func.__name__
lgr.warning(msg)
return func
def known_failure_windows(func):
"""Test decorator marking a test as known to fail on windows
On Windows behaves like `known_failure`.
Otherwise the original (undecorated) function is returned.
"""
if on_windows:
@known_failure
@wraps(func)
@attr('known_failure_windows')
@attr('windows')
def dm_func(*args, **kwargs):
return func(*args, **kwargs)
return dm_func
return func
def known_failure_githubci_win(func):
"""Test decorator for a known test failure on Github's Windows CI
"""
if 'GITHUB_WORKFLOW' in os.environ and on_windows:
@known_failure
@wraps(func)
@attr('known_failure_githubci_win')
@attr('githubci_win')
def dm_func(*args, **kwargs):
return func(*args, **kwargs)
return dm_func
return func
def known_failure_githubci_osx(func):
"""Test decorator for a known test failure on Github's macOS CI
"""
if 'GITHUB_WORKFLOW' in os.environ and on_osx:
@known_failure
@wraps(func)
@attr('known_failure_githubci_osx')
@attr('githubci_osx')
def dm_func(*args, **kwargs):
return func(*args, **kwargs)
return dm_func
return func
def known_failure_osx(func):
"""Test decorator for a known test failure on macOS
"""
if on_osx:
@known_failure
@wraps(func)
@attr('known_failure_osx')
@attr('osx')
def dm_func(*args, **kwargs):
return func(*args, **kwargs)
return dm_func
return func
# ### ###
# xfails - like known failures but never to be checked to pass etc.
# e.g. for specific versions of core tools with regressions
# ### ###
xfail_buggy_annex_info = pytest.mark.xfail(
# 10.20230127 is lower bound since bug was introduced before next 10.20230214
# release, and thus snapshot builds would fail. There were no release on
# '10.20230221' - but that is the next day after the fix
external_versions['cmd:annex'] and ('10.20230127' <= external_versions['cmd:annex'] < '10.20230221'),
reason="Regression in git-annex info. https://github.com/datalad/datalad/issues/7286"
)
def _get_resolved_flavors(flavors):
#flavors_ = (['local', 'clone'] + (['local-url'] if not on_windows else [])) \
# if flavors == 'auto' else flavors
flavors_ = (['local', 'clone', 'local-url', 'network'] if not on_windows
else ['network', 'network-clone']) \
if flavors == 'auto' else flavors
if not isinstance(flavors_, list):
flavors_ = [flavors_]
if dl_cfg.get('datalad.tests.nonetwork'):
flavors_ = [x for x in flavors_ if not x.startswith('network')]
return flavors_
local_testrepo_flavors = ['local'] # 'local-url'
_TESTREPOS = None
@optional_args
def with_sameas_remote(func, autoenabled=False):
"""Provide a repository with a git-annex sameas remote configured.
The repository will have two special remotes: r_dir (type=directory) and
r_rsync (type=rsync). The rsync remote will be configured with
--sameas=r_dir, and autoenabled if `autoenabled` is true.
"""
from datalad.support.annexrepo import AnnexRepo
from datalad.support.exceptions import CommandError
@wraps(func)
@attr('with_sameas_remotes')
@skip_if_on_windows
@skip_ssh
@with_tempfile(mkdir=True)
@with_tempfile(mkdir=True)
def _wrap_with_sameas_remote(*args, **kwargs):
sr_path, repo_path = args[-2:]
fn_args = args[:-2]
repo = AnnexRepo(repo_path)
repo.init_remote("r_dir",
options=["type=directory",
"encryption=none",
"directory=" + sr_path])
options = ["type=rsync",
"rsyncurl=datalad-test:" + sr_path]
if autoenabled:
options.append("autoenable=true")
options.append("--sameas=r_dir")
repo.init_remote("r_rsync", options=options)
return func(*(fn_args + (repo,)), **kwargs)
return _wrap_with_sameas_remote
@optional_args
def with_fake_cookies_db(func, cookies={}):
"""mock original cookies db with a fake one for the duration of the test
"""
from ..support.cookies import cookies_db
@wraps(func)
@attr('with_fake_cookies_db')
def _wrap_with_fake_cookies_db(*args, **kwargs):
try:
orig_cookies_db = cookies_db._cookies_db
cookies_db._cookies_db = cookies.copy()
return func(*args, **kwargs)
finally:
cookies_db._cookies_db = orig_cookies_db
return _wrap_with_fake_cookies_db
@optional_args
def assert_cwd_unchanged(func, ok_to_chdir=False):
"""Decorator to test whether the current working directory remains unchanged
Parameters
----------
ok_to_chdir: bool, optional
If True, allow to chdir, so this decorator would not then raise exception
if chdir'ed but only return to original directory
"""
@wraps(func)
def _wrap_assert_cwd_unchanged(*args, **kwargs):
cwd_before = os.getcwd()
pwd_before = getpwd()
exc_info = None
# record previous state of PWD handling
utils_pwd_mode = utils._pwd_mode
try:
ret = func(*args, **kwargs)
except:
exc_info = sys.exc_info()
finally:
utils._pwd_mode = utils_pwd_mode
try:
cwd_after = os.getcwd()
except OSError as e:
lgr.warning("Failed to getcwd: %s" % e)
cwd_after = None
if cwd_after != cwd_before:
chpwd(pwd_before)
# Above chpwd could also trigger the change of _pwd_mode, so we
# would need to reset it again since we know that it is all kosher
utils._pwd_mode = utils_pwd_mode
if not ok_to_chdir:
lgr.warning(
"%s changed cwd to %s. Mitigating and changing back to %s"
% (func, cwd_after, pwd_before))
# If there was already exception raised, we better reraise
# that one since it must be more important, so not masking it
# here with our assertion
if exc_info is None:
assert_equal(cwd_before, cwd_after,
"CWD changed from %s to %s" % (cwd_before, cwd_after))
if exc_info is not None:
raise exc_info[1]
return ret
return _wrap_assert_cwd_unchanged
@optional_args
def run_under_dir(func, newdir='.'):
"""Decorator to run tests under another directory
It is somewhat ugly since we can't really chdir
back to a directory which had a symlink in its path.
So using this decorator has potential to move entire
testing run under the dereferenced directory name -- sideeffect.
The only way would be to instruct testing framework (i.e. nose
in our case ATM) to run a test by creating a new process with
a new cwd
"""
@wraps(func)
def _wrap_run_under_dir(*args, **kwargs):
pwd_before = getpwd()
try:
chpwd(newdir)
func(*args, **kwargs)
finally:
chpwd(pwd_before)
return _wrap_run_under_dir
def assert_re_in(regex, c, flags=0, match=True, msg=None):
"""Assert that container (list, str, etc) contains entry matching the regex
"""
if not isinstance(c, (list, tuple)):
c = [c]
for e in c:
if (re.match if match else re.search)(regex, e, flags=flags):
return
raise AssertionError(
msg or "Not a single entry matched %r in %r" % (regex, c)
)
def assert_dict_equal(d1, d2):
msgs = []
if set(d1).difference(d2):
msgs.append(" keys in the first dict but not in the second: %s"
% list(set(d1).difference(d2)))
if set(d2).difference(d1):
msgs.append(" keys in the second dict but not in the first: %s"
% list(set(d2).difference(d1)))
for k in set(d1).intersection(d2):
same = True
try:
if isinstance(d1[k], str):
# do not compare types for string types to avoid all the hassle
# with the distinction of str and unicode in PY3, and simple
# test for equality
same = bool(d1[k] == d2[k])
else:
same = type(d1[k]) == type(d2[k]) and bool(d1[k] == d2[k])
except: # if comparison or conversion to bool (e.g. with numpy arrays) fails
same = False
if not same:
msgs.append(" [%r] differs: %r != %r" % (k, d1[k], d2[k]))
if len(msgs) > 10:
msgs.append("and more")
break
if msgs:
raise AssertionError("dicts differ:\n%s" % "\n".join(msgs))
# do generic comparison just in case we screwed up to detect difference correctly above
eq_(d1, d2)
def assert_str_equal(s1, s2):
"""Helper to compare two lines"""
diff = list(unified_diff(s1.splitlines(), s2.splitlines()))
assert not diff, '\n'.join(diff)
assert_equal(s1, s2)
def assert_status(label, results):
"""Verify that each status dict in the results has a given status label
`label` can be a sequence, in which case status must be one of the items
in this sequence.
"""
label = ensure_list(label)
results = ensure_result_list(results)
if len(results) == 0:
# If there are no results, an assertion about all results must fail.
raise AssertionError("No results retrieved")
for i, r in enumerate(results):
try:
assert_in('status', r)
assert_in(r['status'], label)
except AssertionError:
raise AssertionError('Test {}/{}: expected status {} not found in:\n{}'.format(
i + 1,
len(results),
label,
dumps(results, indent=1, default=lambda x: str(x))))
def assert_message(message, results):
"""Verify that each status dict in the results has a message
This only tests the message template string, and not a formatted message
with args expanded.
"""
results = ensure_result_list(results)
if len(results) == 0:
# If there are no results, an assertion about all results must fail.
raise AssertionError("No results retrieved")
for r in results:
assert_in('message', r)
m = r['message'][0] if isinstance(r['message'], tuple) else r['message']
assert_equal(m, message)
def _format_res(x):
return textwrap.indent(
dumps(x, indent=1, default=str, sort_keys=True),
prefix=" ")
def assert_result_count(results, n, **kwargs):
"""Verify specific number of results (matching criteria, if any)"""
count = 0
results = ensure_result_list(results)
for r in results:
if not len(kwargs):
count += 1
elif all(k in r and r[k] == v for k, v in kwargs.items()):
count += 1
if not n == count:
raise AssertionError(
'Got {} instead of {} expected results matching\n{}\nInspected {} record(s):\n{}'.format(
count,
n,
_format_res(kwargs),
len(results),
_format_res(results)))
def _check_results_in(should_contain, results, **kwargs):
results = ensure_result_list(results)
found = False
for r in results:
if all(k in r and r[k] == v for k, v in kwargs.items()):
found = True
break
if found ^ should_contain:
if should_contain:
msg = "Desired result\n{}\nnot found among\n{}"
else:
msg = "Result\n{}\nunexpectedly found among\n{}"
raise AssertionError(msg.format(_format_res(kwargs),
_format_res(results)))
def assert_in_results(results, **kwargs):
"""Verify that the particular combination of keys and values is found in
one of the results"""
_check_results_in(True, results, **kwargs)
def assert_not_in_results(results, **kwargs):
"""Verify that the particular combination of keys and values is not in any
of the results"""
_check_results_in(False, results, **kwargs)
def assert_result_values_equal(results, prop, values):
"""Verify that the values of all results for a given key in the status dicts
match the given sequence"""
results = ensure_result_list(results)
assert_equal(
[r[prop] for r in results],
values)
def assert_result_values_cond(results, prop, cond):
"""Verify that the values of all results for a given key in the status dicts
fulfill condition `cond`.
Parameters
----------
results:
prop: str
cond: callable
"""
results = ensure_result_list(results)
for r in results:
ok_(cond(r[prop]),
msg="r[{prop}]: {value}".format(prop=prop, value=r[prop]))
def ignore_nose_capturing_stdout(func):
"""DEPRECATED and will be removed soon. Does nothing!
Originally was intended as a decorator workaround for nose's behaviour
with redirecting sys.stdout, but now we monkey patch nose now so no test
should no longer be skipped.
See issue reported here:
https://code.google.com/p/python-nose/issues/detail?id=243&can=1&sort=-id&colspec=ID%20Type%20Status%20Priority%20Stars%20Milestone%20Owner%20Summary
"""
lgr.warning(
"@ignore_nose_capturing_stdout no longer does anything - nose should "
"just be monkey patched in setup_package. %s still has it",
func.__name__
)
return func
# Helper to run parametric test with possible combinations of batch and direct
with_parametric_batch = pytest.mark.parametrize("batch", [False, True])
# List of most obscure filenames which might or not be supported by different
# filesystems across different OSs. Start with the most obscure
OBSCURE_PREFIX = os.getenv('DATALAD_TESTS_OBSCURE_PREFIX', '')
# Those will be tried to be added to the base name if filesystem allows
OBSCURE_FILENAME_PARTS = [' ', '/', '|', ';', '&', '%b5', '{}', "'", '"', '<', '>']
UNICODE_FILENAME = u"ΞΠΧ§Ω
ΰΉγ"
# OSX is exciting -- some I guess FS might be encoding differently from decoding
# so Π might get recoded
# (ref: https://github.com/datalad/datalad/pull/1921#issuecomment-385809366)
if sys.getfilesystemencoding().lower() == 'utf-8':
if on_osx:
# TODO: figure it really out
UNICODE_FILENAME = UNICODE_FILENAME.replace(u"Π", u"")
if on_windows:
# TODO: really figure out unicode handling on windows
UNICODE_FILENAME = ''
if UNICODE_FILENAME:
OBSCURE_FILENAME_PARTS.append(UNICODE_FILENAME)
# space before extension, simple extension and trailing space to finish it up
OBSCURE_FILENAME_PARTS += [' ', '.datc', ' ']
@with_tempfile(mkdir=True)
def get_most_obscure_supported_name(tdir, return_candidates=False):
"""Return the most obscure filename that the filesystem would support under TEMPDIR
Parameters
----------
return_candidates: bool, optional
if True, return a tuple of (good, candidates) where candidates are "partially"
sorted from trickiest considered
TODO: we might want to use it as a function where we would provide tdir
"""
# we need separate good_base so we do not breed leading/trailing spaces
initial = good = OBSCURE_PREFIX
system = platform.system()
OBSCURE_FILENAMES = []
def good_filename(filename):
OBSCURE_FILENAMES.append(candidate)
try:
# Windows seems to not tollerate trailing spaces and
# ATM we do not distinguish obscure filename and dirname.
# So here we will test for both - being able to create dir
# with obscure name and obscure filename under
os.mkdir(opj(tdir, filename))
with open(opj(tdir, filename, filename), 'w') as f:
f.write("TEST LOAD")
return True
except:
lgr.debug("Filename %r is not supported on %s under %s",
filename, system, tdir)
return False
# incrementally build up the most obscure filename from parts
for part in OBSCURE_FILENAME_PARTS:
candidate = good + part
if good_filename(candidate):
good = candidate
if good == initial:
raise RuntimeError("Could not create any of the files under %s among %s"
% (tdir, OBSCURE_FILENAMES))
lgr.debug("Tested %d obscure filename candidates. The winner: %r", len(OBSCURE_FILENAMES), good)
if return_candidates:
return good, OBSCURE_FILENAMES[::-1]
else:
return good
OBSCURE_FILENAME, OBSCURE_FILENAMES = get_most_obscure_supported_name(return_candidates=True)
@optional_args
def with_testsui(t, responses=None, interactive=True):
"""Switch main UI to be 'tests' UI and possibly provide answers to be used"""
@wraps(t)
def _wrap_with_testsui(*args, **kwargs):
from datalad.ui import ui
old_backend = ui.backend
try:
ui.set_backend('tests' if interactive else 'tests-noninteractive')
if responses:
ui.add_responses(responses)
ret = t(*args, **kwargs)
if responses:
responses_left = ui.get_responses()
assert not len(responses_left), "Some responses were left not used: %s" % str(responses_left)
return ret
finally:
ui.set_backend(old_backend)
if not interactive and responses is not None:
raise ValueError("Non-interactive UI cannot provide responses")
return _wrap_with_testsui
with_testsui.__test__ = False
def assert_no_errors_logged(func, skip_re=None):
"""Decorator around function to assert that no errors logged during its execution"""
@wraps(func)
def _wrap_assert_no_errors_logged(*args, **kwargs):
with swallow_logs(new_level=logging.ERROR) as cml:
out = func(*args, **kwargs)
if cml.out:
if not (skip_re and re.search(skip_re, cml.out)):
raise AssertionError(
"Expected no errors to be logged, but log output is %s"
% cml.out
)
return out
return _wrap_assert_no_errors_logged
def get_mtimes_and_digests(target_path):
"""Return digests (md5) and mtimes for all the files under target_path"""
from datalad.support.digests import Digester
from datalad.utils import find_files
digester = Digester(['md5'])
# bother only with existing ones for this test, i.e. skip annexed files without content
target_files = [
f for f in find_files('.*', topdir=target_path, exclude_vcs=False, exclude_datalad=False)
if exists(f)
]
# let's leave only relative paths for easier analysis
target_files_ = [relpath(f, target_path) for f in target_files]
digests = {frel: digester(f) for f, frel in zip(target_files, target_files_)}
mtimes = {frel: os.stat(f).st_mtime for f, frel in zip(target_files, target_files_)}
return digests, mtimes
def get_datasets_topdir():
"""Delayed parsing so it could be monkey patched etc"""
from datalad.consts import DATASETS_TOPURL
return RI(DATASETS_TOPURL).hostname
def assert_repo_status(path, annex=None, untracked_mode='normal', **kwargs):
"""Compare a repo status against (optional) exceptions.
Anything file/directory that is not explicitly indicated must have
state 'clean', i.e. no modifications and recorded in Git.
Parameters
----------
path: str or Repo
in case of a str: path to the repository's base dir;
Note, that passing a Repo instance prevents detecting annex. This might
be useful in case of a non-initialized annex, a GitRepo is pointing to.
annex: bool or None
explicitly set to True or False to indicate, that an annex is (not)
expected; set to None to autodetect, whether there is an annex.
Default: None.
untracked_mode: {'no', 'normal', 'all'}
If and how untracked content is reported. The specification of untracked
files that are OK to be found must match this mode. See `Repo.status()`
**kwargs
Files/directories that are OK to not be in 'clean' state. Each argument
must be one of 'added', 'untracked', 'deleted', 'modified' and each
value must be a list of filenames (relative to the root of the
repository, in POSIX convention).
"""
r = None
if isinstance(path, AnnexRepo):
if annex is None:
annex = True
# if `annex` was set to False, but we find an annex => fail
assert_is(annex, True)
r = path
elif isinstance(path, GitRepo):
if annex is None:
annex = False
# explicitly given GitRepo instance doesn't make sense with
# 'annex' True
assert_is(annex, False)
r = path
else:
# 'path' is an actual path
try:
r = AnnexRepo(path, init=False, create=False)
if annex is None:
annex = True
# if `annex` was set to False, but we find an annex => fail
assert_is(annex, True)
except Exception:
# Instantiation failed => no annex
try:
r = GitRepo(path, init=False, create=False)
except Exception:
raise AssertionError("Couldn't find an annex or a git "
"repository at {}.".format(path))
if annex is None:
annex = False
# explicitly given GitRepo instance doesn't make sense with
# 'annex' True
assert_is(annex, False)
status = r.status(untracked=untracked_mode)
# for any file state that indicates some kind of change (all but 'clean)
for state in ('added', 'untracked', 'deleted', 'modified'):
oktobefound = sorted(r.pathobj.joinpath(ut.PurePosixPath(p))
for p in kwargs.get(state, []))
state_files = sorted(k for k, v in status.items()
if v.get('state', None) == state)
eq_(state_files, oktobefound,
'unexpected content of state "%s": %r != %r'
% (state, state_files, oktobefound))
def get_convoluted_situation(path, repocls=AnnexRepo):
from datalad.api import create
ckwa = dict(result_renderer='disabled')
#if 'APPVEYOR' in os.environ:
# # issue only happens on appveyor, Python itself implodes
# # cannot be reproduced on a real windows box
# pytest.skip(
# 'get_convoluted_situation() causes appveyor to crash, '
# 'reason unknown')
repo = repocls(path, create=True)
# use create(force) to get an ID and config into the empty repo
# Pass explicit `annex` to ensure that GitRepo does get .noannex
ds = Dataset(path).create(force=True, annex=repocls is AnnexRepo, **ckwa)
# base content
create_tree(
ds.path,
{
'.gitignore': '*.ignored',
'subdir': {
'file_clean': 'file_clean',
'file_deleted': 'file_deleted',
'file_modified': 'file_clean',
},
'subdir-only-ignored': {
'1.ignored': '',
},
'file_clean': 'file_clean',
'file_deleted': 'file_deleted',
'file_staged_deleted': 'file_staged_deleted',
'file_modified': 'file_clean',
}
)
if isinstance(ds.repo, AnnexRepo):
create_tree(
ds.path,
{
'subdir': {
'file_dropped_clean': 'file_dropped_clean',
},
'file_dropped_clean': 'file_dropped_clean',
}
)
ds.save(**ckwa)
if isinstance(ds.repo, AnnexRepo):
# some files straight in git
create_tree(
ds.path,
{
'subdir': {
'file_ingit_clean': 'file_ingit_clean',
'file_ingit_modified': 'file_ingit_clean',
},
'file_ingit_clean': 'file_ingit_clean',
'file_ingit_modified': 'file_ingit_clean',
}
)
ds.save(to_git=True, **ckwa)
ds.drop([
'file_dropped_clean',
opj('subdir', 'file_dropped_clean')],
reckless='kill', **ckwa)
# clean and proper subdatasets
ds.create('subds_clean', **ckwa)
ds.create(opj('subdir', 'subds_clean'), **ckwa)
ds.create('subds_unavailable_clean', **ckwa)
ds.create(opj('subdir', 'subds_unavailable_clean'), **ckwa)
# uninstall some subdatasets (still clean)
ds.drop([
'subds_unavailable_clean',
opj('subdir', 'subds_unavailable_clean')],
what='all', reckless='kill', recursive=True, **ckwa)
assert_repo_status(ds.path)
# make a dirty subdataset
ds.create('subds_modified', **ckwa)
ds.create(opj('subds_modified', 'someds'), **ckwa)
ds.create(opj('subds_modified', 'someds', 'dirtyds'), **ckwa)
# make a subdataset with additional commits
ds.create(opj('subdir', 'subds_modified'), **ckwa)
pdspath = opj(ds.path, 'subdir', 'subds_modified', 'progressedds')
ds.create(pdspath, **ckwa)
create_tree(
pdspath,
{'file_clean': 'file_ingit_clean'}
)
Dataset(pdspath).save(**ckwa)
assert_repo_status(pdspath)
# staged subds, and files
create(opj(ds.path, 'subds_added'), **ckwa)
# use internal helper to get subdataset into an 'added' state
# that would not happen in standard datalad workflows
list(ds.repo._save_add_submodules([ds.pathobj / 'subds_added']))
create(opj(ds.path, 'subdir', 'subds_added'), **ckwa)
list(ds.repo._save_add_submodules([ds.pathobj / 'subdir' / 'subds_added']))
# some more untracked files
create_tree(
ds.path,
{
'subdir': {
'file_untracked': 'file_untracked',
'file_added': 'file_added',
},
'file_untracked': 'file_untracked',
'file_added': 'file_added',
'dir_untracked': {
'file_untracked': 'file_untracked',
},
'subds_modified': {
'someds': {
"dirtyds": {
'file_untracked': 'file_untracked',
},
},
},
}
)
ds.repo.add(['file_added', opj('subdir', 'file_added')])
# untracked subdatasets
create(opj(ds.path, 'subds_untracked'), **ckwa)
create(opj(ds.path, 'subdir', 'subds_untracked'), **ckwa)
# deleted files
os.remove(opj(ds.path, 'file_deleted'))
os.remove(opj(ds.path, 'subdir', 'file_deleted'))
# staged deletion
ds.repo.remove('file_staged_deleted')
# modified files
if isinstance(ds.repo, AnnexRepo):
ds.repo.unlock(['file_modified', opj('subdir', 'file_modified')])
create_tree(
ds.path,
{
'subdir': {
'file_ingit_modified': 'file_ingit_modified',
},
'file_ingit_modified': 'file_ingit_modified',
}
)
create_tree(
ds.path,
{
'subdir': {
'file_modified': 'file_modified',
},
'file_modified': 'file_modified',
}
)
return ds
def get_deeply_nested_structure(path):
""" Here is what this does (assuming UNIX, locked):
| .
| βββ directory_untracked
| β βββ link2dir -> ../subdir
| βββ OBSCURE_FILENAME_file_modified
| βββ link2dir -> subdir
| βββ link2subdsdir -> subds_modified/subdir
| βββ link2subdsroot -> subds_modified
| βββ subdir
| β βββ annexed_file.txt -> ../.git/annex/objects/...
| β βββ file_modified
| β βββ git_file.txt
| β βββ link2annex_files.txt -> annexed_file.txt
| βββ subds_modified
| βββ link2superdsdir -> ../subdir
| βββ subdir
| β βββ annexed_file.txt -> ../.git/annex/objects/...
| βββ subds_lvl1_modified
| βββ OBSCURE_FILENAME_directory_untracked
| βββ untracked_file
When a system has no symlink support, the link2... components are not
included.
"""
ds = Dataset(path).create()
(ds.pathobj / 'subdir').mkdir()
(ds.pathobj / 'subdir' / 'annexed_file.txt').write_text(u'dummy')
ds.save()
(ds.pathobj / 'subdir' / 'git_file.txt').write_text(u'dummy')
ds.save(to_git=True)
# a subtree of datasets
subds = ds.create('subds_modified')
# another dataset, plus an additional dir in it
ds.create(opj('subds_modified', 'subds_lvl1_modified'))
create_tree(
ds.path,
{
'subdir': {
'file_modified': 'file_modified',
},
OBSCURE_FILENAME + u'file_modified_': 'file_modified',
}
)
create_tree(
str(ds.pathobj / 'subds_modified' / 'subds_lvl1_modified'),
{OBSCURE_FILENAME + u'_directory_untracked': {"untracked_file": ""}}
)
(ut.Path(subds.path) / 'subdir').mkdir()
(ut.Path(subds.path) / 'subdir' / 'annexed_file.txt').write_text(u'dummy')
subds.save()
(ds.pathobj / 'directory_untracked').mkdir()
if not has_symlink_capability():
return ds
# symlink farm #1
# symlink to annexed file
(ds.pathobj / 'subdir' / 'link2annex_files.txt').symlink_to(
'annexed_file.txt')
# symlink to directory within the dataset
(ds.pathobj / 'link2dir').symlink_to('subdir')
# upwards pointing symlink to directory within the same dataset
(ds.pathobj / 'directory_untracked' / 'link2dir').symlink_to(
opj('..', 'subdir'))
# symlink pointing to a subdataset mount in the same dataset
(ds.pathobj / 'link2subdsroot').symlink_to('subds_modified')
# symlink to a dir in a subdataset (across dataset boundaries)
(ds.pathobj / 'link2subdsdir').symlink_to(
opj('subds_modified', 'subdir'))
# symlink to a dir in a superdataset (across dataset boundaries)
(ut.Path(subds.path) / 'link2superdsdir').symlink_to(
opj('..', 'subdir'))
return ds
def maybe_adjust_repo(repo):
"""Put repo into an adjusted branch if it is not already.
"""
if not repo.is_managed_branch():
repo.call_annex(["upgrade"])
repo.config.reload(force=True)
repo.adjust()
@lru_cache()
@with_tempfile
@with_tempfile
def has_symlink_capability(p1, p2):
path = ut.Path(p1)
target = ut.Path(p2)
return utils.check_symlink_capability(path, target)
def skip_wo_symlink_capability(func):
"""Skip test when environment does not support symlinks
Perform a behavioral test instead of top-down logic, as on
windows this could be on or off on a case-by-case basis.
"""
@wraps(func)
@attr('skip_wo_symlink_capability')
def _wrap_skip_wo_symlink_capability(*args, **kwargs):
if not has_symlink_capability():
pytest.skip("no symlink capabilities")
return func(*args, **kwargs)
return _wrap_skip_wo_symlink_capability
_TESTS_ADJUSTED_TMPDIR = None
def skip_if_adjusted_branch(func):
"""Skip test if adjusted branch is used by default on TMPDIR file system.
"""
@wraps(func)
@attr('skip_if_adjusted_branch')
def _wrap_skip_if_adjusted_branch(*args, **kwargs):
global _TESTS_ADJUSTED_TMPDIR
if _TESTS_ADJUSTED_TMPDIR is None:
@with_tempfile
def _check(path):
ds = Dataset(path).create(force=True)
return ds.repo.is_managed_branch()
_TESTS_ADJUSTED_TMPDIR = _check()
if _TESTS_ADJUSTED_TMPDIR:
pytest.skip("Test incompatible with adjusted branch default")
return func(*args, **kwargs)
return _wrap_skip_if_adjusted_branch
def get_ssh_port(host):
"""Get port of `host` in ssh_config.
Our tests depend on the host being defined in ssh_config, including its
port. This method can be used by tests that want to check handling of an
explicitly specified
Note that if `host` does not match a host in ssh_config, the default value
of 22 is returned.
Skips test if port cannot be found.
Parameters
----------
host : str
Returns
-------
port (int)
"""
out = ''
runner = WitlessRunner()
try:
res = runner.run(["ssh", "-G", host], protocol=StdOutErrCapture)
out = res["stdout"]
err = res["stderr"]
except Exception as exc:
err = str(exc)
port = None
for line in out.splitlines():
if line.startswith("port "):
try:
port = int(line.split()[1])
except Exception as exc:
err = str(exc)
break
if port is None:
pytest.skip("port for {} could not be determined: {}"
.format(host, err))
return port
#
# Context Managers
#
def patch_config(vars):
"""Patch our config with custom settings. Returns mock.patch cm
Only the merged configuration from all sources (global, local, dataset)
will be patched. Source-constrained patches (e.g. only committed dataset
configuration) are not supported.
"""
return patch.dict(dl_cfg._merged_store, vars)
@contextmanager
def set_date(timestamp):
"""Temporarily override environment variables for git/git-annex dates.
Parameters
----------
timestamp : int
Unix timestamp.
"""
git_ts = "@{} +0000".format(timestamp)
with patch.dict("os.environ",
{"GIT_COMMITTER_DATE": git_ts,
"GIT_AUTHOR_DATE": git_ts,
"GIT_ANNEX_VECTOR_CLOCK": str(timestamp),
"DATALAD_FAKE__DATES": "0"}):
yield
@contextmanager
def set_annex_version(version: Optional[str]):
"""Override the git-annex version.
This temporarily masks the git-annex version present in external_versions
and make AnnexRepo forget its cached version information.
"""
from datalad.support.annexrepo import AnnexRepo
ar_vers = AnnexRepo.git_annex_version
with patch.dict(
"datalad.support.annexrepo.external_versions._versions",
{"cmd:annex": LooseVersion(version) if version else version}):
try:
AnnexRepo.git_annex_version = None
yield
finally:
AnnexRepo.git_annex_version = ar_vers
#
# Test tags
#
# To be explicit, and not "loose" some tests due to typos, decided to make
# explicit decorators for common types
def integration(f):
"""Mark test as an "integration" test which generally is not needed to be run
Generally tend to be slower.
Should be used in combination with @slow and @turtle if that is the case.
"""
return attr('integration')(f)
def slow(f):
"""Mark test as a slow, although not necessarily integration or usecase test
Rule of thumb cut-off to mark as slow is 10 sec
"""
return attr('slow')(f)
def turtle(f):
"""Mark test as very slow, meaning to not run it on Travis due to its
time limit
Rule of thumb cut-off to mark as turtle is 2 minutes
"""
return attr('turtle')(f)
def usecase(f):
"""Mark test as a usecase user ran into and which (typically) caused bug report
to be filed/troubleshooted
Should be used in combination with @slow and @turtle if slow.
"""
return attr('usecase')(f)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/utils_testdatasets.py 0000644 0001751 0001751 00000002753 15137634221 021625 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from os.path import join as opj
from datalad.distribution.dataset import Dataset
def _make_dataset_hierarchy(path):
origin = Dataset(path).create()
origin_sub1 = origin.create('sub1')
origin_sub2 = origin_sub1.create('sub2')
with open(opj(origin_sub2.path, 'file_in_annex.txt'), "w") as f:
f.write('content2')
origin_sub3 = origin_sub2.create('sub3')
with open(opj(origin_sub3.path, 'file_in_annex.txt'), "w") as f:
f.write('content3')
origin_sub4 = origin_sub3.create('sub4')
origin.save(recursive=True)
return origin, origin_sub1, origin_sub2, origin_sub3, origin_sub4
def _mk_submodule_annex(path, fname, fcontent):
ca = dict(result_renderer='disabled')
# a remote dataset with a subdataset underneath
origds = Dataset(path).create(**ca)
(origds.pathobj / fname).write_text(fcontent)
# naming is weird, but a legacy artifact
s1 = origds.create('subm 1', **ca)
(s1.pathobj / fname).write_text(fcontent)
s2 = origds.create('2', **ca)
(s2.pathobj / fname).write_text(fcontent)
origds.save(recursive=True, **ca)
return origds
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/tests/utils_testrepos.py 0000644 0001751 0001751 00000016125 15137634221 021143 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import os
import tempfile
from abc import (
ABCMeta,
abstractmethod,
)
from os.path import exists
from os.path import join as opj
from datalad import cfg as dl_cfg
from datalad.customremotes.base import init_datalad_remote
from .. import __version__
from ..support.annexrepo import AnnexRepo
from ..support.external_versions import external_versions
from ..support.gitrepo import GitRepo
from ..support.network import get_local_file_url
from ..utils import (
swallow_logs,
swallow_outputs,
)
from . import _TEMP_PATHS_GENERATED
from .utils_pytest import get_tempfile_kwargs
# eventually become a URL to a local file served via http
# that can be used for http/url-based testing
remote_file_url = None
class TestRepo(object, metaclass=ABCMeta):
REPO_CLASS = None # Assign to the class to be used in the subclass
def __init__(self, path=None, puke_if_exists=True):
if not path:
path = \
tempfile.mktemp(**get_tempfile_kwargs(
{'dir': dl_cfg.get("datalad.tests.temp.dir")},
prefix='testrepo'))
# to be removed upon teardown
_TEMP_PATHS_GENERATED.append(path)
if puke_if_exists and exists(path):
raise RuntimeError("Directory %s for test repo already exist" % path)
# swallow logs so we don't print all those about crippled FS etc
with swallow_logs():
self.repo = self.REPO_CLASS(path)
# For additional testing of our datalad remote to not interfere
# and manage to handle all http urls and requests:
if self.REPO_CLASS is AnnexRepo and \
os.environ.get('DATALAD_TESTS_DATALADREMOTE'):
init_datalad_remote(self.repo, 'datalad', autoenable=True)
self._created = False
@property
def path(self):
return self.repo.path
@property
def url(self):
return get_local_file_url(self.path, compatibility='git')
def create_file(self, name, content, add=True, annex=False):
filename = opj(self.path, name)
with open(filename, 'wb') as f:
f.write(content.encode())
if add:
if annex:
if isinstance(self.repo, AnnexRepo):
self.repo.add(name)
else:
raise ValueError("Can't annex add to a non-annex repo.")
else:
self.repo.add(name, git=True)
def create(self):
if self._created:
assert(exists(self.path))
return # was already done
with swallow_outputs(): # we don't need those outputs at this point
self.populate()
self._created = True
@abstractmethod
def populate(self):
raise NotImplementedError("Should be implemented in sub-classes")
class BasicAnnexTestRepo(TestRepo):
"""Creates a basic test git-annex repository"""
REPO_CLASS = AnnexRepo
def populate(self):
global remote_file_url
if not remote_file_url:
# we need a local file, that is server via a URL
from datalad.conftest import test_http_server
remote_file_name = 'testrepo-annex.dat'
with open(opj(test_http_server.path, remote_file_name), "w") as f:
f.write("content to be annex-addurl'd")
remote_file_url = '{}/{}'.format(test_http_server.url, remote_file_name)
self.create_info_file()
self.create_file('test.dat', '123\n', annex=False)
self.repo.commit("Adding a basic INFO file and rudimentary load file for annex testing")
self.repo.add_url_to_file("test-annex.dat", remote_file_url)
self.repo.commit("Adding a rudimentary git-annex load file")
self.repo.drop("test-annex.dat") # since available from URL
def create_info_file(self):
annex_version = external_versions['cmd:annex']
git_version = external_versions['cmd:git']
self.create_file('INFO.txt',
"Testrepo: %s\n"
"git: %s\n"
"annex: %s\n"
"datalad: %s\n"
% (self.__class__, git_version, annex_version, __version__),
annex=False)
class BasicGitTestRepo(TestRepo):
"""Creates a basic test git repository."""
REPO_CLASS = GitRepo
def populate(self):
self.create_info_file()
self.create_file('test.dat', '123\n', annex=False)
self.repo.commit("Adding a basic INFO file and rudimentary "
"load file.")
def create_info_file(self):
git_version = external_versions['cmd:git']
self.create_file('INFO.txt',
"Testrepo: %s\n"
"git: %s\n"
"datalad: %s\n"
% (self.__class__, git_version, __version__),
annex=False)
class SubmoduleDataset(BasicAnnexTestRepo):
def populate(self):
super(SubmoduleDataset, self).populate()
# add submodules
annex = BasicAnnexTestRepo()
annex.create()
kw = dict(expect_stderr=True)
self.repo.call_git(
['submodule', 'add', annex.url, 'subm 1'], **kw)
self.repo.call_git(
['submodule', 'add', annex.url, '2'], **kw)
self.repo.commit('Added subm 1 and 2.')
self.repo.call_git(
['submodule', 'update', '--init', '--recursive'], **kw)
# init annex in subdatasets
for s in ('subm 1', '2'):
AnnexRepo(opj(self.path, s), init=True)
class NestedDataset(BasicAnnexTestRepo):
def populate(self):
super(NestedDataset, self).populate()
ds = SubmoduleDataset()
ds.create()
kw = dict(expect_stderr=True)
self.repo.call_git(
['submodule', 'add', ds.url, 'sub dataset1'], **kw)
self.repo.call_git(
['-C', opj(self.path, 'sub dataset1'),
'submodule', 'add', ds.url, 'sub sub dataset1'],
**kw)
GitRepo(opj(self.path, 'sub dataset1')).commit('Added sub dataset.')
self.repo.commit('Added subdatasets.', options=["-a"])
self.repo.call_git(
['submodule', 'update', '--init', '--recursive'],
**kw)
# init all annexes
for s in ('', 'sub dataset1', opj('sub dataset1', 'sub sub dataset1')):
AnnexRepo(opj(self.path, s), init=True)
class InnerSubmodule(object):
def __init__(self):
self._ds = NestedDataset()
@property
def path(self):
return opj(self._ds.path, 'sub dataset1', 'subm 1')
@property
def url(self):
return get_local_file_url(self.path, compatibility='git')
def create(self):
self._ds.create()
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/typing.py 0000644 0001751 0001751 00000001346 15137634221 016042 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
import sys
from typing import (
Concatenate,
Literal,
ParamSpec,
Protocol,
TypedDict,
TypeVar,
)
if sys.version_info >= (3, 11):
from typing import Self
else:
from typing_extensions import Self
__all__ = ["Literal", "ParamSpec", "T", "K", "V", "P"]
T = TypeVar("T")
K = TypeVar("K")
V = TypeVar("V")
P = ParamSpec("P")
././@PaxHeader 0000000 0000000 0000000 00000000033 00000000000 010211 x ustar 00 27 mtime=1769945274.894061
datalad-1.3.1/datalad/ui/ 0000755 0001751 0001751 00000000000 15137634273 014576 5 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/__init__.py 0000644 0001751 0001751 00000007130 15137634221 016701 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Interactive User Interface (as Dialog/GUI/etc) support
"""
__docformat__ = 'restructuredtext'
from logging import getLogger
lgr = getLogger('datalad.ui')
lgr.log(5, "Starting importing ui")
from ..utils import (
get_ipython_shell,
is_interactive,
)
from .dialog import (
ConsoleLog,
DialogUI,
IPythonUI,
QuietConsoleLog,
SilentConsoleLog,
UnderAnnexUI,
UnderTestsUI,
)
KNOWN_BACKENDS = {
'console': ConsoleLog,
'dialog': DialogUI,
'ipython': IPythonUI,
'annex': UnderAnnexUI,
'tests': UnderTestsUI,
'tests-noninteractive': QuietConsoleLog,
'no-progress': SilentConsoleLog,
}
# TODO: implement logic on selection of the ui based on the cfg and environment
# e.g. we cannot use DialogUI if session is not interactive
# TODO: GitAnnexUI where interactive queries (such as question) should get to the
# user by proxying some other appropriate (cmdline or GUI) UI, while others, such
# as reporting on progress etc -- should get back to the annex
# TODO: singleton
class _UI_Switcher(object):
"""
Poor man helper to switch between different backends at run-time.
"""
def __init__(self, backend=None):
self._backend = None
self._ui = None
self.set_backend(backend)
def set_backend(self, backend):
if backend and (backend == self._backend):
lgr.debug("not changing backend since the same %s", backend)
return
if backend is None:
# Might be IPython
ipython_shell = get_ipython_shell()
if ipython_shell:
# Good old ipython would have TerminalInteractiveShell
if ipython_shell.__class__.__name__ in ('ZMQInteractiveShell',):
backend = 'ipython'
# well -- this will not even be printed yet since unlikely
# the lgr handlers were set already
lgr.info(
"Detected IPython session. Setting UI backend to %r. "
"If this is not a web IPython notebook session, you "
"might like to datalad.ui.ui.set_backend('dialog'). "
"Other known UI backends: %s",
backend, ', '.join(KNOWN_BACKENDS))
else:
backend = 'dialog'
else:
backend = 'dialog' if is_interactive() else 'no-progress'
self._ui = KNOWN_BACKENDS[backend]()
lgr.debug("UI set to %s", self._ui)
self._backend = backend
@property
def backend(self):
return self._backend
@property
def ui(self):
return self._ui
# Delegate other methods to the actual UI
def __getattribute__(self, key):
if key.startswith('_') or key in {'set_backend', 'backend', 'ui'}:
return super(_UI_Switcher, self).__getattribute__(key)
return getattr(self._ui, key)
def __setattr__(self, key, value):
if key.startswith('_') or key in {'set_backend', 'backend', 'ui'}:
return super(_UI_Switcher, self).__setattr__(key, value)
return setattr(self._ui, key, value)
lgr.log(5, "Initiating UI switcher")
ui = _UI_Switcher()
lgr.log(5, "Done importing ui")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/base.py 0000644 0001751 0001751 00000002567 15137634221 016065 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Base classes for UI
"""
__docformat__ = 'restructuredtext'
from abc import (
ABCMeta,
abstractmethod,
)
from ..utils import auto_repr
@auto_repr
class InteractiveUI(object, metaclass=ABCMeta):
"""Semi-abstract class for interfaces to implement interactive UI"""
@abstractmethod
def question(self, text,
title=None, choices=None,
default=None,
hidden=False,
repeat=None):
pass
def yesno(self, *args, **kwargs):
# Provide some default sugaring
default = kwargs.pop('default', None)
if default is not None:
if default in {True}:
default = 'yes'
elif default in {False}:
default = 'no'
kwargs['default'] = default
response = self.question(*args, choices=['yes', 'no'], **kwargs).rstrip('\n')
assert response in {'yes', 'no'}, "shouldn't happen; question() failed"
return response == 'yes'
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/dialog.py 0000644 0001751 0001751 00000035407 15137634221 016411 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Basic dialog-like interface for interactions in the terminal window
"""
__docformat__ = 'restructuredtext'
from logging import getLogger
lgr = getLogger('datalad.ui.dialog')
lgr.log(5, "Starting importing ui.dialog")
import getpass
import os
import sys
import time
#!!! OPT adds >100ms to import time!!!
# from unittest.mock import patch
from collections import deque
from copy import copy
from datalad.support.exceptions import CapturedException
from ..utils import (
auto_repr,
on_windows,
)
from .base import InteractiveUI
from .utils import can_prompt
# Example APIs which might be useful to look for "inspiration"
# man debconf-devel
# man zenity
#
# "Fancy" output of progress etc in the terminal:
# - docker has multiple simultaneous progressbars. Apparently "navigation"
# is obtained with escape characters in the terminal.
# see docker/pkg/jsonmessage/jsonmessage.go or following snippet
#
#from time import sleep
#import sys
#
#out = sys.stderr
#for i in range(10):
# diff = 2
# if i:
# out.write("%c[%dA" % (27, diff))
# out.write("%d\n%d\n" % (i, i ** 2))
# sleep(0.5)
#
# They also use JSON representation for the message which might provide a nice abstraction
# Other useful codes
# // [2K = erase entire current line
# fmt.Fprintf(out, "%c[2K\r", 27)
# and code in docker: pkg/progressreader/progressreader.go pkg/streamformatter/streamformatter.go
#
# reference for ESC codes: http://ascii-table.com/ansi-escape-sequences.php
@auto_repr
class ConsoleLog(object):
progressbars = None
def __init__(self, out=sys.stdout):
self.out = out
def message(self, msg, cr='\n'):
from datalad.log import no_progress
with no_progress():
try:
self.out.write(msg)
except UnicodeEncodeError as e:
# all unicode magic has failed and the receiving end cannot handle
# a particular unicode char. rather than crashing, we replace the
# offending chars to be able to message at least something, and we
# log that we did that
encoding = self.out.encoding
lgr.debug(
"Replacing unicode chars in message output for display: %s",
e)
self.out.write(
msg.encode(encoding, "replace").decode(encoding))
if cr:
self.out.write(cr)
def error(self, error):
self.message("ERROR: %s" % error)
def get_progressbar(self, *args, **kwargs):
"""Return a progressbar. See e.g. `tqdmProgressBar` about the interface
Additional parameter is backend to choose among available
"""
backend = kwargs.pop('backend', None)
# Delay imports of progressbars until actually needed
if ConsoleLog.progressbars is None:
from .progressbars import progressbars
ConsoleLog.progressbars = progressbars
else:
progressbars = ConsoleLog.progressbars
if backend is None:
# Resort to the configuration
from .. import cfg
backend = cfg.get('datalad.ui.progressbar', None)
if backend is None:
try:
pbar = progressbars['tqdm']
except KeyError:
pbar = progressbars.values()[0] # any
else:
pbar = progressbars[backend]
return pbar(*args, out=self.out, **kwargs)
@property
def is_interactive(self):
return isinstance(self, InteractiveUI)
@auto_repr
class SilentConsoleLog(ConsoleLog):
"""A ConsoleLog with a SilentProgressbar"""
def get_progressbar(self, *args, **kwargs):
from .progressbars import SilentProgressBar
return SilentProgressBar(*args, **kwargs)
def question(self, text, title=None, **kwargs):
msg = "A non-interactive silent UI was asked for a response to a question: %s." % text
if title is not None:
msg += ' Title: %s.' % title
if not kwargs.get('hidden'):
kwargs_str = ', '.join(
('%s=%r' % (k, v)
for k, v in kwargs.items()
if v is not None))
if kwargs_str:
msg += " Additional arguments: %s" % kwargs_str
else:
msg += " Additional arguments are not shown because 'hidden' is set."
raise RuntimeError(msg)
@auto_repr
class QuietConsoleLog(ConsoleLog):
"""A ConsoleLog with a LogProgressbar"""
def get_progressbar(self, *args, **kwargs):
from .progressbars import LogProgressBar
return LogProgressBar(*args, **kwargs)
def getpass_echo(prompt='Password', stream=None):
"""Q&D workaround until we have proper 'centralized' UI -- just use getpass BUT enable echo
"""
if on_windows:
# Can't do anything fancy yet, so just ask the one without echo
return getpass.getpass(prompt=prompt, stream=stream)
else:
# We can mock patch termios so that ECHO is not turned OFF.
# Side-effect -- additional empty line is printed
# def _no_emptyline_write(out):
# # Additional mock to prevent not needed empty line print since we do have echo
# # doesn't work since we don't know the stream here really
# if out == '\n':
# return
# stream.write(out)
from unittest.mock import patch
with patch('termios.ECHO', 255 ** 2):
#patch.object(stream, 'write', _no_emptyline_write(stream)):
return getpass.getpass(prompt=prompt, stream=stream)
def _get_value(value, hidden):
return "" if hidden else value
@auto_repr
class DialogUI(ConsoleLog, InteractiveUI):
def __init__(self, *args, **kwargs):
super(DialogUI, self).__init__(*args, **kwargs)
# ATM doesn't make sense to print the same title for subsequent questions
# so we will store previous one and not show it if was the previous one shown
# within 5 seconds from prev question
self._prev_title = None
self._prev_title_time = 0
def input(self, prompt, hidden=False):
"""Request user input
Parameters
----------
prompt: str
Prompt for the entry
"""
# if not hidden:
# self.out.write(msg + ": ")
# self.out.flush() # not effective for stderr for some reason under annex
#
# # TODO: raw_input works only if stdin was not controlled by
# # (e.g. if coming from annex). So we might need to do the
# # same trick as get_pass() does while directly dealing with /dev/pty
# # and provide per-OS handling with stdin being override
# response = (raw_input if PY2 else input)()
# else:
return (getpass.getpass if hidden else getpass_echo)(prompt)
def question(self, text,
title=None, choices=None,
default=None,
hidden=False,
repeat=None):
# Do initial checks first
if default and choices and default not in choices:
raise ValueError("default value %r is not among choices: %s"
% (_get_value(default, hidden), choices))
msg = ''
if title and not (title == self._prev_title and time.time() - self._prev_title_time < 5):
# might not actually get displayed if all in/out redirected
# self.out.write(title + "\n")
# so merge into msg for getpass
msg += title + os.linesep
def mark_default(x):
return "[%s]" % x \
if default is not None and x == default \
else x
if choices is not None:
msg += "%s (choices: %s)" % (text, ', '.join(map(mark_default, choices)))
elif default is not None:
msg += '{} [{}]'.format(text, default)
else:
msg += text
# Like this:
#Anaconda format:
#
#Question? [choice1|choice2]
#[default] >>> yes
attempt = 0
while True:
attempt += 1
if attempt >= 100:
raise RuntimeError("This is 100th attempt. Something really went wrong")
response = self.input("{}: ".format(msg), hidden=hidden)
# TODO: dedicated option? got annoyed by this one
# multiple times already, typically we are not defining
# new credentials where repetition would be needed.
if hidden and repeat is None:
repeat = hidden and choices is None
if repeat:
response_r = self.input('{} (repeat): '.format(msg), hidden=hidden)
if response != response_r:
self.error("input mismatch, please start over")
continue
if response and '\x03' in response:
# Ctrl-C is part of the response -> clearly we should not pretend it's all good
raise KeyboardInterrupt
if not response and default:
response = default
break
if choices and response not in choices:
self.error("%r is not among choices: %s. Repeat your answer"
% (_get_value(response, hidden), choices))
continue
break
self._prev_title = title
self._prev_title_time = time.time()
return response
class IPythonUI(DialogUI):
"""Custom to IPython frontend UI implementation
There is no way to discriminate between web notebook or qt console,
so we have just a single class for all.
TODO: investigate how to provide 'proper' displays for
IPython of progress bars so backend could choose the
appropriate one
"""
_tqdm_frontend = "unknown"
def input(self, prompt, hidden=False):
# We cannot and probably do not need to "abuse" termios
if not hidden:
self.out.write(prompt)
self.out.flush()
return input()
else:
return getpass.getpass(prompt=prompt)
def get_progressbar(self, *args, **kwargs):
"""Return a progressbar. See e.g. `tqdmProgressBar` about the
interface
Additional parameter is backend to choose among available
"""
backend = kwargs.pop('backend', None)
if self._tqdm_frontend == "unknown":
try:
from tqdm import tqdm_notebook # check if available etc
self.__class__._tqdm_frontend = 'ipython'
except Exception as exc:
lgr.warning(
"Regular progressbar will be used -- cannot import tqdm_notebook: %s",
CapturedException(exc)
)
self.__class__._tqdm_frontend = None
if self._tqdm_frontend:
kwargs.update()
return super(IPythonUI, self).get_progressbar(
*args, frontend=self._tqdm_frontend, **kwargs)
# poor man thingie for now
@auto_repr
class UnderAnnexUI(DialogUI):
def __init__(self, specialremote=None, **kwargs):
if 'out' not in kwargs:
# to avoid buffering
# http://stackoverflow.com/a/181654/1265472
#kwargs['out'] = os.fdopen(sys.stderr.fileno(), 'w', 0)
# but wasn't effective! sp kist straogjt for now
kwargs['out'] = sys.stderr
super(UnderAnnexUI, self).__init__(**kwargs)
self.specialremote = specialremote
def set_specialremote(self, specialremote):
lgr.debug("Setting specialremote of UI %s to %s", self, specialremote)
self.specialremote = specialremote
def get_progressbar(self, *args, **kwargs):
if self.specialremote:
kwargs = kwargs.copy()
kwargs['backend'] = 'annex-remote'
kwargs['remote'] = self.specialremote
return super(UnderAnnexUI, self).get_progressbar(
*args, **kwargs)
def input(self, prompt, hidden=False):
if not can_prompt():
# we are not interactive
raise RuntimeError('Interactive input not available for `ui.input()` in annex remotes')
return super(UnderAnnexUI, self).input(prompt, hidden)
def question(self,
text,
title=None,
choices=None,
default=None,
hidden=False,
repeat=None):
if not can_prompt():
# need to do a more specific that `is_interactive` check, since `is_interactive` checks
# all streams including stdin/out which are to "talk" to git-annex, and thus not tty.
raise RuntimeError('A terminal required for interactive input in annex remotes')
return super(UnderAnnexUI, self).question(
text,
title=title,
choices=choices,
default=default,
hidden=hidden,
repeat=repeat
)
@auto_repr
class UnderTestsUI(DialogUI):
"""UI to help with testing functionality requiring interaction
It will provide additional method to push responses to be provided,
and could be used as a context manager
"""
def __init__(self, **kwargs):
super(UnderTestsUI, self).__init__(**kwargs)
self._responses = deque()
# TODO: possibly allow to provide expected messages etc, so we could
# test that those are the actual ones which were given
def add_responses(self, responses):
if not isinstance(responses, (list, tuple)):
responses = [responses]
self._responses += list(responses)
return self # so we could use it as a context manager
def get_responses(self):
return self._responses
def clear_responses(self):
self._responses = deque()
def question(self, *args, **kwargs):
if not self._responses:
raise AssertionError(
"We are asked for a response whenever none is left to give"
)
return self._responses.popleft()
# Context manager mode of operation which would also verify that
# no responses left upon exiting
def __enter__(self):
pass
def __exit__(self, exc_type, exc_val, exc_tb):
responses = copy(self._responses)
# we should clear the state so there is no side-effect
self.clear_responses()
assert not len(responses), \
"Still have some responses left: %s" % repr(self._responses)
lgr.log(5, "Done importing ui.dialog")
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/progressbars.py 0000644 0001751 0001751 00000030132 15137634221 017654 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Progress bar implementations to be used.
Should not be imported until we know that interface needs it
"""
import sys
import time
import humanize
from .. import lgr
#
# Haven't found an ideal progress bar yet, so to make things modular etc
# we will provide our interface and adapters for few popular ones
#
class ProgressBarBase(object):
"""Base class for any progress bar"""
def __init__(self, label=None, fill_text=None, total=None, out=None, unit='B'):
self.label = label
self.fill_text = fill_text
self.total = total
self.unit = unit
self.out = out
self._current = 0
def refresh(self):
"""Force update"""
pass
def update(self, size, increment=False, total=None):
if total:
self.total = total
if not size:
return
if increment:
self._current += size
else:
self._current = size
@property
def current(self):
return self._current
@current.setter
def current(self, value):
assert value >= 0, "Total cannot be negative"
self._current = value
def start(self, initial=0):
self._current = initial
def finish(self, partial=False):
"""
Parameters
----------
partial: bool
To signal that finish is called possibly before the activity properly
finished, so .total count might have not been reached
Returns
-------
"""
pass
def clear(self):
pass
def set_desc(self, value):
pass # to override in subclass on how to handle description
class SilentProgressBar(ProgressBarBase):
def __init__(self, label='', fill_text=None, total=None, unit='B', out=sys.stdout):
super(SilentProgressBar, self).__init__(total=total)
class LogProgressBar(ProgressBarBase):
"""A progress bar which logs upon completion of the item
Note that there is also :func:`~datalad.log.log_progress` which can be used
to get progress bars when attached to a tty but incremental log messages
otherwise (as opposed to just the final log message provided by
`LogProgressBar`).
"""
def __init__(self, *args, **kwargs):
super(LogProgressBar, self).__init__(*args, **kwargs)
# I think we never generate progress bars unless we are at the beginning
# of reporting something lengthy. .start is not always invoked so
# we cannot reliably set it there instead of the constructor (here)
self._start_time = time.time()
@staticmethod
def _naturalfloat(x):
"""Return string representation of a number for human consumption
For abs(x) <= 1000 would use 'scientific' (%g) notation, and for the
larger a regular int (after rounding)
"""
return ('%g' % x) if abs(x) <= 1000 else '%i' % int(round(x))
def _naturalsize(self, x):
if self.unit == 'B':
return humanize.naturalsize(x)
else:
return '%s%s' % (self._naturalfloat(x), self.unit or '')
@staticmethod
def _naturaldelta(x):
# humanize is too human for little things
return humanize.naturaldelta(x) \
if x > 2 \
else LogProgressBar._naturalfloat(x) + ' sec'
def start(self, initial=0):
super().start(initial=initial)
msg = " with initial specified to be {initial}" if initial else ''
lgr.info("Start %s%s", self.label, msg)
def finish(self, partial=False):
msg, args = ' %s ', [self.label]
if partial:
# that is the best we know so far:
amount = self.current
if self.total is not None:
if amount != self.total:
perc_done = 100. * amount / self.total
if perc_done <= 100:
msg += "partially (%.2f%% of %s) "
args += [
perc_done,
self._naturalsize(self.total)
]
else:
# well well -- we still probably have some issue with
# over-reporting when getting data from datalad-archives
# Instead of providing non-sense % here, just report
# our best guess
msg += "possibly partially "
else:
# well -- that means that we did manage to get all of it
pass
else:
msg += "possibly partially "
msg += "done"
else:
# Are we "finish"ed because interrupted or done?
amount = self.total
if amount:
msg += '%s done'
args += [self._naturalsize(amount)]
else:
msg += "done"
dt = float(time.time() - self._start_time)
if dt:
msg += ' in %s'
args += [self._naturaldelta(dt)]
if amount:
speed = amount / dt
msg += ' at %s/sec'
args += [self._naturalsize(speed)]
lgr.info(msg, *args)
progressbars = {
# let for compatibility, use "none" instead
'silent': SilentProgressBar,
'none': SilentProgressBar,
'log': LogProgressBar,
}
try:
from tqdm import tqdm
from datalad.utils import updated
class tqdmProgressBar(ProgressBarBase):
"""Adapter for tqdm.ProgressBar"""
backend = 'tqdm'
_frontends = {
None: tqdm,
'ipython': None # to be loaded
}
_default_pbar_params = {
'mininterval': 0.1,
'dynamic_ncols': True, # react to changes in the terminal width
}
def __init__(self, label='', fill_text=None,
total=None, unit='B', out=sys.stdout, leave=False,
frontend=None):
"""
Parameters
----------
label
fill_text
total
unit
out
leave
frontend: (None, 'ipython'), optional
tqdm module to use. Could be tqdm_notebook if under IPython
"""
super(tqdmProgressBar, self).__init__(label=label,
total=total,
unit=unit)
if frontend not in self._frontends:
raise ValueError(
"Know only about following tqdm frontends: %s. Got %s"
% (', '.join(map(str, self._frontends)),
frontend))
tqdm_frontend = self._frontends[frontend]
if not tqdm_frontend:
if frontend == 'ipython':
from tqdm import tqdm_notebook
tqdm_frontend = self._frontends[frontend] = tqdm_notebook
else:
lgr.error(
"Something went wrong here, using default tqdm frontend for %s",
frontend)
tqdm_frontend = self._frontends[frontend] = self._frontends[None]
self._tqdm = tqdm_frontend
self._pbar_params = updated(
self._default_pbar_params,
dict(desc=label, unit=unit,
unit_scale=True, total=total, file=out,
leave=leave,
))
if label and 'total' in label.lower() and 'smoothing' not in self._pbar_params:
# ad-hoc: All tqdm totals will report total mean, and not some
# momentary speed
self._pbar_params['smoothing'] = 0
self._pbar = None
def _create(self, initial=0):
if self._pbar is None:
self._pbar = self._tqdm(initial=initial, **self._pbar_params)
def update(self, size, increment=False, total=None):
self._create()
if total is not None:
# only a reset can change the total of an existing pbar
self._pbar.reset(total)
# we need to (re-)advance the pbar back to the old state
self._pbar.update(self.current)
# an update() does not (reliably) trigger a refresh, hence
# without the next, the pbar may still show zero progress
if not size:
# whenever a total is changed, we need a refresh. If there is
# no progress update, we do it here, else we'll do it after
# the progress update
self._pbar.refresh()
# if we set a new total and also advance the progress bar:
if not size:
return
inc = size - self.current
try:
self._pbar.update(size if increment else inc)
if total:
# refresh to new total and progress
self._pbar.refresh()
except ValueError:
# Do not crash entire process because of some glitch with
# progressbar update
# TODO: issue a warning?
pass
super(tqdmProgressBar, self).update(size,
increment=increment,
total=total)
def start(self, initial=0):
super(tqdmProgressBar, self).start(initial=initial)
self._create(initial=initial)
def refresh(self):
super(tqdmProgressBar, self).refresh()
# older tqdms might not have refresh yet but I think we can live
# without it for a bit there
if hasattr(self._tqdm, 'refresh'):
self._pbar.refresh()
def finish(self, clear=False, partial=False):
"""
Parameters
----------
clear : bool, optional
Explicitly clear the progress bar. Note that we are
creating them with leave=False so they should disappear on their
own and explicit clear call should not be necessary
Returns
-------
"""
if clear:
self.clear()
# be tolerant to bugs in those
try:
if self._pbar is not None:
self._pbar.close()
finally:
self._pbar = None
try:
super(tqdmProgressBar, self).finish()
except Exception as exc: # pragma: no cover
#lgr.debug("Finishing tqdmProgresBar thrown %s", str_exc(exc))
pass
def clear(self):
try:
self._pbar.clear()
except:
# if has none -- we can't do anything about it for now ;)
# 4.7.4 seems to have it
pass
def set_desc(self, value):
self._pbar.desc = value
progressbars['tqdm'] = tqdmProgressBar
except ImportError: # pragma: no cover
pass
assert len(progressbars), "We need tqdm library to report progress"
class AnnexSpecialRemoteProgressBar(ProgressBarBase):
"""Hook up to the special remote and report progress back to annex"""
def __init__(self, *args, **kwargs):
# not worth passing anything since we don't care about anything
remote = kwargs.get('remote')
super(AnnexSpecialRemoteProgressBar, self).__init__()
self.remote = remote
def update(self, *args, **kwargs):
super(AnnexSpecialRemoteProgressBar, self).update(*args, **kwargs)
# now use stored value
if self.remote:
self.remote.send_progress(self.current)
progressbars['annex-remote'] = AnnexSpecialRemoteProgressBar
././@PaxHeader 0000000 0000000 0000000 00000000033 00000000000 010211 x ustar 00 27 mtime=1769945274.894061
datalad-1.3.1/datalad/ui/tests/ 0000755 0001751 0001751 00000000000 15137634273 015740 5 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/tests/__init__.py 0000644 0001751 0001751 00000000670 15137634221 020045 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""UI tests
"""
__docformat__ = 'restructuredtext'
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/tests/test_base.py 0000644 0001751 0001751 00000005411 15137634221 020255 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-wstrth: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""tests for UI switcher"""
__docformat__ = 'restructuredtext'
from unittest.mock import patch
from ...tests.utils_pytest import (
assert_equal,
assert_false,
assert_not_equal,
assert_raises,
with_testsui,
)
from .. import _UI_Switcher
from ..dialog import (
ConsoleLog,
DialogUI,
IPythonUI,
)
def test_ui_switcher():
ui = _UI_Switcher('dialog')
assert(isinstance(ui.ui, DialogUI))
message_str = str(ui.message)
assert_equal(message_str, str(ui._ui.message))
ui.set_backend('console')
assert(isinstance(ui.ui, ConsoleLog))
assert_equal(str(ui.message), str(ui._ui.message))
assert_not_equal(message_str, str(ui._ui.message))
with assert_raises(AttributeError):
ui.yesno
ui.set_backend('annex')
# Let's pretend we are under IPython
class ZMQInteractiveShell(object):
pass
with patch('datalad.utils.get_ipython',
lambda: ZMQInteractiveShell(),
create=True):
ui = _UI_Switcher()
assert (isinstance(ui.ui, IPythonUI))
def test_tests_ui():
ui = _UI_Switcher('dialog')
# Let's test our responses construct
ui.set_backend('tests')
with ui.add_responses('abc'):
assert_equal(ui.question("text"), 'abc')
with ui.add_responses(['a', 'bb']):
assert_equal(ui.question("text"), 'a')
assert_equal(ui.question("text"), 'bb')
# should raise exception if not all responses were
# used
with assert_raises(AssertionError):
with ui.add_responses(['a', 'bb']):
assert_equal(ui.question("text"), 'a')
# but clear it up
assert_false(ui.get_responses())
# assure that still works
with ui.add_responses('abc'):
assert_equal(ui.question("text"), 'abc')
# and if we switch back to some other backend -- we would loose *responses methods
ui.set_backend('annex')
assert_false(hasattr(ui, 'add_responses'))
def test_with_testsui():
@with_testsui
def nothing(x, k=1):
assert_equal(x, 1)
assert_equal(k, 2)
nothing(1, k=2)
@with_testsui(responses='a')
def nothing(x, k=1):
assert_equal(x, 1)
assert_equal(k, 2)
# responses were not used
assert_raises(AssertionError, nothing, 1, k=2)
from datalad.ui import ui
@with_testsui(responses='a')
def ask():
assert_equal(ui.question('what is a?'), 'a')
ask()
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/tests/test_dialog.py 0000644 0001751 0001751 00000016474 15137634221 020615 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""tests for dialog UI """
__docformat__ = 'restructuredtext'
import builtins
from io import StringIO
from unittest.mock import (
call,
patch,
)
import pytest
from datalad.ui.progressbars import progressbars
from datalad.utils import swallow_logs
from ...tests.utils_pytest import (
assert_in,
assert_not_in,
assert_raises,
assert_re_in,
eq_,
ok_endswith,
ok_startswith,
)
from ..dialog import (
ConsoleLog,
DialogUI,
IPythonUI,
)
def patch_input(**kwargs):
"""A helper to provide mocked cm patching input function which was renamed in PY3"""
return patch.object(builtins, 'input', **kwargs)
def patch_getpass(**kwargs):
return patch('getpass.getpass', **kwargs)
def test_yesno():
for expected_value, defaults in {True: ('yes', True),
False: ('no', False)}.items():
for d in defaults:
with patch_getpass(return_value=''):
out = StringIO()
response = DialogUI(out=out).yesno("?", default=d)
eq_(response, expected_value)
def test_question_choices():
# TODO: come up with a reusable fixture for testing here
choices = {
'a': '[a], b, cc',
'b': 'a, [b], cc',
'cc': 'a, b, [cc]'
}
for hidden in (True, False):
for default_value in ['a', 'b']:
choices_str = choices[default_value]
for entered_value, expected_value in [(default_value, default_value),
('', default_value),
('cc', 'cc')]:
with patch_getpass(return_value=entered_value) as gpcm:
out = StringIO()
response = DialogUI(out=out).question(
"prompt", choices=sorted(choices), default=default_value,
hidden=hidden
)
# .assert_called_once() is not available on older mock's
# e.g. on 1.3.0 on nd16.04
eq_(gpcm.call_count, 1) # should have asked only once
eq_(response, expected_value)
# getpass doesn't use out -- goes straight to the terminal
eq_(out.getvalue(), '')
# TODO: may be test that the prompt was passed as a part of the getpass arg
#eq_(out.getvalue(), 'prompt (choices: %s): ' % choices_str)
# check some expected exceptions to be thrown
out = StringIO()
ui = DialogUI(out=out)
assert_raises(ValueError, ui.question, "prompt", choices=['a'], default='b')
eq_(out.getvalue(), '')
with patch_getpass(return_value='incorrect'):
assert_raises(RuntimeError, ui.question, "prompt", choices=['a', 'b'])
assert_re_in(".*ERROR: .incorrect. is not among choices.*", out.getvalue())
def test_hidden_doubleentry():
# In above test due to 'choices' there were no double entry for a hidden
out = StringIO()
ui = DialogUI(out=out)
with patch_getpass(return_value='ab') as gpcm:
response = ui.question(
"?", hidden=True)
eq_(response, 'ab')
gpcm.assert_has_calls([call('?: '), call('? (repeat): ')])
# explicitly request no repeats
with patch_getpass(return_value='ab') as gpcm:
response = ui.question(
"?", hidden=True, repeat=False)
eq_(response, 'ab')
gpcm.assert_has_calls([call('?: ')])
@pytest.mark.parametrize("backend", progressbars)
@pytest.mark.parametrize("len", [0, 4, 10, 1000])
@pytest.mark.parametrize("increment", [True, False])
def test_progress_bar(backend, len, increment):
# More of smoke testing given various lengths of fill_text
out = StringIO()
fill_str = ('123456890' * (len//10))[:len]
pb = DialogUI(out).get_progressbar(
'label', fill_str, total=10, backend=backend)
pb.start()
# we can't increment 11 times
SILENT_BACKENDS = ('annex-remote', 'silent', 'none')
ONLY_THE_END_BACKENDS = ('log',)
for x in range(11):
if not (increment and x == 0):
# do not increment on 0
pb.update(x if not increment else 1, increment=increment)
#out.flush() # needed atm... no longer?
# Progress bar is having 0.1 sec between updates by default, so
# we could either sleep:
#import time; time.sleep(0.1)
# or just force the refresh
pb.refresh()
pstr = out.getvalue()
if backend not in SILENT_BACKENDS + ONLY_THE_END_BACKENDS: # no str repr
ok_startswith(pstr.lstrip('\r'), 'label:')
assert_re_in(r'.*\b%d%%.*' % (10*x), pstr)
if backend == 'progressbar':
assert_in('ETA', pstr)
pb.finish()
output = out.getvalue()
if backend not in SILENT_BACKENDS:
# returns back and there is no spurious newline
if output:
ok_endswith(output, '\r')
def test_IPythonUI():
# largely just smoke tests to see if nothing is horribly bad
with patch_input(return_value='a'):
out = StringIO()
response = IPythonUI(out=out).question(
"prompt", choices=sorted(['b', 'a'])
)
eq_(response, 'a')
eq_(out.getvalue(), 'prompt (choices: a, b): ')
ui = IPythonUI()
pbar = ui.get_progressbar(total=10)
assert_in('notebook', str(pbar._tqdm))
def test_silent_question():
# SilentConsoleLog must not be asked questions.
# If it is asked, RuntimeError would be thrown with details to help
# troubleshooting WTF is happening
from ..dialog import SilentConsoleLog
ui = SilentConsoleLog()
with assert_raises(RuntimeError) as cme:
ui.question("could you help me", title="Pretty please")
assert_in('question: could you help me. Title: Pretty please.', str(cme.value))
with assert_raises(RuntimeError) as cme:
ui.question("could you help me", title="Pretty please", choices=['secret1'], hidden=True)
assert_in('question: could you help me. Title: Pretty please.', str(cme.value))
assert_not_in('secret1', str(cme.value))
assert_in('not shown', str(cme.value))
# additional kwargs, no title, choices
with assert_raises(RuntimeError) as cme:
ui.question("q", choices=['secret1'])
assert_in('secret1', str(cme.value))
@patch("datalad.log.is_interactive", lambda: False)
def test_message_pbar_state_logging_is_demoted():
from datalad.log import LoggerHelper
name = "dl-test"
lgr = LoggerHelper(name).get_initialized_logger()
ui = ConsoleLog()
with patch("datalad.log.lgr", lgr):
with swallow_logs(name=name, new_level=20) as cml:
ui.message("testing 0")
assert_not_in("Clear progress bars", cml.out)
assert_not_in("Refresh progress bars", cml.out)
with swallow_logs(name=name, new_level=5) as cml:
ui.message("testing 1")
assert_in("Clear progress bars", cml.out)
assert_in("Refresh progress bars", cml.out)
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/ui/utils.py 0000644 0001751 0001751 00000005610 15137634221 016303 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Various utils oriented to UI"""
import struct
from datalad.support import ansi_colors
from datalad.utils import on_windows
# origin http://stackoverflow.com/a/3010495/1265472
def get_terminal_size():
"""Return current terminal size"""
if on_windows:
try:
from ctypes import (
create_string_buffer,
windll,
)
# stdin handle is -10
# stdout handle is -11
# stderr handle is -12
h = windll.kernel32.GetStdHandle(-12)
csbi = create_string_buffer(22)
res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi)
except:
return None, None
if res:
(bufx, bufy, curx, cury, wattr,
left, top, right, bottom, maxx, maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw)
sizex = right - left + 1
sizey = bottom - top + 1
return sizex, sizey
else:
return None, None
else:
import fcntl
import termios
try:
h, w, hp, wp = struct.unpack(
'HHHH',
fcntl.ioctl(0, termios.TIOCGWINSZ,
struct.pack('HHHH', 0, 0, 0, 0))
)
return w, h
except:
return None, None
def get_console_width(default_min=20):
"""Return console width to use
In some cases shutil reports it to be 0, so we cannot rely on it
alone
"""
console_width = get_terminal_size()[0] or 0
# it might still be 0, e.g. in conda builds for 0.10.0
if console_width <= 0:
console_width = 80
elif console_width <= 20:
# or some other too small to be real number,
# to prevent crashes below guarantee that it is at least 20
console_width = 20
return console_width
def show_hint(msg):
from datalad.ui import ui
ui.message("{}".format(
ansi_colors.color_word(
msg,
ansi_colors.YELLOW)))
def can_prompt() -> bool:
"""Return True if the process can prompt for credentials
On Linux this method checks for a controlling terminal.
On Windows it always returns True.
Unlike :func:`datalad.utils.is_interactive` it does not check all the streams to be a tty,
and just tries to open `/dev/tty` directly.
"""
if on_windows:
return True
else:
try:
open('/dev/tty', 'r').close()
return True
except (IOError, OSError):
return False
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/utils.py 0000644 0001751 0001751 00000274121 15137634221 015673 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
from __future__ import annotations
import builtins
import collections
import gc
import glob
import gzip
import inspect
import logging
import os
import os.path as op
import platform
import posixpath
import re
import shutil
import stat
import string
import sys
import tempfile
import threading
import time
import warnings
from collections.abc import (
Callable,
Iterable,
Iterator,
Sequence,
)
from contextlib import contextmanager
from copy import copy as shallow_copy
from functools import (
lru_cache,
wraps,
)
from itertools import tee
# this import is required because other modules import opj from here.
from os.path import (
abspath,
basename,
commonprefix,
curdir,
dirname,
exists,
expanduser,
expandvars,
isabs,
isdir,
islink,
)
from os.path import join as opj
from os.path import (
lexists,
normpath,
pardir,
relpath,
sep,
split,
splitdrive,
)
from pathlib import (
Path,
PurePath,
PurePosixPath,
)
from shlex import quote as shlex_quote
from shlex import split as shlex_split
from tempfile import NamedTemporaryFile
from time import sleep
from types import (
ModuleType,
TracebackType,
)
from typing import (
IO,
Any,
Dict,
List,
NamedTuple,
Optional,
TextIO,
Tuple,
TypeVar,
Union,
cast,
overload,
)
# from datalad.dochelpers import get_docstring_split
from datalad.consts import TIMESTAMP_FMT
from datalad.support.exceptions import CapturedException
from datalad.typing import (
K,
Literal,
P,
T,
V,
)
# handle this dance once, and import pathlib from here
# in all other places
lgr = logging.getLogger("datalad.utils")
lgr.log(5, "Importing datalad.utils")
#
# Some useful variables
#
platform_system = platform.system().lower()
on_windows = platform_system == 'windows'
on_osx = platform_system == 'darwin'
on_linux = platform_system == 'linux'
# COPY_BUFSIZE sort of belongs into datalad.consts, but that would lead to
# circular import due to `on_windows`
try:
from shutil import COPY_BUFSIZE # type: ignore[attr-defined]
except ImportError: # pragma: no cover
# too old
from datalad.utils import on_windows
# from PY3.10
COPY_BUFSIZE = 1024 * 1024 if on_windows else 64 * 1024
# Takes ~200msec, so should not be called at import time
@lru_cache() # output should not change through life time of datalad process
def get_linux_distribution() -> tuple[str, str, str]:
"""Compatibility wrapper for {platform,distro}.linux_distribution().
"""
if hasattr(platform, "linux_distribution"):
# Use deprecated (but faster) method if it's available.
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=DeprecationWarning)
result = platform.linux_distribution()
else:
import distro # We require this for Python 3.8 and above.
return (
distro.id(),
distro.version(),
distro.codename(),
)
return result
# Those weren't used for any critical decision making, thus we just set them to None
# Use get_linux_distribution() directly where needed
linux_distribution_name = linux_distribution_release = None
# Maximal length of cmdline string
# Query the system and use hardcoded "knowledge" if None
# probably getconf ARG_MAX might not be available
# The last one would be the most conservative/Windows
CMD_MAX_ARG_HARDCODED = 2097152 if on_linux else 262144 if on_osx else 32767
try:
CMD_MAX_ARG = os.sysconf('SC_ARG_MAX')
assert CMD_MAX_ARG > 0
if CMD_MAX_ARG > CMD_MAX_ARG_HARDCODED * 1e6:
# workaround for some kind of a bug which comes up with python 3.4
# see https://github.com/datalad/datalad/issues/3150
# or on older CentOS with conda and python as new as 3.9
# see https://github.com/datalad/datalad/issues/5943
# TODO: let Yarik know that the world is a paradise now whenever 1e6
# is not large enough
CMD_MAX_ARG = min(CMD_MAX_ARG, CMD_MAX_ARG_HARDCODED)
except Exception as exc:
# ATM (20181005) SC_ARG_MAX available only on POSIX systems
# so exception would be thrown e.g. on Windows, or
# somehow during Debian build for nd14.04 it is coming up with -1:
# https://github.com/datalad/datalad/issues/3015
CMD_MAX_ARG = CMD_MAX_ARG_HARDCODED
lgr.debug(
"Failed to query or got useless SC_ARG_MAX sysconf, "
"will use hardcoded value: %s", exc)
# Even with all careful computations we do, due to necessity to account for
# environment and what not, we still could not figure out "exact" way to
# estimate it, but it was shown that 300k safety margin on linux was sufficient.
# https://github.com/datalad/datalad/pull/2977#issuecomment-436264710
# 300k is ~15%, so to be safe, and for paranoid us we will just use up to 50%
# of the length for "safety margin". We might probably still blow due to
# env vars, unicode, etc... so any hard limit imho is not a proper solution
CMD_MAX_ARG = int(0.5 * CMD_MAX_ARG)
lgr.debug(
"Maximal length of cmdline string (adjusted for safety margin): %d",
CMD_MAX_ARG)
#
# Little helpers
#
# `getargspec` has been deprecated in Python 3.
class ArgSpecFake(NamedTuple):
args: list[str]
varargs: Optional[str]
keywords: Optional[str]
defaults: Optional[tuple[Any, ...]]
# adding cache here somehow does break it -- even 'datalad wtf' does not run
# @lru_cache() # signatures stay the same, why to "redo"? brings it into ns from mks
def getargspec(func: Callable[..., Any], *, include_kwonlyargs: bool=False) -> ArgSpecFake:
"""Compat shim for getargspec deprecated in python 3.
The main difference from inspect.getargspec (and inspect.getfullargspec
for that matter) is that by using inspect.signature we are providing
correct args/defaults for functools.wraps'ed functions.
`include_kwonlyargs` option was added to centralize getting all args,
even the ones which are kwonly (follow the ``*,``).
For internal use and not advised for use in 3rd party code.
Please use inspect.signature directly.
"""
# We use signature, and not getfullargspec, because only signature properly
# "passes" args from a functools.wraps decorated function.
# Note: getfullargspec works Ok on wrapt-decorated functions
f_sign = inspect.signature(func)
# Loop through parameters and compose argspec
args: list[str] = []
varargs: Optional[str] = None
keywords: Optional[str] = None
defaults: dict[str, Any] = {}
# Collect all kwonlyargs into a dedicated dict - name: default
kwonlyargs: dict[str, Any] = {}
P = inspect.Parameter
for p_name, p in f_sign.parameters.items():
if p.kind in (P.POSITIONAL_ONLY, P.POSITIONAL_OR_KEYWORD):
assert not kwonlyargs # yoh: must not come after kwonlyarg
args.append(p_name)
if p.default is not P.empty:
defaults[p_name] = p.default
elif p.kind == P.VAR_POSITIONAL:
varargs = p_name
elif p.kind == P.VAR_KEYWORD:
keywords = p_name
elif p.kind == P.KEYWORD_ONLY:
assert p.default is not P.empty
kwonlyargs[p_name] = p.default
if kwonlyargs:
if not include_kwonlyargs:
raise ValueError(
'Function has keyword-only parameters or annotations, either use '
'inspect.signature() API which can support them, or provide include_kwonlyargs=True '
'to this function'
)
else:
args.extend(list(kwonlyargs))
defaults.update(kwonlyargs)
# harmonize defaults to how original getargspec returned them -- just a tuple
d_defaults = None if not defaults else tuple(defaults.values())
return ArgSpecFake(args, varargs, keywords, d_defaults)
# Definitions to be (re)used in the next function
_SIG_P = inspect.Parameter
_SIG_KIND_SELECTORS: dict[str, set[int]] = {
'pos_only': {_SIG_P.POSITIONAL_ONLY,},
'pos_any': {_SIG_P.POSITIONAL_ONLY, _SIG_P.POSITIONAL_OR_KEYWORD},
'kw_any': {_SIG_P.POSITIONAL_OR_KEYWORD, _SIG_P.KEYWORD_ONLY},
'kw_only': {_SIG_P.KEYWORD_ONLY,},
}
_SIG_KIND_SELECTORS['any'] = set().union(*_SIG_KIND_SELECTORS.values())
@lru_cache() # signatures stay the same, why to "redo"? brings it into ns from mks
def get_sig_param_names(f: Callable[..., Any], kinds: tuple[str, ...]) -> tuple[list[str], ...]:
"""A helper to selectively return parameters from inspect.signature.
inspect.signature is the ultimate way for introspecting callables. But
its interface is not so convenient for a quick selection of parameters
(AKA arguments) of desired type or combinations of such. This helper
should make it easier to retrieve desired collections of parameters.
Since often it is desired to get information about multiple specific types
of parameters, `kinds` is a list, so in a single invocation of `signature`
and looping through the results we can obtain all information.
Parameters
----------
f: callable
kinds: tuple with values from {'pos_any', 'pos_only', 'kw_any', 'kw_only', 'any'}
Is a list of what kinds of args to return in result (tuple). Each element
should be one of: 'any_pos' - positional or keyword which could be used
positionally. 'kw_only' - keyword only (cannot be used positionally) arguments,
'any_kw` - any keyword (could be a positional which could be used as a keyword),
`any` -- any type from the above.
Returns
-------
tuple:
Each element is a list of parameters (names only) of that "kind".
"""
selectors: list[set[int]] = []
for kind in kinds:
if kind not in _SIG_KIND_SELECTORS:
raise ValueError(f"Unknown 'kind' {kind}. Known are: {', '.join(_SIG_KIND_SELECTORS)}")
selectors.append(_SIG_KIND_SELECTORS[kind])
out: list[list[str]] = [[] for _ in kinds]
for p_name, p in inspect.signature(f).parameters.items():
for i, selector in enumerate(selectors):
if p.kind in selector:
out[i].append(p_name)
return tuple(out)
def any_re_search(regexes: str | list[str], value: str) -> bool:
"""Return if any of regexes (list or str) searches successfully for value"""
for regex in ensure_tuple_or_list(regexes):
if re.search(regex, value):
return True
return False
def not_supported_on_windows(msg: Optional[str]=None) -> None:
"""A little helper to be invoked to consistently fail whenever functionality is
not supported (yet) on Windows
"""
if on_windows:
raise NotImplementedError("This functionality is not yet implemented for Windows OS"
+ (": %s" % msg if msg else ""))
def get_home_envvars(new_home: str | Path) -> dict[str, str]:
"""Return dict with env variables to be adjusted for a new HOME
Only variables found in current os.environ are adjusted.
Parameters
----------
new_home: str or Path
New home path, in native to OS "schema"
"""
new_home = str(new_home)
out = {'HOME': new_home}
if on_windows:
# requires special handling, since it has a number of relevant variables
# and also Python changed its behavior and started to respect USERPROFILE only
# since python 3.8: https://bugs.python.org/issue36264
out['USERPROFILE'] = new_home
out['HOMEDRIVE'], out['HOMEPATH'] = splitdrive(new_home)
return {v: val for v, val in out.items() if v in os.environ}
def _is_stream_tty(stream: Optional[IO]) -> bool:
try:
# TODO: check on windows if hasattr check would work correctly and
# add value:
return stream is not None and stream.isatty()
except ValueError as exc:
# Who knows why it is a ValueError, but let's try to be specific
# If there is a problem with I/O - non-interactive, otherwise reraise
if "I/O" in str(exc):
return False
raise
def is_interactive() -> bool:
"""Return True if all in/outs are open and tty.
Note that in a somewhat abnormal case where e.g. stdin is explicitly
closed, and any operation on it would raise a
`ValueError("I/O operation on closed file")` exception, this function
would just return False, since the session cannot be used interactively.
"""
return all(_is_stream_tty(s) for s in (sys.stdin, sys.stdout, sys.stderr))
def get_ipython_shell() -> Optional[Any]:
"""Detect if running within IPython and returns its `ip` (shell) object
Returns None if not under ipython (no `get_ipython` function)
"""
try:
return get_ipython() # type: ignore[name-defined]
except NameError:
return None
def md5sum(filename: str | Path) -> str:
"""Compute an MD5 sum for the given file
"""
from datalad.support.digests import Digester
return Digester(digests=['md5'])(filename)['md5']
_encoded_dirsep = r'\\' if on_windows else r'/'
_VCS_REGEX = r'%s\.(?:git|gitattributes|svn|bzr|hg)(?:%s|$)' % (
_encoded_dirsep, _encoded_dirsep)
_DATALAD_REGEX = r'%s\.(?:datalad)(?:%s|$)' % (
_encoded_dirsep, _encoded_dirsep)
def find_files(regex: str, topdir: str | Path = curdir, exclude: Optional[str]=None, exclude_vcs: bool =True, exclude_datalad: bool =False, dirs: bool =False) -> Iterator[str]:
"""Generator to find files matching regex
Parameters
----------
regex: string
exclude: string, optional
Matches to exclude
exclude_vcs:
If True, excludes commonly known VCS subdirectories. If string, used
as regex to exclude those files (regex: `%r`)
exclude_datalad:
If True, excludes files known to be datalad meta-data files (e.g. under
.datalad/ subdirectory) (regex: `%r`)
topdir: string, optional
Directory where to search
dirs: bool, optional
Whether to match directories as well as files
"""
for dirpath, dirnames, filenames in os.walk(topdir):
names = (dirnames + filenames) if dirs else filenames
# TODO: might want to uniformize on windows to use '/'
paths = (op.join(dirpath, name) for name in names)
for path in filter(re.compile(regex).search, paths):
path = path.rstrip(sep)
if exclude and re.search(exclude, path):
continue
if exclude_vcs and re.search(_VCS_REGEX, path):
continue
if exclude_datalad and re.search(_DATALAD_REGEX, path):
continue
yield path
find_files.__doc__ %= (_VCS_REGEX, _DATALAD_REGEX) # type: ignore[operator]
def expandpath(path: str | Path, force_absolute: bool =True) -> str:
"""Expand all variables and user handles in a path.
By default return an absolute path
"""
path = expandvars(expanduser(path))
if force_absolute:
path = abspath(path)
return path
def posix_relpath(path: str | Path, start: Optional[str | Path]=None) -> str:
"""Behave like os.path.relpath, but always return POSIX paths...
on any platform."""
# join POSIX style
return posixpath.join(
# split and relpath native style
# python2.7 ntpath implementation of relpath cannot handle start=None
*split(
relpath(path, start=start if start is not None else '')))
def is_explicit_path(path: str | Path) -> bool:
"""Return whether a path explicitly points to a location
Any absolute path, or relative path starting with either '../' or
'./' is assumed to indicate a location on the filesystem. Any other
path format is not considered explicit."""
path = expandpath(path, force_absolute=False)
return isabs(path) \
or path.startswith(os.curdir + os.sep) \
or path.startswith(os.pardir + os.sep)
def rotree(path: str | Path, ro: bool =True, chmod_files: bool =True) -> None:
"""To make tree read-only or writable
Parameters
----------
path : string
Path to the tree/directory to chmod
ro : bool, optional
Whether to make it R/O (default) or RW
chmod_files : bool, optional
Whether to operate also on files (not just directories)
"""
if ro:
chmod = lambda f: os.chmod(f, os.stat(f).st_mode & ~stat.S_IWRITE)
else:
chmod = lambda f: os.chmod(f, os.stat(f).st_mode | stat.S_IWRITE | stat.S_IREAD)
for root, dirs, files in os.walk(path, followlinks=False):
if chmod_files:
for f in files:
fullf = op.join(root, f)
# might be the "broken" symlink which would fail to stat etc
if exists(fullf):
chmod(fullf)
chmod(root)
def rmtree(path: str | Path, chmod_files: bool | Literal["auto"] ='auto', children_only: bool =False, *args: Any, **kwargs: Any) -> None:
"""To remove git-annex .git it is needed to make all files and directories writable again first
Parameters
----------
path: Path or str
Path to remove
chmod_files : string or bool, optional
Whether to make files writable also before removal. Usually it is just
a matter of directories to have write permissions.
If 'auto' it would chmod files on windows by default
children_only : bool, optional
If set, all files and subdirectories would be removed while the path
itself (must be a directory) would be preserved
`*args` :
`**kwargs` :
Passed into shutil.rmtree call
"""
# Give W permissions back only to directories, no need to bother with files
if chmod_files == 'auto':
chmod_files = on_windows
# TODO: yoh thinks that if we could quickly check our Flyweight for
# repos if any of them is under the path, and could call .precommit
# on those to possibly stop batched processes etc, we did not have
# to do it on case by case
# Check for open files
assert_no_open_files(path)
# TODO the whole thing should be reimplemented with pathlib, but for now
# at least accept Path
path = str(path)
if children_only:
if not isdir(path):
raise ValueError("Can remove children only of directories")
for p in os.listdir(path):
rmtree(op.join(path, p))
return
if not (islink(path) or not isdir(path)):
rotree(path, ro=False, chmod_files=chmod_files)
if on_windows:
# shutil fails to remove paths that exceed 260 characters on Windows machines
# that did not enable long path support. A workaround to remove long paths
# anyway is to prepend \\?\ to the path.
# https://docs.microsoft.com/en-us/windows/win32/fileio/naming-a-file?redirectedfrom=MSDN#win32-file-namespaces
path = r'\\?\ '.strip() + path
_rmtree(path, *args, **kwargs)
else:
# just remove the symlink
unlink(path)
def rmdir(path: str | Path, *args: Any, **kwargs: Any) -> None:
"""os.rmdir with our optional checking for open files"""
assert_no_open_files(path)
os.rmdir(path)
def get_open_files(path: str | Path, log_open: int = False) -> dict[str, Any]:
"""Get open files under a path
Note: This function is very slow on Windows.
Parameters
----------
path : str
File or directory to check for open files under
log_open : bool or int
If set - logger level to use
Returns
-------
dict
path : pid
"""
# Original idea: https://stackoverflow.com/a/11115521/1265472
import psutil
files = {}
# since the ones returned by psutil would not be aware of symlinks in the
# path we should also get realpath for path
# do absolute() in addition to always get an absolute path
# even with non-existing paths on windows
path = str(Path(path).resolve().absolute())
for proc in psutil.process_iter():
try:
open_paths = [p.path for p in proc.open_files()] + [proc.cwd()]
for p in open_paths:
# note: could be done more efficiently so we do not
# renormalize path over and over again etc
if path_startswith(p, path):
files[p] = proc
# Catch a race condition where a process ends
# before we can examine its files
except psutil.NoSuchProcess:
pass
except psutil.AccessDenied:
pass
if files and log_open:
lgr.log(log_open, "Open files under %s: %s", path, files)
return files
_assert_no_open_files_cfg = os.environ.get('DATALAD_ASSERT_NO_OPEN_FILES')
if _assert_no_open_files_cfg:
def assert_no_open_files(path: str | Path) -> None:
files = get_open_files(path, log_open=40)
if _assert_no_open_files_cfg == 'assert':
assert not files, "Got following files still open: %s" % ','.join(files)
elif files:
if _assert_no_open_files_cfg == 'pdb':
import pdb
pdb.set_trace()
elif _assert_no_open_files_cfg == 'epdb':
import epdb # type: ignore[import]
epdb.serve()
pass
# otherwise we would just issue that error message in the log
else:
def assert_no_open_files(path: str | Path) -> None:
pass
def rmtemp(f: str | Path, *args: Any, **kwargs: Any) -> None:
"""Wrapper to centralize removing of temp files so we could keep them around
It will not remove the temporary file/directory if DATALAD_TESTS_TEMP_KEEP
environment variable is defined
"""
if not os.environ.get('DATALAD_TESTS_TEMP_KEEP'):
if not os.path.lexists(f):
lgr.debug("Path %s does not exist, so can't be removed", f)
return
lgr.log(5, "Removing temp file: %s", f)
# Can also be a directory
if isdir(f):
rmtree(f, *args, **kwargs)
else:
unlink(f)
else:
lgr.info("Keeping temp file: %s", f)
@overload
def file_basename(name: str | Path, return_ext: Literal[True]) -> tuple[str, str]:
...
@overload
def file_basename(name: str | Path, return_ext: Literal[False] = False) -> str:
...
def file_basename(name: str | Path, return_ext: bool =False) -> str | tuple[str, str]:
"""
Strips up to 2 extensions of length up to 4 characters and starting with alpha
not a digit, so we could get rid of .tar.gz etc
"""
bname = basename(name)
fbname = re.sub(r'(\.[a-zA-Z_]\S{1,4}){0,2}$', '', bname)
if return_ext:
return fbname, bname[len(fbname) + 1:]
else:
return fbname
# unused in -core
def escape_filename(filename: str) -> str:
"""Surround filename in "" and escape " in the filename
"""
filename = filename.replace('"', r'\"').replace('`', r'\`')
filename = '"%s"' % filename
return filename
# unused in -core
def encode_filename(filename: str | bytes) -> bytes:
"""Encode unicode filename
"""
if isinstance(filename, str):
return filename.encode(sys.getfilesystemencoding())
else:
return filename
# unused in -core
def decode_input(s: str | bytes) -> str:
"""Given input string/bytes, decode according to stdin codepage (or UTF-8)
if not defined
If fails -- issue warning and decode allowing for errors
being replaced
"""
if isinstance(s, str):
return s
else:
encoding = sys.stdin.encoding or 'UTF-8'
try:
return s.decode(encoding)
except UnicodeDecodeError as exc:
lgr.warning(
"Failed to decode input string using %s encoding. "
"Decoding allowing for errors", encoding)
return s.decode(encoding, errors='replace')
# unused in -core
if on_windows:
def lmtime(filepath: str | Path, mtime: int | float) -> None:
"""Set mtime for files. On Windows a merely adapter to os.utime
"""
os.utime(filepath, (time.time(), mtime))
else:
def lmtime(filepath: str | Path, mtime: int | float) -> None:
"""Set mtime for files, while not de-referencing symlinks.
To overcome absence of os.lutime
Works only on linux and OSX ATM
"""
from .cmd import WitlessRunner
# convert mtime to format touch understands [[CC]YY]MMDDhhmm[.SS]
smtime = time.strftime("%Y%m%d%H%M.%S", time.localtime(mtime))
lgr.log(3, "Setting mtime for %s to %s == %s", filepath, mtime, smtime)
WitlessRunner().run(['touch', '-h', '-t', '%s' % smtime, str(filepath)])
filepath = Path(filepath)
rfilepath = filepath.resolve()
if filepath.is_symlink() and rfilepath.exists():
# trust no one - adjust also of the target file
# since it seemed like downloading under OSX (was it using curl?)
# didn't bother with timestamps
lgr.log(3, "File is a symlink to %s Setting mtime for it to %s",
rfilepath, mtime)
os.utime(str(rfilepath), (time.time(), mtime))
# doesn't work on OSX
# Runner().run(['touch', '-h', '-d', '@%s' % mtime, filepath])
# See for a request for a
# better way to annotate this function.
def ensure_tuple_or_list(obj: Any) -> list | tuple:
"""Given an object, wrap into a tuple if not list or tuple
"""
if isinstance(obj, (list, tuple)):
return tuple(obj)
return (obj,)
ListOrSet = TypeVar("ListOrSet", list, set)
# TODO: Improve annotation:
def ensure_iter(s: Any, cls: type[ListOrSet], copy: bool=False, iterate: bool=True) -> ListOrSet:
"""Given not a list, would place it into a list. If None - empty list is returned
Parameters
----------
s: list or anything
cls: class
Which iterable class to ensure
copy: bool, optional
If correct iterable is passed, it would generate its shallow copy
iterate: bool, optional
If it is not a list, but something iterable (but not a str)
iterate over it.
"""
if isinstance(s, cls):
return s if not copy else shallow_copy(s)
elif isinstance(s, str):
return cls((s,))
elif iterate and hasattr(s, '__iter__'):
return cls(s)
elif s is None:
return cls()
else:
return cls((s,))
# TODO: Improve annotation:
def ensure_list(s: Any, copy: bool=False, iterate: bool=True) -> list:
"""Given not a list, would place it into a list. If None - empty list is returned
Parameters
----------
s: list or anything
copy: bool, optional
If list is passed, it would generate a shallow copy of the list
iterate: bool, optional
If it is not a list, but something iterable (but not a str)
iterate over it.
"""
return ensure_iter(s, list, copy=copy, iterate=iterate)
# TODO: Improve annotation:
def ensure_result_list(r: Any) -> list:
"""Return a list of result records
Largely same as ensure_list, but special casing a single dict being passed
in, which a plain `ensure_list` would iterate over. Hence, this deals with
the three ways datalad commands return results:
- single dict
- list of dicts
- generator
Used for result assertion helpers.
"""
return [r] if isinstance(r, dict) else ensure_list(r)
@overload
def ensure_list_from_str(s: str, sep: str='\n') -> Optional[list[str]]:
...
@overload
def ensure_list_from_str(s: list[T], sep: str='\n') -> Optional[list[T]]:
...
def ensure_list_from_str(s: str | list[T], sep: str='\n') -> Optional[list[str]] | Optional[list[T]]:
"""Given a multiline string convert it to a list of return None if empty
Parameters
----------
s: str or list
"""
if not s:
return None
if isinstance(s, list):
return s
return s.split(sep)
@overload
def ensure_dict_from_str(s: str, sep: str = '\n') -> Optional[dict[str, str]]:
...
@overload
def ensure_dict_from_str(s: dict[K, V], sep: str = '\n') -> Optional[dict[K, V]]:
...
def ensure_dict_from_str(s: str | dict[K, V], sep: str = '\n') -> Optional[dict[str, str]] | Optional[dict[K, V]]:
"""Given a multiline string with key=value items convert it to a dictionary
Parameters
----------
s: str or dict
Returns None if input s is empty
"""
if not s:
return None
if isinstance(s, dict):
return s
out: dict[str, str] = {}
values = ensure_list_from_str(s, sep=sep)
assert values is not None
for value_str in values:
if '=' not in value_str:
raise ValueError("{} is not in key=value format".format(repr(value_str)))
k, v = value_str.split('=', 1)
if k in out:
err = "key {} was already defined in {}, but new value {} was provided".format(k, out, v)
raise ValueError(err)
out[k] = v
return out
def ensure_bytes(s: str | bytes, encoding: str='utf-8') -> bytes:
"""Convert/encode unicode string to bytes.
If `s` isn't a string, return it as is.
Parameters
----------
encoding: str, optional
Encoding to use. "utf-8" is the default
"""
if not isinstance(s, str):
return s
return s.encode(encoding)
def ensure_unicode(s: str | bytes, encoding: Optional[str]=None, confidence: Optional[float]=None) -> str:
"""Convert/decode bytestring to unicode.
If `s` isn't a bytestring, return it as is.
Parameters
----------
encoding: str, optional
Encoding to use. If None, "utf-8" is tried, and then if not a valid
UTF-8, encoding will be guessed
confidence: float, optional
A value between 0 and 1, so if guessing of encoding is of lower than
specified confidence, ValueError is raised
"""
if not isinstance(s, bytes):
return s
if encoding is None:
# Figure out encoding, defaulting to 'utf-8' which is our common
# target in contemporary digital society
try:
return s.decode('utf-8')
except UnicodeDecodeError as exc:
lgr.debug("Failed to decode a string as utf-8: %s",
CapturedException(exc))
# And now we could try to guess
from chardet import detect
enc = detect(s)
denc = enc.get('encoding', None)
if denc:
denc_confidence = enc.get('confidence', 0)
if confidence is not None and denc_confidence < confidence:
raise ValueError(
"Failed to auto-detect encoding with high enough "
"confidence. Highest confidence was %s for %s"
% (denc_confidence, denc)
)
lgr.log(5, "Auto-detected encoding to be %s", denc)
return s.decode(denc)
else:
raise ValueError(
"Could not decode value as utf-8, or to guess its encoding: %s"
% repr(s)
)
else:
return s.decode(encoding)
def ensure_bool(s: Any) -> bool:
"""Convert value into boolean following convention for strings
to recognize on,True,yes as True, off,False,no as False
"""
if isinstance(s, str):
if s.isdigit():
return bool(int(s))
sl = s.lower()
if sl in {'y', 'yes', 'true', 'on'}:
return True
elif sl in {'n', 'no', 'false', 'off'}:
return False
else:
raise ValueError("Do not know how to treat %r as a boolean" % s)
return bool(s)
def unique(seq: Sequence[T], key: Optional[Callable[[T], Any]]=None, reverse: bool=False) -> list[T]:
"""Given a sequence return a list only with unique elements while maintaining order
This is the fastest solution. See
https://www.peterbe.com/plog/uniqifiers-benchmark
and
http://stackoverflow.com/a/480227/1265472
for more information.
Enhancement -- added ability to compare for uniqueness using a key function
Parameters
----------
seq:
Sequence to analyze
key: callable, optional
Function to call on each element so we could decide not on a full
element, but on its member etc
reverse: bool, optional
If True, uniqueness checked in the reverse order, so that the later ones
will take the order
"""
seen: set[T] = set()
seen_add = seen.add
if reverse:
def trans(x: Sequence[T]) -> Iterable[T]:
return reversed(x)
else:
def trans(x: Sequence[T]) -> Iterable[T]:
return x
if key is None:
out = [x for x in trans(seq) if not (x in seen or seen_add(x))]
else:
# OPT: could be optimized, since key is called twice, but for our cases
# should be just as fine
out = [x for x in trans(seq) if not (key(x) in seen or seen_add(key(x)))]
return out[::-1] if reverse else out
# TODO: Annotate (would be made easier if the return value was always a dict
# instead of doing `v.__class__(...)`)
def map_items(func, v):
"""A helper to apply `func` to all elements (keys and values) within dict
No type checking of values passed to func is done, so `func`
should be resilient to values which it should not handle
Initial usecase - apply_recursive(url_fragment, ensure_unicode)
"""
# map all elements within item
return v.__class__(
item.__class__(map(func, item))
for item in v.items()
)
def partition(items: Iterable[T], predicate: Callable[[T], Any]=bool) -> tuple[Iterator[T], Iterator[T]]:
"""Partition `items` by `predicate`.
Parameters
----------
items : iterable
predicate : callable
A function that will be mapped over each element in `items`. The
elements will partitioned based on whether the return value is false or
true.
Returns
-------
A tuple with two generators, the first for 'false' items and the second for
'true' ones.
Notes
-----
Taken from Peter Otten's snippet posted at
https://nedbatchelder.com/blog/201306/filter_a_list_into_two_parts.html
"""
a, b = tee((predicate(item), item) for item in items)
return ((item for pred, item in a if not pred),
(item for pred, item in b if pred))
def generate_chunks(container: list[T], size: int) -> Iterator[list[T]]:
"""Given a container, generate chunks from it with size up to `size`
"""
# There could be a "smarter" solution but I think this would suffice
assert size > 0, "Size should be non-0 positive"
while container:
yield container[:size]
container = container[size:]
def generate_file_chunks(files: list[str], cmd: str | list[str] | None = None) -> Iterator[list[str]]:
"""Given a list of files, generate chunks of them to avoid exceeding cmdline length
Parameters
----------
files: list of str
cmd: str or list of str, optional
Command to account for as well
"""
files = ensure_list(files)
cmd = ensure_list(cmd)
maxl = max(map(len, files)) if files else 0
chunk_size = max(
1, # should at least be 1. If blows then - not our fault
(CMD_MAX_ARG
- sum((len(x) + 3) for x in cmd)
- 4 # for '--' below
) // (maxl + 3) # +3 for possible quotes and a space
)
# TODO: additional treatment for "too many arguments"? although
# as https://github.com/datalad/datalad/issues/1883#issuecomment
# -436272758
# shows there seems to be no hardcoded limit on # of arguments,
# but may be we decide to go for smth like follow to be on safe side
# chunk_size = min(10240 - len(cmd), chunk_size)
file_chunks = generate_chunks(files, chunk_size)
return file_chunks
#
# Generators helpers
#
def saved_generator(gen: Iterable[T]) -> tuple[Iterator[T], Iterator[T]]:
"""Given a generator returns two generators, where 2nd one just replays
So the first one would be going through the generated items and 2nd one
would be yielding saved items
"""
saved = []
def gen1() -> Iterator[T]:
for x in gen: # iterating over original generator
saved.append(x)
yield x
def gen2() -> Iterator[T]:
for x in saved: # yielding saved entries
yield x
return gen1(), gen2()
#
# Decorators
#
# Originally better_wraps was created to provide `wrapt`-based, instead of
# `functools.wraps` implementation to preserve the correct signature of the
# decorated function. By using inspect.signature in our getargspec, which
# works fine on `functools.wraps`ed functions, we mediated this necessity.
better_wraps = wraps
# TODO: Annotate:
# Borrowed from pandas
# Copyright: 2011-2014, Lambda Foundry, Inc. and PyData Development Team
# License: BSD-3
def optional_args(decorator):
"""allows a decorator to take optional positional and keyword arguments.
Assumes that taking a single, callable, positional argument means that
it is decorating a function, i.e. something like this::
@my_decorator
def function(): pass
Calls decorator with decorator(f, `*args`, `**kwargs`)"""
@better_wraps(decorator)
def wrapper(*args, **kwargs):
def dec(f):
return decorator(f, *args, **kwargs)
is_decorating = not kwargs and len(args) == 1 and isinstance(args[0], Callable)
if is_decorating:
f = args[0]
args = []
return dec(f)
else:
return dec
return wrapper
# TODO: just provide decorators for tempfile.mk* functions. This is ugly!
def get_tempfile_kwargs(tkwargs: Optional[dict[str, Any]]=None, prefix: str="", wrapped: Optional[Callable]=None) -> dict[str, Any]:
"""Updates kwargs to be passed to tempfile. calls depending on env vars
"""
if tkwargs is None:
tkwargs_ = {}
else:
# operate on a copy of tkwargs to avoid any side-effects
tkwargs_ = tkwargs.copy()
# TODO: don't remember why I had this one originally
# if len(targs)<2 and \
if 'prefix' not in tkwargs_:
tkwargs_['prefix'] = '_'.join(
['datalad_temp'] +
([prefix] if prefix else []) +
([''] if (on_windows or not wrapped) else [wrapped.__name__]))
directory = os.environ.get('TMPDIR')
if directory and 'dir' not in tkwargs_:
tkwargs_['dir'] = directory
return tkwargs_
def line_profile(func: Callable[P, T]) -> Callable[P, T]:
"""Q&D helper to line profile the function and spit out stats
"""
import line_profiler # type: ignore[import]
prof = line_profiler.LineProfiler()
@wraps(func)
def _wrap_line_profile(*args: P.args, **kwargs: P.kwargs) -> T:
try:
pfunc = prof(func)
return pfunc(*args, **kwargs)
finally:
prof.print_stats()
return _wrap_line_profile
# unused in -core
@optional_args
def collect_method_callstats(func: Callable[P, T]) -> Callable[P, T]:
"""Figure out methods which call the method repeatedly on the same instance
Use case(s):
- .repo is expensive since does all kinds of checks.
- .config is expensive transitively since it calls .repo each time
TODO:
- fancy one could look through the stack for the same id(self) to see if
that location is already in memo. That would hint to the cases where object
is not passed into underlying functions, causing them to redo the same work
over and over again
- ATM might flood with all "1 lines" calls which are not that informative.
The underlying possibly suboptimal use might be coming from their callers.
It might or not relate to the previous TODO
"""
import traceback
from collections import defaultdict
from time import time
memo: defaultdict[tuple[int, str], defaultdict[int, int]] = defaultdict(lambda: defaultdict(int)) # it will be a dict of lineno: count
# gross timing
times = []
toppath = dirname(__file__) + sep
@wraps(func)
def _wrap_collect_method_callstats(*args: P.args, **kwargs: P.kwargs) -> T:
try:
self = args[0]
stack = traceback.extract_stack()
caller = stack[-2]
stack_sig = \
"{relpath}:{s.name}".format(
s=caller, relpath=relpath(caller.filename, toppath))
sig = (id(self), stack_sig)
# we will count based on id(self) + wherefrom
if caller.lineno is not None:
memo[sig][caller.lineno] += 1
t0 = time()
return func(*args, **kwargs)
finally:
times.append(time() - t0)
pass
def print_stats() -> None:
print("The cost of property {}:".format(func.__name__))
if not memo:
print("None since no calls")
return
# total count
counts = {k: sum(v.values()) for k,v in memo.items()}
total = sum(counts.values())
ids = {self_id for (self_id, _) in memo}
print(" Total: {} calls from {} objects with {} contexts taking {:.2f} sec"
.format(total, len(ids), len(memo), sum(times)))
# now we need to sort by value
for (self_id, caller), count in sorted(counts.items(), key=lambda x: x[1], reverse=True):
print(" {} {}: {} from {} lines"
.format(self_id, caller, count, len(memo[(self_id, caller)])))
# Upon total exit we print the stats
import atexit
atexit.register(print_stats)
return _wrap_collect_method_callstats
# Borrowed from duecredit to wrap duecredit-handling to guarantee failsafe
def never_fail(f: Callable[P, T]) -> Callable[P, Optional[T]]:
"""Assure that function never fails -- all exceptions are caught
Returns `None` if function fails internally.
"""
@wraps(f)
def wrapped_func(*args: P.args, **kwargs: P.kwargs) -> Optional[T]:
try:
return f(*args, **kwargs)
except Exception as e:
lgr.warning(
"DataLad internal failure while running %s: %r. "
"Please report at https://github.com/datalad/datalad/issues"
% (f, e)
)
return None
if os.environ.get('DATALAD_ALLOW_FAIL', False):
return f
else:
return wrapped_func
def shortened_repr(value: Any, l: int=30) -> str:
try:
if hasattr(value, '__repr__') and (value.__repr__ is not object.__repr__):
value_repr = repr(value)
if not value_repr.startswith('<') and len(value_repr) > l:
value_repr = "<<%s++%d chars++%s>>" % (
value_repr[:l - 16],
len(value_repr) - (l - 16 + 4),
value_repr[-4:]
)
elif value_repr.startswith('<') and value_repr.endswith('>') and ' object at 0x':
raise ValueError("I hate those useless long reprs")
else:
raise ValueError("gimme class")
except Exception as e:
value_repr = "<%s>" % value.__class__.__name__.split('.')[-1]
return value_repr
def __auto_repr__(obj: Any, short: bool =True) -> str:
attr_names: tuple[str, ...] = tuple()
if hasattr(obj, '__dict__'):
attr_names += tuple(obj.__dict__.keys())
if hasattr(obj, '__slots__'):
attr_names += tuple(obj.__slots__)
items = []
for attr in sorted(set(attr_names)):
if attr.startswith('_'):
continue
value = getattr(obj, attr)
# TODO: should we add this feature to minimize some talktative reprs
# such as of URL?
#if value is None:
# continue
items.append("%s=%s" % (attr, shortened_repr(value) if short else value))
return "%s(%s)" % (obj.__class__.__name__, ', '.join(items))
@optional_args
def auto_repr(cls: type[T], short: bool=True) -> type[T]:
"""Decorator for a class to assign it an automagic quick and dirty __repr__
It uses public class attributes to prepare repr of a class
Original idea: http://stackoverflow.com/a/27799004/1265472
"""
cls.__repr__ = lambda obj:__auto_repr__(obj, short=short) # type: ignore[assignment]
return cls
def todo_interface_for_extensions(f: T) -> T:
return f
#
# Context Managers
#
# unused in -core
@contextmanager
def nothing_cm() -> Iterator[None]:
"""Just a dummy cm to programmically switch context managers"""
yield
class SwallowOutputsAdapter:
"""Little adapter to help getting out/err values
"""
def __init__(self) -> None:
kw = get_tempfile_kwargs({}, prefix="outputs")
self._out = NamedTemporaryFile(delete=False, mode='w', **kw)
self._err = NamedTemporaryFile(delete=False, mode='w', **kw)
def _read(self, h: IO[str]) -> str:
with open(h.name) as f:
return f.read()
@property
def out(self) -> str:
if not self._out.closed:
self._out.flush()
return self._read(self._out)
@property
def err(self) -> str:
if not self._err.closed:
self._err.flush()
return self._read(self._err)
@property
def handles(self) -> tuple[TextIO, TextIO]:
return (cast(TextIO, self._out), cast(TextIO, self._err))
def cleanup(self) -> None:
self._out.close()
self._err.close()
out_name = self._out.name
err_name = self._err.name
from datalad import cfg
if cfg.getbool('datalad.log', 'outputs', default=False) \
and lgr.getEffectiveLevel() <= logging.DEBUG:
for s, sname in ((self.out, 'stdout'),
(self.err, 'stderr')):
if s:
pref = os.linesep + "| "
lgr.debug("Swallowed %s:%s%s", sname, pref, s.replace(os.linesep, pref))
else:
lgr.debug("Nothing was swallowed for %s", sname)
del self._out
del self._err
gc.collect()
rmtemp(out_name)
rmtemp(err_name)
@contextmanager
def swallow_outputs() -> Iterator[SwallowOutputsAdapter]:
"""Context manager to help consuming both stdout and stderr, and print()
stdout is available as cm.out and stderr as cm.err whenever cm is the
yielded context manager.
Internally uses temporary files to guarantee absent side-effects of swallowing
into StringIO which lacks .fileno.
print mocking is necessary for some uses where sys.stdout was already bound
to original sys.stdout, thus mocking it later had no effect. Overriding
print function had desired effect
"""
def fake_print(*args: str, sep: str = ' ', end: str = "\n", file: Optional[IO[str]] = None) -> None:
if file is None:
file = sys.stdout
if file in (oldout, olderr, sys.stdout, sys.stderr):
# we mock
try:
sys.stdout.write(sep.join(args) + end)
except UnicodeEncodeError as exc:
lgr.error(
"Failed to write to mocked stdout, got %s, continue as it "
"didn't happen", exc)
else:
# must be some other file one -- leave it alone
oldprint(*args, sep=sep, end=end, file=file)
from .ui import ui
# preserve -- they could have been mocked already
oldprint = getattr(builtins, 'print')
oldout, olderr = sys.stdout, sys.stderr
olduiout = ui.out
adapter = SwallowOutputsAdapter()
try:
sys.stdout, sys.stderr = adapter.handles
ui.out = adapter.handles[0]
setattr(builtins, 'print', fake_print)
yield adapter
finally:
sys.stdout, sys.stderr, ui.out = oldout, olderr, olduiout
setattr(builtins, 'print', oldprint)
adapter.cleanup()
# Let's log everything into a string
# TODO: generalize with the one for swallow_outputs
class SwallowLogsAdapter:
"""Little adapter to help getting out values
And to stay consistent with how swallow_outputs behaves
"""
def __init__(self, file_: str | Path | None) -> None:
self._out: IO[str]
if file_ is None:
kw = get_tempfile_kwargs({}, prefix="logs")
self._out = NamedTemporaryFile(mode='a', delete=False, **kw)
else:
out_file = file_
# PY3 requires clearly one or another. race condition possible
self._out = open(out_file, 'a')
self.file = file_
self._final_out: Optional[str] = None
def _read(self, h: IO[str]) -> str:
with open(h.name) as f:
return f.read()
@property
def out(self) -> str:
if self._final_out is not None:
# we closed and cleaned up already
return self._final_out
else:
self._out.flush()
return self._read(self._out)
@property
def lines(self) -> list[str]:
return self.out.split('\n')
@property
def handle(self) -> IO[str]:
return self._out
def cleanup(self) -> None:
# store for access while object exists
self._final_out = self.out
self._out.close()
out_name = self._out.name
del self._out
gc.collect()
if not self.file:
rmtemp(out_name)
def assert_logged(self, msg: Optional[str]=None, level: Optional[str]=None, regex: bool =True, **kwargs: Any) -> None:
"""Provide assertion on whether a msg was logged at a given level
If neither `msg` nor `level` provided, checks if anything was logged
at all.
Parameters
----------
msg: str, optional
Message (as a regular expression, if `regex`) to be searched.
If no msg provided, checks if anything was logged at a given level.
level: str, optional
String representing the level to be logged
regex: bool, optional
If False, regular `assert_in` is used
**kwargs: str, optional
Passed to `assert_re_in` or `assert_in`
"""
from datalad.tests.utils_pytest import (
assert_in,
assert_re_in,
)
if regex:
match = r'\[%s\] ' % level if level else r"\[\S+\] "
else:
match = '[%s] ' % level if level else ''
if msg:
match += msg
if match:
(assert_re_in if regex else assert_in)(match, self.out, **kwargs)
else:
assert not kwargs, "no kwargs to be passed anywhere"
assert self.out, "Nothing was logged!?"
@contextmanager
def swallow_logs(new_level: str | int | None = None, file_ : str | Path | None = None, name: str='datalad') -> Iterator[SwallowLogsAdapter]:
"""Context manager to consume all logs."""
lgr = logging.getLogger(name)
# Keep old settings
old_level = lgr.level
old_handlers = lgr.handlers
adapter = SwallowLogsAdapter(file_)
# TODO: it does store messages but without any formatting, i.e. even without
# date/time prefix etc. IMHO it should preserve formatting in case if file_ is
# set
swallow_handler = logging.StreamHandler(adapter.handle)
# we want to log levelname so we could test against it
swallow_handler.setFormatter(
logging.Formatter('[%(levelname)s] %(message)s'))
swallow_handler.filters = sum([h.filters for h in old_handlers],
[])
lgr.handlers = [swallow_handler]
if old_level < logging.DEBUG: # so if HEAVYDEBUG etc -- show them!
lgr.handlers += old_handlers
if isinstance(new_level, str):
new_level = getattr(logging, new_level)
if new_level is not None:
lgr.setLevel(new_level)
try:
yield adapter
# TODO: if file_ and there was an exception -- most probably worth logging it?
# although ideally it should be the next log outside added to that file_ ... oh well
finally:
lgr.handlers = old_handlers
lgr.setLevel(old_level)
adapter.cleanup()
# TODO: May be melt in with swallow_logs at some point:
@contextmanager
def disable_logger(logger: Optional[logging.Logger]=None) -> Iterator[logging.Logger]:
"""context manager to temporarily disable logging
This is to provide one of swallow_logs' purposes without unnecessarily
creating temp files (see gh-1865)
Parameters
----------
logger: Logger
Logger whose handlers will be ordered to not log anything.
Default: datalad's topmost Logger ('datalad')
"""
class NullFilter(logging.Filter):
"""Filter class to reject all records
"""
def filter(self, record: logging.LogRecord) -> bool:
return False
if logger is None:
# default: all of datalad's logging:
logger = logging.getLogger('datalad')
filter_ = NullFilter(logger.name)
for h in logger.handlers:
h.addFilter(filter_)
try:
yield logger
finally:
for h in logger.handlers:
h.removeFilter(filter_)
@contextmanager
def lock_if_required(lock_required: bool, lock: threading.Lock) -> Iterator[threading.Lock]:
""" Acquired and released the provided lock if indicated by a flag"""
if lock_required:
lock.acquire()
try:
yield lock
finally:
if lock_required:
lock.release()
#
# Additional handlers
#
def ensure_dir(*args: str) -> str:
"""Make sure directory exists.
Joins the list of arguments to an os-specific path to the desired
directory and creates it, if it not exists yet.
"""
dirname = op.join(*args)
if not exists(dirname):
os.makedirs(dirname)
return dirname
def updated(d: dict[K, V], update: dict[K, V]) -> dict[K, V]:
"""Return a copy of the input with the 'update'
Primarily for updating dictionaries
"""
d = d.copy()
d.update(update)
return d
_pwd_mode: Optional[str] = None
def _switch_to_getcwd(msg: str, *args: Any) -> None:
global _pwd_mode
_pwd_mode = 'cwd'
lgr.debug(
msg + ". From now on will be returning os.getcwd(). Directory"
" symlinks in the paths will be resolved",
*args
)
# TODO: we might want to mitigate by going through all flywheighted
# repos and tuning up their .paths to be resolved?
def getpwd() -> str:
"""Try to return a CWD without dereferencing possible symlinks
This function will try to use PWD environment variable to provide a current
working directory, possibly with some directories along the path being
symlinks to other directories. Unfortunately, PWD is used/set only by the
shell and such functions as `os.chdir` and `os.getcwd` nohow use or modify
it, thus `os.getcwd()` returns path with links dereferenced.
While returning current working directory based on PWD env variable we
verify that the directory is the same as `os.getcwd()` after resolving all
symlinks. If that verification fails, we fall back to always use
`os.getcwd()`.
Initial decision to either use PWD env variable or os.getcwd() is done upon
the first call of this function.
"""
global _pwd_mode
if _pwd_mode is None:
# we need to decide!
try:
pwd = os.environ['PWD']
if on_windows and pwd and pwd.startswith('/'):
# It should be a path from MSYS.
# - it might start with a drive letter or not
# - it seems to be "illegal" to have a single letter directories
# under / path, i.e. if created - they aren't found
# - 'ln -s' does not fail to create a "symlink" but it just
# copies!
# so we are not likely to need original PWD purpose on
# those systems
# Verdict:
_pwd_mode = 'cwd'
else:
_pwd_mode = 'PWD'
except KeyError:
_pwd_mode = 'cwd'
if _pwd_mode == 'cwd':
return os.getcwd()
elif _pwd_mode == 'PWD':
try:
cwd = os.getcwd()
except OSError as exc:
if "o such file" in str(exc):
# directory was removed but we promised to be robust and
# still report the path we might know since we are still in PWD
# mode
cwd = None
else:
raise
try:
pwd = os.environ['PWD']
# do absolute() in addition to always get an absolute path
# even with non-existing paths on windows
pwd_real = str(Path(pwd).resolve().absolute())
# This logic would fail to catch the case where chdir did happen
# to the directory where current PWD is pointing to, e.g.
# $> ls -ld $PWD
# lrwxrwxrwx 1 yoh yoh 5 Oct 11 13:27 /home/yoh/.tmp/tmp -> /tmp//
# hopa:~/.tmp/tmp
# $> python -c 'import os; os.chdir("/tmp"); from datalad.utils import getpwd; print(getpwd(), os.getcwd())'
# ('/home/yoh/.tmp/tmp', '/tmp')
# but I guess that should not be too harmful
if cwd is not None and pwd_real != cwd:
_switch_to_getcwd(
"realpath of PWD=%s is %s whenever os.getcwd()=%s",
pwd, pwd_real, cwd
)
return cwd
return pwd
except KeyError:
_switch_to_getcwd("PWD env variable is no longer available")
if cwd is not None:
return cwd # Must not happen, but may be someone
# evil purges PWD from environ?
raise RuntimeError(
"Must have not got here. "
"pwd_mode must be either cwd or PWD. And it is now %r" % (_pwd_mode,)
)
class chpwd:
"""Wrapper around os.chdir which also adjusts environ['PWD']
The reason is that otherwise PWD is simply inherited from the shell
and we have no ability to assess directory path without dereferencing
symlinks.
If used as a context manager it allows to temporarily change directory
to the given path
"""
def __init__(self, path: str | Path | None, mkdir: bool=False, logsuffix: str='') -> None:
self._prev_pwd: Optional[str]
if path:
pwd = getpwd()
self._prev_pwd = pwd
else:
self._prev_pwd = None
return
if not isabs(path):
path = normpath(op.join(pwd, path))
if not os.path.exists(path) and mkdir:
self._mkdir = True
os.mkdir(path)
else:
self._mkdir = False
lgr.debug("chdir %r -> %r %s", self._prev_pwd, path, logsuffix)
os.chdir(path) # for grep people -- ok, to chdir here!
os.environ['PWD'] = str(path)
def __enter__(self) -> None:
# nothing more to do really, chdir was in the constructor
pass
def __exit__(self, exc_type: Optional[type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]) -> None:
if self._prev_pwd:
# Need to use self.__class__ so this instance, if the entire
# thing mocked during the test, still would use correct chpwd
self.__class__(self._prev_pwd, logsuffix="(coming back)")
def dlabspath(path: str | Path, norm: bool =False) -> str:
"""Symlinks-in-the-cwd aware abspath
os.path.abspath relies on os.getcwd() which would not know about symlinks
in the path
TODO: we might want to norm=True by default to match behavior of
os .path.abspath?
"""
if not isabs(path):
# if not absolute -- relative to pwd
path = op.join(getpwd(), path)
return normpath(path) if norm else str(path)
def with_pathsep(path: str) -> str:
"""Little helper to guarantee that path ends with /"""
return path + sep if not path.endswith(sep) else path
def get_path_prefix(path: str | Path, pwd: Optional[str]=None) -> str:
"""Get path prefix (for current directory)
Returns relative path to the topdir, if we are under topdir, and if not
absolute path to topdir. If `pwd` is not specified - current directory
assumed
"""
pwd = pwd or getpwd()
path = dlabspath(path)
path_ = with_pathsep(path)
pwd_ = with_pathsep(pwd)
common = commonprefix((path_, pwd_))
if common.endswith(sep) and common in {path_, pwd_}:
# we are in subdir or above the path = use relative path
location_prefix = relpath(path, pwd)
# if benign "here" - cut off
if location_prefix in (curdir, curdir + sep):
location_prefix = ''
return location_prefix
else:
# just return absolute path
return path
def _get_normalized_paths(path: str, prefix: str) -> tuple[str, str]:
if isabs(path) != isabs(prefix):
raise ValueError("Both paths must either be absolute or relative. "
"Got %r and %r" % (path, prefix))
path = with_pathsep(path)
prefix = with_pathsep(prefix)
return path, prefix
def path_startswith(path: str, prefix: str) -> bool:
"""Return True if path starts with prefix path
Parameters
----------
path: str
prefix: str
"""
path, prefix = _get_normalized_paths(path, prefix)
return path.startswith(prefix)
def path_is_subpath(path: str, prefix: str) -> bool:
"""Return True if path is a subpath of prefix
It will return False if path == prefix.
Parameters
----------
path: str
prefix: str
"""
path, prefix = _get_normalized_paths(path, prefix)
return (len(prefix) < len(path)) and path.startswith(prefix)
def knows_annex(path: str | Path) -> bool:
"""Returns whether at a given path there is information about an annex
It is just a thin wrapper around GitRepo.is_with_annex() classmethod
which also checks for `path` to exist first.
This includes actually present annexes, but also uninitialized ones, or
even the presence of a remote annex branch.
"""
from os.path import exists
if not exists(path):
lgr.debug("No annex: test path %s doesn't exist", path)
return False
from datalad.support.gitrepo import GitRepo
return GitRepo(path, init=False, create=False).is_with_annex()
@contextmanager
def make_tempfile(content: str | bytes | None = None, wrapped: Optional[Callable[..., Any]] = None, **tkwargs: Any) -> Iterator[str]:
"""Helper class to provide a temporary file name and remove it at the end (context manager)
Parameters
----------
mkdir : bool, optional (default: False)
If True, temporary directory created using tempfile.mkdtemp()
content : str or bytes, optional
Content to be stored in the file created
wrapped : function, optional
If set, function name used to prefix temporary file name
`**tkwargs`:
All other arguments are passed into the call to tempfile.mk{,d}temp(),
and resultant temporary filename is passed as the first argument into
the function t. If no 'prefix' argument is provided, it will be
constructed using module and function names ('.' replaced with
'_').
To change the used directory without providing keyword argument 'dir' set
DATALAD_TESTS_TEMP_DIR.
Examples
--------
>>> from os.path import exists
>>> from datalad.utils import make_tempfile
>>> with make_tempfile() as fname:
... k = open(fname, 'w').write('silly test')
>>> assert not exists(fname) # was removed
>>> with make_tempfile(content="blah") as fname:
... assert open(fname).read() == "blah"
"""
if tkwargs.get('mkdir', None) and content is not None:
raise ValueError("mkdir=True while providing content makes no sense")
tkwargs_ = get_tempfile_kwargs(tkwargs, wrapped=wrapped)
# if DATALAD_TESTS_TEMP_DIR is set, use that as directory,
# let mktemp handle it otherwise. However, an explicitly provided
# dir=... will override this.
mkdir = bool(tkwargs_.pop('mkdir', False))
filename = {False: tempfile.mktemp,
True: tempfile.mkdtemp}[mkdir](**tkwargs_)
# MIH: not clear to me why we need to perform this (possibly expensive)
# resolve. It was already part of the original implementation
# 008d9ab8cc3e0170c0a9b8479e80dee9ffe6eb7f
filepath = Path(filename).resolve()
if content:
if isinstance(content, bytes):
filepath.write_bytes(content)
else:
filepath.write_text(content)
# TODO globbing below can also be done with pathlib
filename = str(filepath)
if __debug__:
lgr.debug(
'Created temporary %s named %s',
'directory' if mkdir else 'file',
filename)
try:
yield filename
finally:
# glob here for all files with the same name (-suffix)
# would be useful whenever we requested .img filename,
# and function creates .hdr as well
# MIH: this is undocumented behavior, and undesired in the general
# case. it should be made conditional and explicit
lsuffix = len(tkwargs_.get('suffix', ''))
filename_ = lsuffix and filename[:-lsuffix] or filename
filenames = glob.glob(filename_ + '*')
if len(filename_) < 3 or len(filenames) > 5:
# For paranoid yoh who stepped into this already ones ;-)
lgr.warning("It is unlikely that it was intended to remove all"
" files matching %r. Skipping" % filename_)
return
for f in filenames:
try:
rmtemp(f)
except OSError: # pragma: no cover
pass
def _path_(*p: str) -> str:
"""Given a path in POSIX notation, regenerate one in native to the env one"""
if on_windows:
return op.join(*map(lambda x: op.join(*x.split('/')), p))
else:
# Assume that all others as POSIX compliant so nothing to be done
return op.join(*p)
def get_timestamp_suffix(time_: int | time.struct_time | None=None, prefix: str='-') -> str:
"""Return a time stamp (full date and time up to second)
primarily to be used for generation of log files names
"""
args = []
if time_ is not None:
if isinstance(time_, int):
time_ = time.gmtime(time_)
args.append(time_)
return time.strftime(prefix + TIMESTAMP_FMT, *args)
# unused in -core
def get_logfilename(dspath: str | Path, cmd: str='datalad') -> str:
"""Return a filename to use for logging under a dataset/repository
directory would be created if doesn't exist, but dspath must exist
and be a directory
"""
assert(exists(dspath))
assert(isdir(dspath))
ds_logdir = ensure_dir(str(dspath), '.git', 'datalad', 'logs') # TODO: use WEB_META_LOG whenever #789 merged
return op.join(ds_logdir, 'crawl-%s.log' % get_timestamp_suffix())
def get_trace(edges: Sequence[tuple[T, T]], start: T, end: T, trace: Optional[list[T]]=None) -> Optional[list[T]]:
"""Return the trace/path to reach a node in a tree.
Parameters
----------
edges : sequence(2-tuple)
The tree given by a sequence of edges (parent, child) tuples. The
nodes can be identified by any value and data type that supports
the '==' operation.
start :
Identifier of the start node. Must be present as a value in the parent
location of an edge tuple in order to be found.
end :
Identifier of the target/end node. Must be present as a value in the child
location of an edge tuple in order to be found.
trace : list
Mostly useful for recursive calls, and used internally.
Returns
-------
None or list
Returns a list with the trace to the target (the starts and the target
are not included in the trace, hence if start and end are directly connected
an empty list is returned), or None when no trace to the target can be found,
or start and end are identical.
"""
# the term trace is used to avoid confusion with a path in the sense
# of a filesystem path, but the analogy fits and nodes can be paths
if trace is None:
trace = []
if not edges:
raise ValueError("no edges given")
for cand in edges:
cand_super, cand_sub = cand
if cand_sub in trace:
# only DAGs, skip any cyclic traces
continue
if trace and cand_super != trace[-1]:
# only consider edges that lead off the end of the trace
continue
if not trace and cand_super != start:
# we got nothing yet, and this edges is not matching the start
continue
if cand_sub == end:
return trace
# dive into potential subnodes
cand_trace = get_trace(
edges,
start,
end,
trace + [cand_sub])
if cand_trace:
return cand_trace
return None
def get_dataset_root(path: str | Path) -> Optional[str]:
"""Return the root of an existent dataset containing a given path
The root path is returned in the same absolute or relative form
as the input argument. If no associated dataset exists, or the
input path doesn't exist, None is returned.
If `path` is a symlink or something other than a directory, its
the root dataset containing its parent directory will be reported.
If none can be found, at a symlink at `path` is pointing to a
dataset, `path` itself will be reported as the root.
Parameters
----------
path : Path-like
Returns
-------
str or None
"""
# NOTE: path = "" is effectively "."
path = str(path)
suffix = '.git'
altered = None
if islink(path) or not isdir(path):
altered = path
path = dirname(path)
apath = abspath(path)
# while we can still go up
while split(apath)[1]:
if exists(op.join(path, suffix)):
return path
# new test path in the format we got it
path = normpath(op.join(path, os.pardir))
# no luck, next round
apath = abspath(path)
# if we applied dirname() at the top, we give it another go with
# the actual path, if it was itself a symlink, it could be the
# top-level dataset itself
if altered and exists(op.join(altered, suffix)):
return altered
return None
# ATM used in datalad_crawler extension, so do not remove yet
def try_multiple(ntrials: int, exception: type[BaseException], base: float, f: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
"""Call f multiple times making exponentially growing delay between the calls"""
for trial in range(1, ntrials+1):
try:
return f(*args, **kwargs)
except exception as exc:
if trial == ntrials:
raise # just reraise on the last trial
t = base ** trial
lgr.warning("Caught %s on trial #%d. Sleeping %f and retrying",
CapturedException(exc), trial, t)
sleep(t)
raise ValueError("ntrials must be > 0")
@optional_args
def try_multiple_dec(
f: Callable[P, T],
ntrials: Optional[int] = None,
duration: float = 0.1,
exceptions: type[BaseException] | tuple[type[BaseException], ...] | None = None,
increment_type: Literal["exponential"] | None = None,
exceptions_filter: Optional[Callable[[BaseException], Any]] = None,
logger: Optional[Callable] = None,
) -> Callable[P, T]:
"""Decorator to try function multiple times.
Main purpose is to decorate functions dealing with removal of files/directories
and which might need a few seconds to work correctly on Windows which takes
its time to release files/directories.
Parameters
----------
ntrials: int, optional
duration: float, optional
Seconds to sleep before retrying.
increment_type: {None, 'exponential'}
Note that if it is exponential, duration should typically be > 1.0
so it grows with higher power
exceptions: Exception or tuple of Exceptions, optional
Exception or a tuple of multiple exceptions, on which to retry
exceptions_filter: callable, optional
If provided, this function will be called with a caught exception
instance. If function returns True - we will re-try, if False - exception
will be re-raised without retrying.
logger: callable, optional
Logger to log upon failure. If not provided, will use stock logger
at the level of 5 (heavy debug).
"""
# We need to bind these to new names so that mypy doesn't complain about
# the values possibly being `None` inside the inner function:
exceptions_: type[BaseException] | tuple[type[BaseException], ...]
if not exceptions:
exceptions_ = (OSError, PermissionError) if on_windows else OSError
else:
exceptions_ = exceptions
if not ntrials:
# Life goes fast on proper systems, no need to delay it much
ntrials_ = 100 if on_windows else 10
else:
ntrials_ = ntrials
if logger is None:
def logger_(*args: Any, **kwargs: Any) -> None:
return lgr.log(5, *args, **kwargs)
else:
logger_ = logger
assert increment_type in {None, 'exponential'}
@wraps(f)
def _wrap_try_multiple_dec(*args: P.args, **kwargs: P.kwargs) -> T:
t = duration
for trial in range(ntrials_):
try:
return f(*args, **kwargs)
except exceptions_ as exc:
if exceptions_filter and not exceptions_filter(exc):
raise
if trial < ntrials_ - 1:
if increment_type == 'exponential':
t = duration ** (trial + 1)
logger_(
"Caught %s on trial #%d. Sleeping %f and retrying",
CapturedException(exc), trial, t)
sleep(t)
else:
raise
raise ValueError("ntrials must be > 0")
return _wrap_try_multiple_dec
@try_multiple_dec
def unlink(f: str | Path) -> None:
"""'Robust' unlink. Would try multiple times
On windows boxes there is evidence for a latency of more than a second
until a file is considered no longer "in-use".
WindowsError is not known on Linux, and if IOError or any other
exception
is thrown then if except statement has WindowsError in it -- NameError
also see gh-2533
"""
# Check for open files
assert_no_open_files(f)
return os.unlink(f)
@try_multiple_dec
def _rmtree(*args: Any, **kwargs: Any) -> None:
"""Just a helper to decorate shutil.rmtree.
rmtree defined above does more and ideally should not itself be decorated
since a recursive definition and does checks for open files inside etc -
might be too runtime expensive
"""
shutil.rmtree(*args, **kwargs)
def slash_join(base: Optional[str], extension: Optional[str]) -> Optional[str]:
"""Join two strings with a '/', avoiding duplicate slashes
If any of the strings is None the other is returned as is.
"""
if extension is None:
return base
if base is None:
return extension
return '/'.join(
(base.rstrip('/'),
extension.lstrip('/')))
#
# IO Helpers
#
# unused in -core
def open_r_encdetect(fname: str | Path, readahead: int=1000) -> IO[str]:
"""Return a file object in read mode with auto-detected encoding
This is helpful when dealing with files of unknown encoding.
Parameters
----------
readahead: int, optional
How many bytes to read for guessing the encoding type. If
negative - full file will be read
"""
import io
from chardet import detect
# read some bytes from the file
with open(fname, 'rb') as f:
head = f.read(readahead)
enc = detect(head)
denc = enc.get('encoding', None)
lgr.debug("Auto-detected encoding %s for file %s (confidence: %s)",
denc,
fname,
enc.get('confidence', 'unknown'))
return io.open(fname, encoding=denc)
@overload
def read_file(fname: str | Path, decode: Literal[True] =True) -> str:
...
@overload
def read_file(fname: str | Path, decode: Literal[False]) -> bytes:
...
def read_file(fname: str | Path, decode: Literal[True, False] =True) -> str | bytes:
"""A helper to read file passing content via ensure_unicode
Parameters
----------
decode: bool, optional
if False, no ensure_unicode and file content returned as bytes
"""
with open(fname, 'rb') as f:
content = f.read()
return ensure_unicode(content) if decode else content
def read_csv_lines(fname: str | Path, dialect: Optional[str] = None, readahead: int=16384, **kwargs: Any) -> Iterator[dict[str, str]]:
"""A generator of dict records from a CSV/TSV
Automatically guesses the encoding for each record to convert to UTF-8
Parameters
----------
fname: str
Filename
dialect: str, optional
Dialect to specify to csv.reader. If not specified -- guessed from
the file, if fails to guess, "excel-tab" is assumed
readahead: int, optional
How many bytes to read from the file to guess the type
**kwargs
Passed to `csv.reader`
"""
import csv
csv_dialect: str | type[csv.Dialect]
if dialect is None:
with open(fname) as tsvfile:
# add robustness, use a sniffer
try:
csv_dialect = csv.Sniffer().sniff(tsvfile.read(readahead))
except Exception as exc:
lgr.warning(
'Could not determine file-format, assuming TSV: %s',
CapturedException(exc)
)
csv_dialect = 'excel-tab'
else:
csv_dialect = dialect
with open(fname, 'r', encoding="utf-8") as tsvfile:
csv_reader = csv.reader(
tsvfile,
dialect=csv_dialect,
**kwargs
)
header: Optional[list[str]] = None
for row in csv_reader:
if header is None:
header = row
else:
yield dict(zip(header, row))
def import_modules(modnames: Iterable[str], pkg: str, msg: str="Failed to import {module}", log: Callable[[str], Any]=lgr.debug) -> list[ModuleType]:
"""Helper to import a list of modules without failing if N/A
Parameters
----------
modnames: list of str
List of module names to import
pkg: str
Package under which to import
msg: str, optional
Message template for .format() to log at DEBUG level if import fails.
Keys {module} and {package} will be provided and ': {exception}' appended
log: callable, optional
Logger call to use for logging messages
"""
from importlib import import_module
_globals = globals()
mods_loaded = []
if pkg and not pkg in sys.modules:
# with python 3.5.1 (ok with 3.5.5) somehow kept running into
# Failed to import dlsub1: Parent module 'dltestm1' not loaded
# while running the test. Preloading pkg resolved the issue
import_module(pkg)
for modname in modnames:
try:
_globals[modname] = mod = import_module(
'.{}'.format(modname),
pkg)
mods_loaded.append(mod)
except Exception as exc:
from datalad.support.exceptions import CapturedException
ce = CapturedException(exc)
log((msg + ': {exception}').format(
module=modname, package=pkg, exception=ce.message))
return mods_loaded
def import_module_from_file(modpath: str, pkg: Optional[ModuleType]=None, log: Callable[[str], Any]=lgr.debug) -> ModuleType:
"""Import provided module given a path
TODO:
- RF/make use of it in pipeline.py which has similar logic
- join with import_modules above?
Parameters
----------
pkg: module, optional
If provided, and modpath is under pkg.__path__, relative import will be
used
"""
assert(modpath.endswith('.py')) # for now just for .py files
log("Importing %s" % modpath)
modname = basename(modpath)[:-3]
relmodpath = None
if pkg:
for pkgpath in pkg.__path__:
if path_is_subpath(modpath, pkgpath):
# for now relying on having .py extension -- assertion above
relmodpath = '.' + relpath(modpath[:-3], pkgpath).replace(sep, '.')
break
try:
if relmodpath:
from importlib import import_module
mod = import_module(relmodpath, pkg.__name__ if pkg is not None else None)
else:
dirname_ = dirname(modpath)
try:
sys.path.insert(0, dirname_)
mod = __import__(modname, level=0)
finally:
if dirname_ in sys.path:
sys.path.pop(sys.path.index(dirname_))
else:
log("Expected path %s to be within sys.path, but it was gone!" % dirname_)
except Exception as e:
raise RuntimeError(
"Failed to import module from %s" % modpath) from e
return mod
def get_encoding_info() -> dict[str, str]:
"""Return a dictionary with various encoding/locale information"""
import locale
import sys
return dict([
('default', sys.getdefaultencoding()),
('filesystem', sys.getfilesystemencoding()),
('locale.prefered', locale.getpreferredencoding()),
])
def get_envvars_info() -> dict[str, str]:
envs = []
for var, val in os.environ.items():
if (
var.startswith('PYTHON') or
var.startswith('LC_') or
var.startswith('GIT_') or
var in ('LANG', 'LANGUAGE', 'PATH')
):
envs.append((var, val))
return dict(envs)
# This class is modified from Snakemake (v5.1.4)
class SequenceFormatter(string.Formatter):
"""string.Formatter subclass with special behavior for sequences.
This class delegates formatting of individual elements to another
formatter object. Non-list objects are formatted by calling the
delegate formatter's "format_field" method. List-like objects
(list, tuple, set, frozenset) are formatted by formatting each
element of the list according to the specified format spec using
the delegate formatter and then joining the resulting strings with
a separator (space by default).
"""
def __init__(self, separator: str=" ", element_formatter: string.Formatter =string.Formatter(),
*args: Any, **kwargs: Any) -> None:
self.separator = separator
self.element_formatter = element_formatter
def format_element(self, elem: Any, format_spec: str) -> Any:
"""Format a single element
For sequences, this is called once for each element in a
sequence. For anything else, it is called on the entire
object. It is intended to be overridden in subclases.
"""
return self.element_formatter.format_field(elem, format_spec)
def format_field(self, value: Any, format_spec: str) -> Any:
if isinstance(value, (list, tuple, set, frozenset)):
return self.separator.join(self.format_element(v, format_spec)
for v in value)
else:
return self.format_element(value, format_spec)
# TODO: eventually we might want to make use of attr module
class File:
"""Helper for a file entry in the create_tree/@with_tree
It allows to define additional settings for entries
"""
def __init__(self, name: str, executable: bool=False) -> None:
"""
Parameters
----------
name : str
Name of the file
executable: bool, optional
Make it executable
"""
self.name = name
self.executable = executable
def __str__(self) -> str:
return self.name
TreeSpec = Union[
Tuple[Tuple[Union[str, File], "Load"], ...],
List[Tuple[Union[str, File], "Load"]],
Dict[Union[str, File], "Load"],
]
Load = Union[str, bytes, "TreeSpec"]
def create_tree_archive(path: str, name: str, load: TreeSpec, overwrite: bool=False, archives_leading_dir: bool=True) -> None:
"""Given an archive `name`, create under `path` with specified `load` tree
"""
from datalad.support.archives import compress_files
dirname = file_basename(name)
full_dirname = op.join(path, dirname)
os.makedirs(full_dirname)
create_tree(full_dirname, load, archives_leading_dir=archives_leading_dir)
# create archive
if archives_leading_dir:
compress_files([dirname], name, path=path, overwrite=overwrite)
else:
compress_files(
#
list(map(basename, glob.glob(op.join(full_dirname, '*')))), # type: ignore[arg-type]
op.join(pardir, name),
path=op.join(path, dirname),
overwrite=overwrite)
# remove original tree
rmtree(full_dirname)
def create_tree(path: str, tree: TreeSpec, archives_leading_dir: bool =True, remove_existing: bool =False) -> None:
"""Given a list of tuples (name, load) create such a tree
if load is a tuple itself -- that would create either a subtree or an archive
with that content and place it into the tree if name ends with .tar.gz
"""
lgr.log(5, "Creating a tree under %s", path)
if not exists(path):
os.makedirs(path)
if isinstance(tree, dict):
tree = list(tree.items())
for file_, load in tree:
if isinstance(file_, File):
executable = file_.executable
name = file_.name
else:
executable = False
name = file_
full_name = op.join(path, name)
if remove_existing and lexists(full_name):
rmtree(full_name, chmod_files=True)
if isinstance(load, (tuple, list, dict)):
if name.endswith('.tar.gz') or name.endswith('.tar') or name.endswith('.zip'):
create_tree_archive(
path, name, load,
archives_leading_dir=archives_leading_dir)
else:
create_tree(
full_name, load,
archives_leading_dir=archives_leading_dir,
remove_existing=remove_existing)
else:
if full_name.endswith('.gz'):
def open_func() -> IO[bytes]:
return gzip.open(full_name, "wb") # type: ignore[return-value]
elif full_name.split('.')[-1] in ('xz', 'lzma'):
import lzma
def open_func() -> IO[bytes]:
return lzma.open(full_name, "wb")
else:
def open_func() -> IO[bytes]:
return open(full_name, "wb")
with open_func() as f:
f.write(ensure_bytes(load, 'utf-8'))
if executable:
os.chmod(full_name, os.stat(full_name).st_mode | stat.S_IEXEC)
def get_suggestions_msg(values: Optional[str | Iterable[str]], known: str, sep: str="\n ") -> str:
"""Return a formatted string with suggestions for values given the known ones
"""
import difflib
suggestions = []
if not values:
values = []
elif isinstance(values, str):
values = [values]
for value in values: # might not want to do it if we change presentation below
suggestions += difflib.get_close_matches(value, known)
suggestions = unique(suggestions)
msg = "Did you mean any of these?"
if suggestions:
if '\n' in sep:
# if separator includes new line - we add entire separator right away
msg += sep
else:
msg += ' '
return msg + "%s\n" % sep.join(suggestions)
return ''
def bytes2human(n: int | float, format: str ='%(value).1f %(symbol)sB') -> str:
"""
Convert n bytes into a human readable string based on format.
symbols can be either "customary", "customary_ext", "iec" or "iec_ext",
see: http://goo.gl/kTQMs
>>> from datalad.utils import bytes2human
>>> bytes2human(1)
'1.0 B'
>>> bytes2human(1024)
'1.0 KB'
>>> bytes2human(1048576)
'1.0 MB'
>>> bytes2human(1099511627776127398123789121)
'909.5 YB'
>>> bytes2human(10000, "%(value).1f %(symbol)s/sec")
'9.8 K/sec'
>>> # precision can be adjusted by playing with %f operator
>>> bytes2human(10000, format="%(value).5f %(symbol)s")
'9.76562 K'
Taken from: http://goo.gl/kTQMs and subsequently simplified
Original Author: Giampaolo Rodola'
License: MIT
"""
n = int(n)
if n < 0:
raise ValueError("n < 0")
symbols = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
prefix = {}
for i, s in enumerate(symbols[1:]):
prefix[s] = 1 << (i + 1) * 10
for symbol in reversed(symbols[1:]):
if n >= prefix[symbol]:
value = float(n) / prefix[symbol]
return format % locals()
return format % dict(symbol=symbols[0], value=n)
def quote_cmdlinearg(arg: str) -> str:
"""Perform platform-appropriate argument quoting"""
# https://stackoverflow.com/a/15262019
return '"{}"'.format(
arg.replace('"', '""')
) if on_windows else shlex_quote(arg)
def guard_for_format(arg: str) -> str:
"""Replace { and } with {{ and }}
To be used in cases if arg is not expected to have provided
by user .format() placeholders, but 'arg' might become a part
of a composite passed to .format(), e.g. via 'Run'
"""
return arg.replace('{', '{{').replace('}', '}}')
def join_cmdline(args: Iterable[str]) -> str:
"""Join command line args into a string using quote_cmdlinearg
"""
return ' '.join(map(quote_cmdlinearg, args))
def split_cmdline(s: str) -> list[str]:
"""Perform platform-appropriate command line splitting.
Identical to `shlex.split()` on non-windows platforms.
Modified from https://stackoverflow.com/a/35900070
"""
if not on_windows:
return shlex_split(s)
# the rest is for windows
RE_CMD_LEX = r'''"((?:""|\\["\\]|[^"])*)"?()|(\\\\(?=\\*")|\\")|(&&?|\|\|?|\d?>|[<])|([^\s"&|<>]+)|(\s+)|(.)'''
args = []
accu = None # collects pieces of one arg
for qs, qss, esc, pipe, word, white, fail in re.findall(RE_CMD_LEX, s):
if word:
pass # most frequent
elif esc:
word = esc[1]
elif white or pipe:
if accu is not None:
args.append(accu)
if pipe:
args.append(pipe)
accu = None
continue
elif fail:
raise ValueError("invalid or incomplete shell string")
elif qs:
word = qs.replace('\\"', '"').replace('\\\\', '\\')
if platform == 0:
word = word.replace('""', '"')
else:
word = qss # may be even empty; must be last
accu = (accu or '') + word
if accu is not None:
args.append(accu)
return args
def get_wrapped_class(wrapped: Callable) -> type:
"""Determine the command class a wrapped __call__ belongs to"""
mod = sys.modules[wrapped.__module__]
command_class_name = wrapped.__qualname__.split('.')[-2]
_func_class = mod.__dict__[command_class_name]
lgr.debug("Determined class of decorated function: %s", _func_class)
return _func_class
def _make_assure_kludge(fn: Callable[P, T]) -> Callable[P, T]:
old_name = fn.__name__.replace("ensure", "assure")
@wraps(fn)
def compat_fn(*args: P.args, **kwargs: P.kwargs) -> T:
warnings.warn(
"{} is deprecated and will be removed in a future release. "
"Use {} instead."
.format(old_name, fn.__name__),
DeprecationWarning)
return fn(*args, **kwargs)
compat_fn.__doc__ = ("Note: This function is deprecated. Use {} instead."
.format(fn.__name__))
return compat_fn
assure_tuple_or_list = _make_assure_kludge(ensure_tuple_or_list)
assure_iter = _make_assure_kludge(ensure_iter)
assure_list = _make_assure_kludge(ensure_list)
assure_list_from_str = _make_assure_kludge(ensure_list_from_str)
assure_dict_from_str = _make_assure_kludge(ensure_dict_from_str)
assure_bytes = _make_assure_kludge(ensure_bytes)
assure_unicode = _make_assure_kludge(ensure_unicode)
assure_bool = _make_assure_kludge(ensure_bool)
assure_dir = _make_assure_kludge(ensure_dir)
lgr.log(5, "Done importing datalad.utils")
def check_symlink_capability(path: Path, target: Path) -> bool:
"""helper similar to datalad.tests.utils_pytest.has_symlink_capability
However, for use in a datalad command context, we shouldn't
assume to be able to write to tmpfile and also not import a whole lot from
datalad's test machinery. Finally, we want to know, whether we can create a
symlink at a specific location, not just somewhere. Therefore use
arbitrary path to test-build a symlink and delete afterwards. Suitable
location can therefore be determined by high lever code.
Parameters
----------
path: Path
target: Path
Returns
-------
bool
"""
try:
target.touch()
path.symlink_to(target)
return True
except Exception:
return False
finally:
if path.exists():
path.unlink()
if target.exists():
target.unlink()
def obtain_write_permission(path: Path) -> Optional[int]:
"""Obtains write permission for `path` and returns previous mode if a
change was actually made.
Parameters
----------
path: Path
path to try to obtain write permission for
Returns
-------
int or None
previous mode of `path` as return by stat().st_mode if a change in
permission was actually necessary, `None` otherwise.
"""
mode = path.stat().st_mode
# only IWRITE works on Windows, in principle
if not mode & stat.S_IWRITE:
path.chmod(mode | stat.S_IWRITE)
return mode
else:
return None
@contextmanager
def ensure_write_permission(path: Path) -> Iterator[None]:
"""Context manager to get write permission on `path` and
restore original mode afterwards.
Parameters
----------
path: Path
path to the target file
Raises
------
PermissionError
if write permission could not be obtained
"""
restore = None
try:
restore = obtain_write_permission(path)
yield
finally:
if restore is not None:
try:
path.chmod(restore)
except FileNotFoundError:
# If `path` was deleted within the context block, there's
# nothing to do. Don't test exists(), though - asking for
# forgiveness to save a call.
pass
././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945233.0
datalad-1.3.1/datalad/version.py 0000644 0001751 0001751 00000001506 15137634221 016213 0 ustar 00runner runner # emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# Compatibility kludge for now to not break anything relying on datalad.version
#
import warnings
from ._version import get_versions
warnings.warn(
"datalad.version module will be removed in 0.16. "
"Please use datalad.__version__ (no other __*_version__ variables are to be provided).",
DeprecationWarning)
__version__ = get_versions()['version']
__hardcoded_version__ = __version__
__full_version__ = __version__
del get_versions
././@PaxHeader 0000000 0000000 0000000 00000000034 00000000000 010212 x ustar 00 28 mtime=1769945274.9160612
datalad-1.3.1/datalad.egg-info/ 0000755 0001751 0001751 00000000000 15137634273 015653 5 ustar 00runner runner ././@PaxHeader 0000000 0000000 0000000 00000000026 00000000000 010213 x ustar 00 22 mtime=1769945274.0
datalad-1.3.1/datalad.egg-info/PKG-INFO 0000644 0001751 0001751 00000077651 15137634272 016767 0 ustar 00runner runner Metadata-Version: 2.4
Name: datalad
Version: 1.3.1
Summary: Distributed system for joint management of code, data, and their relationship
Author-email: The DataLad Team and Contributors
License-Expression: MIT
Project-URL: Homepage, https://www.datalad.org
Project-URL: Developer docs, https://docs.datalad.org/en/stable
Project-URL: Documentation, https://handbook.datalad.org
Project-URL: Repository, https://github.com/datalad/datalad
Project-URL: Issues, https://github.com/datalad/datalad/issues
Project-URL: RRID, https://identifiers.org/RRID:SCR_003931
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Education
Classifier: Intended Audience :: End Users/Desktop
Classifier: Intended Audience :: Science/Research
Classifier: Natural Language :: English
Classifier: Operating System :: POSIX
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Unix Shell
Classifier: Topic :: Communications :: File Sharing
Classifier: Topic :: Education
Classifier: Topic :: Internet
Classifier: Topic :: Other/Nonlisted Topic
Classifier: Topic :: Scientific/Engineering
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Software Development :: Version Control :: Git
Classifier: Topic :: Utilities
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: COPYING
Requires-Dist: platformdirs
Requires-Dist: chardet>=3.0.4
Requires-Dist: colorama; platform_system == "Windows"
Requires-Dist: distro
Requires-Dist: iso8601
Requires-Dist: humanize
Requires-Dist: fasteners>=0.14
Requires-Dist: packaging
Requires-Dist: patool>=1.7
Requires-Dist: tqdm>=4.32.0
Requires-Dist: typing_extensions>=4.0.0; python_version < "3.11"
Requires-Dist: annexremote
Requires-Dist: looseversion
Requires-Dist: boto3
Requires-Dist: keyring!=23.9.0,>=20.0
Requires-Dist: keyrings.alt
Requires-Dist: msgpack
Requires-Dist: requests>=1.2
Requires-Dist: python-gitlab
Provides-Extra: downloaders-extra
Requires-Dist: requests_ftp; extra == "downloaders-extra"
Provides-Extra: misc
Requires-Dist: argcomplete>=1.12.3; extra == "misc"
Requires-Dist: pyperclip; extra == "misc"
Requires-Dist: python-dateutil; extra == "misc"
Provides-Extra: tests
Requires-Dist: BeautifulSoup4; extra == "tests"
Requires-Dist: httpretty>=0.9.4; extra == "tests"
Requires-Dist: mypy; extra == "tests"
Requires-Dist: pytest>=7.0; extra == "tests"
Requires-Dist: pytest-cov; extra == "tests"
Requires-Dist: pytest-retry; extra == "tests"
Requires-Dist: pytest-fail-slow~=0.2; extra == "tests"
Requires-Dist: types-python-dateutil; extra == "tests"
Requires-Dist: types-requests; extra == "tests"
Requires-Dist: vcrpy; extra == "tests"
Provides-Extra: duecredit
Requires-Dist: duecredit; extra == "duecredit"
Provides-Extra: full
Requires-Dist: datalad[downloaders-extra,duecredit,misc,tests]; extra == "full"
Provides-Extra: devel-docs
Requires-Dist: pypandoc; extra == "devel-docs"
Requires-Dist: sphinx>=4.3.0; extra == "devel-docs"
Requires-Dist: sphinx-autodoc-typehints; extra == "devel-docs"
Requires-Dist: sphinx-rtd-theme>=0.5.1; extra == "devel-docs"
Provides-Extra: devel-utils
Requires-Dist: asv; extra == "devel-utils"
Requires-Dist: coverage!=7.13.1,!=7.6.5; extra == "devel-utils"
Requires-Dist: gprof2dot; extra == "devel-utils"
Requires-Dist: psutil; extra == "devel-utils"
Requires-Dist: pytest-xdist; extra == "devel-utils"
Requires-Dist: scriv; extra == "devel-utils"
Provides-Extra: devel
Requires-Dist: datalad[devel-docs,devel-utils,full]; extra == "devel"
Dynamic: license-file
____ _ _ _
| _ \ __ _ | |_ __ _ | | __ _ __| |
| | | | / _` | | __| / _` | | | / _` | / _` |
| |_| | | (_| | | |_ | (_| | | |___ | (_| | | (_| |
|____/ \__,_| \__| \__,_| |_____| \__,_| \__,_|
Read me
[](https://doi.org/10.21105/joss.03262)
[](https://github.com/datalad/datalad/actions/workflows/test.yml)
[](https://ci.appveyor.com/project/mih/datalad/branch/master)
[](https://github.com/datalad/datalad/actions/workflows/test_extensions.yml)
[](https://github.com/datalad/datalad/actions/workflows/lint.yml)
[](https://codecov.io/github/datalad/datalad?branch=master)
[](http://datalad.rtfd.org)
[](https://opensource.org/licenses/MIT)
[](https://GitHub.com/datalad/datalad/releases/)
[](https://pypi.org/project/datalad/)
[](https://github.com/datalad/datalad/wiki/Testimonials)
[](https://github.com/datalad/datalad/blob/master/CODE_OF_CONDUCT.md)
[](https://doi.org/10.5281/zenodo.808846)
[](https://identifiers.org/RRID:SCR_003931)
[](#contributors-)
## Distribution
[](https://anaconda.org/conda-forge/datalad)
[](https://repology.org/project/datalad/versions)
[](https://packages.debian.org/stable/datalad)
[](https://packages.debian.org/unstable/datalad)
[](https://repology.org/project/datalad/versions)
[](https://repology.org/project/datalad/versions)
[](https://repology.org/project/datalad/versions)
# 10000-ft. overview
DataLad's purpose is to make data management and data distribution more accessible.
To do so, it stands on the shoulders of [Git] and [Git-annex] to deliver a
decentralized system for data exchange. This includes automated ingestion of
data from online portals and exposing it in readily usable form as Git(-annex)
repositories - or datasets. However, the actual data storage and permission
management remains with the original data provider(s).
The full documentation is available at http://docs.datalad.org and
http://handbook.datalad.org provides a hands-on crash-course on DataLad.
# Extensions
A number of extensions are available that provide additional functionality for
DataLad. Extensions are separate packages that are to be installed in addition
to DataLad. In order to install DataLad customized for a particular domain, one
can simply install an extension directly, and DataLad itself will be
automatically installed with it. An [annotated list of
extensions](http://handbook.datalad.org/extension_pkgs.html) is available in
the [DataLad handbook](http://handbook.datalad.org).
# Support
The documentation for this project is found here:
http://docs.datalad.org
All bugs, concerns, and enhancement requests for this software can be submitted here:
https://github.com/datalad/datalad/issues
If you have a problem or would like to ask a question about how to use DataLad,
please [submit a question to
NeuroStars.org](https://neurostars.org/new-topic?body=-%20Please%20describe%20the%20problem.%0A-%20What%20steps%20will%20reproduce%20the%20problem%3F%0A-%20What%20version%20of%20DataLad%20are%20you%20using%20%28run%20%60datalad%20--version%60%29%3F%20On%20what%20operating%20system%20%28consider%20running%20%60datalad%20plugin%20wtf%60%29%3F%0A-%20Please%20provide%20any%20additional%20information%20below.%0A-%20Have%20you%20had%20any%20luck%20using%20DataLad%20before%3F%20%28Sometimes%20we%20get%20tired%20of%20reading%20bug%20reports%20all%20day%20and%20a%20lil'%20positive%20end%20note%20does%20wonders%29&tags=datalad)
with a `datalad` tag. NeuroStars.org is a platform similar to StackOverflow
but dedicated to neuroinformatics.
All previous DataLad questions are available here:
http://neurostars.org/tags/datalad/
# Installation
## Debian-based systems
On Debian-based systems, we recommend enabling [NeuroDebian], via which we
provide recent releases of DataLad. Once enabled, just do:
apt-get install datalad
## Gentoo-based systems
On Gentoo-based systems (i.e. all systems whose package manager can parse ebuilds as per the [Package Manager Specification]), we recommend [enabling the ::science overlay], via which we
provide recent releases of DataLad. Once enabled, just run:
emerge datalad
## Other Linux'es via conda
conda install -c conda-forge datalad
will install the most recently released version, and release candidates are
available via
conda install -c conda-forge/label/rc datalad
## Other Linux'es, macOS via pip
Before you install this package, please make sure that you [install a recent
version of git-annex](https://git-annex.branchable.com/install). Afterwards,
install the latest version of `datalad` from
[PyPI](https://pypi.org/project/datalad). It is recommended to use
a dedicated [virtualenv](https://virtualenv.pypa.io):
# Create and enter a new virtual environment (optional)
virtualenv --python=python3 ~/env/datalad
. ~/env/datalad/bin/activate
# Install from PyPI
pip install datalad
By default, installation via pip installs the core functionality of DataLad,
allowing for managing datasets etc. Additional installation schemes
are available, so you can request enhanced installation via
`pip install datalad[SCHEME]`, where `SCHEME` could be:
- `tests`
to also install dependencies used by DataLad's battery of unit tests
- `full`
to install all dependencies.
More details on installation and initial configuration can be found in the
[DataLad Handbook: Installation].
# License
MIT/Expat
# Contributing
See [CONTRIBUTING.md](CONTRIBUTING.md) if you are interested in internals or
contributing to the project.
## Acknowledgements
The DataLad project received support through the following grants:
- US-German collaboration in computational neuroscience (CRCNS) project
"DataGit: converging catalogues, warehouses, and deployment logistics into a
federated 'data distribution'" (Halchenko/Hanke), co-funded by the US National
Science Foundation (NSF 1429999) and the German Federal Ministry of
Education and Research (BMBF 01GQ1411).
- CRCNS US-German Data Sharing "DataLad - a decentralized system for integrated
discovery, management, and publication of digital objects of science"
(Halchenko/Pestilli/Hanke), co-funded by the US National Science Foundation
(NSF 1912266) and the German Federal Ministry of Education and Research
(BMBF 01GQ1905).
- Helmholtz Research Center JΓΌlich, FDM challenge 2022
- German federal state of Saxony-Anhalt and the European Regional Development
Fund (ERDF), Project: Center for Behavioral Brain Sciences, Imaging Platform
- ReproNim project (NIH 1P41EB019936-01A1).
- Deutsche Forschungsgemeinschaft (DFG, German Research Foundation) under grant
SFB 1451 ([431549029](https://gepris.dfg.de/gepris/projekt/431549029),
INF project)
- European Unionβs Horizon 2020 research and innovation programme under grant
agreements:
- [Human Brain Project SGA3 (H2020-EU.3.1.5.3, grant no. 945539)](https://cordis.europa.eu/project/id/945539)
- [VirtualBrainCloud (H2020-EU.3.1.5.3, grant no. 826421)](https://cordis.europa.eu/project/id/826421)
Mac mini instance for development is provided by
[MacStadium](https://www.macstadium.com/).
### Contributors β¨
Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):