# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Run prepared procedures (DataLad scripts) on a dataset"""
__docformat__ = 'restructuredtext'
import logging
from glob import iglob
from argparse import REMAINDER
import os
import sys
import os.path as op
import stat
from datalad import cfg
from datalad.interface.base import Interface
from datalad.interface.utils import eval_results
from datalad.interface.base import build_doc
from datalad.interface.results import get_status_dict
from datalad.distribution.dataset import Dataset
from datalad.distribution.dataset import require_dataset
from datalad.distribution.dataset import EnsureDataset
from datalad.support.constraints import EnsureNone
from datalad.support.param import Parameter
from datalad.distribution.dataset import datasetmethod
from datalad.support.exceptions import InsufficientArgumentsError
from datalad.support.exceptions import NoDatasetFound
from datalad.utils import (
    ensure_list,
    guard_for_format,
    join_cmdline,
    quote_cmdlinearg,
    split_cmdline,
)
import datalad.support.ansi_colors as ac
from datalad.core.local.run import Run
lgr = logging.getLogger('datalad.interface.run_procedures')
def _get_file_match(dir, name='*'):
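    """Yield (path, report name) pairs for procedure files in `dir`.

    With the default ``name='*'`` every file in `dir` is reported, with a
    trailing '.py' or '.sh' stripped from the reported name. For a concrete
    `name`, files matching the exact name or ``NAME.<extension>`` are
    reported under the requested name.
    """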
    targets = (name, '[!_]*.py', '[!_]*.sh')
lgr.debug("Looking for procedure '%s' in '%s'", name, dir)
for target in targets:
for m in iglob(op.join(dir, target)):
m_bn = op.basename(m)
if name == '*':
report_name = m_bn[:-3] if m_bn.endswith('.py') or \
m_bn.endswith('.sh') \
else m_bn
yield m, report_name
elif m_bn == name or m_bn.startswith('{}.'.format(name)):
yield m, name
def _get_proc_config(name, ds=None):
"""get configuration of named procedure
    Determine the call format string and help message for a given procedure
    name, based on the dataset's configuration (or the global configuration,
    if no dataset is given).
Returns
-------
tuple
(call format string, help string) or possibly None for either value,
if there's nothing configured
"""
    # figure out which ConfigManager to ask
cm = cfg if ds is None else ds.config
    # ConfigManager may not be up-to-date, particularly if we are in a
    # subdataset due to recursion in `_get_procedure_implementation`, where
    # the outside caller operates on (and reloads) the superdataset only.
    # With force=False this shouldn't be expensive.
cm.reload()
# ConfigManager might return a tuple for different reasons.
# The config might have been defined multiple times in the same location
# (within .datalad/config for example) or there are multiple values for
# it on different levels of git-config (system, user, repo). git-config
# in turn does report such things ordered from most general to most
# specific configuration. We do want the most specific one here, so
# config.get(), which returns the last entry, works here.
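    # (For example, if both ~/.gitconfig and a repository's .git/config define
    # datalad.procedures.NAME.call-format, the repository-level value is
    # reported last and is therefore the one used here.)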
    # TODO: At this point we cannot determine whether several values were
    # actually configured within the very same config scope, in which case we
    # should issue a warning, since we then have no notion of priority. But
    # ConfigManager cannot yet tell us this, nor can it restrict particular
    # items to a single value.
v = cm.get('datalad.procedures.{}.call-format'.format(name), None)
h = cm.get('datalad.procedures.{}.help'.format(name), None)
return v, h
def _get_procedure_implementation(name='*', ds=None):
"""get potential procedures: path, name, configuration, and a help message
The order of consideration is user-level, system-level, extra locations, dataset,
datalad extensions, datalad. Therefore local definitions/configurations take
precedence over ones, that come from outside (via a datalad-extension or a
dataset with its .datalad/config). If a dataset had precedence (as it was
before), the addition (or just an update) of a (sub-)dataset would otherwise
surprisingly cause you to execute code different from what you defined
within ~/.gitconfig or your local repository's .git/config.
So, local definitions take precedence over remote ones and more specific
ones over more general ones.
Yields
------
tuple
path, name, format string, help message
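
    For example, a single yielded item could look like this (an illustrative
    sketch; the actual path depends on the installation, and both the format
    string and the help message may be None if nothing is configured)::

        ('/usr/lib/python3/dist-packages/datalad/resources/procedures/cfg_yoda.py',
         'cfg_yoda', None, None)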
"""
ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None
    # 1. check user, system, and extra locations for the procedure
for loc in (cfg.obtain('datalad.locations.user-procedures'),
cfg.obtain('datalad.locations.system-procedures'),
cfg.get('datalad.locations.extra-procedures', get_all=True)):
for dir in ensure_list(loc):
for m, n in _get_file_match(dir, name):
yield (m, n,) + _get_proc_config(n)
# 2. check dataset for procedure
if ds is not None and ds.is_installed():
# could be more than one
dirs = ensure_list(
ds.config.obtain('datalad.locations.dataset-procedures'))
for dir in dirs:
# TODO `get` dirs if necessary
for m, n in _get_file_match(op.join(ds.path, dir), name):
yield (m, n,) + _get_proc_config(n, ds=ds)
# 2.1. check subdatasets recursively
for subds in ds.subdatasets(return_type='generator',
result_xfm='datasets'):
for m, n, f, h in _get_procedure_implementation(name=name, ds=subds):
yield m, n, f, h
# 3. check extensions for procedure
# delay heavy import until here
from pkg_resources import iter_entry_points
from pkg_resources import resource_isdir
from pkg_resources import resource_filename
for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
if resource_isdir(entry_point.module_name, 'resources/procedures'):
for m, n in _get_file_match(
resource_filename(
entry_point.module_name,
'resources/procedures'),
name):
yield (m, n,) + _get_proc_config(n)
    # 4. finally, check datalad itself for the procedure
for m, n in _get_file_match(
resource_filename('datalad', 'resources/procedures'),
name):
yield (m, n,) + _get_proc_config(n)
def _guess_exec(script_file):
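    """Guess how a procedure file can be executed.

    Returns a dict with keys 'type' (one of 'executable', 'bash_script',
    'python_script', or None), 'template' (a call-format string with
    {script}, {ds}, and {args} placeholders, or None), and 'state'
    ('executable', 'absent', or None if undetermined).
    """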
state = None
try:
is_exec = os.stat(script_file).st_mode & stat.S_IEXEC
except OSError as e:
from errno import ENOENT
if e.errno == ENOENT and op.islink(script_file):
            # broken symlink: the target does not exist, so there is nothing
            # to detect
return {'type': None, 'template': None, 'state': 'absent'}
else:
raise e
# TODO check for exec permission and rely on interpreter
if is_exec and not os.path.isdir(script_file):
return {'type': u'executable',
'template': u'{script} {ds} {args}',
'state': 'executable'}
elif script_file.endswith('.sh'):
return {'type': u'bash_script',
'template': u'bash {script} {ds} {args}',
'state': 'executable'}
elif script_file.endswith('.py'):
ex = quote_cmdlinearg(sys.executable)
return {'type': u'python_script',
'template': u'%s {script} {ds} {args}' % ex,
'state': 'executable'}
else:
return {'type': None, 'template': None, 'state': None}
@build_doc
class RunProcedure(Interface):
"""Run prepared procedures (DataLad scripts) on a dataset
*Concept*
A "procedure" is an algorithm with the purpose to process a dataset in a
particular way. Procedures can be useful in a wide range of scenarios,
like adjusting dataset configuration in a uniform fashion, populating
a dataset with particular content, or automating other routine tasks,
such as synchronizing dataset content with certain siblings.
Implementations of some procedures are shipped together with DataLad,
but additional procedures can be provided by 1) any DataLad extension,
2) any (sub-)dataset, 3) a local user, or 4) a local system administrator.
DataLad will look for procedures in the following locations and order:
Directories identified by the configuration settings
- 'datalad.locations.user-procedures' (determined by
appdirs.user_config_dir; defaults to '$HOME/.config/datalad/procedures'
on GNU/Linux systems)
- 'datalad.locations.system-procedures' (determined by
appdirs.site_config_dir; defaults to '/etc/xdg/datalad/procedures' on
GNU/Linux systems)
    - 'datalad.locations.extra-procedures'
    - 'datalad.locations.dataset-procedures'
and subsequently in the 'resources/procedures/' directories of any
installed extension, and, lastly, of the DataLad installation itself.
Please note that a dataset that defines
'datalad.locations.dataset-procedures' provides its procedures to
any dataset it is a subdataset of. That way you can have a collection of
such procedures in a dedicated dataset and install it as a subdataset into
any dataset you want to use those procedures with. In case of a naming
conflict with such a dataset hierarchy, the dataset you're calling
    run-procedure on will take precedence over its subdatasets, and so on.
Each configuration setting can occur multiple times to indicate multiple
directories to be searched. If a procedure matching a given name is found
    (the file name with any extension stripped), the search is aborted and this
implementation will be executed. This makes it possible for individual
datasets, users, or machines to override externally provided procedures
(enabling the implementation of customizable processing "hooks").
*Procedure implementation*
A procedure can be any executable. Executables must have the appropriate
permissions and, in the case of a script, must contain an appropriate
"shebang" line. If a procedure is not executable, but its filename ends
    with '.py', it is automatically executed with the Python interpreter that
    DataLad itself runs under. Likewise, procedure implementations ending in
    '.sh' are executed via 'bash'.
Procedures can implement any argument handling, but must be capable
of taking at least one positional argument (the absolute path to the
dataset they shall operate on).
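
    As an illustration (not a procedure shipped with DataLad), a minimal
    Python procedure could look like this::

        import sys
        from datalad.distribution.dataset import require_dataset

        # first positional argument: absolute path to the dataset
        ds = require_dataset(sys.argv[1], check_installed=True,
                             purpose='illustration')
        # any further arguments are up to the procedure
        ds.save(message="Ran an illustrative procedure")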
    For further customization, two configuration settings are available per
    procedure:
- 'datalad.procedures.<NAME>.call-format'
      a fully customizable format string that determines how procedure NAME
      is executed (see also datalad-run). It is currently required to include
      the following placeholders:
- '{script}': will be replaced by the path to the procedure
- '{ds}': will be replaced by the absolute path to the dataset the
procedure shall operate on
- '{args}': (not actually required) will be replaced by
[CMD: all additional arguments passed into run-procedure after NAME CMD]
[PY: all but the first element of `spec` if `spec` is a list or tuple PY]
      As an example, the default format string for calling a Python script is:
      "python {script} {ds} {args}" (see also the example configuration at the
      end of this section).
- 'datalad.procedures.<NAME>.help'
will be shown on `datalad run-procedure --help-proc NAME` to provide a
description and/or usage info for procedure NAME
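
    For example, both settings for a hypothetical procedure 'myproc' could be
    placed in a dataset's .datalad/config (or any other git-config scope) like
    this::

      [datalad "procedures.myproc"]
          call-format = bash {script} {ds} {args}
          help = Do the (hypothetical) myproc processing on a dataset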
"""
_params_ = dict(
spec=Parameter(
args=("spec",),
metavar='NAME [ARGS]',
nargs=REMAINDER,
doc="""Name and possibly additional arguments of the
to-be-executed procedure. [CMD: Note, that all options to
run-procedure need to be put before NAME, since all ARGS get
assigned to NAME CMD]"""),
dataset=Parameter(
args=("-d", "--dataset"),
metavar="PATH",
doc="""specify the dataset to run the procedure on.
An attempt is made to identify the dataset based on the current
working directory.""",
constraints=EnsureDataset() | EnsureNone()),
discover=Parameter(
args=('--discover',),
action='store_true',
doc="""if given, all configured paths are searched for procedures
and one result record per discovered procedure is yielded, but
no procedure is executed"""),
help_proc=Parameter(
args=('--help-proc',),
action='store_true',
doc="""if given, get a help message for procedure NAME from config
setting datalad.procedures.NAME.help"""
)
)
_examples_ = [
dict(text="Find out which procedures are available on the current system",
code_py="run_procedure(discover=True)",
code_cmd="datalad run-procedure --discover"),
dict(text="Run the 'yoda' procedure in the current dataset",
code_py="run_procedure(spec='cfg_yoda', recursive=True)",
code_cmd="datalad run-procedure cfg_yoda"),
]
result_renderer = 'tailored'
@staticmethod
@datasetmethod(name='run_procedure')
@eval_results
def __call__(
spec=None,
dataset=None,
discover=False,
help_proc=False):
if not spec and not discover:
raise InsufficientArgumentsError('requires at least a procedure name')
if help_proc and not spec:
raise InsufficientArgumentsError('requires a procedure name')
try:
ds = require_dataset(
dataset, check_installed=False,
purpose='run a procedure')
except NoDatasetFound:
ds = None
if discover:
            # paths of procedures that have already been reported
reported = set()
            # names of procedures for which an active (executable)
            # implementation has already been found
active = set()
for m, cmd_name, cmd_tmpl, cmd_help in \
_get_procedure_implementation('*', ds=ds):
if m in reported:
continue
ex = _guess_exec(m)
# configured template (call-format string) takes precedence:
if cmd_tmpl:
ex['template'] = cmd_tmpl
if ex['state'] is None:
# doesn't seem like a match
lgr.debug("%s does not look like a procedure, ignored.", m)
continue
state = 'overridden' if cmd_name in active else ex['state']
message = ex['type'] if ex['type'] else 'unknown type'
message += ' ({})'.format(state) if state != 'executable' else ''
res = get_status_dict(
action='discover_procedure',
path=m,
type='file',
logger=lgr,
refds=ds.path if ds else None,
status='ok',
state=state,
procedure_name=cmd_name,
procedure_type=ex['type'],
procedure_callfmt=ex['template'],
procedure_help=cmd_help,
message=message)
reported.add(m)
if state == 'executable':
active.add(cmd_name)
yield res
return
if not isinstance(spec, (tuple, list)):
# maybe coming from config
spec = split_cmdline(spec)
name = spec[0]
args = spec[1:]
try:
            # get the first match and run with it
procedure_file, cmd_name, cmd_tmpl, cmd_help = \
next(_get_procedure_implementation(name, ds=ds))
except StopIteration:
res = get_status_dict(
action='run_procedure',
# TODO: Default renderer requires a key "path" to exist.
# Doesn't make a lot of sense in this case
path=name,
logger=lgr,
refds=ds.path if ds else None,
status='impossible',
message="Cannot find procedure with name '%s'" % name)
yield res
return
ex = _guess_exec(procedure_file)
# configured template (call-format string) takes precedence:
if cmd_tmpl:
ex['template'] = cmd_tmpl
if help_proc:
if cmd_help:
res = get_status_dict(
action='procedure_help',
path=procedure_file,
type='file',
logger=lgr,
refds=ds.path if ds else None,
status='ok',
state=ex['state'],
procedure_name=cmd_name,
procedure_type=ex['type'],
procedure_callfmt=ex['template'],
message=cmd_help)
else:
res = get_status_dict(
action='procedure_help',
path=procedure_file,
type='file',
logger=lgr,
refds=ds.path if ds else None,
status='impossible',
state=ex['state'],
procedure_name=cmd_name,
procedure_type=ex['type'],
procedure_callfmt=ex['template'],
message="No help available for '%s'" % name)
yield res
return
if not ex['template']:
raise ValueError("No idea how to execute procedure %s. "
"Missing 'execute' permissions?" % procedure_file)
cmd = ex['template'].format(
script=guard_for_format(quote_cmdlinearg(procedure_file)),
ds=guard_for_format(quote_cmdlinearg(ds.path)) if ds else '',
args=join_cmdline(args) if args else '')
lgr.info(u"Running procedure %s", name)
lgr.debug(u'Full procedure command: %r', cmd)
for r in Run.__call__(
cmd=cmd,
dataset=ds,
explicit=True,
inputs=None,
outputs=None,
# pass through here
on_failure='ignore',
return_type='generator'
):
yield r
if ds:
# the procedure ran and we have to anticipate that it might have
# changed the dataset config, so we need to trigger an unforced
# reload.
# we have to do this despite "being done here", because
# run_procedure() runs in the same process and reuses dataset (config
# manager) instances, and the next interaction with a dataset should
# be able to count on an up-to-date config
ds.config.reload()
@staticmethod
def custom_result_renderer(res, **kwargs):
from datalad.ui import ui
from datalad.interface.utils import default_result_renderer
if res['status'] != 'ok':
# logging complained about this already
return
if 'procedure' not in res.get('action', ''):
# it's not our business
default_result_renderer(res)
return
if kwargs.get('discover', None):
ui.message('{name} ({path}){msg}'.format(
# bold-faced name, if active
name=ac.color_word(res['procedure_name'], ac.BOLD)
if res['state'] == 'executable' else res['procedure_name'],
path=res['path'],
msg=' [{}]'.format(
res['message'][0] % res['message'][1:]
if isinstance(res['message'], tuple) else res['message'])
if 'message' in res else ''
))
elif kwargs.get('help_proc', None):
ui.message('{name} ({path}){help}'.format(
name=ac.color_word(res['procedure_name'], ac.BOLD),
path=op.relpath(
res['path'],
res['refds'])
if res.get('refds', None) else res['path'],
help='{nl}{msg}'.format(
nl=os.linesep,
msg=res['message'][0] % res['message'][1:]
if isinstance(res['message'], tuple) else res['message'])
if 'message' in res else ''
))
else:
default_result_renderer(res)