# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the datalad package for the
#   copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""High-level interface for dataset (component) publishing

"""

import logging
import re
from collections import OrderedDict
from os.path import join as opj

from datalad import ssh_manager
from datalad.interface.annotate_paths import AnnotatePaths
from datalad.interface.annotate_paths import annotated2content_by_ds
from datalad.interface.base import Interface
from datalad.interface.base import build_doc
from datalad.interface.utils import eval_results
from datalad.interface.results import get_status_dict
from datalad.interface.common_opts import annex_copy_opts, recursion_flag, \
    recursion_limit, git_opts, annex_opts, jobs_opt
from datalad.interface.common_opts import missing_sibling_opt
from datalad.support.param import Parameter
from datalad.support.constraints import EnsureStr
from datalad.support.constraints import EnsureChoice
from datalad.support.constraints import EnsureNone
from datalad.support.annexrepo import AnnexRepo
from datalad.support.sshconnector import sh_quote
from datalad.support.exceptions import (
    InsufficientArgumentsError,
)
from datalad.support.network import URL, RI, SSHRI, is_ssh

from datalad.utils import ensure_list
from datalad.dochelpers import exc_str

from .dataset import EnsureDataset
from .dataset import Dataset
from .dataset import datasetmethod
from .dataset import require_dataset

__docformat__ = 'restructuredtext'

lgr = logging.getLogger('datalad.distribution.publish')


def _push(ds, remote, things2push, force=False):
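    """Push given refspecs to a remote and summarize the outcome.

    `things2push` is a list of refspecs (e.g. branch names). If
    `remote.<remote>.push` is configured, a second push without an explicit
    refspec is performed to also honor the configured push targets.

    Returns
    -------
    tuple
      (status, message) suitable for a result record, with status being
      one of 'notneeded', 'error', or 'ok'.
    """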
    lgr.debug("Attempt to push '%s' to sibling '%s'", things2push, remote)
    push_res = ds.repo.push(remote=remote, refspec=things2push, force=force)
    if things2push and ds.config.get('remote.{}.push'.format(remote)):
        # we aim to push both the auto-detected and any explicitly configured
        # refspecs; above we pushed the result of auto-detection, now push the
        # configured ones
        lgr.debug("Secondary push since custom push targets provided")
        push_res.extend(
            ds.repo.push(remote=remote, force=force))
    if not push_res:
        return 'notneeded', 'Git reported nothing was pushed'
    errors = [
        '{} -> {} {}'.format(
            pi['from_ref'],
            pi['to_ref'],
            pi['note'])
        for pi in push_res
        if 'error' in pi['operations']]
    successes = [
        pi['note']
        for pi in push_res
        if 'error' not in pi['operations']]
    if errors:
        return 'error', \
               ('failed to push to %s: %s;%s',
                remote,
                '; '.join(errors),
                ' pushed: {}'.format(successes) if successes else '')
    else:
        return 'ok', ('pushed to %s: %s', remote, successes)


def _get_remote_branch(ds, refspec=None):
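    """Determine the name of the branch to consider on the remote.

    If a `refspec` (from a configured tracking branch) is given, any
    'refs/heads/' prefix is stripped from it; otherwise the name of the
    currently active local branch is used.
    """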
    if refspec:
        remote_branch_name = refspec[11:] \
            if refspec.startswith('refs/heads/') \
            else refspec
    else:
        # there was no tracking branch, fall back to the active local branch
        remote_branch_name = ds.repo.get_active_branch()
    return remote_branch_name


def has_diff(ds, remote_branch_name, remote, paths):
    """Return bool if a dataset was modified wrt to a given remote state"""
    remote_ref = '/'.join((remote, remote_branch_name))
    if remote_ref not in ds.repo.get_remote_branches():
        lgr.debug("Remote '%s' has no branch matching %r. Will publish",
                  remote, remote_branch_name)
        # we don't have any remote state, need to push for sure
        return True

    lgr.debug("Testing for changes with respect to '%s' of remote '%s'",
              remote_branch_name, remote)
    current_commit = ds.repo.get_hexsha()
    within_ds_paths = [p['path'] for p in paths if p['path'] != ds.path]
    commit_differ = current_commit != ds.repo.get_hexsha(remote_ref)
    # yoh: not sure what "logic" was intended here for comparing only
    # some files.  By now we get a list of files (if any were changed)
    # from the commit on the remote, and somehow the diff below can claim
    # they didn't differ... but if the commit is different -- there must
    # be differences and we should publish. Otherwise this now skips
    # publishing the root dataset although its master is behind by 1
    # commit.  Moreover there could be an empty commit -- shouldn't we
    # publish then???
    if not commit_differ and within_ds_paths:
        # paths other than just the parent dataset root were given, so we
        # have to look at the content diff; without such paths the plain
        # commit comparison (below) is much cheaper
        lgr.debug("Since paths provided, looking at diff")
        return any(r["state"] != "clean"
                   for r in ds.diff(path=within_ds_paths,
                                    fr="HEAD",
                                    to=remote_ref,
                                    untracked="no"))
    else:
        # if commits differ at all
        lgr.debug("Since no paths provided, comparing commits")
        return commit_differ


def _publish_data(ds, remote, paths, annex_copy_options, force, transfer_data, **kwargs):
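    """Transfer annexed data to a remote, yielding a result record per path.

    Returns early (yielding nothing) for plain Git repositories and for
    remotes that are configured with annex-ignore.
    """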
    # `paths` come in as annotated paths; they are reduced to plain paths below
    if not isinstance(ds.repo, AnnexRepo):
        # not an annex repo, hence no annexed data to publish
        return

    if ds.config.getbool('remote.{}'.format(remote), 'annex-ignore', False):
        # configuration says: don't do it
        return

    if not ds.config.get('.'.join(('remote', remote, 'annex-uuid')), None):
        # this remote either isn't an annex, or hasn't been properly initialized
        for ap in paths:
            # this is only a problem if this path was explicitly requested,
            # or if all data was to be transferred
            ap['status'] = 'impossible' \
                           if transfer_data == 'all' or ap.get('raw_input', False) \
                           else 'notneeded'
            ap['message'] = \
                ("annex for remote '%s' not available, or not properly configured",
                 remote)
            yield ap
        return

    # what data to transfer?
    if transfer_data == 'all':
        paths = ['.']
    elif transfer_data == 'auto':
        # keep only paths that were explicitly requested and are not the base
        # path of the dataset; if the resulting list is empty, the `--auto`
        # mode of `git annex copy` will kick in below and consult "wanted"
        paths = [p['path'] for p in paths
                 if p.get('raw_input', False) and
                 not p['path'] == ds.path]
    else:
        raise ValueError(
            "unknown label '{}' for `transfer_data` option".format(
                transfer_data))

    # TODO do we really have to call annex for that, or can we take it from
    # the config instead?
    remote_wanted = ds.repo.get_preferred_content('wanted', remote)
    if not (paths or annex_copy_options or remote_wanted):
        # nothing that we could tell git annex
        return

    # we should now know what needs doing
    lgr.info("Publishing {0} data to {1}".format(ds, remote))
    # overwrite URL with pushurl if any, reason:
    # https://git-annex.branchable.com/bugs/annex_ignores_pushurl_and_uses_only_url_upon___34__copy_--to__34__/
    # Note: This shouldn't happen anymore with newly added siblings.
    #       But for now check for it, until we agree on how to fix existing
    #       ones.
    pushurl = ds.config.get('remote.{}.pushurl'.format(remote), None)
    annexurl = ds.config.get('remote.{}.annexurl'.format(remote), None)
    annex_copy_options_ = annex_copy_options or ''
    if pushurl and not annexurl:
        annex_copy_options_ += ' -c "remote.{}.annexurl={}"'.format(remote, pushurl)
    if not paths and remote_wanted:
        lgr.debug("Invoking copy --auto")
        annex_copy_options_ += ' --auto'
    # TODO:  we might need additional logic comparing the state of git-annex
    # branch locally and on remote to see if information about the 'copy'
    # was also reflected on the remote end
    #git_annex_hexsha = ds.repo.get_hexsha('git-annex')
    # TODO: must be the same if we merged/pushed before; if not -- skip,
    # possibly with special logic or a warning
    if not force:
        # if we force, we do not trust local knowledge and do the checks
        annex_copy_options_ += ' --fast'
    # TODO this thing needs to return JSON
    ncopied = 0
    for r in ds.repo.copy_to(
            files=[p for p in paths
                   # TODO we may have to check for any file in Git, but this one can
                   # easily happen with --since
                   if not p == opj(ds.path, '.gitmodules')],
            remote=remote,
            options=annex_copy_options_):
        ncopied += 1
        # TODO RF to have copy_to() yield JSON and convert that one
        # at present only the "good" results come out
        yield get_status_dict(status='ok', path=opj(ds.path, r),
                              type='file', parentds=ds.path, **kwargs)

    if ncopied:
        _check_and_update_remote_server_info(ds, remote)

    # if ds.submodules:
    #     # NOTE: we might need to init them on the remote, but needs to
    #     #  be done only if remote is sshurl and it is not bare there
    #     #  (which I think we do not even support ATM)...
    #     #  or we could do that in the hook, as it is done for now
    #     #  (see create_sibling.py)
    #     #
    #     pass

    # TODO unclear why this was commented out
    # if ds.repo.get_hexsha('git-annex') != git_annex_hexsha:
    #     # there were changes which should be pushed
    #     lgr.debug(
    #         "We have progressed git-annex branch should fetch/merge/push it to %s again",
    #         remote)
    #     ds.repo.fetch(remote=remote, refspec='git-annex')
    #     ds.repo.merge_annex(remote)
    #     _log_push_info(ds.repo.push(remote=remote, refspec=['git-annex']))


def _check_and_update_remote_server_info(ds, remote):
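    """Trigger `git update-server-info` on the remote, if possible.

    Returns True if the hook was executed (via a configured ssh annexurl),
    False otherwise.
    """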
    # if we managed to copy to an "http" url, we should try to trigger the
    # git update-server-info hook on the remote, if an ssh annexurl was
    # defined for it. Apparently we already do that in create_sibling, but
    # here we need more checks and preparation
    remote_url = ds.repo.config.get('remote.%s.url' % remote, None)
    if remote_url:
        remote_url = RI(remote_url)
        if isinstance(remote_url, URL) and remote_url.scheme in ('http', 'https'):
            remote_annexurl = ds.repo.config.get('remote.%s.annexurl' % remote,
                                                 None)
            if remote_annexurl:
                remote_annexurl_ri = RI(remote_annexurl)
                if is_ssh(remote_annexurl_ri):
                    ssh = ssh_manager.get_connection(remote_annexurl_ri)
                    ssh('git -C {} update-server-info'.format(
                        sh_quote(remote_annexurl_ri.path)))
                    return True
                else:
                    lgr.debug(
                        "annexurl %s is defined but is not ssh, "
                        "dunno if we could/should do anything", remote_annexurl
                    )
    return False


def _maybe_fetch(repo, remote):
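    """Fetch from `remote`, but only if it has a configured fetch refspec."""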
    if repo.config.get("remote.{}.fetch".format(remote)):
        repo.fetch(remote=remote, recurse_submodules="no")
    else:
        # Fetching would lead to a "Couldn't find remote ref HEAD" error
        # if there is no branch.  See gh-4199 for an example.
        lgr.warning("Remote %s has no configured fetch refspec", remote)


def _publish_dataset(ds, remote, refspec, paths, annex_copy_options, force=False, jobs=None,
                     transfer_data='auto', **kwargs):
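    """Publish a single dataset to a given remote (generator).

    Takes care of publication dependencies, git-annex branch
    synchronization, data transfer (according to `transfer_data`), and the
    final `git push`, yielding a result record for each performed or
    skipped action.
    """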
    remote_branch_name = _get_remote_branch(ds, refspec)
    if not remote_branch_name:
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'Cannot determine remote branch name from %s',
                'HEAD' if not refspec else refspec,
            ),
            **kwargs)
        return
    # TODO: this setup is now quite ugly. The only way `refspec` can come
    # in is when there is a tracking branch, and we get its state via
    # `refspec`

    # define config var name for potential publication dependencies
    depvar = 'remote.{}.datalad-publish-depends'.format(remote)
    # list of remotes that are publication dependencies for the
    # target remote
    publish_depends = ensure_list(ds.config.get(depvar, []))

    # the remote might be set to be ignored by annex, or we might not yet
    # know its uuid; make sure we are up-to-date on this topic for all
    # affected remotes before we start making decisions
    for r in publish_depends + [remote]:
        if not ds.config.get('.'.join(('remote', r, 'annex-uuid')), None):
            lgr.debug("Obtain remote annex info from '%s'", r)
            _maybe_fetch(ds.repo, r)
            # in order to be able to use git's config to determine what to push,
            # we need to annex merge first. Otherwise a git push might be
            # rejected if involving all matching branches for example.
            # NOTE we should not use a precomputed 'is_annex' test here, as
            # each fetch could give evidence that there is an annex
            # somewhere and replace the repo class...
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.localsync(r)
    ds.config.reload()

    # anything that follows will not change the repo type anymore, cache
    is_annex_repo = isinstance(ds.repo, AnnexRepo)

    # Plan:
    # 1. Check if there is anything to push, and if so
    #    2. process push dependencies
    #    3. fetch and merge annex branch
    #    4. push non-annex branch(es)
    # 5. copy data to the remote if paths are provided or it generally wants something

    # an upstream refspec is needed for update (merge) and subsequent push,
    # in case there is none yet (i.e. no tracking refspec configured)

    # TODO: i think this whole modification detection could be done by path
    # annotation at the very beginning -- keeping it for now to not get too
    # dizzy in the forehead....

    # if forced -- we push regardless if there are differences or not
    diff = True if force else has_diff(ds, remote_branch_name, remote, paths)

    # we might have received new information in the git-annex branch even
    # though there are no other changes
    if not diff and is_annex_repo:
        try:
            git_annex_commit = next(ds.repo.get_branch_commits_('git-annex'))
        except StopIteration:
            git_annex_commit = None
        #diff = _get_remote_diff(ds, [], git_annex_commit, remote, 'git-annex')
        diff = _get_remote_diff(ds.repo, git_annex_commit, remote, 'git-annex')
        if diff:
            lgr.info("Will publish updated git-annex")

    #
    # publish data (annex copy --to)
    #
    # # remote might be set to be ignored by annex, or we might not even know yet its uuid
    # annex_ignore = ds.config.getbool('remote.{}.annex-ignore'.format(remote), None)
    # annex_uuid = ds.config.get('remote.{}.annex-uuid'.format(remote), None)
    # if not annex_ignore:
    #     if annex_uuid is None:
    #         # most probably not yet 'known' and might require some annex

    copied_data = False
    # skip right away if data transfer is not desired
    if transfer_data != 'none' and isinstance(ds.repo, AnnexRepo):
        # publishing of `remote` might depend on publishing other
        # remote(s) first, so they need to receive the data first:
        for d, desc in [
            (d, "configured publication dependency")
            for d in publish_depends
        ] + [
            # no message for the target remote, as before
            (remote, None)
        ]:
            if desc:
                lgr.info("Transferring data to %s: '%s'", desc, d)
            # properly initialized remote annex -> publish data
            for r in _publish_data(
                    ds,
                    d,
                    paths,
                    annex_copy_options,
                    force,
                    transfer_data,
                    **kwargs):
                # record whether any data was published, so that the git
                # push below is not skipped
                if r['status'] == 'ok' and r['action'] == 'publish' and \
                        r.get('type', None) == 'file':
                    copied_data = True
                yield r

    #
    # publish dataset (git push)
    #
    if not diff and not copied_data:
        lgr.debug("No changes detected with respect to state of '%s'", remote)
        yield get_status_dict(ds=ds, status='notneeded', **kwargs)
    else:
        # publishing of `remote` might depend on publishing other
        # remote(s) first:
        for d in publish_depends:
            lgr.info("Publishing to configured dependency: '%s'", d)
            # call this again to take care of the dependency first,
            # but keep the paths the same, as the goal is to publish those
            # to the primary remote, and not anything else to a dependency
            for r in _publish_dataset(
                    ds,
                    d,
                    # should get the same as the base dataset
                    refspec,
                    paths,
                    annex_copy_options,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **kwargs):
                yield r

        if is_annex_repo and \
                ds.repo.is_special_annex_remote(remote):
            # There is nothing else to "publish"
            lgr.debug(
                "{0} is a special annex remote, no git push is needed".format(remote)
            )
            return

        lgr.info("Publishing {0} to {1}".format(ds, remote))
        # in order to be able to use git's config to determine what to push,
        # we need to annex merge first. Otherwise a git push might be
        # rejected if involving all matching branches for example
        # even if we already fetched above we need to do it again
        if is_annex_repo:
            lgr.debug("Obtain remote annex info from '%s'", remote)
            _maybe_fetch(ds.repo, remote)
            ds.repo.localsync(remote)

        # Note: git's push.default is 'matching', which doesn't work for a
        # first-time publication (a branch that doesn't exist on the remote yet).
        # But if we want to respect remote.*.push entries, etc. we need to
        # not pass a specific refspec (like active branch) to `git push`
        # by default.
        # hence we amend any existing config on the fly
        # TODO: what else to push by default?
        # consider also: --follow-tags, --tags, --atomic
        # make sure we push the relevant branches
        things2push = []
        current_branch = ds.repo.get_active_branch()
        if current_branch:  # possibly make this conditional on a switch
            # TODO: this should become its own helper
            if is_annex_repo:
                # annex could manage this branch
                match_adjusted = re.match(
                    r'adjusted/(.*)\([a-z]*\)',
                    current_branch)
                if match_adjusted:
                    # adjusted/master(...)
                    # TODO:  this code is not tested
                    # see https://codecov.io/gh/datalad/datalad/src/17e67045a088ae0372b38aa4d8d46ecf7c821cb7/datalad/distribution/publish.py#L156
                    # and thus probably broken -- test me!
                    current_branch = match_adjusted.group(1)
            things2push.append(current_branch)
        if is_annex_repo:
            things2push.append('git-annex')
        # check that all our magic found valid branches
        things2push = [t for t in things2push if t in ds.repo.get_branches()]
        # check that we don't ask to push things that are already configured
        # -> would cause error
        # TODO need to find a way to properly do this, when wildcards are used
        # in the push configuration variable
        things2push = [t for t in things2push
                       if t not in ds.config.get('remote.{}.push'.format(remote), [])]
        # now we know what to push where
        status, msg = _push(ds, remote, things2push, force)
        yield get_status_dict(ds=ds, status=status, message=msg, **kwargs)


def _get_remote_info(ds_path, ds_remote_info, to, missing):
    """Returns None if desired info was obtained, or a tuple (status, message)
    if not"""
    ds = Dataset(ds_path)
    if ds.repo is None:
        # There is no repository, nothing could be done
        return ('impossible',
                'No repository found for %s' % ds)
    if to is None:
        # we need an upstream remote, if there's none given. We could
        # wait for git push to complain, but we need to explicitly
        # figure it out for pushing annex branch anyway and we might as
        # well fail right here.
        track_remote, track_refspec = ds.repo.get_tracking_branch()
        if not track_remote:
            # no tracking remote configured, but let's try one more thing:
            # if we only have one remote, and it has a push target
            # configured, that is "good enough" for us
            cand_remotes = [r for r in ds.repo.get_remotes()
                            if 'remote.{}.push'.format(r) in ds.config]
            if len(cand_remotes) > 1:
                lgr.warning('Target sibling ambiguous, please specify via --to')
            elif len(cand_remotes) == 1:
                track_remote = cand_remotes[0]
            else:
                return ('impossible',
                        'No target sibling configured for default publication, '
                        'please specify via --to')
        if track_remote:
            ds_remote_info[ds_path] = dict(zip(
                ('remote', 'refspec'),
                (track_remote, track_refspec)))
        elif missing == 'skip':
            ds_remote_info[ds_path] = None
            return ('notneeded',
                    'Cannot determine target sibling, skipping publication')
        else:
            # we have no remote given and no upstream
            return 'error', 'Cannot determine a default target sibling for publication'
    elif to not in ds.repo.get_remotes():
        # unknown given remote
        if missing == 'skip':
            ds_remote_info[ds_path] = None
            return ('notneeded',
                    ("Unknown target sibling '%s', skipping publication", to))
        elif missing == 'inherit':
            superds = ds.get_superdataset()
            if not superds:
                return ('error',
                        ("No super-dataset to inherit settings for remote %s", to))
            # XXX due to differences between create-sibling and
            # create-sibling-github, inheriting would not be as transparent
            # for -github
            lgr.info("Will try to create a sibling inheriting settings from %s", superds)
            # XXX explicit None as sshurl for now
            # TODO this is not good: e.g. #1344
            ds.create_sibling(None, name=to, inherit=True)
            ds_remote_info[ds_path] = {'remote': to}
        else:
            return ('error',
                    ("Unknown target sibling '%s' for publication", to))
    else:
        # all good: remote given and is known
        ds_remote_info[ds_path] = {'remote': to}


def _get_remote_diff(repo, current_commit, remote, remote_branch_name):
    """Helper to check if remote has different state of the branch"""
    remote_ref = '/'.join((remote, remote_branch_name))
    if remote_ref in repo.get_remote_branches():
        lgr.debug("Testing for changes with respect to '%s' of remote '%s'",
                  remote_branch_name, remote)
        if current_commit is None:
            current_commit = repo.get_hexsha()
        remote_ref = repo.get_hexsha(remote_ref)
        diff = current_commit != remote_ref
    else:
        lgr.debug("Remote '%s' has no branch matching %r. Will publish",
                  remote, remote_branch_name)
        # we don't have any remote state, need to push for sure
        diff = True

    return diff


@build_doc
class Publish(Interface):
    """Publish a dataset to a known :term:`sibling`.

    This makes the last saved state of a dataset available to a sibling or
    special remote data store of the dataset. Any target sibling must already
    exist and be known to the dataset.

    Optionally, it is possible to limit publication to change sets relative
    to a particular point in the version history of a dataset (e.g. a release
    tag). By default, the state of the local dataset is evaluated against the
    last known state of the target sibling. Actual publication is only attempted
    if there was a change compared to the reference state, in order to speed up
    processing of large collections of datasets. Evaluation with respect to
    a particular "historic" state is only supported in conjunction with a
    specified reference dataset. Change sets are also evaluated recursively, i.e.
    only those subdatasets are published where a change was recorded that is
    reflected in the current state of the top-level reference dataset.
    See "since" option for more information.

    Only publication of saved changes is supported. Any unsaved changes in a
    dataset (hierarchy) have to be saved before publication.

    .. note::
      Power-user info: This command uses :command:`git push` and :command:`git annex copy`
      to publish a dataset. Publication targets are either configured remote
      Git repositories, or git-annex special remotes (if they support data
      upload).

    .. note::
      This command is deprecated. It will be removed from DataLad eventually,
      but no earlier than the 0.15 release. The `push` command (new in 0.13.0)
      provides an alternative interface. Critical differences are that `push`
      transfers annexed data by default and does not handle sibling creation
      (i.e. it does not have a `--missing` option).
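
    Examples
    --------
    A minimal sketch of typical invocations, assuming a sibling named
    "mysibling" has already been configured::

      % datalad publish --to mysibling

    Recursively publish a dataset hierarchy, including all annexed data::

      % datalad publish --to mysibling --recursive --transfer-data all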
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True
    # TODO: Figure out how to tell about tracking branch/upstream
    #      (and the respective remote)
    #      - it is used, when no destination is given
    #      - it is configured to be the given destination, if there was no
    #        upstream set up before, so you can use just "datalad publish" next
    #        time.

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the (top-level) dataset to be published. If no dataset
            is given, the datasets are determined based on the input arguments""",
            constraints=EnsureDataset() | EnsureNone()),
        to=Parameter(
            args=("--to",),
            metavar='LABEL',
            doc="""name of the target sibling. If no name is given an attempt is
            made to identify the target based on the dataset's configuration
            (i.e. a configured tracking branch, or a single sibling that is
            configured for publication)""",
            # TODO: See TODO at top of class!
            constraints=EnsureStr() | EnsureNone()),
        since=Parameter(
            args=("--since",),
            constraints=EnsureStr() | EnsureNone(),
            doc="""specifies commit-ish (tag, shasum, etc.) from which to look for
            changes to decide whether pushing is necessary.
            If '^' is given, the last state of the current branch at the sibling
            is taken as a starting point. An empty string ('') is still
            supported for the same effect."""),
        # since: commit => .gitmodules diff to head => submodules to publish
        missing=missing_sibling_opt,
        path=Parameter(
            args=("path",),
            metavar='PATH',
            # TODO this description is no longer correct
            doc="path(s), that may point to file handle(s) to publish including "
                "their actual content or to subdataset(s) to be published. If a "
                "file handle is published with its data, this implicitly means "
                "to also publish the (sub)dataset it belongs to. '.' as a path "
                "is treated in a special way in the sense, that it is passed "
                "to subdatasets in case `recursive` is also given.",
            constraints=EnsureStr() | EnsureNone(),
            nargs='*'),
        force=Parameter(
            args=("-f", "--force",),
            doc="""enforce doing publish activities (git push etc) regardless of
            the analysis if they seemed needed""",
            action='store_true'),
        # TODO add option to decide what branch/repo to push
        transfer_data=Parameter(
            args=("--transfer-data",),
            doc="""ADDME""",
            constraints=EnsureChoice('auto', 'none', 'all')),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        git_opts=git_opts,
        annex_opts=annex_opts,
        annex_copy_opts=annex_copy_opts,
        jobs=jobs_opt,
    )

    @staticmethod
    @datasetmethod(name='publish')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            to=None,
            since=None,
            missing='fail',
            force=False,
            transfer_data='auto',
            recursive=False,
            recursion_limit=None,
            git_opts=None,
            annex_opts=None,
            annex_copy_opts=None,
            jobs=None
    ):

        import warnings
        warnings.warn("`publish` is deprecated. Use `datalad push` instead.",
                      DeprecationWarning)

        # if ever we get a mode, for "with-data" we would need this
        #if dataset and not path:
        #    # act on the whole dataset if nothing else was specified
        #    path = dataset.path if isinstance(dataset, Dataset) else dataset

        if not (isinstance(dataset, Dataset) or (dataset is None and path)):
            # try to find a dataset in PWD
            dataset = require_dataset(
                dataset, check_installed=True, purpose='publish')

        if (since and since != '^') and not dataset:
            raise InsufficientArgumentsError(
                'Modification detection (--since) without a base dataset '
                'is not supported')

        if dataset and since in ('', '^'):
            # we are to publish only changes since the last update, so
            # figure out what the last known state at the sibling was
            active_branch = dataset.repo.get_active_branch()
            if to:
                # XXX here we assume a one-to-one mapping of local branch
                # names to the remote
                since = '%s/%s' % (to, active_branch)
                # test if such a branch already exists
                if since not in dataset.repo.get_remote_branches():
                    lgr.debug("No remote branch %s yet, so since will not be used", since)
                    since = None
            else:
                # take tracking remote for the active branch
                tracked_remote, tracked_refspec = dataset.repo.get_tracking_branch()
                if tracked_remote:
                    if tracked_refspec.startswith('refs/heads/'):
                        tracked_refspec = tracked_refspec[len('refs/heads/'):]
                    #to = tracked_remote
                    since = '%s/%s' % (tracked_remote, tracked_refspec)
                else:
                    lgr.info(
                        "No tracking remote for %s, the since option has no effect",
                        active_branch
                    )
                    since = None

        # here is the plan
        # 1. figure out remote to publish to
        # 2. figure out which content needs to be published to this remote
        # 3. look for any pre-publication dependencies of that remote
        #    (i.e. remotes that need to be published to before)
        # 4. publish the content needed to go to the primary remote to
        #    the dependencies first, and to the primary afterwards
        ds_remote_info = {}

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(refds=refds_path, logger=lgr, action='publish')

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='publish',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified="%s..HEAD" % since if since else since,
                return_type='generator',
                on_failure='ignore',
                force_no_revision_change_discovery=False, # we cannot publish what was not committed
                force_untracked_discovery=False  # we cannot publish untracked
        ):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            remote_info_result = None
            if ap.get('type', ap.get('type_src', 'dataset')) != 'dataset':
                # for everything that is not a dataset get the remote info
                # for the parent
                parentds = ap.get('parentds', None)
                if parentds and parentds not in ds_remote_info:
                    remote_info_result = _get_remote_info(
                        parentds, ds_remote_info, to, missing)
            else:
                # this is a dataset
                if ap.get('state', None) == 'absent':
                    continue
                # get the remote info for itself
                remote_info_result = _get_remote_info(
                    ap['path'], ds_remote_info, to, missing)
                ap['process_content'] = True
            if remote_info_result is not None:
                ap['status'] = remote_info_result[0]
                ap['message'] = remote_info_result[1]
                yield ap
                continue
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        lgr.debug(
            "Evaluating %i dataset publication candidate(s)",
            len(content_by_ds))
        # TODO: fancier sorting, so we still follow somewhat the hierarchy
        #       in sorted order, e.g.
        #  d1/sub1/sub1
        #  d1/sub1
        #  d1
        #  d2/sub1
        #  d2
        content_by_ds = OrderedDict(
            (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True)
        )

        lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
        for ds_path in content_by_ds:
            remote_info = ds_remote_info.get(ds_path, None)
            if remote_info is None:
                # maybe this dataset wasn't annotated above, try to get info
                # MIH: I think this entire if-branch is practically impossible
                # to reach. It is certainly untested, but I think this is due
                # to mutually exclusive conditions during remote_info detection
                remote_info_result = _get_remote_info(
                    ds_path, ds_remote_info, to, missing)
                if remote_info_result is not None:
                    yield get_status_dict(
                        type='dataset',
                        path=ds_path,
                        status=remote_info_result[0],
                        message=remote_info_result[1],
                        **res_kwargs)
                    continue
                # continue with freshly obtained info
                remote_info = ds_remote_info[ds_path]
                # condition above must catch all other cases
                assert remote_info
            # and publish
            ds = Dataset(ds_path)
            for r in _publish_dataset(
                    ds,
                    remote=remote_info['remote'],
                    refspec=remote_info.get('refspec', None),
                    # only send paths that were explicitly requested
                    paths=[p for p in content_by_ds[ds_path]
                           # do not feed (sub)dataset paths into the beast
                           # makes no sense to try to annex copy them
                           # for the base dataset itself let `transfer_data`
                           # decide
                           if p.get('type', None) != 'dataset'],
                    annex_copy_options=annex_copy_opts,
                    force=force,
                    jobs=jobs,
                    transfer_data=transfer_data,
                    **res_kwargs):
                yield r