# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""High-level interface for creating a publication target on GitHub
"""
__docformat__ = 'restructuredtext'
import logging
import re
from os.path import relpath
from ..interface.base import (
build_doc,
Interface,
)
from ..interface.common_opts import (
recursion_flag,
recursion_limit,
publish_depends,
)
from ..support.param import Parameter
from ..support.constraints import (
EnsureChoice,
EnsureNone,
EnsureStr,
)
from ..utils import (
ensure_list,
)
from .dataset import (
datasetmethod,
EnsureDataset,
require_dataset,
)
from .siblings import Siblings
lgr = logging.getLogger('datalad.distribution.create_sibling_github')
# presently only implemented method to turn subdataset paths into Github
# compliant repository name suffixes
template_fx = lambda x: re.sub(r'\s+', '_', re.sub(r'[/\\]+', '-', x))
@build_doc
class CreateSiblingGithub(Interface):
"""Create dataset sibling on GitHub.
An existing GitHub project, or a project created via the GitHub website can
be configured as a sibling with the :command:`siblings` command.
Alternatively, this command can create a repository under a user's GitHub
account, or any organization a user is a member of (given appropriate
permissions). This is particularly helpful for recursive sibling creation
for subdatasets. In such a case, a dataset hierarchy is represented as a
flat list of GitHub repositories.
GitHub cannot host dataset content (but LFS special remote could be used,
http://handbook.datalad.org/r.html?LFS). However, in combination with
other data sources (and siblings), publishing a dataset to GitHub can
facilitate distribution and exchange, while still allowing any dataset
consumer to obtain actual data content from alternative sources.
For GitHub authentication a personal access token is needed.
Such a token can be generated by visiting https://github.com/settings/tokens
or navigating via GitHub Web UI through:
Settings -> Developer settings -> Personal access tokens.
We will first consult Git configuration *hub.oauthtoken* for tokens possibly
available there, and then from the system credential store.
If you provide [PY: `github_login` PY][CMD: --github-login NAME CMD],
we will consider only tokens associated with that GitHub login from
*hub.oauthtoken*, and store/check the token in credential store as associated
with that specific login name.
"""
# XXX prevent common args from being added to the docstring
_no_eval_results = True
_params_ = dict(
dataset=Parameter(
args=("--dataset", "-d",),
doc="""specify the dataset to create the publication target for. If
no dataset is given, an attempt is made to identify the dataset
based on the current working directory""",
constraints=EnsureDataset() | EnsureNone()),
reponame=Parameter(
args=('reponame',),
metavar='REPONAME',
doc="""GitHub repository name. When operating recursively,
a suffix will be appended to this name for each subdataset""",
constraints=EnsureStr()),
recursive=recursion_flag,
recursion_limit=recursion_limit,
name=Parameter(
args=('-s', '--name',),
metavar='NAME',
doc="""name to represent the GitHub repository in the local
dataset installation""",
constraints=EnsureStr()),
existing=Parameter(
args=("--existing",),
constraints=EnsureChoice('skip', 'error', 'reconfigure', 'replace'),
metavar='MODE',
doc="""desired behavior when already existing or configured
siblings are discovered. In this case, a dataset can be skipped
('skip'), the sibling configuration be updated ('reconfigure'),
or process interrupts with error ('error'). DANGER ZONE: If 'replace'
is used, an existing github repository will be irreversibly removed,
re-initialized, and the sibling (re-)configured (thus implies 'reconfigure').
`replace` could lead to data loss, so use with care. To minimize
possibility of data loss, in interactive mode DataLad will ask for
confirmation, but it would raise an exception in non-interactive mode.
""",),
github_login=Parameter(
args=('--github-login',),
constraints=EnsureStr() | EnsureNone(),
metavar='NAME',
doc="""GitHub user name or access token"""),
github_organization=Parameter(
args=('--github-organization',),
constraints=EnsureStr() | EnsureNone(),
metavar='NAME',
doc="""If provided, the repository will be created under this
GitHub organization. The respective GitHub user needs appropriate
permissions."""),
access_protocol=Parameter(
args=("--access-protocol",),
constraints=EnsureChoice('https', 'ssh'),
doc="""Which access protocol/URL to configure for the sibling"""),
publish_depends=publish_depends,
private=Parameter(
args=("--private",),
action="store_true",
default=False,
doc="""If this flag is set, the repository created on github
will be marked as private and only visible to those granted
access or by membership of a team/organization/etc.
"""),
dryrun=Parameter(
args=("--dryrun",),
action="store_true",
doc="""If this flag is set, no communication with GitHub is
performed, and no repositories will be created. Instead
would-be repository names are reported for all relevant datasets
"""),
)
@staticmethod
@datasetmethod(name='create_sibling_github')
def __call__(
reponame,
dataset=None,
recursive=False,
recursion_limit=None,
name='github',
existing='error',
github_login=None,
github_organization=None,
access_protocol='https',
publish_depends=None,
private=False,
dryrun=False):
# this is an absolute leaf package, import locally to avoid
# unnecessary dependencies
from datalad.support.github_ import _make_github_repos
# what to operate on
ds = require_dataset(
dataset, check_installed=True, purpose='create GitHub sibling(s)')
# gather datasets and essential info
# dataset instance and mountpoint relative to the top
toprocess = [(ds, '')]
if recursive:
for sub in ds.subdatasets(
fulfilled=None, # we want to report on missing dataset in here
recursive=recursive,
recursion_limit=recursion_limit,
result_xfm='datasets'):
if not sub.is_installed():
lgr.info('Ignoring unavailable subdataset %s', sub)
continue
toprocess.append((sub, relpath(sub.path, start=ds.path)))
# check for existing remote configuration
filtered = []
for d, mp in toprocess:
if name in d.repo.get_remotes():
if existing == 'error':
msg = '{} already has a configured sibling "{}"'.format(
d, name)
if dryrun:
lgr.error(msg)
else:
raise ValueError(msg)
elif existing == 'skip':
continue
gh_reponame = '{}{}{}'.format(
reponame,
'-' if mp else '',
template_fx(mp))
filtered.append((d, gh_reponame))
if not filtered:
# all skipped
return []
# actually make it happen on GitHub
rinfo = _make_github_repos(
github_login, github_organization, filtered,
existing, access_protocol, private, dryrun)
# lastly configure the local datasets
for d, url, existed in rinfo:
if not dryrun:
extra_remote_vars = {
# first make sure that annex doesn't touch this one
# but respect any existing config
'annex-ignore': 'true',
# first push should separately push active branch first
# to overcome github issue of choosing "default" branch
# alphabetically if its name does not match the default
# branch for the user (or organization) which now defaults
# to "main"
'datalad-push-default-first': 'true'
}
for var_name, var_value in extra_remote_vars.items():
var = 'remote.{}.{}'.format(name, var_name)
if var not in d.config:
d.config.add(var, var_value, where='local')
Siblings()(
'configure',
dataset=d,
name=name,
url=url,
recursive=False,
# TODO fetch=True, maybe only if one existed already
publish_depends=publish_depends)
# TODO let submodule URLs point to GitHub (optional)
return rinfo
@staticmethod
def result_renderer_cmdline(res, args):
from datalad.ui import ui
res = ensure_list(res)
if args.dryrun:
ui.message('DRYRUN -- Anticipated results:')
if not len(res):
ui.message("Nothing done")
else:
for d, url, existed in res:
ui.message(
"'{}'{} configured as sibling '{}' for {}".format(
url,
" (existing repository)" if existed else '',
args.name,
d))