# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the datalad package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""export a dataset as a TAR/ZIP archive to figshare"""
__docformat__ = 'restructuredtext'
from datalad.utils import unlink
from datalad.interface.base import Interface
from datalad.interface.base import build_doc
import logging
lgr = logging.getLogger('datalad.export_to_figshare')
class FigshareRESTLaison(object):
"""A little helper to provide minimal interface to interact with Figshare
"""
API_URL = 'https://api.figshare.com/v2'
def __init__(self):
self._token = None
from datalad.ui import ui
self.ui = ui # we will be chatty here
@property
def token(self):
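        """figshare API token, lazily looked up via DataLad's provider/credential configuration"""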
if self._token is None:
from datalad.downloaders.providers import Providers
providers = Providers.from_config_files()
provider = providers.get_provider(self.API_URL)
credential = provider.credential
self._token = credential().get('token')
return self._token
def __call__(self, m, url, data=None, success=None, binary=False,
headers=None, return_json=True):
"""A wrapper around requests calls
to interpolate deposition_id, do basic checks and conversion
"""
import json
if '://' not in url:
url_ = self.API_URL + '/' + url
else:
url_ = url
headers = headers or {}
if data is not None and not binary:
data = json.dumps(data)
headers["Content-Type"] = "application/json"
headers['Authorization'] = "token %s" % self.token
lgr.debug(
"Submitting %s request to %s with data %s (headers: %s)",
            m.__name__, url_, data, 'sanitized'  # do not log headers, they carry the auth token
)
r = m(url_, data=data, headers=headers)
status_code = r.status_code
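        # Unless told not to check ("donotcheck"), treat the response as an
        # error if the status code is not among the expected ones (or is
        # >= 400 when no expected codes were given)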
if (success != "donotcheck") and \
((success and status_code not in success)
or (not success and status_code >= 400)):
msg = "Got return code %(status_code)s for %(m)s(%(url_)s." \
% locals()
raise RuntimeError("Error status %s" % msg)
if return_json:
return r.json() if r.content else {}
else:
return r.content
def put(self, *args, **kwargs):
import requests
return self(requests.put, *args, **kwargs)
def post(self, *args, **kwargs):
import requests
return self(requests.post, *args, **kwargs)
def get(self, *args, **kwargs):
import requests
return self(requests.get, *args, **kwargs)
def upload_file(self, fname, files_url):
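        """Upload a local file using figshare's multi-part upload and return its file record"""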
        # The v2 API offers no single-call upload: we need to initiate the
        # upload, PUT the individual parts, and then finalize it.
        # TODO: check whether a file with the same name is already present,
        # and offer to remove/prune it
import os
from datalad.utils import md5sum
from datalad.ui import ui
file_rec = {'md5': md5sum(fname),
'name': os.path.basename(fname),
'size': os.stat(fname).st_size
}
# Initiate upload
j = self.post(files_url, file_rec)
file_endpoint = j['location']
file_info = self.get(file_endpoint)
file_upload_info = self.get(file_info['upload_url'])
pbar = ui.get_progressbar(label=fname, # fill_text=f.name,
total=file_rec['size'])
with open(fname, 'rb') as f:
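            # figshare describes the upload as a list of byte-range "parts";
            # PUT the data for every part that is still PENDING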
for part in file_upload_info['parts']:
udata = dict(file_info, **part)
if part['status'] == 'PENDING':
f.seek(part['startOffset'])
data = f.read(part['endOffset'] - part['startOffset'] + 1)
url = '{upload_url}/{partNo}'.format(**udata)
ok = self.put(url, data=data, binary=True, return_json=False)
assert ok == b'OK'
pbar.update(part['endOffset'], increment=False)
pbar.finish()
# complete upload
jcomplete = self.post(file_endpoint, return_json=False)
return file_info
def get_article_ids(self):
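        """Print a short summary of all articles in the user's account and return their IDs"""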
articles = self.get('account/articles')
ids = []
for item in articles or []:
self.ui.message(' {id} {url} - {title}'.format(**item))
ids.append(item['id'])
return ids
def create_article(self, title):
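        """Create a new (private) article with the given title and return its full record"""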
data = {
'title': title
}
# we could prefill more fields interactively if desired
result = self.post('account/articles', data=data)
result = self.get(result['location'])
return result
def _get_default_title(dataset):
"""Create default title as dataset directory[#UUID][@version]
    with any of the bracketed parts omitted if not defined
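
    e.g. for a dataset in a directory "mydataset" this could look like
    "mydataset#<dataset uuid>@<output of git describe>"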
"""
from ..support.path import basename
title = basename(dataset.path)
if dataset.id:
title += "#{dataset.id}".format(**locals())
version = dataset.repo.describe()
if version:
title += "@{version}".format(**locals())
    # Titles must be at least 3 characters long; pad just in case there is
    # no UUID or version and the directory name is very short
if len(title) < 3:
title += "0"*(3 - len(title))
return title
def _enter_title(ui, dataset):
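    """Ask the user for an article title, re-prompting until it is at least 3 characters long"""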
default = _get_default_title(dataset)
while True:
title = ui.question(
"Please enter the title (must be at least 3 characters long).",
title="New article",
default=default
)
if len(title) < 3:
ui.error("Title must be at least 3 characters long.")
else:
return title
@build_doc
class ExportToFigshare(Interface):
"""Export the content of a dataset as a ZIP archive to figshare

    This is a quick-and-dirty approach. Ideally figshare would be supported
    as a proper git-annex special remote. Unfortunately, figshare does not
    support directories and can store only a flat list of files, which makes
    any sensible publishing of complete datasets impossible. The only
    workaround is to publish the dataset as a zip-ball: the entire content is
    wrapped into a .zip archive for which figshare provides a navigator.
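
    A minimal usage sketch from Python, on an already obtained Dataset
    instance ``ds`` (the article ID below is hypothetical)::

        ds.export_to_figshare(article_id=123456)

    The same can be done from the command line via
    ``datalad export-to-figshare --article-id 123456``.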
"""
from datalad.support.param import Parameter
from datalad.distribution.dataset import datasetmethod
from datalad.interface.utils import eval_results
from datalad.distribution.dataset import EnsureDataset
from datalad.support.constraints import (
EnsureChoice,
EnsureInt,
EnsureNone,
EnsureStr,
)
_params_ = dict(
dataset=Parameter(
args=("-d", "--dataset"),
doc=""""specify the dataset to export. If no dataset is given, an
attempt is made to identify the dataset based on the current
working directory.""",
constraints=EnsureDataset() | EnsureNone()),
filename=Parameter(
args=("filename",),
metavar="PATH",
nargs='?',
doc="""File name of the generated ZIP archive. If no file name is
            given, the archive will be generated in the top directory
of the dataset and will be named: datalad_<dataset_uuid>.zip.""",
constraints=EnsureStr() | EnsureNone()),
no_annex=Parameter(
args=("--no-annex",),
action="store_true",
doc="""By default the generated .zip file would be added to annex,
and all files would get registered in git-annex to be available
from such a tarball. Also upon upload we will register for that
archive to be a possible source for it in annex. Setting this flag
disables this behavior."""),
missing_content=Parameter(
args=("--missing-content",),
doc="""By default, any discovered file with missing content will
result in an error and the plugin is aborted. Setting this to
'continue' will issue warnings instead of failing on error. The
value 'ignore' will only inform about problem at the 'debug' log
level. The latter two can be helpful when generating a TAR archive
from a dataset where some file content is not available
locally.""",
constraints=EnsureChoice("error", "continue", "ignore")),
# article_id=Parameter(
# args=("--project-id",),
# metavar="ID",
# doc="""If given, article (if article_id is not provided) will be
# created in that project.""",
# constraints=EnsureInt() | EnsureNone()),
article_id=Parameter(
args=("--article-id",),
metavar="ID",
doc="""Which article to publish to.""",
constraints=EnsureInt() | EnsureNone()),
)
@staticmethod
@datasetmethod(name='export_to_figshare')
@eval_results
def __call__(dataset, filename=None, missing_content='error', no_annex=False,
# TODO: support working with projects and articles within them
# project_id=None,
article_id=None):
import os
import logging
lgr = logging.getLogger('datalad.plugin.export_to_figshare')
from datalad.ui import ui
from datalad.api import add_archive_content
from datalad.api import export_archive
from datalad.distribution.dataset import require_dataset
from datalad.support.annexrepo import AnnexRepo
dataset = require_dataset(dataset, check_installed=True,
purpose='export to figshare')
if not isinstance(dataset.repo, AnnexRepo):
raise ValueError(
"%s is not an annex repo, so annexification could be done"
% dataset
)
if dataset.repo.dirty:
raise RuntimeError(
"Paranoid authors of DataLad refuse to proceed in a dirty repository"
)
if filename is None:
filename = dataset.path
lgr.info(
"Exporting current tree as an archive under %s since figshare "
"does not support directories",
filename
)
archive_out = next(
export_archive(
dataset,
filename=filename,
archivetype='zip',
missing_content=missing_content,
return_type="generator"
)
)
assert archive_out['status'] == 'ok'
fname = archive_out['path']
lgr.info("Uploading %s to figshare", fname)
figshare = FigshareRESTLaison()
if not article_id:
# TODO: ask if it should be an article within a project
if ui.is_interactive:
# or should we just upload to a new article?
if ui.yesno(
"Would you like to create a new article to upload to? "
"If not - we will list existing articles",
title="Article"
):
article = figshare.create_article(
title=_enter_title(ui, dataset)
)
lgr.info(
"Created a new (private) article %(id)s at %(url_private_html)s. "
"Please visit it, enter additional meta-data and make public",
article
)
article_id = article['id']
else:
article_id = int(ui.question(
"Which of the articles should we upload to.",
choices=list(map(str, figshare.get_article_ids()))
))
if not article_id:
raise ValueError("We need an article to upload to.")
file_info = figshare.upload_file(
fname,
files_url='account/articles/%s/files' % article_id
)
if no_annex:
lgr.info("Removing generated tarball")
unlink(fname)
else:
            # Leave any complaining (e.g. about the path being outside the
            # dataset) to the add call below
lgr.info("'Registering' %s within annex", fname)
repo = dataset.repo
repo.add(fname, git=False)
key = repo.get_file_key(fname)
lgr.info("Adding URL %(download_url)s for it", file_info)
repo.call_annex([
"registerurl", '-c', 'annex.alwayscommit=false',
key, file_info['download_url']])
lgr.info("Registering links back for the content of the archive")
add_archive_content(
fname,
annex=dataset.repo,
                delete_after=True,  # extract into a temporary location and remove it afterwards
                allow_dirty=True,  # the freshly added archive leaves the tree dirty
                commit=False  # we do not want to commit anything we have done here
)
lgr.info("Removing generated and now registered in annex archive")
repo.drop(key, key=True, options=['--force'])
repo.remove(fname, force=True) # remove the tarball
# if annex in {'delete'}:
# dataset.repo.remove(fname)
# else:
# # kinda makes little sense I guess.
# # Made more sense if export_archive could export an arbitrary treeish
# # so we could create a branch where to dump and export to figshare
# # (kinda closer to my idea)
# dataset.save(fname, message="Added the entire dataset into a zip file")
        # TODO: teach the downloaders about the figshare token so that they
        # could download those zipballs before they go public
yield dict(
status='ok',
            # TODO: add article url (which needs to be queried if only ID is known)
message="Published archive {}".format(
file_info['download_url']),
file_info=file_info,
            path=dataset.path,
action='export_to_figshare',
logger=lgr
)
__datalad_plugin__ = ExportToFigshare