"""Helpers for downloading datasets from AWS S3 (source code for ibllib.oneibl.aws)."""

import logging
from pathlib import Path
from time import time

import boto3
import numpy as np

from one.api import ONE
from one.alf.files import add_uuid_string
from iblutil.io.parquet import np2str


# Module-level logger, named after this module
_logger = logging.getLogger(__name__)

# Root prefix prepended to the relative dataset path when building S3 object keys
AWS_ROOT_PATH = Path('data')
# Bucket used by the AWS class when no bucket name is passed to its constructor
BUCKET_NAME = 'ibl-brain-wide-map-private'

# To obtain AWS credentials:
#   1. install the AWS CLI, see
#      https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2-linux.html
#   2. set up your credentials, see
#      https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html


class AWS:
    """
    Download datasets from an S3 bucket into the local ONE cache directory.

    :param s3_bucket_name: name of the S3 bucket to read from; BUCKET_NAME is used when omitted
    :param one: ONE instance providing the cache directory and index type; a new ONE() is
     instantiated when omitted
    """

    def __init__(self, s3_bucket_name=None, one=None):
        # TODO some initialisation routine to set up credentials for the first time

        s3 = boto3.resource('s3')
        self.bucket_name = s3_bucket_name or BUCKET_NAME
        self.bucket = s3.Bucket(self.bucket_name)
        self.one = one or ONE()

    def _download_datasets(self, datasets):
        """
        Download the given datasets from the bucket, skipping files already on disk.

        :param datasets: table exposing iterrows() (e.g. a pandas DataFrame) whose rows provide
         'session_path' and 'rel_path' and whose index identifies the dataset
        :return: list of pathlib.Path of the files downloaded by this call; files that already
         existed locally or were not found on the bucket are not included
        :raises TypeError: if the ONE index type is neither int nor str
        """
        files = []
        for i, d in datasets.iterrows():
            rel_file_path = Path(d['session_path']).joinpath(d['rel_path'])
            file_path = Path(self.one.cache_dir).joinpath(rel_file_path)
            file_path.parent.mkdir(exist_ok=True, parents=True)

            if file_path.exists():
                # already downloaded, need to have some options for overwrite, clobber, look
                # for file mismatch like in ONE
                _logger.info(f'{file_path} already exists wont redownload')
                continue

            # The dataset UUID is either rebuilt from a pair of ints or is the index itself
            index_type = self.one._index_type()
            if index_type is int:
                uuid = np2str(np.r_[i[0], i[1]])
            elif index_type is str:
                uuid = i
            else:
                # Fail explicitly rather than with an UnboundLocalError on `uuid` below
                raise TypeError(f'Unsupported ONE index type: {index_type}')

            aws_path = AWS_ROOT_PATH.joinpath(
                add_uuid_string(rel_file_path, uuid)).as_posix()
            # maybe should avoid this and do a try catch instead?, see here
            # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/collections.html#filtering
            # A prefix filter also returns keys that merely start with aws_path, so require an
            # exact key match instead of counting the number of results
            found = any(obj.key == aws_path
                        for obj in self.bucket.objects.filter(Prefix=aws_path))
            if found:
                ts = time()
                _logger.info(f'Downloading {aws_path} to {file_path}')
                self.bucket.download_file(aws_path, file_path.as_posix())
                _logger.debug(f'Complete. Time elapsed {time() - ts} for {file_path}')
                files.append(file_path)
            else:
                _logger.warning(f'{aws_path} not found on s3 bucket: {self.bucket.name}')

        return files


def download_folder_aws(folder_path, one, save_path=None):
    """
    Download every object stored under a folder of the 'aws_cortexlab' repository bucket.

    The bucket name and the access keys are read from the 'json' field of the Alyx
    'aws_cortexlab' data-repository record.

    :param folder_path: key prefix (folder) within the bucket to download
    :param one: ONE instance, used for its Alyx client and its cache directory
    :param save_path: local directory to download into; defaults to folder_path within the
     ONE cache directory
    :return: list of pathlib.Path of the downloaded files
    """
    # Wrap in Path so that string arguments and a string cache_dir are also accepted
    save_path = Path(save_path) if save_path else Path(one.cache_dir).joinpath(folder_path)

    repo_json = one.alyx.rest('data-repository', 'read', id='aws_cortexlab')['json']
    bucket_name = repo_json['bucket_name']
    session_keys = {
        'aws_access_key_id': repo_json.get('Access key ID', None),
        'aws_secret_access_key': repo_json.get('Secret access key', None)
    }
    session = boto3.Session(**session_keys)
    s3 = session.resource('s3')
    bucket = s3.Bucket(bucket_name)

    files = []
    for obj in bucket.objects.filter(Prefix=f'{folder_path}'):
        if obj.key.endswith('/'):
            # Folder placeholder object: nothing to download, and writing it to disk would
            # create a file where a directory is needed
            continue
        try:
            rel_path = Path(obj.key).relative_to(folder_path)
        except ValueError:
            # The S3 prefix is a plain string match, so keys of sibling folders sharing the
            # same leading characters are listed too; they are not part of folder_path
            continue
        download_path = save_path.joinpath(rel_path)
        download_path.parent.mkdir(exist_ok=True, parents=True)
        bucket.download_file(obj.key, str(download_path))
        files.append(download_path)

    return files