Source code for one.alf.files

"""
Module for identifying and parsing ALF file names.

An ALF file has the following components (those in brackets are optional):
    (_namespace_)object.attribute(_timescale)(.extra.parts).ext

Note the following:
    Object attributes may not contain an underscore unless followed by 'times' or 'intervals'.
    A namespace must not contain extra underscores (i.e. `name_space` and `__namespace__` are not
    valid).
    ALF files must always have an extension.

For more information, see the following documentation:
    https://int-brain-lab.github.io/ONE/alf_intro.html

"""
from collections import OrderedDict
from datetime import datetime
from typing import Union, Optional
from pathlib import Path
import logging

from . import spec
from .spec import SESSION_SPEC, COLLECTION_SPEC, FILE_SPEC, REL_PATH_SPEC

_logger = logging.getLogger(__name__)



[docs]
def rel_path_parts(rel_path, as_dict=False, assert_valid=True):
    """Parse a relative path into the relevant parts.

    A relative path follows the pattern
        (collection/)(#revision#/)_namespace_object.attribute_timescale.extra.extension

    Parameters
    ----------
    rel_path : str, pathlib.Path
        A relative path string.
    as_dict : bool
        If true, an OrderedDict of parts are returned with the keys ('lab', 'subject', 'date',
        'number'), otherwise a tuple of values are returned.
    assert_valid : bool
        If true a ValueError is raised when the session cannot be parsed, otherwise an empty
        dict of tuple of Nones is returned.

    Returns
    -------
    OrderedDict, tuple
        A dict if as_dict is true, or a tuple of parsed values.
    """
    return _path_parts(rel_path, REL_PATH_SPEC, True, as_dict, assert_valid)




[docs]
def session_path_parts(session_path, as_dict=False, assert_valid=True):
    """Parse a session path into the relevant parts.

    Return keys:
        - lab
        - subject
        - date
        - number

    Parameters
    ----------
    session_path : str, pathlib.Path
        A session path string.
    as_dict : bool
        If true, an OrderedDict of parts are returned with the keys ('lab', 'subject', 'date',
        'number'), otherwise a tuple of values are returned.
    assert_valid : bool
        If true a ValueError is raised when the session cannot be parsed, otherwise an empty
        dict of tuple of Nones is returned.

    Returns
    -------
    OrderedDict, tuple
        A dict if as_dict is true, or a tuple of parsed values.

    Raises
    ------
    ValueError
        Invalid ALF session path (assert_valid is True).
    """
    return _path_parts(session_path, SESSION_SPEC, False, as_dict, assert_valid)



def _path_parts(path, spec_str, match=True, as_dict=False, assert_valid=True):
    """Given a ALF and a spec string, parse into parts.

    Parameters
    ----------
    path : str, pathlib.Path
        An ALF path or dataset.
    match : bool
        If True, string must match exactly, otherwise search for expression within path.
    as_dict : bool
        When true a dict of matches is returned.
    assert_valid : bool
        When true an exception is raised when the filename cannot be parsed.

    Returns
    -------
    OrderedDict, tuple
        A dict if as_dict is true, or a tuple of parsed values.

    Raises
    ------
    ValueError
        Invalid ALF path (assert_valid is True).
    """
    if hasattr(path, 'as_posix'):
        path = path.as_posix()
    pattern = spec.regex(spec_str)
    empty = OrderedDict.fromkeys(pattern.groupindex.keys())
    parsed = (pattern.match if match else pattern.search)(path)
    if parsed:  # py3.8
        parsed_dict = parsed.groupdict()
        return OrderedDict(parsed_dict) if as_dict else tuple(parsed_dict.values())
    elif assert_valid:
        raise ValueError(f'Invalid ALF: "{path}"')
    else:
        return empty if as_dict else tuple(empty.values())



[docs]
def filename_parts(filename, as_dict=False, assert_valid=True) -> Union[dict, tuple]:
    """
    Return the parsed elements of a given ALF filename.

    Parameters
    ----------
    filename : str
        The name of the file.
    as_dict : bool
        When true a dict of matches is returned.
    assert_valid : bool
        When true an exception is raised when the filename cannot be parsed.

    Returns
    -------
    namespace : str
        The _namespace_ or None if not present.
    object : str
        ALF object.
    attribute : str
        The ALF attribute.
    timescale : str
        The ALF _timescale or None if not present.
    extra : str
        Any extra parts to the filename, or None if not present.
    extension : str
        The file extension.

    Examples
    --------
    >>> filename_parts('_namespace_obj.times_timescale.extra.foo.ext')
    ('namespace', 'obj', 'times', 'timescale', 'extra.foo', 'ext')
    >>> filename_parts('spikes.clusters.npy', as_dict=True)
    {'namespace': None,
     'object': 'spikes',
     'attribute': 'clusters',
     'timescale': None,
     'extra': None,
     'extension': 'npy'}
    >>> filename_parts('spikes.times_ephysClock.npy')
    (None, 'spikes', 'times', 'ephysClock', None, 'npy')
    >>> filename_parts('_iblmic_audioSpectrogram.frequencies.npy')
    ('iblmic', 'audioSpectrogram', 'frequencies', None, None, 'npy')
    >>> filename_parts('_spikeglx_ephysData_g0_t0.imec.wiring.json')
    ('spikeglx', 'ephysData_g0_t0', 'imec', None, 'wiring', 'json')
    >>> filename_parts('_spikeglx_ephysData_g0_t0.imec0.lf.bin')
    ('spikeglx', 'ephysData_g0_t0', 'imec0', None, 'lf', 'bin')
    >>> filename_parts('_ibl_trials.goCue_times_bpod.csv')
    ('ibl', 'trials', 'goCue_times', 'bpod', None, 'csv')

    Raises
    ------
    ValueError
        Invalid ALF dataset (assert_valid is True).
    """
    return _path_parts(filename, FILE_SPEC, True, as_dict, assert_valid)




[docs]
def full_path_parts(path, as_dict=False, assert_valid=True) -> Union[dict, tuple]:
    """Parse all filename and folder parts.

    Parameters
    ----------
    path : str, pathlib.Path.
        The ALF path
    as_dict : bool
        When true a dict of matches is returned.
    assert_valid : bool
        When true an exception is raised when the filename cannot be parsed.

    Returns
    -------
    OrderedDict, tuple
        A dict if as_dict is true, or a tuple of parsed values.

    Examples
    --------
    >>> full_path_parts(
    ...    'lab/Subjects/subject/2020-01-01/001/collection/#revision#/'
    ...    '_namespace_obj.times_timescale.extra.foo.ext')
    ('lab', 'subject', '2020-01-01', '001', 'collection', 'revision',
    'namespace', 'obj', 'times','timescale', 'extra.foo', 'ext')
    >>> full_path_parts('spikes.clusters.npy', as_dict=True)
    {'lab': None,
     'subject': None,
     'date': None,
     'number': None,
     'collection': None,
     'revision': None,
     'namespace': None,
     'object': 'spikes',
     'attribute': 'clusters',
     'timescale': None,
     'extra': None,
     'extension': 'npy'}

    Raises
    ------
    ValueError
        Invalid ALF path (assert_valid is True).
    """
    path = Path(path)
    # NB We try to determine whether we have a folder or filename path.  Filenames contain at
    # least two periods, however it is currently permitted to have any number of periods in a
    # collection, making the ALF path ambiguous.
    if sum(x == '.' for x in path.name) < 2:  # folder only
        folders = folder_parts(path, as_dict, assert_valid)
        dataset = filename_parts('', as_dict, assert_valid=False)
    elif '/' not in path.as_posix():  # filename only
        folders = folder_parts('', as_dict, assert_valid=False)
        dataset = filename_parts(path.name, as_dict, assert_valid)
    else:  # full filepath
        folders = folder_parts(path.parent, as_dict, assert_valid)
        dataset = filename_parts(path.name, as_dict, assert_valid)
    if as_dict:
        return OrderedDict(**folders, **dataset)
    else:
        return folders + dataset




[docs]
def folder_parts(folder_path, as_dict=False, assert_valid=True) -> Union[dict, tuple]:
    """Parse all folder parts, including session, collection and revision.

    Parameters
    ----------
    folder_path : str, pathlib.Path
        The ALF folder path.
    as_dict : bool
        When true a dict of matches is returned.
    assert_valid : bool
        When true an exception is raised when the filename cannot be parsed.

    Returns
    -------
    OrderedDict, tuple
        A dict if as_dict is true, or a tuple of parsed values.

    Examples
    --------
    >>> folder_parts('lab/Subjects/subject/2020-01-01/001/collection/#revision#')
    ('lab', 'subject', '2020-01-01', '001', 'collection', 'revision')
    >>> folder_parts(Path('lab/Subjects/subject/2020-01-01/001'), as_dict=True)
    {'lab': 'lab',
     'subject': 'subject',
     'date': '2020-01-01',
     'number': '001',
     'collection': None,
     'revision': None}

    Raises
    ------
    ValueError
        Invalid ALF path (assert_valid is True).
    """
    if hasattr(folder_path, 'as_posix'):
        folder_path = folder_path.as_posix()
    if folder_path and folder_path[-1] != '/':  # Slash required for regex pattern
        folder_path = folder_path + '/'
    spec_str = f'{SESSION_SPEC}/{COLLECTION_SPEC}'
    return _path_parts(folder_path, spec_str, False, as_dict, assert_valid)



def _isdatetime(s: str) -> bool:
    """Returns True if input is valid ISO date string."""
    try:
        datetime.strptime(s, '%Y-%m-%d')
        return True
    except ValueError:
        return False



[docs]
def get_session_path(path: Union[str, Path]) -> Optional[Path]:
    """
    Returns the session path from any filepath if the date/number pattern is found,
    including the root directory.

    Returns
    -------
    pathlib.Path
        The session path part of the input path or None if path invalid.

    Examples
    --------
    >>> get_session_path('/mnt/sd0/Data/lab/Subjects/subject/2020-01-01/001')
    Path('/mnt/sd0/Data/lab/Subjects/subject/2020-01-01/001')

    >>> get_session_path('C:\\Data\\subject\\2020-01-01\\1\\trials.intervals.npy')
    Path('C:/Data/subject/2020-01-01/1')
    """
    if path is None:
        return
    if isinstance(path, str):
        path = Path(path)
    sess = None
    for i, p in enumerate(path.parts):
        if p.isdigit() and _isdatetime(path.parts[i - 1]):
            sess = Path().joinpath(*path.parts[:i + 1])

    return sess




[docs]
def get_alf_path(path: Union[str, Path]) -> str:
    """Returns the ALF part of a path or filename.
    Attempts to return the first valid part of the path, first searching for a session path,
    then relative path (collection/revision/filename), then just the filename.  If all invalid,
    None is returned.

    Parameters
    ----------
    path : str, pathlib.Path
        A path to parse.

    Returns
    -------
    str
        A string containing the full ALF path, session path, relative path or filename.

    Examples
    --------
    >>> get_alf_path('etc/etc/lab/Subjects/subj/2021-01-21/001')
    'lab/Subjects/subj/2021-01-21/001/collection/file.attr.ext'

    >>> get_alf_path('etc/etc/subj/2021-01-21/001/collection/file.attr.ext')
    'subj/2021-01-21/001/collection/file.attr.ext'

    >>> get_alf_path('collection/file.attr.ext')
    'collection/file.attr.ext'
    """
    if not isinstance(path, str):
        path = Path(path).as_posix()
    path = path.strip('/')

    # Check if session path
    match_session = spec.regex(SESSION_SPEC).search(path)
    if match_session:
        return path[match_session.start():]

    # Check if filename / relative path (i.e. collection + filename)
    parts = path.rsplit('/', 1)
    match_filename = spec.regex(FILE_SPEC).match(parts[-1])
    if match_filename:
        return path if spec.regex(f'{COLLECTION_SPEC}{FILE_SPEC}').match(path) else parts[-1]




[docs]
def add_uuid_string(file_path, uuid):
    """
    Add a UUID to the filename of an ALF path.

    Adds a UUID to an ALF filename as an extra part, e.g.
    'obj.attr.ext' -> 'obj.attr.a976e418-c8b8-4d24-be47-d05120b18341.ext'.

    Parameters
    ----------
    file_path : str, pathlib.Path, pathlib.PurePath
        An ALF path to add the UUID to.
    uuid : str, uuid.UUID
        The UUID to add.

    Returns
    -------
    pathlib.Path, pathlib.PurePath
        A new Path or PurePath object with a UUID in the filename.

    Examples
    --------
    >>> add_uuid_string('/path/to/trials.intervals.npy', 'a976e418-c8b8-4d24-be47-d05120b18341')
    Path('/path/to/trials.intervals.a976e418-c8b8-4d24-be47-d05120b18341.npy')

    Raises
    ------
    ValueError
        `uuid` must be a valid hyphen-separated hexadecimal UUID.

    See Also
    --------
    one.alf.files.remove_uuid_string
    one.alf.spec.is_uuid
    """
    if isinstance(uuid, str) and not spec.is_uuid_string(uuid):
        raise ValueError('Should provide a valid UUID v4')
    uuid = str(uuid)
    # NB: Only instantiate as Path if not already a Path, otherwise we risk changing the class
    if isinstance(file_path, str):
        file_path = Path(file_path)
    name_parts = file_path.stem.split('.')
    if spec.is_uuid(name_parts[-1]):
        *name_parts, old_uuid = name_parts
        if old_uuid == uuid:
            _logger.warning(f'UUID already found in file name: {file_path.name}: IGNORE')
            return file_path
        else:
            _logger.debug('Replacing %s with %s in %s', old_uuid, uuid, file_path)
    return file_path.parent.joinpath(f"{'.'.join(name_parts)}.{uuid}{file_path.suffix}")




[docs]
def remove_uuid_string(file_path):
    """
    Remove UUID from a filename of an ALF path.

    Parameters
    ----------
    file_path : str, pathlib.Path, pathlib.PurePath
        An ALF path to add the UUID to.

    Returns
    -------
    pathlib.Path, pathlib.PurePath
        A new Path or PurePath object without a UUID in the filename.

    Examples
    --------
    >>> add_uuid_string('/path/to/trials.intervals.a976e418-c8b8-4d24-be47-d05120b18341.npy')
    Path('/path/to/trials.intervals.npy')

    >>> add_uuid_string('/path/to/trials.intervals.npy')
    Path('/path/to/trials.intervals.npy')

    See Also
    --------
    one.alf.files.add_uuid_string
    """
    if isinstance(file_path, str):
        file_path = Path(file_path)
    name_parts = file_path.stem.split('.')

    if spec.is_uuid_string(name_parts[-1]):
        file_path = file_path.with_name('.'.join(name_parts[:-1]) + file_path.suffix)
    return file_path




[docs]
def padded_sequence(file_path):
    """
    Ensures a file path contains a zero-padded experiment sequence folder.

    Parameters
    ----------
    file_path : str, pathlib.Path, pathlib.PurePath
        A session or file path to convert.

    Returns
    -------
    pathlib.Path, pathlib.PurePath
        The same path but with the experiment sequence folder zero-padded.  If a PurePath was
        passed, a PurePath will be returned, otherwise a Path object is returned.

    Examples
    --------
    >>> file_path = '/iblrigdata/subject/2023-01-01/1/_ibl_experiment.description.yaml'
    >>> padded_sequence(file_path)
    pathlib.Path('/iblrigdata/subject/2023-01-01/001/_ibl_experiment.description.yaml')

    Supports folders and will not affect already padded paths

    >>> session_path = pathlib.PurePosixPath('subject/2023-01-01/001')
    >>> padded_sequence(file_path)
    pathlib.PurePosixPath('subject/2023-01-01/001')
    """
    if isinstance(file_path, str):
        file_path = Path(file_path)
    if (session_path := get_session_path(file_path)) is None:
        raise ValueError('path must include a valid ALF session path, e.g. subject/YYYY-MM-DD/N')
    idx = len(file_path.parts) - len(session_path.parts)
    sequence = str(int(session_path.parts[-1])).zfill(3)  # zero-pad if necessary
    return file_path.parents[idx].joinpath(sequence, file_path.relative_to(session_path))




[docs]
def without_revision(file_path):
    """
    Return file path without a revision folder.

    Parameters
    ----------
    file_path : str, pathlib.Path
        A valid ALF dataset path.

    Returns
    -------
    pathlib.Path
        The input file path without a revision folder.

    Examples
    --------
    >>> without_revision('/lab/Subjects/subject/2023-01-01/001/collection/#revision#/obj.attr.ext')
    Path('/lab/Subjects/subject/2023-01-01/001/collection/obj.attr.ext')
    """
    if isinstance(file_path, str):
        file_path = Path(file_path)
    *_, collection, revision = folder_parts(file_path.parent)
    return get_session_path(file_path).joinpath(*filter(None, (collection, file_path.name)))