Source code for one.alf.spec

"""The complete AlF specification descriptors and validators
TODO Currently extensions are not optional
TODO Make Subjects an optional requirement with lab, i.e. lab/Subjects/subject/date/num OR
 subject/date/num but NOT lab/subject/date/num
"""
import re
import textwrap
from uuid import UUID
from typing import Union

from iblutil.util import flatten

SPEC_DESCRIPTION = {
    'lab': 'The name of the lab where the data were collected',
    'Subjects': 'An optional directory to indicate that the experiment data are divided by '
                'subject',
    'subject': 'The subject name, typically an arbitrary label',
    'date': 'The date on which the experiment session took place, in ISO format, i.e. yyyy-mm-dd',
    'number': 'The sequential session number of the day, optionally zero-padded to be three '
              'numbers, e.g. 001, 002, etc.',
    'collection': 'An optional folder to group data by modality, device, etc.  This is necessary '
                  'when a session contains multiple measurements of the same type, from example '
                  'spike times from multiple probes.  Label examples include "probe00", '
                  '"raw_video_data".',
    'revision': 'An optional folder to organize data by version.  The version label is arbitrary, '
                'however the folder must start and end with pound signs, e.g. "#v1.0.0#". '
                'Unlike collections, if a specified revision is not found, the previous revision '
                'will be returned.  The revisions are ordered lexicographically.',
    'namespace': 'An option filename prefix for data that are not not expected to be a community '
                 'standard, for example task specific events.  The namespace may also be used to '
                 'indicate data unique to a given piece of hardware or software, and is '
                 'identified by underscores, e.g. "_iblrig_", "_phy_".',
    'object': 'Every file describing a given object has the same number of rows (i.e. the 1st '
              'dimension of an npy file, number of frames in a video file, etc).  You can '
              'therefore think of the files for an object as together defining a table, with '
              'column headings given by the attribute in the file names, and values given by the '
              'file contents.  Object names should be in Haskell case and pluralized, '
              'e.g. "wheelMoves", "sparseNoise", "trials".\nEncoding of relations between objects '
              'can be achieved by a simplified relational model.  If the attribute name of one '
              'file matches the object name of a second, then the first file is guaranteed to '
              'contain integers referring to the rows of the second. For example, '
              '"spikes.clusters.npy" would contain integer references to the rows of '
              '"clusters.brain_location.json" and "clusters.probes.npy"; and '
              '"clusters.probes.npy" would contain integer references to "probes.insertion.json". '
              '\nBe careful of plurals ("clusters.probe.npy" would not correspond to '
              '"probes.insertion.json") and remember we count arrays starting from 0.',
    'attribute': 'Together with the object, the attribute represents the type of data in the '
                 'file, for example "times", "amplitudes", "clusters".  The names should be in '
                 'Haskell case, however the following three attributes may be separated by an '
                 'underscore, e.g. "stimOn_times".\nThe attribute "times" is reserved for '
                 'discrete event times and comprises a numerical array containing times of the '
                 'events in seconds, relative to a universal timescale common to all files.\n'
                 'The attribute "intervals" should have two columns, indicating the start and end '
                 'times of each interval relative to the universal timescale.\n'
                 'Continuous timeseries are represented by the "timestamps" attribute.  The file '
                 'may contain a vector of times in universal seconds if unevenly sampled, or two '
                 'rows each representing a synchronization point, the first column giving the '
                 'sample number (counting from 0), and the second column giving the '
                 'corresponding time in universal seconds.  The times corresponding to all '
                 'samples are then found by linear interpolation.  NB: the "timestamps" file is '
                 'an exception to the rule that all files representing a continuous timeseries '
                 'object must have one row per sample, as it will often have substantially less.',
    'timescale': 'If you want to represent times relative to another (non-universal) timescale, '
                 'a timescale can be appended after an underscore e.g. '
                 '"spikes.times_ephysClock.npy", "trials.intervals_nidaq", '
                 '"wheel.timestamps_bpod.csv".',
    'extra': 'File names could have as many optional parts as you like: '
             '"object.attribute.x1.x2.[…].xN.extension".  The extra name parts play no formal '
             'role, but can serve several additional purposes. For example, it could be a UUID or '
             'file hash for archiving purposes.  If there are multiple files with the same '
             'object, attribute, and extensions but different extra parts, these should be '
             'treated as files to be concatenated, for example to allow multiple-part tif files '
             'as produced by scanimage to be encoded in ALF. The concatenation would happen in '
             'hierarchical lexicographical order: i.e. by lexicographic order of x1, '
             'then x2, etc.',
    'extension': 'ALF can deal with any sort of file, as long as it has a concept of a number of '
                 'rows (or primary dimension). The type of file is recognized by its extension. \n'
                 'Preferred choices:\n\n.npy: numpy array file. This is recommended over flat '
                 'binary since datatype and shape is stored in the file.  If you have an array of '
                 '3 or more dimensions, the first dimension counts as the number of rows.\n\n'
                 '.tsv: tab-delimited text file. This is recommended over comma-separated files'
                 'since text fields often have commas in. All rows should have the same number '
                 'of columns. The first row contains tab-separated names for each column.\n\n'
                 '.bin: flat binary file. It’s better to use .npy for storing binary data but '
                 'some recording systems save in flat binary.  Rather than convert them, '
                 'you can ALFize a flat binary file by adding a metadata file, which specifies '
                 'the number of columns (as the size of the "columns" array) and the binary '
                 'datatype as a top-level key "dtype", using numpy naming conventions.'
}


"""The following are the specifications and patterns for ALFs"""
SESSION_SPEC = '{lab}/(Subjects/)?{subject}/{date}/{number}'
COLLECTION_SPEC = r'({collection}/)?(#{revision}#/)?'
FILE_SPEC = r'_?{namespace}?_?{object}\.{attribute}_?{timescale}*\.?{extra}*\.{extension}$'
REL_PATH_SPEC = f'{COLLECTION_SPEC}{FILE_SPEC}'
FULL_SPEC = f'{SESSION_SPEC}/{REL_PATH_SPEC}'

_DEFAULT = (
    ('lab', r'\w+'),
    ('subject', r'[\w-]+'),
    ('date', r'\d{4}-\d{2}-\d{2}'),
    ('number', r'\d{1,3}'),
    ('collection', r'[\w/]+'),
    ('revision', r'[\w-]+'),  # brackets
    # to include underscores: r'(?P<namespace>(?:^_)\w+(?:_))?'
    ('namespace', '(?<=_)[a-zA-Z0-9]+'),  # brackets
    ('object', r'\w+'),
    # to treat _times and _intervals as timescale: (?P<attribute>[a-zA-Z]+)_?
    # (?:_[a-z]+_)? allows attribute level namespaces (deprecated)
    ('attribute', r'(?:_[a-z]+_)?[a-zA-Z0-9]+(?:_times(?=[_.])|_intervals(?=[_.]))?'),  # brackets
    ('timescale', r'(?:_?)\w+'),  # brackets
    ('extra', r'[.\w-]+'),  # brackets
    ('extension', r'\w+')
)


[docs]def path_pattern(): """Returns a template string representing the where the ALF parts lie in an ALF path Brackets denote optional parts. This is used for documentation purposes only. """ return ''.join(filter(lambda c: c not in '{}?*\\$', FULL_SPEC))
[docs]def describe(part=None, width=99): """Print a description of an ALF part Prints the path pattern along with a description of the given ALF part (or all parts if None). Parameters ---------- part : str ALF part to describe. One from `SPEC_DESCRIPTION.keys()`. If None, all parts are described. width : int The max line length. Returns ------- None Examples ------- >>> describe() >>> describe('collection') >>> describe('extension', width=120) """ full_spec = path_pattern() print(full_spec) if part: if part not in SPEC_DESCRIPTION.keys(): all_parts = '"' + '", "'.join(SPEC_DESCRIPTION.keys()) + '"' raise ValueError(f'Unknown ALF part "{part}", should be one of {all_parts}') parts = [part] span = re.search(part, full_spec).span() ' ' * len(full_spec) print(' ' * span[0] + '^' * (span[1] - span[0]) + ' ' * (len(full_spec) - span[1])) else: parts = SPEC_DESCRIPTION.keys() for part in parts: print('\n' + part.upper()) # Split by max width lines = flatten(textwrap.wrap(ln, width, replace_whitespace=False) for ln in SPEC_DESCRIPTION[part].splitlines()) [print(ln) for ln in lines]
def _dromedary(string) -> str: """ Convert a string to camel case. Acronyms/initialisms are preserved. Examples: _dromedary('Hello world') == 'helloWorld' _dromedary('motion_energy') == 'motionEnergy' _dromedary('passive_RFM') == 'passive RFM' _dromedary('FooBarBaz') == 'fooBarBaz' :param string: To be converted to camel case :return: The string in camel case """ def _capitalize(x): return x if x.isupper() else x.capitalize() if not string: # short circuit on None and '' return string first, *other = re.split(r'[_\s]', string) if len(other) == 0: # Already camel/Pascal case, ensure first letter lower case return first[0].lower() + first[1:] # Convert to camel case, preserving all-uppercase elements first = first if first.isupper() else first.casefold() return ''.join([first, *map(_capitalize, other)]) def _named(pattern, name): """Wraps a regex pattern in a named capture group""" return f'(?P<{name}>{pattern})'
[docs]def regex(spec: str = FULL_SPEC, **kwargs) -> re.Pattern: """ Construct a regular expression pattern for parsing or validating an ALF Parameters ---------- spec : str The spec string to construct the regular expression from kwargs : dict[str] Optional patterns to replace the defaults Returns ------- A regular expression Pattern object Examples -------- # Regex for a filename pattern = regex(spec=FILE_SPEC) # Regex for a complete path (including root) pattern = '.*' + regex(spec=FULL_SPEC) # Regex pattern for specific object name pattern = regex(object='trials) """ fields = dict(_DEFAULT) if not fields.keys() >= kwargs.keys(): unknown = next(k for k in kwargs.keys() if k not in fields.keys()) raise KeyError(f'Unknown field "{unknown}"') fields.update({k: v for k, v in kwargs.items() if v is not None}) spec_str = spec.format(**{k: _named(fields[k], k) for k in re.findall(r'(?<={)\w+', spec)}) return re.compile(spec_str)
[docs]def is_valid(filename): """ Returns a True for a given file name if it is an ALF file, otherwise returns False Parameters ---------- filename : str The name of the file to evaluate Returns ------- True if filename is valid ALF Examples -------- >>> is_valid('trials.feedbackType.npy') True >>> is_valid('_ns_obj.attr1.2622b17c-9408-4910-99cb-abf16d9225b9.metadata.json') True >>> is_valid('spike_train.npy') False >>> is_valid('channels._phy_ids.csv') # WARNING: attribute level namespaces are deprecated True """ return regex(FILE_SPEC).match(filename) is not None
[docs]def is_session_path(path_object): """ Checks if the syntax corresponds to a session path. Note that there is no physical check about existence nor contents Parameters ---------- path_object : str, pathlib.Path Returns ------- True if session path a valid ALF session path """ session_spec = re.compile(regex(SESSION_SPEC).pattern + '$') if hasattr(path_object, 'as_posix'): path_object = path_object.as_posix() path_object = path_object.strip('/') return session_spec.search(path_object) is not None
[docs]def is_uuid_string(string: str) -> bool: """ Bool test for randomly generated hexadecimal uuid validity NB: unlike is_uuid, is_uuid_string checks that uuid is correctly hyphen separated """ return isinstance(string, str) and is_uuid(string, (3, 4, 5)) and str(UUID(string)) == string
[docs]def is_uuid(uuid: Union[str, int, bytes, UUID], versions=(4,)) -> bool: """Bool test for randomly generated hexadecimal uuid validity Unlike `is_uuid_string`, this function accepts UUID objects """ if not isinstance(uuid, (UUID, str, bytes, int)): return False elif not isinstance(uuid, UUID): try: uuid = UUID(uuid) if isinstance(uuid, str) else UUID(**{type(uuid).__name__: uuid}) except ValueError: return False return isinstance(uuid, UUID) and uuid.version in versions
[docs]def to_alf(object, attribute, extension, namespace=None, timescale=None, extra=None): """ Given a set of ALF file parts, return a valid ALF file name. Essential periods and underscores are added by the function. Parameters ---------- object : str The ALF object name attribute : str The ALF object attribute name extension : str The file extension namespace : str An optional namespace timescale : str An optional timescale extra : str, tuple One or more optional extra ALF attributes Returns ------- A file name string built from the ALF parts Examples -------- >>> to_alf('spikes', 'times', 'ssv') 'spikes.times.ssv' >>> to_alf('spikes', 'times', 'ssv', namespace='ibl') '_ibl_spikes.times.ssv' >>> to_alf('spikes', 'times', 'ssv', namespace='ibl', timescale='ephysClock') '_ibl_spikes.times_ephysClock.ssv' >>> to_alf('spikes', 'times', 'npy', namespace='ibl', timescale='ephysClock', extra='raw') '_ibl_spikes.times_ephysClock.raw.npy' >>> to_alf('wheel', 'timestamps', 'npy', 'ibl', 'bpod', ('raw', 'v12')) '_ibl_wheel.timestamps_bpod.raw.v12.npy' """ # Validate inputs if not extension: raise TypeError('An extension must be provided') elif extension.startswith('.'): extension = extension[1:] if re.search('_(?!times$|intervals)', attribute): raise ValueError('Object attributes must not contain underscores') if any(pt is not None and '.' in pt for pt in (object, attribute, namespace, extension, timescale)): raise ValueError('ALF parts must not contain a period (`.`)') if '_' in (namespace or ''): raise ValueError('Namespace must not contain extra underscores') if object[0] == '_': raise ValueError('Objects must not contain underscores; use namespace arg instead') # Ensure parts are camel case (converts whitespace and snake case) object, timescale = map(_dromedary, (object, timescale)) # Optional extras may be provided as string or tuple of strings if not extra: extra = () elif isinstance(extra, str): extra = extra.split('.') # Construct ALF file parts = (('_%s_' % namespace if namespace else '') + object, attribute + ('_%s' % timescale if timescale else ''), *extra, extension) return '.'.join(parts)