Source code for one.alf.spec

"""The complete ALF specification descriptors and validators."""
import re
import textwrap
from enum import IntEnum
from uuid import UUID
from typing import Union

from iblutil.util import flatten

"""dict: The ALF part names and their definitions."""
SPEC_DESCRIPTION = {
    'lab': 'The name of the lab where the data were collected (optional).',
    'Subjects': 'An optional directory to indicate that the experiment data are divided by '
                'subject.  If organizing by lab, this directory is required.',
    'subject': 'The subject name, typically an arbitrary label',
    'date': 'The date on which the experiment session took place, in ISO format, i.e. yyyy-mm-dd',
    'number': 'The sequential session number of the day, optionally zero-padded to be three '
              'numbers, e.g. 001, 002, etc.',
    'collection': 'An optional folder to group data by modality, device, etc.  This is necessary '
                  'when a session contains multiple measurements of the same type, for example '
                  'spike times from multiple probes.  Label examples include "probe00", '
                  '"raw_video_data".',
    'revision': 'An optional folder to organize data by version.  The version label is arbitrary, '
                'however the folder must start and end with pound signs, e.g. "#v1.0.0#". '
                'Unlike collections, if a specified revision is not found, the previous revision '
                'will be returned.  The revisions are ordered lexicographically.',
    'namespace': 'An optional filename prefix for data that are not expected to be a community '
                 'standard, for example task specific events.  The namespace may also be used to '
                 'indicate data unique to a given piece of hardware or software, and is '
                 'identified by underscores, e.g. "_iblrig_", "_phy_".',
    'object': 'Every file describing a given object has the same number of rows (i.e. the 1st '
              'dimension of an npy file, number of frames in a video file, etc).  You can '
              'therefore think of the files for an object as together defining a table, with '
              'column headings given by the attribute in the file names, and values given by the '
              'file contents.  Object names should be in Haskell case and pluralized, '
              'e.g. "wheelMoves", "sparseNoise", "trials".\nEncoding of relations between objects '
              'can be achieved by a simplified relational model.  If the attribute name of one '
              'file matches the object name of a second, then the first file is guaranteed to '
              'contain integers referring to the rows of the second. For example, '
              '"spikes.clusters.npy" would contain integer references to the rows of '
              '"clusters.brain_location.json" and "clusters.probes.npy"; and '
              '"clusters.probes.npy" would contain integer references to "probes.insertion.json". '
              '\nBe careful of plurals ("clusters.probe.npy" would not correspond to '
              '"probes.insertion.json") and remember we count arrays starting from 0.',
    'attribute': 'Together with the object, the attribute represents the type of data in the '
                 'file, for example "times", "amplitudes", "clusters".  The names should be in '
                 'Haskell case, however the following three attributes may be separated by an '
                 'underscore, e.g. "stimOn_times".\nThe attribute "times" is reserved for '
                 'discrete event times and comprises a numerical array containing times of the '
                 'events in seconds, relative to a universal timescale common to all files.\n'
                 'The attribute "intervals" should have two columns, indicating the start and end '
                 'times of each interval relative to the universal timescale.\n'
                 'Continuous timeseries are represented by the "timestamps" attribute.  The file '
                 'may contain a vector of times in universal seconds if unevenly sampled, or two '
                 'rows each representing a synchronization point, the first column giving the '
                 'sample number (counting from 0), and the second column giving the '
                 'corresponding time in universal seconds.  The times corresponding to all '
                 'samples are then found by linear interpolation.  NB: the "timestamps" file is '
                 'an exception to the rule that all files representing a continuous timeseries '
                 'object must have one row per sample, as it will often have substantially fewer.',
    'timescale': 'If you want to represent times relative to another (non-universal) timescale, '
                 'a timescale can be appended after an underscore e.g. '
                 '"spikes.times_ephysClock.npy", "trials.intervals_nidaq", '
                 '"wheel.timestamps_bpod.csv".',
    'extra': 'File names could have as many optional parts as you like: '
             '"object.attribute.x1.x2.[…].xN.extension".  The extra name parts play no formal '
             'role, but can serve several additional purposes. For example, it could be a UUID or '
             'file hash for archiving purposes.  If there are multiple files with the same '
             'object, attribute, and extensions but different extra parts, these should be '
             'treated as files to be concatenated, for example to allow multiple-part tif files '
             'as produced by scanimage to be encoded in ALF. The concatenation would happen in '
             'hierarchical lexicographical order: i.e. by lexicographic order of x1, '
             'then x2, etc.',
    'extension': 'ALF can deal with any sort of file, as long as it has a concept of a number of '
                 'rows (or primary dimension). The type of file is recognized by its extension. \n'
                 'Preferred choices:\n\n.npy: numpy array file. This is recommended over flat '
                 'binary since the datatype and shape are stored in the file.  If you have an '
                 'array of 3 or more dimensions, the first dimension counts as the number of rows.\n\n'
                 '.tsv: tab-delimited text file. This is recommended over comma-separated files '
                 'since text fields often have commas in them. All rows should have the same '
                 'number of columns. The first row contains tab-separated names for each column.\n\n'
                 '.bin: flat binary file. It’s better to use .npy for storing binary data but '
                 'some recording systems save in flat binary.  Rather than convert them, '
                 'you can ALFize a flat binary file by adding a metadata file, which specifies '
                 'the number of columns (as the size of the "columns" array) and the binary '
                 'datatype as a top-level key "dtype", using numpy naming conventions.'
}
"""dict: The ALF part names and their definitions."""

# ========================================================== #
# The following are the specifications and patterns for ALFs #
# ========================================================== #

SESSION_SPEC = '({lab}/Subjects/)?{subject}/{date}/{number}'
"""str: The session specification pattern"""

COLLECTION_SPEC = r'({collection}/)?(#{revision}#/)?'
"""str: The collection and revision specification pattern"""

FILE_SPEC = r'_?{namespace}?_?{object}\.{attribute}(?:_{timescale})?(?:\.{extra})*\.{extension}$'
"""str: The filename specification pattern"""

REL_PATH_SPEC = f'{COLLECTION_SPEC}{FILE_SPEC}'
"""str: The collection, revision and filename specification pattern"""

FULL_SPEC = f'{SESSION_SPEC}/{REL_PATH_SPEC}'
"""str: The full ALF path specification pattern"""

_DEFAULT = (
    ('lab', r'\w+'),
    ('subject', r'[\w.-]+'),
    ('date', r'\d{4}-\d{2}-\d{2}'),
    ('number', r'\d{1,3}'),
    ('collection', r'[\w./-]+'),
    ('revision', r'[\w.-]+'),  # brackets
    # to include underscores: r'(?P<namespace>(?:^_)\w+(?:_))?'
    ('namespace', '(?<=_)[a-zA-Z0-9]+'),  # brackets
    ('object', r'\w+'),
    # to treat _times and _intervals as timescale: (?P<attribute>[a-zA-Z]+)_?
    # (?:_[a-z]+_)? allows attribute level namespaces (deprecated)
    ('attribute', r'(?:_[a-z]+_)?[a-zA-Z0-9]+(?:_times(?=[_.])|_intervals(?=[_.]))?'),  # brackets
    ('timescale', r'\w+'),  # brackets
    ('extra', r'[.\w-]+'),  # brackets
    ('extension', r'\w+')
)
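
# Sketch of how the default attribute pattern interacts with the timescale
# part (example filenames hypothetical): a trailing '_times' or '_intervals'
# is kept as part of the attribute, while other underscore-separated suffixes
# are captured as a timescale.
#
# >>> regex(FILE_SPEC).match('trials.stimOn_times.npy')['attribute']
# 'stimOn_times'
# >>> regex(FILE_SPEC).match('spikes.times_ephysClock.npy')['timescale']
# 'ephysClock'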


class QC(IntEnum):
    """Data QC outcomes.

    This enumeration is used by the Alyx database.  NB: Pandas cache tables use different codes.
    """

    CRITICAL = 50
    """Dataset practically unusable, e.g. clock can't be aligned; data missing or inaccurate."""

    FAIL = 40
    """Dataset does not meet expected standards, e.g. trial event timings different to protocol."""

    WARNING = 30
    """
    Dataset has minor quality issues, e.g. relatively high SNR, that should not affect most
    analyses.
    """

    NOT_SET = 0
    """Dataset quality has not been assessed."""

    PASS = 10
    """Dataset considered 'gold-standard', e.g. tight trial event timings, low recorded SNR."""

    @staticmethod
    def validate(v):
        """
        Validate QC input and return equivalent enumeration.

        Parameters
        ----------
        v : int, str, QC
            A QC enumeration, or equivalent int value or name.

        Returns
        -------
        QC
            The enumeration.

        Raises
        ------
        ValueError
            An invalid QC value was passed.
        """
        if isinstance(v, QC):
            return v
        elif isinstance(v, str):
            if v.isnumeric():
                return QC(int(v))
            try:
                return QC[v.upper()]
            except KeyError:
                raise ValueError(f'{v} is not a valid QC')
        else:
            return QC(v)
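

# A brief usage sketch of the validator above; string names and numeric
# values (int or str) all resolve to the same member:
#
# >>> QC.validate('critical') is QC.CRITICAL
# True
# >>> QC.validate('50') == QC.validate(50) == QC.CRITICAL
# True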


def path_pattern() -> str:
    """Return a template string representing where the ALF parts lie in an ALF path.

    Brackets denote optional parts.  This is used for documentation purposes only.
    """
    return ''.join(filter(lambda c: c not in '{}?*\\$', FULL_SPEC))


def describe(part=None, width=99):
    """Print a description of an ALF part.

    Prints the path pattern along with a description of the given ALF part (or all parts if
    None).

    Parameters
    ----------
    part : str
        ALF part to describe.  One from `SPEC_DESCRIPTION.keys()`.  If None, all parts are
        described.
    width : int
        The max line length.

    Returns
    -------
    None

    Examples
    --------
    >>> describe()
    >>> describe('collection')
    >>> describe('extension', width=120)
    """
    full_spec = path_pattern()
    print(full_spec)
    if part:
        if part not in SPEC_DESCRIPTION.keys():
            all_parts = '"' + '", "'.join(SPEC_DESCRIPTION.keys()) + '"'
            raise ValueError(f'Unknown ALF part "{part}", should be one of {all_parts}')
        parts = [part]
        # Underline the location of the part within the printed pattern
        span = re.search(part, full_spec).span()
        print(' ' * span[0] + '^' * (span[1] - span[0]) + ' ' * (len(full_spec) - span[1]))
    else:
        parts = SPEC_DESCRIPTION.keys()
    for part in parts:
        print('\n' + part.upper())
        # Split the description by max line width
        lines = flatten(textwrap.wrap(ln, width, replace_whitespace=False)
                        for ln in SPEC_DESCRIPTION[part].splitlines())
        [print(ln) for ln in lines]


def _dromedary(string) -> str:
    """
    Convert a string to camel case.  Acronyms/initialisms are preserved.

    Parameters
    ----------
    string : str
        To be converted to camel case.

    Returns
    -------
    str
        The string in camel case.

    Examples
    --------
    >>> _dromedary('Hello world') == 'helloWorld'
    >>> _dromedary('motion_energy') == 'motionEnergy'
    >>> _dromedary('passive_RFM') == 'passiveRFM'
    >>> _dromedary('FooBarBaz') == 'fooBarBaz'

    See Also
    --------
    readableALF
    """
    def _capitalize(x):
        return x if x.isupper() else x.capitalize()
    if not string:  # short circuit on None and ''
        return string
    first, *other = re.split(r'[_\s]', string)
    if len(other) == 0:
        # Already camel/Pascal case; ensure the first letter is lower case
        return first[0].lower() + first[1:]
    # Convert to camel case, preserving all-uppercase elements
    first = first if first.isupper() else first.casefold()
    return ''.join([first, *map(_capitalize, other)])


def _named(pattern, name):
    """Wrap a regex pattern in a named capture group."""
    return f'(?P<{name}>{pattern})'


def regex(spec: str = FULL_SPEC, **kwargs) -> re.Pattern:
    """
    Construct a regular expression pattern for parsing or validating an ALF.

    Parameters
    ----------
    spec : str
        The spec string to construct the regular expression from.
    kwargs : dict[str]
        Optional patterns to replace the defaults.

    Returns
    -------
    re.Pattern
        A regular expression Pattern object.

    Examples
    --------
    Regex for a filename

    >>> pattern = regex(spec=FILE_SPEC)

    Regex for a complete path (including root)

    >>> pattern = '.*' + regex(spec=FULL_SPEC).pattern

    Regex pattern for specific object name

    >>> pattern = regex(object='trials')
    """
    fields = dict(_DEFAULT)
    if not fields.keys() >= kwargs.keys():
        unknown = next(k for k in kwargs.keys() if k not in fields.keys())
        raise KeyError(f'Unknown field "{unknown}"')
    fields.update({k: v for k, v in kwargs.items() if v is not None})
    spec_str = spec.format(**{k: _named(fields[k], k) for k in re.findall(r'(?<={)\w+', spec)})
    return re.compile(spec_str)


def is_valid(filename):
    """
    Return True if a given file name is a valid ALF file, otherwise False.

    Parameters
    ----------
    filename : str
        The name of the file to evaluate.

    Returns
    -------
    bool
        True if filename is valid ALF.

    Examples
    --------
    >>> is_valid('trials.feedbackType.npy')
    True
    >>> is_valid('_ns_obj.attr1.2622b17c-9408-4910-99cb-abf16d9225b9.metadata.json')
    True
    >>> is_valid('spike_train.npy')
    False
    >>> is_valid('channels._phy_ids.csv')  # WARNING: attribute level namespaces are deprecated
    True
    """
    return regex(FILE_SPEC).match(filename) is not None


def is_session_path(path_object):
    """
    Check if the syntax corresponds to a session path.

    Note that there is no physical check of existence nor contents.

    Parameters
    ----------
    path_object : str, pathlib.Path
        The path object to validate.

    Returns
    -------
    bool
        True if the path is a valid ALF session path.
    """
    session_spec = re.compile(regex(SESSION_SPEC).pattern + '$')
    if hasattr(path_object, 'as_posix'):
        path_object = path_object.as_posix()
    path_object = path_object.strip('/')
    return session_spec.search(path_object) is not None


def is_uuid_string(string: str) -> bool:
    """
    Bool test for randomly generated hexadecimal uuid validity.

    NB: unlike `is_uuid`, `is_uuid_string` checks that the uuid is correctly hyphen-separated.
    """
    return isinstance(string, str) and is_uuid(string, (3, 4, 5)) and str(UUID(string)) == string


def is_uuid(uuid: Union[str, int, bytes, UUID], versions=(4,)) -> bool:
    """Bool test for randomly generated hexadecimal uuid validity.

    Unlike `is_uuid_string`, this function accepts UUID objects.
    """
    if not isinstance(uuid, (UUID, str, bytes, int)):
        return False
    elif not isinstance(uuid, UUID):
        try:
            uuid = UUID(uuid) if isinstance(uuid, str) else UUID(**{type(uuid).__name__: uuid})
        except ValueError:
            return False
    return isinstance(uuid, UUID) and uuid.version in versions


def to_alf(object, attribute, extension, namespace=None, timescale=None, extra=None):
    """
    Given a set of ALF file parts, return a valid ALF file name.

    Essential periods and underscores are added by the function.

    Parameters
    ----------
    object : str
        The ALF object name.
    attribute : str
        The ALF object attribute name.
    extension : str
        The file extension.
    namespace : str
        An optional namespace.
    timescale : str, tuple
        An optional timescale.
    extra : str, tuple
        One or more optional extra ALF attributes.

    Returns
    -------
    str
        A file name string built from the ALF parts.

    Examples
    --------
    >>> to_alf('spikes', 'times', 'ssv')
    'spikes.times.ssv'
    >>> to_alf('spikes', 'times', 'ssv', namespace='ibl')
    '_ibl_spikes.times.ssv'
    >>> to_alf('spikes', 'times', 'ssv', namespace='ibl', timescale='ephysClock')
    '_ibl_spikes.times_ephysClock.ssv'
    >>> to_alf('spikes', 'times', 'ssv', namespace='ibl', timescale=('ephys clock', 'minutes'))
    '_ibl_spikes.times_ephysClock_minutes.ssv'
    >>> to_alf('spikes', 'times', 'npy', namespace='ibl', timescale='ephysClock', extra='raw')
    '_ibl_spikes.times_ephysClock.raw.npy'
    >>> to_alf('wheel', 'timestamps', 'npy', 'ibl', 'bpod', ('raw', 'v12'))
    '_ibl_wheel.timestamps_bpod.raw.v12.npy'
    """
    # Validate inputs
    if not extension:
        raise TypeError('An extension must be provided')
    elif extension.startswith('.'):
        extension = extension[1:]
    if any(pt is not None and '.' in pt
           for pt in (object, attribute, namespace, extension, timescale)):
        raise ValueError('ALF parts must not contain a period (`.`)')
    if '_' in (namespace or ''):
        raise ValueError('Namespace must not contain extra underscores')
    if object[0] == '_':
        raise ValueError('Objects must not contain underscores; use namespace arg instead')
    # Ensure parts are camel case (converts whitespace and snake case)
    if timescale:
        timescale = filter(None, [timescale] if isinstance(timescale, str) else timescale)
        timescale = '_'.join(map(_dromedary, timescale))
    # Convert attribute to camel case, leaving '_times', etc. intact
    times_re = re.search('_(times|timestamps|intervals)$', attribute)
    idx = times_re.start() if times_re else len(attribute)
    attribute = _dromedary(attribute[:idx]) + attribute[idx:]
    object = _dromedary(object)
    # Optional extras may be provided as string or tuple of strings
    if not extra:
        extra = ()
    elif isinstance(extra, str):
        extra = extra.split('.')
    # Construct ALF file
    parts = (('_%s_' % namespace if namespace else '') + object,
             attribute + ('_%s' % timescale if timescale else ''),
             *extra, extension)
    return '.'.join(parts)


def readableALF(name: str, capitalize: bool = False) -> str:
    """Convert a camel case string to a space separated string.

    Given an ALF object name or attribute, return a string where the camel case words are space
    separated.  Acronyms/initialisms are preserved.

    Parameters
    ----------
    name : str
        The ALF part to format (e.g. object name or attribute).
    capitalize : bool
        If true, return with the first letter capitalized.

    Returns
    -------
    str
        The name formatted for display, with spaces and capitalization.

    Examples
    --------
    >>> readableALF('sparseNoise') == 'sparse noise'
    >>> readableALF('someROIDataset') == 'some ROI dataset'
    >>> readableALF('someROIDataset', capitalize=True) == 'Some ROI dataset'

    See Also
    --------
    _dromedary
    """
    words = []
    i = 0
    matches = re.finditer(r'[A-Z](?=[a-z0-9])|(?<=[a-z0-9])[A-Z]', name)
    for j in map(re.Match.start, matches):
        words.append(name[i:j])
        i = j
    words.append(name[i:])
    display_str = ' '.join(map(lambda s: s if s.isupper() else s.lower(), words))
    return display_str[0].upper() + display_str[1:] if capitalize else display_str