Source code for brainbox.modeling.design_matrix

from warnings import warn
import numpy as np
import pandas as pd
import scipy.sparse as sp
import numba as nb


[docs]class DesignMatrix:
    """
    Design matrix constructor that will take in information about the temporal structure of a trial
    and allow the generation of a design matrix with specified regressors
    """

    def __init__(self, trialsdf, vartypes, binwidth=0.02):
        """
        Class for generating design matrices to model neural data. Provides handy routines for
        describing neural spiking activity using basis functions and other primitives.

        Based on work by Memming Park in:
        Il Memming Park, Miriam LR Meister, Alex C Huk, & Jonathan W Pillow Nature Neuroscience 17,
        1395-1403. (2014)
        and the accompanying code in MATLAB.

        Parameters
        ----------
        trialsdf : pandas.DataFrame
            Dataframe in which each row is a trial, and each column is a trial-by-trial covariate.
            This includes, optionally, continuous covariates like eye-position and wheel movement
            per trial, which can be used with object-datatype dataframes. The length of vectors
            stored in continuous-variable columns must match the length of the trial.

            Obligatory columns for the dataframe are "trial_start" and "trial_end", which tell the
            constructor which time points to associate with that trial.
        vartypes : dict
            Dictionary of types for each of the columns in trialsdf. Columns must be of the types:
            -- timing: timing events, in which the column values are times since the start of the
                session of an event within that trial, e.g. stimulus onset.
            -- value: scalars which describe a whole-trial value, such as stimulus contrast or
                probability block
            -- continuous: Columns which contain 1-D vectors per row that describe a covariate that
                changes within the trial. e.g. pupil diameter.
            Dictionary keys should be columns in trialsdf, values should be strings that are equal
            to one of the above.
        binwidth : float, optional
            Length of time bins which will be used for design matrix, by default 0.02
        """
        # Data checks #
        validtypes = ('timing', 'continuous', 'value')
        if not all([name in vartypes for name in trialsdf.columns]):
            raise KeyError("Some columns were not described in vartypes")
        if not all([value in validtypes for value in vartypes.values()]):
            raise ValueError("Invalid values were passed in vartypes")

        # Filter out cells which don't meet the criteria for minimum spiking, while doing trial
        # assignment
        vartypes['duration'] = 'value'
        base_df = trialsdf.copy()
        trialsdf = trialsdf.copy()  # Make sure we don't modify the original dataframe
        trbounds = trialsdf[['trial_start', 'trial_end']]  # Get the start/end of trials
        # Empty trial duration value to use later
        trialsdf['duration'] = np.nan
        # Figure out which columns are timing variables if vartypes was passed
        timingvars = [col for col in trialsdf.columns if vartypes[col] == 'timing']

        for i, (start, end) in trbounds.iterrows():
            if any(np.isnan((start, end))):
                warn(f"NaN values found in trial start or end at trial number {i}. "
                     "Discarding trial.")
                trialsdf.drop(i, inplace=True)
                continue
            for col in timingvars:
                # Round values for the timing variables to the 5th decimal place and subtract
                # trial start time.
                trialsdf.at[i, col] = np.round(trialsdf.at[i, col] - start, decimals=5)
            trialsdf.at[i, 'duration'] = end - start

        # Set model parameters to begin with
        self.binwidth = binwidth
        self.covar = {}
        self.trialsdf = trialsdf
        self.vartypes = vartypes
        self.base_df = base_df
        self.compiled = False
        return

[docs]    def binf(self, t):
        """
        Function to bin time t into binwidth of DM

        Parameters
        ----------
        t : float
            time, in seconds

        Returns
        -------
        int
            number of time bins
        """
        return np.ceil(t / self.binwidth).astype(int)

[docs]    def add_covariate_timing(self, covlabel, eventname, bases,
                             offset=0, deltaval=None, cond=None, desc=''):
        """
        Convenience wrapper for adding timing event regressors to the design matrix.

        Timing events are regressed against using basis functions, such as those generated by the
        functions in this module, which are convolved with a kronecker delta at the time where the
        event occurred. This operation is effectively a copy/paste of the basis functions, each of
        which have their own column in the design matrix. This means that when we fit weights, a
        single weight can govern the prediction of the model over a longer time period.

        Can be offset, such that the bases functions are applied before or after the timing.

        The height of the bases can be modified by a column in the design matrix, which will then
        multiply the delta function which is convolved with the bases.

        Parameters
        ----------
        covlabel : str
            Label which the covariate will use. Can be accessed via dot syntax of the instance
            usually.
        eventname : str
            Label of the column in trialsdf which has the event timing for each trial.
        bases : numpy.array
            nTB x nB array, i.e. number of time bins for the bases functions by number of bases.
            Each column in the array is used together to describe the response of a unit to that
            timing event.
        offset : float, seconds
            Offset of bases functions relative to timing event. Negative values mean bases will
            be applied *before* the timing event by that amount.
        deltaval : None, str, or pandas series, optional
            Values of the kronecker delta function peak used to encode the event. If a string, the
            column in trialsdf with that label will be used. If a pandas series with indexes
            matching trialsdf, corresponding elements of the series will be the delta funtion val.
            If None (default) height is 1.
        cond : None, list, or fun, optional
            Condition which to apply this covariate. Can either be a list of trial indices, or a
            function which takes in rows of the trialsdf and returns booleans.
        desc : str, optional
            Additional information about the covariate, if desired. by default ''
        """
        if covlabel in self.covar:
            raise AttributeError(f'Covariate {covlabel} already exists in model.')
        self._compile_check()
        if deltaval is None:
            gainmod = False
        elif isinstance(deltaval, pd.Series):
            gainmod = True
        elif isinstance(deltaval, str) and deltaval in self.trialsdf.columns:
            gainmod = True
            deltaval = self.trialsdf[deltaval]
        else:
            raise TypeError('deltaval must be None, pandas series, or string reference'
                            f' to trialsdf column. {type(deltaval)} was passed instead.')
        if self.vartypes[eventname] != 'timing':
            raise TypeError(f'Column {eventname} in trialsdf is not registered as a timing')

        vecsizes = self.trialsdf['duration'].apply(self.binf)
        stiminds = self.trialsdf[eventname].apply(self.binf)
        stimvecs = []
        for i in self.trialsdf.index:
            vec = np.zeros(vecsizes[i])
            if gainmod:
                vec[stiminds[i]] = deltaval[i]
            else:
                vec[stiminds[i]] = 1
            stimvecs.append(vec.reshape(-1, 1))
        regressor = pd.Series(stimvecs, index=self.trialsdf.index)
        self.add_covariate(covlabel, regressor, bases, offset, cond, desc)
        return

[docs]    def add_covariate_boxcar(self, covlabel, boxstart, boxend,
                             cond=None, height=None, desc=''):
        """
        Convenience wrapped on add_covariate to add a boxcar covariate on the given start and end
        variables, such that the covariate is a step function with non-zero value between those
        values.

        Note: This has not been tested yet and is not guaranteed to work, or work correctly.

        Parameters
        ----------
        covlabel : str
            Name of the covariate for accessing later. Can be accessed via dot syntax of the
            instance usually.
        boxstart : str
            Column name in trialsdf which will be used to define the start of the boxcar
        boxend : str
            Column name in trialsdf which defines the end of boxcar variable
        cond : None, list, or func, optional
            Condition in which to apply this covariate. Can either be a list of trial indices, or
            a function which takes in a row of the trialsdf and returns a boolen on inclusion,
            by default None
        height : None, str, or pandas series, optional
            Values for the height of the boxcar during the period defined per trial. Can be a
            reference to a column in trialsdf or a separate series, by default None
        desc : str, optional
            Additional information about the covariate to store as a string, by default ''

        """
        if covlabel in self.covar:
            raise AttributeError(f'Covariate {covlabel} already exists in model.')
        self._compile_check()
        if boxstart not in self.trialsdf.columns or boxend not in self.trialsdf.columns:
            raise KeyError('boxstart or boxend not found in trialsdf columns.')
        if self.vartypes[boxstart] != 'timing':
            raise TypeError(f'Column {boxstart} in trialsdf is not registered as a timing. '
                            'boxstart and boxend need to refer to timing events in trialsdf.')
        if self.vartypes[boxend] != 'timing':
            raise TypeError(f'Column {boxend} in trialsdf is not registered as a timing. '
                            'boxstart and boxend need to refer to timing events in trialsdf.')

        if isinstance(height, str):
            if height in self.trialsdf.columns:
                height = self.trialsdf[height]
            else:
                raise KeyError(f'{height} is str not in columns of trialsdf')
        elif isinstance(height, pd.Series):
            if not all(height.index == self.trialsdf.index):
                raise IndexError('Indices of height series does not match trialsdf.')
        elif height is None:
            height = pd.Series(np.ones(len(self.trialsdf.index)), index=self.trialsdf.index)
        vecsizes = self.trialsdf['duration'].apply(self.binf)
        stind = self.trialsdf[boxstart].apply(self.binf)
        endind = self.trialsdf[boxend].apply(self.binf)
        stimvecs = []
        for i in self.trialsdf.index:
            bxcar = np.zeros(vecsizes[i])
            bxcar[stind[i]:endind[i] + 1] = height[i]
            stimvecs.append(bxcar)
        regressor = pd.Series(stimvecs, index=self.trialsdf.index)
        self.add_covariate(covlabel, regressor, None, cond, desc)
        return

[docs]    def add_covariate_raw(self, covlabel, raw,
                          cond=None, desc=''):
        """
        Convenience wrapper to add a 'raw' covariate, that is to say a covariate which is a
        continuous value that changes with time during the course of a trial.

        Note: This has not been tested and is not guaranteed to work or to work correctly.

        Parameters
        ----------
        covlabel : str
            String used to reference covariate, can usually be accessed by instance's dot syntax
        raw : str, func, or pandas series
            The covariate to add to the design matrix. Can be a str reference to a column in
            trialsdf, a function which takes in rows of trialsdf and produces a vector for each
            row of the appropriate size given binwidth and trial duration, or a pandas series
            of vectors of said appropriate type.
        cond : None, list, or func, optional
            Trials in which to apply the given covariate. Can be a list of trial numbers,
            or a function which accepts rows of the trialsdf and returns a boolean, by default None
        desc : str, optional
            Additional information about the covariate for access later, by default ''
        """
        stimlens = self.trialsdf.duration.apply(self.binf)
        if isinstance(raw, str):
            if raw not in self.trialsdf.columns:
                raise KeyError(f'String {raw} not found in columns of trialsdf. Strings must'
                               'refer to valid column names.')
            covseries = self.trialsdf[raw]
            if np.any(covseries.apply(len) != stimlens):
                raise IndexError(f'Some array shapes in {raw} do not match binned duration.')
            self.add_covariate(covlabel, covseries, None, cond=cond)

        if callable(raw):
            try:
                covseries = self.trialsdf.apply(raw, axis=1)
            except Exception:
                raise TypeError('Function for raw covariate generation did not run properly.'
                                'Make sure that the function passed takes in rows of trialsdf.')
            if np.any(covseries.apply(len) != stimlens):
                raise IndexError(f'Some array shapes in {raw} do not match binned duration.')
            self.add_covariate(covlabel, covseries, None, cond=cond)

        if isinstance(raw, pd.Series):
            if np.any(raw.index != self.trialsdf.index):
                raise IndexError('Indices of raw do not match indices of trialsdf.')
            if np.any(raw.apply(len) != stimlens):
                raise IndexError(f'Some array shapes in {raw} do not match binned duration.')
            self.add_covariate(covlabel, raw, None, cond=cond)

[docs]    def add_covariate(self, covlabel, regressor, bases,
                      offset=0, cond=None, desc=''):
        """
        Parent function to add covariates to model object. Takes a regressor in the form of a
        pandas Series object, a T x M array of M bases, and stores them for use in the design
        matrix generation.

        Parameters
        ----------
        covlabel : str
            Label for the covariate being added. Will be exposed, if possible, through
            (instance).(covlabel) attribute.
        regressor : pandas.Series
            Series in which each element is the value(s) of a regressor for a trial at that index.
            These will be convolved with the bases functions (if provided) to produce the
            components of the design matrix. *Regressor must be (T / dt) x 1 array for each trial*
        bases : numpy.array or None
            T x M array of M basis functions over T timesteps. Columns will be convolved with the
            elements of `regressor` to produce elements of the design matrix. If None, it is
            assumed a raw regressor is being used.
        offset : int, optional
            Offset of the regressor from the bases during convolution. Negative values indicate
            that the firing of the unit will be , by default 0
        cond : list or func, optional
            Condition for which to apply covariate. Either a list of trials which the covariate
            applies to, or a function of the form f(dataframerow) which returns a boolean,
            by default None
        desc : str, optional
            Description of the covariate for reference purposes, by default '' (empty)
        """
        if covlabel in self.covar:
            raise AttributeError(f'Covariate {covlabel} already exists in model.')
        self._compile_check()
        # Test for mismatch in length of regressor vs trials
        mismatch = np.zeros(len(self.trialsdf.index), dtype=bool)
        for i in self.trialsdf.index:
            currtr = self.trialsdf.loc[i]
            nT = self.binf(currtr.duration)
            if regressor.loc[i].shape[0] != nT:
                mismatch[i] = True

        if np.any(mismatch):
            raise ValueError('Length mismatch between regressor and trial on trials'
                             f'{np.argwhere(mismatch)}.')

        # Initialize containers for the covariate dicts
        if not hasattr(self, 'currcol'):
            self.currcol = 0
        if callable(cond):
            cond = self.trialsdf.index[self.trialsdf.apply(cond, axis=1)].to_numpy()
        if not all(regressor.index == self.trialsdf.index):
            raise IndexError('Indices of regressor and trials dataframes do not match.')

        cov = {'description': desc,
               'bases': bases,
               'valid_trials': cond if cond is not None else self.trialsdf.index,
               'offset': offset,
               'regressor': regressor,
               'dmcol_idx': np.arange(self.currcol, self.currcol + bases.shape[1])
               if bases is not None else self.currcol}
        if bases is None:
            self.currcol += 1
        else:
            self.currcol += bases.shape[1]

        self.covar[covlabel] = cov
        return

[docs]    def compile_design_matrix(self, dense=True):
        """
        Compiles design matrix for the current experiment based on the covariates which were added
        with the various NeuralGLM.add_covariate methods available. Can optionally compile a sparse
        design matrix using the scipy.sparse package, however that method may take longer depending
        on the degree of sparseness.

        Parameters
        ----------
        dense : bool, optional
            Whether or not to compute a dense design matrix or a sparse one, by default True
        """
        covars = self.covar
        # Go trial by trial and compose smaller design matrices
        miniDMs = []
        rowtrials = []
        for i, trial in self.trialsdf.iterrows():
            nT = self.binf(trial.duration)
            miniX = np.zeros((nT, self.currcol))
            rowlabs = np.ones((nT, 1), dtype=int) * i
            for cov in covars.values():
                sidx = cov['dmcol_idx']
                # Optionally use cond to filter out which trials to apply certain regressors,
                if i not in cov['valid_trials']:
                    continue
                stim = cov['regressor'][i]
                # Convolve Kernel or basis function with stimulus or regressor
                if cov['bases'] is None:
                    miniX[:, sidx] = stim
                else:
                    if len(stim.shape) == 1:
                        stim = stim.reshape(-1, 1)
                    miniX[:, sidx] = convbasis(stim, cov['bases'], self.binf(cov['offset']))
            # Sparsify convolved result and store in miniDMs
            if dense:
                miniDMs.append(miniX)
            else:
                miniDMs.append(sp.lil_matrix(miniX))
            rowtrials.append(rowlabs)
        if dense:
            dm = np.vstack(miniDMs)

        else:
            dm = sp.vstack(miniDMs).to_csc()
        trlabels = np.vstack(rowtrials)
        if hasattr(self, 'binnedspikes'):
            assert self.binnedspikes.shape[0] == dm.shape[0], "Oh shit. Indexing error."
        self.dm = dm
        self.trlabels = trlabels
        self.compiled = True
        return

    def __getitem__(self, key):
        if not self.compiled:
            raise AttributeError('Cannot index uncompiled design matrix')
        return self.dm[key]

    def _compile_check(self):
        if self.compiled:
            warn('Design matrix was already compiled once. Be sure to compile again if adding'
                 ' additional covariates.')
        return


# Precompilation for speed
[docs]@nb.njit
def denseconv(X, bases):
    T, dx = X.shape
    TB, M = bases.shape
    indices = np.ones((dx, M))
    sI = np.sum(indices, axis=1)
    BX = np.zeros((T, int(np.sum(sI))))
    sI = np.cumsum(sI)
    k = 0
    for kCov in range(dx):
        A = np.zeros((T + TB - 1, int(np.sum(indices[kCov, :]))))
        for i, j in enumerate(np.argwhere(indices[kCov, :]).flat):
            A[:, i] = np.convolve(X[:, kCov], bases[:, j])
        BX[:, k: sI[kCov]] = A[:T, :]
        k = sI[kCov]
    return BX


[docs]def convbasis(stim, bases, offset=0):
    if offset < 0:
        stim = np.pad(stim, ((0, -offset), (0, 0)), 'constant')
    elif offset > 0:
        stim = np.pad(stim, ((offset, 0), (0, 0)), 'constant')

    X = denseconv(stim, bases)

    if offset < 0:
        X = X[-offset:, :]
    elif offset > 0:
        X = X[:-offset, :]
    return X