Source code for brainbox.io.parquet

import uuid
import numpy as np

import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd

from brainbox.core import Bunch

import warnings
warnings.warn('Please use iblutil.io.parquet instead', category=DeprecationWarning)


def load(file):
    """
    Load a parquet file into a pandas dataframe.

    :param file: path to the parquet file
    :return: pandas.DataFrame
    """
    return pq.read_table(file).to_pandas()


def save(file, table):
    """
    Save a pandas dataframe to parquet.

    :param file: path to the output parquet file
    :param table: pandas.DataFrame to write
    :return: None
    """
    pq.write_table(pa.Table.from_pandas(table), file)


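# Usage sketch (not part of the original module; the file name 'example.pqt' is
# just an illustrative path): round-trip a small dataframe through a parquet file.
#   >>> df = pd.DataFrame({'spike_count': [12, 7], 'label': ['a', 'b']})
#   >>> save('example.pqt', df)
#   >>> load('example.pqt').equals(df)
#   True

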
def uuid2np(eids_uuid):
    """
    Convert a list of uuid.UUID objects to an (n, 2) int64 numpy array (Fortran-ordered).
    """
    return np.asfortranarray(
        np.array([np.frombuffer(eid.bytes, dtype=np.int64) for eid in eids_uuid]))


def str2np(eids_str):
    """
    Converts uuid string or list of uuid strings to int64 numpy array with 2 cols.

    Returns [0, 0] for None list entries.
    """
    if isinstance(eids_str, str):
        eids_str = [eids_str]
    return uuid2np([uuid.UUID(eid) if eid else uuid.UUID('0' * 32) for eid in eids_str])


def np2uuid(eids_np):
    """
    Convert an (n, 2) int64 numpy array (or DataFrame / Series) back to uuid.UUID objects.
    """
    if isinstance(eids_np, (pd.DataFrame, pd.Series)):
        eids_np = eids_np.to_numpy()
    if eids_np.ndim >= 2:
        return [uuid.UUID(bytes=npu.tobytes()) for npu in eids_np]
    else:
        return uuid.UUID(bytes=eids_np.tobytes())


def np2str(eids_np):
    """
    Convert an int64 numpy array of UUIDs back to a UUID string (1d input) or list of strings (2d input).
    """
    eids = np2uuid(eids_np)
    return str(eids) if isinstance(eids, uuid.UUID) else [str(u) for u in eids]


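# Usage sketch (the eid below is a made-up example, not from the original source):
# a UUID string maps to one row of two int64 values, and np2str recovers the string.
#   >>> eid = 'a3df91c8-52a6-4afa-957b-3479a7d0897c'
#   >>> arr = str2np(eid)                # shape (1, 2), dtype int64
#   >>> np2str(arr[0])                   # single row -> single string
#   'a3df91c8-52a6-4afa-957b-3479a7d0897c'

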
def rec2col(rec, join=None, include=None, exclude=None, uuid_fields=None, types=None):
    """
    Change a record list (usually from a REST API endpoint) to a column based dictionary
    (pandas dataframe).

    :param rec (list): list of dictionaries with consistent keys
    :param join (dictionary): dictionary of scalar keys that will be replicated over the full
     array (join operation)
    :param include: list of strings representing dictionary keys: if specified will only include
     the keys specified here
    :param exclude: list of strings representing dictionary keys: if specified will exclude the
     keys specified here
    :param uuid_fields: if the field is a UUID, will split it into 2 distinct int64 columns for
     efficient lookups and intersections
    :param types: for a given key, will force the type; example: types = {'file_size': np.double}
    :return: a Bunch
    """
    if isinstance(rec, dict):
        rec = [rec]
    if len(rec) == 0:
        return Bunch()
    if include is None:
        include = rec[0].keys() if isinstance(rec, list) else rec.keys()
    if exclude is None:
        exclude = []
    if uuid_fields is None:
        uuid_fields = []
    if join is None:
        join = {}
    # first loop over the records and create each column as a numpy array
    nrecs = len(rec)
    col = {}
    keys = [k for k in rec[0] if k in include and k not in exclude]
    for key in keys:
        if key in uuid_fields:
            npuuid = str2np(np.array([c[key] for c in rec]))
            col[f"{key}_0"] = npuuid[:, 0]
            col[f"{key}_1"] = npuuid[:, 1]
        elif types and key in types:
            col[key] = np.array([c[key] for c in rec]).astype(types[key])
        else:
            col[key] = np.array([c[key] for c in rec])
    # then perform the joins if any
    for key in join:
        if key in uuid_fields:
            npuuid = str2np([join[key]])
            col[f"{key}_0"] = np.tile(npuuid[0, 0], (nrecs,))
            col[f"{key}_1"] = np.tile(npuuid[0, 1], (nrecs,))
        else:
            col[key] = np.tile(np.array(join[key]), (nrecs,))
    return Bunch(col)
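

# Usage sketch (the records below are made up for illustration): fields listed in
# `uuid_fields` are split into two int64 columns, `types` forces a dtype, and
# `join` replicates a scalar over all records.
#   >>> recs = [{'id': '614e1937-4b24-4ad3-9055-c8b50089b9da', 'file_size': 100},
#   ...         {'id': '7cbecb3f-6a8a-48e5-a3be-8f7a762b5a04', 'file_size': 250}]
#   >>> col = rec2col(recs, uuid_fields=['id'], types={'file_size': np.double},
#   ...               join={'lab': 'labX'})
#   >>> sorted(col.keys())
#   ['file_size', 'id_0', 'id_1', 'lab']
#   >>> col['lab']
#   array(['labX', 'labX'], dtype='<U4')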