import logging
from pathlib import Path
import pandas as pd
import numpy as np
import one.remote.aws as aws
from iblatlas import atlas
_logger = logging.getLogger(__name__)
def load(folder_cache=None):
"""
Reads in the Allen gene expression experiments tables
:param folder_cache:
:return:
df_cells: a dataframe of cells (8_879_868, 11), where each record corresponds to a single cell
df_classes: a dataframe of classes (35, 3), where each record corresponds to a single class
df_subclasses: a dataframe of subclasses (339, 4), where each record corresponds to a single subclass
df_supertypes: a dataframe of supertypes (1202, 4), where each record corresponds to a single supertype
df_clusters: a dataframe of clusters (5323, 5), where each record corresponds to a single cluster
df_genes: a dataframe of genes (1672, 4), where each record corresponds to a single gene
df_neurotransmitters: a dataframe of neurotransmitters (9, 2), where each record corresponds to a single
neurotransmitter
"""
    OLD_VERSIONS = ['2023-06-12']
    folder_cache = Path(folder_cache or atlas.AllenAtlas._get_cache_dir().joinpath('merfish'))
    # check the AWS version and download the files if needed
    version_flag = next(folder_cache.glob('*.version'), None)
    if version_flag is None or version_flag.stem in OLD_VERSIONS:
        _logger.info(f'downloading gene expression data from {aws.S3_BUCKET_IBL} s3 bucket...')
        aws.s3_download_folder('atlas/merfish', folder_cache)
    # it is faster and more memory efficient to read the parquet files with dask, but we do
    # not want to require dask as a dependency, so we provide a pandas fallback
    try:
        import dask.dataframe as dd
        df_cells = dd.read_parquet(list(folder_cache.rglob('*_cells.pqt')))
        df_cells = df_cells.compute()
    except Exception:  # errors can be more subtle than ImportError if dask is only partially installed
        df_cells = pd.concat([pd.read_parquet(f) for f in folder_cache.rglob('*_cells.pqt')])
    # read in the other tables
    df_classes = pd.read_parquet(folder_cache.joinpath('classes.pqt'))
    df_subclasses = pd.read_parquet(folder_cache.joinpath('subclasses.pqt'))
    df_supertypes = pd.read_parquet(folder_cache.joinpath('supertypes.pqt'))
    df_clusters = pd.read_parquet(folder_cache.joinpath('clusters.pqt'))
    df_genes = pd.read_parquet(folder_cache.joinpath('genes.pqt'))
    df_neurotransmitters = pd.read_parquet(folder_cache.joinpath('neurotransmitters.pqt'))
    return df_cells, df_classes, df_subclasses, df_supertypes, df_clusters, df_genes, df_neurotransmitters
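

# Usage sketch (illustrative, not part of the module): load the tables into a local cache and
# inspect a couple of the returned dataframes. The cache path below is hypothetical.
#
#   dfs = load(folder_cache='/tmp/merfish_cache')
#   df_cells, df_classes, df_subclasses, df_supertypes, df_clusters, df_genes, df_neurotransmitters = dfs
#   print(df_cells.shape)      # expected (8_879_868, 11)
#   print(df_classes.head())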
def int2rgb(array, dtype=None):
"""
One liner to convert rgba values stored as integer in dataframes
:param array: rgba column of a dataframe or slice of the column
:param dtype: optional, if int will return the uint8 view from 0-255 else will return floats from 0-1
:return:
"""
if dtype in (int, np.int8):
return np.array(array).view('uint8').reshape(array.shape[0], 4)
else:
return np.array(array).view('uint8').reshape(array.shape[0], 4).astype(float) / 255
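

# Usage sketch (illustrative): convert an integer-packed RGBA column into plotting-friendly colours.
# The column name 'cluster_color' is an assumption about the clusters table and may differ in the
# released parquet files.
#
#   rgba_float = int2rgb(df_clusters['cluster_color'], dtype=float)   # (n, 4) floats in [0, 1]
#   rgba_uint8 = int2rgb(df_clusters['cluster_color'], dtype=int)     # (n, 4) uint8 values in [0, 255]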