Source code for one.tests.alf.test_cache

"""Unit tests for the one.alf.cache module."""
import unittest
import tempfile
from pathlib import Path
import shutil
import datetime

import pandas as pd
from pandas.testing import assert_frame_equal

from iblutil.io import parquet
import one.alf.cache as apt
from one.tests.util import revisions_datasets_table


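# The fixtures below build a minimal ALF directory tree of the form
#   <tmpdir>/mylab/Subjects/mysub/2021-02-28/001/alf/spikes.*.npy
# together with a sibling '002' session containing an invalid dataset,
# then exercise the cache-table builders in one.alf.cache against it.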
class TestsONEParquet(unittest.TestCase):
    """Tests for the make_parquet_db function and its helpers."""
    rel_ses_path = 'mylab/Subjects/mysub/2021-02-28/001/'
    ses_info = {
        'id': 'mylab/mysub/2021-02-28/001',
        'lab': 'mylab',
        'subject': 'mysub',
        'date': datetime.date.fromisoformat('2021-02-28'),
        'number': int('001'),
        'projects': '',
        'task_protocol': '',
    }
    rel_ses_files = [Path('alf/spikes.clusters.npy'), Path('alf/spikes.times.npy')]
    def setUp(self) -> None:
        pd.set_option('display.max_columns', 12)
        # root path:
        self.tmpdir = Path(tempfile.gettempdir()) / 'pqttest'
        self.tmpdir.mkdir(exist_ok=True)
        # full session path:
        self.full_ses_path = self.tmpdir / self.rel_ses_path
        (self.full_ses_path / 'alf').mkdir(exist_ok=True, parents=True)
        self.file_path = self.full_ses_path / 'alf/spikes.times.npy'
        self.file_path.write_text('mock')
        sc = self.full_ses_path / 'alf/spikes.clusters.npy'
        sc.write_text('mock2')
        # Create a second session containing an invalid dataset
        second_session = self.tmpdir.joinpath(self.rel_ses_path).parent.joinpath('002')
        second_session.mkdir()
        second_session.joinpath('trials.intervals.npy').touch()
        second_session.joinpath('.invalid').touch()
    def test_parse(self):
        self.assertEqual(apt._get_session_info(self.rel_ses_path),
                         tuple(self.ses_info.values()))
        self.assertTrue(
            self.full_ses_path.as_posix().endswith(self.rel_ses_path[:-1]))
    def test_parquet(self):
        # Test data
        columns = ('colA', 'colB')
        rows = [('a1', 'b1'), ('a2', 'b2')]
        metadata = apt._metadata('dbname')
        filename = self.tmpdir.resolve() / 'mypqt.pqt'

        # Save parquet file
        df = pd.DataFrame(rows, columns=columns)
        parquet.save(filename, df, metadata=metadata)

        # Load parquet file
        df2, metadata2 = parquet.load(filename)
        assert_frame_equal(df, df2)
        self.assertEqual(metadata, metadata2)
    def test_sessions_df(self):
        df = apt._make_sessions_df(self.tmpdir)
        print('Sessions dataframe')
        print(df)
        self.assertEqual(df.loc[0].to_dict(), self.ses_info)
    def test_datasets_df(self):
        df = apt._make_datasets_df(self.tmpdir)
        print('Datasets dataframe')
        print(df)
        dset_info = df.loc[0].to_dict()
        self.assertEqual(dset_info['rel_path'], self.rel_ses_files[0].as_posix())
        self.assertGreater(dset_info['file_size'], 0)
        self.assertFalse(df.rel_path.str.contains('invalid').any())
    def tests_db(self):
        fn_ses, fn_dsets = apt.make_parquet_db(self.tmpdir, hash_ids=False)
        metadata_exp = apt._metadata(self.tmpdir.resolve())
        df_ses, metadata = parquet.load(fn_ses)

        # Check sessions dataframe
        self.assertEqual(metadata, metadata_exp)
        self.assertEqual(df_ses.loc[0].to_dict(), self.ses_info)

        # Check datasets dataframe
        df_dsets, metadata2 = parquet.load(fn_dsets)
        self.assertEqual(metadata2, metadata_exp)
        dset_info = df_dsets.loc[0].to_dict()
        self.assertEqual(dset_info['rel_path'], self.rel_ses_files[0].as_posix())

        # Check behaviour when no files found
        with tempfile.TemporaryDirectory() as tdir:
            with self.assertWarns(RuntimeWarning):
                fn_ses, fn_dsets = apt.make_parquet_db(tdir, hash_ids=False)
            self.assertTrue(parquet.load(fn_ses)[0].empty)
            self.assertTrue(parquet.load(fn_dsets)[0].empty)

        # Check labname arg
        with self.assertRaises(AssertionError):
            apt.make_parquet_db(self.tmpdir, hash_ids=False, lab='another')

        # Create some more datasets in a session folder outside of a lab directory
        with tempfile.TemporaryDirectory() as tdir:
            session_path = Path(tdir).joinpath('subject', '1900-01-01', '001')
            _ = revisions_datasets_table(touch_path=session_path)  # create some files
            fn_ses, _ = apt.make_parquet_db(tdir, hash_ids=False, lab='another')
            df_ses, _ = parquet.load(fn_ses)
            self.assertTrue((df_ses['lab'] == 'another').all())
    def test_hash_ids(self):
        # Build and load caches with int UUIDs
        (ses, _), (dsets, _) = map(
            parquet.load, apt.make_parquet_db(self.tmpdir, hash_ids=True))
        # Check ID fields in both dataframes
        self.assertTrue(ses.index.nlevels == 1 and ses.index.name == 'id')
        self.assertTrue(dsets.index.nlevels == 2 and tuple(dsets.index.names) == ('eid', 'id'))
    def test_remove_missing_datasets(self):
        # Add a session that contains only missing datasets
        ghost_session = self.tmpdir.joinpath('lab', 'Subjects', 'sub', '2021-01-30', '001')
        ghost_session.mkdir(parents=True)
        tables = {
            'sessions': apt._make_sessions_df(self.tmpdir),
            'datasets': apt._make_datasets_df(self.tmpdir)
        }
        # Touch some files and folders for deletion
        empty_missing_session = self.tmpdir.joinpath(self.rel_ses_path).parent.joinpath('003')
        empty_missing_session.mkdir()
        missing_dataset = self.tmpdir.joinpath(self.rel_ses_path).joinpath('foo.bar.npy')
        missing_dataset.touch()
        ghost_dataset = ghost_session.joinpath('foo.bar.npy')
        ghost_dataset.touch()

        # Test dry run
        to_remove = apt.remove_missing_datasets(
            self.tmpdir, tables=tables, dry=True, remove_empty_sessions=False
        )
        self.assertTrue(all(map(Path.exists, to_remove)), 'Removed files during dry run!')
        self.assertTrue(all(map(Path.is_file, to_remove)), 'Failed to ignore empty folders')
        self.assertNotIn(empty_missing_session, to_remove, 'Failed to ignore empty folders')
        self.assertNotIn(next(self.tmpdir.rglob('.invalid')), to_remove, 'Removed non-ALF file')

        # Test removal of files and folders
        removed = apt.remove_missing_datasets(
            self.tmpdir, tables=tables, dry=False, remove_empty_sessions=True
        )
        self.assertEqual(sum(map(Path.exists, to_remove)), 0, 'Failed to remove all files')
        self.assertIn(empty_missing_session, removed, 'Failed to remove empty session folder')
        self.assertIn(missing_dataset, removed, 'Failed to remove missing dataset')
        self.assertIn(ghost_dataset, removed, 'Failed to remove missing dataset')
        self.assertNotIn(ghost_session, removed, 'Removed empty session that was in session table')

        # Check without tables input
        apt.make_parquet_db(self.tmpdir, hash_ids=False)
        removed = apt.remove_missing_datasets(self.tmpdir, dry=False)
        self.assertEqual(len(removed), 0)
    def tearDown(self) -> None:
        shutil.rmtree(self.tmpdir)
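
# A minimal sketch of the workflow these tests exercise (the root path below
# is hypothetical; make_parquet_db returns the paths of the two cache tables
# it writes, which the tests then read back with iblutil's parquet.load):
#
#   from one.alf.cache import make_parquet_db
#   fn_ses, fn_dsets = make_parquet_db('/path/to/data', hash_ids=True)
#   sessions, metadata = parquet.load(fn_ses)
#   datasets, _ = parquet.load(fn_dsets)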
if __name__ == '__main__':
    unittest.main(exit=False)