Source code for ark.utils.metacluster_remap_gui.file_reader


import pandas as pd
from alpineer import io_utils, misc_utils

from .metaclusterdata import MetaClusterData


def metaclusterdata_from_files(cluster_path, cluster_type='pixel', prefix_trim=None):
    """Read and validate a raw cluster CSV and return an initialized MetaClusterData

    The table is loaded with `pd.read_csv`. If `prefix_trim` is given, it is stripped
    from the start of every column name that begins with it. The columns
    `{cluster_type}_som_cluster`, `{cluster_type}_meta_cluster` and
    `{cluster_type}_meta_cluster_rename` are then renamed to `cluster`, `metacluster`
    and `metacluster_rename` respectively before the table is validated.

    Args:
        cluster_path (str or IO):
            file path or filelike object
        cluster_type (str):
            the type of cluster data to read, needs to be either `'pixel'` or `'cell'`
        prefix_trim (str):
            If set, remove this prefix from each column of the data in `cluster_path`.
            Only a leading match is removed; occurrences elsewhere in a column name
            are left untouched.

    Returns:
        MetaClusterData:
            fully initialized metacluster data

    Raises:
        ValueError:
            If, after renaming, the table lacks a `cluster`, `metacluster` or `count`
            column, if the `cluster` values are not unique, if `1` is not among the
            `cluster` values, or if `0` is among them.
    """

    # assert the path to the data is valid if a string
    # (filelike objects are handed straight to pandas)
    if isinstance(cluster_path, str):
        io_utils.validate_paths(cluster_path)

    # assert the cluster type provided is valid
    misc_utils.verify_in_list(
        provided_cluster_type=[cluster_type],
        valid_cluster_types=['pixel', 'cell']
    )

    # read in the cluster data
    cluster_data = pd.read_csv(cluster_path)

    if prefix_trim is not None:
        # strip only a leading occurrence: str.replace would also delete the text
        # from the middle or end of a column name, which is not a "prefix trim"
        trim_len = len(prefix_trim)
        cluster_data = cluster_data.rename(columns={
            col: col[trim_len:] if col.startswith(prefix_trim) else col
            for col in cluster_data.columns.values
        })

    # TODO: might want to rename and standardize everything in metacluster_remap_gui
    # with {cluster_type}_{som/meta}_cluster, not high priority
    cluster_data = cluster_data.rename(columns={
        '%s_som_cluster' % cluster_type: 'cluster',
        '%s_meta_cluster' % cluster_type: 'metacluster',
        '%s_meta_cluster_rename' % cluster_type: 'metacluster_rename'
    })

    # the required columns are checked in a fixed order so the first missing one
    # is always the one reported
    for required_col in ('cluster', 'metacluster', 'count'):
        if required_col not in cluster_data.columns:
            raise ValueError(
                'Cluster table must include column named "%s"' % required_col
            )

    som_ids = cluster_data['cluster'].values

    if len(set(som_ids)) != len(som_ids):
        raise ValueError("SOM cluster ids must be unique")

    # membership is tested against the integer 1, so a table without an id equal
    # to 1 is rejected here
    if 1 not in som_ids:
        raise ValueError("SOM cluster ids must be int type, starting with 1.")

    if 0 in som_ids:
        raise ValueError("SOM cluster ids start with 1, but a zero was detected.")

    # extract the SOM cluster counts separately
    som_counts = cluster_data[['cluster', 'count']].copy()

    # drop the 'count' column from the cluster_data to produce the averages table
    # NOTE: channel avg for pixel clusters, pixel count avg for cell clusters
    som_expression = cluster_data.drop(columns='count')

    return MetaClusterData(cluster_type, som_expression, som_counts)