# Source code for ark.phenotyping.cell_cluster_utils

import os
import warnings

import feather
import numpy as np
import pandas as pd
from alpineer import io_utils, misc_utils


def compute_cell_som_cluster_cols_avg(cell_cluster_data, cell_som_cluster_cols,
                                      cell_cluster_col, keep_count=False):
    """Average every column in `cell_som_cluster_cols` within each cell cluster

    Args:
        cell_cluster_data (pandas.DataFrame):
            The cell data with SOM and/or meta labels, created by `cluster_cells` or
            `cell_consensus_cluster`
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_cluster_col (str):
            Name of the cell cluster column to group by,
            should be `'cell_som_cluster'` or `'cell_meta_cluster'`
        keep_count (bool):
            Whether to append a `'count'` column holding the number of cells per cluster,
            should only be set to `True` for visualization support

    Returns:
        pandas.DataFrame:
            One row per value of `cell_cluster_col`, holding the cluster id (cast to
            `int64`), the mean of each column in `cell_som_cluster_cols`, and
            optionally the `'count'` column
    """

    # the grouping column has to be one of the two supported cluster columns
    misc_utils.verify_in_list(
        provided_cluster_col=cell_cluster_col,
        valid_cluster_cols=['cell_som_cluster', 'cell_meta_cluster']
    )

    # every requested expression column has to exist in the data
    misc_utils.verify_in_list(
        provided_cluster_col=cell_som_cluster_cols,
        cluster_data_valid_cols=cell_cluster_data.columns.values
    )

    # keep only the training columns plus the cluster assignment, then group once
    selected_cols = list(cell_som_cluster_cols) + [cell_cluster_col]
    clusters = cell_cluster_data.loc[:, selected_cols].groupby(cell_cluster_col)

    # per-cluster mean of each column, with the cluster id back as a regular int column
    cluster_avgs = clusters.mean().reset_index()
    cluster_avgs[cell_cluster_col] = cluster_avgs[cell_cluster_col].astype(np.int64)

    if not keep_count:
        return cluster_avgs

    # both frames come from the same grouping, so after resetting the index the rows
    # line up one-to-one and the counts can be attached directly
    cluster_sizes = clusters.size().to_frame('count').reset_index(drop=True)
    cluster_avgs['count'] = cluster_sizes['count']

    return cluster_avgs
def create_c2pc_data(fovs, pixel_data_path, cell_table_path,
                     pixel_cluster_col='pixel_meta_cluster_rename'):
    """Create a matrix with each fov-cell label pair and their SOM pixel/meta cluster counts

    Args:
        fovs (list):
            The list of fovs to subset on
        pixel_data_path (str):
            Path to directory with the pixel data with SOM and meta labels attached.
            Created by `pixel_consensus_cluster`. One `{fov}.feather` file is read per fov.
        cell_table_path (str):
            Path to the cell table, needs to be created with `Segment_Image_Data.ipynb`.
            Must contain the `fov`, `label`, and `cell_size` columns.
        pixel_cluster_col (str):
            The name of the pixel cluster column to count per cell
            Should be `'pixel_som_cluster'` or `'pixel_meta_cluster_rename'`

    Returns:
        tuple:

        - `pandas.DataFrame`: cell x cluster counts of each pixel SOM/meta cluster per each cell
        - `pandas.DataFrame`: same as above, but normalized by `cell_size`

    Warns:
        UserWarning: if a pixel cluster column is 0 for every remaining cell, in which
            case that column is dropped from both returned tables.
    """

    # verify the pixel_cluster_col provided is valid
    misc_utils.verify_in_list(
        provided_cluster_col=[pixel_cluster_col],
        valid_cluster_cols=['pixel_som_cluster', 'pixel_meta_cluster_rename']
    )

    # read the cell table data
    cell_table = pd.read_csv(cell_table_path)

    # verify that the user has specified fov, label, and cell_size columns in their cell table
    misc_utils.verify_in_list(
        required_cell_table_cols=['fov', 'label', 'cell_size'],
        provided_cell_table_cols=cell_table.columns.values
    )

    # subset on fov, label, and cell size
    # NOTE: explicit copy so the label cast below assigns into an independent frame
    # instead of a slice of the table that was read in
    cell_table = cell_table[['fov', 'label', 'cell_size']].copy()

    # convert labels to int type
    cell_table['label'] = cell_table['label'].astype(int)

    # subset on only the fovs the user has specified
    cell_table = cell_table[cell_table['fov'].isin(fovs)]

    for fov in fovs:
        # read in the pixel dataset for the fov
        fov_pixel_data = feather.read_dataframe(
            os.path.join(pixel_data_path, fov + '.feather')
        )

        if "segmentation_label" in fov_pixel_data.columns:
            fov_pixel_data.rename(columns={"segmentation_label": "label"}, inplace=True)

        # count the pixels for each (label, cluster) pair
        # intermediate step for creating a pivot table, makes it easier
        group_by_cluster_col = fov_pixel_data.groupby(
            ['label', pixel_cluster_col]
        ).size().reset_index(name='count')

        # if cluster labels end up as float (can happen with numeric types), convert to int
        if group_by_cluster_col[pixel_cluster_col].dtype == float:
            group_by_cluster_col[pixel_cluster_col] = group_by_cluster_col[
                pixel_cluster_col
            ].astype(int)

        # counts number of pixel SOM/meta clusters per cell
        num_cluster_per_seg_label = group_by_cluster_col.pivot(
            index='label', columns=pixel_cluster_col, values='count'
        ).fillna(0).astype(int)

        # renames the columns to have 'pixel_som_cluster_' or 'pixel_meta_cluster_rename_' prefix
        new_columns = [
            '%s_' % pixel_cluster_col + str(c) for c in num_cluster_per_seg_label.columns
        ]
        num_cluster_per_seg_label.columns = new_columns

        # labels of this fov's cells that also appear in the pixel counts,
        # still carrying their cell_table row index
        fov_cell_labels = cell_table.loc[cell_table['fov'] == fov, 'label']
        fov_cell_labels = fov_cell_labels[
            fov_cell_labels.isin(num_cluster_per_seg_label.index)
        ]

        # align the counts to the cell table by label value: select the count rows in
        # the same order as the cell table rows, then take over the cell table index.
        # This keeps each cell's counts on its own row regardless of how either table
        # is ordered.
        num_cluster_per_seg_label = num_cluster_per_seg_label.loc[fov_cell_labels.values]
        num_cluster_per_seg_label.index = fov_cell_labels.index

        # combine the data of num_cluster_per_seg_label into cell_table
        cell_table = cell_table.combine_first(num_cluster_per_seg_label)

    # NaN means the cluster wasn't found in the specified fov-cell pair
    cell_table = cell_table.fillna(0)

    # drop rows from the cell table that don't have any pixel clusters expressed
    count_cols = [c for c in cell_table.columns if '%s_' % pixel_cluster_col in c]
    cell_table = cell_table[cell_table[count_cols].sum(axis=1) != 0]

    # also produce a cell table with counts normalized by cell_size
    cell_table_norm = cell_table.copy()
    cell_table_norm[count_cols] = cell_table_norm[count_cols].div(
        cell_table_norm['cell_size'], axis=0
    )

    # reset the indices of cell_table and cell_table_norm to make things consistent
    cell_table = cell_table.reset_index(drop=True)
    cell_table_norm = cell_table_norm.reset_index(drop=True)

    # find columns that are set to all 0
    cell_zero_cols = list(cell_table_norm[count_cols].columns[
        (cell_table_norm[count_cols] == 0).all()
    ].values)

    # filter out these columns (they will cause normalization to fail)
    if len(cell_zero_cols) > 0:
        warnings.warn('Pixel clusters %s do not appear in any cells, removed from analysis' %
                      ','.join(cell_zero_cols))
        cell_table = cell_table.drop(columns=cell_zero_cols)
        cell_table_norm = cell_table_norm.drop(columns=cell_zero_cols)

    return cell_table, cell_table_norm
def add_consensus_labels_cell_table(base_dir, cell_table_path, cell_som_input_data):
    """Adds the consensus cluster labels to the cell table, then resaves the result next to
    the original as `{cell_table_path without extension}_cell_labels.csv`

    Args:
        base_dir (str):
            The path to the data directory (not used by this function, kept so
            existing callers continue to work)
        cell_table_path (str):
            Path of the cell table, needs to be created with `Segment_Image_Data.ipynb`
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training. Needs the `fov` and
            `cell_meta_cluster_rename` columns, plus either `label` or
            `segmentation_label`. It is not modified.
    """

    # file path validation
    io_utils.validate_paths([cell_table_path])

    # read in the cell table
    cell_table = pd.read_csv(cell_table_path)

    # for a simpler merge, use `label` as the key name if the consensus data has
    # `segmentation_label` instead
    # NOTE: rename into a new frame so the caller's DataFrame keeps its column names
    if "segmentation_label" in cell_som_input_data.columns:
        cell_som_input_data = cell_som_input_data.rename(
            columns={"segmentation_label": "label"}
        )

    # only the merge keys and the meta cluster label are needed from the consensus data.
    # NOTE: merging the full frame would suffix every column it shares with the cell table
    # (cell_size, or marker columns for non-pixel cluster inputs) with _x/_y
    cell_cluster_labels = cell_som_input_data[['fov', 'label', 'cell_meta_cluster_rename']]

    # merge the cell table with the consensus data to retrieve the meta clusters
    cell_table_merged = cell_table.merge(
        cell_cluster_labels, how='left', on=['fov', 'label']
    )

    # NOTE: rename cell_meta_cluster_rename to just cell_meta_cluster for simplicity
    cell_table_merged = cell_table_merged.rename(
        {'cell_meta_cluster_rename': 'cell_meta_cluster'}, axis=1
    )

    # fill any N/A cell_meta_cluster values with 'Unassigned'
    # NOTE: this happens when a cell is so small no pixel clusters are detected inside of them
    cell_table_merged['cell_meta_cluster'] = cell_table_merged['cell_meta_cluster'].fillna(
        'Unassigned'
    )

    # resave cell table with new meta cluster column
    new_cell_table_path = os.path.splitext(cell_table_path)[0] + '_cell_labels.csv'
    cell_table_merged.to_csv(new_cell_table_path, index=False)