# Source code for ark.phenotyping.cell_meta_clustering

import os

import numpy as np
import pandas as pd
from alpineer import io_utils, misc_utils

from ark.phenotyping import cell_cluster_utils, cluster_helpers


def cell_consensus_cluster(base_dir, cell_som_cluster_cols, cell_som_input_data,
                           cell_som_expr_col_avg_name, max_k=20, cap=3, seed=42,
                           overwrite=False):
    """Consensus-cluster the cell SOM clusters and attach meta labels to the cell data.

    Clustering runs on the per-SOM-cluster average expression table stored in
    `cell_som_expr_col_avg_name`. When `cell_som_input_data` already has a
    `cell_meta_cluster` column and `overwrite` is not set, nothing is re-clustered:
    the input data is returned as-is together with a freshly constructed consensus
    object on which none of the scaling / clustering / mapping steps have been run.

    Args:
        base_dir (str):
            The path to the data directory
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_som_input_data (pandas.DataFrame):
            The data used for SOM training with SOM labels attached
        cell_som_expr_col_avg_name (str):
            The name of the file (inside `base_dir`) with the average expression per
            column across cell SOM clusters. Consensus clustering is run on this file.
        max_k (int):
            The number of consensus clusters, forwarded to `PixieConsensusCluster`
        cap (int):
            z-score cap to use when hierarchical clustering, forwarded to
            `PixieConsensusCluster`
        seed (int):
            The random seed set (via `np.random.seed`) before consensus clustering
        overwrite (bool):
            If set, existing meta cluster assignments are dropped and recomputed

    Returns:
        tuple:

        - cluster_helpers.PixieConsensusCluster: the consensus cluster object
          containing the SOM to meta mapping
        - pandas.DataFrame: the input data used for SOM training with meta labels
          attached
    """
    avg_expr_path = os.path.join(base_dir, cell_som_expr_col_avg_name)
    io_utils.validate_paths([avg_expr_path])

    # only the header is needed to check the columns, so read a single row
    avg_expr_head = pd.read_csv(avg_expr_path, nrows=1)
    misc_utils.verify_in_list(
        provided_cluster_cols=cell_som_cluster_cols,
        som_cluster_counts_cols=avg_expr_head.columns.values
    )

    consensus = cluster_helpers.PixieConsensusCluster(
        'cell', avg_expr_path, cell_som_cluster_cols, max_k=max_k, cap=cap
    )

    has_meta_labels = 'cell_meta_cluster' in cell_som_input_data

    # labels already present and the caller did not ask to redo them: nothing to do
    if has_meta_labels and not overwrite:
        print("Meta clusters already assigned to each cell")
        return consensus, cell_som_input_data

    # labels present but overwrite requested: discard the stale assignments first
    if has_meta_labels:
        print("Overwrite flag set, reassigning meta cluster labels")
        cell_som_input_data = cell_som_input_data.drop(columns='cell_meta_cluster')

    print("z-score scaling and capping data")
    consensus.scale_data()

    # seed the global NumPy RNG right before clustering
    np.random.seed(seed)

    print("Running consensus clustering")
    consensus.run_consensus_clustering()

    print("Mapping cell data to consensus cluster labels")
    consensus.generate_som_to_meta_map()

    labeled_data = consensus.assign_consensus_labels(cell_som_input_data)
    return consensus, labeled_data
def generate_meta_avg_files(base_dir, cell_cc, cell_som_cluster_cols, cell_som_input_data,
                            cell_som_expr_col_avg_name, cell_meta_expr_col_avg_name,
                            overwrite=False):
    """Computes and saves the average cluster column expression across cell meta clusters.

    Also assigns meta cluster labels to the per-SOM-cluster averages stored in
    `cell_som_expr_col_avg_name` and rewrites that file in place.

    If the meta cluster average file already exists and `overwrite` is not set, the
    function returns immediately and neither file is touched.

    Args:
        base_dir (str):
            The path to the data directory
        cell_cc (cluster_helpers.PixieConsensusCluster):
            The consensus cluster object containing the SOM to meta mapping
            (read from its `mapping` attribute)
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training. Must already contain a
            `cell_meta_cluster` column.
        cell_som_expr_col_avg_name (str):
            File (inside `base_dir`) with the average values of `cell_som_cluster_cols`
            per cell SOM cluster. Must exist; it is rewritten with a
            `cell_meta_cluster` column merged in.
        cell_meta_expr_col_avg_name (str):
            File (inside `base_dir`) to write the average values of
            `cell_som_cluster_cols` per cell meta cluster to
        overwrite (bool):
            If set, regenerate the averages of `cell_som_cluster_cols` per meta cluster
            even if the output file already exists

    Raises:
        ValueError:
            If `cell_som_input_data` has no `cell_meta_cluster` column
    """
    # define the paths to the data
    som_expr_col_avg_path = os.path.join(base_dir, cell_som_expr_col_avg_name)
    meta_expr_col_avg_path = os.path.join(base_dir, cell_meta_expr_col_avg_name)

    # only the SOM average file has to pre-exist; the meta average file is an output
    io_utils.validate_paths([som_expr_col_avg_path])

    # raise error if cell_som_input_data doesn't contain meta labels
    if 'cell_meta_cluster' not in cell_som_input_data.columns.values:
        raise ValueError('cell_som_input_data does not have meta labels assigned')

    # if the column average file for cell meta clusters already exists, skip
    if os.path.exists(meta_expr_col_avg_path):
        if not overwrite:
            print("Already generated average expression file for cell meta clusters, skipping")
            return

        print(
            "Overwrite flag set, regenerating average expression file for cell meta clusters"
        )

    # compute the average value of each expression column per cell meta cluster
    print("Computing the average value of each training column specified per cell meta cluster")
    cell_meta_cluster_avgs = cell_cluster_utils.compute_cell_som_cluster_cols_avg(
        cell_som_input_data,
        cell_som_cluster_cols,
        'cell_meta_cluster',
        keep_count=True
    )

    # save the average expression values of cell_som_cluster_cols per cell meta cluster
    cell_meta_cluster_avgs.to_csv(meta_expr_col_avg_path, index=False)

    print(
        "Mapping meta cluster values onto average expression values across cell SOM clusters"
    )

    # read in the average expression values per cell SOM cluster
    cell_som_cluster_avgs = pd.read_csv(som_expr_col_avg_path)

    # presumably cast so the join key dtype lines up with cell_cc.mapping
    cell_som_cluster_avgs['cell_som_cluster'] = \
        cell_som_cluster_avgs['cell_som_cluster'].astype(int)

    # this happens if the overwrite flag is set with previously generated data:
    # drop the stale labels so the merge below does not produce suffixed duplicates
    if 'cell_meta_cluster' in cell_som_cluster_avgs.columns.values:
        cell_som_cluster_avgs = cell_som_cluster_avgs.drop(columns='cell_meta_cluster')

    # Attach the meta cluster of each SOM cluster with an exact-key left join.
    # An as-of (nearest lower key) join is wrong here: a SOM cluster missing from the
    # mapping would silently inherit a neighbouring cluster's meta label, and the call
    # would fail outright on unsorted rows. With a left join every SOM cluster row is
    # kept in its original order and an unmapped cluster shows up as a missing value.
    cell_som_cluster_avgs = pd.merge(
        cell_som_cluster_avgs, cell_cc.mapping, on='cell_som_cluster', how='left'
    )

    # resave the per-SOM-cluster averages, now with meta cluster assignments
    cell_som_cluster_avgs.to_csv(som_expr_col_avg_path, index=False)
def apply_cell_meta_cluster_remapping(base_dir, cell_som_input_data, cell_remapped_name):
    """Relabel the meta clusters of the cell data using a saved remapping file.

    The remapping file must provide `cell_som_cluster`, `cell_meta_cluster` and
    `cell_meta_cluster_rename` columns. The `cell_meta_cluster` and
    `cell_meta_cluster_rename` columns of `cell_som_input_data` are (re)assigned
    in place on the DataFrame that was passed in, which is also returned.

    Args:
        base_dir (str):
            The path to the data directory
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training; needs a `cell_som_cluster` column
        cell_remapped_name (str):
            Name of the file (inside `base_dir`) containing the cell SOM clusters to
            their remapped meta clusters

    Returns:
        pandas.DataFrame:
            The input data used for SOM training with renamed meta labels attached
    """
    remap_path = os.path.join(base_dir, cell_remapped_name)
    io_utils.validate_paths([remap_path])

    remap_table = pd.read_csv(remap_path)

    # the remapping file has to carry all three label columns
    misc_utils.verify_in_list(
        required_cols=['cell_som_cluster', 'cell_meta_cluster', 'cell_meta_cluster_rename'],
        remapped_data_cols=remap_table.columns.values
    )

    # cell SOM cluster -> cell meta cluster
    # TODO: generated cell_remapped_dict and cell_renamed_meta_dict should be returned
    # to prevent repeat computation in summary file generation functions
    som_meta_pairs = remap_table[['cell_som_cluster', 'cell_meta_cluster']].values
    som_to_meta = dict(som_meta_pairs)

    # ensure no duplicated renamed meta clusters make it in
    cluster_helpers.verify_unique_meta_clusters(remap_table, meta_cluster_type="cell")

    # cell meta cluster -> renamed cell meta cluster (one pair per meta cluster)
    meta_rename_pairs = remap_table[
        ['cell_meta_cluster', 'cell_meta_cluster_rename']
    ].drop_duplicates().values
    meta_to_rename = dict(meta_rename_pairs)

    print("Using re-mapping scheme to re-label cell meta clusters")

    # every SOM cluster present in the data must have an entry in the mapping
    misc_utils.verify_in_list(
        fov_som_labels=cell_som_input_data['cell_som_cluster'],
        som_labels_in_mapping=list(som_to_meta)
    )

    # first derive the meta cluster from the SOM cluster ...
    remapped_meta = cell_som_input_data['cell_som_cluster'].map(som_to_meta)
    cell_som_input_data['cell_meta_cluster'] = remapped_meta

    # ... then derive the renamed meta cluster from the freshly assigned meta cluster
    renamed_meta = cell_som_input_data['cell_meta_cluster'].map(meta_to_rename)
    cell_som_input_data['cell_meta_cluster_rename'] = renamed_meta

    return cell_som_input_data
def generate_remap_avg_count_files(base_dir, cell_som_input_data, cell_remapped_name,
                                   cell_som_cluster_cols, cell_som_expr_col_avg_name,
                                   cell_meta_expr_col_avg_name):
    """Propagate the cell meta cluster remapping to both average expression files.

    The per-meta-cluster averages are recomputed from `cell_som_input_data` and
    written to `cell_meta_expr_col_avg_name`; the per-SOM-cluster averages in
    `cell_som_expr_col_avg_name` get their `cell_meta_cluster` and
    `cell_meta_cluster_rename` columns reassigned. Both files are overwritten.

    Args:
        base_dir (str):
            The path to the data directory
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training
        cell_remapped_name (str):
            Name of the file (inside `base_dir`) containing the cell SOM clusters to
            their remapped meta clusters
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_som_expr_col_avg_name (str):
            File (inside `base_dir`) with the average values of `cell_som_cluster_cols`
            per cell SOM cluster
        cell_meta_expr_col_avg_name (str):
            Same as above except for cell meta clusters
    """
    remap_path = os.path.join(base_dir, cell_remapped_name)
    som_avg_path = os.path.join(base_dir, cell_som_expr_col_avg_name)
    meta_avg_path = os.path.join(base_dir, cell_meta_expr_col_avg_name)

    # all three files must already exist
    io_utils.validate_paths([remap_path, som_avg_path, meta_avg_path])

    remap_table = pd.read_csv(remap_path)

    # the remapping file has to carry all three label columns
    misc_utils.verify_in_list(
        required_cols=['cell_som_cluster', 'cell_meta_cluster', 'cell_meta_cluster_rename'],
        remapped_data_cols=remap_table.columns.values
    )

    # cell SOM cluster -> cell meta cluster
    som_meta_pairs = remap_table[['cell_som_cluster', 'cell_meta_cluster']].values
    som_to_meta = dict(som_meta_pairs)

    # cell meta cluster -> renamed cell meta cluster (one pair per meta cluster)
    meta_rename_pairs = remap_table[
        ['cell_meta_cluster', 'cell_meta_cluster_rename']
    ].drop_duplicates().values
    meta_to_rename = dict(meta_rename_pairs)

    # meta cluster averages: recompute from the cell data, then tag with renamed labels
    print("Re-compute average value of each training column specified per cell meta cluster")
    meta_avgs = cell_cluster_utils.compute_cell_som_cluster_cols_avg(
        cell_som_input_data, cell_som_cluster_cols, 'cell_meta_cluster', keep_count=True
    )
    meta_avgs['cell_meta_cluster_rename'] = meta_avgs['cell_meta_cluster'].map(meta_to_rename)
    meta_avgs.to_csv(meta_avg_path, index=False)

    # SOM cluster averages: reload from disk and reassign both meta label columns
    print("Re-assigning meta cluster column in cell SOM cluster average pixel cluster counts data")
    som_avgs = pd.read_csv(som_avg_path)
    som_avgs['cell_meta_cluster'] = som_avgs['cell_som_cluster'].map(som_to_meta)
    som_avgs['cell_meta_cluster_rename'] = som_avgs['cell_meta_cluster'].map(meta_to_rename)
    som_avgs.to_csv(som_avg_path, index=False)