Source code for ark.phenotyping.cell_meta_clustering
import os
import numpy as np
import pandas as pd
from alpineer import io_utils, misc_utils
from ark.phenotyping import cell_cluster_utils, cluster_helpers
def cell_consensus_cluster(base_dir, cell_som_cluster_cols, cell_som_input_data,
                           cell_som_expr_col_avg_name, max_k=20, cap=3, seed=42, overwrite=False):
    """Consensus-cluster the cell SOM clusters and attach meta labels to the cell data.

    A consensus cluster object is built on the per-SOM-cluster average expression file.
    If `cell_som_input_data` already carries a `cell_meta_cluster` column and `overwrite`
    is not set, the function returns right after building that object; otherwise the data
    is scaled, consensus clustering is run, and the resulting labels are assigned.

    Args:
        base_dir (str):
            The path to the data directory
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_som_input_data (pandas.DataFrame):
            The data used for SOM training with SOM labels attached
        cell_som_expr_col_avg_name (str):
            The name of the file (inside `base_dir`) with the average expression per column
            across cell SOM clusters. Consensus clustering is run on this file.
        max_k (int):
            The number of consensus clusters
        cap (int):
            z-score cap to use when hierarchical clustering
        seed (int):
            The random seed to set for consensus clustering
        overwrite (bool):
            If set, overwrites the meta cluster assignments if they exist

    Returns:
        tuple:

        - cluster_helpers.PixieConsensusCluster: the consensus cluster object containing the
          SOM to meta mapping
        - pandas.DataFrame: the input data used for SOM training with meta labels attached
    """
    # locate the per-SOM-cluster average expression file and make sure it exists
    avg_expr_path = os.path.join(base_dir, cell_som_expr_col_avg_name)
    io_utils.validate_paths([avg_expr_path])

    # a single row is enough to expose the column names for validation
    avg_expr_preview = pd.read_csv(avg_expr_path, nrows=1)
    misc_utils.verify_in_list(
        provided_cluster_cols=cell_som_cluster_cols,
        som_cluster_counts_cols=avg_expr_preview.columns.values
    )

    # the consensus cluster object is built even if we end up returning early,
    # because callers receive it as part of the return value
    consensus = cluster_helpers.PixieConsensusCluster(
        'cell', avg_expr_path, cell_som_cluster_cols, max_k=max_k, cap=cap
    )

    already_labeled = 'cell_meta_cluster' in cell_som_input_data

    # existing labels are kept untouched unless the caller asked for an overwrite
    if already_labeled and not overwrite:
        print("Meta clusters already assigned to each cell")
        return consensus, cell_som_input_data

    # overwrite requested: discard the old labels before assigning fresh ones
    if already_labeled:
        print("Overwrite flag set, reassigning meta cluster labels")
        cell_som_input_data = cell_som_input_data.drop(columns='cell_meta_cluster')

    print("z-score scaling and capping data")
    consensus.scale_data()

    # seed numpy's global RNG right before clustering
    np.random.seed(seed)

    print("Running consensus clustering")
    consensus.run_consensus_clustering()

    print("Mapping cell data to consensus cluster labels")
    consensus.generate_som_to_meta_map()

    # attach the consensus (meta) cluster labels to the cell-level data
    labeled_cell_data = consensus.assign_consensus_labels(cell_som_input_data)

    return consensus, labeled_cell_data
def generate_meta_avg_files(base_dir, cell_cc, cell_som_cluster_cols,
                            cell_som_input_data,
                            cell_som_expr_col_avg_name,
                            cell_meta_expr_col_avg_name, overwrite=False):
    """Computes and saves the average cluster column expression across cell meta clusters.

    Also assigns meta cluster labels to the data stored in `cell_som_expr_col_avg_name`
    and re-saves that file in place.

    Args:
        base_dir (str):
            The path to the data directory
        cell_cc (cluster_helpers.PixieConsensusCluster):
            The consensus cluster object containing the SOM to meta mapping
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training. Must already contain a
            `cell_meta_cluster` column.
        cell_som_expr_col_avg_name (str):
            The name of the file (inside `base_dir`) with the average values of
            `cell_som_cluster_cols` per cell SOM cluster.
        cell_meta_expr_col_avg_name (str):
            Same as above except for cell meta clusters
        overwrite (bool):
            If set, regenerate the averages of `cell_som_cluster_cols` per meta cluster

    Raises:
        ValueError:
            If `cell_som_input_data` does not have a `cell_meta_cluster` column
    """
    # define the paths to the data
    som_expr_col_avg_path = os.path.join(base_dir, cell_som_expr_col_avg_name)
    meta_expr_col_avg_path = os.path.join(base_dir, cell_meta_expr_col_avg_name)

    # check paths
    io_utils.validate_paths([som_expr_col_avg_path])

    # raise error if cell_som_input_data doesn't contain meta labels
    if 'cell_meta_cluster' not in cell_som_input_data.columns.values:
        raise ValueError('cell_som_input_data does not have meta labels assigned')

    # if the column average file for cell meta clusters already exists, skip
    if os.path.exists(meta_expr_col_avg_path):
        if not overwrite:
            print("Already generated average expression file for cell meta clusters, skipping")
            return

        print(
            "Overwrite flag set, regenerating average expression file for cell meta clusters"
        )

    # compute the average value of each expression column per cell meta cluster
    print("Computing the average value of each training column specified per cell meta cluster")
    cell_meta_cluster_avgs = cell_cluster_utils.compute_cell_som_cluster_cols_avg(
        cell_som_input_data,
        cell_som_cluster_cols,
        'cell_meta_cluster',
        keep_count=True
    )

    # save the average expression values of cell_som_cluster_cols per cell meta cluster
    cell_meta_cluster_avgs.to_csv(
        meta_expr_col_avg_path,
        index=False
    )

    print(
        "Mapping meta cluster values onto average expression values across cell SOM clusters"
    )

    # read in the average expression values per cell SOM cluster
    cell_som_cluster_avgs = pd.read_csv(som_expr_col_avg_path)

    # make the join key integer-typed before merging
    cell_som_cluster_avgs['cell_som_cluster'] = cell_som_cluster_avgs['cell_som_cluster'].astype(
        int)

    # this happens if the overwrite flag is set with previously generated data, need to overwrite
    if 'cell_meta_cluster' in cell_som_cluster_avgs.columns.values:
        cell_som_cluster_avgs = cell_som_cluster_avgs.drop(columns='cell_meta_cluster')

    # Merge metacluster assignments in with an exact-key left join. SOM cluster ids are
    # categorical labels, so a nearest-key join (pd.merge_asof) is the wrong tool: it would
    # silently give a SOM cluster missing from the mapping the meta label of the closest
    # lower id, and it raises if either key column is not sorted. With a left join every
    # SOM cluster row is kept, in its original order, and unmapped clusters get NaN.
    cell_som_cluster_avgs = pd.merge(
        cell_som_cluster_avgs, cell_cc.mapping, on='cell_som_cluster', how='left'
    )

    # resave the per-SOM-cluster averages with metacluster assignments
    cell_som_cluster_avgs.to_csv(
        som_expr_col_avg_path,
        index=False
    )
def apply_cell_meta_cluster_remapping(base_dir, cell_som_input_data, cell_remapped_name):
    """Re-label the cell data using a SOM -> meta -> renamed-meta remapping file.

    The `cell_meta_cluster` and `cell_meta_cluster_rename` columns are assigned directly on
    the DataFrame passed in, and that same DataFrame is returned.

    Args:
        base_dir (str):
            The path to the data directory
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training, must contain a `cell_som_cluster` column
        cell_remapped_name (str):
            Name of the file (inside `base_dir`) containing the cell SOM clusters to their
            remapped meta clusters

    Returns:
        pandas.DataFrame:
            The input data used for SOM training with renamed meta labels attached
    """
    # locate and validate the remapping file
    remap_path = os.path.join(base_dir, cell_remapped_name)
    io_utils.validate_paths([remap_path])

    remap_table = pd.read_csv(remap_path)

    # the remapping file must provide all three columns used below
    misc_utils.verify_in_list(
        required_cols=['cell_som_cluster', 'cell_meta_cluster', 'cell_meta_cluster_rename'],
        remapped_data_cols=remap_table.columns.values
    )

    # TODO: som_to_meta and meta_to_rename should be returned
    # to prevent repeat computation in summary file generation functions

    # cell SOM cluster -> cell meta cluster
    som_meta_pairs = remap_table[['cell_som_cluster', 'cell_meta_cluster']].values
    som_to_meta = dict(som_meta_pairs)

    # ensure no duplicated renamed meta clusters make it in
    cluster_helpers.verify_unique_meta_clusters(remap_table, meta_cluster_type="cell")

    # cell meta cluster -> renamed cell meta cluster (one entry per distinct pair)
    meta_rename_table = remap_table[['cell_meta_cluster', 'cell_meta_cluster_rename']]
    meta_to_rename = dict(meta_rename_table.drop_duplicates().values)

    print("Using re-mapping scheme to re-label cell meta clusters")

    # every SOM label present in the data must have an entry in the mapping
    misc_utils.verify_in_list(
        fov_som_labels=cell_som_input_data['cell_som_cluster'],
        som_labels_in_mapping=list(som_to_meta)
    )

    # overwrite the meta cluster labels from the SOM labels
    remapped_meta = cell_som_input_data['cell_som_cluster'].map(som_to_meta)
    cell_som_input_data['cell_meta_cluster'] = remapped_meta

    # derive the renamed meta cluster labels from the freshly assigned meta labels
    renamed_meta = cell_som_input_data['cell_meta_cluster'].map(meta_to_rename)
    cell_som_input_data['cell_meta_cluster_rename'] = renamed_meta

    return cell_som_input_data
def generate_remap_avg_count_files(base_dir, cell_som_input_data,
                                   cell_remapped_name, cell_som_cluster_cols,
                                   cell_som_expr_col_avg_name,
                                   cell_meta_expr_col_avg_name):
    """Propagate the cell meta cluster remapping into the saved average files.

    The per-meta-cluster average file is recomputed from `cell_som_input_data` and
    overwritten; the per-SOM-cluster average file is re-read, given updated
    `cell_meta_cluster` and `cell_meta_cluster_rename` columns, and overwritten.

    Args:
        base_dir (str):
            The path to the data directory
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training
        cell_remapped_name (str):
            Name of the file (inside `base_dir`) containing the cell SOM clusters to their
            remapped meta clusters
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_som_expr_col_avg_name (str):
            The name of the file with the average values of `cell_som_cluster_cols` per cell
            SOM cluster
        cell_meta_expr_col_avg_name (str):
            Same as above except for cell meta clusters
    """
    # resolve all three files; each one must already exist
    remap_path, som_avg_path, meta_avg_path = (
        os.path.join(base_dir, file_name)
        for file_name in (
            cell_remapped_name, cell_som_expr_col_avg_name, cell_meta_expr_col_avg_name
        )
    )
    io_utils.validate_paths([remap_path, som_avg_path, meta_avg_path])

    remap_table = pd.read_csv(remap_path)

    # the remapping file must provide all three columns used below
    misc_utils.verify_in_list(
        required_cols=['cell_som_cluster', 'cell_meta_cluster', 'cell_meta_cluster_rename'],
        remapped_data_cols=remap_table.columns.values
    )

    # cell SOM cluster -> cell meta cluster
    som_to_meta = dict(remap_table[['cell_som_cluster', 'cell_meta_cluster']].values)

    # cell meta cluster -> renamed cell meta cluster (one entry per distinct pair)
    meta_rename_table = remap_table[['cell_meta_cluster', 'cell_meta_cluster_rename']]
    meta_to_rename = dict(meta_rename_table.drop_duplicates().values)

    # recompute the per-meta-cluster averages and tag each row with its renamed label
    print("Re-compute average value of each training column specified per cell meta cluster")
    meta_avgs = cell_cluster_utils.compute_cell_som_cluster_cols_avg(
        cell_som_input_data,
        cell_som_cluster_cols,
        'cell_meta_cluster',
        keep_count=True
    )
    meta_avgs['cell_meta_cluster_rename'] = meta_avgs['cell_meta_cluster'].map(meta_to_rename)

    # overwrite the per-meta-cluster average file
    meta_avgs.to_csv(meta_avg_path, index=False)

    # refresh both meta label columns in the per-SOM-cluster average table
    print("Re-assigning meta cluster column in cell SOM cluster average pixel cluster counts data")
    som_avgs = pd.read_csv(som_avg_path)
    som_avgs['cell_meta_cluster'] = som_avgs['cell_som_cluster'].map(som_to_meta)
    som_avgs['cell_meta_cluster_rename'] = som_avgs['cell_meta_cluster'].map(meta_to_rename)

    # overwrite the per-SOM-cluster average file
    som_avgs.to_csv(som_avg_path, index=False)