Source code for ark.phenotyping.cell_cluster_utils

import os
import warnings

import feather
import numpy as np
import pandas as pd
from alpineer import io_utils, misc_utils


def compute_cell_som_cluster_cols_avg(cell_cluster_data, cell_som_cluster_cols,
                                      cell_cluster_col, keep_count=False):
    """For each cell cluster, compute the average value of all `cell_som_cluster_cols`

    Both argument checks are delegated to `misc_utils.verify_in_list`: `cell_cluster_col`
    has to be one of the two supported cluster columns, and every entry of
    `cell_som_cluster_cols` has to be a column of `cell_cluster_data`.

    Args:
        cell_cluster_data (pandas.DataFrame):
            The cell data with SOM and/or meta labels, created by `cluster_cells` or
            `cell_consensus_cluster`
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_cluster_col (str):
            Name of the cell cluster column to group by,
            should be `'cell_som_cluster'` or `'cell_meta_cluster'`
        keep_count (bool):
            Whether to include the cell counts or not,
            should only be set to `True` for visualization support

    Returns:
        pandas.DataFrame:
            One row per value of `cell_cluster_col` (cast to `numpy.int64`), holding the
            mean of each column in `cell_som_cluster_cols`. If `keep_count` is set, a
            `'count'` column with the number of cells in each cluster is appended.
    """

    # verify the cell cluster col prefix specified is valid
    misc_utils.verify_in_list(
        provided_cluster_col=cell_cluster_col,
        valid_cluster_cols=['cell_som_cluster', 'cell_meta_cluster']
    )

    # verify that the cluster columns are valid
    misc_utils.verify_in_list(
        provided_cluster_col=cell_som_cluster_cols,
        cluster_data_valid_cols=cell_cluster_data.columns.values
    )

    # subset the data by columns used for SOM training, as well as the cell cluster assignments
    cell_cluster_data_subset = cell_cluster_data.loc[
        :, list(cell_som_cluster_cols) + [cell_cluster_col]
    ]

    # a single groupby feeds both the means and the counts, so the two results are
    # guaranteed to describe the same clusters in the same order
    clusters = cell_cluster_data_subset.groupby(cell_cluster_col)
    mean_count_totals = clusters.mean().reset_index()

    # if keep_count is included, add the per-cluster cell count next to the means
    if keep_count:
        mean_count_totals['count'] = clusters.size().values

    mean_count_totals[cell_cluster_col] = mean_count_totals[cell_cluster_col].astype(np.int64)
    return mean_count_totals


def create_c2pc_data(fovs, pixel_data_path, cell_table_path,
                     pixel_cluster_col='pixel_meta_cluster_rename'):
    """Create a matrix with each fov-cell label pair and their SOM pixel/meta cluster counts

    For every fov, `{fov}.feather` is read from `pixel_data_path`, the pixels are counted
    per (`label`, `pixel_cluster_col`) pair, and the counts are attached to the matching
    fov/label row of the cell table. Cells without any counted pixel cluster are dropped,
    and count columns that are zero for every remaining cell are removed with a warning.

    Args:
        fovs (list):
            The list of fovs to subset on
        pixel_data_path (str):
            Path to directory with the pixel data with SOM and meta labels attached.
            Created by `pixel_consensus_cluster`.
        cell_table_path (str):
            Path to the cell table, needs to be created with `Segment_Image_Data.ipynb`.
            Must contain the `fov`, `label` and `cell_size` columns.
        pixel_cluster_col (str):
            The name of the pixel cluster column to count per cell
            Should be `'pixel_som_cluster'` or `'pixel_meta_cluster_rename'`

    Returns:
        tuple:

        - `pandas.DataFrame`: cell x cluster counts of each pixel SOM/meta cluster per each cell
        - `pandas.DataFrame`: same as above, but normalized by `cell_size`
    """

    # verify the pixel_cluster_col provided is valid
    misc_utils.verify_in_list(
        provided_cluster_col=[pixel_cluster_col],
        valid_cluster_cols=['pixel_som_cluster', 'pixel_meta_cluster_rename']
    )

    # read the cell table data
    cell_table = pd.read_csv(cell_table_path)

    # verify that the user has specified fov, label, and cell_size columns in their cell table
    misc_utils.verify_in_list(
        required_cell_table_cols=['fov', 'label', 'cell_size'],
        provided_cell_table_cols=cell_table.columns.values
    )

    # subset on fov, label, and cell size
    # NOTE: take an explicit copy so the label conversion below does not assign into a slice
    cell_table = cell_table[['fov', 'label', 'cell_size']].copy()

    # convert labels to int type
    cell_table['label'] = cell_table['label'].astype(int)

    # subset on only the fovs the user has specified
    cell_table = cell_table[cell_table['fov'].isin(fovs)]

    # prefix shared by every count column: 'pixel_som_cluster_' or 'pixel_meta_cluster_rename_'
    cluster_prefix = '%s_' % pixel_cluster_col

    for fov in fovs:
        # read in the pixel dataset for the fov
        fov_pixel_data = feather.read_dataframe(
            os.path.join(pixel_data_path, fov + '.feather')
        )

        if "segmentation_label" in fov_pixel_data.columns:
            fov_pixel_data = fov_pixel_data.rename(columns={"segmentation_label": "label"})

        # count the pixels per (label, cluster) pair
        # intermediate step for creating a pivot table, makes it easier
        group_by_cluster_col = fov_pixel_data.groupby(
            ['label', pixel_cluster_col]
        ).size().reset_index(name='count')

        # if cluster labels end up as float (can happen with numeric types), convert to int
        if group_by_cluster_col[pixel_cluster_col].dtype == float:
            group_by_cluster_col[pixel_cluster_col] = group_by_cluster_col[
                pixel_cluster_col
            ].astype(int)

        # counts number of pixel SOM/meta clusters per cell
        num_cluster_per_seg_label = group_by_cluster_col.pivot(
            index='label', columns=pixel_cluster_col, values='count'
        ).fillna(0).astype(int)

        # renames the columns to have 'pixel_som_cluster_' or 'pixel_meta_cluster_rename_' prefix
        num_cluster_per_seg_label.columns = [
            cluster_prefix + str(c) for c in num_cluster_per_seg_label.columns
        ]

        # labels of this fov's cells that also appear in the pixel data, kept in cell table
        # order together with their cell table index
        fov_labels = cell_table.loc[cell_table['fov'] == fov, 'label']
        fov_labels = fov_labels[fov_labels.isin(num_cluster_per_seg_label.index)]

        # look the counts up by label in that same order before re-indexing them
        # NOTE: selecting through a set of labels would return rows in set iteration order,
        # which is not guaranteed to match the cell table and could misassign counts
        fov_counts = num_cluster_per_seg_label.loc[fov_labels.values]
        fov_counts.index = fov_labels.index

        # combine the per-cell counts into the matching rows of the cell table
        cell_table = cell_table.combine_first(fov_counts)

    # NaN means the cluster wasn't found in the specified fov-cell pair
    cell_table = cell_table.fillna(0)

    # drop rows from the cell table that don't have any pixel clusters expressed
    count_cols = [c for c in cell_table.columns if cluster_prefix in c]
    cell_table = cell_table[cell_table[count_cols].sum(axis=1) != 0]

    # also produce a cell table with counts normalized by cell_size
    cell_table_norm = cell_table.copy()
    cell_table_norm[count_cols] = cell_table_norm[count_cols].div(cell_table_norm['cell_size'],
                                                                  axis=0)

    # reset the indices of cell_table and cell_table_norm to make things consistent
    cell_table = cell_table.reset_index(drop=True)
    cell_table_norm = cell_table_norm.reset_index(drop=True)

    # find columns that are set to all 0
    cell_zero_cols = list(cell_table_norm[count_cols].columns[
        (cell_table_norm[count_cols] == 0).all()
    ].values)

    # filter out these columns (they will cause normalization to fail)
    if len(cell_zero_cols) > 0:
        warnings.warn('Pixel clusters %s do not appear in any cells, removed from analysis' %
                      ','.join(cell_zero_cols))
        cell_table = cell_table.drop(columns=cell_zero_cols)
        cell_table_norm = cell_table_norm.drop(columns=cell_zero_cols)

    return cell_table, cell_table_norm


def add_consensus_labels_cell_table(base_dir, cell_table_path, cell_som_input_data):
    """Adds the consensus cluster labels to the cell table,
    then saves the result next to the original cell table with a `_cell_labels.csv` suffix
    (the extension of `cell_table_path` is stripped first)

    The cell table is left-merged with `cell_som_input_data` on `fov` and `label`, so every
    cell table row is kept. The `cell_meta_cluster_rename` values are written to a new
    `cell_meta_cluster` column, cells without a match are marked `'Unassigned'`.
    `cell_som_input_data` itself is not modified.

    Args:
        base_dir (str):
            The path to the data directory. Not used by this function,
            kept so existing callers keep working.
        cell_table_path (str):
            Path of the cell table, needs to be created with `Segment_Image_Data.ipynb`
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training. Needs the `fov` and
            `cell_meta_cluster_rename` columns, plus either `label` or `segmentation_label`.
    """

    # file path validation
    io_utils.validate_paths([cell_table_path])

    # read in the cell table
    cell_table = pd.read_csv(cell_table_path)

    # for a simpler merge, rename `segmentation_label` to `label` if it is present
    # NOTE: the rename returns a new frame, the DataFrame passed in by the caller is untouched
    consensus_data = cell_som_input_data.rename(columns={"segmentation_label": "label"})

    # only the merge keys and the meta cluster labels are needed from the consensus data
    # NOTE: leaving the remaining columns (cell_size, cluster counts, ...) out of the merge
    # keeps pandas from suffixing column names the two tables have in common
    consensus_labels = consensus_data[['fov', 'label', 'cell_meta_cluster_rename']]

    # merge the cell table with the consensus data to retrieve the meta clusters
    cell_table_merged = cell_table.merge(
        consensus_labels, how='left', on=['fov', 'label']
    )

    # subset on just the cell table columns plus the meta cluster rename column
    # NOTE: rename cell_meta_cluster_rename to just cell_meta_cluster for simplicity
    cell_table_merged = cell_table_merged[
        list(cell_table.columns.values) + ['cell_meta_cluster_rename']
    ]
    cell_table_merged = cell_table_merged.rename(
        {'cell_meta_cluster_rename': 'cell_meta_cluster'}, axis=1
    )

    # fill any N/A cell_meta_cluster values with 'Unassigned'
    # NOTE: this happens when a cell is so small no pixel clusters are detected inside of them
    cell_table_merged['cell_meta_cluster'] = cell_table_merged['cell_meta_cluster'].fillna(
        'Unassigned'
    )

    # resave cell table with new meta cluster column
    new_cell_table_path = os.path.splitext(cell_table_path)[0] + '_cell_labels.csv'
    cell_table_merged.to_csv(new_cell_table_path, index=False)