# Source code for ark.phenotyping.cell_som_clustering

import os

from alpineer import io_utils, misc_utils

from ark.phenotyping import cell_cluster_utils, cluster_helpers


def train_cell_som(fovs, base_dir, cell_table_path, cell_som_cluster_cols,
                   cell_som_input_data, som_weights_name='cell_som_weights.feather',
                   xdim=10, ydim=10, lr_start=0.05, lr_end=0.01, num_passes=1, seed=42,
                   overwrite=False, normalize=True):
    """Run the SOM training on the expression columns specified in `cell_som_cluster_cols`.

    The weights path `base_dir/som_weights_name` is handed to the `CellSOMCluster` object,
    which is expected to save the trained SOM weights there.

    Args:
        fovs (list):
            The list of fovs to subset on
        base_dir (str):
            The path to the data directories
        cell_table_path (str):
            Path of the cell table, needs to be created with `Segment_Image_Data.ipynb`.
            Only its existence is validated here, the file is not read by this function.
        cell_som_cluster_cols (list):
            The list of columns in `cell_som_input_data` to use for SOM training
        cell_som_input_data (pandas.DataFrame):
            The input data to use for SOM training
        som_weights_name (str):
            The name of the file to save the SOM weights to
        xdim (int):
            The number of x nodes to use for the SOM
        ydim (int):
            The number of y nodes to use for the SOM
        lr_start (float):
            The start learning rate for the SOM, decays to `lr_end`
        lr_end (float):
            The end learning rate for the SOM, decays from `lr_start`
        num_passes (int):
            The number of training passes to make through the dataset
        seed (int):
            The random seed to use for training the SOM
        overwrite (bool):
            If set, force retrains the SOM and overwrites the weights
        normalize (bool):
            Whether to perform 99.9% percentile normalization, default to True.

    Returns:
        cluster_helpers.CellSOMCluster:
            The SOM cluster object containing the cell SOM weights
    """

    # fail early on bad inputs before any SOM object gets built:
    # the cell table must exist and every training column must be in the input data
    io_utils.validate_paths([cell_table_path])
    misc_utils.verify_in_list(
        provided_cluster_cols=cell_som_cluster_cols,
        som_input_cluster_cols=cell_som_input_data.columns.values
    )

    # define where the SOM weights live
    som_weights_path = os.path.join(base_dir, som_weights_name)

    # define the cell SOM cluster object
    cell_pysom = cluster_helpers.CellSOMCluster(
        cell_som_input_data, som_weights_path, fovs, cell_som_cluster_cols,
        num_passes=num_passes, xdim=xdim, ydim=ydim, lr_start=lr_start, lr_end=lr_end,
        seed=seed, normalize=normalize
    )

    # train the SOM weights
    # NOTE: seed has to be set in cyFlowSOM.pyx, done by passing flag in PixieSOMCluster
    print("Training SOM")
    cell_pysom.train_som(overwrite=overwrite)

    return cell_pysom


def cluster_cells(base_dir, cell_pysom, cell_som_cluster_cols, num_parallel_cells=1000000,
                  overwrite=False):
    """Uses trained SOM weights to assign cluster labels on full cell data.

    If `cell_pysom.cell_data` already contains a `cell_som_cluster` column and `overwrite`
    is not set, that data is returned as is and no labels are reassigned.

    Note that a `segmentation_label` column in `cell_pysom.cell_data` is renamed to `label`
    in place, so the rename is visible on the `cell_pysom` object passed in.

    Args:
        base_dir (str):
            The path to the data directory. Not referenced in the body of this function,
            kept for interface compatibility.
        cell_pysom (cluster_helpers.CellSOMCluster):
            The SOM cluster object containing the cell SOM weights
        cell_som_cluster_cols (list):
            The list of columns used for SOM training. Not referenced in the body of this
            function, kept for interface compatibility.
        num_parallel_cells (int):
            How many cells to label in parallel at once
        overwrite (bool):
            If set, overwrites the SOM cluster assignments if they exist

    Returns:
        pandas.DataFrame:
            The cell data in `cell_pysom.cell_data` with SOM labels assigned

    Raises:
        ValueError:
            If `cell_pysom.weights` is `None`, meaning the SOM has not been trained
    """

    # raise error if weights haven't been assigned to cell_pysom
    if cell_pysom.weights is None:
        raise ValueError("Using untrained cell_pysom object, please invoke train_cell_som first")

    # standardize the segmentation id column name, done in place on purpose so that
    # cell_pysom sees the renamed column when it assigns clusters
    if "segmentation_label" in cell_pysom.cell_data.columns:
        cell_pysom.cell_data.rename(columns={"segmentation_label": "label"}, inplace=True)

    # metadata columns that are not part of the SOM weights
    # non-pixel cluster inputs won't be cell size normalized, so cell_size may be absent
    cols_to_drop = ['fov', 'label']
    if 'cell_size' in cell_pysom.cell_data.columns:
        cols_to_drop.append('cell_size')

    if 'cell_som_cluster' in cell_pysom.cell_data.columns:
        # if cell_som_cluster column exists and no overwrite set, return immediately
        if not overwrite:
            print("SOM clusters already assigned to each cell")
            return cell_pysom.cell_data

        print("Overwrite flag set, reassigning SOM cluster labels")
        cols_to_drop.append('cell_som_cluster')

    # the SOM input columns are what remains of the cell data
    # once the metadata columns are removed
    cell_som_input_data = cell_pysom.cell_data.drop(columns=cols_to_drop)

    # every weights column must be present in the input data, this handles the case
    # where the user specifies a subset of columns for generic cell clustering
    # NOTE: CellSOMCluster ensures column ordering by using the preset self.columns as an index
    misc_utils.verify_in_list(
        cell_weights_columns=cell_pysom.weights.columns.values,
        cell_som_input_data_columns=cell_som_input_data.columns.values
    )

    # run the trained SOM on the dataset, assigning clusters
    print("Mapping cell data to SOM cluster labels")
    return cell_pysom.assign_som_clusters(num_parallel_cells)


def generate_som_avg_files(base_dir, cell_som_input_data, cell_som_cluster_cols,
                           cell_som_expr_col_avg_name, overwrite=False):
    """Computes and saves the average expression of all `cell_som_cluster_cols`
    across cell SOM clusters.

    The result is written as a CSV (without the index) to
    `base_dir/cell_som_expr_col_avg_name`. If that file already exists and `overwrite`
    is not set, nothing is computed or written.

    Args:
        base_dir (str):
            The path to the data directory
        cell_som_input_data (pandas.DataFrame):
            The input data used for SOM training with SOM labels attached
        cell_som_cluster_cols (list):
            The list of columns used for SOM training
        cell_som_expr_col_avg_name (str):
            The name of the file to write the average expression per column
            across cell SOM clusters
        overwrite (bool):
            If set, regenerate the averages of `cell_som_cluster_cols` for SOM clusters

    Raises:
        ValueError:
            If `cell_som_input_data` does not contain a `cell_som_cluster` column
    """

    # define the paths to the data
    som_expr_col_avg_path = os.path.join(base_dir, cell_som_expr_col_avg_name)

    # raise error if cell_som_input_data doesn't contain SOM labels
    if 'cell_som_cluster' not in cell_som_input_data.columns:
        raise ValueError('cell_som_input_data does not have SOM labels assigned')

    # if the cell SOM average file already exists and the overwrite flag isn't set, skip
    avg_file_exists = os.path.exists(som_expr_col_avg_path)
    if avg_file_exists and not overwrite:
        print("Already generated average expression file for each cell SOM column, skipping")
        return

    # only announce the overwrite when there is an existing file to replace
    if avg_file_exists:
        print(
            "Overwrite flag set, regenerating average expression file for cell SOM clusters"
        )

    # compute the average column expression values per cell SOM cluster
    print("Computing the average value of each training column specified per cell SOM cluster")
    cell_som_cluster_avgs = cell_cluster_utils.compute_cell_som_cluster_cols_avg(
        cell_som_input_data,
        cell_som_cluster_cols,
        'cell_som_cluster',
        keep_count=True
    )

    # save the average expression values of cell_som_cluster_cols per cell SOM cluster
    cell_som_cluster_avgs.to_csv(som_expr_col_avg_path, index=False)