Source code for ark.segmentation.marker_quantification

import copy
import warnings
from typing import List
import re

import numpy as np
import pandas as pd
import xarray as xr
from alpineer import io_utils, load_utils, misc_utils
from skimage.measure import regionprops, regionprops_table

import ark.settings as settings
from ark.segmentation import segmentation_utils
from ark.segmentation.regionprops_extraction import REGIONPROPS_FUNCTION
from ark.segmentation.signal_extraction import EXTRACTION_FUNCTION


[docs]def get_single_compartment_props(segmentation_labels, regionprops_base,
                                 regionprops_single_comp, **kwargs):
    """Gets regionprops features from the provided segmentation labels for a fov

    Based on segmentation labels from a single compartment

    Args:
        segmentation_labels (numpy.ndarray):
            rows x columns matrix of masks
        regionprops_base (list):
            base morphology features directly computed by regionprops to extract for each cell
        regionprops_single_comp (list):
            list of single compartment extra properties derived from regionprops to compute
        **kwargs:
            Arbitrary keyword arguments for compute_extra_props

    Returns:
        pandas.DataFrame:
            Contains the regionprops info (base and derived) for each labeled cell
    """

    # verify that all the regionprops single compartment featues provided actually exist
    # NOTE: in case fast_extraction set to True in calling function, bypass
    if len(regionprops_single_comp) > 0:
        misc_utils.verify_in_list(
            extras_props=regionprops_single_comp,
            props_options=list(REGIONPROPS_FUNCTION.keys())
        )

    # if image is just background, return empty df
    if len(np.unique(segmentation_labels)) < 2:
        output_list = regionprops_base + regionprops_single_comp
        blank_df = pd.DataFrame(columns=output_list)
        return blank_df

    # get the base features
    cell_props = pd.DataFrame(regionprops_table(segmentation_labels,
                                                properties=regionprops_base))

    # define an empty list for each regionprop feature
    cell_props_single_comp = {re: [] for re in regionprops_single_comp}

    # get regionprop info needed for single compartment computations
    props = regionprops(segmentation_labels)

    # generate the required data for each cell
    for prop in props:
        for re in regionprops_single_comp:
            cell_props_single_comp[re].append(REGIONPROPS_FUNCTION[re](prop, **kwargs))

    # convert the dictionary to a DataFrame
    cell_props_single_comp = pd.DataFrame.from_dict(cell_props_single_comp)

    # append the single compartment derived props info to the cell_props DataFrame
    cell_props = pd.concat([cell_props, cell_props_single_comp], axis=1)

    return cell_props


[docs]def assign_single_compartment_features(marker_counts, compartment, cell_props, cell_coords,
                                       cell_id, label_id, input_images, regionprops_names,
                                       extraction, **kwargs):
    """Assign computed regionprops features and signal intensity to cell_id in marker_counts

    Args:
        marker_counts (xarray.DataArray):
            xarray containing segmentaed data of cells x markers
        compartment (str):
            either 'whole_cell' or 'nuclear'
        cell_props (pandas.DataFrame):
            regionprops information for each cell
        cell_coords (numpy.ndarray):
            values representing pixels within one cell
        cell_id (int):
            id of the cell
        label_id (int):
            id used to index into cell_props
        input_images (xarray.DataArray):
            rows x columns x channels matrix of imaging data
        regionprops_names (list):
            all of the regionprops features (including derived, except nuclear-specific)
        extraction (str):
            the extraction method to use for signal intensity calculation
        **kwargs:
            arbitrary keyword arguments

    Returns:
        xarray.DataArray:
            the updated marker_counts matrix with data for the specified cell_id and compartment
    """

    # get centroid corresponding to current cell
    kwargs['centroid'] = np.array((
        cell_props.loc[cell_props['label'] == label_id, 'centroid-0'].values,
        cell_props.loc[cell_props['label'] == label_id, 'centroid-1'].values
    )).T

    cell_counts = EXTRACTION_FUNCTION[extraction](cell_coords, input_images, **kwargs)

    # get morphology metrics
    # Filter regionprops_names to only those in cell_props.columns
    filtered_regionprops_names = [rp_name for rp_name in regionprops_names if rp_name in
                                  cell_props.columns]
    current_cell_props = cell_props.loc[cell_props['label'] == label_id,
                                        filtered_regionprops_names]

    # combine marker counts and morphology metrics together
    cell_features = np.concatenate((cell_counts, current_cell_props), axis=None)

    # add counts of each marker to appropriate column
    # Only include the marker_count features up to the last filtered feature.
    marker_counts.loc[
        compartment, cell_id, marker_counts.features[1]:filtered_regionprops_names[-1]
    ] = cell_features

    # add cell size to first column
    marker_counts.loc[compartment, cell_id, marker_counts.features[0]] = cell_coords.shape[0]

    return marker_counts


[docs]def assign_multi_compartment_features(marker_counts, regionprops_multi_comp, **kwargs):
    """Assigns features to marker_counts that depend on multiple compartments

    Args:
        marker_counts (xarray.DataArray):
            xarray containing segmentaed data of cells x markers
        regionprops_multi_comp (list):
            list of multi-compartment properties derived from regionprops to compute,
            each value should correspond to a value in REGIONPROPS_FUNCTION
        **kwargs:
            arbitrary keyword arguments

    Returns:
        xarray.DataArray:
            the updated marker_counts matrix with data for the specified cell_id and compartment
    """

    # if no multi regionprops features set, just return the marker counts array as is
    # NOTE: this often happens when fast_extraction set to True in calling function
    if len(regionprops_multi_comp) == 0:
        return marker_counts

    misc_utils.verify_in_list(
        nuclear_props=regionprops_multi_comp,
        props_options=list(REGIONPROPS_FUNCTION.keys())
    )

    for rn in regionprops_multi_comp:
        # if rn is not a feature, then we add a new dimension to hold this regionprop info
        if rn not in marker_counts.features.values:
            rn_fill = np.zeros((marker_counts.shape[0], marker_counts.shape[1], 1))
            rn_arr = xr.DataArray(
                rn_fill,
                coords=[marker_counts.compartments.values, marker_counts.cell_id.values, [rn]],
                dims=['compartments', 'cell_id', 'features']
            )

            # append new dimension to marker_counts
            marker_counts = xr.concat([marker_counts, rn_arr], dim='features')

        # compute the multi-compatment regionprop info
        marker_counts = REGIONPROPS_FUNCTION[rn](marker_counts, **kwargs)

    return marker_counts


[docs]def compute_marker_counts(input_images, segmentation_labels, nuclear_counts=False,
                          regionprops_base=copy.deepcopy(settings.REGIONPROPS_BASE),
                          regionprops_single_comp=copy.deepcopy(settings.REGIONPROPS_SINGLE_COMP),
                          regionprops_multi_comp=copy.deepcopy(settings.REGIONPROPS_MULTI_COMP),
                          split_large_nuclei=False, extraction='total_intensity',
                          fast_extraction=False, **kwargs):
    """Extract single cell protein expression data from channel TIFs for a single fov

    Args:
        input_images (xarray.DataArray):
            rows x columns x channels matrix of imaging data
        segmentation_labels (numpy.ndarray):
            rows x columns x compartment matrix of masks
        nuclear_counts (bool):
            boolean flag to determine whether nuclear counts are returned
        regionprops_base (list):
            base morphology features directly computed by regionprops to extract for each cell
        regionprops_single_comp (list):
            list of single compartment extra properties derived from regionprops to compute
        regionprops_multi_comp (list):
            list of multi compartment extra properties derived from regionprops to compute
        split_large_nuclei (bool):
            controls whether nuclei which have portions outside of the cell will get relabeled
        extraction (str):
            extraction function used to compute marker counts
        fast_extraction (bool):
            if set, skips custom regionprops and expensive base regionprops extraction steps
            regardless of other params set
        **kwargs:
            arbitrary keyword arguments for get_cell_props

    Returns:
        xarray.DataArray:
            xarray containing segmented data of cells x markers
    """

    misc_utils.verify_in_list(
        extraction=extraction,
        extraction_options=list(EXTRACTION_FUNCTION.keys())
    )

    if 'coords' not in regionprops_base:
        regionprops_base.append('coords')

    # labels are required
    if 'label' not in regionprops_base:
        regionprops_base.append('label')

    # centroid is required
    if not any(['centroid' in rpf for rpf in regionprops_base]):
        regionprops_base.append('centroid')

    # set regionprops_base to just POST_CHANNEL_COL, coords, and centroid if skip extraction set
    # no additional base regionprops names or custom regionprops properties will be extracted
    if fast_extraction:
        regionprops_base = [settings.POST_CHANNEL_COL, 'coords', 'centroid']
        regionprops_single_comp = []
        regionprops_multi_comp = []

    # enforce post channel column is present and first
    if regionprops_base[0] != settings.POST_CHANNEL_COL:
        if settings.POST_CHANNEL_COL in regionprops_base:
            regionprops_base.remove(settings.POST_CHANNEL_COL)
        regionprops_base.insert(0, settings.POST_CHANNEL_COL)

    # create variable to hold names of returned columns only
    regionprops_names = copy.copy(regionprops_base)
    regionprops_names.remove('coords')

    # centroid returns two columns, need to modify names
    if np.isin('centroid', regionprops_names):
        regionprops_names.remove('centroid')
        regionprops_names += ['centroid-0', 'centroid-1']

    # add the single compartment features to regionprops_names
    regionprops_names.extend(regionprops_single_comp)

    # get all the cell ids
    unique_cell_ids = np.unique(segmentation_labels[..., 0].values)
    unique_cell_ids = unique_cell_ids[np.nonzero(unique_cell_ids)]

    # set the channel features
    channel_features = input_images.channels

    # create labels for array holding channel counts and morphology metrics
    feature_names = np.concatenate((np.array(settings.PRE_CHANNEL_COL), channel_features,
                                    regionprops_names), axis=None)

    # create np.array to hold compartment x cell x feature info
    marker_counts_array = np.zeros((len(segmentation_labels.compartments), len(unique_cell_ids),
                                    len(feature_names)))

    marker_counts = xr.DataArray(copy.copy(marker_counts_array),
                                 coords=[segmentation_labels.compartments,
                                         unique_cell_ids.astype('int'),
                                         feature_names],
                                 dims=['compartments', 'cell_id', 'features'])

    # get the regionprops kwargs
    reg_props = kwargs.get('regionprops_kwargs', {})

    # get regionprops for each cell
    cell_props = get_single_compartment_props(segmentation_labels.loc[:, :, 'whole_cell'].values,
                                              regionprops_base, regionprops_single_comp,
                                              **reg_props)

    if len(unique_cell_ids) == 0:
        fov_name = str(segmentation_labels.fovs.values)
        warnings.warn("No cells found in the following image: {}".format(fov_name))

    if nuclear_counts:
        nuc_labels = segmentation_labels.loc[:, :, 'nuclear'].values

        if split_large_nuclei:
            cell_labels = segmentation_labels.loc[:, :, 'whole_cell'].values
            nuc_labels = \
                segmentation_utils.split_large_nuclei(cell_segmentation_labels=cell_labels,
                                                      nuc_segmentation_labels=nuc_labels,
                                                      cell_ids=unique_cell_ids)

        nuc_props = get_single_compartment_props(nuc_labels,
                                                 regionprops_base, regionprops_single_comp,
                                                 **reg_props)
        if len(nuc_props) == 0:
            fov_name = str(segmentation_labels.fovs.values)
            warnings.warn("No nuclei found in the following image: {}".format(fov_name))

    # get the signal kwargs
    sig_kwargs = kwargs.get('signal_kwargs', {})

    # loop through each cell in mask
    for cell_id in cell_props['label']:
        # get coords corresponding to current cell.
        cell_coords = cell_props.loc[cell_props['label'] == cell_id, 'coords'].values[0]

        # assign properties for whole cell compartment
        marker_counts = assign_single_compartment_features(
            marker_counts, 'whole_cell', cell_props, cell_coords, cell_id, cell_id,
            input_images, regionprops_names, extraction, **sig_kwargs
        )

        if nuclear_counts:
            # get id of corresponding nucleus
            nuc_id = segmentation_utils.find_nuclear_label_id(nuc_segmentation_labels=nuc_labels,
                                                              cell_coords=cell_coords)

            if nuc_id is not None:
                # get the coords of the corresponding nucleus
                nuc_coords = nuc_props.loc[nuc_props['label'] == nuc_id, 'coords'].values[0]

                # assign properties for nuclear compartment
                marker_counts = assign_single_compartment_features(
                    marker_counts, 'nuclear', nuc_props, nuc_coords, cell_id, nuc_id,
                    input_images, regionprops_names, extraction, **sig_kwargs
                )

                # generate properties which involve multiple compartments
                marker_counts = assign_multi_compartment_features(
                    marker_counts, regionprops_multi_comp
                )

                # if regionprops names does not contain multi_comp props then add them
                if not set(regionprops_multi_comp).issubset(regionprops_names):
                    regionprops_names.extend(regionprops_multi_comp)

    return marker_counts


[docs]def create_marker_count_matrices(segmentation_labels, image_data, nuclear_counts=False,
                                 split_large_nuclei=False, extraction='total_intensity',
                                 fast_extraction=False, **kwargs):
    """Create a matrix of cells by channels with the total counts of each marker in each cell.

    Args:
        segmentation_labels (xarray.DataArray):
            xarray of shape [fovs, rows, cols, compartment] containing segmentation masks for one
            FOV, potentially across multiple cell compartments
        image_data (xarray.DataArray):
            xarray containing all of the channel data across one FOV
        nuclear_counts (bool):
            boolean flag to determine whether nuclear counts are returned, note that if
            set to True, the compartments coordinate in segmentation_labels must contain 'nuclear'
        split_large_nuclei (bool):
            boolean flag to determine whether nuclei which are larger than their assigned cell
            will get split into two different nuclear objects
        extraction (str):
            extraction function used to compute marker counts.
        fast_extraction (bool):
            if set, skips the custom regionprops and expensive base regionprops extraction steps
        **kwargs:
            arbitrary keyword args for compute_marker_counts

    Returns:
        tuple (pandas.DataFrame, pandas.DataFrame):
        - marker counts per cell normalized by cell size
        - arcsinh transformation of the above
    """

    if type(segmentation_labels) is not xr.DataArray:
        raise ValueError("Incorrect data type for segmentation_labels, expecting xarray")

    if type(image_data) is not xr.DataArray:
        raise ValueError("Incorrect data type for image_data, expecting xarray")

    if nuclear_counts:
        misc_utils.verify_in_list(
            nuclear_label='nuclear',
            compartment_names=segmentation_labels.compartments.values
        )

    misc_utils.verify_in_list(
        extraction=extraction,
        extraction_options=list(EXTRACTION_FUNCTION.keys())
    )

    misc_utils.verify_same_elements(segmentation_labels_fovs=segmentation_labels.fovs.values,
                                    img_data_fovs=image_data.fovs.values)

    # define the FOV associated with this segmentation label
    fov = segmentation_labels.fovs.values[0]

    # current mask
    label = segmentation_labels.loc[fov, :, :, :]

    # extract the counts per cell for each marker
    marker_counts = compute_marker_counts(image_data.loc[fov, :, :, :], label,
                                          nuclear_counts=nuclear_counts,
                                          split_large_nuclei=split_large_nuclei,
                                          extraction=extraction,
                                          fast_extraction=fast_extraction, **kwargs)

    # normalize counts by cell size
    marker_counts_norm = segmentation_utils.transform_expression_matrix(marker_counts,
                                                                        transform='size_norm')

    # arcsinh transform the data
    marker_counts_arcsinh = segmentation_utils.transform_expression_matrix(marker_counts_norm,
                                                                           transform='arcsinh')

    # add data from each fov to array
    normalized = pd.DataFrame(data=marker_counts_norm.loc['whole_cell', :, :].values,
                              columns=marker_counts_norm.features)

    arcsinh = pd.DataFrame(data=marker_counts_arcsinh.values[0, :, :],
                           columns=marker_counts_arcsinh.features)

    normalized[settings.CELL_LABEL] = normalized[settings.CELL_LABEL].astype(np.int32)
    arcsinh[settings.CELL_LABEL] = arcsinh[settings.CELL_LABEL].astype(np.int32)

    if nuclear_counts:
        # append nuclear counts pandas array with modified column name
        nuc_column_names = [feature + '_nuclear' for feature in marker_counts.features.values]

        # add nuclear counts to size normalized data
        normalized_nuc = pd.DataFrame(data=marker_counts_norm.loc['nuclear', :, :].values,
                                      columns=nuc_column_names)
        normalized = pd.concat((normalized, normalized_nuc), axis=1)

        # add nuclear counts to arcsinh transformed data
        arcsinh_nuc = pd.DataFrame(data=marker_counts_arcsinh.loc['nuclear', :, :].values,
                                   columns=nuc_column_names)
        arcsinh = pd.concat((arcsinh, arcsinh_nuc), axis=1)

    # add column for current fov
    normalized['fov'] = fov
    arcsinh['fov'] = fov

    return normalized, arcsinh


[docs]def generate_cell_table(segmentation_dir, tiff_dir, img_sub_folder="TIFs",
                        is_mibitiff=False, fovs=None, extraction='total_intensity',
                        nuclear_counts=False, fast_extraction=False, mask_types=['whole_cell'],
                        add_underscore=True, **kwargs):
    """This function takes the segmented data and computes the expression matrices batch-wise
    while also validating inputs

    Args:
        segmentation_dir (str):
            the path to the directory containing the segmentation labels generated by Mesmer
        tiff_dir (str):
            the name of the directory which contains the single_channel_inputs
        img_sub_folder (str):
            the name of the folder where the TIF images are located
            ignored if is_mibitiff is True
        fovs (list):
            a list of fovs we wish to analyze, if None will default to all fovs
        is_mibitiff (bool):
            a flag to indicate whether or not the base images are MIBItiffs
        extraction (str):
            extraction function used to compute marker counts
        nuclear_counts (bool):
            boolean flag to determine whether nuclear counts are returned, note that if
            set to True, the compartments coordinate in segmentation_labels must contain 'nuclear'
        fast_extraction (bool):
            if set, skips the custom regionprops and expensive base regionprops extraction steps
        mask_types (list):
            list of masks to extract data for, defaults to ['whole_cell']
        add_underscore (str):
            whether to add '_' before the mask type suffix in file names, defaults to True
        **kwargs:
            arbitrary keyword arguments for signal and regionprops extraction

    Returns:
        tuple (pandas.DataFrame, pandas.DataFrame):
        - size normalized data
        - arcsinh transformed data
    """

    # if no fovs are specified, then load all the fovs
    if fovs is None:
        if is_mibitiff:
            fovs = io_utils.list_files(tiff_dir, substrs=[".tif", ".tiff"])
        else:
            fovs = io_utils.list_folders(tiff_dir)

    # drop file extensions
    fovs = io_utils.remove_file_extensions(fovs)

    misc_utils.verify_in_list(
        extraction=extraction,
        extraction_options=list(EXTRACTION_FUNCTION.keys())
    )

    # get full filenames from given fovs
    # TODO: deprecate filenames support as part of MIBItiff phasing out
    filenames = io_utils.list_files(tiff_dir, substrs=fovs, exact_match=True)

    # sort the fovs
    fovs.sort()
    filenames.sort()

    # define number of FOVs for batch processing
    cohort_len = len(fovs)

    # create the final dfs to store the processed data
    normalized_tables = []
    arcsinh_tables = []

    for fov_index, fov_name in enumerate(fovs):
        if is_mibitiff:
            image_data = load_utils.load_imgs_from_mibitiff(data_dir=tiff_dir,
                                                            mibitiff_files=[filenames[fov_index]])
        else:
            image_data = load_utils.load_imgs_from_tree(data_dir=tiff_dir,
                                                        img_sub_folder=img_sub_folder,
                                                        fovs=[fov_name])

        for mask_type in mask_types:
            if mask_type is None:
                mask_type, mask_suff = 'cell_mask', None
            else:
                mask_suff = '_' + mask_type if add_underscore else mask_type
            # load the segmentation labels in
            fov_mask_name = fov_name + mask_suff + ".tiff" if mask_suff else fov_name + ".tiff"
            current_labels_cell = load_utils.load_imgs_from_dir(data_dir=segmentation_dir,
                                                                files=[fov_mask_name],
                                                                xr_dim_name='compartments',
                                                                xr_channel_names=[mask_type],
                                                                trim_suffix=mask_suff)

            compartments = ['whole_cell']
            segmentation_labels = current_labels_cell.values

            if nuclear_counts and mask_type == 'whole_cell':
                nuclear_file = fov_name + '_nuclear.tiff'
                current_labels_nuc = load_utils.load_imgs_from_dir(data_dir=segmentation_dir,
                                                                   files=[nuclear_file],
                                                                   xr_dim_name='compartments',
                                                                   xr_channel_names=['nuclear'],
                                                                   trim_suffix='_nuclear')
                compartments = ['whole_cell', 'nuclear']
                segmentation_labels = np.concatenate((current_labels_cell.values,
                                                      current_labels_nuc.values), axis=-1)

            current_labels = xr.DataArray(segmentation_labels,
                                          coords=[current_labels_cell.fovs,
                                                  current_labels_cell.rows,
                                                  current_labels_cell.cols,
                                                  compartments],
                                          dims=current_labels_cell.dims)

            # segment the imaging data
            cell_table_size_normalized, cell_table_arcsinh_transformed = create_marker_count_matrices(
                segmentation_labels=current_labels,
                image_data=image_data,
                extraction=extraction,
                nuclear_counts=nuclear_counts,
                fast_extraction=fast_extraction,
                **kwargs
            )

            # add mask type column to the data frame
            if mask_type == "final_cells_remaining":
                mask_type_str = "whole_cell"
            else:
                mask_type_str = mask_type
            cell_table_size_normalized['mask_type'] = mask_type_str
            cell_table_arcsinh_transformed['mask_type'] = mask_type_str

            # add to larger dataframe
            normalized_tables.append(cell_table_size_normalized)
            arcsinh_tables.append(cell_table_arcsinh_transformed)

    # now append to the final dfs to return
    combined_cell_table_size_normalized = pd.concat(normalized_tables)
    combined_cell_table_arcsinh_transformed = pd.concat(arcsinh_tables)

    return combined_cell_table_size_normalized, combined_cell_table_arcsinh_transformed


[docs]def get_existing_mask_types(fov_names: List[str], mask_names: List[str]) -> List[str]:
    """ Function to strip prefixes from list: fov_names, strip '.tiff' suffix from list: mask names,
    and remove underscore prefixes, returning unique mask values (i.e. categories of masks).

    Args:
        fov_names (List[str]): list of fov names. Matching fov names in mask names will be returned without fov prefix.
        mask_names (List[str]): list of mask names. Mask names will be returned without tiff suffix.
        
    Returns:
            List[str]: Unique mask names (i.e. categories of masks)
    """
    stripped_mask_names = io_utils.remove_file_extensions(mask_names)

    # break fov names into tokens, compare against mask names and return only those mask names that also contain the fov
    result = []
    for prefix in fov_names:
        prefix_tokens = list(filter(bool, re.split("[^a-zA-Z0-9]", prefix)))
        for itemB in stripped_mask_names:
            itemB_tokens = list(filter(bool, re.split("[^a-zA-Z0-9]", itemB)))
            if set(prefix_tokens).issubset(itemB_tokens):
                result.append(itemB[len(prefix):])

    # Remove underscore prefixes and return unique values
    cleaned_result = [item.lstrip('_') for item in result]
    unique_result = list(set(cleaned_result))
    return unique_result