Source code for ark.phenotyping.pixel_cluster_utils

import os
import random
import warnings
from typing import List

import feather
import numpy as np
import pandas as pd
import scipy.ndimage as ndimage
from alpineer import image_utils, io_utils, load_utils, misc_utils
from pyarrow.lib import ArrowInvalid
from skimage.io import imread
import natsort as ns


[docs]def calculate_channel_percentiles(tiff_dir, fovs, channels, img_sub_folder,
                                  percentile):
    """Calculates average percentile for each channel in the dataset

    Args:
        tiff_dir (str):
            Name of the directory containing the tiff files
        fovs (list):
            List of fovs to include
        channels (list):
            List of channels to include
        img_sub_folder (str):
            Sub folder within each FOV containing image data
        percentile (float):
            The specific percentile to compute

    Returns:
        pd.DataFrame:
            The mapping between each channel and its normalization value
    """

    # create list to hold percentiles
    percentile_means = []

    # loop over channels and FOVs
    for channel in channels:
        percentile_list = []
        for fov in fovs:
            # load image data and remove 0 valued pixels
            img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder,
                                                 channels=[channel], fovs=[fov]).values[0, :, :, 0]
            img = img[img > 0]

            # record and store percentile, skip if no non-zero pixels
            if len(img) > 0:
                img_percentile = np.quantile(img, percentile)
                percentile_list.append(img_percentile)

        # save channel-wide average
        percentile_means.append(np.mean(percentile_list))

    percentile_df = pd.DataFrame(np.expand_dims(percentile_means, axis=0), columns=channels)
    percentile_df.sort_index(axis="columns", key=ns.natsort_key, inplace=True)

    return percentile_df


[docs]def calculate_pixel_intensity_percentile(tiff_dir, fovs, channels, img_sub_folder,
                                         channel_percentiles, percentile=0.05):
    """Calculates average percentile per FOV for total signal in each pixel

    Args:
        tiff_dir (str):
            Name of the directory containing the tiff files
        fovs (list):
            List of fovs to include
        channels (list):
            List of channels to include
        img_sub_folder (str):
            Sub folder within each FOV containing image data
        channel_percentiles (pd.DataFrame):
            The mapping between each channel and its normalization value
            Computed by `calculate_channel_percentiles`
        percentile (float):
            The pixel intensity percentile per FOV to average over


    Returns:
        float:
            The average percentile per FOV for total signal in each pixel
    """

    # create vector of channel percentiles to enable broadcasting
    norm_vect = channel_percentiles.iloc[0].values
    norm_vect = norm_vect.reshape([1, 1, len(norm_vect)])

    intensity_percentile_list = []

    for fov in fovs:
        # load image data
        img_data = load_utils.load_imgs_from_tree(data_dir=tiff_dir, fovs=[fov],
                                                  channels=channels, img_sub_folder=img_sub_folder)

        # normalize each channel by its percentile value
        norm_data = img_data[0].values / norm_vect

        # sum channels together to determine total intensity
        summed_data = np.sum(norm_data, axis=-1)
        intensity_percentile_list.append(np.quantile(summed_data, percentile))

    return np.mean(intensity_percentile_list)


[docs]def normalize_rows(pixel_data, channels, include_seg_label=True):
    """Normalizes the rows of a pixel matrix by their sum

    Args:
        pixel_data (pandas.DataFrame):
            The dataframe containing the pixel data for a given fov
            Includes channel and meta (`fov`, `label`, etc.) columns
        channels (list):
            List of channels to subset over
        include_seg_label (bool):
            Whether to include `'label'` as a metadata column

    Returns:
        pandas.DataFrame:
            The pixel data with rows normalized and 0-sum rows removed
    """

    # subset the fov data by the channels the user trained the pixel SOM on
    pixel_data_sub = pixel_data[channels]

    # divide each row by their sum
    pixel_data_sub = pixel_data_sub.div(pixel_data_sub.sum(axis=1), axis=0)

    # define the meta columns to add back
    meta_cols = ['fov', 'row_index', 'column_index']

    # add the label column if it should be kept
    if include_seg_label:
        meta_cols.append('label')

    # add back meta columns, making sure to remove 0-row indices
    pixel_data_sub[meta_cols] = pixel_data.loc[pixel_data_sub.index.values, meta_cols]

    return pixel_data_sub


[docs]def check_for_modified_channels(tiff_dir, test_fov, img_sub_folder, channels):
    """Checks to make sure the user selected newly modified channels

    Args:
        tiff_dir (str):
            Name of the directory containing the tiff files
        test_fov (str):
            example fov used to check channel names
        img_sub_folder (str):
            sub-folder within each FOV containing image data
        channels (list):
            list of channels to use for analysis
    """

    # convert to path-compatible format
    if img_sub_folder is None:
        img_sub_folder = ''

    # get all channels within example FOV
    all_channels = io_utils.list_files(os.path.join(tiff_dir, test_fov, img_sub_folder))
    all_channels = io_utils.remove_file_extensions(all_channels)
    # define potential modifications to channel names
    mods = ['_smoothed', '_nuc_include', '_nuc_exclude']

    # loop over each user-provided channel
    for channel in channels:
        for mod in mods:
            # check for substring matching
            chan_mod = channel + mod
            if chan_mod in all_channels:
                warnings.warn('You selected {} as the channel to analyze, but there were potential'
                              ' modified channels found: {}. Make sure you selected the correct '
                              'version of the channel for inclusion in '
                              'clustering'.format(channel, chan_mod))
            else:
                pass


[docs]def smooth_channels(fovs, tiff_dir, img_sub_folder, channels, smooth_vals):
    """Adds additional smoothing for selected channels as a preprocessing step

    Args:
        fovs (list):
            List of fovs to process
        tiff_dir (str):
            Name of the directory containing the tiff files
        img_sub_folder (str):
            sub-folder within each FOV containing image data
        channels (list):
            list of channels to apply smoothing to
        smooth_vals (list or int):
            amount to smooth channels. If a single int, applies
            to all channels. Otherwise, a custom value per channel can be supplied

    """

    # no output if no channels specified
    if channels is None or len(channels) == 0:
        return

    # convert to path-compatible format
    if img_sub_folder is None:
        img_sub_folder = ''

    # convert int to list of same length
    if type(smooth_vals) is int:
        smooth_vals = [smooth_vals for _ in range(len(channels))]
    elif type(smooth_vals) is list:
        if len(smooth_vals) != len(channels):
            raise ValueError("A list was provided for variable smooth_vals, but it does not "
                             "have the same length as the list of channels provided")
    else:
        raise ValueError("Variable smooth_vals must be either a single integer or a list")

    for fov in fovs:
        for idx, chan in enumerate(channels):
            img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder,
                                                 fovs=[fov], channels=[chan]).values[0, :, :, 0]
            chan_out = ndimage.gaussian_filter(img, sigma=smooth_vals[idx])
            image_utils.save_image(
                os.path.join(tiff_dir, fov, img_sub_folder, chan + '_smoothed.tiff'),
                chan_out
            )


[docs]def filter_with_nuclear_mask(fovs: List, tiff_dir: str, seg_dir: str, channel: str,
                             nuc_seg_suffix: str = "_nuclear.tiff", img_sub_folder: str = None,
                             exclude: bool = True):
    """Filters out background staining using subcellular marker localization.

    Non-nuclear signal is removed from nuclear markers and vice-versa for membrane markers.

    Args:
        fovs (list):
            The list of fovs to filter
        tiff_dir (str):
            Name of the directory containing the tiff files
        seg_dir (str):
            Name of the directory containing the segmented files
        channel (str):
            Channel to apply filtering to
        nuc_seg_suffix (str):
            The suffix for the nuclear channel.
            (i.e. for "fov1", a suffix of "_nuclear.tiff" would make a file named
            "fov1_nuclear.tiff")
        img_sub_folder (str):
            Name of the subdirectory inside `tiff_dir` containing the tiff files.
            Set to `None` if there isn't any.
        exclude (bool):
            Whether to filter out nuclear or membrane signal
    """

    # if seg_dir is None, the user cannot run filtering
    if seg_dir is None:
        print('No seg_dir provided, you must provide one to run nuclear filtering')
        return

    # raise an error if the provided seg_dir does not exist
    io_utils.validate_paths(seg_dir)

    # convert to path-compatible format
    if img_sub_folder is None:
        img_sub_folder = ''

    for fov in fovs:
        # load the channel image in
        img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder,
                                             fovs=[fov], channels=[channel]).values[0, :, :, 0]

        # load the segmented image in
        seg_img_name: str = f"{fov}{nuc_seg_suffix}"
        seg_img = imread(os.path.join(seg_dir, seg_img_name))[0, ...]

        # mask out the nucleus
        if exclude:
            suffix = "_nuc_exclude.tiff"
            seg_mask = seg_img > 0
        # mask out the membrane
        else:
            suffix = "_nuc_include.tiff"
            seg_mask = seg_img == 0

        # filter out the nucleus or membrane depending on exclude parameter
        img[seg_mask] = 0

        # save filtered image
        image_utils.save_image(os.path.join(tiff_dir, fov, img_sub_folder, channel + suffix), img)


[docs]def compute_pixel_cluster_channel_avg(fovs, channels, base_dir, pixel_cluster_col,
                                      num_pixel_clusters,
                                      pixel_data_dir='pixel_mat_data',
                                      num_fovs_subset=100, seed=42, keep_count=False):
    """Compute the average channel values across each pixel SOM cluster.

    To improve performance, number of FOVs is downsampled by `fov_subset_proportion`

    Args:
        fovs (list):
            The list of fovs to subset on
        channels (list):
            The list of channels to subset on
        base_dir (str):
            The path to the data directories
        pixel_cluster_col (str):
            Name of the column to group by
        num_pixel_clusters (int):
            The number of pixel clusters that are desired, if None then no fixed amount required
        pixel_data_dir (str):
            Name of the directory containing the pixel data with cluster labels
        num_fovs_subset (float):
            The number of FOVs to subset on. Note that if `len(fovs) < num_fovs_subset`, all of
            the FOVs will still be selected
        seed (int):
            The random seed to use for subsetting FOVs
        keep_count (bool):
            Whether to keep the count column when aggregating or not
            This should only be set to `True` for visualization purposes

    Returns:
        pandas.DataFrame:
            Contains the average channel values for each pixel SOM/meta cluster
    """

    # verify the pixel cluster col specified is valid
    misc_utils.verify_in_list(
        provided_cluster_col=[pixel_cluster_col],
        valid_cluster_cols=['pixel_som_cluster', 'pixel_meta_cluster']
    )

    # verify num_pixel_clusters is valid if set
    if num_pixel_clusters is not None and num_pixel_clusters <= 0:
        raise ValueError("If set, number of pixel clusters desired must be a positive integer")

    # verify fovs subset value is valid
    if num_fovs_subset <= 0:
        raise ValueError("Number of fovs to subset must be a positive integer")

    # define a list to hold the cluster averages for each FOV
    fov_cluster_avgs = []

    # warn the user that we can only select so many FOVs if len(fovs) < num_fovs_subset
    if len(fovs) < num_fovs_subset:
        warnings.warn(
            'Provided num_fovs_subset=%d but only %d FOVs in dataset, '
            'subsetting just the %d FOVs' %
            (num_fovs_subset, len(fovs), len(fovs))
        )

    # subset number of FOVs based on num_fovs_subset if less than total number of fovs
    random.seed(seed)
    fovs_sub = random.sample(fovs, num_fovs_subset) if num_fovs_subset < len(fovs) else fovs

    for fov in fovs_sub:
        # read in the fovs data
        try:
            fov_pixel_data = feather.read_dataframe(
                os.path.join(base_dir, pixel_data_dir, fov + '.feather')
            )
        except (ArrowInvalid, OSError, IOError):
            print("The data for FOV %s has been corrupted, skipping" % fov)
            continue

        # aggregate the sums and counts
        sum_by_cluster = fov_pixel_data.groupby(
            pixel_cluster_col
        )[channels].sum()
        count_by_cluster = fov_pixel_data.groupby(
            pixel_cluster_col
        )[channels].size().to_frame('count')

        # merge the results by column
        agg_results = pd.merge(
            sum_by_cluster, count_by_cluster, left_index=True, right_index=True
        ).reset_index()

        # append the result to cluster_avgs
        fov_cluster_avgs.append(agg_results)

    cluster_avgs = pd.concat(fov_cluster_avgs)

    # reset the index of cluster_avgs for consistency
    cluster_avgs = cluster_avgs.reset_index(drop=True)

    # sum the counts and the channel sums
    sum_count_totals = cluster_avgs.groupby(
        pixel_cluster_col
    )[channels + ['count']].sum().reset_index()

    # error out if any clusters were lost during the averaging process
    if num_pixel_clusters is not None and sum_count_totals.shape[0] < num_pixel_clusters:
        raise ValueError(
            'Averaged data contains just %d clusters out of %d. '
            'Average expression file not written. '
            'Consider increasing your num_fovs_subset value.' %
            (sum_count_totals.shape[0], num_pixel_clusters)
        )

    # now compute the means using the count column
    sum_count_totals[channels] = sum_count_totals[channels].div(sum_count_totals['count'], axis=0)

    # convert cluster column to integer type
    sum_count_totals[pixel_cluster_col] = sum_count_totals[pixel_cluster_col].astype(int)

    # sort cluster col in ascending order
    sum_count_totals = sum_count_totals.sort_values(by=pixel_cluster_col)

    # drop the count column if specified
    if not keep_count:
        sum_count_totals = sum_count_totals.drop('count', axis=1)

    return sum_count_totals


[docs]def find_fovs_missing_col(base_dir, data_dir, missing_col):
    """Identify FOV names in `data_dir` without `missing_col`

    Args:
        base_dir (str):
            The path to the data directories
        data_dir (str):
            Name of the directory which contains the full preprocessed pixel data
        missing_col (str):
            Name of the column to identify

    Returns:
        list:
            List of FOVs without `missing_col`
    """

    # define the main data path
    data_path = os.path.join(base_dir, data_dir)

    # define the temp data path
    temp_path = os.path.join(base_dir, data_dir + '_temp')

    # verify the data path exists
    io_utils.validate_paths(data_path)

    # if the temp path does not exist, either all the FOVs need to be run or none of them do
    if not os.path.exists(temp_path):
        # read in one of the FOV files in data_path
        fov_files = io_utils.list_files(data_path, substrs='.feather')

        # read in a sample FOV, do it this way to avoid potentially corrupted files
        # NOTE: this assumes only a few files get corrupted each run per MIBIAN runs
        # NOTE: handling of corrupted files gets propagated to respective R script
        i = 0
        while i < len(fov_files):
            try:
                fov_data = feather.read_dataframe(os.path.join(data_path, fov_files[i]))
            except (ArrowInvalid, OSError, IOError):
                i += 1
                continue
            break

        # if the missing_col is not found in fov_data, we need to run all the FOVs
        if missing_col not in fov_data.columns.values:
            # we will also make the temp directory to store the new files
            os.mkdir(temp_path)
            return io_utils.remove_file_extensions(fov_files)
        # otherwise, the column has already been assigned for this cohort, no need to run anything
        else:
            return []
    # if the temp path does exist, we have FOVs that need further processing
    else:
        # retrieve the FOV file names from both data_path and temp_path
        data_files = set(io_utils.list_files(data_path, substrs='.feather'))
        temp_files = set(io_utils.list_files(temp_path, substrs='.feather'))

        # get the difference between the two of these to determine which ones are valid
        leftover_files = list(data_files.difference(temp_files))

        return io_utils.remove_file_extensions(leftover_files)