Source code for ark.phenotyping.pixel_cluster_utils

import os
import random
import warnings
from typing import List

import feather
import numpy as np
import pandas as pd
import scipy.ndimage as ndimage
from alpineer import image_utils, io_utils, load_utils, misc_utils
from pyarrow.lib import ArrowInvalid
from skimage.io import imread
import natsort as ns


[docs]def calculate_channel_percentiles(tiff_dir, fovs, channels, img_sub_folder, percentile): """Calculates average percentile for each channel in the dataset Args: tiff_dir (str): Name of the directory containing the tiff files fovs (list): List of fovs to include channels (list): List of channels to include img_sub_folder (str): Sub folder within each FOV containing image data percentile (float): The specific percentile to compute Returns: pd.DataFrame: The mapping between each channel and its normalization value """ # create list to hold percentiles percentile_means = [] # loop over channels and FOVs for channel in channels: percentile_list = [] for fov in fovs: # load image data and remove 0 valued pixels img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder, channels=[channel], fovs=[fov]).values[0, :, :, 0] img = img[img > 0] # record and store percentile, skip if no non-zero pixels if len(img) > 0: img_percentile = np.quantile(img, percentile) percentile_list.append(img_percentile) # save channel-wide average percentile_means.append(np.mean(percentile_list)) percentile_df = pd.DataFrame(np.expand_dims(percentile_means, axis=0), columns=channels) percentile_df.sort_index(axis="columns", key=ns.natsort_key, inplace=True) return percentile_df
[docs]def calculate_pixel_intensity_percentile(tiff_dir, fovs, channels, img_sub_folder, channel_percentiles, percentile=0.05): """Calculates average percentile per FOV for total signal in each pixel Args: tiff_dir (str): Name of the directory containing the tiff files fovs (list): List of fovs to include channels (list): List of channels to include img_sub_folder (str): Sub folder within each FOV containing image data channel_percentiles (pd.DataFrame): The mapping between each channel and its normalization value Computed by `calculate_channel_percentiles` percentile (float): The pixel intensity percentile per FOV to average over Returns: float: The average percentile per FOV for total signal in each pixel """ # create vector of channel percentiles to enable broadcasting norm_vect = channel_percentiles.iloc[0].values norm_vect = norm_vect.reshape([1, 1, len(norm_vect)]) intensity_percentile_list = [] for fov in fovs: # load image data img_data = load_utils.load_imgs_from_tree(data_dir=tiff_dir, fovs=[fov], channels=channels, img_sub_folder=img_sub_folder) # normalize each channel by its percentile value norm_data = img_data[0].values / norm_vect # sum channels together to determine total intensity summed_data = np.sum(norm_data, axis=-1) intensity_percentile_list.append(np.quantile(summed_data, percentile)) return np.mean(intensity_percentile_list)
[docs]def normalize_rows(pixel_data, channels, include_seg_label=True): """Normalizes the rows of a pixel matrix by their sum Args: pixel_data (pandas.DataFrame): The dataframe containing the pixel data for a given fov Includes channel and meta (`fov`, `label`, etc.) columns channels (list): List of channels to subset over include_seg_label (bool): Whether to include `'label'` as a metadata column Returns: pandas.DataFrame: The pixel data with rows normalized and 0-sum rows removed """ # subset the fov data by the channels the user trained the pixel SOM on pixel_data_sub = pixel_data[channels] # divide each row by their sum pixel_data_sub = pixel_data_sub.div(pixel_data_sub.sum(axis=1), axis=0) # define the meta columns to add back meta_cols = ['fov', 'row_index', 'column_index'] # add the label column if it should be kept if include_seg_label: meta_cols.append('label') # add back meta columns, making sure to remove 0-row indices pixel_data_sub[meta_cols] = pixel_data.loc[pixel_data_sub.index.values, meta_cols] return pixel_data_sub
[docs]def check_for_modified_channels(tiff_dir, test_fov, img_sub_folder, channels): """Checks to make sure the user selected newly modified channels Args: tiff_dir (str): Name of the directory containing the tiff files test_fov (str): example fov used to check channel names img_sub_folder (str): sub-folder within each FOV containing image data channels (list): list of channels to use for analysis """ # convert to path-compatible format if img_sub_folder is None: img_sub_folder = '' # get all channels within example FOV all_channels = io_utils.list_files(os.path.join(tiff_dir, test_fov, img_sub_folder)) all_channels = io_utils.remove_file_extensions(all_channels) # define potential modifications to channel names mods = ['_smoothed', '_nuc_include', '_nuc_exclude'] # loop over each user-provided channel for channel in channels: for mod in mods: # check for substring matching chan_mod = channel + mod if chan_mod in all_channels: warnings.warn('You selected {} as the channel to analyze, but there were potential' ' modified channels found: {}. Make sure you selected the correct ' 'version of the channel for inclusion in ' 'clustering'.format(channel, chan_mod)) else: pass
[docs]def smooth_channels(fovs, tiff_dir, img_sub_folder, channels, smooth_vals): """Adds additional smoothing for selected channels as a preprocessing step Args: fovs (list): List of fovs to process tiff_dir (str): Name of the directory containing the tiff files img_sub_folder (str): sub-folder within each FOV containing image data channels (list): list of channels to apply smoothing to smooth_vals (list or int): amount to smooth channels. If a single int, applies to all channels. Otherwise, a custom value per channel can be supplied """ # no output if no channels specified if channels is None or len(channels) == 0: return # convert to path-compatible format if img_sub_folder is None: img_sub_folder = '' # convert int to list of same length if type(smooth_vals) is int: smooth_vals = [smooth_vals for _ in range(len(channels))] elif type(smooth_vals) is list: if len(smooth_vals) != len(channels): raise ValueError("A list was provided for variable smooth_vals, but it does not " "have the same length as the list of channels provided") else: raise ValueError("Variable smooth_vals must be either a single integer or a list") for fov in fovs: for idx, chan in enumerate(channels): img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov], channels=[chan]).values[0, :, :, 0] chan_out = ndimage.gaussian_filter(img, sigma=smooth_vals[idx]) image_utils.save_image( os.path.join(tiff_dir, fov, img_sub_folder, chan + '_smoothed.tiff'), chan_out )
[docs]def filter_with_nuclear_mask(fovs: List, tiff_dir: str, seg_dir: str, channel: str, nuc_seg_suffix: str = "_nuclear.tiff", img_sub_folder: str = None, exclude: bool = True): """Filters out background staining using subcellular marker localization. Non-nuclear signal is removed from nuclear markers and vice-versa for membrane markers. Args: fovs (list): The list of fovs to filter tiff_dir (str): Name of the directory containing the tiff files seg_dir (str): Name of the directory containing the segmented files channel (str): Channel to apply filtering to nuc_seg_suffix (str): The suffix for the nuclear channel. (i.e. for "fov1", a suffix of "_nuclear.tiff" would make a file named "fov1_nuclear.tiff") img_sub_folder (str): Name of the subdirectory inside `tiff_dir` containing the tiff files. Set to `None` if there isn't any. exclude (bool): Whether to filter out nuclear or membrane signal """ # if seg_dir is None, the user cannot run filtering if seg_dir is None: print('No seg_dir provided, you must provide one to run nuclear filtering') return # raise an error if the provided seg_dir does not exist io_utils.validate_paths(seg_dir) # convert to path-compatible format if img_sub_folder is None: img_sub_folder = '' for fov in fovs: # load the channel image in img = load_utils.load_imgs_from_tree(data_dir=tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov], channels=[channel]).values[0, :, :, 0] # load the segmented image in seg_img_name: str = f"{fov}{nuc_seg_suffix}" seg_img = imread(os.path.join(seg_dir, seg_img_name))[0, ...] # mask out the nucleus if exclude: suffix = "_nuc_exclude.tiff" seg_mask = seg_img > 0 # mask out the membrane else: suffix = "_nuc_include.tiff" seg_mask = seg_img == 0 # filter out the nucleus or membrane depending on exclude parameter img[seg_mask] = 0 # save filtered image image_utils.save_image(os.path.join(tiff_dir, fov, img_sub_folder, channel + suffix), img)
[docs]def compute_pixel_cluster_channel_avg(fovs, channels, base_dir, pixel_cluster_col, num_pixel_clusters, pixel_data_dir='pixel_mat_data', num_fovs_subset=100, seed=42, keep_count=False): """Compute the average channel values across each pixel SOM cluster. To improve performance, number of FOVs is downsampled by `fov_subset_proportion` Args: fovs (list): The list of fovs to subset on channels (list): The list of channels to subset on base_dir (str): The path to the data directories pixel_cluster_col (str): Name of the column to group by num_pixel_clusters (int): The number of pixel clusters that are desired, if None then no fixed amount required pixel_data_dir (str): Name of the directory containing the pixel data with cluster labels num_fovs_subset (float): The number of FOVs to subset on. Note that if `len(fovs) < num_fovs_subset`, all of the FOVs will still be selected seed (int): The random seed to use for subsetting FOVs keep_count (bool): Whether to keep the count column when aggregating or not This should only be set to `True` for visualization purposes Returns: pandas.DataFrame: Contains the average channel values for each pixel SOM/meta cluster """ # verify the pixel cluster col specified is valid misc_utils.verify_in_list( provided_cluster_col=[pixel_cluster_col], valid_cluster_cols=['pixel_som_cluster', 'pixel_meta_cluster'] ) # verify num_pixel_clusters is valid if set if num_pixel_clusters is not None and num_pixel_clusters <= 0: raise ValueError("If set, number of pixel clusters desired must be a positive integer") # verify fovs subset value is valid if num_fovs_subset <= 0: raise ValueError("Number of fovs to subset must be a positive integer") # define a list to hold the cluster averages for each FOV fov_cluster_avgs = [] # warn the user that we can only select so many FOVs if len(fovs) < num_fovs_subset if len(fovs) < num_fovs_subset: warnings.warn( 'Provided num_fovs_subset=%d but only %d FOVs in dataset, ' 'subsetting just the %d FOVs' % (num_fovs_subset, len(fovs), len(fovs)) ) # subset number of FOVs based on num_fovs_subset if less than total number of fovs random.seed(seed) fovs_sub = random.sample(fovs, num_fovs_subset) if num_fovs_subset < len(fovs) else fovs for fov in fovs_sub: # read in the fovs data try: fov_pixel_data = feather.read_dataframe( os.path.join(base_dir, pixel_data_dir, fov + '.feather') ) except (ArrowInvalid, OSError, IOError): print("The data for FOV %s has been corrupted, skipping" % fov) continue # aggregate the sums and counts sum_by_cluster = fov_pixel_data.groupby( pixel_cluster_col )[channels].sum() count_by_cluster = fov_pixel_data.groupby( pixel_cluster_col )[channels].size().to_frame('count') # merge the results by column agg_results = pd.merge( sum_by_cluster, count_by_cluster, left_index=True, right_index=True ).reset_index() # append the result to cluster_avgs fov_cluster_avgs.append(agg_results) cluster_avgs = pd.concat(fov_cluster_avgs) # reset the index of cluster_avgs for consistency cluster_avgs = cluster_avgs.reset_index(drop=True) # sum the counts and the channel sums sum_count_totals = cluster_avgs.groupby( pixel_cluster_col )[channels + ['count']].sum().reset_index() # error out if any clusters were lost during the averaging process if num_pixel_clusters is not None and sum_count_totals.shape[0] < num_pixel_clusters: raise ValueError( 'Averaged data contains just %d clusters out of %d. ' 'Average expression file not written. ' 'Consider increasing your num_fovs_subset value.' % (sum_count_totals.shape[0], num_pixel_clusters) ) # now compute the means using the count column sum_count_totals[channels] = sum_count_totals[channels].div(sum_count_totals['count'], axis=0) # convert cluster column to integer type sum_count_totals[pixel_cluster_col] = sum_count_totals[pixel_cluster_col].astype(int) # sort cluster col in ascending order sum_count_totals = sum_count_totals.sort_values(by=pixel_cluster_col) # drop the count column if specified if not keep_count: sum_count_totals = sum_count_totals.drop('count', axis=1) return sum_count_totals
[docs]def find_fovs_missing_col(base_dir, data_dir, missing_col): """Identify FOV names in `data_dir` without `missing_col` Args: base_dir (str): The path to the data directories data_dir (str): Name of the directory which contains the full preprocessed pixel data missing_col (str): Name of the column to identify Returns: list: List of FOVs without `missing_col` """ # define the main data path data_path = os.path.join(base_dir, data_dir) # define the temp data path temp_path = os.path.join(base_dir, data_dir + '_temp') # verify the data path exists io_utils.validate_paths(data_path) # if the temp path does not exist, either all the FOVs need to be run or none of them do if not os.path.exists(temp_path): # read in one of the FOV files in data_path fov_files = io_utils.list_files(data_path, substrs='.feather') # read in a sample FOV, do it this way to avoid potentially corrupted files # NOTE: this assumes only a few files get corrupted each run per MIBIAN runs # NOTE: handling of corrupted files gets propagated to respective R script i = 0 while i < len(fov_files): try: fov_data = feather.read_dataframe(os.path.join(data_path, fov_files[i])) except (ArrowInvalid, OSError, IOError): i += 1 continue break # if the missing_col is not found in fov_data, we need to run all the FOVs if missing_col not in fov_data.columns.values: # we will also make the temp directory to store the new files os.mkdir(temp_path) return io_utils.remove_file_extensions(fov_files) # otherwise, the column has already been assigned for this cohort, no need to run anything else: return [] # if the temp path does exist, we have FOVs that need further processing else: # retrieve the FOV file names from both data_path and temp_path data_files = set(io_utils.list_files(data_path, substrs='.feather')) temp_files = set(io_utils.list_files(temp_path, substrs='.feather')) # get the difference between the two of these to determine which ones are valid leftover_files = list(data_files.difference(temp_files)) return io_utils.remove_file_extensions(leftover_files)