Source code for ark.phenotyping.pixie_preprocessing

import multiprocessing
import os
from functools import partial
from shutil import rmtree
import natsort as ns
import feather
import numpy as np
import pandas as pd
import scipy.ndimage as ndimage
from alpineer import io_utils, load_utils, misc_utils
from skimage.io import imread

from ark.phenotyping import pixel_cluster_utils

multiprocessing.set_start_method('spawn', force=True)


[docs]def create_fov_pixel_data(fov, channels, img_data, seg_labels, pixel_thresh_val, blur_factor=2, subset_proportion=0.1): """Preprocess pixel data for one fov Args: fov (str): Name of the fov to index channels (list): List of channels to subset over img_data (numpy.ndarray): Array representing image data for one fov seg_labels (numpy.ndarray): Array representing segmentation labels for one fov pixel_thresh_val (float): value used to determine per-pixel cutoff for total signal inclusion blur_factor (int): The sigma to set for the Gaussian blur subset_proportion (float): The proportion of pixels to take from each fov Returns: tuple: Contains the following: - `pandas.DataFrame`: Gaussian blurred and channel sum normalized pixel data for a fov - `pandas.DataFrame`: subset of the preprocessed pixel dataset for a fov """ channels.sort(key=ns.natsort_key) # for each marker, compute the Gaussian blur for marker in range(len(channels)): img_data[:, :, marker] = ndimage.gaussian_filter(img_data[:, :, marker], sigma=blur_factor) # flatten each image, make sure to subset only on channels pixel_mat = img_data.reshape(-1, len(channels)) # convert into a dataframe pixel_mat = pd.DataFrame(pixel_mat, columns=channels) # assign metadata about each entry pixel_mat['fov'] = fov pixel_mat['row_index'] = np.repeat(range(img_data.shape[0]), img_data.shape[1]) pixel_mat['column_index'] = np.tile(range(img_data.shape[1]), img_data.shape[0]) # assign segmentation labels if it is not None if seg_labels is not None: seg_labels_flat = seg_labels.flatten() pixel_mat['label'] = seg_labels_flat # remove any rows with channels with a sum below the threshold rowsums = pixel_mat[channels].sum(axis=1) pixel_mat = pixel_mat.loc[rowsums > pixel_thresh_val, :].reset_index(drop=True) # remove any rows with channels that sum to zero prior to normalization pixel_mat = pixel_mat.loc[(pixel_mat[channels] != 0).any(axis=1), :].reset_index(drop=True) # normalize the row sums of pixel mat pixel_mat = pixel_cluster_utils.normalize_rows(pixel_mat, channels, seg_labels is not None) # subset the pixel matrix for training pixel_mat_subset = pixel_mat.sample(frac=subset_proportion) return pixel_mat, pixel_mat_subset
[docs]def preprocess_fov(base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix, img_sub_folder, is_mibitiff, channels, blur_factor, subset_proportion, pixel_thresh_val, seed, channel_norm_df, fov): """Helper function to read in the FOV-level pixel data, run `create_fov_pixel_data`, and save the preprocessed data. Args: base_dir (str): The path to the data directories tiff_dir (str): Name of the directory containing the tiff files data_dir (str): Name of the directory which contains the full preprocessed pixel data subset_dir (str): The name of the directory containing the subsetted pixel data seg_dir (str): Name of the directory containing the segmented files. Set to `None` if no segmentation directory is available or desired. seg_suffix (str): The suffix that the segmentation images use. Ignored if `seg_dir` is `None`. img_sub_folder (str): Name of the subdirectory inside `tiff_dir` containing the tiff files. Set to `None` if there isn't any. is_mibitiff (bool): Whether to load the images from MIBITiff channels (list): List of channels to subset over, applies only to `pixel_mat_subset` blur_factor (int): The sigma to set for the Gaussian blur subset_proportion (float): The proportion of pixels to take from each fov pixel_thresh_val (float): The value to normalize the pixels by seed (int): The random seed to set for subsetting channel_norm_df (pandas.DataFrame): The channel normalization values to use fov (str): The name of the FOV to preprocess Returns: pandas.DataFrame: The full preprocessed pixel dataset, needed for computing 99.9% normalized values in `create_pixel_matrix` """ # load img_xr from MIBITiff or directory with the fov if is_mibitiff: img_xr = load_utils.load_imgs_from_mibitiff( tiff_dir, mibitiff_files=[fov]) else: img_xr = load_utils.load_imgs_from_tree( tiff_dir, img_sub_folder=img_sub_folder, fovs=[fov]) # ensure the provided channels will actually exist in img_xr misc_utils.verify_in_list( provided_chans=channels, pixel_mat_chans=img_xr.channels.values ) # if seg_dir is not None, load the segmentation labels in for the fov, # otherwise leave seg_labels as None if seg_dir is not None: seg_labels = imread(os.path.join(seg_dir, fov + seg_suffix)) else: seg_labels = None # subset for the channel data img_data = img_xr.loc[fov, :, :, channels].values.astype(np.float32) # create vector for normalizing image data norm_vect = channel_norm_df.iloc[0].values norm_vect = np.array(norm_vect).reshape([1, 1, len(norm_vect)]) # normalize image data img_data = img_data / norm_vect # set seed for subsetting np.random.seed(seed) # create the full and subsetted fov matrices pixel_mat, pixel_mat_subset = create_fov_pixel_data( fov=fov, channels=channels, img_data=img_data, seg_labels=seg_labels, pixel_thresh_val=pixel_thresh_val, blur_factor=blur_factor, subset_proportion=subset_proportion ) # write complete dataset to feather, needed for cluster assignment feather.write_dataframe(pixel_mat, os.path.join(base_dir, data_dir, fov + ".feather"), compression='uncompressed') # write subsetted dataset to feather, needed for training feather.write_dataframe(pixel_mat_subset, os.path.join(base_dir, subset_dir, fov + ".feather"), compression='uncompressed') return pixel_mat
[docs]def create_pixel_matrix(fovs, channels, base_dir, tiff_dir, seg_dir, img_sub_folder="TIFs", seg_suffix='_whole_cell.tiff', pixel_output_dir='pixel_output_dir', data_dir='pixel_mat_data', subset_dir='pixel_mat_subsetted', norm_vals_name_pre_rownorm='channel_norm_pre_rownorm.feather', norm_vals_name_post_rownorm='channel_norm_post_rownorm.feather', pixel_thresh_name='pixel_thresh.feather', channel_percentile_pre_rownorm=0.99, channel_percentile_post_rownorm=0.999, is_mibitiff=False, blur_factor=2, subset_proportion=0.1, seed=42, multiprocess=False, batch_size=5): """For each fov, add a Gaussian blur to each channel and normalize channel sums for each pixel Saves data to `data_dir` and subsetted data to `subset_dir` Args: fovs (list): List of fovs to subset over channels (list): List of channels to subset over, applies only to `pixel_mat_subset` base_dir (str): The path to the data directories tiff_dir (str): Name of the directory containing the tiff files seg_dir (str): Name of the directory containing the segmented files. Set to `None` if no segmentation directory is available or desired. img_sub_folder (str): Name of the subdirectory inside `tiff_dir` containing the tiff files. Set to `None` if there isn't any. seg_suffix (str): The suffix that the segmentation images use. Ignored if `seg_dir` is `None`. pixel_output_dir (str): The name of the data directory containing the pixel data to use for the clustering pipeline. `data_dir` and `subset_dir` should be placed here. data_dir (str): Name of the directory which contains the full preprocessed pixel data. Should be placed in `pixel_output_dir`. subset_dir (str): The name of the directory containing the subsetted pixel data. Should be placed in `pixel_output_dir`. norm_vals_name_pre_rownorm (str): The name of the file to store the pre-pixel-normalized norm values norm_vals_name_post_rownorm (str): The name of the file to store the post-pixel-normalized norm values pixel_thresh_name (str): The name of the file to store the pixel threshold value channel_percentile_pre_rownorm (float): Percentile used to normalize channels before pixel normalization channel_percentile_post_rownorm (float): Percentile used to normalize channels after pixel normalization is_mibitiff (bool): Whether to load the images from MIBITiff blur_factor (int): The sigma to set for the Gaussian blur subset_proportion (float): The proportion of pixels to take from each fov seed (int): The random seed to set for subsetting multiprocess (bool): Whether to use multiprocessing or not batch_size (int): The number of FOVs to process in parallel, ignored if `multiprocess` is `False` """ channels.sort(key=ns.natsort_key) # if the subset_proportion specified is out of range if subset_proportion <= 0 or subset_proportion > 1: raise ValueError('Invalid subset percentage entered: must be in (0, 1]') # path validation io_utils.validate_paths([base_dir, tiff_dir, os.path.join(base_dir, pixel_output_dir)]) # create data_dir if it doesn't already exist if not os.path.exists(os.path.join(base_dir, data_dir)): os.mkdir(os.path.join(base_dir, data_dir)) # create subset_dir if it doesn't already exist if not os.path.exists(os.path.join(base_dir, subset_dir)): os.mkdir(os.path.join(base_dir, subset_dir)) # define path to channel normalization values channel_norm_pre_rownorm_path = os.path.join( base_dir, pixel_output_dir, norm_vals_name_pre_rownorm ) # define path to pixel normalization values pixel_thresh_path = os.path.join( base_dir, pixel_output_dir, pixel_thresh_name ) # reset entire cohort if channels provided are different from ones in existing channel_norm if os.path.exists(channel_norm_pre_rownorm_path): channel_norm_pre_rownorm_df = feather.read_dataframe(channel_norm_pre_rownorm_path) if set(channel_norm_pre_rownorm_df.columns.values) != set(channels): print("New channels provided: overwriting whole cohort") # delete the existing data in data_dir and subset_dir rmtree(os.path.join(base_dir, data_dir)) os.mkdir(os.path.join(base_dir, data_dir)) rmtree(os.path.join(base_dir, subset_dir)) os.mkdir(os.path.join(base_dir, subset_dir)) # delete the existing channel_norm_pre_rownorm.feather and pixel_thresh.feather os.remove(channel_norm_pre_rownorm_path) os.remove(pixel_thresh_path) # create variable for storing 99.9% values quantile_path = os.path.join(base_dir, data_dir, "channel_norm_post_rownorm_perfov.csv") # find all the FOV files in the full data and subsetted directories # NOTE: this handles the case where the data file was written, but not the subset file fovs_sub = io_utils.list_files(os.path.join(base_dir, subset_dir), substrs='.feather') fovs_data = io_utils.list_files(os.path.join(base_dir, data_dir), substrs='.feather') # intersect the two fovs lists together (if a FOV appears in one but not the other, regenerate) fovs_full = list(set(fovs_sub).intersection(fovs_data)) # trim the .feather suffix from the fovs in the subsetted directory fovs_full = io_utils.remove_file_extensions(fovs_full) # define the list of FOVs for preprocessing # NOTE: if an existing FOV is already corrupted, future steps will discard it fovs_list = list(set(fovs).difference(set(fovs_full))) # if there are no FOVs left to preprocess don't run function if len(fovs_list) == 0: print("There are no more FOVs to preprocess, skipping") return # check for missing quant data and add to the list of FOVs for processing quant_dat_all = pd.read_csv(quantile_path, index_col="channel") \ if os.path.exists(quantile_path) else pd.DataFrame() quant_fov_list = quant_dat_all.columns quant_missing = list(set(fovs).difference(set(quant_fov_list))) fovs_list = list(set(fovs_list).union(set(quant_missing))) # if the process is only partially complete, inform the user of restart if len(fovs_list) < len(fovs): print("Restarting preprocessing from FOV %s, " "%d fovs left to process" % (fovs_list[0], len(fovs_list))) # check to make sure correct channels were specified pixel_cluster_utils.check_for_modified_channels( tiff_dir=tiff_dir, test_fov=fovs[0], img_sub_folder=img_sub_folder, channels=channels ) # load existing channel_norm_pre_path if exists, otherwise generate if not os.path.exists(channel_norm_pre_rownorm_path): # compute channel percentiles channel_norm_pre_rownorm_df = pixel_cluster_utils.calculate_channel_percentiles( tiff_dir=tiff_dir, fovs=fovs, channels=channels, img_sub_folder=img_sub_folder, percentile=channel_percentile_pre_rownorm ) # save output feather.write_dataframe( channel_norm_pre_rownorm_df, channel_norm_pre_rownorm_path, compression='uncompressed' ) else: # load previously generated output channel_norm_pre_rownorm_df = feather.read_dataframe(channel_norm_pre_rownorm_path) # load existing pixel_thresh_path if exists, otherwise generate if not os.path.exists(pixel_thresh_path): # compute pixel percentiles pixel_thresh_val = pixel_cluster_utils.calculate_pixel_intensity_percentile( tiff_dir=tiff_dir, fovs=fovs, channels=channels, img_sub_folder=img_sub_folder, channel_percentiles=channel_norm_pre_rownorm_df ) pixel_thresh_df = pd.DataFrame({'pixel_thresh_val': [pixel_thresh_val]}) feather.write_dataframe(pixel_thresh_df, pixel_thresh_path, compression='uncompressed') else: pixel_thresh_df = feather.read_dataframe(pixel_thresh_path) pixel_thresh_val = pixel_thresh_df['pixel_thresh_val'].values[0] # define the partial function to iterate over fov_data_func = partial( preprocess_fov, base_dir, tiff_dir, data_dir, subset_dir, seg_dir, seg_suffix, img_sub_folder, is_mibitiff, channels, blur_factor, subset_proportion, pixel_thresh_val, seed, channel_norm_pre_rownorm_df ) # define variable to keep track of number of fovs processed fovs_processed = 0 # define the columns to drop for 99.9% normalization cols_to_drop = ['fov', 'row_index', 'column_index'] # account for label if seg_dir is set if seg_dir: cols_to_drop.append('label') if multiprocess: # define the multiprocessing context with multiprocessing.get_context('spawn').Pool(batch_size) as fov_data_pool: # asynchronously generate and save the pixel matrices per FOV # NOTE: fov_data_pool should NOT operate on quant_dat since that is a shared resource for fov_batch in [fovs_list[i:(i + batch_size)] for i in range(0, len(fovs_list), batch_size)]: fov_data_batch = fov_data_pool.map(fov_data_func, fov_batch) # compute the 99.9% quantile values for each FOV for pixel_mat_data in fov_data_batch: # retrieve the FOV name, note that there will only be one per FOV DataFrame fov = pixel_mat_data['fov'].unique()[0] # drop the metadata columns and generate the 99.9% quantile values for the FOV fov_full_pixel_data = pixel_mat_data.drop(columns=cols_to_drop) quant_dat_fov = fov_full_pixel_data.replace(0, np.nan).quantile( q=channel_percentile_post_rownorm, axis=0).rename(fov) quant_dat_fov.index.name = "channel" # update the file with the newly processed fov quantile value quant_dat_all = quant_dat_all.merge(quant_dat_fov, how="outer", left_index=True, right_index=True) quant_dat_all.to_csv(quantile_path) # update number of fovs processed fovs_processed += len(fov_batch) print("Processed %d fovs" % fovs_processed) else: for fov in fovs_list: pixel_mat_data = fov_data_func(fov) # drop the metadata columns and generate the 99.9% quantile values for the FOV fov_full_pixel_data = pixel_mat_data.drop(columns=cols_to_drop) quant_dat_fov = fov_full_pixel_data.replace(0, np.nan).quantile( q=channel_percentile_post_rownorm, axis=0).rename(fov) quant_dat_fov.index.name = "channel" # update the file with the newly processed fov quantile values quant_dat_all = quant_dat_all.merge(quant_dat_fov, how="outer", left_index=True, right_index=True) quant_dat_all.to_csv(quantile_path) # update number of fovs processed fovs_processed += 1 # update every 10 FOVs, or at the very end if fovs_processed % 10 == 0 or fovs_processed == len(fovs_list): print("Processed %d fovs" % fovs_processed) # get mean 99.9% across all fovs for all markers, check that none are missing mean_quant = pd.DataFrame(quant_dat_all.mean(axis=1)) # Bug in Pandas w/ natsort_key, so we have to use this ugly workaround ¯\_(ツ)_/¯ # See https://github.com/pandas-dev/pandas/issues/56081 mean_quant.sort_index(axis="index", key=lambda v: pd.Index(ns.natsort_key(v), tupleize_cols=False), inplace=True ) # save 99.9% normalization values feather.write_dataframe(mean_quant.T, os.path.join(base_dir, norm_vals_name_post_rownorm), compression='uncompressed') # delete quantile data file os.remove(quantile_path)