SPACEc: Preprocessing - Signal Preprocessing

# import spacec first
import spacec as sp

# import standard packages
import os
import numpy as np
import pandas as pd

# silencing warnings
import warnings
warnings.filterwarnings('ignore')
INFO:root: * TissUUmaps version: 3.1.1.6
# Specify the path to the data
root_path = "/home/user/path/SPACEc/" # inset your own path
data_path = root_path + 'example_data/raw/' # where the data is stored

# where you want to store the output
output_dir = root_path + 'example_data/output/'
os.makedirs(output_dir, exist_ok=True)
#read in segmentation csv files
#Read and concatenate the csv files (outputs from the cell segmentation algorithms). 
df_seg = sp.pp.read_segdf(
    segfile_list = [ # list of segmented files
        output_dir + "tonsil1_mesmer_result.csv", 
        output_dir + "tonsil2_mesmer_result.csv"
    ],
    seg_method = 'mesmer',
    region_list =["reg001", "reg002"],
    meta_list = ["tonsil", "tonsillitis"]
)

#Get the shape of the data
print(df_seg.shape)

#See what it looks like
df_seg.head()
(52454, 71)
DAPI FoxP3 HLA-DR CD103 CHGA EGFR CD206 GFAP PD-1 BCL2 ... eccentricity perimeter convex_area area axis_major_length axis_minor_length label region_num unique_region condition
1 14.681818 0.072727 0.000000 1.490909 1.390909 1.536364 0.118182 0.763636 4.918182 2.109091 ... 0.654345 36.970563 115.0 110.0 13.606782 10.289396 1 0 reg001 tonsillitis
2 54.510204 0.173469 3.183673 3.877551 4.918367 2.489796 0.561224 0.918367 10.040816 3.561224 ... 0.866483 37.798990 103.0 98.0 15.865016 7.919928 2 0 reg001 tonsillitis
3 54.629630 0.240741 3.462963 4.203704 4.833333 2.407407 0.703704 0.833333 7.611111 3.314815 ... 0.615425 26.485281 57.0 54.0 9.663631 7.616834 3 0 reg001 tonsillitis
4 66.348361 0.090164 7.348361 5.512295 5.196721 2.545082 0.602459 0.893443 8.352459 7.946721 ... 0.769372 58.870058 259.0 244.0 22.080518 14.105070 4 0 reg001 tonsillitis
5 39.724138 0.068966 1.241379 3.206897 3.413793 2.206897 0.379310 0.724138 6.586207 2.896552 ... 0.633516 17.899495 31.0 29.0 6.968100 5.391425 5 0 reg001 tonsillitis

5 rows × 71 columns

Filter cells by DAPI intensity and area

# print smallest 1% of cells by area
one_percent_area = np.percentile(df_seg.area, 1)
one_percent_area
36.0
# print smallest 1% of cells by DAPI intensity
one_percent_nuc = np.percentile(df_seg.DAPI, 1)
one_percent_nuc
33.45025166221338
# If necessary filter the dataframe to remove too small objects or cells without a nucleus. 
# Identify the lowest 1% for cell size and nuclear marker intensity to get a better idea of potential segmentation artifacts.
df_filt = sp.pp.filter_data(
    df_seg, 
    nuc_thres=one_percent_nuc, # remove cells with DAPI intensity below threshold
    size_thres=one_percent_area, # remove cells with area below threshold
    nuc_marker="DAPI", # name of nuclear marker
    cell_size = "area", # name of cell size column
    region_column = "region_num", # column with region numbers
    color_by = "region_num", # color by region number
    log_scale=False) # log scale for size
../_images/bd4f8448b8fb4b611ce370a94ad5c967c3d54399e6c4ce4f91e15685fd138f38.png
<Figure size 800x550 with 0 Axes>
Percentage of cells kept:  98.99149731193046 %

Normalize data

# Normalize data with one of the four available methods (zscore as default)
df_filt.columns
Index(['DAPI', 'FoxP3', 'HLA-DR', 'CD103', 'CHGA', 'EGFR', 'CD206', 'GFAP',
       'PD-1', 'BCL2', 'panCK', 'CD45RO', 'CD11b', 'CD56', 'CD163', 'CD21',
       'CD8', 'S100', 'Vimentin', 'PDGFRb', 'CCR7', 'CD57', 'CD34',
       'Synaptophysin', 'CD31', 'CXCR5', 'CD3', 'CD38', 'LAG3', 'CD25', 'CD16',
       'IL-10', 'Ki67', 'CLEC9A', 'p53', 'CD69', 'CD11c', 'CD68', 'Ox40',
       'aSMA', 'CD20', 'CD4', 'MUC-1', 'Podoplanin', 'CD45RA', 'CD15',
       'betaCatenin', 'PAX5', 'MCT', 'FAP', 'CD138', 'Tbet', 'GranzymeB',
       'IDO-1', 'CD45', 'CollagenIV', 'PD-L1', 'Arginase-1', 'GATA3', 'y', 'x',
       'eccentricity', 'perimeter', 'convex_area', 'area', 'axis_major_length',
       'axis_minor_length', 'label', 'region_num', 'unique_region',
       'condition'],
      dtype='object')
# This is to normalize the data per region/tif
dfz = pd.DataFrame()

for region in df_filt.unique_region.unique():
    df_reg = df_filt[df_filt.unique_region == region]
    df_reg_norm = sp.pp.format(
        data=df_reg, 
        list_out= ['eccentricity', 'perimeter', 'convex_area', 'axis_major_length', 'axis_minor_length',  "label"], # list of features to remove
        list_keep = ["DAPI",'x','y', 'area','region_num',"unique_region", 'condition'], # list of meta information that you would like to keep but don't want to normalize
        method = "zscore") # choose from "zscore", "double_zscore", "MinMax", "ArcSin"
    dfz = pd.concat([dfz,df_reg_norm], axis = 0)

dfz.shape
(51925, 65)

Remove noisy cells

#This section is used to remove noisy cells. This is very important to ensure proper identification of the cells via clustering.
dfz.columns
Index(['FoxP3', 'HLA-DR', 'CD103', 'CHGA', 'EGFR', 'CD206', 'GFAP', 'PD-1',
       'BCL2', 'panCK', 'CD45RO', 'CD11b', 'CD56', 'CD163', 'CD21', 'CD8',
       'S100', 'Vimentin', 'PDGFRb', 'CCR7', 'CD57', 'CD34', 'Synaptophysin',
       'CD31', 'CXCR5', 'CD3', 'CD38', 'LAG3', 'CD25', 'CD16', 'IL-10', 'Ki67',
       'CLEC9A', 'p53', 'CD69', 'CD11c', 'CD68', 'Ox40', 'aSMA', 'CD20', 'CD4',
       'MUC-1', 'Podoplanin', 'CD45RA', 'CD15', 'betaCatenin', 'PAX5', 'MCT',
       'FAP', 'CD138', 'Tbet', 'GranzymeB', 'IDO-1', 'CD45', 'CollagenIV',
       'PD-L1', 'Arginase-1', 'GATA3', 'DAPI', 'x', 'y', 'area', 'region_num',
       'unique_region', 'condition'],
      dtype='object')
# get the column index for the last marker 
col_num_last_marker = dfz.columns.get_loc('GATA3')
print(col_num_last_marker)
57
# This function helps to figure out what the cut-off should be for each region
for region in dfz.unique_region.unique():
    print(region)
    df_reg = dfz[dfz.unique_region == region]
    sp.pl.zcount_thres(dfz = df_reg, 
                col_num = col_num_last_marker, # last antibody index
                cut_off=0.01, #top 1% of cells
                count_bin=50)
reg001
reg002
../_images/252af79a9f74a31ef28081ecb062d30906d90682debfff7fd85be14b28893f39.png ../_images/745790ed8abc40702039e828fbf5ff6f5b39affbf6d5a8a3c78e5ed2c04513b6.png
# This is to remove top 1 % of all cells that are highly expressive for all antibodies
df_nn = pd.DataFrame()
cutoff_list = [[41, 38.38], [45, 46.22]]

for i in range(len(dfz.unique_region.unique())):
    df_reg = dfz[dfz.unique_region == dfz.unique_region.unique()[i]]
    df_reg_nn,cc = sp.pp.remove_noise(
        df=df_reg, 
        col_num=col_num_last_marker, # this is the column index that has the last protein feature
        z_count_thres=cutoff_list[i][0], # number obtained from the function above
        z_sum_thres=cutoff_list[i][1] # number obtained from the function above
    )
    print(df_reg_nn.shape)
    df_nn = pd.concat([df_nn,df_reg_nn], axis = 0)
df_nn.shape
1.4000000000000001% cells are removed.
(28436, 65)
1.7000000000000002% cells are removed.
(22702, 65)
(51138, 65)
#Save the df as a backup. We strongly recommend the Anndata format for further analysis!
df_nn.to_csv(output_dir + "df_nn_demo.csv")
# inspect which markers work, and drop the ones that did not work from the clustering step
# make an anndata to be compatible with the downstream clustering step
adata = sp.hf.make_anndata(
    df_nn = df_nn,
    col_sum = col_num_last_marker, # this is the column index that has the last protein feature # the rest will go into obs
    nonFuncAb_list = [] # Remove the antibodies that are not working from the clustering step
)
adata
AnnData object with n_obs × n_vars = 51138 × 58
    obs: 'DAPI', 'x', 'y', 'area', 'region_num', 'unique_region', 'condition'
# save the anndata object to a file
adata.write_h5ad(output_dir + 'adata_nn_demo.h5ad')
... storing 'region_num' as categorical
... storing 'unique_region' as categorical
... storing 'condition' as categorical

Show the spatial distribution for size (Optional)

import pickle
with open(output_dir + 'overlay_tonsil1.pickle', 'rb') as f:
    overlay_data1 = pickle.load(f)

with open(output_dir + 'overlay_tonsil2.pickle', 'rb') as f:
    overlay_data2 = pickle.load(f)
df_nn.columns
Index(['FoxP3', 'HLA-DR', 'CD103', 'CHGA', 'EGFR', 'CD206', 'GFAP', 'PD-1',
       'BCL2', 'panCK', 'CD45RO', 'CD11b', 'CD56', 'CD163', 'CD21', 'CD8',
       'S100', 'Vimentin', 'PDGFRb', 'CCR7', 'CD57', 'CD34', 'Synaptophysin',
       'CD31', 'CXCR5', 'CD3', 'CD38', 'LAG3', 'CD25', 'CD16', 'IL-10', 'Ki67',
       'CLEC9A', 'p53', 'CD69', 'CD11c', 'CD68', 'Ox40', 'aSMA', 'CD20', 'CD4',
       'MUC-1', 'Podoplanin', 'CD45RA', 'CD15', 'betaCatenin', 'PAX5', 'MCT',
       'FAP', 'CD138', 'Tbet', 'GranzymeB', 'IDO-1', 'CD45', 'CollagenIV',
       'PD-L1', 'Arginase-1', 'GATA3', 'DAPI', 'x', 'y', 'area', 'region_num',
       'unique_region', 'condition'],
      dtype='object')
sp.pl.coordinates_on_image(
    df = df_nn.loc[df_nn['unique_region'] == 'reg001',:], 
    overlay_data = overlay_data1, color='area',  
    scale=False, # whether to scale to 1 or not
    dot_size=5,
    convert_to_grey=True, 
    fig_width=10, fig_height=10)
../_images/24f0ccd5869ed0dccbda62f3d264ae122343303a2800931b91996abb68c2ed49.png
sp.pl.coordinates_on_image(
    df = df_nn.loc[df_nn['unique_region'] == 'reg002',:], 
    overlay_data = overlay_data2, 
    color='area', 
    scale=False, # whether to scale to 1 or not
    dot_size=5,
    convert_to_grey=True, 
    fig_width=10, fig_height=10 )
../_images/4b41a45a23477eee719cc4250199ba0e735b6e4fb84c857e83b09e1ed84fb4aa.png

This function can also be used to inspect where certain markers are expressed in the tissue.

import matplotlib.pyplot as plt
plt.rc('axes', grid=False)  # remove gridlines
marker_list = ['EGFR', 'CD21', 'CD8']

for marker in marker_list:
    sp.pl.coordinates_on_image(
        df = df_nn.loc[df_nn['unique_region'] == 'reg002',:], 
        overlay_data = overlay_data2, 
        color=marker, 
        scale=False, # whether to scale to 1 or not
        dot_size=2,
        convert_to_grey=True, 
        fig_width=3, fig_height=3 )
../_images/831484158e7e76c644d61cb6052bdd9f0862f3a3f236e19ba3baccd3613375ae.png ../_images/a5ef66e185762c699f9fa0b8c526e021a4b220c04015151c69e2c472154cc62a.png ../_images/54a64b51e1f6d847b884be594491127f16277cdbf2718b6b3a594e1aa77019c9.png