SPACEc: Preprocessing - Signal Preprocessing

# import spacec first
import spacec as sp

# import standard packages
import os
import numpy as np
import pandas as pd

# silencing warnings
import warnings
warnings.filterwarnings('ignore')

INFO:root: * TissUUmaps version: 3.1.1.6

# Specify the path to the data
root_path = "/home/user/path/SPACEc/" # inset your own path
data_path = root_path + 'example_data/raw/' # where the data is stored

# where you want to store the output
output_dir = root_path + 'example_data/output/'
os.makedirs(output_dir, exist_ok=True)

#read in segmentation csv files
#Read and concatenate the csv files (outputs from the cell segmentation algorithms). 
df_seg = sp.pp.read_segdf(
    segfile_list = [ # list of segmented files
        output_dir + "tonsil1_mesmer_result.csv", 
        output_dir + "tonsil2_mesmer_result.csv"
    ],
    seg_method = 'mesmer',
    region_list =["reg001", "reg002"],
    meta_list = ["tonsil", "tonsillitis"]
)

#Get the shape of the data
print(df_seg.shape)

#See what it looks like
df_seg.head()

(52454, 71)

	DAPI	FoxP3	HLA-DR	CD103	CHGA	EGFR	CD206	GFAP	PD-1	BCL2	...	eccentricity	perimeter	convex_area	area	axis_major_length	axis_minor_length	label	unique_region	condition
1	14.681818	0.072727	0.000000	1.490909	1.390909	1.536364	0.118182	0.763636	4.918182	2.109091	...	0.654345	36.970563	115.0	110.0	13.606782	10.289396	1	reg001	tonsillitis
2	54.510204	0.173469	3.183673	3.877551	4.918367	2.489796	0.561224	0.918367	10.040816	3.561224	...	0.866483	37.798990	103.0	98.0	15.865016	7.919928	2	reg001	tonsillitis
3	54.629630	0.240741	3.462963	4.203704	4.833333	2.407407	0.703704	0.833333	7.611111	3.314815	...	0.615425	26.485281	57.0	54.0	9.663631	7.616834	3	reg001	tonsillitis
4	66.348361	0.090164	7.348361	5.512295	5.196721	2.545082	0.602459	0.893443	8.352459	7.946721	...	0.769372	58.870058	259.0	244.0	22.080518	14.105070	4	reg001	tonsillitis
5	39.724138	0.068966	1.241379	3.206897	3.413793	2.206897	0.379310	0.724138	6.586207	2.896552	...	0.633516	17.899495	31.0	29.0	6.968100	5.391425	5	reg001	tonsillitis

5 rows × 71 columns

Filter cells by DAPI intensity and area

# print smallest 1% of cells by area
one_percent_area = np.percentile(df_seg.area, 1)
one_percent_area

36.0

# print smallest 1% of cells by DAPI intensity
one_percent_nuc = np.percentile(df_seg.DAPI, 1)
one_percent_nuc

33.45025166221338

# If necessary filter the dataframe to remove too small objects or cells without a nucleus. 
# Identify the lowest 1% for cell size and nuclear marker intensity to get a better idea of potential segmentation artifacts.
df_filt = sp.pp.filter_data(
    df_seg, 
    nuc_thres=one_percent_nuc, # remove cells with DAPI intensity below threshold
    size_thres=one_percent_area, # remove cells with area below threshold
    nuc_marker="DAPI", # name of nuclear marker
    cell_size = "area", # name of cell size column
    region_column = "region_num", # column with region numbers
    color_by = "region_num", # color by region number
    log_scale=False) # log scale for size

../_images/bd4f8448b8fb4b611ce370a94ad5c967c3d54399e6c4ce4f91e15685fd138f38.png

<Figure size 800x550 with 0 Axes>

Percentage of cells kept:  98.99149731193046 %

Normalize data

# Normalize data with one of the four available methods (zscore as default)
df_filt.columns

Index(['DAPI', 'FoxP3', 'HLA-DR', 'CD103', 'CHGA', 'EGFR', 'CD206', 'GFAP',
       'PD-1', 'BCL2', 'panCK', 'CD45RO', 'CD11b', 'CD56', 'CD163', 'CD21',
       'CD8', 'S100', 'Vimentin', 'PDGFRb', 'CCR7', 'CD57', 'CD34',
       'Synaptophysin', 'CD31', 'CXCR5', 'CD3', 'CD38', 'LAG3', 'CD25', 'CD16',
       'IL-10', 'Ki67', 'CLEC9A', 'p53', 'CD69', 'CD11c', 'CD68', 'Ox40',
       'aSMA', 'CD20', 'CD4', 'MUC-1', 'Podoplanin', 'CD45RA', 'CD15',
       'betaCatenin', 'PAX5', 'MCT', 'FAP', 'CD138', 'Tbet', 'GranzymeB',
       'IDO-1', 'CD45', 'CollagenIV', 'PD-L1', 'Arginase-1', 'GATA3', 'y', 'x',
       'eccentricity', 'perimeter', 'convex_area', 'area', 'axis_major_length',
       'axis_minor_length', 'label', 'region_num', 'unique_region',
       'condition'],
      dtype='object')

# This is to normalize the data per region/tif
dfz = pd.DataFrame()

for region in df_filt.unique_region.unique():
    df_reg = df_filt[df_filt.unique_region == region]
    df_reg_norm = sp.pp.format(
        data=df_reg, 
        list_out= ['eccentricity', 'perimeter', 'convex_area', 'axis_major_length', 'axis_minor_length',  "label"], # list of features to remove
        list_keep = ["DAPI",'x','y', 'area','region_num',"unique_region", 'condition'], # list of meta information that you would like to keep but don't want to normalize
        method = "zscore") # choose from "zscore", "double_zscore", "MinMax", "ArcSin"
    dfz = pd.concat([dfz,df_reg_norm], axis = 0)

dfz.shape

(51925, 65)

Remove noisy cells

#This section is used to remove noisy cells. This is very important to ensure proper identification of the cells via clustering.
dfz.columns

Index(['FoxP3', 'HLA-DR', 'CD103', 'CHGA', 'EGFR', 'CD206', 'GFAP', 'PD-1',
       'BCL2', 'panCK', 'CD45RO', 'CD11b', 'CD56', 'CD163', 'CD21', 'CD8',
       'S100', 'Vimentin', 'PDGFRb', 'CCR7', 'CD57', 'CD34', 'Synaptophysin',
       'CD31', 'CXCR5', 'CD3', 'CD38', 'LAG3', 'CD25', 'CD16', 'IL-10', 'Ki67',
       'CLEC9A', 'p53', 'CD69', 'CD11c', 'CD68', 'Ox40', 'aSMA', 'CD20', 'CD4',
       'MUC-1', 'Podoplanin', 'CD45RA', 'CD15', 'betaCatenin', 'PAX5', 'MCT',
       'FAP', 'CD138', 'Tbet', 'GranzymeB', 'IDO-1', 'CD45', 'CollagenIV',
       'PD-L1', 'Arginase-1', 'GATA3', 'DAPI', 'x', 'y', 'area', 'region_num',
       'unique_region', 'condition'],
      dtype='object')

# get the column index for the last marker 
col_num_last_marker = dfz.columns.get_loc('GATA3')
print(col_num_last_marker)

# This function helps to figure out what the cut-off should be for each region
for region in dfz.unique_region.unique():
    print(region)
    df_reg = dfz[dfz.unique_region == region]
    sp.pl.zcount_thres(dfz = df_reg, 
                col_num = col_num_last_marker, # last antibody index
                cut_off=0.01, #top 1% of cells
                count_bin=50)

reg001
reg002

../_images/252af79a9f74a31ef28081ecb062d30906d90682debfff7fd85be14b28893f39.png

../_images/745790ed8abc40702039e828fbf5ff6f5b39affbf6d5a8a3c78e5ed2c04513b6.png

# This is to remove top 1 % of all cells that are highly expressive for all antibodies
df_nn = pd.DataFrame()
cutoff_list = [[41, 38.38], [45, 46.22]]

for i in range(len(dfz.unique_region.unique())):
    df_reg = dfz[dfz.unique_region == dfz.unique_region.unique()[i]]
    df_reg_nn,cc = sp.pp.remove_noise(
        df=df_reg, 
        col_num=col_num_last_marker, # this is the column index that has the last protein feature
        z_count_thres=cutoff_list[i][0], # number obtained from the function above
        z_sum_thres=cutoff_list[i][1] # number obtained from the function above
    )
    print(df_reg_nn.shape)
    df_nn = pd.concat([df_nn,df_reg_nn], axis = 0)
df_nn.shape

1.4000000000000001% cells are removed.
(28436, 65)
1.7000000000000002% cells are removed.
(22702, 65)

(51138, 65)

#Save the df as a backup. We strongly recommend the Anndata format for further analysis!
df_nn.to_csv(output_dir + "df_nn_demo.csv")

# inspect which markers work, and drop the ones that did not work from the clustering step
# make an anndata to be compatible with the downstream clustering step
adata = sp.hf.make_anndata(
    df_nn = df_nn,
    col_sum = col_num_last_marker, # this is the column index that has the last protein feature # the rest will go into obs
    nonFuncAb_list = [] # Remove the antibodies that are not working from the clustering step
)
adata

AnnData object with n_obs × n_vars = 51138 × 58
    obs: 'DAPI', 'x', 'y', 'area', 'region_num', 'unique_region', 'condition'

# save the anndata object to a file
adata.write_h5ad(output_dir + 'adata_nn_demo.h5ad')

... storing 'region_num' as categorical
... storing 'unique_region' as categorical
... storing 'condition' as categorical

Show the spatial distribution for size (Optional)

import pickle
with open(output_dir + 'overlay_tonsil1.pickle', 'rb') as f:
    overlay_data1 = pickle.load(f)

with open(output_dir + 'overlay_tonsil2.pickle', 'rb') as f:
    overlay_data2 = pickle.load(f)

df_nn.columns

Index(['FoxP3', 'HLA-DR', 'CD103', 'CHGA', 'EGFR', 'CD206', 'GFAP', 'PD-1',
       'BCL2', 'panCK', 'CD45RO', 'CD11b', 'CD56', 'CD163', 'CD21', 'CD8',
       'S100', 'Vimentin', 'PDGFRb', 'CCR7', 'CD57', 'CD34', 'Synaptophysin',
       'CD31', 'CXCR5', 'CD3', 'CD38', 'LAG3', 'CD25', 'CD16', 'IL-10', 'Ki67',
       'CLEC9A', 'p53', 'CD69', 'CD11c', 'CD68', 'Ox40', 'aSMA', 'CD20', 'CD4',
       'MUC-1', 'Podoplanin', 'CD45RA', 'CD15', 'betaCatenin', 'PAX5', 'MCT',
       'FAP', 'CD138', 'Tbet', 'GranzymeB', 'IDO-1', 'CD45', 'CollagenIV',
       'PD-L1', 'Arginase-1', 'GATA3', 'DAPI', 'x', 'y', 'area', 'region_num',
       'unique_region', 'condition'],
      dtype='object')

sp.pl.coordinates_on_image(
    df = df_nn.loc[df_nn['unique_region'] == 'reg001',:], 
    overlay_data = overlay_data1, color='area',  
    scale=False, # whether to scale to 1 or not
    dot_size=5,
    convert_to_grey=True, 
    fig_width=10, fig_height=10)

../_images/24f0ccd5869ed0dccbda62f3d264ae122343303a2800931b91996abb68c2ed49.png

sp.pl.coordinates_on_image(
    df = df_nn.loc[df_nn['unique_region'] == 'reg002',:], 
    overlay_data = overlay_data2, 
    color='area', 
    scale=False, # whether to scale to 1 or not
    dot_size=5,
    convert_to_grey=True, 
    fig_width=10, fig_height=10 )

../_images/4b41a45a23477eee719cc4250199ba0e735b6e4fb84c857e83b09e1ed84fb4aa.png

This function can also be used to inspect where certain markers are expressed in the tissue.

import matplotlib.pyplot as plt
plt.rc('axes', grid=False)  # remove gridlines

marker_list = ['EGFR', 'CD21', 'CD8']

for marker in marker_list:
    sp.pl.coordinates_on_image(
        df = df_nn.loc[df_nn['unique_region'] == 'reg002',:], 
        overlay_data = overlay_data2, 
        color=marker, 
        scale=False, # whether to scale to 1 or not
        dot_size=2,
        convert_to_grey=True, 
        fig_width=3, fig_height=3 )

../_images/831484158e7e76c644d61cb6052bdd9f0862f3a3f236e19ba3baccd3613375ae.png

../_images/a5ef66e185762c699f9fa0b8c526e021a4b220c04015151c69e2c472154cc62a.png

../_images/54a64b51e1f6d847b884be594491127f16277cdbf2718b6b3a594e1aa77019c9.png