#!/usr/bin/env python
# coding: utf-8

# # Batch correction in Bulk RNA-seq or microarray data
#
# Variability in datasets is not only the product of biological processes: it is also the product of technical biases (Lander et al, 1999). ComBat is one of the most widely used tools for correcting those technical biases, called batch effects.
#
# pyComBat (Behdenna et al, 2020) is a new Python implementation of ComBat (Johnson et al, 2007), a software widely used for the adjustment of batch effects in microarray data. While the mathematical framework is strictly the same, pyComBat:
#
# - has similar results in terms of batch effects correction;
# - is as fast or faster than the R implementation of ComBat and;
# - offers new tools for the community to participate in its development.
#
# Paper: [pyComBat, a Python tool for batch effects correction in high-throughput molecular data using empirical Bayes methods](https://doi.org/10.1101/2020.03.17.995431)
#
# Code: https://github.com/epigenelabs/pyComBat
#
# Colab_Reproducibility: https://colab.research.google.com/drive/121bbIiI3j4pTZ3yA_5p8BRkRyGMMmNAq?usp=sharing

# In[7]:


import anndata
import matplotlib.pyplot as plt  # required by the boxplot cells below
import omicverse as ov
import pandas as pd

ov.ov_plot_set()


# ## Loading dataset
#
# This minimal usage example illustrates how to use pyComBat in a default setting, and shows some results on ovarian cancer data, freely available on NCBI’s [Gene Expression Omnibus](https://www.ncbi.nlm.nih.gov/geo/), namely:
#
# - GSE18520
# - GSE66957
# - GSE69428
#
# The corresponding expression files are available on [GitHub](https://github.com/epigenelabs/pyComBat/tree/master/data).
#
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# obtained from a source you trust.

# In[15]:


dataset_1 = pd.read_pickle("data/combat/GSE18520.pickle")
# Each table is transposed before being wrapped in an AnnData object, and
# every observation is tagged with the batch it came from.
adata1 = anndata.AnnData(dataset_1.T)
adata1.obs["batch"] = "1"
adata1


# In[16]:


dataset_2 = pd.read_pickle("data/combat/GSE66957.pickle")
adata2 = anndata.AnnData(dataset_2.T)
adata2.obs["batch"] = "2"
adata2


# In[17]:


dataset_3 = pd.read_pickle("data/combat/GSE69428.pickle")
adata3 = anndata.AnnData(dataset_3.T)
adata3.obs["batch"] = "3"
adata3


# We use the concat function to join the three datasets together and take the intersection for the same genes

# In[18]:


adata = anndata.concat([adata1, adata2, adata3], merge="same")
adata


# ## Removing batch effect

# In[31]:


# The corrected values are read back below from the `batch_correction` layer.
ov.bulk.batch_correction(adata, batch_key="batch")


# ## Saving results
#
# Raw datasets

# In[70]:


raw_data = adata.to_df().T
raw_data.head()


# Batch-corrected datasets

# In[71]:


removing_data = adata.to_df(layer="batch_correction").T
removing_data.head()


# Save both tables as CSV files

# In[ ]:


raw_data.to_csv("raw_data.csv")
removing_data.to_csv("removing_data.csv")


# You can also save adata object

# In[ ]:


adata.write_h5ad("adata_batch.h5ad", compression="gzip")
# To load the saved object again later: adata = ov.read('adata_batch.h5ad')


# ## Compare the dataset before and after correction
#
# We specify three different colours for three different datasets

# In[51]:


color_dict = {
    "1": ov.utils.red_color[1],
    "2": ov.utils.blue_color[1],
    "3": ov.utils.green_color[1],
}


# In[57]:


# Boxplot of the uncorrected values; each box is coloured by its batch label.
fig, ax = plt.subplots(figsize=(20, 4))
bp = ax.boxplot(adata.to_df().T, patch_artist=True)
for box, batch in zip(bp["boxes"], adata.obs["batch"]):
    box.set_facecolor(color_dict[batch])
ax.axis(False)
plt.show()


# In[58]:


# Same plot, drawn from the batch-corrected layer.
fig, ax = plt.subplots(figsize=(20, 4))
bp = ax.boxplot(adata.to_df(layer="batch_correction").T, patch_artist=True)
for box, batch in zip(bp["boxes"], adata.obs["batch"]):
    box.set_facecolor(color_dict[batch])
ax.axis(False)
plt.show()


# In addition to using boxplots to observe the effect of batch removal, we can also use PCA to observe the effect of batch removal

# In[59]:


# Keep a copy of the current matrix under a named layer so PCA can be run on
# it by layer name, just like the corrected layer.
adata.layers["raw"] = adata.X.copy()


# We first calculate the PCA on the raw dataset

# In[60]:


ov.pp.pca(adata, layer="raw", n_pcs=50)
adata


# We then calculate the PCA on the batch_correction dataset

# In[61]:


ov.pp.pca(adata, layer="batch_correction", n_pcs=50)
adata


# In[62]:


ov.utils.embedding(
    adata,
    basis="raw|original|X_pca",
    color="batch",
    frameon="small",
)


# In[63]:


ov.utils.embedding(
    adata,
    basis="batch_correction|original|X_pca",
    color="batch",
    frameon="small",
)


# In[ ]: