OV_Agentic_EXP_SambaNova / ovrawm /t_cellfate_gene.txt
KeTuTu's picture
Upload 46 files
2999286 verified
#!/usr/bin/env python
# coding: utf-8
# # Timing-associated genes analysis with cellfategenie
#
# In single-cell analysis, we study the underlying temporal state of each cell, which we call pseudotime. Identifying the genes associated with pseudotime is the key to unravelling models of dynamic gene regulation. In traditional analyses, we would use correlation coefficients or fit a gene-dynamics model. The correlation-coefficient approach is biased towards genes that change at the beginning and end of the time series, and the gene-dynamics model requires RNA velocity information. Identifying pseudotime-associated genes in an unbiased way, without requiring any additional information, has therefore become a challenge in current pseudotime analyses.
#
# Here, we developed CellFateGenie, which first removes potential noise from the data through metacells and then constructs an adaptive ridge regression model to find the minimum set of genes needed to fit the pseudotime. CellFateGenie has accuracy similar to that of gene-dynamics models while eliminating the preference for the start and end of the time series.
#
# Colab_Reproducibility:https://colab.research.google.com/drive/1Q1Sk5lGCBGBWS5Bs2kncAq9ZbjaDzSR4?usp=sharing
# In[1]:
# Libraries used throughout this tutorial: omicverse (analysis), scvelo
# (example dataset) and matplotlib (figures).
import omicverse as ov
import scvelo as scv
import matplotlib.pyplot as plt
# Configure plotting through omicverse's `ov_plot_set` helper
# (presumably sets global figure defaults; see the omicverse docs).
ov.ov_plot_set()
# ## Data preprocessed
#
# We use the dentate gyrus dataset from scvelo to demonstrate the timing-associated genes analysis. First, we use `ov.pp.qc` and `ov.pp.preprocess` to preprocess the dataset.
#
# Then we use `ov.pp.scale` and `ov.pp.pca` to compute the principal components of the data.
# In[18]:
# Load the dentate gyrus example dataset through scvelo.
adata = scv.datasets.dentategyrus()
adata
# In[19]:
# Quality-control thresholds handed to `ov.pp.qc` (the keyword really is
# spelled `tresh` in this call).
qc_thresholds = {'mito_perc': 0.15, 'nUMIs': 500, 'detected_genes': 250}
adata = ov.pp.qc(adata, tresh=qc_thresholds)
# In[20]:
# Register the current data under the name 'counts' via omicverse
# (presumably so the unnormalised counts can be retrieved later).
ov.utils.store_layers(adata, layers='counts')
adata
# In[21]:
# Normalise the data and flag 2000 highly variable genes (HVGs).
adata = ov.pp.preprocess(adata, mode='shiftlog|pearson', n_HVGs=2000)
# In[22]:
# Keep the full gene set in `.raw` before restricting to the HVGs.
adata.raw = adata
# `.copy()` turns the HVG slice into a real AnnData object instead of a view.
# The steps below modify `adata` in place (new layer, new `.obsm` entries);
# doing that on a view forces anndata to copy implicitly and emit an
# ImplicitModificationWarning.
adata = adata[:, adata.var.highly_variable_features].copy()
adata
# In[23]:
# Scale, run PCA on the 'scaled' layer, then compute a 2-D MDE embedding
# from the PCA coordinates for plotting.
ov.pp.scale(adata)
ov.pp.pca(adata, layer='scaled', n_pcs=50)
adata.obsm["X_mde_pca"] = ov.utils.mde(adata.obsm["scaled|original|X_pca"])
# In[24]:
# Switch back to the full gene set stored in `.raw`.
adata = adata.raw.to_adata()
# In[25]:
# Draw the cells on the MDE embedding, coloured by cluster.
fig, ax = plt.subplots(figsize=(3, 3))
ov.utils.embedding(
    adata,
    basis='X_mde_pca',
    frameon='small',
    color=['clusters'],
    show=False,
    ax=ax,
)
# ## Meta-cells calculated
#
# To reduce the noise in the raw dataset and improve the accuracy of the regression model, we use `SEACells` to compute metacells.
# In[451]:
import SEACells

# Drop the 'Endothelial' cluster. `.copy()` makes the result a real AnnData
# object rather than a view of the unfiltered data; `adata` is modified again
# further down (e.g. `adata.raw = ...`), which on a view would trigger
# anndata's implicit copy and an ImplicitModificationWarning.
adata = adata[adata.obs['clusters'] != 'Endothelial'].copy()
# Configure the SEACells model: 200 metacells, kernel built on the PCA
# coordinates stored in `.obsm`.
model = SEACells.core.SEACells(
    adata,
    build_kernel_on='scaled|original|X_pca',
    n_SEACells=200,
    n_waypoint_eigs=10,
    convergence_epsilon=1e-5,
)
# In[452]:
model.construct_kernel_matrix()
M = model.kernel_matrix
# Initialize archetypes
model.initialize_archetypes()
# In[453]:
model.fit(min_iter=10, max_iter=50)
# The model may stop early; we can use `model.step` to force it to run additional iterations. Usually, about 100 iterations give the best metacells.
# In[454]:
# Check for convergence
def _enable_inline_plots():
    """Select matplotlib's inline backend when running under IPython.

    `get_ipython` is only defined inside IPython/Jupyter sessions. When this
    file is executed by a plain Python interpreter the magic is skipped
    instead of raising NameError.
    """
    try:
        shell = get_ipython()
    except NameError:
        return
    if shell is not None:
        shell.run_line_magic('matplotlib', 'inline')


_enable_inline_plots()
model.plot_convergence()
# In[489]:
# You can force the model to run additional iterations step-wise using the .step() function
print(f'Run for {len(model.RSS_iters)} iterations')
for _ in range(10):
    # The loop body must be indented; each call runs one extra iteration.
    model.step()
print(f'Run for {len(model.RSS_iters)} iterations')
# In[490]:
# Check for convergence
_enable_inline_plots()
model.plot_convergence()
# In[491]:
_enable_inline_plots()
SEACells.plot.plot_2D(
    adata,
    key='X_mde_pca',
    colour_metacells=False,
    figsize=(4, 4),
    cell_size=20,
    title='Dentategyrus Metacells',
)
# Note that the shape of the raw AnnData is not consistent with that of the HVG AnnData, so we reset `adata.raw` to a copy of the current `adata`.
# In[492]:
# Store a copy of the current AnnData as `.raw`; the 'raw' layer is what gets
# summarised in the next step.
adata.raw = adata.copy()
# Then we use `SEACells.core.summarize_by_soft_SEACell` to get the normalized metacells.
# In[493]:
SEACell_soft_ad = SEACells.core.summarize_by_soft_SEACell(
    adata,
    model.A_,
    celltype_label='clusters',
    summarize_layer='raw',
    minimum_weight=0.05,
)
SEACell_soft_ad
# We visualized the metacells with PCA and UMAP
# In[494]:
import scanpy as sc

# Keep the full metacell matrix in `.raw`, then flag the top 2000 HVGs.
SEACell_soft_ad.raw = SEACell_soft_ad.copy()
sc.pp.highly_variable_genes(SEACell_soft_ad, n_top_genes=2000, inplace=True)
# `.copy()` makes the HVG subset a real AnnData object instead of a view; the
# scaling / PCA / neighbours / UMAP calls below all modify it in place, which
# on a view triggers anndata's implicit copy and ImplicitModificationWarning.
SEACell_soft_ad = SEACell_soft_ad[:, SEACell_soft_ad.var.highly_variable].copy()
# In[495]:
ov.pp.scale(SEACell_soft_ad)
ov.pp.pca(SEACell_soft_ad, layer='scaled', n_pcs=50)
sc.pp.neighbors(SEACell_soft_ad, use_rep='scaled|original|X_pca')
sc.tl.umap(SEACell_soft_ad)
# We can also reuse the cluster colours of the original AnnData.
# In[496]:
# Give the metacell 'celltype' column the same category order as the original
# 'clusters' column, then copy the colour list across so both plots match.
meta_celltype = SEACell_soft_ad.obs['celltype'].astype('category')
cluster_order = adata.obs['clusters'].cat.categories
SEACell_soft_ad.obs['celltype'] = meta_celltype.cat.reorder_categories(cluster_order)
SEACell_soft_ad.uns['celltype_colors'] = adata.uns['clusters_colors']
# In[15]:
import matplotlib.pyplot as plt

# UMAP of the metacells coloured by cell type.
fig, ax = plt.subplots(figsize=(3, 3))
ov.utils.embedding(
    SEACell_soft_ad,
    basis='X_umap',
    color=["celltype"],
    title='Meta Celltype',
    frameon='small',
    legend_fontsize=12,
    ax=ax,
    show=False,
)
# ## Pseudotime calculated
#
# Accurately calculating the pseudotime of the metacells is another challenge we need to face; here we use pyVIA to calculate the pseudotime. Since there are only 200 metacells, the default parameters of pyVIA may not give proper pseudotime results, so we adjust the relevant parameters manually.
#
# We need to set `jac_std_global`, `too_big_factor` and `knn` manually. If you know the cells of origin, setting `root_user` is helpful too.
# In[ ]:
# Configure pyVIA on the metacells. `knn`, `jac_std_global` and
# `too_big_factor` are set by hand, and the root cell types are given
# explicitly through `root_user`.
v0 = ov.single.pyVIA(
    adata=SEACell_soft_ad,
    adata_key='scaled|original|X_pca',
    adata_ncomps=50,
    basis='X_umap',
    clusters='celltype',
    knn=10,
    root_user=['nIPC', 'Neuroblast'],
    dataset='group',
    random_seed=112,
    is_coarse=True,
    preserve_disconnected=True,
    piegraph_arrow_head_width=0.05,
    piegraph_edgeweight_scalingfactor=2.5,
    gene_matrix=SEACell_soft_ad.X,
    velo_weight=0.5,
    edgebundle_pruning_twice=False,
    edgebundle_pruning=0.15,
    jac_std_global=0.05,
    too_big_factor=0.05,
    cluster_graph_pruning_std=1,
    time_series=False,
)
v0.run()
# In[500]:
# Attach the pseudotime to the metacell AnnData (presumably as
# `obs['pt_via']`, the key used by the plots and models that follow).
v0.get_pseudotime(SEACell_soft_ad)
# In[17]:
import matplotlib.pyplot as plt

# UMAP of the metacells coloured by the pyVIA pseudotime.
fig, ax = plt.subplots(figsize=(3, 3))
ov.utils.embedding(
    SEACell_soft_ad,
    basis='X_umap',
    color=["pt_via"],
    title='Pseudotime',
    frameon='small',
    cmap='Reds',
    legend_fontsize=12,
    ax=ax,
    show=False,
)
# Now we save the metacell result for downstream analysis.
# In[502]:
from pathlib import Path

# Create the output directory up front so the write below does not fail on a
# fresh checkout where `data/` does not exist yet.
Path('data').mkdir(parents=True, exist_ok=True)
SEACell_soft_ad.write_h5ad('data/tutorial_meta_den.h5ad', compression='gzip')
# In[2]:
# Reload the saved metacells so the analysis below can start from the file.
SEACell_soft_ad = ov.utils.read('data/tutorial_meta_den.h5ad')
# ## Timing-associated genes analysis
#
# We have encapsulated the CellFateGenie algorithm in omicverse, so we can simply use omicverse to run the analysis.
# In[3]:
# Build the CellFateGenie object on the metacells, using the pyVIA pseudotime
# stored under 'pt_via', and initialise its model.
cfg_obj = ov.single.cellfategenie(SEACell_soft_ad, pseudotime='pt_via')
cfg_obj.model_init()
# We use Adaptive Threshold Regression (ATR) to find the minimum gene set that reaches the same accuracy as the regression model built on all genes.
# In[4]:
cfg_obj.ATR(stop=500, flux=0.01)
# In[5]:
fig, ax = cfg_obj.plot_filtering(color='#5ca8dc')
ax.set_title('Dentategyrus Metacells\nCellFateGenie')
# In[6]:
res = cfg_obj.model_fit()
# ## Visualization
#
# We provide a series of functions to visualize the result. We can use `plot_color_fitting` to observe how the different cell types transition along the pseudotime.
# In[7]:
# Fit plot for the 'raw' model, coloured by cell type.
cfg_obj.plot_color_fitting(type='raw', cluster_key='celltype')
# In[8]:
# Same plot for the 'filter' model.
cfg_obj.plot_color_fitting(type='filter', cluster_key='celltype')
# ## Kendalltau test
#
# We can further narrow down the set of genes that satisfy the maximum regression coefficient. We use the Kendall tau test to calculate the trend significance of each gene.
# In[9]:
kt_filter = cfg_obj.kendalltau_filter()
kt_filter.head()
# In[10]:
# Keep the genes whose p-value lies below the mean p-value of all tested genes.
pvalue_cutoff = kt_filter['pvalue'].mean()
below_cutoff = kt_filter['pvalue'] < pvalue_cutoff
var_name = kt_filter.loc[below_cutoff].index.tolist()
# Compute the trends of the selected genes along the 'pt_via' pseudotime.
gt_obj = ov.single.gene_trends(SEACell_soft_ad, 'pt_via', var_name)
gt_obj.calculate(n_convolve=10)
# In[11]:
print(f"Dimension: {len(var_name)}")
# In[12]:
fig, ax = gt_obj.plot_trend(color=ov.utils.blue_color[3])
ax.set_title('Dentategyrus meta\nCellfategenie', fontsize=13)
# In[14]:
# Heatmap of the selected genes with metacells ordered by pseudotime.
g = ov.utils.plot_heatmap(
    SEACell_soft_ad,
    var_names=var_name,
    sortby='pt_via',
    col_color='celltype',
    n_convolve=10,
    figsize=(1, 6),
    show=False,
)
g.fig.set_size_inches(2, 6)
g.fig.suptitle(
    'CellFateGenie',
    x=0.25,
    y=0.83,
    horizontalalignment='left',
    fontsize=12,
    fontweight='bold',
)
# Re-apply the existing y tick labels with a larger font size.
heatmap_ax = g.ax_heatmap
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=12)
plt.show()
# ## Fate Genes
#
# Unlike traditional pseudotime analyses, CellFateGenie can also identify key genes/gene sets during fate transitions.
# In[26]:
gt_obj.cal_border_cell(SEACell_soft_ad, 'pt_via', 'celltype')
# In[27]:
bordgene_dict = gt_obj.get_multi_border_gene(
    SEACell_soft_ad,
    'celltype',
    threshold=0.5,
)
# We use `Granule immature` and `Granule mature` as an example pair to compute the fate-related genes.
# In[30]:
fate_pair = ('Granule immature', 'Granule mature')
gt_obj.get_border_gene(SEACell_soft_ad, 'celltype', *fate_pair, threshold=0.5)
# Compared with `get_border_gene`, `get_special_border_gene` extracts the fate information that is exclusive to the two given cell types, and it is more stringent.
# In[33]:
gt_obj.get_special_border_gene(SEACell_soft_ad, 'celltype', *fate_pair)
# We can visualize these genes.
# In[36]:
import matplotlib.pyplot as plt

# Border genes between the two granule populations, shown as a heatmap with
# metacells ordered by pseudotime.
border_genes = gt_obj.get_border_gene(
    SEACell_soft_ad,
    'celltype',
    'Granule immature',
    'Granule mature',
)
g = ov.utils.plot_heatmap(
    SEACell_soft_ad,
    var_names=border_genes,
    sortby='pt_via',
    col_color='celltype',
    yticklabels=True,
    n_convolve=10,
    figsize=(1, 6),
    show=False,
)
g.fig.set_size_inches(2, 4)
heatmap_ax = g.ax_heatmap
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=12)
plt.show()
# Similarly, we can use `get_special_kernel_gene` or `get_kernel_gene` to obtain the cell-type-specific genes.
# In[37]:
gt_obj.get_special_kernel_gene(SEACell_soft_ad, 'celltype', 'Granule immature')
# In[42]:
gt_obj.get_kernel_gene(
    SEACell_soft_ad,
    'celltype',
    'Granule immature',
    threshold=0.3,
    num_gene=10,
)
# In[43]:
import matplotlib.pyplot as plt

# Kernel genes of 'Granule immature', shown as a heatmap with metacells
# ordered by pseudotime.
kernel_genes = gt_obj.get_kernel_gene(
    SEACell_soft_ad,
    'celltype',
    'Granule immature',
    threshold=0.3,
    num_gene=10,
)
g = ov.utils.plot_heatmap(
    SEACell_soft_ad,
    var_names=kernel_genes,
    sortby='pt_via',
    col_color='celltype',
    yticklabels=True,
    n_convolve=10,
    figsize=(1, 6),
    show=False,
)
g.fig.set_size_inches(2, 4)
heatmap_ax = g.ax_heatmap
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=12)
plt.show()
# In[ ]: