#!/usr/bin/env python
# coding: utf-8

# ## Using scMulan to annotate cell types in Heart, Lung, Liver, Bone marrow, Blood, Brain, and Thymus

# In this study, the authors enrich the pre-training paradigm by integrating an abundance of metadata and a multiplicity of pre-training tasks, and obtain scMulan, a multitask generative pre-trained language model tailored for single-cell analysis. They represent a cell as a structured cell sentence (c-sentence) by encoding its gene expression, metadata terms, and target tasks as words of tuples, each consisting of entities and their corresponding values. They construct a unified generative framework to model the cell language on c-sentence and design three pretraining tasks to bridge the microscopic and macroscopic information within the c-sentences. They pre-train scMulan on 10 million single-cell transcriptomic data and their corresponding metadata, with 368 million parameters. As a single model, scMulan can accomplish tasks zero-shot for cell type annotation, batch integration, and conditional cell generation, guided by different task prompts.

# #### We provide a liver dataset (a 20% sample) from Suo C, 2022 (doi: 10.1126/science.abo0510)

# **Paper:** [scMulan: a multitask generative pre-trained language model for single-cell analysis](https://www.biorxiv.org/content/10.1101/2024.01.25.577152v1)
#
# **Data download:** https://cloud.tsinghua.edu.cn/f/45a7fd2a27e543539f59/?dl=1
#
# **Pre-trained model download:** https://cloud.tsinghua.edu.cn/f/2250c5df51034b2e9a85/?dl=1
#
# If you found this tutorial helpful, please cite scMulan and OmicVerse:
#
# Bian H, Chen Y, Dong X, et al. scMulan: a multitask generative pre-trained language model for single-cell analysis[C]//International Conference on Research in Computational Molecular Biology. Cham: Springer Nature Switzerland, 2024: 479-482.
# In[36]:


import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # uncomment to run on CPU only

import scanpy as sc
import omicverse as ov
from scipy.sparse import csc_matrix

ov.plot_set()
# import scMulan
# from scMulan import GeneSymbolUniform


# ## 1. load h5ad
# You can download the liver dataset from the following link:
# https://cloud.tsinghua.edu.cn/f/45a7fd2a27e543539f59/?dl=1
#
# It's recommended that you use an h5ad file with raw counts here
# (after your own QC).

# In[4]:


adata = sc.read('./data/liver_test.h5ad')


# In[5]:


# Bare expression: shows the object summary in a notebook; it has no effect
# when this file is run as a plain script.
adata


# In[6]:


# Store the expression matrix as a SciPy CSC sparse matrix.
# NOTE(review): presumably the format GeneSymbolUniform expects - confirm.
adata.X = csc_matrix(adata.X)


# ## 2. transform original h5ad with uniformed genes (42117 genes)

# This step transforms the genes in the input adata to 42117 gene symbols and
# preserves the corresponding gene expression values. The gene symbols are the
# same as those used by the pre-trained scMulan model.

# In[7]:


adata_GS_uniformed = ov.externel.scMulan.GeneSymbolUniform(
    input_adata=adata,
    output_dir="./data",
    output_prefix='liver',
)


# ## 3. process uniformed data (simply norm and log1p)

# In[8]:


# You can also read the saved uniformed adata back from disk.
adata_GS_uniformed = sc.read_h5ad('./data/liver_uniformed.h5ad')


# In[9]:


adata_GS_uniformed


# In[10]:


# Normalize and log1p the count matrix.
# In some cases the count matrix has not been normalized and log1p has not
# been applied yet; a maximum value above 10 is used here as the sign of that,
# so both steps are applied only in that case.
if adata_GS_uniformed.X.max() > 10:
    sc.pp.normalize_total(adata_GS_uniformed, target_sum=1e4)
    sc.pp.log1p(adata_GS_uniformed)


# ## 4. load scMulan

# In[11]:


# You should first download the checkpoint from
# https://cloud.tsinghua.edu.cn/f/2250c5df51034b2e9a85/?dl=1
# and put it at ./ckpt/ckpt_scMulan.pt, e.g.:
#   wget "https://cloud.tsinghua.edu.cn/f/2250c5df51034b2e9a85/?dl=1" -O ckpt/ckpt_scMulan.pt

ckp_path = './ckpt/ckpt_scMulan.pt'


# In[12]:


scml = ov.externel.scMulan.model_inference(ckp_path, adata_GS_uniformed)
# NOTE(review): base_process is not read again in the visible part of this
# script; kept because the call itself may matter - confirm.
base_process = scml.cuda_count()


# In[13]:


scml.get_cell_types_and_embds_for_adata(parallel=True, n_process=1)
# scml.get_cell_types_and_embds_for_adata(parallel=False)  # CPU only, but it is really slow.


# The predicted cell types are stored in
# scml.adata.obs['cell_type_from_scMulan'], and the cell embeddings (for
# multi-batch integration) in scml.adata.obsm['X_scMulan'] (not used in this
# tutorial).

# ## 5. visualization
#
# Here, we visualize the cell types predicted by scMulan, and also the
# original cell types in the dataset.

# In[14]:


# Work on a copy so the plotting steps below do not modify scml.adata.
adata_mulan = scml.adata.copy()


# In[15]:


# Calculate the 2-D embedding of the adata using pyMDE.
ov.pp.scale(adata_mulan)
ov.pp.pca(adata_mulan)

# sc.pl.pca_variance_ratio(adata_mulan)
ov.pp.mde(
    adata_mulan,
    embedding_dim=2,
    n_neighbors=15,
    basis='X_mde',
    n_pcs=10,
    use_rep='scaled|original|X_pca',
)


# In[26]:


# Here, we can see the cell type annotation from scMulan.
ov.pl.embedding(
    adata_mulan,
    basis='X_mde',
    color=["cell_type_from_scMulan"],
    ncols=1,
    frameon='small',
)


# In[29]:


# Expose the MDE coordinates under the 'X_umap' key as well.
# NOTE(review): presumably the scMulan helpers below look up 'X_umap' - confirm.
adata_mulan.obsm['X_umap'] = adata_mulan.obsm['X_mde']


# In[30]:


# You can run the smoothing function to filter out false positives.
ov.externel.scMulan.cell_type_smoothing(adata_mulan, threshold=0.1)


# In[31]:


# cell_type_from_mulan_smoothing: prediction + smoothing
# cell_type: original annotations by the authors
ov.pl.embedding(
    adata_mulan,
    basis='X_mde',
    color=["cell_type_from_mulan_smoothing", "cell_type"],
    ncols=1,
    frameon='small',
)


# In[32]:


adata_mulan


# In[33]:


# First 20 entries of the value_counts() index for the predicted labels.
top_celltypes = adata_mulan.obs.cell_type_from_scMulan.value_counts().index[:20]


# In[34]:


# You can select some cell types of interest (from scMulan's prediction) for
# visualization.
# selected_cell_types = ["NK cell", "Kupffer cell", "Conventional dendritic cell 2"]  # as example
selected_cell_types = top_celltypes
ov.externel.scMulan.visualize_selected_cell_types(
    adata_mulan,
    selected_cell_types,
    smoothing=True,
)


# In[ ]: