# iSEEEK
A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings.

## A simple pipeline for single-cell analysis

The snippet below embeds PBMC cells from their gene-ranking strings with the pretrained iSEEEK encoder, then clusters and visualizes the cell embeddings with Scanpy.

```python
import gzip
import re

import numpy as np
import scanpy as sc
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast, BertForMaskedLM


class LineDataset(Dataset):
    """Serves one ranking string per cell; '-' and '.' in gene symbols
    are replaced with '_' to match the tokenizer vocabulary."""

    def __init__(self, lines):
        self.lines = lines
        self.regex = re.compile(r'\-|\.')

    def __getitem__(self, i):
        return self.regex.sub('_', self.lines[i])

    def __len__(self):
        return len(self.lines)


device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_num_threads(2)

tokenizer = PreTrainedTokenizerFast.from_pretrained("TJMUCH/transcriptome-iseeek")
model = BertForMaskedLM.from_pretrained("TJMUCH/transcriptome-iseeek").bert
model = model.to(device)
model.eval()

# Data deposited at https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
lines = [s.strip().decode() for s in gzip.open("pbmc_ranking.txt.gz")]
labels = [s.strip().decode() for s in gzip.open("pbmc_label.txt.gz")]
labels = np.asarray(labels)

ds = LineDataset(lines)
dl = DataLoader(ds, batch_size=80)

features = []
for a in tqdm(dl, total=len(dl)):
    batch = tokenizer(a, max_length=128, truncation=True,
                      padding=True, return_tensors="pt")
    for k, v in batch.items():
        batch[k] = v.to(device)
    with torch.no_grad():
        out = model(**batch)
    # The [CLS] embedding is used as the cell representation.
    f = out.last_hidden_state[:, 0, :]
    features.extend(f.cpu().tolist())

features = np.stack(features)

adata = sc.AnnData(features)
adata.obs['celltype'] = labels
adata.obs.celltype = adata.obs.celltype.astype("category")
sc.pp.neighbors(adata, use_rep='X')
sc.tl.umap(adata)
sc.tl.leiden(adata)
sc.pl.umap(adata, color=['celltype', 'leiden'], save="UMAP")
```

## Extract token representations

Beyond the [CLS] cell embedding, the per-gene token embeddings can be recovered as well. Here the L2 norm of each gene token's final-layer embedding is stored in a cells × vocabulary matrix `x`:

```python
cell_counts = len(lines)
x = np.zeros((cell_counts, len(tokenizer)), dtype=np.float16)
counter = 0  # row index into x

for a in tqdm(dl, total=len(dl)):
    batch = tokenizer(a, max_length=128, truncation=True,
                      padding=True, return_tensors="pt")
    for k, v in batch.items():
        batch[k] = v.to(device)
    with torch.no_grad():
        out = model(**batch)
    # Position of [SEP] in each sequence (last non-padding token).
    eos_idxs = batch.attention_mask.sum(dim=1) - 1
    f = out.last_hidden_state
    batch_size = f.shape[0]
    input_ids = batch.input_ids
    for i in range(batch_size):
        # genes = tokenizer.batch_decode(input_ids[i])
        eos = eos_idxs[i].item()
        # Norm of each gene token embedding, excluding [CLS] and [SEP].
        token_norms = [f[i][j].norm().item() for j in range(1, eos)]
        idxs = input_ids[i].tolist()[1:eos]
        x[counter, idxs] = token_norms
        counter += 1
```
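## Inspect the token representations

Since the columns of `x` align with token ids, the scores can be mapped back to gene symbols through the tokenizer vocabulary. The sketch below is a hedged usage example, not part of the original pipeline: the cell-type label `"B"` is hypothetical, so substitute a label that actually occurs in `pbmc_label.txt.gz`.

```python
import numpy as np

# token -> id mapping from the pretrained tokenizer; invert it to name the columns of x
vocab = tokenizer.get_vocab()
id2token = {i: t for t, i in vocab.items()}

# "B" is a hypothetical cell-type label; use one present in your labels array
mask = labels == "B"
mean_norms = x[mask].mean(axis=0)

# Genes with the largest average token norms for this cell type
top_ids = np.argsort(-mean_norms)[:20]
print([id2token[i] for i in top_ids])
```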
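## Evaluate the clustering

As an optional sanity check (not part of the original pipeline), the agreement between the Leiden clusters and the annotated cell types can be quantified with the adjusted Rand index from scikit-learn:

```python
from sklearn.metrics import adjusted_rand_score

# Compare unsupervised Leiden clusters against the provided annotations
ari = adjusted_rand_score(adata.obs["celltype"], adata.obs["leiden"])
print(f"Adjusted Rand index: {ari:.3f}")
```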
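## Prepare ranking strings from a count matrix

The pipeline above reads pre-computed ranking strings (`pbmc_ranking.txt.gz`). To apply iSEEEK to your own data you need one such string per cell. The sketch below reflects an assumption about the expected format, inferred from the tokenization settings (whitespace-separated gene symbols in decreasing expression order, roughly the top 126 genes so the sequence fits `max_length=128` with `[CLS]` and `[SEP]`); consult the iSEEEK paper for the exact preprocessing.

```python
import numpy as np
import scipy.sparse as sp

def rank_cells(adata, top_k=126):
    """Sketch: one space-separated string of gene symbols per cell,
    ordered by decreasing expression (assumed input format)."""
    X = adata.X.toarray() if sp.issparse(adata.X) else np.asarray(adata.X)
    genes = adata.var_names.to_numpy()
    lines = []
    for row in X:
        order = np.argsort(-row)                 # highest expression first
        order = order[row[order] > 0][:top_k]    # drop unexpressed genes
        lines.append(" ".join(genes[order]))
    return lines
```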