lixiangchun
committed on
Commit
·
70cd2c0
1
Parent(s):
dc0bd2a
update README
Browse files
README.md
CHANGED
@@ -2,9 +2,10 @@
|
|
2 |
# iSEEEK
|
3 |
A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
|
4 |
|
5 |
-
```python
|
6 |
## A simple pipeline for single-cell analysis
|
|
|
7 |
import torch
|
|
|
8 |
import re
|
9 |
from tqdm import tqdm
|
10 |
import numpy as np
|
@@ -31,8 +32,8 @@ model.eval()
|
|
31 |
|
32 |
|
33 |
## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
|
34 |
-
lines = [s.strip() for s in gzip.open("pbmc_ranking.txt.gz")]
|
35 |
-
labels = [s.strip() for s in gzip.open("pbmc_label.txt.gz")]
|
36 |
labels = np.asarray(labels)
|
37 |
|
38 |
|
@@ -66,3 +67,31 @@ sc.pl.umap(adata, color=['celltype','leiden'],save= "UMAP")
|
|
66 |
|
67 |
```
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
# iSEEEK
|
3 |
A universal approach for integrating super large-scale single-cell transcriptomes by exploring gene rankings
|
4 |
|
|
|
5 |
## A simple pipeline for single-cell analysis
|
6 |
+
```python
|
7 |
import torch
|
8 |
+
import gzip
|
9 |
import re
|
10 |
from tqdm import tqdm
|
11 |
import numpy as np
|
|
|
32 |
|
33 |
|
34 |
## Data deposited in https://huggingface.co/TJMUCH/transcriptome-iseeek/tree/main
|
35 |
+
lines = [s.strip().decode() for s in gzip.open("pbmc_ranking.txt.gz")]
|
36 |
+
labels = [s.strip().decode() for s in gzip.open("pbmc_label.txt.gz")]
|
37 |
labels = np.asarray(labels)
|
38 |
|
39 |
|
|
|
67 |
|
68 |
```
|
69 |
|
70 |
+
## Extract token representations
|
71 |
+
```python
|
72 |
+
|
73 |
+
cell_counts = len(lines)
|
74 |
+
x = np.zeros((cell_counts, len(tokenizer)), dtype=np.float16)
|
75 |
+
|
76 |
+
for a in tqdm(dl, total=len(dl)):
|
77 |
+
batch = tokenizer(a, max_length=128, truncation=True,
|
78 |
+
padding=True, return_tensors="pt")
|
79 |
+
|
80 |
+
for k, v in batch.items():
|
81 |
+
batch[k] = v.to(device)
|
82 |
+
|
83 |
+
with torch.no_grad():
|
84 |
+
out = model(**batch)
|
85 |
+
|
86 |
+
eos_idxs = batch.attention_mask.sum(dim=1) - 1
|
87 |
+
f = out.last_hidden_state
|
88 |
+
batch_size = f.shape[0]
|
89 |
+
input_ids = batch.input_ids
|
90 |
+
|
91 |
+
for i in range(batch_size):
|
92 |
+
##genes = tokenizer.batch_decode(input_ids[i])
|
93 |
+
token_norms = [f[i][j].norm().item() for j in range(1, eos_idxs[i])]
|
94 |
+
idxs = input_ids[i].tolist()[1:eos_idxs[i]]
|
95 |
+
x[counter, idxs] = token_norms
|
96 |
+
counter = counter + 1
|
97 |
+
```
|