Fixed issues
Browse files
- geneformer/tokenizer.py +12 -6
geneformer/tokenizer.py
CHANGED
@@ -167,9 +167,11 @@ class TranscriptomeTokenizer:
167
168   def tokenize_anndata(self, adata_file_path, target_sum=10_000, chunk_size=512):
169   adata = ad.read(adata_file_path, backed="r")
170 -
171 -
172 -
173
174   coding_miRNA_loc = np.where(
175   [self.genelist_dict.get(i, False) for i in adata.var["ensembl_id"]]
@@ -208,7 +210,8 @@ class TranscriptomeTokenizer:
208   idx = filter_pass_loc[i:i+chunk_size]
209   X = adata[idx].X
210
211 -
212   X_norm = sp.csr_matrix(X_norm)
213
214   tokenized_cells += [
@@ -217,8 +220,11 @@ class TranscriptomeTokenizer:
217   ]
218
219   # add custom attributes for subview to dict
220 -
221 -
222
223   return tokenized_cells, file_cell_metadata
224
167
168   def tokenize_anndata(self, adata_file_path, target_sum=10_000, chunk_size=512):
169   adata = ad.read(adata_file_path, backed="r")
170 +
171 + if self.custom_attr_name_dict is not None:
172 + file_cell_metadata = {
173 + attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
174 + }
175
176   coding_miRNA_loc = np.where(
177   [self.genelist_dict.get(i, False) for i in adata.var["ensembl_id"]]
210   idx = filter_pass_loc[i:i+chunk_size]
211   X = adata[idx].X
212
213 + X_view = X[:, coding_miRNA_loc]
214 + X_norm = (X_view / X_view.sum(axis=1) * target_sum / norm_factor_vector)
215   X_norm = sp.csr_matrix(X_norm)
216
217   tokenized_cells += [
220   ]
221
222   # add custom attributes for subview to dict
223 + if self.custom_attr_name_dict is not None:
224 + for k in file_cell_metadata.keys():
225 + file_cell_metadata[k] += adata[idx].obs[k].tolist()
226 + else:
227 + file_cell_metadata = None
228
229   return tokenized_cells, file_cell_metadata
230