Merge remote-tracking branch 'origin/main'
Browse files
- geneformer/gene_name_id_dict_gc95M.pkl +2 -2
- geneformer/mtl/data.py +1 -1
- geneformer/tokenizer.py +17 -0
geneformer/gene_name_id_dict_gc95M.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fabfa0c2f49c598c59ae432a32c3499a5908c033756c663b5e0cddf58deea8e1
|
3 |
+
size 1660882
|
geneformer/mtl/data.py
CHANGED
@@ -112,7 +112,7 @@ def preload_and_process_data(config):
|
|
112 |
# Validate that the mappings match
|
113 |
validate_label_mappings(config)
|
114 |
|
115 |
-
return (*train_data, *val_data
|
116 |
|
117 |
|
118 |
def validate_label_mappings(config):
|
|
|
112 |
# Validate that the mappings match
|
113 |
validate_label_mappings(config)
|
114 |
|
115 |
+
return (*train_data[:2], *val_data) # Return train and val data along with mappings
|
116 |
|
117 |
|
118 |
def validate_label_mappings(config):
|
geneformer/tokenizer.py
CHANGED
@@ -88,6 +88,7 @@ def sum_ensembl_ids(
|
|
88 |
collapse_gene_ids,
|
89 |
gene_mapping_dict,
|
90 |
gene_token_dict,
|
|
|
91 |
file_format="loom",
|
92 |
chunk_size=512,
|
93 |
):
|
@@ -104,6 +105,13 @@ def sum_ensembl_ids(
|
|
104 |
"ensembl_id_collapsed" not in data.ra.keys()
|
105 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
# Get the ensembl ids that exist in data
|
109 |
ensembl_ids = data.ra.ensembl_id
|
@@ -208,6 +216,13 @@ def sum_ensembl_ids(
|
|
208 |
assert (
|
209 |
"ensembl_id_collapsed" not in data.var.columns
|
210 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
|
213 |
# Get the ensembl ids that exist in data
|
@@ -461,6 +476,7 @@ class TranscriptomeTokenizer:
|
|
461 |
self.collapse_gene_ids,
|
462 |
self.gene_mapping_dict,
|
463 |
self.gene_token_dict,
|
|
|
464 |
file_format="h5ad",
|
465 |
chunk_size=self.chunk_size,
|
466 |
)
|
@@ -537,6 +553,7 @@ class TranscriptomeTokenizer:
|
|
537 |
self.collapse_gene_ids,
|
538 |
self.gene_mapping_dict,
|
539 |
self.gene_token_dict,
|
|
|
540 |
file_format="loom",
|
541 |
chunk_size=self.chunk_size,
|
542 |
)
|
|
|
88 |
collapse_gene_ids,
|
89 |
gene_mapping_dict,
|
90 |
gene_token_dict,
|
91 |
+
custom_attr_name_dict,
|
92 |
file_format="loom",
|
93 |
chunk_size=512,
|
94 |
):
|
|
|
105 |
"ensembl_id_collapsed" not in data.ra.keys()
|
106 |
), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
|
107 |
|
108 |
+
assert (
|
109 |
+
"n_counts" in data.ca.keys()
|
110 |
+
), "'n_counts' column missing from data.ca.keys()"
|
111 |
+
|
112 |
+
if custom_attr_name_dict is not None:
|
113 |
+
for label in custom_attr_name_dict:
|
114 |
+
assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features"
|
115 |
|
116 |
# Get the ensembl ids that exist in data
|
117 |
ensembl_ids = data.ra.ensembl_id
|
|
|
216 |
assert (
|
217 |
"ensembl_id_collapsed" not in data.var.columns
|
218 |
), "'ensembl_id_collapsed' column already exists in data.var"
|
219 |
+
assert (
|
220 |
+
"n_counts" in data.obs.columns
|
221 |
+
), "'n_counts' column missing from data.obs"
|
222 |
+
|
223 |
+
if custom_attr_name_dict is not None:
|
224 |
+
for label in custom_attr_name_dict:
|
225 |
+
assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs"
|
226 |
|
227 |
|
228 |
# Get the ensembl ids that exist in data
|
|
|
476 |
self.collapse_gene_ids,
|
477 |
self.gene_mapping_dict,
|
478 |
self.gene_token_dict,
|
479 |
+
self.custom_attr_name_dict,
|
480 |
file_format="h5ad",
|
481 |
chunk_size=self.chunk_size,
|
482 |
)
|
|
|
553 |
self.collapse_gene_ids,
|
554 |
self.gene_mapping_dict,
|
555 |
self.gene_token_dict,
|
556 |
+
self.custom_attr_name_dict,
|
557 |
file_format="loom",
|
558 |
chunk_size=self.chunk_size,
|
559 |
)
|