Christina Theodoris committed on
Commit
5760b30
·
2 Parent(s): bfcada4 c81f6f9

Merge remote-tracking branch 'origin/main'

Browse files
geneformer/gene_name_id_dict_gc95M.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b0fd0521406ed18b2e341ef0acb5f53aa1a62457a07ca5840e1c142f46dd326
3
- size 2038812
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fabfa0c2f49c598c59ae432a32c3499a5908c033756c663b5e0cddf58deea8e1
3
+ size 1660882
geneformer/mtl/data.py CHANGED
@@ -112,7 +112,7 @@ def preload_and_process_data(config):
112
  # Validate that the mappings match
113
  validate_label_mappings(config)
114
 
115
- return (*train_data, *val_data[:2]) # Return train and val data along with mappings
116
 
117
 
118
  def validate_label_mappings(config):
 
112
  # Validate that the mappings match
113
  validate_label_mappings(config)
114
 
115
+ return (*train_data[:2], *val_data) # Return train and val data along with mappings
116
 
117
 
118
  def validate_label_mappings(config):
geneformer/tokenizer.py CHANGED
@@ -88,6 +88,7 @@ def sum_ensembl_ids(
88
  collapse_gene_ids,
89
  gene_mapping_dict,
90
  gene_token_dict,
 
91
  file_format="loom",
92
  chunk_size=512,
93
  ):
@@ -104,6 +105,13 @@ def sum_ensembl_ids(
104
  "ensembl_id_collapsed" not in data.ra.keys()
105
  ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
106
 
 
 
 
 
 
 
 
107
 
108
  # Get the ensembl ids that exist in data
109
  ensembl_ids = data.ra.ensembl_id
@@ -208,6 +216,13 @@ def sum_ensembl_ids(
208
  assert (
209
  "ensembl_id_collapsed" not in data.var.columns
210
  ), "'ensembl_id_collapsed' column already exists in data.var"
 
 
 
 
 
 
 
211
 
212
 
213
  # Get the ensembl ids that exist in data
@@ -461,6 +476,7 @@ class TranscriptomeTokenizer:
461
  self.collapse_gene_ids,
462
  self.gene_mapping_dict,
463
  self.gene_token_dict,
 
464
  file_format="h5ad",
465
  chunk_size=self.chunk_size,
466
  )
@@ -537,6 +553,7 @@ class TranscriptomeTokenizer:
537
  self.collapse_gene_ids,
538
  self.gene_mapping_dict,
539
  self.gene_token_dict,
 
540
  file_format="loom",
541
  chunk_size=self.chunk_size,
542
  )
 
88
  collapse_gene_ids,
89
  gene_mapping_dict,
90
  gene_token_dict,
91
+ custom_attr_name_dict,
92
  file_format="loom",
93
  chunk_size=512,
94
  ):
 
105
  "ensembl_id_collapsed" not in data.ra.keys()
106
  ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
107
 
108
+ assert (
109
+ "n_counts" in data.ca.keys()
110
+ ), "'n_counts' column missing from data.ca.keys()"
111
+
112
+ if custom_attr_name_dict is not None:
113
+ for label in custom_attr_name_dict:
114
+ assert label in data.ca.keys(), f"Attribute `{label}` not present in dataset features"
115
 
116
  # Get the ensembl ids that exist in data
117
  ensembl_ids = data.ra.ensembl_id
 
216
  assert (
217
  "ensembl_id_collapsed" not in data.var.columns
218
  ), "'ensembl_id_collapsed' column already exists in data.var"
219
+ assert (
220
+ "n_counts" in data.obs.columns
221
+ ), "'n_counts' column missing from data.obs"
222
+
223
+ if custom_attr_name_dict is not None:
224
+ for label in custom_attr_name_dict:
225
+ assert label in data.obs.columns, f"Attribute `{label}` not present in data.obs"
226
 
227
 
228
  # Get the ensembl ids that exist in data
 
476
  self.collapse_gene_ids,
477
  self.gene_mapping_dict,
478
  self.gene_token_dict,
479
+ self.custom_attr_name_dict,
480
  file_format="h5ad",
481
  chunk_size=self.chunk_size,
482
  )
 
553
  self.collapse_gene_ids,
554
  self.gene_mapping_dict,
555
  self.gene_token_dict,
556
+ self.custom_attr_name_dict,
557
  file_format="loom",
558
  chunk_size=self.chunk_size,
559
  )