Christina Theodoris committed
Commit 624349c
1 Parent(s): 4302f48

Add option to output embs as tensor
examples/extract_and_plot_cell_embeddings.ipynb
CHANGED
@@ -29,6 +29,7 @@
 " nproc=16)\n",
 "\n",
 "# extracts embedding from input data\n",
+"# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
 "# example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
 "embs = embex.extract_embs(\"../fine_tuned_models/geneformer-6L-30M_CellClassifier_cardiomyopathies_220224\",\n",
 " \"path/to/input_data/\",\n",
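The comment added above points to tokenizing_scRNAseq_data.ipynb for producing this input. For orientation, a hedged sketch of that prerequisite step (TranscriptomeTokenizer is exported by the geneformer package, but the exact arguments may differ by version and the paths are placeholders):

from geneformer import TranscriptomeTokenizer

# tokenize raw single-cell data into the rank value encoding dataset
# that extract_embs consumes; arguments here are illustrative
tk = TranscriptomeTokenizer(nproc=16)
tk.tokenize_data("path/to/raw_data/",
                 "path/to/tokenized_output/",
                 "output_prefix")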
geneformer/emb_extractor.py
CHANGED
@@ -40,7 +40,7 @@ import seaborn as sns
 import torch
 from collections import Counter
 from pathlib import Path
-from tqdm.notebook import trange
+from tqdm.auto import trange
 from transformers import BertForMaskedLM, BertForTokenClassification, BertForSequenceClassification
 
 from .tokenizer import TOKEN_DICTIONARY_FILE
@@ -64,7 +64,6 @@ def get_embs(model,
              pad_token_id,
              forward_batch_size,
              summary_stat):
-
     model_input_size = get_model_input_size(model)
     total_batch_length = len(filtered_input_data)
 
@@ -138,7 +137,7 @@ def test_emb(model, example, layer_to_quant):
     return embs_test.size()[2]
 
 def label_embs(embs, downsampled_data, emb_labels):
-    embs_df = pd.DataFrame(embs.cpu())
+    embs_df = pd.DataFrame(embs.cpu().numpy())
     if emb_labels is not None:
         for label in emb_labels:
             emb_label = downsampled_data[label]
@@ -367,7 +366,8 @@ class EmbExtractor:
                      model_directory,
                      input_data_file,
                      output_directory,
-                     output_prefix):
+                     output_prefix,
+                     output_torch_embs=False):
         """
         Extract embeddings from input data and save as results in output_directory.
 
@@ -381,6 +381,9 @@ class EmbExtractor:
             Path to directory where embedding data will be saved as csv
         output_prefix : str
             Prefix for output file
+        output_torch_embs : bool
+            Whether or not to also output the embeddings as a tensor.
+            Note, if true, will output embeddings as both dataframe and tensor.
         """
 
         filtered_input_data = load_and_filter(self.filter_data, self.nproc, input_data_file)
@@ -398,13 +401,16 @@ class EmbExtractor:
         if self.summary_stat is None:
             embs_df = label_embs(embs, downsampled_data, self.emb_label)
         elif self.summary_stat is not None:
-            embs_df = pd.DataFrame(embs.cpu()).T
+            embs_df = pd.DataFrame(embs.cpu().numpy()).T
 
         # save embeddings to output_path
         output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
         embs_df.to_csv(output_path)
-
-
+
+        if output_torch_embs == True:
+            return embs_df, embs
+        else:
+            return embs_df
 
     def plot_embs(self,
                   embs,
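With this change, extract_embs still writes the CSV but can additionally return the embeddings as a torch tensor. A minimal usage sketch based on the diff above; the EmbExtractor constructor takes more options than shown (only nproc appears in this commit's notebook snippet), and the path arguments are placeholders:

from geneformer import EmbExtractor

# initialize as in the example notebook; remaining constructor
# arguments (model type, filters, labels, etc.) are omitted here
embex = EmbExtractor(nproc=16)

# with output_torch_embs=True, the method returns both the dataframe
# that was saved to csv and the raw torch tensor of embeddings
embs_df, embs_tensor = embex.extract_embs(
    "../fine_tuned_models/geneformer-6L-30M_CellClassifier_cardiomyopathies_220224",
    "path/to/input_data/",
    "path/to/output_directory/",
    "output_prefix",
    output_torch_embs=True,
)

(A plain "if output_torch_embs:" would be the more idiomatic truthiness check than "== True"; the comparison above reflects what the commit actually adds.)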
geneformer/in_silico_perturber.py
CHANGED
@@ -34,7 +34,7 @@ import seaborn as sns; sns.set()
 import torch
 from collections import defaultdict
 from datasets import Dataset, load_from_disk
-from tqdm.notebook import trange
+from tqdm.auto import trange
 from transformers import BertForMaskedLM, BertForTokenClassification, BertForSequenceClassification
 
 from .tokenizer import TOKEN_DICTIONARY_FILE
geneformer/in_silico_perturber_stats.py
CHANGED
@@ -27,7 +27,7 @@ import statsmodels.stats.multitest as smt
 from pathlib import Path
 from scipy.stats import ranksums
 from sklearn.mixture import GaussianMixture
-from tqdm.notebook import trange, tqdm
+from tqdm.auto import trange, tqdm
 
 from .in_silico_perturber import flatten_list
 
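All three modules now take their progress bars from tqdm.auto instead of a single fixed frontend. tqdm.auto picks the renderer at import time (widget-based bars inside Jupyter, plain-text bars in a terminal), so the same library code displays progress sensibly in both environments. A standalone illustration:

from tqdm.auto import tqdm, trange

# trange(n) is shorthand for tqdm(range(n)); in a notebook these
# render as widget bars, in a terminal as text bars
for step in trange(3):
    pass

for record in tqdm(["a", "b", "c"]):
    pass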
setup.py
CHANGED
@@ -16,6 +16,7 @@ setup(
         "datasets",
         "loompy",
         "numpy",
+        "tdigest",
         "transformers",
     ],
 )
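The new tdigest requirement plausibly supports the summary_stat path in emb_extractor.py touched above, since a t-digest can summarize a stream of values without holding all of them in memory; that connection is an inference from this diff rather than something it states. The library's core calls, for reference:

from tdigest import TDigest

# a t-digest keeps a compact sketch of a value stream so quantiles
# (e.g., a running median) can be estimated without storing every value
digest = TDigest()
digest.batch_update([0.1, 0.4, 0.35, 0.8])
print(digest.percentile(50))  # approximate median of the values seen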