Spaces:

LofiAmazon
/

LofiAmazonSpace

Sleeping

App Files Files Community

jennzhuge committed on Jun 2, 2024

Commit

e86736e

1 Parent(s): a83006f

hi

Browse files

Files changed (2) hide show

app.py +39 -13
xgboost_infer.py → infer.py +7 -0

app.py CHANGED Viewed

@@ -1,25 +1,51 @@
 import json
 import gradio as gr
 with open("default_inputs.json", "r") as default_inputs_file:
     DEFAULT_INPUTS = json.load(default_inputs_file)
 def set_default_inputs():
     """Return the default (dna_sequence, latitude, longitude) tuple read from DEFAULT_INPUTS."""
     # DEFAULT_INPUTS is the module-level mapping loaded from default_inputs.json.
     return (DEFAULT_INPUTS["dna_sequence"],
             DEFAULT_INPUTS["latitude"],
             DEFAULT_INPUTS["longitude"])
 def predict_genus():
-    dna_df = pd.read_csv(dna_file.name)
-    dnaenv_df = pd.read_csv(dnaenv_file.name)
     results = []
-    # envdna_genuses = predict_genus_dna_env(dnaenv_df)
-    # dna_genuses = predict_genus_dna(dna_df)
-    # images = [get_genus_image(genus) for genus in top_5_genuses]
     genuses = xgboost_infer.infer()
@@ -38,11 +64,11 @@ with gr.Blocks() as demo:
     gr.Markdown("Welcome to Lofi Amazon Beats' DNA Identifier Tool")
     with gr.Tab("Genus Prediction"):
-        gr.Markdown("Input a DNA sequence and the coordinates at which its sample was taken to predict the genus of the DNA. Click 'I'm feeling lucky' to see our predictio for a random sequence.")
         # Collect inputs for app (DNA and location)
         with gr.Row():
-            inp_dna = gr.Textbox(label="DNA", placeholder="e.g. AACAATGTA... (will be automatically truncated to 660 characters)")
         with gr.Row():
             inp_lat = gr.Textbox(label="Latitude", placeholder="e.g. -3.009083")
             inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")
@@ -57,11 +83,11 @@ with gr.Blocks() as demo:
             gr.Markdown('Make plot or table for Top 5 species')
         with gr.Column():
-            genus_out = gr.Dataframe(headers=["DNA", "Coord", "DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
-            btn_run.click(predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)
     with gr.Tab('DNA Embedding Space Similarity Visualizer'):
-        gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples or clues.")
 demo.launch()

 import json
+import pandas as pd
 import gradio as gr
+from transformers import PreTrainedTokenizerFast, BertForMaskedLM
+from datasets import load_dataset
+import xgboost_infer
+embeddings_train = load_dataset("LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon", split='train').to_pandas()
 with open("default_inputs.json", "r") as default_inputs_file:
     DEFAULT_INPUTS = json.load(default_inputs_file)
 def set_default_inputs():
     """Return the default (dna_sequence, latitude, longitude) tuple read from DEFAULT_INPUTS."""
     # DEFAULT_INPUTS is the module-level mapping loaded from default_inputs.json.
     return (DEFAULT_INPUTS["dna_sequence"],
             DEFAULT_INPUTS["latitude"],
             DEFAULT_INPUTS["longitude"])
+def preprocess():
+    ''' prepares app input for the genus prediction model
+
+    Steps, in order: clean the DNA sequence (non-ACGT symbols become N,
+    trailing Ns are dropped, the sequence is split into space-separated
+    chunks of 4 characters), load a tokenizer and BERT model, request an
+    embedding, pair it with the (lat, lng) coordinates, and left-merge the
+    result with the "LofiAmazon/Global-Ecolayers" dataset on 'coord'.
+
+    Returns the merged `data` object.
+
+    NOTE(review): the function takes no arguments yet reads `inp_dna`,
+    `inp_lat`, `inp_lng` and `model`; see the inline notes below.
+    '''
+    # preprocess DNA seq
+    # NOTE(review): inp_dna is assigned inside this function, so Python treats
+    # it as a local name; this first read happens before any assignment. The
+    # inputs were presumably meant to be parameters - TODO confirm.
+    # NOTE(review): .str and .apply are pandas Series accessors - presumably
+    # inp_dna is expected to be a Series rather than a plain string; verify
+    # against the caller.
+    # Replace all symbols in nucraw which are not A, C, G, T with N
+    inp_dna = inp_dna.str.replace("[^ACGT]", "N", regex=True)
+    # Truncate trailing Ns from nucraw
+    inp_dna = inp_dna.str.replace("N+$", "", regex=True)
+    # Insert spaces between all k-mers
+    # (non-overlapping chunks of 4 characters; the last chunk may be shorter)
+    inp_dna = inp_dna.apply(lambda x: " ".join([x[i:i+4] for i in range(0, len(x), 4)]))
+    # load model to calculate new embeddings
+    # NOTE(review): `model` is not defined in this function; presumably a
+    # module-level checkpoint name that is not visible here - confirm.
+    # NOTE(review): force_download=True presumably bypasses the local cache on
+    # every call - confirm this is intended for a per-request code path.
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(model, force_download=True)
+    # The padding token is registered as the literal string "<UNK>".
+    tokenizer.add_special_tokens({"pad_token": "<UNK>"})
+    bert_model = BertForMaskedLM.from_pretrained(model, force_download=True)
+    # NOTE(review): `predic` looks like a typo, and `tokenizer` is never used
+    # after being configured - confirm which call is meant to produce the
+    # embedding and whether the input should be tokenized first.
+    embed = bert_model.predic(inp_dna)
+    # format lat and lon into coords
+    # NOTE(review): inp_lat / inp_lng are not defined in this function;
+    # presumably the module-level inputs - confirm they hold values here.
+    coords = (inp_lat, inp_lng)
+    # Grab rasters from the tifs
+    # NOTE(review): no split is selected and there is no .to_pandas()
+    # conversion here, unlike the module-level embeddings_train load - confirm
+    # pd.merge accepts this object.
+    ecoLayers = load_dataset("LofiAmazon/Global-Ecolayers")
+    # NOTE(review): a two-item list is passed as the row data - confirm this
+    # yields a single row with 'coord' and 'embeddings' columns as intended.
+    temp = pd.DataFrame([coords, embed], columns = ['coord', 'embeddings'])
+    data = pd.merge(temp, ecoLayers, on='coord', how='left')
+    return data
 def predict_genus():
+    data = preprocess()
+    out = xgboost_infer.infer_dna(data)
     results = []
     genuses = xgboost_infer.infer()
     gr.Markdown("Welcome to Lofi Amazon Beats' DNA Identifier Tool")
     with gr.Tab("Genus Prediction"):
+        gr.Markdown("Enter a DNA sequence and the coordinates at which its sample was taken to get a genus prediction. Click 'I'm feeling lucky' to see a prediction for a random sequence.")
         # Collect inputs for app (DNA and location)
         with gr.Row():
+            inp_dna = gr.Textbox(label="DNA", placeholder="e.g. AACAATGTA... (min 200 and max 660 characters)")
         with gr.Row():
             inp_lat = gr.Textbox(label="Latitude", placeholder="e.g. -3.009083")
             inp_lng = gr.Textbox(label="Longitude", placeholder="e.g. -58.68281")
             gr.Markdown('Make plot or table for Top 5 species')
         with gr.Column():
+            genus_out = gr.Dataframe(headers=["DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
+            btn_run.click(fn=predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)
     with gr.Tab('DNA Embedding Space Similarity Visualizer'):
+        gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples for clues.")
 demo.launch()

xgboost_infer.py → infer.py RENAMED Viewed

@@ -6,10 +6,16 @@ from sklearn.preprocessing import LabelEncoder
 from datasets import load_dataset
 import pickle
 def infer_dna(args):
     ecoDf = pd.read_csv(args['input_path'], sep='\t')
     dnaEmbeds = load_dataset("LofiAmazon/BOLD-Embeddings", split='train')
     modelDNA = load_checkpoint()
     modelDNAEnv = load_checkpoint()
@@ -49,6 +55,7 @@ def infer_dna(args):
     y_dna_probs = modelDNAEnv.predict_proba(X_dna)
     DNAEnvGenuses = {}
     for i in range(len()):
         topProbs = np.argsort(y_dna_probs[i], axis=1)[:,-3:]
         topClasses = modelDNA.classes_[topProbs]

 from datasets import load_dataset
 import pickle
 def infer_dna(args):
     ecoDf = pd.read_csv(args['input_path'], sep='\t')
     dnaEmbeds = load_dataset("LofiAmazon/BOLD-Embeddings", split='train')
+    # load model to calculate new embeddings
+    tokenizer = PreTrainedTokenizerFast.from_pretrained(model, force_download=True)
+    tokenizer.add_special_tokens({"pad_token": "<UNK>"})
+    bert_model = BertForMaskedLM.from_pretrained(model, force_download=True)
     modelDNA = load_checkpoint()
     modelDNAEnv = load_checkpoint()
     y_dna_probs = modelDNAEnv.predict_proba(X_dna)
     DNAEnvGenuses = {}
     for i in range(len()):
         topProbs = np.argsort(y_dna_probs[i], axis=1)[:,-3:]
         topClasses = modelDNA.classes_[topProbs]