Spaces:
Runtime error
Runtime error
Fixes
Browse files- .gitignore +3 -1
- app.py +20 -7
- config.py +13 -1
- scaler.pkl +0 -0
.gitignore
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
.venv
|
2 |
-
flagged
|
|
|
|
|
|
1 |
.venv
|
2 |
+
flagged
|
3 |
+
*.tif
|
4 |
+
*.tiff
|
app.py
CHANGED
@@ -1,12 +1,11 @@
|
|
1 |
-
from io import BytesIO
|
2 |
import os
|
|
|
3 |
import re
|
4 |
import PIL.Image
|
5 |
import pandas as pd
|
6 |
import numpy as np
|
7 |
import gradio as gr
|
8 |
from datasets import load_dataset
|
9 |
-
import infer
|
10 |
import matplotlib.pyplot as plt
|
11 |
from sklearn.manifold import TSNE
|
12 |
from sklearn.preprocessing import LabelEncoder
|
@@ -15,8 +14,10 @@ from torch import nn
|
|
15 |
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
|
16 |
from huggingface_hub import PyTorchModelHubMixin
|
17 |
from pinecone import Pinecone
|
|
|
|
|
18 |
|
19 |
-
from config import DEFAULT_INPUTS, MODELS, DATASETS, ID_TO_GENUS_MAP
|
20 |
|
21 |
# We need this for the eco layers because they are too big
|
22 |
PIL.Image.MAX_IMAGE_PIXELS = None
|
@@ -52,15 +53,15 @@ classification_model = DNASeqClassifier.from_pretrained(
|
|
52 |
BertConfig(vocab_size=259, output_hidden_states=True),
|
53 |
),
|
54 |
)
|
|
|
|
|
55 |
|
56 |
embeddings_model.eval()
|
57 |
classification_model.eval()
|
58 |
|
59 |
# Load datasets
|
60 |
-
ecolayers_ds = load_dataset(DATASETS["ecolayers"])
|
61 |
amazon_ds = load_dataset(DATASETS["amazon"])
|
62 |
|
63 |
-
|
64 |
def set_default_inputs():
|
65 |
return (DEFAULT_INPUTS["dna_sequence"],
|
66 |
DEFAULT_INPUTS["latitude"],
|
@@ -99,7 +100,6 @@ def tokenize(dna_sequence: str) -> dict[str, torch.Tensor]:
|
|
99 |
return tokenizer(dna_seq_preprocessed, return_tensors="pt")
|
100 |
|
101 |
|
102 |
-
|
103 |
def get_embedding(dna_sequence: str) -> torch.Tensor:
|
104 |
dna_embedding: torch.Tensor = embeddings_model(
|
105 |
**tokenize(dna_sequence)
|
@@ -126,7 +126,20 @@ def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str)
|
|
126 |
|
127 |
if method == "fine_tuned_model":
|
128 |
bert_inputs = tokenize(dna_sequence)
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
temperature = 0.2
|
131 |
probs = torch.softmax(logits / temperature, dim=1).squeeze()
|
132 |
top_k = torch.topk(probs, 10)
|
|
|
|
|
1 |
import os
|
2 |
+
import pickle
|
3 |
import re
|
4 |
import PIL.Image
|
5 |
import pandas as pd
|
6 |
import numpy as np
|
7 |
import gradio as gr
|
8 |
from datasets import load_dataset
|
|
|
9 |
import matplotlib.pyplot as plt
|
10 |
from sklearn.manifold import TSNE
|
11 |
from sklearn.preprocessing import LabelEncoder
|
|
|
14 |
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
|
15 |
from huggingface_hub import PyTorchModelHubMixin
|
16 |
from pinecone import Pinecone
|
17 |
+
import rasterio
|
18 |
+
from rasterio.sample import sample_gen
|
19 |
|
20 |
+
from config import DEFAULT_INPUTS, MODELS, DATASETS, ID_TO_GENUS_MAP, LAYER_NAMES
|
21 |
|
22 |
# We need this for the eco layers because they are too big
|
23 |
PIL.Image.MAX_IMAGE_PIXELS = None
|
|
|
53 |
BertConfig(vocab_size=259, output_hidden_states=True),
|
54 |
),
|
55 |
)
|
56 |
+
with open("scaler.pkl", "rb") as f:
|
57 |
+
scaler = pickle.load(f)
|
58 |
|
59 |
embeddings_model.eval()
|
60 |
classification_model.eval()
|
61 |
|
62 |
# Load datasets
|
|
|
63 |
amazon_ds = load_dataset(DATASETS["amazon"])
|
64 |
|
|
|
65 |
def set_default_inputs():
|
66 |
return (DEFAULT_INPUTS["dna_sequence"],
|
67 |
DEFAULT_INPUTS["latitude"],
|
|
|
100 |
return tokenizer(dna_seq_preprocessed, return_tensors="pt")
|
101 |
|
102 |
|
|
|
103 |
def get_embedding(dna_sequence: str) -> torch.Tensor:
|
104 |
dna_embedding: torch.Tensor = embeddings_model(
|
105 |
**tokenize(dna_sequence)
|
|
|
126 |
|
127 |
if method == "fine_tuned_model":
|
128 |
bert_inputs = tokenize(dna_sequence)
|
129 |
+
|
130 |
+
env_data = []
|
131 |
+
for layer in LAYER_NAMES:
|
132 |
+
with rasterio.open(layer) as dataset:
|
133 |
+
# Get the corresponding ecological values for the samples
|
134 |
+
results = sample_gen(dataset, [coords])
|
135 |
+
results = [r for r in results]
|
136 |
+
layer_data = np.mean(results[0])
|
137 |
+
env_data.append(layer_data)
|
138 |
+
|
139 |
+
env_data = scaler.transform([env_data])
|
140 |
+
env_data = torch.from_numpy(env_data).to(torch.float32)
|
141 |
+
|
142 |
+
logits = classification_model(bert_inputs, env_data)
|
143 |
temperature = 0.2
|
144 |
probs = torch.softmax(logits / temperature, dim=1).squeeze()
|
145 |
top_k = torch.topk(probs, 10)
|
config.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import json
|
2 |
|
3 |
|
@@ -24,6 +25,17 @@ MODELS = {
|
|
24 |
}
|
25 |
|
26 |
DATASETS = {
|
27 |
-
"ecolayers": "LofiAmazon/Global-Ecolayers",
|
28 |
"amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
|
29 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
import json
|
3 |
|
4 |
|
|
|
25 |
}
|
26 |
|
27 |
DATASETS = {
|
|
|
28 |
"amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
|
29 |
}
|
30 |
+
|
31 |
+
HUGGINGFACE_DW_URL = "https://huggingface.co/datasets/LofiAmazon/Global-Ecolayers/resolve/main/{filename}?download=true"
|
32 |
+
|
33 |
+
LAYER_NAMES = [
|
34 |
+
"median_elevation_1km.tiff",
|
35 |
+
"human_footprint.tiff",
|
36 |
+
"population_density_1km.tif",
|
37 |
+
"annual_precipitation.tif",
|
38 |
+
"precipitation_seasonality.tif",
|
39 |
+
"annual_mean_air_temp.tif",
|
40 |
+
"temp_seasonality.tif",
|
41 |
+
]
|
scaler.pkl
ADDED
Binary file (863 Bytes). View file
|
|