vshulev committed on
Commit
2ee1fb2
·
1 Parent(s): b5db198
Files changed (4) hide show
  1. .gitignore +3 -1
  2. app.py +20 -7
  3. config.py +13 -1
  4. scaler.pkl +0 -0
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  .venv
2
- flagged
 
 
 
1
  .venv
2
+ flagged
3
+ *.tif
4
+ *.tiff
app.py CHANGED
@@ -1,12 +1,11 @@
1
- from io import BytesIO
2
  import os
 
3
  import re
4
  import PIL.Image
5
  import pandas as pd
6
  import numpy as np
7
  import gradio as gr
8
  from datasets import load_dataset
9
- import infer
10
  import matplotlib.pyplot as plt
11
  from sklearn.manifold import TSNE
12
  from sklearn.preprocessing import LabelEncoder
@@ -15,8 +14,10 @@ from torch import nn
15
  from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
16
  from huggingface_hub import PyTorchModelHubMixin
17
  from pinecone import Pinecone
 
 
18
 
19
- from config import DEFAULT_INPUTS, MODELS, DATASETS, ID_TO_GENUS_MAP
20
 
21
  # We need this for the eco layers because they are too big
22
  PIL.Image.MAX_IMAGE_PIXELS = None
@@ -52,15 +53,15 @@ classification_model = DNASeqClassifier.from_pretrained(
52
  BertConfig(vocab_size=259, output_hidden_states=True),
53
  ),
54
  )
 
 
55
 
56
  embeddings_model.eval()
57
  classification_model.eval()
58
 
59
  # Load datasets
60
- ecolayers_ds = load_dataset(DATASETS["ecolayers"])
61
  amazon_ds = load_dataset(DATASETS["amazon"])
62
 
63
-
64
  def set_default_inputs():
65
  return (DEFAULT_INPUTS["dna_sequence"],
66
  DEFAULT_INPUTS["latitude"],
@@ -99,7 +100,6 @@ def tokenize(dna_sequence: str) -> dict[str, torch.Tensor]:
99
  return tokenizer(dna_seq_preprocessed, return_tensors="pt")
100
 
101
 
102
-
103
  def get_embedding(dna_sequence: str) -> torch.Tensor:
104
  dna_embedding: torch.Tensor = embeddings_model(
105
  **tokenize(dna_sequence)
@@ -126,7 +126,20 @@ def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str)
126
 
127
  if method == "fine_tuned_model":
128
  bert_inputs = tokenize(dna_sequence)
129
- logits = classification_model(bert_inputs, torch.zeros(1, 7))
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  temperature = 0.2
131
  probs = torch.softmax(logits / temperature, dim=1).squeeze()
132
  top_k = torch.topk(probs, 10)
 
 
1
  import os
2
+ import pickle
3
  import re
4
  import PIL.Image
5
  import pandas as pd
6
  import numpy as np
7
  import gradio as gr
8
  from datasets import load_dataset
 
9
  import matplotlib.pyplot as plt
10
  from sklearn.manifold import TSNE
11
  from sklearn.preprocessing import LabelEncoder
 
14
  from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
15
  from huggingface_hub import PyTorchModelHubMixin
16
  from pinecone import Pinecone
17
+ import rasterio
18
+ from rasterio.sample import sample_gen
19
 
20
+ from config import DEFAULT_INPUTS, MODELS, DATASETS, ID_TO_GENUS_MAP, LAYER_NAMES
21
 
22
  # We need this for the eco layers because they are too big
23
  PIL.Image.MAX_IMAGE_PIXELS = None
 
53
  BertConfig(vocab_size=259, output_hidden_states=True),
54
  ),
55
  )
56
+ with open("scaler.pkl", "rb") as f:
57
+ scaler = pickle.load(f)
58
 
59
  embeddings_model.eval()
60
  classification_model.eval()
61
 
62
  # Load datasets
 
63
  amazon_ds = load_dataset(DATASETS["amazon"])
64
 
 
65
  def set_default_inputs():
66
  return (DEFAULT_INPUTS["dna_sequence"],
67
  DEFAULT_INPUTS["latitude"],
 
100
  return tokenizer(dna_seq_preprocessed, return_tensors="pt")
101
 
102
 
 
103
  def get_embedding(dna_sequence: str) -> torch.Tensor:
104
  dna_embedding: torch.Tensor = embeddings_model(
105
  **tokenize(dna_sequence)
 
126
 
127
  if method == "fine_tuned_model":
128
  bert_inputs = tokenize(dna_sequence)
129
+
130
+ env_data = []
131
+ for layer in LAYER_NAMES:
132
+ with rasterio.open(layer) as dataset:
133
+ # Get the corresponding ecological values for the samples
134
+ results = sample_gen(dataset, [coords])
135
+ results = [r for r in results]
136
+ layer_data = np.mean(results[0])
137
+ env_data.append(layer_data)
138
+
139
+ env_data = scaler.transform([env_data])
140
+ env_data = torch.from_numpy(env_data).to(torch.float32)
141
+
142
+ logits = classification_model(bert_inputs, env_data)
143
  temperature = 0.2
144
  probs = torch.softmax(logits / temperature, dim=1).squeeze()
145
  top_k = torch.topk(probs, 10)
config.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import json
2
 
3
 
@@ -24,6 +25,17 @@ MODELS = {
24
  }
25
 
26
  DATASETS = {
27
- "ecolayers": "LofiAmazon/Global-Ecolayers",
28
  "amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
29
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import json
3
 
4
 
 
25
  }
26
 
27
  DATASETS = {
 
28
  "amazon": "LofiAmazon/BOLD-Embeddings-Ecolayers-Amazon",
29
  }
30
+
31
+ HUGGINGFACE_DW_URL = "https://huggingface.co/datasets/LofiAmazon/Global-Ecolayers/resolve/main/{filename}?download=true"
32
+
33
+ LAYER_NAMES = [
34
+ "median_elevation_1km.tiff",
35
+ "human_footprint.tiff",
36
+ "population_density_1km.tif",
37
+ "annual_precipitation.tif",
38
+ "precipitation_seasonality.tif",
39
+ "annual_mean_air_temp.tif",
40
+ "temp_seasonality.tif",
41
+ ]
scaler.pkl ADDED
Binary file (863 Bytes). View file