AmelieSchreiber
/

esm2_t6_8M_finetuned_cafa5

 ```
 Validation Precision: 0.9822020821532512
 Validation Recall: 0.9999363677941498
+```
+## Using the model
+First, download the `train_sequences.fasta` file and the `train_terms.tsv` file, and provide the local paths in the code below:
+```python
+import os
+import numpy as np
+import torch
+from transformers import AutoTokenizer, EsmForSequenceClassification, AdamW
+from torch.nn.functional import binary_cross_entropy_with_logits
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, precision_score, recall_score
+# from accelerate import Accelerator
+from Bio import SeqIO
# Step 1: Data Preprocessing (Replace with your local paths)
fasta_file = "/Users/amelieschreiber/.cursor-tutor/projects/python/cafa5/cafa-5-protein-function-prediction/Train/train_sequences.fasta"
tsv_file = "/Users/amelieschreiber/.cursor-tutor/projects/python/cafa5/cafa-5-protein-function-prediction/Train/train_terms.tsv"

# Map each FASTA record id to its sequence as a plain string.
fasta_data = {entry.id: str(entry.seq) for entry in SeqIO.parse(fasta_file, "fasta")}

# Map the first tab-separated field of every TSV line to the remaining fields.
# A key that appears on several lines keeps only the fields of its last line,
# and every line is read the same way (no header row is skipped).
tsv_data = {}
with open(tsv_file, 'r') as handle:
    for row in handle:
        key, *remaining_fields = row.strip().split("\t")
        tsv_data[key] = remaining_fields

# Optional tokenization of the training sequences (left disabled):
# tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
seq_length = 1022
# tokenized_data = tokenizer(list(fasta_data.values()), padding=True, truncation=True, return_tensors="pt", max_length=seq_length)

# Label vocabulary: every distinct value stored in tsv_data, as a list.
# NOTE(review): the list order comes from iterating a set of strings, which is
# not guaranteed to be the same from one interpreter run to the next (string
# hashing is randomized unless PYTHONHASHSEED is fixed). The prediction snippet
# indexes this list by model output position -- confirm the order matches the
# one used when the model was trained.
term_pool = set()
for fields in tsv_data.values():
    term_pool.update(fields)
unique_terms = list(term_pool)
+```
+Second, download the file `go-basic.obo` [from here](https://huggingface.co/datasets/AmelieSchreiber/cafa_5)
+and store the file locally, then provide the local path in the code below:
+```python
+import torch
+from transformers import AutoTokenizer, EsmForSequenceClassification
+from sklearn.metrics import precision_recall_fscore_support
+# 1. Parsing the go-basic.obo file
def parse_obo_file(file_path):
    """Parse the ``[Term]`` stanzas of an OBO ontology file.

    Only lines that belong to a ``[Term]`` stanza are read. Any other stanza
    header (for example ``[Typedef]``) ends the current term, so tags from
    those stanzas can no longer overwrite the fields of the preceding term.

    Args:
        file_path: Path to the ``.obo`` file (for example ``go-basic.obo``).

    Returns:
        A list with one dict per ``[Term]`` stanza, in file order. Each dict
        contains whichever of the keys ``"id"``, ``"name"``, ``"namespace"``
        and ``"definition"`` were present in that stanza; if a tag is repeated
        inside a stanza, the last occurrence wins.
    """
    import re  # local import keeps this snippet self-contained

    # First double-quoted string on a line; backslash escapes such as \" are
    # consumed as part of the string instead of terminating it.
    quoted_text = re.compile(r'"((?:[^"\\]|\\.)*)"')
    plain_tags = ("id", "name", "namespace")

    terms = []
    term = None  # dict of the [Term] stanza being read, None outside of one
    with open(file_path, 'r', encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            if line.startswith("["):
                # A stanza header: start a new term, or leave term mode for
                # any other stanza type.
                if line == "[Term]":
                    term = {}
                    terms.append(term)
                else:
                    term = None
                continue

            if term is None:
                # File header or a non-term stanza.
                continue

            # Split on the first colon only, so values that themselves
            # contain colons (e.g. "GO:0000001") stay intact.
            tag, _, value = line.partition(":")
            if tag in plain_tags:
                term[tag] = value.strip()
            elif tag == "def":
                match = quoted_text.search(value)
                if match:
                    # Undo the escaping of quotes and backslashes only.
                    term["definition"] = re.sub(r'\\(["\\])', r'\1', match.group(1))
                else:
                    # No quoted text on the line: keep the raw value rather
                    # than failing.
                    term["definition"] = value.strip()
    return terms
# Parse the ontology once; the resulting list is passed to the prediction call.
parsed_terms = parse_obo_file("go-basic.obo")  # Replace `go-basic.obo` with your path

# 2. Load the saved model and tokenizer
# Both are fetched from the same repository id.
model_path = "AmelieSchreiber/esm2_t6_8M_finetuned_cafa5"
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = EsmForSequenceClassification.from_pretrained(model_path)
+# 3. The predict_protein_function function
def predict_protein_function(sequence, model, tokenizer, go_terms, *, term_ids=None, threshold=0.05):
    """Return the names of the ontology terms predicted for a protein sequence.

    Args:
        sequence: Protein sequence passed to ``tokenizer``.
        model: Callable classifier; it is switched to eval mode and called with
            the tokenizer output as keyword arguments, and its result must
            expose ``.logits``.
        tokenizer: Callable that turns ``sequence`` into a mapping of tensors.
        go_terms: Iterable of dicts with ``"id"`` and ``"name"`` keys, such as
            the list returned by ``parse_obo_file``.
        term_ids: Sequence mapping a model output index to a term id. Defaults
            to the module-level ``unique_terms`` list built by the
            preprocessing snippet.
        threshold: A label is reported when its sigmoid probability is
            strictly greater than this value.

    Returns:
        A list of term names, ordered by output index. Predicted ids that have
        no named entry in ``go_terms`` are skipped.

    Raises:
        IndexError: If a predicted index is outside ``term_ids``.
    """
    if term_ids is None:
        # Backward-compatible default: the label vocabulary defined at module
        # level by the preprocessing snippet.
        term_ids = unique_terms

    # Index the ontology once so each prediction is a dict lookup instead of a
    # scan over every term; setdefault keeps the first entry for a repeated id.
    names_by_id = {}
    for term in go_terms:
        term_id = term.get("id")
        if term_id is not None:
            names_by_id.setdefault(term_id, term.get("name"))

    inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=1022)

    # Put the inputs on the same device as the model (when the model reports
    # one) so the call also works for a model that was moved to a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            key: value.to(device) if hasattr(value, "to") else value
            for key, value in inputs.items()
        }

    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.sigmoid(outputs.logits)

    # Element [1] of torch.where holds the indices along the second axis,
    # i.e. the label positions when logits are shaped (batch, num_labels).
    predicted_indices = torch.where(predictions > threshold)[1].tolist()

    functions = []
    for idx in predicted_indices:
        name = names_by_id.get(term_ids[idx])
        if name is not None:
            functions.append(name)
    return functions
# 4. Predicting protein function for an example sequence
# The call below reuses the model, tokenizer and parsed terms loaded earlier.
example_sequence = "MAYLGSLVQRRLELASGDRLEASLGVGSELDVRGDRVKAVGSLDLEEGRLEQAGVSMA"  # Replace with your protein sequence
predicted_functions = predict_protein_function(
    example_sequence,
    loaded_model,
    loaded_tokenizer,
    parsed_terms,
)
print(predicted_functions)
+```