Spaces:
Running
Running
File size: 1,935 Bytes
7476d14 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
from modeling import MT5ForConditionalGeneration
from transformers import AutoTokenizer
import os
class ChemicalConverter:
    """Convert chemical notation between SMILES and IUPAC with a seq2seq model.

    The direction is selected by ``mode``. When ``mode`` is
    ``"SMILES2IUPAC"`` the input is tokenized with the SMILES tokenizer and
    the generated ids are decoded with the IUPAC tokenizer; for any other
    mode the two tokenizers swap roles.
    """

    def __init__(self, mode: str):
        """Load the model stored under ``models/<mode>`` and both tokenizers.

        Args:
            mode: Name of the model sub-directory inside ``models``. It also
                selects the conversion direction used by ``convert``.

        Raises:
            ValueError: If the resolved model path does not exist.
        """
        self.mode = mode

        # "models" is a relative path, so abspath resolves it against the
        # current working directory of the process.
        model_directory = os.path.abspath("models")
        model_path = os.path.join(model_directory, mode)
        if not os.path.exists(model_path):
            raise ValueError(f"Model path does not exist: {model_path}")

        self.model = MT5ForConditionalGeneration.from_pretrained(model_path)
        self.smiles_tokenizer = AutoTokenizer.from_pretrained("BioMike/smiles")
        self.iupac_tokenizer = AutoTokenizer.from_pretrained("BioMike/iupac")

        # Maximum tokenized input lengths used for padding/truncation.
        self.smiles_max_len = 128
        self.iupac_max_len = 156

    def convert(self, input: str) -> str:
        """Translate ``input`` into the opposite notation.

        Args:
            input: Text to convert. It is tokenized as SMILES when the mode is
                ``"SMILES2IUPAC"`` and as IUPAC otherwise. (The name shadows
                the builtin but is kept so keyword callers keep working.)

        Returns:
            The first decoded sequence produced by the model, with special
            tokens removed.
        """
        if self.mode == "SMILES2IUPAC":
            tokenizer = self.smiles_tokenizer
            reverse_tokenizer = self.iupac_tokenizer
            max_length = self.smiles_max_len
        else:
            tokenizer = self.iupac_tokenizer
            reverse_tokenizer = self.smiles_tokenizer
            max_length = self.iupac_max_len

        encoding = tokenizer(
            input,
            return_tensors='pt',
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

        # Move the input tensors to whichever device the model lives on
        # (not necessarily a GPU).
        encoding = {
            key: value.to(self.model.device) for key, value in encoding.items()
        }

        # Generate names.
        # NOTE(review): the generation budget is fixed at 156 new tokens for
        # both directions (it equals iupac_max_len) -- confirm whether the
        # IUPAC -> SMILES direction is meant to use smiles_max_len instead.
        output = self.model.generate(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            max_new_tokens=156,
            num_beams=1,
            num_return_sequences=1,
        )

        # Decode names; only the first sequence is returned to the caller.
        output = [
            reverse_tokenizer.decode(ids, skip_special_tokens=True)
            for ids in output
        ]
        return output[0]