Spaces:
Running
Running
from modeling import MT5ForConditionalGeneration | |
from transformers import AutoTokenizer | |
import os | |
class ChemicalConverter:
    """Convert between SMILES strings and IUPAC names with a seq2seq model.

    The direction is chosen once, at construction time, through ``mode``:
    ``"SMILES2IUPAC"`` tokenizes the input with the SMILES tokenizer and
    decodes the output with the IUPAC tokenizer; any other ``mode`` value
    does the reverse.
    """

    # Upper bound on generated tokens; the same value is used for both
    # conversion directions.
    _MAX_NEW_TOKENS = 156

    def __init__(self, mode: str):
        """Load the model matching ``mode`` together with both tokenizers.

        Args:
            mode: ``"SMILES2IUPAC"`` loads the SMILES-to-IUPAC checkpoint.
                Every other value falls back to the IUPAC-to-SMILES
                checkpoint (no validation is performed).
        """
        self.mode = mode
        if mode == "SMILES2IUPAC":
            model_path = "knowledgator/SMILES2IUPAC-canonical-base"
        else:
            model_path = "knowledgator/IUPAC2SMILES-canonical-small"
        self.model = MT5ForConditionalGeneration.from_pretrained(model_path)

        # Both tokenizers are always needed: one encodes the input text and
        # the other decodes the generated ids (see convert()).
        self.smiles_tokenizer = AutoTokenizer.from_pretrained(
            "knowledgator/SMILES-FAST-TOKENIZER"
        )
        self.iupac_tokenizer = AutoTokenizer.from_pretrained(
            "knowledgator/IUPAC-FAST-TOKENIZER"
        )

        # Padding / truncation length applied to the *input* of each kind.
        self.smiles_max_len = 128
        self.iupac_max_len = 156

    def convert(self, input):
        """Convert ``input`` in the direction selected by ``self.mode``.

        Args:
            input: Text to convert. The parameter name shadows the builtin
                but is kept so existing keyword callers keep working.

        Returns:
            The first generated sequence, decoded with special tokens
            skipped.
        """
        if self.mode == "SMILES2IUPAC":
            tokenizer = self.smiles_tokenizer
            reverse_tokenizer = self.iupac_tokenizer
            max_length = self.smiles_max_len
        else:
            tokenizer = self.iupac_tokenizer
            reverse_tokenizer = self.smiles_tokenizer
            max_length = self.iupac_max_len

        encoding = tokenizer(
            input,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )

        # Move the inputs to whichever device the model lives on (this is
        # not necessarily a GPU).
        device = self.model.device
        encoding = {key: value.to(device) for key, value in encoding.items()}

        # Single beam, single returned sequence per input.
        output = self.model.generate(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
            max_new_tokens=self._MAX_NEW_TOKENS,
            num_beams=1,
            num_return_sequences=1,
        )

        # Only the first generated row is returned, so only that row is
        # decoded.
        return reverse_tokenizer.decode(output[0], skip_special_tokens=True)