from modeling import MT5ForConditionalGeneration
from transformers import AutoTokenizer


class ChemicalConverter:
    def __init__(self, mode: str):
        self.mode = mode

        # Select the pretrained checkpoint for the requested conversion direction.
        # Any mode other than "SMILES2IUPAC" falls back to the IUPAC2SMILES checkpoint.
        if mode == "SMILES2IUPAC":
            model_path = "knowledgator/SMILES2IUPAC-canonical-base"
        else:
            model_path = "knowledgator/IUPAC2SMILES-canonical-small"

        self.model = MT5ForConditionalGeneration.from_pretrained(model_path)
        self.smiles_tokenizer = AutoTokenizer.from_pretrained("knowledgator/SMILES-FAST-TOKENIZER")
        self.iupac_tokenizer = AutoTokenizer.from_pretrained("knowledgator/IUPAC-FAST-TOKENIZER")
        self.smiles_max_len = 128
        self.iupac_max_len = 156

    def convert(self, input):
        # Pick the tokenizer pair and input length limit for the current direction:
        # the source tokenizer encodes the input, the reverse tokenizer decodes the output.
        if self.mode == "SMILES2IUPAC":
            tokenizer = self.smiles_tokenizer
            reverse_tokenizer = self.iupac_tokenizer
            max_length = self.smiles_max_len
        else:
            tokenizer = self.iupac_tokenizer
            reverse_tokenizer = self.smiles_tokenizer
            max_length = self.iupac_max_len

        encoding = tokenizer(input,
                             return_tensors='pt',
                             padding="max_length",
                             truncation=True,
                             max_length=max_length)

        # Move the input tensors to the model's device (GPU if available, otherwise CPU)
        encoding = {key: value.to(self.model.device) for key, value in encoding.items()}

        # Generate the converted sequence (IUPAC name or SMILES string)
        output = self.model.generate(input_ids=encoding['input_ids'],
                                     attention_mask=encoding['attention_mask'],
                                     max_new_tokens=156,
                                     num_beams=1,
                                     num_return_sequences=1)

        # Decode the generated token ids back into text
        output = [reverse_tokenizer.decode(ids, skip_special_tokens=True) for ids in output]
        return output[0]
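

# Minimal usage sketch (an assumption, not part of the original module): it shows how the
# class above would typically be called. The sample SMILES string "CCO" (ethanol) is
# illustrative only; the exact generated name depends on the loaded model.
if __name__ == "__main__":
    converter = ChemicalConverter(mode="SMILES2IUPAC")
    iupac_name = converter.convert("CCO")
    print(iupac_name)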