|
import re |
|
from rdkit import Chem |
|
from rdkit.Chem import MolFromSmiles, SDWriter |
|
import logging |
|
from Bio import SeqIO |
|
|
|
|
|
# Configure the root logger at INFO level. This runs as a side effect of
# importing this module.
logging.basicConfig(level=logging.INFO)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
def process_smiles(smiles: str, output_path: str = "/tmp/output.sdf") -> str:
    """Parse a SMILES string and write the molecule to an SDF file.

    Args:
        smiles: SMILES string to parse with RDKit's ``MolFromSmiles``.
        output_path: Destination SDF file. Defaults to ``/tmp/output.sdf``,
            which is overwritten on every call; pass a distinct path when
            several results must coexist.

    Returns:
        The path of the SDF file that was written.

    Raises:
        ValueError: If ``MolFromSmiles`` cannot parse ``smiles``.
    """
    mol = MolFromSmiles(smiles)
    # Explicit None check, matching is_valid_smiles().
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    writer = SDWriter(output_path)
    try:
        writer.write(mol)
    finally:
        # Close the writer even when write() raises so the file handle
        # is never leaked.
        writer.close()

    return output_path
|
|
|
def process_pdb(file_path: str) -> str:
    """Read the sequences from a PDB file and join them into one string.

    The file is parsed with ``SeqIO.parse`` using the ``"pdb-seqres"``
    format; the sequence of every record the parser yields is converted
    to ``str``.

    Args:
        file_path: Path of the PDB file to read.

    Returns:
        All sequences separated by single spaces (an empty string when
        the parser yields no records).
    """
    with open(file_path, "r") as pdb_handle:
        chain_sequences = [
            str(entry.seq) for entry in SeqIO.parse(pdb_handle, "pdb-seqres")
        ]
    return " ".join(chain_sequences)
|
|
|
def process_sdf(file_path: str) -> str:
    """Return the given SDF path unchanged.

    No reading, validation, or conversion is performed.

    Args:
        file_path: Path of an SDF file.

    Returns:
        ``file_path`` exactly as it was passed in.
    """
    return file_path
|
|
|
def extract_smiles(text: str) -> str:
    """Return the first SMILES-looking substring found in ``text``.

    A candidate is one character other than ``J`` followed by at least six
    characters from a fixed set of SMILES symbols (digits, a handful of
    element letters, brackets, bond and charge symbols). Candidates shorter
    than seven characters are therefore never matched, and the result is
    only a heuristic guess: it is not checked for chemical validity here.

    Args:
        text: Free text that may contain a SMILES string.

    Returns:
        The first candidate with surrounding whitespace removed, or an
        empty string when nothing matches.
    """
    smiles_pattern = r"([^J][0-9BCOHNSOPrIFla@+\-\[\]\(\)\\\/%=#$]{6,})"
    # Only the first hit is needed, so stop at it instead of collecting
    # every match in the text.
    first_hit = re.search(smiles_pattern, text)
    if first_hit is None:
        return ""
    # The leading [^J] also matches whitespace, so a SMILES preceded by a
    # space or newline would otherwise be returned with that character
    # attached.
    return first_hit.group(1).strip()
|
|
|
def is_valid_smiles(smiles: str) -> bool:
    """Report whether RDKit's ``MolFromSmiles`` can parse ``smiles``.

    Args:
        smiles: Candidate SMILES string.

    Returns:
        ``True`` when ``MolFromSmiles`` returns a molecule object,
        ``False`` when it returns ``None``.
    """
    return MolFromSmiles(smiles) is not None
|
|
|
def extract_and_convert_to_sdf(text: str) -> str:
    """Find a SMILES string in ``text`` and convert it to an SDF file.

    The candidate comes from ``extract_smiles`` and is checked with
    ``is_valid_smiles`` before being handed to ``process_smiles``.

    Args:
        text: Free text that may contain a SMILES string.

    Returns:
        The value returned by ``process_smiles`` (the written SDF path).

    Raises:
        ValueError: If no candidate is found or the candidate is not a
            valid SMILES string.
    """
    candidate = extract_smiles(text)
    # Guard clause: reject a missing or unparsable candidate up front.
    if not candidate or not is_valid_smiles(candidate):
        raise ValueError("No valid SMILES string found in the text.")
    return process_smiles(candidate)
|
|