"""Gradio app: per-residue secondary-structure tagging for protein sequences.

Loads a fine-tuned ESM2-150M token-classification model and reports the
(start, end) residue ranges predicted as alpha-helix or beta-strand.
"""

import gradio as gr
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

title = "Protein Token Classification 🧬."  # fixed typo: "Protien" -> "Protein"
description = "Finds the position of Helix and Beta strand in the Protein Sequence."
article = 'Created from finetuning ESM2_150M'

# Fine-tuned weights live alongside the app; the tokenizer matches the base checkpoint.
model = AutoModelForTokenClassification.from_pretrained('./Model')
tokenizer = AutoTokenizer.from_pretrained('facebook/esm2_t30_150M_UR50D')

example_list = ['MENFTALFGAQADPPPPPTALGFGPGKPPPPPPPPAGGGPGTAPPPTAATAPPGADKSGAGCGPFYLMRELPGSTELTGSTNLITHYNLEQAYNKFCGKKVKEKLSNFLPDLPGMIDLPGSHDNSSLRSLIEKPPILSSSFNPITGTMLAGFRLHTGPLPEQCRLMHIQPPKKKNKHKHKQSRTQDPVPPETPSDSDHKKKKKKKEEDPDRKRKKKEKKKKKNRHSPDHPGMGSSQASSSSSLR',
                'MAFSDLTSRTVHLYDNWIKDADPRVEDWLLMSSPLPQTILLGFYVYFVTSLGPKLMENRKPFELKKAMITYNFFIVLFSVYMCYEFVMSGWGIGYSFRCDIVDYSRSPTALRMARTCWLYYFSKFIELLDTIFFVLRKKNSQVTFLHVFHHTIMPWTWWFGVKFAAGGLGTFHALLNTAVHVVMYSYYGLSALGPAYQKYLWWKKYLTSLQLVQFVIVAIHISQFFFMEDCKYQFPVFACIIMSYSFMFLLLFLHFWYRAYTKGQRLPKTVKNGTCKNKDN',
                'MYPSNKKKKVWREEKERLLKMTLEERRKEYLRDYIPLNSILSWKEEMKGKGQNDEENTQETSQVKKSLTEKVSLYRGDITLLEVDAIVNAANASLLGGGGVDGCIHRAAGPCLLAECRNLNGCDTGHAKITCGYDLPAKYVIHTVGPIARGHINGSHKEDLANCYKSSLKLVKENNIRSVAFPCISTGIYGFPNEPAAVIALNTIKEWLAKNHHEVDRIIFCVFLEVDFKIYKKKMNEFFSVDDNNEEEEDVEMKEDSDENGPEEKQSVEEMEEQSQDADGVNTVTVPGPASEEAVEDCKDEDFAKDENITKGGEVTDHSVRDQDHPDGQENDSTKNEIKIETESQSSYMETEELSSNQEDAVIVEQPEVIPLTEDQEEKEGEKAPGEDTPRMPGKSEGSSDLENTPGPDAGAQDEAKEQRNGTK',
                'MAGQHLPVPRLEGVSREQFMQHLYPQRKPLVLEGIDLGPCTSKWTVDYLSQVGGKKEVKIHVAAVAQMDFISKNFVYRTLPFDQLVQRAAEEKHKEFFVSEDEKYYLRSLGEDPRKDVADIRKQFPLLKGDIKFPEFFKEEQFFSSVFRISSPGLQLWTHYDVMDNLLIQVTGKKRVVLFSPRDAQYLYLKGTKSEVLNIDNPDLAKYPLFSKARRYECSLEAGDVLFIPALWFHNVISEEFGVGVNIFWKHLPSECYDKTDTYGNKDPTAASRAAQILDRALKTLAELPEEYRDFYARRMVLHIQDKAYSKNSE',
                'MEAGPPGSARPAEPGPCLSGQRGADHTASASLQSVAGTEPGRHPQAVAAVLPAGGCGERMGVPTPKQFCPILERPLISYTLQALERVCWIKDIVVAVTGENMEVMKSIIQKYQHKRISLVEAGVTRHRSIFNGLKALAEDQINSKLSKPEVVIIHDAVRPFVEEGVLLKVVTAAKEHGAAGAIRPLVSTVVSPSADGCLDYSLERARHRASEMPQAFLFDVIYEAYQQCSDYDLEFGTECLQLALKYCCTKAKLVEGSPDLWKVTYKRDLYAAESIIKERISQEICVVMDTEEDNKHVGHLLEEVLKSELNHVKVTSEALGHAGRHLQQIILDQCYNFVCVNVTTSDFQETQKLLSMLEESSLCILYPVVVVSVHFLDFKLVPPSQKMENLMQIREFAKEVKERNILLYGLLISYPQDDQKLQESLRQGAIIIASLIKERNSGLIGQLLIA']


def _find_runs(positions):
    """Collapse a sorted list of residue positions into (start, end) ranges.

    Only runs of two or more consecutive positions are reported (matching
    the original behaviour); isolated single positions are ignored.

    Bug fix vs. the original: a run that extends to the end of the list is
    now flushed instead of being silently dropped.
    """
    runs = []
    run_start = None
    for prev, cur in zip(positions, positions[1:]):
        if cur == prev + 1:
            if run_start is None:
                run_start = prev
        elif run_start is not None:
            runs.append((run_start, prev))
            run_start = None
    if run_start is not None:
        # Flush a trailing run that reached the last position.
        runs.append((run_start, positions[-1]))
    return runs


def count_helix(helix):
    """Return (start, end) ranges of consecutive helix residue positions."""
    return _find_runs(helix)


def count_strand(strand):
    """Return (start, end) ranges of consecutive beta-strand residue positions."""
    return _find_runs(strand)


def print_output1(helix):
    """Format helix ranges for display, or a fallback message when none exist."""
    helix_op = count_helix(helix)
    if helix_op:
        return str(helix_op)[1:-1]  # strip the surrounding list brackets
    return 'No Helix found.'


def print_output2(strand):
    """Format beta-strand ranges for display, or a fallback message when none exist."""
    strand_op = count_strand(strand)
    if strand_op:
        return str(strand_op)[1:-1]  # strip the surrounding list brackets
    return 'No Beta strand found.'


def predict(ProtienSequence):
    """Run token classification on one sequence; return (helix text, strand text)."""
    encoded = tokenizer(ProtienSequence, return_tensors='pt')
    with torch.inference_mode():
        outputs = model(**encoded)
    # Label ids: 0 = background, 1 = helix, anything else = beta strand.
    labels = outputs.logits.argmax(axis=2)[0].numpy()
    helix = []
    strand = []
    # NOTE(review): the ESM tokenizer prepends a <cls> token, so token index i
    # already corresponds to residue i (1-based). The original +1 offset is
    # preserved here, but it may be off by one relative to residue numbering —
    # confirm against the label alignment used during fine-tuning.
    for i, label in enumerate(labels):
        if label == 1:
            helix.append(i + 1)
        elif label != 0:
            strand.append(i + 1)
    return print_output1(helix), print_output2(strand)


iface = gr.Interface(
    fn=predict,
    inputs='text',
    outputs=[gr.Text(label='Helix'), gr.Text(label='Beta Strand')],
    title=title,
    description=description,
    article=article,
    examples=example_list,
)

if __name__ == '__main__':
    iface.launch()