|
import gradio as gr |
|
import torch |
|
from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp |
|
from mammal.keys import ( |
|
CLS_PRED, |
|
ENCODER_INPUTS_ATTENTION_MASK, |
|
ENCODER_INPUTS_STR, |
|
ENCODER_INPUTS_TOKENS, |
|
SCORES, |
|
) |
|
from mammal.model import Mammal |
|
|
|
from mammal_demo.demo_framework import MammalObjectBroker, MammalTask |
|
|
|
|
|
class PpiTask(MammalTask):
    """Gradio demo task: ask a Mammal model whether two proteins interact.

    The task formats two amino-acid sequences into a single prompt, tokenizes
    it, runs generation, and reports the decoded output together with the
    score of the "positive" token.
    """

    def __init__(self, model_dict):
        """Set up the task name, description, example sequences and intro text.

        Args:
            model_dict: mapping from model name to the object holding that
                model and its tokenizer; forwarded to the base class and
                indexed by model name in ``create_and_run_prompt``.
        """
        super().__init__(name="Protein-Protein Interaction", model_dict=model_dict)
        self.description = "Protein-Protein Interaction (PPI)"
        # Default sequences pre-filled into the two input text boxes.
        self.examples = {
            "protein_calmodulin": "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMISELDQDGFIDKEDLHDGDGKISFEEFLNLVNKEMTADVDGDGQVNYEEFVTMMTSK",
            "protein_calcineurin": "MSSKLLLAGLDIERVLAEKNFYKEWDTWIIEAMNVGDEEVDRIKEFKEDEIFEEAKTLGTAEMQEYKKQKLEEAIEGAFDIFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIRQMWDQNGDWDRIKELKFGEIKKLSAKDTRGTIFIKVFENLGTGVDSEYEDVSKYMLKHQ",
        }
        # Markdown shown at the top of the demo panel.
        self.markup_text = f"""

# Mammal based {self.description} demonstration



Given two protein sequences, estimate if the proteins interact or not."""

    def generate_prompt(self, protein_seq_1, protein_seq_2):
        """Format two protein sequences into the model's prompt syntax.

        Args:
            protein_seq_1 (str): sequence of protein number 1
            protein_seq_2 (str): sequence of protein number 2

        Returns:
            str: prompt
        """
        task_header = "<@TOKENIZER-TYPE=AA><BINDING_AFFINITY_CLASS><SENTINEL_ID_0>"
        entity_tag = "<MOLECULAR_ENTITY><MOLECULAR_ENTITY_GENERAL_PROTEIN>"
        # Both proteins use the same entity wrapper, emitted in argument order.
        entities = "".join(
            f"{entity_tag}<SEQUENCE_NATURAL_START>{sequence}<SEQUENCE_NATURAL_END>"
            for sequence in (protein_seq_1, protein_seq_2)
        )
        return f"{task_header}{entities}<EOS>"

    def crate_sample_dict(self, sample_inputs: dict, model_holder: MammalObjectBroker):
        """Build the tokenized sample dictionary for one pair of proteins.

        Args:
            sample_inputs: keyword arguments for ``generate_prompt``
                (``protein_seq_1`` and ``protein_seq_2``).
            model_holder: object exposing the ``tokenizer_op`` used to
                tokenize the prompt.

        Returns:
            The dictionary returned by the tokenizer op, with the token ids
            and attention mask entries converted to ``torch`` tensors.
        """
        encoded = model_holder.tokenizer_op(
            sample_dict={ENCODER_INPUTS_STR: self.generate_prompt(**sample_inputs)},
            key_in=ENCODER_INPUTS_STR,
            key_out_tokens_ids=ENCODER_INPUTS_TOKENS,
            key_out_attention_mask=ENCODER_INPUTS_ATTENTION_MASK,
        )
        # The tokenizer outputs are wrapped as tensors before being returned.
        for tensor_key in (ENCODER_INPUTS_TOKENS, ENCODER_INPUTS_ATTENTION_MASK):
            encoded[tensor_key] = torch.tensor(encoded[tensor_key])
        return encoded

    def run_model(self, sample_dict, model: Mammal):
        """Run generation on a single sample and return the model's batch output.

        Args:
            sample_dict: tokenized sample as produced by ``crate_sample_dict``.
            model: the Mammal model to generate with.

        Returns:
            Whatever ``model.generate`` returns for a one-element batch, with
            scores requested and at most 5 new tokens.
        """
        return model.generate(
            [sample_dict],
            output_scores=True,
            return_dict_in_generate=True,
            max_new_tokens=5,
        )

    def decode_output(self, batch_dict, tokenizer_op: ModularTokenizerOp) -> list:
        """Turn the generation output into a decoded string and a score.

        Args:
            batch_dict: output of ``run_model``.
            tokenizer_op: tokenizer op used to decode the predicted ids and to
                resolve the positive token id.

        Returns:
            list: ``[decoded_text, score]``.
        """
        decoded_text = tokenizer_op._tokenizer.decode(batch_dict[CLS_PRED][0])
        # NOTE(review): presumably [0] selects the only sample in the batch and
        # [1] the generation step holding the class token -- confirm.
        class_step_scores = batch_dict[SCORES][0][1]
        # `positive_token_id` is not defined in this class; it is expected to
        # come from the base class.
        positive_score = class_step_scores[self.positive_token_id(tokenizer_op)].item()
        return [decoded_text, positive_score]

    def create_and_run_prompt(self, model_name, protein_seq_1, protein_seq_2):
        """Run the full pipeline for one pair of proteins (UI click callback).

        Args:
            model_name: key into ``self.model_dict`` selecting the model.
            protein_seq_1: sequence of the first protein.
            protein_seq_2: sequence of the second protein.

        Returns:
            tuple: the prompt followed by the items of ``decode_output``
            (decoded text, score).
        """
        broker = self.model_dict[model_name]
        sample_dict = self.crate_sample_dict(
            sample_inputs={
                "protein_seq_1": protein_seq_1,
                "protein_seq_2": protein_seq_2,
            },
            model_holder=broker,
        )
        prompt = sample_dict[ENCODER_INPUTS_STR]
        batch_dict = self.run_model(sample_dict=sample_dict, model=broker.model)
        decoded_fields = self.decode_output(
            batch_dict, tokenizer_op=broker.tokenizer_op
        )
        return (prompt, *decoded_fields)

    def create_demo(self, model_name_widget: gr.component):
        """Build the Gradio widget group for this task.

        Args:
            model_name_widget: widget whose value is passed as ``model_name``
                to ``create_and_run_prompt``.

        Returns:
            The ``gr.Group`` containing the task UI, with ``visible`` set to
            ``False``.
        """
        with gr.Group() as demo:
            gr.Markdown(self.markup_text)

            # Inputs: one text box per protein, pre-filled with the examples.
            with gr.Row():
                first_protein_box = gr.Textbox(
                    label="Protein 1 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calmodulin"],
                )
                second_protein_box = gr.Textbox(
                    label="Protein 2 sequence",
                    interactive=True,
                    lines=3,
                    value=self.examples["protein_calcineurin"],
                )

            with gr.Row():
                run_button = gr.Button(
                    "Run Mammal prompt for Protein-Protein Interaction",
                    variant="primary",
                )

            # Outputs: the prompt sent to the model, then its decoded answer
            # and score.
            with gr.Row():
                prompt_box = gr.Textbox(label="Mammal prompt", lines=5)
            with gr.Row():
                decoded_box = gr.Textbox(label="Mammal output")
                score_box = gr.Number(label="PPI score")

            run_button.click(
                fn=self.create_and_run_prompt,
                inputs=[model_name_widget, first_protein_box, second_protein_box],
                outputs=[prompt_box, decoded_box, score_box],
            )

            with gr.Row():
                gr.Markdown(
                    "```<SENTINEL_ID_0>``` contains the binding affinity class, which is ```<1>``` for interacting and ```<0>``` for non-interacting"
                )

            demo.visible = False
            return demo
|
|