"""Streamlit front end for sampling protein sequences from NeoProtein-GPT.

The user supplies a binding-site motif and sampling parameters in the
sidebar; the app prompts the model with that motif and displays the sampled
continuations.
"""

import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

st.set_page_config(page_title="NeoProtein Designer", page_icon="🧬")

st.title("🧬 NeoProtein-GPT Protein Designer")
st.markdown(
    """
### Design novel protein sequences with unique binding sites
*Using the [NeoProtein-GPT](https://huggingface.co/ayyuce/NeoProtein-GPT) model*
"""
)

with st.sidebar:
    st.header("Parameters")
    binding_motif = st.text_input(
        "Binding site motif (e.g., AXXC):",
        help="Use X for wildcard positions",
    )
    seq_length = st.slider("Sequence length", 50, 500, 150)
    temperature = st.slider("Temperature (creativity)", 0.1, 2.0, 1.0)
    num_sequences = st.slider("Number of sequences", 1, 5, 3)


@st.cache_resource(show_spinner=False)
def load_model():
    """Load the NeoProtein-GPT model and tokenizer.

    The result is cached by ``st.cache_resource`` so Streamlit reruns reuse
    the same objects instead of reloading them.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # No force_download: the local Hugging Face cache is reused, so a cold
    # start does not fetch the weights again. trust_remote_code is omitted
    # because the concrete GPT-2 classes are used, so no code from the model
    # repository has to be executed.
    model = GPT2LMHeadModel.from_pretrained("ayyuce/NeoProtein-GPT")
    tokenizer = GPT2Tokenizer.from_pretrained("ayyuce/NeoProtein-GPT")
    return model, tokenizer


model, tokenizer = load_model()


def generate_sequences():
    """Sample sequences from the model for the motif entered in the sidebar.

    Reads the module-level ``binding_motif``, ``seq_length``, ``temperature``
    and ``num_sequences`` widget values. Problems are reported to the user
    through ``st.error`` rather than raised.

    Returns:
        A list of decoded continuations with the prompt tokens removed. The
        list is empty when no motif was entered or generation failed.
    """
    # Strip so that a whitespace-only entry is treated as "no motif".
    motif = binding_motif.strip()
    if not motif:
        st.error("Please enter a binding motif")
        return []

    prompt = f"BindingMotif:{motif}Seq:"

    # Broad catch is deliberate: this is the UI boundary, and any failure is
    # shown to the user instead of crashing the app.
    try:
        inputs = tokenizer(prompt, return_tensors="pt")
        input_length = inputs.input_ids.shape[1]

        # Pass the whole tokenizer output so generate() also receives the
        # attention mask; with pad_token_id set to EOS it cannot infer one.
        outputs = model.generate(
            **inputs,
            max_length=input_length + seq_length,
            temperature=temperature,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=num_sequences,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Each output starts with the prompt tokens; keep only what follows.
        return [
            tokenizer.decode(output[input_length:], skip_special_tokens=True)
            for output in outputs
        ]
    except Exception as e:
        st.error(f"Generation failed: {e}")
        return []


if st.button("Generate Protein Sequences"):
    with st.spinner("Designing novel proteins..."):
        sequences = generate_sequences()

    if sequences:
        st.subheader("Generated Sequences")
        for number, seq in enumerate(sequences, start=1):
            # Explicit newlines keep the fence markers on their own lines
            # regardless of how this source file is indented.
            st.markdown(f"**Sequence #{number}**\n```fasta\n{seq}\n```")

st.markdown(
    """
### How to use:
1. Enter your target binding motif using single-letter amino acid codes
2. Adjust parameters in the sidebar
3. Click the generate button
4. Results will appear in FASTA format

**Example motifs:**
- `GHXXXH` for histidine-rich motifs
- `CXXC` for disulfide bond motifs
- `DE` for acidic patches
"""
)