"""Streamlit app that finds repeated amino-acid substrings in protein sequences.

An Excel sheet with the columns ``Entry``, ``Entry Name`` and ``Sequence`` is
uploaded, every sequence is scanned for substrings (length >= 2) that occur
more than once, and the result is stored in MongoDB and shown on the page.

The analysis helpers are pure functions and can be imported without Streamlit,
pandas or pymongo being installed; the UI only runs when the file is executed
as a script (which is how ``streamlit run`` executes it).
"""
import importlib.util
import os
import random
import subprocess
import sys
from collections import Counter, defaultdict

# Third-party packages the app needs at run time (module name == pip name).
REQUIRED_PACKAGES = ("streamlit", "pandas", "xlsxwriter", "openpyxl", "pymongo")

# The connection string is read from the environment instead of being
# hard-coded. NOTE(review): the previous revision embedded the database
# password in this file; that credential should be treated as leaked and
# rotated.
MONGODB_URI_ENV = "MONGODB_URI"
MONGODB_DATABASE = "BTP_DB"
MONGODB_COLLECTION = "protein_results"

# Columns the uploaded sheet must provide.
REQUIRED_COLUMNS = ("Entry", "Entry Name", "Sequence")


def fragment_protein_sequence(sequence, max_length=1000):
    """Split *sequence* into consecutive chunks of at most *max_length* items.

    Args:
        sequence: The protein sequence (any sliceable sequence works).
        max_length: Maximum chunk size; must be at least 1.

    Returns:
        A list of chunks in order; empty when *sequence* is empty.

    Raises:
        ValueError: If *max_length* is smaller than 1 (a negative value used
            to return an empty list and silently drop the sequence).
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")
    return [
        sequence[start:start + max_length]
        for start in range(0, len(sequence), max_length)
    ]


def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Overlapping occurrences are counted separately, e.g. ``"AAAA"`` contains
    ``"AA"`` three times.

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences,
        ordered by substring length and then by first occurrence.
    """
    repeats = {}
    total = len(sequence)
    for length in range(2, total + 1):
        counts = Counter(
            sequence[start:start + length]
            for start in range(total - length + 1)
        )
        repeated = {sub: n for sub, n in counts.items() if n > 1}
        if not repeated:
            # A repeated substring of length L + 1 always has a repeated
            # prefix of length L, so no longer repeat can exist either.
            break
        repeats.update(repeated)
    return repeats


def _boundary_windows(fragments, overlap):
    """Yield (tail of left fragment, head of right fragment) per junction."""
    if overlap <= 0:
        return
    for left, right in zip(fragments, fragments[1:]):
        yield left[-overlap:], right[:overlap]


def _straddling_counts(left_tail, right_head):
    """Count substrings that start in *left_tail* and end in *right_head*."""
    window = left_tail + right_head
    junction = len(left_tail)
    counts = Counter()
    for start in range(junction):
        # end > junction guarantees at least one character from each side.
        for end in range(junction + 1, len(window) + 1):
            counts[window[start:end]] += 1
    return counts


def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add junction-straddling occurrences to already known repeats.

    For every pair of adjacent fragments, the occurrences that start within
    the last *overlap* characters of the left fragment and end within the
    first *overlap* characters of the right fragment are counted. Only
    substrings already present in *final_repeats* are updated, and only by
    their straddling occurrences, so occurrences lying wholly inside one
    fragment are not counted a second time.

    Args:
        fragments: Consecutive chunks of one sequence.
        final_repeats: Mapping of substring -> count; updated in place.
        overlap: Number of characters inspected on each side of a junction.

    Returns:
        The same *final_repeats* mapping, for convenience.
    """
    for left_tail, right_head in _boundary_windows(fragments, overlap):
        straddling = _straddling_counts(left_tail, right_head)
        for substring, count in straddling.items():
            if substring in final_repeats:
                final_repeats[substring] += count
    return final_repeats


def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Find repeats that only become visible when two fragments are joined.

    A substring qualifies when it repeats inside the junction window (tail of
    the left fragment + head of the right fragment), is not yet a key of
    *final_repeats*, and is present on both sides of the junction: it either
    straddles the junction or occurs in both the tail and the head.

    Args:
        fragments: Consecutive chunks of one sequence.
        final_repeats: Mapping of already known repeats; not modified.
        overlap: Number of characters inspected on each side of a junction.

    Returns:
        A ``defaultdict(int)`` mapping each new substring to its number of
        occurrences inside the junction window(s). When a fragment is shorter
        than ``2 * overlap`` neighbouring windows overlap, so an occurrence
        may contribute to both.
    """
    new_repeats = defaultdict(int)
    for left_tail, right_head in _boundary_windows(fragments, overlap):
        straddling = _straddling_counts(left_tail, right_head)
        window_repeats = find_hetero_amino_acid_repeats(left_tail + right_head)
        for substring, count in window_repeats.items():
            if substring in final_repeats:
                continue
            on_both_sides = substring in straddling or (
                substring in left_tail and substring in right_head
            )
            if on_both_sides:
                new_repeats[substring] += count
    return new_repeats


def process_protein_sequence(sequence, overlap=50, max_length=1000):
    """Count repeated substrings of *sequence*, working fragment by fragment.

    The sequence is cut into chunks of *max_length* characters to bound the
    cost of the exhaustive substring scan. Counts are assembled from:

    1. repeats found inside each fragment,
    2. junction-straddling occurrences of those known repeats,
    3. repeats that only emerge in the window around a junction.

    This is an approximation of scanning the whole sequence at once: an
    occurrence is not counted twice, but a substring that occurs only once
    inside a fragment and away from a junction is not seen.

    Args:
        sequence: The protein sequence.
        overlap: Characters inspected on each side of a fragment junction.
        max_length: Fragment size passed to ``fragment_protein_sequence``.

    Returns:
        A ``defaultdict(int)`` mapping substring -> occurrence count.
    """
    fragments = fragment_protein_sequence(sequence, max_length)
    final_repeats = defaultdict(int)

    for fragment in fragments:
        for substring, count in find_hetero_amino_acid_repeats(fragment).items():
            final_repeats[substring] += count

    # Known repeats first; this step never inserts keys, so the next step
    # still sees exactly the repeats found inside single fragments.
    check_boundary_repeats(fragments, final_repeats, overlap)

    new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)
    for substring, count in new_repeats.items():
        final_repeats[substring] += count

    return final_repeats


def _install_missing_dependencies():
    """Best-effort ``pip install`` of required packages that are not importable.

    Replaces the unconditional ``os.system("pip install ...")`` that ran on
    every import: nothing is executed when all packages are present, pip is
    invoked for the running interpreter, and no shell is involved. Declaring
    the packages in a requirements file remains the preferred setup.
    """
    missing = [
        name for name in REQUIRED_PACKAGES
        if importlib.util.find_spec(name) is None
    ]
    if missing:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", *missing],
            check=False,
        )


def _get_results_collection():
    """Return the MongoDB results collection, or None when unconfigured."""
    uri = os.environ.get(MONGODB_URI_ENV)
    if not uri:
        return None
    from pymongo import MongoClient

    return MongoClient(uri)[MONGODB_DATABASE][MONGODB_COLLECTION]


def main():
    """Render the Streamlit page: upload, preview, process and store."""
    _install_missing_dependencies()

    # Imported here so the optional install above runs first and so that the
    # analysis helpers stay importable without the UI stack.
    import pandas as pd
    import streamlit as st

    st.title("Protein Sequence Repeat Finder from Excel")

    # Step 1: upload the Excel file.
    uploaded_file = st.file_uploader(
        "Upload Excel file containing Protein Sequences", type=["xlsx"]
    )
    if uploaded_file is None:
        return

    # Step 2: read the sheet and show a preview.
    df = pd.read_excel(uploaded_file)
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    # Step 3: process every sequence once the user asks for it.
    if not st.button("Process Protein Sequences"):
        return

    missing_columns = [name for name in REQUIRED_COLUMNS if name not in df.columns]
    if missing_columns:
        st.error(f"Missing required column(s): {', '.join(missing_columns)}")
        return

    results_collection = _get_results_collection()
    if results_collection is None:  # Collection objects cannot be truth-tested.
        st.error(
            f"Set the {MONGODB_URI_ENV} environment variable to the MongoDB "
            "connection string before processing."
        )
        return

    results = []
    skipped = []
    for _, row in df.iterrows():
        protein_id = row["Entry"]
        protein_name = row["Entry Name"]
        sequence = row["Sequence"]  # the sequence lives in the 'Sequence' column

        if not isinstance(sequence, str):
            # Empty cells arrive as NaN floats and cannot be scanned.
            skipped.append(str(protein_id))
            continue

        # Plain dict: simpler to store and to render than a defaultdict.
        repeats = dict(process_protein_sequence(sequence))

        results_collection.insert_one({
            "protein_id": protein_id,
            "protein_name": protein_name,
            "protein_sequence": sequence,
            "calculated_repeats": repeats,
        })
        results.append({
            "Entry": protein_id,
            "Entry Name": protein_name,
            "Repeats": repeats,
        })

    if skipped:
        st.warning(f"Skipped rows without a sequence: {', '.join(skipped)}")

    # Step 4: display the results.
    st.subheader("Protein Sequences Processed")
    st.write(results)
    st.success("Protein sequences processed and results stored in MongoDB.")


if __name__ == "__main__":
    main()