import importlib.util
import os
import subprocess
import sys

# Third-party packages this app needs at runtime.  For every entry the pip
# distribution name is also the importable module name, so the same string is
# used both to probe for the module and to install it.
_REQUIRED_PACKAGES = ("streamlit", "pandas", "xlsxwriter", "openpyxl", "pymongo")

# Only shell out to pip for packages that are actually missing: this file is a
# Streamlit script, so its top level is executed again on each rerun, and an
# unconditional install would be repeated every time.
_missing_packages = [
    name for name in _REQUIRED_PACKAGES if importlib.util.find_spec(name) is None
]
if _missing_packages:
    # `sys.executable -m pip` targets the interpreter that is running this
    # script rather than whichever `pip` happens to be first on PATH.
    # check=False keeps the original best-effort behaviour: a failed install
    # surfaces as an ImportError on the import lines below.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=False,
    )

import pandas as pd
import random
from collections import defaultdict
from pymongo import MongoClient
import streamlit as st

# MongoDB connection string.  Set the MONGODB_URI environment variable to
# supply it without editing this file.
#
# SECURITY NOTE(review): the fallback literal below embeds a username and
# password in source control.  It is kept only so existing deployments keep
# working; rotate that credential and delete the fallback once MONGODB_URI is
# configured everywhere this app runs.
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://dhruvmangroliya:Eussmh5MbCBIkLJ6@cluster0.rrnbxfw.mongodb.net/BTP_DB?retryWrites=true&w=majority",
)
client = MongoClient(MONGODB_URI)
db = client['BTP_DB']
# Collection that receives one document per processed protein.
results_collection = db['protein_results']

# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive, non-overlapping chunks.

    Args:
        sequence: The sequence to split (sliced with ``sequence[a:b]``).
        max_length: Maximum length of each chunk; every chunk except
            possibly the last has exactly this length.

    Returns:
        A list of chunks in their original order; an empty list when
        ``sequence`` is empty.
    """
    chunks = []
    for start in range(0, len(sequence), max_length):
        stop = start + max_length
        chunks.append(sequence[start:stop])
    return chunks

# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences are counted at every start position, so they may overlap
    (``"AAAA"`` contains ``"AA"`` three times).

    Args:
        sequence: The string to scan.

    Returns:
        A dict mapping each repeated substring to its number of occurrences,
        ordered by substring length and then by first occurrence.  Empty when
        nothing repeats.
    """
    repeats = {}
    total = len(sequence)
    for length in range(2, total + 1):
        # Count one length at a time so only that length's substrings are
        # held in memory, instead of every substring of every length at once.
        counts = defaultdict(int)
        for start in range(total - length + 1):
            counts[sequence[start:start + length]] += 1

        repeated = {substring: n for substring, n in counts.items() if n > 1}
        if not repeated:
            # Two occurrences of a longer substring would share a repeated
            # prefix of this length, so once a length has no repeats no
            # longer length can have any either.
            break
        repeats.update(repeated)
    return repeats

# Function to check and update repeats at boundaries
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add occurrences that straddle a fragment boundary to known repeats.

    A scan of each fragment on its own cannot see an occurrence that starts
    in one fragment and ends in the next.  For every pair of adjacent
    fragments this function joins the last ``overlap`` characters of the left
    fragment to the first ``overlap`` characters of the right one and, for
    each substring (length >= 2) of that window that crosses the junction and
    is already a key of ``final_repeats``, adds one per crossing occurrence.

    Only crossing occurrences are added: an occurrence lying wholly inside
    one fragment is not counted here, so it cannot be counted twice.  Keys
    are never inserted, so substrings absent from ``final_repeats`` stay
    absent and a plain ``dict`` works as well as a ``defaultdict``.
    Occurrences longer than the window allows are not detected.

    Args:
        fragments: Sequence of consecutive fragments of one sequence.
        final_repeats: Mapping of substring -> count; updated in place.
        overlap: Number of characters taken from each side of a boundary.
            A value <= 0 means there is no window and nothing is changed.

    Returns:
        The same ``final_repeats`` object that was passed in.
    """
    if overlap <= 0:
        # Guard: with overlap == 0 the slice ``[-0:]`` would silently select
        # the whole fragment instead of an empty tail.
        return final_repeats

    for index in range(1, len(fragments)):
        left_overlap = fragments[index - 1][-overlap:]
        right_overlap = fragments[index][:overlap]
        region = left_overlap + right_overlap
        junction = len(left_overlap)  # index in region where the right side starts

        # Enumerate exactly the slices region[start:end] that cross the
        # junction: they start left of it, end right of it, and have
        # length >= 2.
        for start in range(junction):
            for end in range(max(start + 2, junction + 1), len(region) + 1):
                substring = region[start:end]
                if substring in final_repeats:
                    final_repeats[substring] += 1
    return final_repeats

# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary-window repeats that are not yet in ``final_repeats``.

    For each pair of adjacent fragments, up to ``overlap`` trailing
    characters of the left fragment are joined to up to ``overlap`` leading
    characters of the right fragment, and the joined window is scanned with
    ``find_hetero_amino_acid_repeats``.  A repeat from that scan is recorded
    when it shares at least one character with each side of the window and
    is not already a key of ``final_repeats``.

    NOTE(review): the "both sides" test is per-character membership, not a
    positional check that an occurrence crosses the junction - confirm that
    this is the intended criterion.

    Args:
        fragments: Sequence of consecutive fragments of one sequence.
        final_repeats: Container of already-known repeats; only read here.
        overlap: Number of characters taken from each side of a boundary.

    Returns:
        A ``defaultdict(int)`` mapping each newly found substring to the sum
        of its window counts over all boundaries.
    """
    discovered = defaultdict(int)
    for position in range(1, len(fragments)):
        current = fragments[position - 1]
        following = fragments[position]

        # A fragment shorter than the overlap contributes all of itself.
        tail = current if len(current) < overlap else current[-overlap:]
        head = following if len(following) < overlap else following[:overlap]

        window_repeats = find_hetero_amino_acid_repeats(tail + head)
        for candidate, occurrences in window_repeats.items():
            letters = set(candidate)
            if letters.isdisjoint(tail) or letters.isdisjoint(head):
                continue  # shares no character with one of the two sides
            if candidate in final_repeats:
                continue  # already known, so not a new repeat
            discovered[candidate] += occurrences
    return discovered

# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50):
    """Run the full repeat-finding pipeline on one protein sequence.

    Steps, in order:
      1. Split the sequence with ``fragment_protein_sequence`` (default
         fragment size).
      2. Sum the repeats reported by ``find_hetero_amino_acid_repeats`` for
         each fragment.
      3. Pass the totals through ``check_boundary_repeats``.
      4. Add whatever ``find_new_boundary_repeats`` reports on top.

    NOTE(review): step 4 can only contribute substrings that step 3 did not
    already insert into the totals - verify against check_boundary_repeats.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Boundary window size forwarded to both boundary helpers.

    Returns:
        A ``defaultdict(int)`` mapping substring -> accumulated count.
    """
    pieces = fragment_protein_sequence(sequence)

    # Per-fragment pass.
    totals = defaultdict(int)
    for piece in pieces:
        for repeat, occurrences in find_hetero_amino_acid_repeats(piece).items():
            totals[repeat] += occurrences

    # Boundary pass over repeats near fragment junctions.
    totals = check_boundary_repeats(pieces, totals, overlap)

    # Repeats reported only for the boundary windows are merged in last.
    boundary_only = find_new_boundary_repeats(pieces, totals, overlap)
    for repeat, occurrences in boundary_only.items():
        totals[repeat] += occurrences

    return totals

# Streamlit UI for uploading and processing the Excel file
st.title("Protein Sequence Repeat Finder from Excel")

# Step 1: Upload the Excel file
uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])

if uploaded_file is not None:
    # Step 2: Read the Excel file using Pandas
    df = pd.read_excel(uploaded_file)

    # Show the first few rows of the uploaded data for preview
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    # The processing loop reads exactly these three columns.  Check for them
    # up front so a wrong spreadsheet yields a readable message instead of a
    # KeyError part-way through processing.
    required_columns = ("Entry", "Entry Name", "Sequence")
    missing_columns = [name for name in required_columns if name not in df.columns]

    if missing_columns:
        st.error(
            "Uploaded file is missing required column(s): "
            + ", ".join(missing_columns)
        )
    # Step 3: Process each protein sequence
    elif st.button("Process Protein Sequences"):
        results = []
        skipped_entries = []

        for _, row in df.iterrows():
            protein_id = row["Entry"]
            protein_name = row["Entry Name"]
            sequence = row["Sequence"]  # The protein sequence is read from the 'Sequence' column

            # Blank cells are read as NaN (a float), which cannot be sliced
            # or measured with len(); skip such rows and report them instead
            # of crashing the whole batch.
            if not isinstance(sequence, str):
                skipped_entries.append(str(protein_id))
                continue

            # Process the protein sequence
            repeats = process_protein_sequence(sequence)

            # Prepare data for MongoDB
            result_data = {
                "protein_id": protein_id,
                "protein_name": protein_name,
                "protein_sequence": sequence,
                "calculated_repeats": repeats
            }

            # Insert results into MongoDB
            results_collection.insert_one(result_data)

            # Add results to display
            results.append({
                "Entry": protein_id,
                "Entry Name": protein_name,
                "Repeats": repeats
            })

        # Step 4: Display the results
        st.subheader("Protein Sequences Processed")
        st.write(results)

        if skipped_entries:
            st.warning(
                "Skipped entries without a text sequence: "
                + ", ".join(skipped_entries)
            )

        st.success("Protein sequences processed and results stored in MongoDB.")