File size: 5,151 Bytes
444a81b
eb5a8ee
444a81b
2dc6b65
19a6e11
444a81b
19a6e11
2dc6b65
444a81b
19a6e11
 
 
 
 
 
 
 
 
 
444a81b
 
 
 
 
 
19a6e11
444a81b
19a6e11
444a81b
 
19a6e11
 
444a81b
 
 
 
2dc6b65
444a81b
 
19a6e11
444a81b
 
 
19a6e11
 
444a81b
 
 
 
 
 
 
 
19a6e11
 
444a81b
19a6e11
2dc6b65
 
19a6e11
 
 
02632f9
5a986b1
2dc6b65
19a6e11
 
2dc6b65
19a6e11
 
2dc6b65
19a6e11
 
 
444a81b
 
2dc6b65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ca1e97
 
 
2dc6b65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ca1e97
 
2dc6b65
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import importlib.util
import os
import subprocess
import sys

# Runtime dependency bootstrap.
# Streamlit re-executes this script on every user interaction, so pip is only
# invoked when one of the packages is actually missing, rather than
# reinstalling everything on each rerun.
_REQUIRED_PACKAGES = ("streamlit", "pandas", "xlsxwriter", "openpyxl", "pymongo")
_missing_packages = [
    name for name in _REQUIRED_PACKAGES if importlib.util.find_spec(name) is None
]
if _missing_packages:
    # An argument list plus the current interpreter avoids shell parsing and
    # installs into the environment that is running this script. Failures are
    # deliberately not raised (best effort, like the previous os.system call);
    # a package that is still missing surfaces as ImportError just below.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", *_missing_packages],
        check=False,
    )

import pandas as pd
import random
from collections import defaultdict
from pymongo import MongoClient
import streamlit as st

# MongoDB connection.
# Prefer supplying the full connection string through the MONGODB_URI
# environment variable so credentials stay out of the source file. The literal
# below is only the fallback used when that variable is not set (replace the
# password placeholder with your actual password if you rely on it).
# NOTE(review): the fallback literal looks mangled ("[email protected]") --
# verify it before depending on it.
MONGODB_URI = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
)
client = MongoClient(MONGODB_URI)
db = client['BTP_DB']
results_collection = db['protein_results']

# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence, max_length=1000):
    """Split ``sequence`` into consecutive, non-overlapping slices.

    Each slice holds at most ``max_length`` items; only the final slice can be
    shorter. An empty ``sequence`` yields an empty list.
    """
    chunk_starts = range(0, len(sequence), max_length)
    return [sequence[start:start + max_length] for start in chunk_starts]

# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences are allowed to overlap (``"AAA"`` contains ``"AA"`` twice).

    Args:
        sequence: The string to scan.

    Returns:
        A plain dict mapping each repeated substring to its number of
        occurrences, ordered by substring length and then by the position of
        the substring's first occurrence. Empty when nothing repeats.
    """
    repeats = {}
    seq_len = len(sequence)
    for length in range(2, seq_len + 1):
        # Count one length at a time so the table of non-repeating substrings
        # can be dropped before moving on to the next length.
        counts_for_length = defaultdict(int)
        for start in range(seq_len - length + 1):
            counts_for_length[sequence[start:start + length]] += 1

        repeated = {sub: n for sub, n in counts_for_length.items() if n > 1}
        if not repeated:
            # Any repeated substring of length L + 1 has a repeated prefix of
            # length L, so once a length yields nothing no longer one can.
            break
        repeats.update(repeated)
    return repeats

# Function to check and update repeats at boundaries
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add repeats found around each fragment boundary to ``final_repeats``.

    For every pair of neighbouring fragments, the last ``overlap`` characters
    of the left fragment and the first ``overlap`` characters of the right
    fragment are joined and scanned with ``find_hetero_amino_acid_repeats``.
    The count of each repeat that passes the filter below is added to
    ``final_repeats``, which is modified in place and also returned.

    NOTE(review): the filter only checks that at least one character of the
    substring appears somewhere in each side of the window; it does not check
    that an occurrence actually spans the boundary -- confirm this is intended.
    """
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # Slicing clamps to the fragment length, so a fragment shorter than
        # ``overlap`` is simply used whole.
        tail = left_fragment[-overlap:]
        head = right_fragment[:overlap]
        window_repeats = find_hetero_amino_acid_repeats(tail + head)
        for motif, occurrences in window_repeats.items():
            touches_tail = any(residue in tail for residue in motif)
            touches_head = any(residue in head for residue in motif)
            if touches_tail and touches_head:
                final_repeats[motif] += occurrences
    return final_repeats

# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary-window repeats that ``final_repeats`` does not contain.

    The window around each boundary is built the same way as in
    ``check_boundary_repeats``: the last ``overlap`` characters of the left
    fragment joined with the first ``overlap`` characters of the right one.
    Repeats in that window whose key is absent from ``final_repeats`` and
    which pass the character filter are summed into a fresh
    ``defaultdict(int)``; ``final_repeats`` itself is not modified.

    NOTE(review): the filter tests individual characters of the substring
    against each side of the window, not whether an occurrence spans the
    boundary -- confirm this is intended.
    """
    discovered = defaultdict(int)
    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # Slices clamp to the fragment length, so short fragments are used whole.
        tail = left_fragment[-overlap:]
        head = right_fragment[:overlap]
        for motif, occurrences in find_hetero_amino_acid_repeats(tail + head).items():
            if motif in final_repeats:
                continue
            in_tail = any(residue in tail for residue in motif)
            in_head = any(residue in head for residue in motif)
            if in_tail and in_head:
                discovered[motif] += occurrences
    return discovered

# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50):
    """Compute repeat counts for ``sequence`` fragment by fragment.

    The sequence is split with ``fragment_protein_sequence`` (default fragment
    size), repeats are counted inside every fragment, and the boundary helpers
    then contribute counts from the ``overlap``-sized windows around each
    fragment boundary.

    Returns:
        A ``defaultdict(int)`` mapping substring -> accumulated count.

    NOTE(review): ``check_boundary_repeats`` already inserts every qualifying
    boundary substring into the totals, so the later
    ``find_new_boundary_repeats`` call (same filter, then "not already
    present") appears to always come back empty -- confirm.
    """
    pieces = fragment_protein_sequence(sequence)

    # Per-fragment repeats, summed across all fragments.
    totals = defaultdict(int)
    for piece in pieces:
        for motif, hits in find_hetero_amino_acid_repeats(piece).items():
            totals[motif] += hits

    # Contributions from the windows around fragment boundaries.
    totals = check_boundary_repeats(pieces, totals, overlap)

    # Repeats that only show up at boundaries are merged in last.
    boundary_only = find_new_boundary_repeats(pieces, totals, overlap)
    for motif, hits in boundary_only.items():
        totals[motif] += hits

    return totals

# Streamlit UI for uploading and processing the Excel file
st.title("Protein Sequence Repeat Finder from Excel")

# Columns the uploaded sheet must contain; they are read by name below.
REQUIRED_COLUMNS = ("Entry", "Entry Name", "Sequence")

# Step 1: Upload the Excel file
uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])

if uploaded_file is not None:
    # Step 2: Read the Excel file using Pandas
    df = pd.read_excel(uploaded_file)

    # Show the first few rows of the uploaded data for preview
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    # Fail early with a readable message instead of a KeyError mid-processing.
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        st.error(
            "The uploaded file is missing required column(s): "
            + ", ".join(missing_columns)
        )
    # Step 3: Process each protein sequence
    elif st.button("Process Protein Sequences"):
        results = []
        skipped_entries = []

        for _, row in df.iterrows():
            protein_id = row["Entry"]
            protein_name = row["Entry Name"]
            sequence = row["Sequence"]  # The protein sequence is read from the 'Sequence' column

            # A blank cell does not arrive as a string (pandas typically
            # yields NaN); such a value cannot be fragmented, so skip the row
            # rather than crash the whole run.
            if not isinstance(sequence, str) or not sequence:
                skipped_entries.append(str(protein_id))
                continue

            # Process the protein sequence
            repeats = process_protein_sequence(sequence)

            # Prepare data for MongoDB
            result_data = {
                "protein_id": protein_id,
                "protein_name": protein_name,
                "protein_sequence": sequence,
                "calculated_repeats": repeats
            }

            # Insert results into MongoDB
            results_collection.insert_one(result_data)

            # Add results to display
            results.append({
                "Entry": protein_id,
                "Entry Name": protein_name,
                "Repeats": repeats
            })

        # Step 4: Display the results
        st.subheader("Protein Sequences Processed")
        st.write(results)

        if skipped_entries:
            st.warning(
                f"Skipped {len(skipped_entries)} row(s) with a missing or non-text sequence: "
                + ", ".join(skipped_entries)
            )

        # Only claim success when at least one row was actually stored.
        if results:
            st.success("Protein sequences processed and results stored in MongoDB.")