Spaces:
Sleeping
Sleeping
File size: 5,151 Bytes
444a81b eb5a8ee 444a81b 2dc6b65 19a6e11 444a81b 19a6e11 2dc6b65 444a81b 19a6e11 444a81b 19a6e11 444a81b 19a6e11 444a81b 19a6e11 444a81b 2dc6b65 444a81b 19a6e11 444a81b 19a6e11 444a81b 19a6e11 444a81b 19a6e11 2dc6b65 19a6e11 02632f9 5a986b1 2dc6b65 19a6e11 2dc6b65 19a6e11 2dc6b65 19a6e11 444a81b 2dc6b65 0ca1e97 2dc6b65 0ca1e97 2dc6b65 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 |
import os
# Shells out to pip to install the app's third-party dependencies every time
# this script is executed, before the third-party imports below run.
# NOTE(review): presumably a workaround for a missing requirements.txt —
# confirm; declaring dependencies there would avoid an install on each run.
# The exit status of the command is ignored, so a failed install only shows
# up later as an ImportError.
os.system("pip install streamlit pandas xlsxwriter openpyxl pymongo")
import pandas as pd
import random
from collections import defaultdict
from pymongo import MongoClient
import streamlit as st
# MongoDB connection.
# The connection string is read from the MONGODB_URI environment variable
# when it is set, so credentials do not have to live in source control.
# The previously hard-coded string is kept only as a backward-compatible
# fallback.
# NOTE(review): the fallback embeds a username/password — rotate that
# credential and drop the fallback once MONGODB_URI is configured.
client = MongoClient(
    os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
    )
)
db = client['BTP_DB']
# Collection that receives one document per processed protein.
results_collection = db['protein_results']
# Function to fragment the protein sequence into chunks of max length 1000
def fragment_protein_sequence(sequence: str, max_length: int = 1000) -> list:
    """Split *sequence* into consecutive, non-overlapping chunks.

    Args:
        sequence: The sequence to split.
        max_length: Maximum length of each chunk; must be at least 1.

    Returns:
        A list of chunks in order. Every chunk has ``max_length`` characters
        except possibly the last one. An empty sequence yields ``[]``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop the whole
    # sequence; zero would raise an opaque range() error. Reject both up front.
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length}")
    return [
        sequence[start:start + max_length]
        for start in range(0, len(sequence), max_length)
    ]
# Function to find repeating amino acid sequences
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 that occurs more than once.

    Occurrences may overlap (``"AAAA"`` contains ``"AA"`` three times). No
    filtering on composition is applied, so single-residue runs such as
    ``"AA"`` are reported as well.

    Args:
        sequence: The sequence to scan.

    Returns:
        A plain ``dict`` mapping each repeated substring to its number of
        occurrences, ordered by substring length and then by first
        occurrence. Sequences shorter than 2 characters yield ``{}``.
    """
    repeats = {}
    for length in range(2, len(sequence) + 1):
        counts = defaultdict(int)
        for start in range(len(sequence) - length + 1):
            counts[sequence[start:start + length]] += 1

        repeated = {sub: n for sub, n in counts.items() if n > 1}
        # Any repeated substring of length L+1 has a repeated prefix of
        # length L, so once a length has no repeats no longer one can either.
        # Stopping here avoids materialising every substring of every length.
        if not repeated:
            break
        repeats.update(repeated)
    return repeats
# Function to check and update repeats at boundaries
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add repeats found around each fragment boundary to *final_repeats*.

    For every pair of adjacent fragments, the last ``overlap`` characters of
    the left fragment and the first ``overlap`` characters of the right
    fragment are joined and scanned with ``find_hetero_amino_acid_repeats``.
    Counts of qualifying substrings are added to ``final_repeats``.

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping of substring -> count; updated in place. Any
            mutable mapping works (plain ``dict`` or ``defaultdict``).
        overlap: Number of characters taken from each side of a boundary.
            Fragments shorter than ``overlap`` contribute in full.

    Returns:
        The same ``final_repeats`` object that was passed in.

    Raises:
        ValueError: If ``overlap`` is negative.
    """
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap == 0:
        # An empty window cannot contain repeats. Returning early also avoids
        # the ``fragment[-0:]`` pitfall, which selects the WHOLE fragment and
        # would trigger a full (and ultimately discarded) rescan.
        return final_repeats

    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # Slicing already clamps to the fragment length, so short fragments
        # need no special case.
        left_overlap = left_fragment[-overlap:]
        right_overlap = right_fragment[:overlap]
        left_residues = set(left_overlap)
        right_residues = set(right_overlap)

        boundary_repeats = find_hetero_amino_acid_repeats(left_overlap + right_overlap)
        for substring, count in boundary_repeats.items():
            # NOTE(review): this only checks that the substring shares at
            # least one character with each window, not that an occurrence
            # actually straddles the cut, so repeats lying entirely on one
            # side usually qualify too. Kept as-is to preserve existing
            # results — confirm the intended rule.
            if not left_residues.isdisjoint(substring) and not right_residues.isdisjoint(substring):
                # .get() keeps this working for plain dicts as well.
                final_repeats[substring] = final_repeats.get(substring, 0) + count
    return final_repeats
# Function to find new repeats that only appear at fragmentation points
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary repeats that are not yet keys of *final_repeats*.

    For every pair of adjacent fragments, the last ``overlap`` characters of
    the left fragment and the first ``overlap`` characters of the right
    fragment are joined and scanned with ``find_hetero_amino_acid_repeats``.
    Qualifying substrings that are absent from ``final_repeats`` are
    accumulated into a fresh mapping; ``final_repeats`` itself is only read.

    Args:
        fragments: Ordered list of sequence fragments.
        final_repeats: Mapping (or other container) of already-known repeats.
        overlap: Number of characters taken from each side of a boundary.
            Fragments shorter than ``overlap`` contribute in full.

    Returns:
        A ``defaultdict(int)`` mapping each new substring to its summed count
        over all boundaries.

    Raises:
        ValueError: If ``overlap`` is negative.
    """
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")

    new_repeats = defaultdict(int)
    if overlap == 0:
        # An empty window cannot contain repeats. Returning early also avoids
        # the ``fragment[-0:]`` pitfall, which selects the WHOLE fragment and
        # would trigger a full (and ultimately discarded) rescan.
        return new_repeats

    for left_fragment, right_fragment in zip(fragments, fragments[1:]):
        # Slicing already clamps to the fragment length, so short fragments
        # need no special case.
        left_overlap = left_fragment[-overlap:]
        right_overlap = right_fragment[:overlap]
        left_residues = set(left_overlap)
        right_residues = set(right_overlap)

        boundary_repeats = find_hetero_amino_acid_repeats(left_overlap + right_overlap)
        for substring, count in boundary_repeats.items():
            if substring in final_repeats:
                continue
            # NOTE(review): this only checks that the substring shares at
            # least one character with each window, not that an occurrence
            # actually straddles the cut. Kept as-is to preserve existing
            # results — confirm the intended rule.
            if not left_residues.isdisjoint(substring) and not right_residues.isdisjoint(substring):
                new_repeats[substring] += count
    return new_repeats
# Main function to process the protein sequence
def process_protein_sequence(sequence, overlap=50):
    """Compute repeat counts for *sequence* by scanning it fragment by fragment.

    The sequence is split with ``fragment_protein_sequence`` (default chunk
    size), repeats are counted inside each fragment, and the regions around
    the fragment boundaries are then scanned with a window of ``overlap``
    characters per side.

    Args:
        sequence: The protein sequence to analyse.
        overlap: Window size passed to the boundary helpers.

    Returns:
        A ``defaultdict(int)`` mapping repeated substrings to their
        accumulated counts.
    """
    pieces = fragment_protein_sequence(sequence)
    final_repeats = defaultdict(int)

    # Pass 1: repeats that live inside a single fragment.
    for piece in pieces:
        for repeat, occurrences in find_hetero_amino_acid_repeats(piece).items():
            final_repeats[repeat] += occurrences

    # Pass 2: repeats detected around the cut points.
    final_repeats = check_boundary_repeats(pieces, final_repeats, overlap)

    # Pass 3: boundary repeats not yet present in the totals.
    # NOTE(review): in the visible code, pass 2 has already inserted every
    # qualifying boundary substring into final_repeats, so this pass appears
    # to contribute nothing — confirm the intended order of passes 2 and 3.
    boundary_only = find_new_boundary_repeats(pieces, final_repeats, overlap)
    for repeat, occurrences in boundary_only.items():
        final_repeats[repeat] += occurrences

    return final_repeats
# Streamlit UI for uploading and processing the Excel file
st.title("Protein Sequence Repeat Finder from Excel")

# Step 1: Upload the Excel file
uploaded_file = st.file_uploader("Upload Excel file containing Protein Sequences", type=["xlsx"])

if uploaded_file is not None:
    # Step 2: Read the Excel file using Pandas
    df = pd.read_excel(uploaded_file)

    # Show the first few rows of the uploaded data for preview
    st.write("Preview of Uploaded Data:")
    st.write(df.head())

    # The processing loop reads exactly these columns; fail with a readable
    # message instead of a KeyError traceback when the sheet lacks one.
    required_columns = ["Entry", "Entry Name", "Sequence"]
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        st.error(
            "The uploaded file is missing required column(s): "
            + ", ".join(missing_columns)
        )
    # Step 3: Process each protein sequence
    elif st.button("Process Protein Sequences"):
        results = []
        skipped_entries = []

        for _, row in df.iterrows():
            protein_id = row["Entry"]
            protein_name = row["Entry Name"]
            sequence = row["Sequence"]  # The protein sequence is read from the 'Sequence' column

            # Empty cells are read as NaN (a float), which cannot be scanned
            # for repeats; skip such rows rather than crash the whole run.
            if not isinstance(sequence, str):
                skipped_entries.append(str(protein_id))
                continue

            # Process the protein sequence; store a plain dict rather than
            # the defaultdict returned by the helper.
            repeats = dict(process_protein_sequence(sequence))

            # Prepare data for MongoDB
            result_data = {
                "protein_id": protein_id,
                "protein_name": protein_name,
                "protein_sequence": sequence,
                "calculated_repeats": repeats
            }

            # Insert results into MongoDB
            results_collection.insert_one(result_data)

            # Add results to display
            results.append({
                "Entry": protein_id,
                "Entry Name": protein_name,
                "Repeats": repeats
            })

        # Step 4: Display the results
        st.subheader("Protein Sequences Processed")
        st.write(results)
        if skipped_entries:
            st.warning(
                "Skipped entries without a valid sequence: "
                + ", ".join(skipped_entries)
            )
        st.success("Protein sequences processed and results stored in MongoDB.")