Jayesh13 committed on
Commit
19a6e11
·
verified ·
1 Parent(s): 690d85c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -198
app.py CHANGED
@@ -1,236 +1,109 @@
1
  import os
2
  os.system("pip install streamlit pandas xlsxwriter openpyxl")
3
 
4
- import streamlit as st
5
- import pandas as pd
6
- import xlsxwriter
7
- from io import BytesIO
8
  from collections import defaultdict
9
- import hashlib
10
- import sqlite3
11
- import base64
12
-
13
- # Initialize DB
14
def init_db():
    """Create the on-disk SQLite cache table used to memoize analysis results.

    Idempotent: uses CREATE TABLE IF NOT EXISTS, so repeated calls are safe.
    The database file `file_cache.db` is created in the current working
    directory on first use.
    """
    conn = sqlite3.connect("file_cache.db")
    try:
        # One row per (file, analysis) pair; `result` stores the base64-encoded
        # Excel output produced by the analysis.
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS file_cache (
                file_hash TEXT PRIMARY KEY,
                file_name TEXT,
                analysis_type TEXT,
                result BLOB
            )
            """
        )
        conn.commit()
    finally:
        # Fix: the original leaked the connection if execute() raised.
        conn.close()
27
-
28
- init_db()
29
-
30
- # Hashing function
31
def get_file_hash(file):
    """Return the SHA-256 hex digest of the stream's remaining contents.

    NOTE: this consumes the stream; callers that want to re-read the file
    must seek(0) afterwards.
    """
    digest = hashlib.sha256()
    digest.update(file.read())
    return digest.hexdigest()
33
-
34
- # Check if file hash exists in DB
35
def check_cache(file_hash, analysis_type):
    """Look up a cached analysis result.

    Args:
        file_hash: SHA-256 hex digest of the uploaded file.
        analysis_type: one of "Homo" / "Hetero" / "Both".

    Returns:
        A BytesIO positioned at 0 holding the decoded Excel bytes on a cache
        hit, or None on a miss.
    """
    conn = sqlite3.connect("file_cache.db")
    try:
        row = conn.execute(
            "SELECT result FROM file_cache WHERE file_hash = ? AND analysis_type = ?",
            (file_hash, analysis_type),
        ).fetchone()
    finally:
        # Fix: the original leaked the connection if the query raised.
        conn.close()
    if row:
        # Results are stored base64-encoded; decode back to raw Excel bytes.
        return BytesIO(base64.b64decode(row[0]))
    return None
44
-
45
- # Store result in DB
46
def cache_result(file_hash, file_name, analysis_type, result_bytes):
    """Store (or overwrite) an analysis result in the SQLite cache.

    Args:
        file_hash: SHA-256 hex digest keying the entry (with analysis_type).
        file_name: original upload name, kept for reference only.
        analysis_type: one of "Homo" / "Hetero" / "Both".
        result_bytes: a readable binary stream (e.g. BytesIO); it is fully
            consumed and stored base64-encoded.
    """
    conn = sqlite3.connect("file_cache.db")
    try:
        conn.execute(
            "INSERT OR REPLACE INTO file_cache (file_hash, file_name, analysis_type, result) VALUES (?, ?, ?, ?)",
            (
                file_hash,
                file_name,
                analysis_type,
                base64.b64encode(result_bytes.read()).decode('utf-8'),
            ),
        )
        conn.commit()
    finally:
        # Fix: the original leaked the connection if the insert raised.
        conn.close()
55
-
56
- # === Protein Analysis Logic ===
57
def is_homo_repeat(s):
    """Return True when *s* consists of a single repeated character.

    Vacuously True for the empty string, matching the original
    all()-over-empty-generator behavior.
    """
    return len(set(s)) <= 1
59
-
60
def find_homorepeats(protein):
    """Count maximal single-residue runs of length >= 2 in *protein*.

    Returns a defaultdict mapping the literal run text (e.g. "AAA") to the
    number of times that exact maximal run occurs.
    """
    counts = defaultdict(int)
    pos = 0
    total = len(protein)
    while pos < total:
        start = pos
        # Advance to the end of the current run of identical residues.
        while pos < total and protein[pos] == protein[start]:
            pos += 1
        # Single characters are not repeats; only runs of 2+ are recorded.
        if pos - start > 1:
            counts[protein[start:pos]] += 1
    return counts
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  def find_hetero_amino_acid_repeats(sequence):
75
  repeat_counts = defaultdict(int)
 
 
76
  for length in range(2, len(sequence) + 1):
77
  for i in range(len(sequence) - length + 1):
78
  substring = sequence[i:i+length]
79
  repeat_counts[substring] += 1
80
- return {k: v for k, v in repeat_counts.items() if v > 1}
81
 
82
- def fragment_protein_sequence(sequence, max_length=1000):
83
- return [sequence[i:i+max_length] for i in range(0, len(sequence), max_length)]
84
 
 
85
  def check_boundary_repeats(fragments, final_repeats, overlap=50):
86
  for i in range(len(fragments) - 1):
87
- left_overlap = fragments[i][-overlap:]
88
- right_overlap = fragments[i + 1][:overlap]
89
  overlap_region = left_overlap + right_overlap
 
90
  boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
 
91
  for substring, count in boundary_repeats.items():
92
  if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
93
- final_repeats[substring] += count
 
94
  return final_repeats
95
 
 
96
  def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
97
  new_repeats = defaultdict(int)
 
98
  for i in range(len(fragments) - 1):
99
- left_overlap = fragments[i][-overlap:]
100
- right_overlap = fragments[i + 1][:overlap]
101
  overlap_region = left_overlap + right_overlap
 
102
  boundary_repeats = find_hetero_amino_acid_repeats(overlap_region)
 
103
  for substring, count in boundary_repeats.items():
104
  if any(aa in left_overlap for aa in substring) and any(aa in right_overlap for aa in substring):
105
  if substring not in final_repeats:
106
  new_repeats[substring] += count
 
107
  return new_repeats
108
 
109
- def process_protein_sequence(sequence, analysis_type, overlap=50):
 
110
  fragments = fragment_protein_sequence(sequence)
111
- final_repeats = defaultdict(int)
112
 
113
- if analysis_type == "Hetero":
114
- for fragment in fragments:
115
- fragment_repeats = find_hetero_amino_acid_repeats(fragment)
116
- for k, v in fragment_repeats.items():
117
- final_repeats[k] += v
118
- final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)
119
- new_repeats = find_new_boundary_repeats(fragments, final_repeats, overlap)
120
- for k, v in new_repeats.items():
121
- final_repeats[k] += v
122
- final_repeats = {k: v for k, v in final_repeats.items() if not is_homo_repeat(k)}
123
-
124
- elif analysis_type == "Homo":
125
- final_repeats = find_homorepeats(sequence)
126
-
127
- elif analysis_type == "Both":
128
- hetero_repeats = defaultdict(int)
129
- for fragment in fragments:
130
- fragment_repeats = find_hetero_amino_acid_repeats(fragment)
131
- for k, v in fragment_repeats.items():
132
- hetero_repeats[k] += v
133
- hetero_repeats = check_boundary_repeats(fragments, hetero_repeats)
134
- new_repeats = find_new_boundary_repeats(fragments, hetero_repeats)
135
- for k, v in new_repeats.items():
136
- hetero_repeats[k] += v
137
- hetero_repeats = {k: v for k, v in hetero_repeats.items() if not is_homo_repeat(k)}
138
-
139
- homo_repeats = find_homorepeats(sequence)
140
- final_repeats = homo_repeats.copy()
141
- for k, v in hetero_repeats.items():
142
  final_repeats[k] += v
143
 
 
 
 
 
 
 
 
 
 
 
144
  return final_repeats
145
 
146
- def process_excel(excel_data, analysis_type):
147
- repeats = set()
148
- sequence_data = []
149
- for sheet_name in excel_data.sheet_names:
150
- df = excel_data.parse(sheet_name)
151
- if len(df.columns) < 3:
152
- st.error(f"Error: The sheet '{sheet_name}' must have at least three columns: ID, Protein Name, Sequence")
153
- return None, None
154
- for _, row in df.iterrows():
155
- entry_id = str(row[0])
156
- protein_name = str(row[1])
157
- sequence = str(row[2]).replace('"', '').replace(' ', '')
158
- freq = process_protein_sequence(sequence, analysis_type)
159
- sequence_data.append((entry_id, protein_name, freq))
160
- repeats.update(freq.keys())
161
- return repeats, sequence_data
162
-
163
- def create_excel(sequences_data, repeats, filenames):
164
- output = BytesIO()
165
- workbook = xlsxwriter.Workbook(output, {'in_memory': True})
166
- for file_index, file_data in enumerate(sequences_data):
167
- filename = filenames[file_index]
168
- worksheet = workbook.add_worksheet(filename[:31])
169
- worksheet.write(0, 0, "Entry ID")
170
- worksheet.write(0, 1, "Protein Name")
171
- col = 2
172
- for repeat in sorted(repeats):
173
- worksheet.write(0, col, repeat)
174
- col += 1
175
- row = 1
176
- for entry_id, protein_name, freq in file_data:
177
- worksheet.write(row, 0, entry_id)
178
- worksheet.write(row, 1, protein_name)
179
- col = 2
180
- for repeat in sorted(repeats):
181
- worksheet.write(row, col, freq.get(repeat, 0))
182
- col += 1
183
- row += 1
184
- workbook.close()
185
- output.seek(0)
186
- return output
187
-
188
- # === Streamlit UI ===
189
- st.title("Protein Repeat Analysis with Caching")
190
- analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
191
- uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
192
-
193
- if uploaded_files:
194
- all_repeats = set()
195
- all_sequences_data = []
196
- filenames = []
197
- final_output = BytesIO()
198
- for file in uploaded_files:
199
- file.seek(0)
200
- file_hash = get_file_hash(file)
201
- file.seek(0)
202
- cached = check_cache(file_hash, analysis_type)
203
- if cached:
204
- st.success(f"Using cached result for {file.name}")
205
- cached_content = cached.read()
206
- final_output.write(cached_content)
207
- final_output.seek(0)
208
- else:
209
- st.info(f"Processing {file.name}...")
210
- excel_data = pd.ExcelFile(file)
211
- repeats, sequence_data = process_excel(excel_data, analysis_type)
212
- if repeats is not None:
213
- all_repeats.update(repeats)
214
- all_sequences_data.append(sequence_data)
215
- filenames.append(file.name)
216
- excel_file = create_excel(all_sequences_data, all_repeats, filenames)
217
- cache_result(file_hash, file.name, analysis_type, excel_file)
218
- final_output = excel_file
219
-
220
- st.download_button(
221
- label="Download Excel file",
222
- data=final_output,
223
- file_name="protein_repeat_results.xlsx",
224
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
225
- )
226
-
227
- if st.checkbox("Show Results Table"):
228
- rows = []
229
- for file_index, file_data in enumerate(all_sequences_data):
230
- filename = filenames[file_index]
231
- for entry_id, protein_name, freq in file_data:
232
- row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
233
- row.update({repeat: freq.get(repeat, 0) for repeat in sorted(all_repeats)})
234
- rows.append(row)
235
- result_df = pd.DataFrame(rows)
236
- st.dataframe(result_df)
 
1
  import os
2
  os.system("pip install streamlit pandas xlsxwriter openpyxl")
3
 
4
+ import random
 
 
 
5
  from collections import defaultdict
6
+ from pymongo import MongoClient
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
# MongoDB connection. SECURITY: the original commit embedded a full
# connection string with plaintext credentials; allow an environment
# override, and the exposed password should be rotated. The hard-coded
# fallback is kept only for backward compatibility.
client = MongoClient(os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://dhruvmangroliya:[email protected]/BTP_DB?retryWrites=true&w=majority",
))

# Access the BTP_DB database and the collection holding computed results.
db = client['BTP_DB']
results_collection = db['protein_results']
14
+
15
# Random test-data generator.
def generate_protein_sequence(length):
    """Return a random protein string of *length* residues.

    Residues are drawn uniformly, with replacement, from the 20 standard
    amino acids via random.choices, so output is reproducible under
    random.seed().
    """
    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    return ''.join(random.choices(alphabet, k=length))
19
+
20
# Split long sequences so downstream O(n^2) substring scans stay tractable.
def fragment_protein_sequence(sequence, max_length=1000):
    """Split *sequence* into consecutive chunks of at most *max_length*."""
    chunks = []
    for start in range(0, len(sequence), max_length):
        chunks.append(sequence[start:start + max_length])
    return chunks
23
+
24
# Exhaustive repeated-substring counter.
def find_hetero_amino_acid_repeats(sequence):
    """Count every substring of length >= 2 and keep those occurring twice+.

    Occurrences may overlap. All O(n^2) substrings are enumerated, so this
    is intended for the bounded fragments produced upstream, not whole
    multi-thousand-residue sequences.
    """
    seen = defaultdict(int)
    n = len(sequence)
    # Enumerate by substring size, then by start offset, tallying each hit.
    for size in range(2, n + 1):
        for start in range(n - size + 1):
            seen[sequence[start:start + size]] += 1
    # Keep only substrings observed more than once.
    return {sub: cnt for sub, cnt in seen.items() if cnt > 1}
36
 
37
# Boundary stitching: repeats cut in half by fragmentation are re-counted here.
def check_boundary_repeats(fragments, final_repeats, overlap=50):
    """Add counts for repeats spanning adjacent fragment boundaries.

    For each pair of neighbouring fragments, a window of up to *overlap*
    characters on either side of the cut is re-scanned; qualifying repeats
    are added into *final_repeats*, which is mutated in place and returned.

    NOTE(review): the spanning test checks character membership in each
    window rather than actual positions, so it can also match substrings
    lying wholly on one side — confirm this looseness is intended.
    """
    for idx in range(len(fragments) - 1):
        left, right = fragments[idx], fragments[idx + 1]
        left_tail = left[-overlap:] if len(left) >= overlap else left
        right_head = right[:overlap] if len(right) >= overlap else right
        region = left_tail + right_head

        for sub, cnt in find_hetero_amino_acid_repeats(region).items():
            touches_left = any(ch in left_tail for ch in sub)
            touches_right = any(ch in right_head for ch in sub)
            if touches_left and touches_right:
                final_repeats[sub] += cnt

    return final_repeats
51
 
52
# Second boundary pass: substrings that exist ONLY across a cut point.
def find_new_boundary_repeats(fragments, final_repeats, overlap=50):
    """Collect boundary-spanning repeats absent from *final_repeats*.

    Mirrors check_boundary_repeats, but instead of adding counts in place
    it returns a fresh defaultdict holding only substrings not already
    present in *final_repeats*, so the caller can merge them explicitly.
    *final_repeats* itself is not modified.
    """
    discovered = defaultdict(int)

    for idx in range(len(fragments) - 1):
        left, right = fragments[idx], fragments[idx + 1]
        left_tail = left[-overlap:] if len(left) >= overlap else left
        right_head = right[:overlap] if len(right) >= overlap else right
        region = left_tail + right_head

        for sub, cnt in find_hetero_amino_acid_repeats(region).items():
            spans_left = any(ch in left_tail for ch in sub)
            spans_right = any(ch in right_head for ch in sub)
            # Only record substrings the fragment pass has never seen.
            if spans_left and spans_right and sub not in final_repeats:
                discovered[sub] += cnt

    return discovered
69
 
70
# Top-level repeat counter for one whole sequence.
def process_protein_sequence(sequence, overlap=50):
    """Count hetero-amino-acid repeats in *sequence*, fragment-wise.

    Pipeline:
      1. split the sequence into <=1000-character fragments,
      2. count repeats independently inside each fragment,
      3. add boundary-spanning occurrences of already-known substrings,
      4. merge in substrings that only appear across fragment boundaries.

    Returns a defaultdict mapping substring -> total count.
    """
    fragments = fragment_protein_sequence(sequence)

    # Per-fragment counts, accumulated into one mapping.
    final_repeats = defaultdict(int)
    for piece in fragments:
        for sub, cnt in find_hetero_amino_acid_repeats(piece).items():
            final_repeats[sub] += cnt

    # Add boundary occurrences of substrings we already know about.
    final_repeats = check_boundary_repeats(fragments, final_repeats, overlap)

    # Fold in substrings that exist only across fragment boundaries.
    for sub, cnt in find_new_boundary_repeats(fragments, final_repeats, overlap).items():
        final_repeats[sub] += cnt

    return final_repeats
92
 
93
# Demo driver: synthesize one 3000-residue sequence, count its repeats,
# and persist the result document to MongoDB.
protein_sequence = generate_protein_sequence(3000)

# Counting is fragment-based; see process_protein_sequence for the pipeline.
calculated_repeats = process_protein_sequence(protein_sequence)

# Document layout: the raw sequence plus its {substring: count} mapping.
# NOTE(review): calculated_repeats is a defaultdict; pymongo stores it as a
# plain sub-document. Keys here are amino-acid letters only, so the MongoDB
# restriction on '.'/'$' in field names is satisfied — confirm if the key
# alphabet ever changes.
data_to_insert = {
    "protein_sequence": protein_sequence,
    "calculated_repeats": calculated_repeats
}

# insert_one is a network round-trip; it raises on connection failure.
inserted_id = results_collection.insert_one(data_to_insert).inserted_id

# Report the ObjectId assigned by the server.
print(f"Data successfully inserted with ID: {inserted_id}")