Jayesh13 committed on
Commit
690d85c
·
verified ·
1 Parent(s): 10b45fa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -31
app.py CHANGED
@@ -6,8 +6,54 @@ import pandas as pd
6
  import xlsxwriter
7
  from io import BytesIO
8
  from collections import defaultdict
9
-
10
- # Utility to check homo repeat
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def is_homo_repeat(s):
12
  return all(c == s[0] for c in s)
13
 
@@ -84,14 +130,13 @@ def process_protein_sequence(sequence, analysis_type, overlap=50):
84
  fragment_repeats = find_hetero_amino_acid_repeats(fragment)
85
  for k, v in fragment_repeats.items():
86
  hetero_repeats[k] += v
87
- hetero_repeats = check_boundary_repeats(fragments, hetero_repeats, overlap)
88
- new_repeats = find_new_boundary_repeats(fragments, hetero_repeats, overlap)
89
  for k, v in new_repeats.items():
90
  hetero_repeats[k] += v
91
  hetero_repeats = {k: v for k, v in hetero_repeats.items() if not is_homo_repeat(k)}
92
 
93
  homo_repeats = find_homorepeats(sequence)
94
-
95
  final_repeats = homo_repeats.copy()
96
  for k, v in hetero_repeats.items():
97
  final_repeats[k] += v
@@ -140,7 +185,8 @@ def create_excel(sequences_data, repeats, filenames):
140
  output.seek(0)
141
  return output
142
 
143
- st.title("Protein Repeat Analysis")
 
144
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
145
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
146
 
@@ -148,29 +194,43 @@ if uploaded_files:
148
  all_repeats = set()
149
  all_sequences_data = []
150
  filenames = []
 
151
  for file in uploaded_files:
152
- excel_data = pd.ExcelFile(file)
153
- repeats, sequence_data = process_excel(excel_data, analysis_type)
154
- if repeats is not None:
155
- all_repeats.update(repeats)
156
- all_sequences_data.append(sequence_data)
157
- filenames.append(file.name)
158
- if all_sequences_data:
159
- st.success(f"Processed {len(uploaded_files)} files successfully!")
160
- excel_file = create_excel(all_sequences_data, all_repeats, filenames)
161
- st.download_button(
162
- label="Download Excel file",
163
- data=excel_file,
164
- file_name="protein_repeat_results.xlsx",
165
- mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
166
- )
167
- if st.checkbox("Show Results Table"):
168
- rows = []
169
- for file_index, file_data in enumerate(all_sequences_data):
170
- filename = filenames[file_index]
171
- for entry_id, protein_name, freq in file_data:
172
- row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
173
- row.update({repeat: freq.get(repeat, 0) for repeat in sorted(all_repeats)})
174
- rows.append(row)
175
- result_df = pd.DataFrame(rows)
176
- st.dataframe(result_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import xlsxwriter
7
  from io import BytesIO
8
  from collections import defaultdict
9
+ import hashlib
10
+ import sqlite3
11
+ import base64
12
+
13
+ # Initialize DB
14
def init_db(db_path="file_cache.db"):
    """Create the SQLite result cache table if it does not exist yet.

    The ``file_cache`` table is keyed on the pair (file_hash,
    analysis_type) so that results computed for different analysis types
    of the same file are stored side by side instead of replacing each
    other.

    Note that ``CREATE TABLE IF NOT EXISTS`` leaves an already existing
    table untouched, so a database created with an older schema keeps
    that schema.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``file_cache.db`` in the current working directory.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
            CREATE TABLE IF NOT EXISTS file_cache (
                file_hash TEXT,
                file_name TEXT,
                analysis_type TEXT,
                result BLOB,
                PRIMARY KEY (file_hash, analysis_type)
            )
        ''')
        conn.commit()
    finally:
        # Always release the database handle, even if table creation fails.
        conn.close()
27
+
28
+ init_db()
29
+
30
+ # Hashing function
31
def get_file_hash(file):
    """Return the SHA-256 hex digest of a seekable binary file object.

    The whole stream is hashed from its first byte regardless of where
    the read pointer currently is, so a partially consumed stream still
    produces the digest of the full content. The pointer is put back to
    where it was before the call.

    Args:
        file: Binary file-like object supporting ``read``, ``seek`` and
            ``tell``.

    Returns:
        The hexadecimal SHA-256 digest of the stream's content.
    """
    start = file.tell()
    digest = hashlib.sha256()
    try:
        file.seek(0)
        # Feed the hash in 1 MiB chunks instead of loading the whole
        # upload into memory at once.
        for chunk in iter(lambda: file.read(1 << 20), b""):
            digest.update(chunk)
    finally:
        file.seek(start)
    return digest.hexdigest()
33
+
34
+ # Check if file hash exists in DB
35
def check_cache(file_hash, analysis_type, db_path="file_cache.db"):
    """Look up a cached result for a file hash and analysis type.

    Args:
        file_hash: Hash string identifying the uploaded file.
        analysis_type: Analysis type the result was computed for.
        db_path: Path of the SQLite database file. Defaults to
            ``file_cache.db`` in the current working directory.

    Returns:
        A ``BytesIO`` positioned at the start of the decoded result, or
        ``None`` when no matching row (or no stored payload) exists.
    """
    conn = sqlite3.connect(db_path)
    try:
        row = conn.execute(
            "SELECT result FROM file_cache WHERE file_hash = ? AND analysis_type = ?",
            (file_hash, analysis_type),
        ).fetchone()
    finally:
        # Close the connection even if the query raises.
        conn.close()
    if row is None or row[0] is None:
        return None
    # Results are stored base64-encoded, so decode back to raw bytes.
    return BytesIO(base64.b64decode(row[0]))
44
+
45
+ # Store result in DB
46
def cache_result(file_hash, file_name, analysis_type, result_bytes, db_path="file_cache.db"):
    """Store a result buffer in the cache, replacing any conflicting row.

    The complete content of ``result_bytes`` is stored base64-encoded.
    The buffer's read position is restored afterwards so the caller can
    keep using the same buffer.

    Args:
        file_hash: Hash string identifying the uploaded file.
        file_name: Original name of the uploaded file.
        analysis_type: Analysis type the result was computed for.
        result_bytes: Seekable binary buffer holding the result.
        db_path: Path of the SQLite database file. Defaults to
            ``file_cache.db`` in the current working directory.
    """
    start = result_bytes.tell()
    try:
        # Read from the first byte so the full payload is cached no matter
        # where the caller left the read pointer.
        result_bytes.seek(0)
        payload = result_bytes.read()
    finally:
        result_bytes.seek(start)
    encoded = base64.b64encode(payload).decode('utf-8')

    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "INSERT OR REPLACE INTO file_cache (file_hash, file_name, analysis_type, result) VALUES (?, ?, ?, ?)",
            (file_hash, file_name, analysis_type, encoded),
        )
        conn.commit()
    finally:
        # Close the connection even if the insert raises.
        conn.close()
55
+
56
+ # === Protein Analysis Logic ===
57
  def is_homo_repeat(s):
58
  return all(c == s[0] for c in s)
59
 
 
130
  fragment_repeats = find_hetero_amino_acid_repeats(fragment)
131
  for k, v in fragment_repeats.items():
132
  hetero_repeats[k] += v
133
+ hetero_repeats = check_boundary_repeats(fragments, hetero_repeats)
134
+ new_repeats = find_new_boundary_repeats(fragments, hetero_repeats)
135
  for k, v in new_repeats.items():
136
  hetero_repeats[k] += v
137
  hetero_repeats = {k: v for k, v in hetero_repeats.items() if not is_homo_repeat(k)}
138
 
139
  homo_repeats = find_homorepeats(sequence)
 
140
  final_repeats = homo_repeats.copy()
141
  for k, v in hetero_repeats.items():
142
  final_repeats[k] += v
 
185
  output.seek(0)
186
  return output
187
 
188
+ # === Streamlit UI ===
189
+ st.title("Protein Repeat Analysis with Caching")
190
  analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
191
  uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
192
 
 
194
  all_repeats = set()
195
  all_sequences_data = []
196
  filenames = []
197
+ final_output = BytesIO()
198
  for file in uploaded_files:
199
+ file.seek(0)
200
+ file_hash = get_file_hash(file)
201
+ file.seek(0)
202
+ cached = check_cache(file_hash, analysis_type)
203
+ if cached:
204
+ st.success(f"Using cached result for {file.name}")
205
+ cached_content = cached.read()
206
+ final_output.write(cached_content)
207
+ final_output.seek(0)
208
+ else:
209
+ st.info(f"Processing {file.name}...")
210
+ excel_data = pd.ExcelFile(file)
211
+ repeats, sequence_data = process_excel(excel_data, analysis_type)
212
+ if repeats is not None:
213
+ all_repeats.update(repeats)
214
+ all_sequences_data.append(sequence_data)
215
+ filenames.append(file.name)
216
+ excel_file = create_excel(all_sequences_data, all_repeats, filenames)
217
+ cache_result(file_hash, file.name, analysis_type, excel_file)
218
+ final_output = excel_file
219
+
220
+ st.download_button(
221
+ label="Download Excel file",
222
+ data=final_output,
223
+ file_name="protein_repeat_results.xlsx",
224
+ mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
225
+ )
226
+
227
+ if st.checkbox("Show Results Table"):
228
+ rows = []
229
+ for file_index, file_data in enumerate(all_sequences_data):
230
+ filename = filenames[file_index]
231
+ for entry_id, protein_name, freq in file_data:
232
+ row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
233
+ row.update({repeat: freq.get(repeat, 0) for repeat in sorted(all_repeats)})
234
+ rows.append(row)
235
+ result_df = pd.DataFrame(rows)
236
+ st.dataframe(result_df)