Spaces:

Jayesh13
/

Homo_hetero

Sleeping

App Files Files Community

Jayesh13 committed 20 days ago

Commit

690d85c

verified ·

1 Parent(s): 10b45fa

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -31

app.py CHANGED Viewed

@@ -6,8 +6,54 @@ import pandas as pd
 import xlsxwriter
 from io import BytesIO
 from collections import defaultdict
-# Utility to check homo repeat
 def is_homo_repeat(s):
     """Return True when every element of ``s`` equals its first element.

     An empty ``s`` yields True because there is nothing to contradict it.
     """
     for ch in s:
         if ch != s[0]:
             return False
     return True
@@ -84,14 +130,13 @@ def process_protein_sequence(sequence, analysis_type, overlap=50):
             fragment_repeats = find_hetero_amino_acid_repeats(fragment)
             for k, v in fragment_repeats.items():
                 hetero_repeats[k] += v
-        hetero_repeats = check_boundary_repeats(fragments, hetero_repeats, overlap)
-        new_repeats = find_new_boundary_repeats(fragments, hetero_repeats, overlap)
         for k, v in new_repeats.items():
             hetero_repeats[k] += v
         hetero_repeats = {k: v for k, v in hetero_repeats.items() if not is_homo_repeat(k)}
         homo_repeats = find_homorepeats(sequence)
         final_repeats = homo_repeats.copy()
         for k, v in hetero_repeats.items():
             final_repeats[k] += v
@@ -140,7 +185,8 @@ def create_excel(sequences_data, repeats, filenames):
     output.seek(0)
     return output
-st.title("Protein Repeat Analysis")
 analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
 uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
@@ -148,29 +194,43 @@ if uploaded_files:
     all_repeats = set()
     all_sequences_data = []
     filenames = []
     for file in uploaded_files:
-        excel_data = pd.ExcelFile(file)
-        repeats, sequence_data = process_excel(excel_data, analysis_type)
-        if repeats is not None:
-            all_repeats.update(repeats)
-            all_sequences_data.append(sequence_data)
-            filenames.append(file.name)
-    if all_sequences_data:
-        st.success(f"Processed {len(uploaded_files)} files successfully!")
-        excel_file = create_excel(all_sequences_data, all_repeats, filenames)
-        st.download_button(
-            label="Download Excel file",
-            data=excel_file,
-            file_name="protein_repeat_results.xlsx",
-            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
-        )
-        if st.checkbox("Show Results Table"):
-            rows = []
-            for file_index, file_data in enumerate(all_sequences_data):
-                filename = filenames[file_index]
-                for entry_id, protein_name, freq in file_data:
-                    row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
-                    row.update({repeat: freq.get(repeat, 0) for repeat in sorted(all_repeats)})
-                    rows.append(row)
-            result_df = pd.DataFrame(rows)
-            st.dataframe(result_df)

 import xlsxwriter
 from io import BytesIO
 from collections import defaultdict
+import hashlib
+import sqlite3
+import base64
+# Initialize DB
def init_db():
    """Create the SQLite result cache table if it does not exist yet.

    Opens ``file_cache.db`` in the current working directory and ensures the
    ``file_cache`` table is present. Calling this more than once is harmless
    because the statement uses ``CREATE TABLE IF NOT EXISTS``.

    The primary key is ``(file_hash, analysis_type)`` so that results for the
    same file under different analysis types can coexist; with a key on
    ``file_hash`` alone, an ``INSERT OR REPLACE`` for one analysis type would
    evict the row stored for another. A database created earlier with the old
    single-column key is left untouched by this function.
    """
    conn = sqlite3.connect("file_cache.db")
    try:
        conn.execute(
            '''
            CREATE TABLE IF NOT EXISTS file_cache (
                file_hash TEXT,
                file_name TEXT,
                analysis_type TEXT,
                result BLOB,
                PRIMARY KEY (file_hash, analysis_type)
            )
            '''
        )
        conn.commit()
    finally:
        # Close even when the statement fails so the handle is never leaked.
        conn.close()
+init_db()
+# Hashing function
def get_file_hash(file):
    """Return the SHA-256 hex digest of the content of ``file``.

    ``file`` is a binary file-like object. When it is seekable, the whole
    content is hashed regardless of where the read position currently is, and
    the position is restored afterwards, so the caller's stream is left as it
    was found. A non-seekable stream is hashed from its current position.

    The content is fed to the hash in chunks to avoid holding a second full
    copy of a large upload in memory.
    """
    seekable = getattr(file, "seekable", None)
    can_seek = bool(seekable is not None and seekable())

    if can_seek:
        start = file.tell()
        file.seek(0)

    digest = hashlib.sha256()
    try:
        for chunk in iter(lambda: file.read(1 << 20), b""):
            digest.update(chunk)
    finally:
        if can_seek:
            # Leave the stream where the caller had it.
            file.seek(start)
    return digest.hexdigest()
+# Check if file hash exists in DB
def check_cache(file_hash, analysis_type):
    """Look up a cached result for a file hash and analysis type.

    Queries the ``file_cache`` table in ``file_cache.db`` for a row matching
    both ``file_hash`` and ``analysis_type``.

    Returns:
        A ``BytesIO`` holding the base64-decoded ``result`` column when a row
        is found, otherwise ``None``.
    """
    conn = sqlite3.connect("file_cache.db")
    try:
        row = conn.execute(
            "SELECT result FROM file_cache WHERE file_hash = ? AND analysis_type = ?",
            (file_hash, analysis_type),
        ).fetchone()
    finally:
        # Close even when the query fails (e.g. the table is missing).
        conn.close()

    if row is None:
        return None
    return BytesIO(base64.b64decode(row[0]))
+# Store result in DB
def cache_result(file_hash, file_name, analysis_type, result_bytes):
    """Store a result in the ``file_cache`` table of ``file_cache.db``.

    Args:
        file_hash: Hash identifying the input file.
        file_name: Name of the input file, stored alongside the hash.
        analysis_type: Analysis type the result was produced for.
        result_bytes: The result, either as a binary file-like object (read
            from its current position) or as raw ``bytes``.

    The payload is stored base64-encoded as text. An existing row that
    conflicts on the table's primary key is replaced.

    A file-like ``result_bytes`` has its read position restored after reading,
    so the caller can keep using the same stream afterwards.
    """
    if isinstance(result_bytes, (bytes, bytearray, memoryview)):
        payload = bytes(result_bytes)
    else:
        start = result_bytes.tell()
        payload = result_bytes.read()
        # Do not leave the caller's stream at EOF.
        result_bytes.seek(start)

    encoded = base64.b64encode(payload).decode('utf-8')

    conn = sqlite3.connect("file_cache.db")
    try:
        conn.execute(
            "INSERT OR REPLACE INTO file_cache (file_hash, file_name, analysis_type, result) VALUES (?, ?, ?, ?)",
            (file_hash, file_name, analysis_type, encoded),
        )
        conn.commit()
    finally:
        # Close even when the insert fails so the handle is never leaked.
        conn.close()
+# === Protein Analysis Logic ===
 def is_homo_repeat(s):
     """Tell whether ``s`` consists of one element repeated throughout.

     Returns True for an empty ``s`` as well, since no element differs.
     """
     has_mismatch = any(ch != s[0] for ch in s)
     return not has_mismatch
             fragment_repeats = find_hetero_amino_acid_repeats(fragment)
             for k, v in fragment_repeats.items():
                 hetero_repeats[k] += v
+        hetero_repeats = check_boundary_repeats(fragments, hetero_repeats)
+        new_repeats = find_new_boundary_repeats(fragments, hetero_repeats)
         for k, v in new_repeats.items():
             hetero_repeats[k] += v
         hetero_repeats = {k: v for k, v in hetero_repeats.items() if not is_homo_repeat(k)}
         homo_repeats = find_homorepeats(sequence)
         final_repeats = homo_repeats.copy()
         for k, v in hetero_repeats.items():
             final_repeats[k] += v
     output.seek(0)
     return output
+# === Streamlit UI ===
+st.title("Protein Repeat Analysis with Caching")
 analysis_type = st.radio("Select analysis type:", ["Homo", "Hetero", "Both"], index=2)
 uploaded_files = st.file_uploader("Upload Excel files", accept_multiple_files=True, type=["xlsx"])
     all_repeats = set()
     all_sequences_data = []
     filenames = []
+    final_output = BytesIO()
     for file in uploaded_files:
+        file.seek(0)
+        file_hash = get_file_hash(file)
+        file.seek(0)
+        cached = check_cache(file_hash, analysis_type)
+        if cached:
+            st.success(f"Using cached result for {file.name}")
+            cached_content = cached.read()
+            final_output.write(cached_content)
+            final_output.seek(0)
+        else:
+            st.info(f"Processing {file.name}...")
+            excel_data = pd.ExcelFile(file)
+            repeats, sequence_data = process_excel(excel_data, analysis_type)
+            if repeats is not None:
+                all_repeats.update(repeats)
+                all_sequences_data.append(sequence_data)
+                filenames.append(file.name)
+            excel_file = create_excel(all_sequences_data, all_repeats, filenames)
+            cache_result(file_hash, file.name, analysis_type, excel_file)
+            final_output = excel_file
+    st.download_button(
+        label="Download Excel file",
+        data=final_output,
+        file_name="protein_repeat_results.xlsx",
+        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    )
+    if st.checkbox("Show Results Table"):
+        rows = []
+        for file_index, file_data in enumerate(all_sequences_data):
+            filename = filenames[file_index]
+            for entry_id, protein_name, freq in file_data:
+                row = {"Filename": filename, "Entry ID": entry_id, "Protein Name": protein_name}
+                row.update({repeat: freq.get(repeat, 0) for repeat in sorted(all_repeats)})
+                rows.append(row)
+        result_df = pd.DataFrame(rows)
+        st.dataframe(result_df)