Spaces:
Running
Running
roni
committed on
Commit
·
33eb5d4
1
Parent(s):
e1f535f
per gene aggregation
Browse files
- app.py +50 -22
- protein_viz.py +1 -1
app.py
CHANGED
@@ -1,7 +1,10 @@
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
from get_index import get_engines
|
4 |
-
from protein_viz import
|
5 |
|
6 |
index_repo = "ronig/siamese_protein_index"
|
7 |
model_repo = "ronig/protein_search_engine"
|
@@ -13,14 +16,17 @@ This application enables a quick protein-peptide binding search based on sequenc
|
|
13 |
You can use it to search the full [PDB](https://www.rcsb.org/) database or in a specific organism genome.
|
14 |
"""
|
15 |
max_results = 1000
|
|
|
16 |
|
17 |
|
18 |
-
def search_and_display(seq,
|
19 |
-
|
|
|
20 |
engine = engines[index_selection]
|
21 |
-
search_res = engine.search_by_sequence(seq, n=
|
22 |
-
|
23 |
-
formatted_search_results = format_search_results(
|
|
|
24 |
return formatted_search_results, results_options
|
25 |
|
26 |
|
@@ -28,12 +34,42 @@ def limit_n_results(n):
|
|
28 |
return max(min(n, max_results), 1)
|
29 |
|
30 |
|
31 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
choices = []
|
33 |
-
for
|
34 |
-
|
35 |
-
choice =
|
36 |
choices.append(choice)
|
|
|
37 |
if choices:
|
38 |
update = gr.Dropdown.update(
|
39 |
choices=choices, interactive=True, value=choices[0], visible=True
|
@@ -45,14 +81,6 @@ def update_dropdown_menu(search_res):
|
|
45 |
return update
|
46 |
|
47 |
|
48 |
-
def format_search_results(raw_search_results):
|
49 |
-
formatted_search_results = {}
|
50 |
-
for res in raw_search_results:
|
51 |
-
key, value = parse_pdb_search_result(res)
|
52 |
-
formatted_search_results[key] = value
|
53 |
-
return formatted_search_results
|
54 |
-
|
55 |
-
|
56 |
def parse_pdb_search_result(raw_result):
|
57 |
prot = raw_result["pdb_name"]
|
58 |
chain = raw_result["chain_id"]
|
@@ -71,12 +99,12 @@ def switch_viz(new_choice):
|
|
71 |
title_update = gr.Markdown.update(visible=False)
|
72 |
description_update = gr.Markdown.update(value=None, visible=False)
|
73 |
else:
|
74 |
-
choice_parts = new_choice.split(
|
75 |
-
pdb_id, chain = choice_parts[
|
76 |
title_update = gr.Markdown.update(visible=True)
|
77 |
-
|
78 |
|
79 |
-
new_value = f"""**PDB Title**: {
|
80 |
|
81 |
description_update = gr.Markdown.update(value=new_value, visible=True)
|
82 |
html = render_html(pdb_id=pdb_id, chain=chain)
|
|
|
1 |
+
import collections
|
2 |
+
from typing import Dict, List
|
3 |
+
|
4 |
import gradio as gr
|
5 |
|
6 |
from get_index import get_engines
|
7 |
+
from protein_viz import get_pdb_title, render_html
|
8 |
|
9 |
index_repo = "ronig/siamese_protein_index"
|
10 |
model_repo = "ronig/protein_search_engine"
|
|
|
16 |
You can use it to search the full [PDB](https://www.rcsb.org/) database or in a specific organism genome.
|
17 |
"""
|
18 |
max_results = 1000
|
19 |
+
choice_sep = " | "
|
20 |
|
21 |
|
22 |
+
def search_and_display(seq, max_res, index_selection, n_search_res=10000):
    """Search the selected index and build the two UI outputs.

    Args:
        seq: Query sequence forwarded to the engine's ``search_by_sequence``.
        max_res: Requested number of genes to report; clamped through
            ``limit_n_results`` and converted to ``int``.
        index_selection: Key into the module-level ``engines`` mapping.
        n_search_res: Number of raw hits requested from the engine.
            Presumably over-fetched because several raw hits can collapse
            into a single gene during aggregation -- confirm with the
            engine's cost profile before lowering it.

    Returns:
        A ``(formatted_search_results, results_options)`` tuple: the value
        produced by ``format_search_results`` and the dropdown update
        produced by ``update_dropdown_menu``, both computed from the same
        per-gene aggregation.
    """
    max_res = int(limit_n_results(max_res))
    engine = engines[index_selection]
    search_res = engine.search_by_sequence(seq, n=n_search_res)
    agg_search_results = aggregate_search_results(search_res, max_res)
    formatted_search_results = format_search_results(agg_search_results)
    results_options = update_dropdown_menu(agg_search_results)
    return formatted_search_results, results_options
|
31 |
|
32 |
|
|
|
34 |
return max(min(n, max_results), 1)
|
35 |
|
36 |
|
37 |
+
def aggregate_search_results(
    raw_results: List[dict], max_res: int
) -> Dict[str, List[dict]]:
    """Group raw search hits by gene name, keeping at most ``max_res`` genes.

    Each raw hit is reduced to its ``pdb_name``, ``chain_id``, ``score`` and
    ``organism`` fields and appended to the list of every gene named in the
    hit's whitespace-separated ``genes`` string. Hits whose ``genes`` value is
    missing or ``None`` are ignored. Genes appear in the result in the order
    they are first encountered in ``raw_results``.

    Args:
        raw_results: Raw hits; each must provide the four entry fields above
            and may provide ``genes``.
        max_res: Maximum number of distinct genes to return.

    Returns:
        A plain dict mapping gene name to the list of entries collected for
        it. Collection stops as soon as ``max_res`` genes have been gathered.

    Raises:
        KeyError: If a hit with genes lacks one of the four entry fields.
    """
    aggregated_by_gene = collections.defaultdict(list)
    if max_res <= 0:
        return dict(aggregated_by_gene)

    for raw_result in raw_results:
        genes = raw_result.get("genes")
        if genes is None:
            continue
        entry = select_keys(raw_result, ["pdb_name", "chain_id", "score", "organism"])
        # split() with no argument drops empty tokens from repeated or
        # leading/trailing spaces; dict.fromkeys de-duplicates gene names
        # within one hit while preserving their order.
        for gene in dict.fromkeys(genes.split()):
            aggregated_by_gene[gene].append(entry)
            # Check the cap after every gene, not after every hit, so a hit
            # naming several genes cannot push the result past max_res.
            if len(aggregated_by_gene) >= max_res:
                return dict(aggregated_by_gene)
    return dict(aggregated_by_gene)


def select_keys(d: dict, keys: List[str]) -> dict:
    """Return a new dict holding only ``keys`` from ``d``.

    Raises:
        KeyError: If any requested key is absent from ``d``.
    """
    return {key: d[key] for key in keys}
|
53 |
+
|
54 |
+
|
55 |
+
def format_search_results(agg_search_results):
    """Build the label -> score mapping shown for the aggregated results.

    Args:
        agg_search_results: Mapping of gene name to a list of entry dicts,
            each providing ``organism`` and ``score``.

    Returns:
        A dict keyed by ``"Gene: <gene> | Organism: <organism>"`` whose value
        is the score of the first entry listed for that gene. Genes with no
        entries are skipped.
    """
    formatted_search_results = {}
    for gene, entries in agg_search_results.items():
        if not entries:
            # No hit to take organism/score from; skipping avoids IndexError.
            continue
        # Only the first entry represents the gene in the label.
        entry = entries[0]
        organism = entry["organism"]
        score = entry["score"]
        key = f"Gene: {gene} | Organism: {organism}"
        formatted_search_results[key] = score
    return formatted_search_results
|
64 |
+
|
65 |
+
|
66 |
+
def update_dropdown_menu(agg_search_res):
|
67 |
choices = []
|
68 |
+
for gene, entries in agg_search_res.items():
|
69 |
+
for entry in entries:
|
70 |
+
choice = choice_sep.join([gene, entry["pdb_name"], entry["chain_id"]])
|
71 |
choices.append(choice)
|
72 |
+
|
73 |
if choices:
|
74 |
update = gr.Dropdown.update(
|
75 |
choices=choices, interactive=True, value=choices[0], visible=True
|
|
|
81 |
return update
|
82 |
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
def parse_pdb_search_result(raw_result):
|
85 |
prot = raw_result["pdb_name"]
|
86 |
chain = raw_result["chain_id"]
|
|
|
99 |
title_update = gr.Markdown.update(visible=False)
|
100 |
description_update = gr.Markdown.update(value=None, visible=False)
|
101 |
else:
|
102 |
+
choice_parts = new_choice.split(choice_sep)
|
103 |
+
pdb_id, chain = choice_parts[1:3]
|
104 |
title_update = gr.Markdown.update(visible=True)
|
105 |
+
pdb_title = get_pdb_title(pdb_id)
|
106 |
|
107 |
+
new_value = f"""**PDB Title**: {pdb_title}"""
|
108 |
|
109 |
description_update = gr.Markdown.update(value=new_value, visible=True)
|
110 |
html = render_html(pdb_id=pdb_id, chain=chain)
|
protein_viz.py
CHANGED
@@ -30,7 +30,7 @@ def render_html(pdb_id, chain):
|
|
30 |
return iframe
|
31 |
|
32 |
|
33 |
-
def
|
34 |
url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
35 |
response = requests.get(url, timeout=1)
|
36 |
if response.ok:
|
|
|
30 |
return iframe
|
31 |
|
32 |
|
33 |
+
def get_pdb_title(pdb_id: str):
|
34 |
url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
|
35 |
response = requests.get(url, timeout=1)
|
36 |
if response.ok:
|