Tonic committed on
Commit
1f1f2d8
1 Parent(s): 2de8395

add application file

Browse files
Files changed (3) hide show
  1. README.md +6 -6
  2. app.py +238 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Esm3 Conformity Sampling
3
- emoji:
4
- colorFrom: red
5
- colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.1.0
8
  app_file: app.py
9
- pinned: false
10
  license: mit
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Conformity Protein Dynamics
3
+ emoji: 🧬🪬
4
+ colorFrom: indigo
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.1.0
8
  app_file: app.py
9
+ pinned: true
10
  license: mit
11
+ short_description: 'use the ESM3 model to predict protein structures'
12
  ---
13
 
 
app.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import py3Dmol
3
+ import io
4
+ import torch
5
+ import numpy as np
6
+ import os
7
+ import traceback
8
+ import spaces
9
+ # # Install ESM
10
+ # os.system("pip install esm")
11
+
12
+ from huggingface_hub import login
13
+ from esm.models.esm3 import ESM3
14
+ from esm.sdk.api import ESM3InferenceClient, ESMProtein, GenerationConfig
15
+ from esm.utils.structure.protein_chain import ProteinChain
16
+ from Bio.Data import PDBData
17
+ import biotite.structure as bs
18
+ from biotite.structure.io import pdb
19
+ from esm.utils import residue_constants as RC
20
+
21
# Authenticate against the Hugging Face Hub. The token is read from the
# environment and start-up is aborted when it is missing or empty.
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if not hf_token:
    raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
login(token=hf_token)

# Load the "esm3-open" checkpoint once at import time, on CUDA when
# torch reports it as available and on the CPU otherwise.
model: ESM3InferenceClient = ESM3.from_pretrained("esm3-open").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Three-letter residue name -> one-letter code for the 20 standard amino acids.
amino3to1 = {
    'ALA': 'A',
    'CYS': 'C',
    'ASP': 'D',
    'GLU': 'E',
    'PHE': 'F',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LYS': 'K',
    'LEU': 'L',
    'MET': 'M',
    'ASN': 'N',
    'PRO': 'P',
    'GLN': 'Q',
    'ARG': 'R',
    'SER': 'S',
    'THR': 'T',
    'VAL': 'V',
    'TRP': 'W',
    'TYR': 'Y',
}
36
+
37
def read_pdb_io(pdb_file):
    """Load PDB text from an in-memory buffer, a path, or an upload handle.

    Args:
        pdb_file: One of

            * an ``io.StringIO`` already holding the PDB text,
            * a ``str`` or ``os.PathLike`` filesystem path, or
            * any other object that exposes the file path as a ``name``
              attribute (for example an open file or an upload wrapper).

    Returns:
        A ``(pdb_io, pdb_content)`` tuple: a fresh ``io.StringIO`` positioned
        at the start of the text, and the text itself.

    Raises:
        ValueError: If ``pdb_file`` is of an unsupported type, or the content
            is empty or whitespace-only.
        OSError: If the referenced file cannot be opened.
    """
    if isinstance(pdb_file, io.StringIO):
        pdb_content = pdb_file.getvalue()
    else:
        # Paths are resolved *before* the ``name`` fallback: a pathlib.Path
        # also has a ``name`` attribute, but it is only the final path
        # component, so opening it would target the wrong file.
        if isinstance(pdb_file, (str, os.PathLike)):
            path = pdb_file
        elif hasattr(pdb_file, 'name'):
            path = pdb_file.name
        else:
            raise ValueError("Unsupported file type")

        with open(path, 'r') as f:
            pdb_content = f.read()

    if not pdb_content.strip():
        raise ValueError("The PDB file is empty.")

    return io.StringIO(pdb_content), pdb_content
51
+
52
def get_protein(pdb_file) -> ESMProtein:
    """Build an ``ESMProtein`` from the standard amino-acid residues of a PDB file.

    The first model of the file is parsed with biotite. Residues whose name is
    not a key of ``amino3to1`` are skipped. All remaining residues are placed
    in a single ``ProteinChain`` with chain id ``"A"``; atoms whose name is not
    in ``RC.atom_order`` are ignored and their atom37 slots stay NaN / unmasked.

    Args:
        pdb_file: Anything accepted by ``read_pdb_io``.

    Returns:
        The protein created by ``ESMProtein.from_protein_chain``.

    Raises:
        ValueError: For any failure while reading or converting the file; the
            original exception is attached as ``__cause__``.
    """
    try:
        # read_pdb_io already rejects empty content, so no second check here.
        pdb_io, _ = read_pdb_io(pdb_file)

        # Ask for one model explicitly. Without ``model`` biotite returns an
        # AtomArrayStack, and iterating a residue slice of a stack yields
        # models instead of atoms, so only one atom per residue was stored.
        structure = pdb.PDBFile.read(pdb_io).get_structure(model=1)

        if structure.array_length() == 0:
            raise ValueError("The PDB file does not contain any valid atoms")

        # Each residue is an AtomArray whose annotations are per-atom arrays;
        # element 0 carries the residue-level value.
        valid_residues = [
            res for res in bs.residue_iter(structure)
            if res.res_name[0] in amino3to1
        ]
        if not valid_residues:
            raise ValueError("No valid amino acid residues found in the PDB file")

        sequence = ''.join(amino3to1[res.res_name[0]] for res in valid_residues)
        residue_indices = [res.res_id[0] for res in valid_residues]
        n_res = len(sequence)

        # Fill the atom37 arrays before handing them to ProteinChain.
        atom37_positions = np.full((n_res, 37, 3), np.nan)
        atom37_mask = np.zeros((n_res, 37), dtype=bool)
        for i, res in enumerate(valid_residues):
            for atom in res:
                if atom.atom_name in RC.atom_order:
                    idx = RC.atom_order[atom.atom_name]
                    atom37_positions[i, idx] = atom.coord
                    atom37_mask[i, idx] = True

        protein_chain = ProteinChain(
            id="test",
            sequence=sequence,
            chain_id="A",
            entity_id=None,
            residue_index=np.array(residue_indices, dtype=int),
            insertion_code=np.full(n_res, "", dtype="<U4"),
            atom37_positions=atom37_positions,
            atom37_mask=atom37_mask,
            confidence=np.ones(n_res, dtype=np.float32),
        )

        return ESMProtein.from_protein_chain(protein_chain)
    except Exception as e:
        print(f"Error processing PDB file: {str(e)}")
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Unable to process the PDB file: {str(e)}") from e
121
+
122
def add_noise_to_coordinates(protein: ESMProtein, noise_level: float) -> ESMProtein:
    """Return a copy of ``protein`` with Gaussian jitter added to its coordinates.

    Standard-normal samples shaped like ``protein.coordinates`` are scaled by
    ``noise_level`` and added to the coordinates. The result is a new
    ``ESMProtein`` built from the original sequence and the perturbed
    coordinates only.
    """
    jitter = torch.randn_like(protein.coordinates) * noise_level
    return ESMProtein(
        sequence=protein.sequence,
        coordinates=protein.coordinates + jitter,
    )
128
+
129
def prediction_visualization(pdb_file, num_runs: int, noise_level: float, num_frames: int):
    """Run noisy structure predictions and visualize the one closest to the input.

    For each of ``num_frames`` frames the input protein is perturbed once with
    ``add_noise_to_coordinates``; the perturbed protein is then predicted
    ``num_runs`` times and every prediction is aligned to the unperturbed
    input. The prediction with the lowest cRMSD is rendered next to the input.

    Args:
        pdb_file: Anything accepted by ``get_protein``.
        num_runs: Predictions per frame (at least 1).
        noise_level: Scale of the Gaussian coordinate noise.
        num_frames: Number of independently perturbed frames (at least 1).

    Returns:
        A ``(view, text)`` tuple: the viewer returned by
        ``visualize_after_pred`` and a ``"Best cRMSD: ..."`` summary string.

    Raises:
        ValueError: If ``num_runs`` or ``num_frames`` is smaller than 1, or if
            the PDB file cannot be processed.
    """
    # UI sliders may deliver whole numbers as floats; range() needs ints.
    num_runs = int(num_runs)
    num_frames = int(num_frames)
    if num_runs < 1 or num_frames < 1:
        raise ValueError("num_runs and num_frames must both be at least 1")

    protein = get_protein(pdb_file)
    runs = []

    for _ in range(num_frames):
        noisy_protein = add_noise_to_coordinates(protein, noise_level)

        for _ in range(num_runs):
            structure_prediction = run_structure_prediction(noisy_protein)
            aligned, crmsd = align_after_prediction(protein, structure_prediction)
            runs.append((crmsd, aligned))

    # Select on the cRMSD only. Sorting the raw tuples would fall back to
    # comparing the protein objects whenever two cRMSD values tie.
    best_crmsd, best_aligned = min(runs, key=lambda run: run[0])
    view = visualize_after_pred(protein, best_aligned)
    return view, f"Best cRMSD: {best_crmsd:.4f}"
144
+
145
def run_structure_prediction(protein: ESMProtein) -> ESMProtein:
    """Generate the structure track for ``protein`` with the module-level model.

    Uses a fixed configuration: 40 generation steps at temperature 0.7.
    """
    return model.generate(
        protein,
        GenerationConfig(track="structure", num_steps=40, temperature=0.7),
    )
153
+
154
def align_after_prediction(protein: ESMProtein, structure_prediction: ESMProtein) -> tuple[ESMProtein, float]:
    """Align a predicted structure onto the reference and report their RMSD.

    Both proteins are converted to protein chains. The prediction is aligned
    to the reference, and the RMSD between them is computed, over every
    residue position of the predicted sequence.

    Returns:
        ``(aligned_prediction, crmsd)``: the aligned chain wrapped back into an
        ``ESMProtein``, and the value returned by the chain's ``rmsd`` method.
    """
    predicted_chain = structure_prediction.to_protein_chain()
    reference_chain = protein.to_protein_chain()

    # The same index set is used for the mobile and the target chain.
    every_position = np.arange(0, len(predicted_chain.sequence))
    index_kwargs = {"mobile_inds": every_position, "target_inds": every_position}

    superposed_chain = predicted_chain.align(reference_chain, **index_kwargs)
    deviation = predicted_chain.rmsd(reference_chain, **index_kwargs)
    return ESMProtein.from_protein_chain(superposed_chain), deviation
161
+
162
def visualize_after_pred(protein: ESMProtein, aligned: ESMProtein):
    """Create an 800x600 py3Dmol view overlaying the input and the prediction.

    The input structure is added first and drawn as a light-grey cartoon; the
    aligned prediction is added second (model index 1) and drawn light green.
    """
    viewer = py3Dmol.view(width=800, height=600)

    # Reference structure: styled while it is the only model in the view.
    viewer.addModel(protein.to_pdb_string(), "pdb")
    viewer.setStyle({"cartoon": {"color": "lightgrey"}})

    # Prediction: restyle only the second model.
    viewer.addModel(aligned.to_pdb_string(), "pdb")
    viewer.setStyle({"model": 1}, {"cartoon": {"color": "lightgreen"}})

    viewer.zoomTo()
    return viewer
170
+
171
@spaces.GPU()
def run_prediction(pdb_file, num_runs, noise_level, num_frames):
    """Gradio callback: run the prediction pipeline and return HTML plus a summary.

    Args:
        pdb_file: The uploaded file value, or ``None`` when nothing was uploaded.
        num_runs: Predictions per frame.
        noise_level: Scale of the Gaussian coordinate noise.
        num_frames: Number of perturbed frames.

    Returns:
        A ``(html, text)`` tuple. On success ``html`` embeds the viewer markup
        and ``text`` is the cRMSD summary. On failure ``html`` is a red error
        panel with the message and stack trace, and ``text`` is
        ``"Error occurred"``. Exceptions are never propagated to the caller.
    """
    # Imported locally so the fix is self-contained within this function.
    from html import escape

    if pdb_file is None:
        return "Please upload a PDB file.", "No file uploaded"

    try:
        view, crmsd_text = prediction_visualization(pdb_file, num_runs, noise_level, num_frames)
        viewer_html = view._make_html()
        return f"""
        <div style="height: 600px;">
            {viewer_html}
        </div>
        """, crmsd_text
    except Exception as e:
        # Exception text and tracebacks can contain '<', '>' and '&'
        # (for example "<class ...>" reprs or content echoed from the
        # uploaded file), so escape them before embedding them in HTML.
        error_message = escape(str(e))
        stack_trace = escape(traceback.format_exc())
        return f"""
        <div style='color: red;'>
            <h3>Error:</h3>
            <p>{error_message}</p>
            <h4>Stack Trace:</h4>
            <pre>{stack_trace}</pre>
        </div>
        """, "Error occurred"
195
+
196
def create_demo():
    """Assemble and return the Gradio Blocks interface.

    Layout: a title, then one row with an input column (PDB upload, three
    sliders, run button) and a wider output column (HTML viewer, result text
    box), followed by usage notes. The run button is wired to
    ``run_prediction``.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Protein Structure Prediction and Visualization with Noise and MD Frames")

        with gr.Row():
            # Input controls.
            with gr.Column(scale=1):
                pdb_upload = gr.File(label="Upload PDB file")
                runs_slider = gr.Slider(label="Number of runs per frame", minimum=1, maximum=10, step=1, value=3)
                noise_slider = gr.Slider(label="Noise level", minimum=0, maximum=1, step=0.1, value=0.1)
                frames_slider = gr.Slider(label="Number of MD frames", minimum=1, maximum=10, step=1, value=1)
                run_button = gr.Button("Run Prediction")

            # Outputs.
            with gr.Column(scale=2):
                viewer_panel = gr.HTML(label="3D Visualization")
                result_box = gr.Textbox(label="Alignment Result")

        callback_inputs = [pdb_upload, runs_slider, noise_slider, frames_slider]
        callback_outputs = [viewer_panel, result_box]
        run_button.click(fn=run_prediction, inputs=callback_inputs, outputs=callback_outputs)

        gr.Markdown("""
        ## How to use
        1. Upload a PDB file using the file uploader.
        2. Adjust the number of prediction runs per frame using the slider.
        3. Set the noise level to add random perturbations to the structure.
        4. Choose the number of MD frames to simulate.
        5. Click the "Run Prediction" button to start the process.
        6. The 3D visualization will show the original structure (grey) and the best predicted structure (green).
        7. The alignment result will display the best cRMSD (lower is better).

        ## About
        This demo uses the ESM3 model to predict protein structures from PDB files.
        It runs multiple predictions with added noise and simulated MD frames, displaying the best result based on the lowest cRMSD.
        """)

    return demo
234
+
235
if __name__ == "__main__":
    # Build the interface, enable request queuing, then start the server.
    demo = create_demo()
    demo.queue().launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ torchvision
3
+ requests
4
+ py3Dmol
5
+ biopython
6
+ pandas
7
+ torch
8
+ numpy
9
+ esm
10
+ biotite