Simon Duerr committed on
Commit
c884d79
·
1 Parent(s): 738a3b3

add alphafold

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. alphafold/LICENSE +202 -0
  2. alphafold/alphafold/__init__.py +14 -0
  3. alphafold/alphafold/__pycache__/__init__.cpython-36.pyc +0 -0
  4. alphafold/alphafold/__pycache__/__init__.cpython-38.pyc +0 -0
  5. alphafold/alphafold/common/__init__.py +14 -0
  6. alphafold/alphafold/common/__pycache__/__init__.cpython-36.pyc +0 -0
  7. alphafold/alphafold/common/__pycache__/__init__.cpython-38.pyc +0 -0
  8. alphafold/alphafold/common/__pycache__/confidence.cpython-36.pyc +0 -0
  9. alphafold/alphafold/common/__pycache__/confidence.cpython-38.pyc +0 -0
  10. alphafold/alphafold/common/__pycache__/protein.cpython-36.pyc +0 -0
  11. alphafold/alphafold/common/__pycache__/protein.cpython-38.pyc +0 -0
  12. alphafold/alphafold/common/__pycache__/residue_constants.cpython-36.pyc +0 -0
  13. alphafold/alphafold/common/__pycache__/residue_constants.cpython-38.pyc +0 -0
  14. alphafold/alphafold/common/confidence.py +155 -0
  15. alphafold/alphafold/common/protein.py +229 -0
  16. alphafold/alphafold/common/protein_test.py +89 -0
  17. alphafold/alphafold/common/residue_constants.py +895 -0
  18. alphafold/alphafold/common/residue_constants_test.py +190 -0
  19. alphafold/alphafold/common/testdata/2rbg.pdb +0 -0
  20. alphafold/alphafold/data/__init__.py +14 -0
  21. alphafold/alphafold/data/__pycache__/__init__.cpython-36.pyc +0 -0
  22. alphafold/alphafold/data/__pycache__/__init__.cpython-38.pyc +0 -0
  23. alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-36.pyc +0 -0
  24. alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-38.pyc +0 -0
  25. alphafold/alphafold/data/__pycache__/parsers.cpython-36.pyc +0 -0
  26. alphafold/alphafold/data/__pycache__/parsers.cpython-38.pyc +0 -0
  27. alphafold/alphafold/data/__pycache__/pipeline.cpython-36.pyc +0 -0
  28. alphafold/alphafold/data/__pycache__/pipeline.cpython-38.pyc +0 -0
  29. alphafold/alphafold/data/__pycache__/templates.cpython-36.pyc +0 -0
  30. alphafold/alphafold/data/__pycache__/templates.cpython-38.pyc +0 -0
  31. alphafold/alphafold/data/mmcif_parsing.py +384 -0
  32. alphafold/alphafold/data/parsers.py +364 -0
  33. alphafold/alphafold/data/pipeline.py +209 -0
  34. alphafold/alphafold/data/templates.py +922 -0
  35. alphafold/alphafold/data/tools/__init__.py +14 -0
  36. alphafold/alphafold/data/tools/__pycache__/__init__.cpython-36.pyc +0 -0
  37. alphafold/alphafold/data/tools/__pycache__/__init__.cpython-38.pyc +0 -0
  38. alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-36.pyc +0 -0
  39. alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-38.pyc +0 -0
  40. alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-36.pyc +0 -0
  41. alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-38.pyc +0 -0
  42. alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-36.pyc +0 -0
  43. alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-38.pyc +0 -0
  44. alphafold/alphafold/data/tools/__pycache__/kalign.cpython-36.pyc +0 -0
  45. alphafold/alphafold/data/tools/__pycache__/kalign.cpython-38.pyc +0 -0
  46. alphafold/alphafold/data/tools/__pycache__/utils.cpython-36.pyc +0 -0
  47. alphafold/alphafold/data/tools/__pycache__/utils.cpython-38.pyc +0 -0
  48. alphafold/alphafold/data/tools/hhblits.py +155 -0
  49. alphafold/alphafold/data/tools/hhsearch.py +91 -0
  50. alphafold/alphafold/data/tools/hmmbuild.py +138 -0
alphafold/LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright [yyyy] [name of copyright owner]
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
alphafold/alphafold/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """An implementation of the inference pipeline of AlphaFold v2.0."""
alphafold/alphafold/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (215 Bytes). View file
 
alphafold/alphafold/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (223 Bytes). View file
 
alphafold/alphafold/common/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Common data types and constants used within Alphafold."""
alphafold/alphafold/common/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (214 Bytes). View file
 
alphafold/alphafold/common/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (222 Bytes). View file
 
alphafold/alphafold/common/__pycache__/confidence.cpython-36.pyc ADDED
Binary file (4.07 kB). View file
 
alphafold/alphafold/common/__pycache__/confidence.cpython-38.pyc ADDED
Binary file (4.07 kB). View file
 
alphafold/alphafold/common/__pycache__/protein.cpython-36.pyc ADDED
Binary file (5.41 kB). View file
 
alphafold/alphafold/common/__pycache__/protein.cpython-38.pyc ADDED
Binary file (5.47 kB). View file
 
alphafold/alphafold/common/__pycache__/residue_constants.cpython-36.pyc ADDED
Binary file (23.9 kB). View file
 
alphafold/alphafold/common/__pycache__/residue_constants.cpython-38.pyc ADDED
Binary file (20.2 kB). View file
 
alphafold/alphafold/common/confidence.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for processing confidence metrics."""
16
+
17
+ from typing import Dict, Optional, Tuple
18
+ import numpy as np
19
+ import scipy.special
20
+
21
+
22
def compute_plddt(logits: np.ndarray) -> np.ndarray:
  """Turns predicted-lDDT logits into a per-residue pLDDT score.

  The score is the expectation of the bin centres under the softmax
  distribution over bins, multiplied by 100.

  Args:
    logits: [num_res, num_bins] output from the PredictedLDDTHead.

  Returns:
    plddt: [num_res] per-residue pLDDT.
  """
  n_bins = logits.shape[-1]
  width = 1.0 / n_bins
  # The bins evenly partition [0, 1]; every centre sits half a bin width above
  # the lower edge of its bin.
  centers = np.arange(start=0.5 * width, stop=1.0, step=width)
  bin_probs = scipy.special.softmax(logits, axis=-1)
  expected_lddt = (bin_probs * centers[None, :]).sum(axis=-1)
  return expected_lddt * 100
37
+
38
+
39
def _calculate_bin_centers(breaks: np.ndarray):
  """Derives the centre of every error bin from the bin edges.

  Args:
    breaks: [num_bins - 1] the error bin edges.

  Returns:
    bin_centers: [num_bins] the error bin centers.
  """
  # Only the first gap is measured, i.e. the edges are assumed evenly spaced.
  width = breaks[1] - breaks[0]
  # An edge plus half a width is the centre of the bin starting at that edge.
  centers = breaks + width / 2
  # The final, catch-all bin is placed one full width past the last centre.
  overflow_center = centers[-1:] + width
  return np.concatenate([centers, overflow_center], axis=0)
56
+
57
+
58
def _calculate_expected_aligned_error(
    alignment_confidence_breaks: np.ndarray,
    aligned_distance_error_probs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
  """Computes the expected aligned distance error for each residue pair.

  Args:
    alignment_confidence_breaks: [num_bins - 1] the error bin edges.
    aligned_distance_error_probs: [num_res, num_res, num_bins] the predicted
      probs for each error bin, for each pair of residues.

  Returns:
    predicted_aligned_error: [num_res, num_res] the expected aligned distance
      error for each pair of residues.
    max_predicted_aligned_error: The maximum predicted error possible.
  """
  centers = _calculate_bin_centers(alignment_confidence_breaks)
  # Expectation over bins: weight every bin centre by its probability.
  expected_error = np.sum(aligned_distance_error_probs * centers, axis=-1)
  # The centre of the last (catch-all) bin is reported as the maximum error.
  max_error = np.asarray(centers[-1])
  return expected_error, max_error
78
+
79
+
80
def compute_predicted_aligned_error(
    logits: np.ndarray,
    breaks: np.ndarray) -> Dict[str, np.ndarray]:
  """Derives the aligned-error confidence metrics from raw logits.

  Args:
    logits: [num_res, num_res, num_bins] the logits output from
      PredictedAlignedErrorHead.
    breaks: [num_bins - 1] the error bin edges.

  Returns:
    A dictionary with the following entries:
      aligned_confidence_probs: [num_res, num_res, num_bins] the predicted
        aligned error probabilities over bins for each residue pair.
      predicted_aligned_error: [num_res, num_res] the expected aligned distance
        error for each pair of residues.
      max_predicted_aligned_error: The maximum predicted error possible.
  """
  # Normalise the logits over the bin axis to get per-pair probabilities.
  probs = scipy.special.softmax(logits, axis=-1)
  expected_error, max_error = _calculate_expected_aligned_error(
      alignment_confidence_breaks=breaks,
      aligned_distance_error_probs=probs)

  metrics = {}
  metrics['aligned_confidence_probs'] = probs
  metrics['predicted_aligned_error'] = expected_error
  metrics['max_predicted_aligned_error'] = max_error
  return metrics
109
+
110
+
111
def predicted_tm_score(
    logits: np.ndarray,
    breaks: np.ndarray,
    residue_weights: Optional[np.ndarray] = None) -> np.ndarray:
  """Computes the predicted TM alignment score.

  Args:
    logits: [num_res, num_res, num_bins] the logits output from
      PredictedAlignedErrorHead.
    breaks: [num_bins - 1] the error bin edges (one bin centre is appended for
      the catch-all bin, giving num_bins centres).
    residue_weights: [num_res] the per residue weights to use for the
      expectation. When None, every residue gets a weight of one.

  Returns:
    ptm_score: the predicted TM alignment score.
  """
  # The weights must lie in [0, 1] but may be fractional, e.g. the
  # probabilities from the experimentally-resolved head.
  if residue_weights is None:
    residue_weights = np.ones(logits.shape[0])

  centers = _calculate_bin_centers(breaks)

  # The effective length is floored at 19 so that d0 below can be neither
  # negative nor undefined.
  effective_len = max(np.sum(residue_weights), 19)

  # d_0(num_res) as defined by TM-score, eqn. (5) in
  # http://zhanglab.ccmb.med.umich.edu/papers/2004_3.pdf
  # Yang & Skolnick "Scoring function for automated
  # assessment of protein structure template quality" 2004
  d0 = 1.24 * (effective_len - 15) ** (1./3) - 1.8

  pair_probs = scipy.special.softmax(logits, axis=-1)

  # TM-score contribution of an error equal to each bin centre.
  tm_of_bin = 1. / (1 + np.square(centers) / np.square(d0))
  # Expected TM term per residue pair (expectation over the error bins).
  expected_tm = np.sum(pair_probs * tm_of_bin, axis=-1)

  weight_share = residue_weights / (1e-8 + residue_weights.sum())
  per_alignment = np.sum(expected_tm * weight_share, axis=-1)
  # Report the score of the alignment with the highest weighted value.
  best = (per_alignment * residue_weights).argmax()
  return np.asarray(per_alignment[best])
alphafold/alphafold/common/protein.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Protein data type."""
16
+ import dataclasses
17
+ import io
18
+ from typing import Any, Mapping, Optional
19
+ from alphafold.common import residue_constants
20
+ from Bio.PDB import PDBParser
21
+ import numpy as np
22
+
23
+ FeatureDict = Mapping[str, np.ndarray]
24
+ ModelOutput = Mapping[str, Any] # Is a nested dict.
25
+
26
+
27
@dataclasses.dataclass(frozen=True)
class Protein:
  """Immutable bundle of the arrays that describe one protein structure."""

  # Cartesian atom coordinates in angstroms. The atom-type axis follows the
  # ordering of residue_constants.atom_types.
  atom_positions: np.ndarray  # [num_res, num_atom_type, 3]

  # Integer amino-acid type of every residue, in the range 0..20, with 20
  # standing for 'X'.
  aatype: np.ndarray  # [num_res]

  # Float mask flagging which atoms are present: 1.0 for a present atom and
  # 0.0 otherwise. Intended for loss masking.
  atom_mask: np.ndarray  # [num_res, num_atom_type]

  # Residue numbering as used in the PDB; it may have gaps and need not start
  # at zero.
  residue_index: np.ndarray  # [num_res]

  # B-factors (temperature factors) in square angstroms, stored per atom,
  # describing the displacement from the ground truth mean position.
  b_factors: np.ndarray  # [num_res, num_atom_type]
50
+
51
+
52
def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
  """Parses the text of a PDB file into a `Protein`.

  WARNING: All non-standard residue types will be converted into UNK. All
    non-standard atoms will be ignored.

  Args:
    pdb_str: The contents of the pdb file
    chain_id: If None, then the pdb file must contain a single chain (which
      will be parsed). If chain_id is specified (e.g. A), then only that chain
      is parsed.

  Returns:
    A new `Protein` parsed from the pdb contents.

  Raises:
    ValueError: If the file does not hold exactly one model, if chain_id is
      None and the model does not hold exactly one chain, or if a residue of
      the parsed chain carries an insertion code.
  """
  structure = PDBParser(QUIET=True).get_structure('none', io.StringIO(pdb_str))
  models = list(structure.get_models())
  if len(models) != 1:
    raise ValueError(
        f'Only single model PDBs are supported. Found {len(models)} models.')
  model = models[0]

  if chain_id is None:
    chains = list(model.get_chains())
    if len(chains) != 1:
      raise ValueError(
          'Only single chain PDBs are supported when chain_id not specified. '
          f'Found {len(chains)} chains.')
    chain = chains[0]
  else:
    chain = model[chain_id]

  num_atom_types = residue_constants.atom_type_num
  all_positions = []
  all_aatypes = []
  all_masks = []
  all_residue_numbers = []
  all_b_factors = []

  for res in chain:
    # res.id[2] is the insertion code; a single space means "none".
    if res.id[2] != ' ':
      raise ValueError(
          f'PDB contains an insertion code at chain {chain.id} and residue '
          f'index {res.id[1]}. These are not supported.')

    # Unknown three-letter names fall back to 'X', which in turn maps to the
    # index just past the standard residue types.
    one_letter = residue_constants.restype_3to1.get(res.resname, 'X')
    type_idx = residue_constants.restype_order.get(
        one_letter, residue_constants.restype_num)

    coords = np.zeros((num_atom_types, 3))
    present = np.zeros((num_atom_types,))
    temperature = np.zeros((num_atom_types,))
    for atom in res:
      if atom.name in residue_constants.atom_types:
        slot = residue_constants.atom_order[atom.name]
        coords[slot] = atom.coord
        present[slot] = 1.
        temperature[slot] = atom.bfactor

    # Residues for which no known atom was reported are dropped entirely.
    if np.sum(present) < 0.5:
      continue

    all_aatypes.append(type_idx)
    all_positions.append(coords)
    all_masks.append(present)
    all_residue_numbers.append(res.id[1])
    all_b_factors.append(temperature)

  return Protein(
      atom_positions=np.array(all_positions),
      atom_mask=np.array(all_masks),
      aatype=np.array(all_aatypes),
      residue_index=np.array(all_residue_numbers),
      b_factors=np.array(all_b_factors))
125
+
126
+
127
def to_pdb(prot: Protein) -> str:
  """Converts a `Protein` instance to a PDB string.

  The output holds one model containing one chain 'A'. Atoms whose mask value
  is below 0.5 are left out. PDB records are fixed-width, so the blank column
  ranges are produced with explicit widths instead of literal runs of spaces;
  this keeps every field in the column range the format prescribes.

  Args:
    prot: The protein to convert to PDB.

  Returns:
    PDB string.

  Raises:
    ValueError: If any value in `prot.aatype` is greater than
      `residue_constants.restype_num`.
  """
  restypes = residue_constants.restypes + ['X']
  atom_types = residue_constants.atom_types

  def res_1to3(restype_idx):
    """Maps an integer residue type to its 3-letter name ('UNK' if unknown)."""
    return residue_constants.restype_1to3.get(restypes[restype_idx], 'UNK')

  atom_mask = prot.atom_mask
  aatype = prot.aatype
  atom_positions = prot.atom_positions
  residue_index = prot.residue_index.astype(np.int32)
  b_factors = prot.b_factors

  if np.any(aatype > residue_constants.restype_num):
    raise ValueError('Invalid aatypes.')

  # Blank column ranges of the fixed-width records (1-based PDB columns).
  model_gap = ' ' * 4          # MODEL: columns 7-10.
  coord_gap = ' ' * 3          # ATOM: columns 28-30, before the x coordinate.
  element_gap = ' ' * 10       # ATOM: columns 67-76, before the element.
  ter_gap = ' ' * 6            # TER: columns 12-17, before the residue name.

  pdb_lines = []
  # MODEL record: the serial number is right-justified in columns 11-14.
  model_record = 'MODEL'
  model_serial = 1
  pdb_lines.append(f'{model_record:<6}{model_gap}{model_serial:>4}')

  atom_index = 1
  chain_id = 'A'
  # Add all atom sites.
  for i in range(aatype.shape[0]):
    res_name_3 = res_1to3(aatype[i])
    for atom_name, pos, mask, b_factor in zip(
        atom_types, atom_positions[i], atom_mask[i], b_factors[i]):
      if mask < 0.5:
        continue

      record_type = 'ATOM'
      # Names shorter than four characters start in the second name column.
      name = atom_name if len(atom_name) == 4 else f' {atom_name}'
      alt_loc = ''
      insertion_code = ''
      occupancy = 1.00
      element = atom_name[0]  # Protein supports only C, N, O, S, this works.
      charge = ''
      # PDB is a columnar format, every space matters here!
      # Columns: 1-6 record, 7-11 serial, 13-16 name, 17 altLoc, 18-20 resName,
      # 22 chain, 23-26 resSeq, 27 iCode, 31-54 x/y/z, 55-60 occupancy,
      # 61-66 B-factor, 77-78 element, 79-80 charge.
      atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
                   f'{res_name_3:>3} {chain_id:>1}'
                   f'{residue_index[i]:>4}{insertion_code:>1}{coord_gap}'
                   f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
                   f'{occupancy:>6.2f}{b_factor:>6.2f}{element_gap}'
                   f'{element:>2}{charge:>2}')
      pdb_lines.append(atom_line)
      atom_index += 1

  # Close the chain.
  # TER record: 1-6 record, 7-11 serial, 18-20 resName, 22 chain, 23-26 resSeq.
  chain_end = 'TER'
  chain_termination_line = (
      f'{chain_end:<6}{atom_index:>5}{ter_gap}{res_1to3(aatype[-1]):>3} '
      f'{chain_id:>1}{residue_index[-1]:>4}')
  pdb_lines.append(chain_termination_line)
  pdb_lines.append('ENDMDL')

  pdb_lines.append('END')
  # The trailing empty entry makes the joined string end with a newline.
  pdb_lines.append('')
  return '\n'.join(pdb_lines)
190
+
191
+
192
def ideal_atom_mask(prot: Protein) -> np.ndarray:
  """Looks up the ideal atom mask for a protein's sequence.

  `Protein.atom_mask` typically is defined according to the atoms that are
  reported in the PDB. This function instead returns a mask according to the
  heavy atoms that should be present in the given sequence of amino acids.

  Args:
    prot: `Protein` whose fields are `numpy.ndarray` objects.

  Returns:
    An ideal atom mask.
  """
  # One mask row per residue type; indexing with aatype selects a row for
  # every residue of the protein.
  standard_masks = residue_constants.STANDARD_ATOM_MASK
  return standard_masks[prot.aatype]
206
+
207
+
208
def from_prediction(features: FeatureDict, result: ModelOutput,
                    b_factors: Optional[np.ndarray] = None) -> Protein:
  """Builds a `Protein` out of model inputs and model outputs.

  Args:
    features: Dictionary holding model inputs.
    result: Dictionary holding model outputs.
    b_factors: (Optional) B-factors to use for the protein. When None, zeros
      shaped like the final atom mask are used.

  Returns:
    A protein instance.
  """
  module_out = result['structure_module']
  if b_factors is None:
    b_factors = np.zeros_like(module_out['final_atom_mask'])

  # Element 0 along the leading axis of the input features is used --
  # presumably a batch/ensemble dimension; TODO confirm against the caller.
  residue_types = features['aatype'][0]
  positions = module_out['final_atom_positions']
  mask = module_out['final_atom_mask']
  # Shifted by one -- presumably from 0-based feature indices to 1-based PDB
  # numbering; TODO confirm.
  numbering = features['residue_index'][0] + 1

  return Protein(
      aatype=residue_types,
      atom_positions=positions,
      atom_mask=mask,
      residue_index=numbering,
      b_factors=b_factors)
alphafold/alphafold/common/protein_test.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Tests for protein."""
16
+
17
+ import os
18
+
19
+ from absl.testing import absltest
20
+ from absl.testing import parameterized
21
+ from alphafold.common import protein
22
+ from alphafold.common import residue_constants
23
+ import numpy as np
24
+ # Internal import (7716).
25
+
26
+ TEST_DATA_DIR = 'alphafold/common/testdata/'
27
+
28
+
29
class ProteinTest(parameterized.TestCase):
  """Tests for parsing, writing and masking proteins with the 2rbg fixture."""

  def _read_testdata(self, filename):
    """Returns the text content of `filename` from the test data directory."""
    path = os.path.join(absltest.get_default_test_srcdir(), TEST_DATA_DIR,
                        filename)
    with open(path) as f:
      return f.read()

  def _check_shapes(self, prot, num_res):
    """Check that the processed shapes are correct."""
    num_atoms = residue_constants.atom_type_num
    expected_shapes = (
        ('atom_positions', (num_res, num_atoms, 3)),
        ('aatype', (num_res,)),
        ('atom_mask', (num_res, num_atoms)),
        ('residue_index', (num_res,)),
        ('b_factors', (num_res, num_atoms)),
    )
    for field_name, shape in expected_shapes:
      self.assertEqual(shape, getattr(prot, field_name).shape)

  @parameterized.parameters(('2rbg.pdb', 'A', 282),
                            ('2rbg.pdb', 'B', 282))
  def test_from_pdb_str(self, pdb_file, chain_id, num_res):
    prot = protein.from_pdb_string(self._read_testdata(pdb_file), chain_id)
    self._check_shapes(prot, num_res)
    self.assertGreaterEqual(prot.aatype.min(), 0)
    # Allow equal since unknown restypes have index equal to restype_num.
    self.assertLessEqual(prot.aatype.max(), residue_constants.restype_num)

  def test_to_pdb(self):
    # Parse -> write -> parse again; every field must survive the round trip.
    prot = protein.from_pdb_string(self._read_testdata('2rbg.pdb'),
                                   chain_id='A')
    prot_reconstr = protein.from_pdb_string(protein.to_pdb(prot))

    exact = np.testing.assert_array_equal
    approx = np.testing.assert_array_almost_equal
    comparisons = (
        (exact, 'aatype'),
        (approx, 'atom_positions'),
        (approx, 'atom_mask'),
        (exact, 'residue_index'),
        (approx, 'b_factors'),
    )
    for compare, field_name in comparisons:
      compare(getattr(prot_reconstr, field_name), getattr(prot, field_name))

  def test_ideal_atom_mask(self):
    prot = protein.from_pdb_string(self._read_testdata('2rbg.pdb'),
                                   chain_id='A')
    ideal_mask = protein.ideal_atom_mask(prot)
    # Residue numbers whose observed mask is expected to differ from the
    # ideal mask in this fixture.
    non_ideal_residues = {102, *range(127, 285)}
    for res, observed, ideal in zip(
        prot.residue_index, prot.atom_mask, ideal_mask):
      masks_agree = np.all(observed == ideal)
      if res in non_ideal_residues:
        self.assertFalse(masks_agree, msg=f'{res}')
      else:
        self.assertTrue(masks_agree, msg=f'{res}')
86
+
87
+
88
+ if __name__ == '__main__':
89
+ absltest.main()
alphafold/alphafold/common/residue_constants.py ADDED
@@ -0,0 +1,895 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Constants used in AlphaFold."""
16
+
17
+ import collections
18
+ import functools
19
+ from typing import List, Mapping, Tuple
20
+
21
+ import numpy as np
22
+ import tree
23
+
24
+ # Internal import (35fd).
25
+
26
+
27
+ # Distance from one CA to next CA [trans configuration: omega = 180].
28
+ ca_ca = 3.80209737096
29
+
30
+ # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
31
+ # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
32
+ # chi angles so their chi angle lists are empty.
33
+ chi_angles_atoms = {
34
+ 'ALA': [],
35
+ # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
36
+ 'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
37
+ ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']],
38
+ 'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
39
+ 'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
40
+ 'CYS': [['N', 'CA', 'CB', 'SG']],
41
+ 'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
42
+ ['CB', 'CG', 'CD', 'OE1']],
43
+ 'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
44
+ ['CB', 'CG', 'CD', 'OE1']],
45
+ 'GLY': [],
46
+ 'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']],
47
+ 'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']],
48
+ 'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
49
+ 'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
50
+ ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']],
51
+ 'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'],
52
+ ['CB', 'CG', 'SD', 'CE']],
53
+ 'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
54
+ 'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']],
55
+ 'SER': [['N', 'CA', 'CB', 'OG']],
56
+ 'THR': [['N', 'CA', 'CB', 'OG1']],
57
+ 'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
58
+ 'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
59
+ 'VAL': [['N', 'CA', 'CB', 'CG1']],
60
+ }
61
+
62
+ # If chi angles are given in a fixed-length array, this matrix determines how
62
+ # to mask them for each AA type. The order is as per restype_order (see below).
64
+ chi_angles_mask = [
65
+ [0.0, 0.0, 0.0, 0.0], # ALA
66
+ [1.0, 1.0, 1.0, 1.0], # ARG
67
+ [1.0, 1.0, 0.0, 0.0], # ASN
68
+ [1.0, 1.0, 0.0, 0.0], # ASP
69
+ [1.0, 0.0, 0.0, 0.0], # CYS
70
+ [1.0, 1.0, 1.0, 0.0], # GLN
71
+ [1.0, 1.0, 1.0, 0.0], # GLU
72
+ [0.0, 0.0, 0.0, 0.0], # GLY
73
+ [1.0, 1.0, 0.0, 0.0], # HIS
74
+ [1.0, 1.0, 0.0, 0.0], # ILE
75
+ [1.0, 1.0, 0.0, 0.0], # LEU
76
+ [1.0, 1.0, 1.0, 1.0], # LYS
77
+ [1.0, 1.0, 1.0, 0.0], # MET
78
+ [1.0, 1.0, 0.0, 0.0], # PHE
79
+ [1.0, 1.0, 0.0, 0.0], # PRO
80
+ [1.0, 0.0, 0.0, 0.0], # SER
81
+ [1.0, 0.0, 0.0, 0.0], # THR
82
+ [1.0, 1.0, 0.0, 0.0], # TRP
83
+ [1.0, 1.0, 0.0, 0.0], # TYR
84
+ [1.0, 0.0, 0.0, 0.0], # VAL
85
+ ]
86
+
87
+ # The following chi angles are pi periodic: they can be rotated by a multiple
88
+ # of pi without affecting the structure.
89
+ chi_pi_periodic = [
90
+ [0.0, 0.0, 0.0, 0.0], # ALA
91
+ [0.0, 0.0, 0.0, 0.0], # ARG
92
+ [0.0, 0.0, 0.0, 0.0], # ASN
93
+ [0.0, 1.0, 0.0, 0.0], # ASP
94
+ [0.0, 0.0, 0.0, 0.0], # CYS
95
+ [0.0, 0.0, 0.0, 0.0], # GLN
96
+ [0.0, 0.0, 1.0, 0.0], # GLU
97
+ [0.0, 0.0, 0.0, 0.0], # GLY
98
+ [0.0, 0.0, 0.0, 0.0], # HIS
99
+ [0.0, 0.0, 0.0, 0.0], # ILE
100
+ [0.0, 0.0, 0.0, 0.0], # LEU
101
+ [0.0, 0.0, 0.0, 0.0], # LYS
102
+ [0.0, 0.0, 0.0, 0.0], # MET
103
+ [0.0, 1.0, 0.0, 0.0], # PHE
104
+ [0.0, 0.0, 0.0, 0.0], # PRO
105
+ [0.0, 0.0, 0.0, 0.0], # SER
106
+ [0.0, 0.0, 0.0, 0.0], # THR
107
+ [0.0, 0.0, 0.0, 0.0], # TRP
108
+ [0.0, 1.0, 0.0, 0.0], # TYR
109
+ [0.0, 0.0, 0.0, 0.0], # VAL
110
+ [0.0, 0.0, 0.0, 0.0], # UNK
111
+ ]
112
+
113
+ # Atom positions relative to the 8 rigid groups, defined by the pre-omega, phi,
114
+ # psi and chi angles:
115
+ # 0: 'backbone group',
116
+ # 1: 'pre-omega-group', (empty)
117
+ # 2: 'phi-group', (currently empty, because it defines only hydrogens)
118
+ # 3: 'psi-group',
119
+ # 4,5,6,7: 'chi1,2,3,4-group'
120
+ # The atom positions are relative to the axis-end-atom of the corresponding
121
+ # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
122
+ # is defined such that the dihedral-angle-defining atom (the last entry in
123
+ # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
124
+ # format: [atomname, group_idx, rel_position]
125
+ rigid_group_atom_positions = {
126
+ 'ALA': [
127
+ ['N', 0, (-0.525, 1.363, 0.000)],
128
+ ['CA', 0, (0.000, 0.000, 0.000)],
129
+ ['C', 0, (1.526, -0.000, -0.000)],
130
+ ['CB', 0, (-0.529, -0.774, -1.205)],
131
+ ['O', 3, (0.627, 1.062, 0.000)],
132
+ ],
133
+ 'ARG': [
134
+ ['N', 0, (-0.524, 1.362, -0.000)],
135
+ ['CA', 0, (0.000, 0.000, 0.000)],
136
+ ['C', 0, (1.525, -0.000, -0.000)],
137
+ ['CB', 0, (-0.524, -0.778, -1.209)],
138
+ ['O', 3, (0.626, 1.062, 0.000)],
139
+ ['CG', 4, (0.616, 1.390, -0.000)],
140
+ ['CD', 5, (0.564, 1.414, 0.000)],
141
+ ['NE', 6, (0.539, 1.357, -0.000)],
142
+ ['NH1', 7, (0.206, 2.301, 0.000)],
143
+ ['NH2', 7, (2.078, 0.978, -0.000)],
144
+ ['CZ', 7, (0.758, 1.093, -0.000)],
145
+ ],
146
+ 'ASN': [
147
+ ['N', 0, (-0.536, 1.357, 0.000)],
148
+ ['CA', 0, (0.000, 0.000, 0.000)],
149
+ ['C', 0, (1.526, -0.000, -0.000)],
150
+ ['CB', 0, (-0.531, -0.787, -1.200)],
151
+ ['O', 3, (0.625, 1.062, 0.000)],
152
+ ['CG', 4, (0.584, 1.399, 0.000)],
153
+ ['ND2', 5, (0.593, -1.188, 0.001)],
154
+ ['OD1', 5, (0.633, 1.059, 0.000)],
155
+ ],
156
+ 'ASP': [
157
+ ['N', 0, (-0.525, 1.362, -0.000)],
158
+ ['CA', 0, (0.000, 0.000, 0.000)],
159
+ ['C', 0, (1.527, 0.000, -0.000)],
160
+ ['CB', 0, (-0.526, -0.778, -1.208)],
161
+ ['O', 3, (0.626, 1.062, -0.000)],
162
+ ['CG', 4, (0.593, 1.398, -0.000)],
163
+ ['OD1', 5, (0.610, 1.091, 0.000)],
164
+ ['OD2', 5, (0.592, -1.101, -0.003)],
165
+ ],
166
+ 'CYS': [
167
+ ['N', 0, (-0.522, 1.362, -0.000)],
168
+ ['CA', 0, (0.000, 0.000, 0.000)],
169
+ ['C', 0, (1.524, 0.000, 0.000)],
170
+ ['CB', 0, (-0.519, -0.773, -1.212)],
171
+ ['O', 3, (0.625, 1.062, -0.000)],
172
+ ['SG', 4, (0.728, 1.653, 0.000)],
173
+ ],
174
+ 'GLN': [
175
+ ['N', 0, (-0.526, 1.361, -0.000)],
176
+ ['CA', 0, (0.000, 0.000, 0.000)],
177
+ ['C', 0, (1.526, 0.000, 0.000)],
178
+ ['CB', 0, (-0.525, -0.779, -1.207)],
179
+ ['O', 3, (0.626, 1.062, -0.000)],
180
+ ['CG', 4, (0.615, 1.393, 0.000)],
181
+ ['CD', 5, (0.587, 1.399, -0.000)],
182
+ ['NE2', 6, (0.593, -1.189, -0.001)],
183
+ ['OE1', 6, (0.634, 1.060, 0.000)],
184
+ ],
185
+ 'GLU': [
186
+ ['N', 0, (-0.528, 1.361, 0.000)],
187
+ ['CA', 0, (0.000, 0.000, 0.000)],
188
+ ['C', 0, (1.526, -0.000, -0.000)],
189
+ ['CB', 0, (-0.526, -0.781, -1.207)],
190
+ ['O', 3, (0.626, 1.062, 0.000)],
191
+ ['CG', 4, (0.615, 1.392, 0.000)],
192
+ ['CD', 5, (0.600, 1.397, 0.000)],
193
+ ['OE1', 6, (0.607, 1.095, -0.000)],
194
+ ['OE2', 6, (0.589, -1.104, -0.001)],
195
+ ],
196
+ 'GLY': [
197
+ ['N', 0, (-0.572, 1.337, 0.000)],
198
+ ['CA', 0, (0.000, 0.000, 0.000)],
199
+ ['C', 0, (1.517, -0.000, -0.000)],
200
+ ['O', 3, (0.626, 1.062, -0.000)],
201
+ ],
202
+ 'HIS': [
203
+ ['N', 0, (-0.527, 1.360, 0.000)],
204
+ ['CA', 0, (0.000, 0.000, 0.000)],
205
+ ['C', 0, (1.525, 0.000, 0.000)],
206
+ ['CB', 0, (-0.525, -0.778, -1.208)],
207
+ ['O', 3, (0.625, 1.063, 0.000)],
208
+ ['CG', 4, (0.600, 1.370, -0.000)],
209
+ ['CD2', 5, (0.889, -1.021, 0.003)],
210
+ ['ND1', 5, (0.744, 1.160, -0.000)],
211
+ ['CE1', 5, (2.030, 0.851, 0.002)],
212
+ ['NE2', 5, (2.145, -0.466, 0.004)],
213
+ ],
214
+ 'ILE': [
215
+ ['N', 0, (-0.493, 1.373, -0.000)],
216
+ ['CA', 0, (0.000, 0.000, 0.000)],
217
+ ['C', 0, (1.527, -0.000, -0.000)],
218
+ ['CB', 0, (-0.536, -0.793, -1.213)],
219
+ ['O', 3, (0.627, 1.062, -0.000)],
220
+ ['CG1', 4, (0.534, 1.437, -0.000)],
221
+ ['CG2', 4, (0.540, -0.785, -1.199)],
222
+ ['CD1', 5, (0.619, 1.391, 0.000)],
223
+ ],
224
+ 'LEU': [
225
+ ['N', 0, (-0.520, 1.363, 0.000)],
226
+ ['CA', 0, (0.000, 0.000, 0.000)],
227
+ ['C', 0, (1.525, -0.000, -0.000)],
228
+ ['CB', 0, (-0.522, -0.773, -1.214)],
229
+ ['O', 3, (0.625, 1.063, -0.000)],
230
+ ['CG', 4, (0.678, 1.371, 0.000)],
231
+ ['CD1', 5, (0.530, 1.430, -0.000)],
232
+ ['CD2', 5, (0.535, -0.774, 1.200)],
233
+ ],
234
+ 'LYS': [
235
+ ['N', 0, (-0.526, 1.362, -0.000)],
236
+ ['CA', 0, (0.000, 0.000, 0.000)],
237
+ ['C', 0, (1.526, 0.000, 0.000)],
238
+ ['CB', 0, (-0.524, -0.778, -1.208)],
239
+ ['O', 3, (0.626, 1.062, -0.000)],
240
+ ['CG', 4, (0.619, 1.390, 0.000)],
241
+ ['CD', 5, (0.559, 1.417, 0.000)],
242
+ ['CE', 6, (0.560, 1.416, 0.000)],
243
+ ['NZ', 7, (0.554, 1.387, 0.000)],
244
+ ],
245
+ 'MET': [
246
+ ['N', 0, (-0.521, 1.364, -0.000)],
247
+ ['CA', 0, (0.000, 0.000, 0.000)],
248
+ ['C', 0, (1.525, 0.000, 0.000)],
249
+ ['CB', 0, (-0.523, -0.776, -1.210)],
250
+ ['O', 3, (0.625, 1.062, -0.000)],
251
+ ['CG', 4, (0.613, 1.391, -0.000)],
252
+ ['SD', 5, (0.703, 1.695, 0.000)],
253
+ ['CE', 6, (0.320, 1.786, -0.000)],
254
+ ],
255
+ 'PHE': [
256
+ ['N', 0, (-0.518, 1.363, 0.000)],
257
+ ['CA', 0, (0.000, 0.000, 0.000)],
258
+ ['C', 0, (1.524, 0.000, -0.000)],
259
+ ['CB', 0, (-0.525, -0.776, -1.212)],
260
+ ['O', 3, (0.626, 1.062, -0.000)],
261
+ ['CG', 4, (0.607, 1.377, 0.000)],
262
+ ['CD1', 5, (0.709, 1.195, -0.000)],
263
+ ['CD2', 5, (0.706, -1.196, 0.000)],
264
+ ['CE1', 5, (2.102, 1.198, -0.000)],
265
+ ['CE2', 5, (2.098, -1.201, -0.000)],
266
+ ['CZ', 5, (2.794, -0.003, -0.001)],
267
+ ],
268
+ 'PRO': [
269
+ ['N', 0, (-0.566, 1.351, -0.000)],
270
+ ['CA', 0, (0.000, 0.000, 0.000)],
271
+ ['C', 0, (1.527, -0.000, 0.000)],
272
+ ['CB', 0, (-0.546, -0.611, -1.293)],
273
+ ['O', 3, (0.621, 1.066, 0.000)],
274
+ ['CG', 4, (0.382, 1.445, 0.0)],
275
+ # ['CD', 5, (0.427, 1.440, 0.0)],
276
+ ['CD', 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger
277
+ ],
278
+ 'SER': [
279
+ ['N', 0, (-0.529, 1.360, -0.000)],
280
+ ['CA', 0, (0.000, 0.000, 0.000)],
281
+ ['C', 0, (1.525, -0.000, -0.000)],
282
+ ['CB', 0, (-0.518, -0.777, -1.211)],
283
+ ['O', 3, (0.626, 1.062, -0.000)],
284
+ ['OG', 4, (0.503, 1.325, 0.000)],
285
+ ],
286
+ 'THR': [
287
+ ['N', 0, (-0.517, 1.364, 0.000)],
288
+ ['CA', 0, (0.000, 0.000, 0.000)],
289
+ ['C', 0, (1.526, 0.000, -0.000)],
290
+ ['CB', 0, (-0.516, -0.793, -1.215)],
291
+ ['O', 3, (0.626, 1.062, 0.000)],
292
+ ['CG2', 4, (0.550, -0.718, -1.228)],
293
+ ['OG1', 4, (0.472, 1.353, 0.000)],
294
+ ],
295
+ 'TRP': [
296
+ ['N', 0, (-0.521, 1.363, 0.000)],
297
+ ['CA', 0, (0.000, 0.000, 0.000)],
298
+ ['C', 0, (1.525, -0.000, 0.000)],
299
+ ['CB', 0, (-0.523, -0.776, -1.212)],
300
+ ['O', 3, (0.627, 1.062, 0.000)],
301
+ ['CG', 4, (0.609, 1.370, -0.000)],
302
+ ['CD1', 5, (0.824, 1.091, 0.000)],
303
+ ['CD2', 5, (0.854, -1.148, -0.005)],
304
+ ['CE2', 5, (2.186, -0.678, -0.007)],
305
+ ['CE3', 5, (0.622, -2.530, -0.007)],
306
+ ['NE1', 5, (2.140, 0.690, -0.004)],
307
+ ['CH2', 5, (3.028, -2.890, -0.013)],
308
+ ['CZ2', 5, (3.283, -1.543, -0.011)],
309
+ ['CZ3', 5, (1.715, -3.389, -0.011)],
310
+ ],
311
+ 'TYR': [
312
+ ['N', 0, (-0.522, 1.362, 0.000)],
313
+ ['CA', 0, (0.000, 0.000, 0.000)],
314
+ ['C', 0, (1.524, -0.000, -0.000)],
315
+ ['CB', 0, (-0.522, -0.776, -1.213)],
316
+ ['O', 3, (0.627, 1.062, -0.000)],
317
+ ['CG', 4, (0.607, 1.382, -0.000)],
318
+ ['CD1', 5, (0.716, 1.195, -0.000)],
319
+ ['CD2', 5, (0.713, -1.194, -0.001)],
320
+ ['CE1', 5, (2.107, 1.200, -0.002)],
321
+ ['CE2', 5, (2.104, -1.201, -0.003)],
322
+ ['OH', 5, (4.168, -0.002, -0.005)],
323
+ ['CZ', 5, (2.791, -0.001, -0.003)],
324
+ ],
325
+ 'VAL': [
326
+ ['N', 0, (-0.494, 1.373, -0.000)],
327
+ ['CA', 0, (0.000, 0.000, 0.000)],
328
+ ['C', 0, (1.527, -0.000, -0.000)],
329
+ ['CB', 0, (-0.533, -0.795, -1.213)],
330
+ ['O', 3, (0.627, 1.062, -0.000)],
331
+ ['CG1', 4, (0.540, 1.429, -0.000)],
332
+ ['CG2', 4, (0.533, -0.776, 1.203)],
333
+ ],
334
+ }
335
+
336
+ # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
337
+ residue_atoms = {
338
+ 'ALA': ['C', 'CA', 'CB', 'N', 'O'],
339
+ 'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'],
340
+ 'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'],
341
+ 'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'],
342
+ 'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'],
343
+ 'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'],
344
+ 'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'],
345
+ 'GLY': ['C', 'CA', 'N', 'O'],
346
+ 'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'],
347
+ 'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'],
348
+ 'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'],
349
+ 'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'],
350
+ 'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'],
351
+ 'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'],
352
+ 'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'],
353
+ 'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'],
354
+ 'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'],
355
+ 'TRP': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3',
356
+ 'CH2', 'N', 'NE1', 'O'],
357
+ 'TYR': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O',
358
+ 'OH'],
359
+ 'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O']
360
+ }
361
+
362
+ # Naming swaps for ambiguous atom names.
363
+ # Due to symmetries in the amino acids the naming of atoms is ambiguous in
364
+ # 4 of the 20 amino acids.
365
+ # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
366
+ # in LEU, VAL and ARG can be resolved by using the 3d constellations of
367
+ # the 'ambiguous' atoms and their neighbours)
368
+ residue_atom_renaming_swaps = {
369
+ 'ASP': {'OD1': 'OD2'},
370
+ 'GLU': {'OE1': 'OE2'},
371
+ 'PHE': {'CD1': 'CD2', 'CE1': 'CE2'},
372
+ 'TYR': {'CD1': 'CD2', 'CE1': 'CE2'},
373
+ }
374
+
375
+ # Van der Waals radii [Angstroem] of the atoms (from Wikipedia)
376
+ van_der_waals_radius = {
377
+ 'C': 1.7,
378
+ 'N': 1.55,
379
+ 'O': 1.52,
380
+ 'S': 1.8,
381
+ }
382
+
383
+ Bond = collections.namedtuple(
384
+ 'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev'])
385
+ BondAngle = collections.namedtuple(
386
+ 'BondAngle',
387
+ ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev'])
388
+
389
+
390
@functools.lru_cache(maxsize=None)
def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]],
                                          Mapping[str, List[Bond]],
                                          Mapping[str, List[BondAngle]]]:
  """Load stereo_chemical_props.txt into a nice structure.

  Load literature values for bond lengths and bond angles and translate
  bond angles into the length of the opposite edge of the triangle
  ("residue_virtual_bonds").

  The file is looked up at 'alphafold/common/stereo_chemical_props.txt'
  relative to the current working directory. If nothing exists there, a copy
  located next to this module is used instead, so the loader also works when
  the process is not started from the repository root.

  Expected file layout (as parsed below): a header line, then bond rows
  `A-B resname length stddev`, a line containing only '-', one skipped line,
  a header line, then angle rows `A-B-C resname angle_deg stddev_deg`,
  optionally terminated by another '-' line.

  The result is memoized, so the file is read at most once per process.

  Returns:
    residue_bonds: dict that maps resname --> list of Bond tuples
    residue_virtual_bonds: dict that maps resname --> list of Bond tuples
    residue_bond_angles: dict that maps resname --> list of BondAngle tuples

  Raises:
    FileNotFoundError: If the properties file cannot be found.
  """
  import os  # pylint: disable=g-import-not-at-top

  stereo_chemical_props_path = (
      'alphafold/common/stereo_chemical_props.txt')
  if not os.path.exists(stereo_chemical_props_path):
    # Fall back to the copy shipped alongside this module. The original path
    # is kept when the fallback is missing too, so the raised error is the
    # same as before.
    module_relative_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'stereo_chemical_props.txt')
    if os.path.exists(module_relative_path):
      stereo_chemical_props_path = module_relative_path

  with open(stereo_chemical_props_path, 'rt') as f:
    stereo_chemical_props = f.read()
  lines_iter = iter(stereo_chemical_props.splitlines())

  # Load bond lengths.
  residue_bonds = {}
  next(lines_iter)  # Skip header line.
  for line in lines_iter:
    if line.strip() == '-':
      break
    bond, resname, length, stddev = line.split()
    atom1, atom2 = bond.split('-')
    residue_bonds.setdefault(resname, []).append(
        Bond(atom1, atom2, float(length), float(stddev)))
  residue_bonds['UNK'] = []

  # Load bond angles.
  residue_bond_angles = {}
  next(lines_iter)  # Skip empty line.
  next(lines_iter)  # Skip header line.
  for line in lines_iter:
    if line.strip() == '-':
      break
    bond, resname, angle_degree, stddev_degree = line.split()
    atom1, atom2, atom3 = bond.split('-')
    # Angles are stored in radians.
    residue_bond_angles.setdefault(resname, []).append(
        BondAngle(atom1, atom2, atom3,
                  float(angle_degree) / 180. * np.pi,
                  float(stddev_degree) / 180. * np.pi))
  residue_bond_angles['UNK'] = []

  def make_bond_key(atom1_name, atom2_name):
    """Unique key to lookup bonds."""
    return '-'.join(sorted([atom1_name, atom2_name]))

  # Translate bond angles into distances ("virtual bonds").
  residue_virtual_bonds = {}
  for resname, bond_angles in residue_bond_angles.items():
    # Create a fast lookup dict for bond lengths.
    bond_cache = {
        make_bond_key(b.atom1_name, b.atom2_name): b
        for b in residue_bonds[resname]
    }
    residue_virtual_bonds[resname] = []
    for ba in bond_angles:
      bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
      bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]

      # Compute distance between atom1 and atom3 using the law of cosines
      # c^2 = a^2 + b^2 - 2ab*cos(gamma).
      gamma = ba.angle_rad
      length = np.sqrt(bond1.length**2 + bond2.length**2
                       - 2 * bond1.length * bond2.length * np.cos(gamma))

      # Propagation of uncertainty assuming uncorrelated errors.
      dl_outer = 0.5 / length
      dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer
      dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer
      dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer
      stddev = np.sqrt((dl_dgamma * ba.stddev)**2 +
                       (dl_db1 * bond1.stddev)**2 +
                       (dl_db2 * bond2.stddev)**2)
      residue_virtual_bonds[resname].append(
          Bond(ba.atom1_name, ba.atom3name, length, stddev))

  return (residue_bonds,
          residue_virtual_bonds,
          residue_bond_angles)
477
+
478
+
479
+ # Between-residue bond lengths for general bonds (first element) and for Proline
480
+ # (second element).
481
+ between_res_bond_length_c_n = [1.329, 1.341]
482
+ between_res_bond_length_stddev_c_n = [0.014, 0.016]
483
+
484
+ # Between-residue cos_angles.
485
+ between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315
486
+ between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995
487
+
488
+ # This mapping is used when we need to store atom data in a format that requires
489
+ # fixed atom data size for every residue (e.g. a numpy array).
490
+ atom_types = [
491
+ 'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
492
+ 'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
493
+ 'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
494
+ 'CZ3', 'NZ', 'OXT'
495
+ ]
496
+ atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
497
+ atom_type_num = len(atom_types) # := 37.
498
+
499
+ # A compact atom encoding with 14 columns
500
+ # pylint: disable=line-too-long
501
+ # pylint: disable=bad-whitespace
502
+ restype_name_to_atom14_names = {
503
+ 'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''],
504
+ 'ARG': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2', '', '', ''],
505
+ 'ASN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''],
506
+ 'ASP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''],
507
+ 'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''],
508
+ 'GLN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''],
509
+ 'GLU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', ''],
510
+ 'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''],
511
+ 'HIS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2', '', '', '', ''],
512
+ 'ILE': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', ''],
513
+ 'LEU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', ''],
514
+ 'LYS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', ''],
515
+ 'MET': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', ''],
516
+ 'PHE': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', '', '', ''],
517
+ 'PRO': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', ''],
518
+ 'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''],
519
+ 'THR': ['N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', ''],
520
+ 'TRP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],
521
+ 'TYR': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH', '', ''],
522
+ 'VAL': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', ''],
523
+ 'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''],
524
+
525
+ }
526
+ # pylint: enable=line-too-long
527
+ # pylint: enable=bad-whitespace
528
+
529
+
530
+ # This is the standard residue order when coding AA type as a number.
531
+ # Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
532
+ restypes = [
533
+ 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P',
534
+ 'S', 'T', 'W', 'Y', 'V'
535
+ ]
536
+ restype_order = {restype: i for i, restype in enumerate(restypes)}
537
+ restype_num = len(restypes) # := 20.
538
+ unk_restype_index = restype_num # Catch-all index for unknown restypes.
539
+
540
+ restypes_with_x = restypes + ['X']
541
+ restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
542
+
543
+
544
def sequence_to_onehot(
    sequence: str,
    mapping: Mapping[str, int],
    map_unknown_to_x: bool = False) -> np.ndarray:
  """One-hot encodes an amino acid sequence according to `mapping`.

  Args:
    sequence: An amino acid sequence.
    mapping: A dictionary mapping amino acids to integers.
    map_unknown_to_x: If True, any upper-case letter that is not in the
      mapping is encoded as the unknown amino acid 'X'; the mapping must then
      contain 'X' (it is looked up for every character). Characters that are
      not upper-case letters are rejected. If False, any amino acid not in
      the mapping will throw an error.

  Returns:
    A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of
    the sequence.

  Raises:
    ValueError: If the mapping doesn't contain values from 0 to
      num_unique_aas - 1 without any gaps, or if `map_unknown_to_x` is set
      and the sequence contains a character that is not an upper-case letter.
    KeyError: If a required key is missing from the mapping.
  """
  num_entries = max(mapping.values()) + 1

  distinct_ids = sorted(set(mapping.values()))
  if distinct_ids != list(range(num_entries)):
    raise ValueError('The mapping must have values from 0 to num_unique_aas-1 '
                     'without any gaps. Got: %s' % sorted(mapping.values()))

  one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32)

  for row, letter in enumerate(sequence):
    if not map_unknown_to_x:
      column = mapping[letter]
    elif letter.isalpha() and letter.isupper():
      column = mapping.get(letter, mapping['X'])
    else:
      raise ValueError(f'Invalid character in the sequence: {letter}')
    one_hot_arr[row, column] = 1

  return one_hot_arr
585
+
586
+
587
+ restype_1to3 = {
588
+ 'A': 'ALA',
589
+ 'R': 'ARG',
590
+ 'N': 'ASN',
591
+ 'D': 'ASP',
592
+ 'C': 'CYS',
593
+ 'Q': 'GLN',
594
+ 'E': 'GLU',
595
+ 'G': 'GLY',
596
+ 'H': 'HIS',
597
+ 'I': 'ILE',
598
+ 'L': 'LEU',
599
+ 'K': 'LYS',
600
+ 'M': 'MET',
601
+ 'F': 'PHE',
602
+ 'P': 'PRO',
603
+ 'S': 'SER',
604
+ 'T': 'THR',
605
+ 'W': 'TRP',
606
+ 'Y': 'TYR',
607
+ 'V': 'VAL',
608
+ }
609
+
610
+
611
+ # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
612
+ # 1-to-1 mapping of 3 letter names to one letter names. The latter contains
613
+ # many more, and less common, three letter names as keys and maps many of these
614
+ # to the same one letter name (including 'X' and 'U' which we don't use here).
615
+ restype_3to1 = {v: k for k, v in restype_1to3.items()}
616
+
617
+ # Define a restype name for all unknown residues.
618
+ unk_restype = 'UNK'
619
+
620
+ resnames = [restype_1to3[r] for r in restypes] + [unk_restype]
621
+ resname_to_idx = {resname: i for i, resname in enumerate(resnames)}
622
+
623
+
624
+ # The mapping here uses hhblits convention, so that B is mapped to D, J and O
625
+ # are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
626
+ # remaining 20 amino acids are kept in alphabetical order.
627
+ # There are 2 non-amino acid codes, X (representing any amino acid) and
628
+ # "-" representing a missing amino acid in an alignment. The id for these
629
+ # codes is put at the end (20 and 21) so that they can easily be ignored if
630
+ # desired.
631
+ HHBLITS_AA_TO_ID = {
632
+ 'A': 0,
633
+ 'B': 2,
634
+ 'C': 1,
635
+ 'D': 2,
636
+ 'E': 3,
637
+ 'F': 4,
638
+ 'G': 5,
639
+ 'H': 6,
640
+ 'I': 7,
641
+ 'J': 20,
642
+ 'K': 8,
643
+ 'L': 9,
644
+ 'M': 10,
645
+ 'N': 11,
646
+ 'O': 20,
647
+ 'P': 12,
648
+ 'Q': 13,
649
+ 'R': 14,
650
+ 'S': 15,
651
+ 'T': 16,
652
+ 'U': 1,
653
+ 'V': 17,
654
+ 'W': 18,
655
+ 'X': 20,
656
+ 'Y': 19,
657
+ 'Z': 3,
658
+ '-': 21,
659
+ }
660
+
661
+ # Partial inversion of HHBLITS_AA_TO_ID.
662
+ ID_TO_HHBLITS_AA = {
663
+ 0: 'A',
664
+ 1: 'C', # Also U.
665
+ 2: 'D', # Also B.
666
+ 3: 'E', # Also Z.
667
+ 4: 'F',
668
+ 5: 'G',
669
+ 6: 'H',
670
+ 7: 'I',
671
+ 8: 'K',
672
+ 9: 'L',
673
+ 10: 'M',
674
+ 11: 'N',
675
+ 12: 'P',
676
+ 13: 'Q',
677
+ 14: 'R',
678
+ 15: 'S',
679
+ 16: 'T',
680
+ 17: 'V',
681
+ 18: 'W',
682
+ 19: 'Y',
683
+ 20: 'X', # Includes J and O.
684
+ 21: '-',
685
+ }
686
+
687
+ restypes_with_x_and_gap = restypes + ['X', '-']
688
+ MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(
689
+ restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i])
690
+ for i in range(len(restypes_with_x_and_gap)))
691
+
692
+
693
def _make_standard_atom_mask() -> np.ndarray:
  """Builds the residue-type by atom-type presence mask.

  Returns:
    An int32 array of shape [restype_num + 1, atom_type_num]. Entry (r, a) is
    1 when the residue at position r of `restypes` lists the atom whose
    `atom_order` index is a in `residue_atoms`, and 0 otherwise. The extra
    final row (unknown residue) is never written and stays all zeros.
  """
  atom_mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)
  for row, one_letter in enumerate(restypes):
    for name in residue_atoms[restype_1to3[one_letter]]:
      atom_mask[row, atom_order[name]] = 1
  return atom_mask
704
+
705
+
706
+ STANDARD_ATOM_MASK = _make_standard_atom_mask()
707
+
708
+
709
+ # A one hot representation for the first and second atoms defining the axis
710
+ # of rotation for each chi-angle in each residue.
711
def chi_angle_atom(atom_index: int) -> np.ndarray:
  """Defines chi-angle rigid groups via one-hot atom encodings.

  Args:
    atom_index: Position (within each 4-atom chi definition in
      `chi_angles_atoms`) of the atom to encode.

  Returns:
    A float array of shape [len(restypes) + 1, atom_type_num, 4]. For residue
    r and chi slot c, the column [r, :, c] is the one-hot encoding of the
    selected atom. The last residue row (residue `X`) is all zeros.
  """
  identity = np.eye(atom_type_num)

  # Atom-type index of the selected atom for every chi angle of every residue.
  # NOTE: residues with fewer than four chi angles are padded with -1, which
  # indexes the LAST row of the identity matrix (not an all-zero row).
  padded_indices = {}
  for res_name, chi_atoms in chi_angles_atoms.items():
    atom_ids = [atom_types.index(chi[atom_index]) for chi in chi_atoms]
    padded_indices[res_name] = atom_ids + [-1] * (4 - len(atom_ids))

  per_residue = [identity[padded_indices[restype_1to3[r]]] for r in restypes]
  per_residue.append(np.zeros([4, atom_type_num]))  # Add zeros for residue `X`.

  # [residue, chi, atom_type] -> [residue, atom_type, chi].
  return np.transpose(np.stack(per_residue, axis=0), [0, 2, 1])
731
+
732
+ chi_atom_1_one_hot = chi_angle_atom(1)
733
+ chi_atom_2_one_hot = chi_angle_atom(2)
734
+
735
+ # An array like chi_angles_atoms but using indices rather than names.
736
+ chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes]
737
+ chi_angles_atom_indices = tree.map_structure(
738
+ lambda atom_name: atom_order[atom_name], chi_angles_atom_indices)
739
+ chi_angles_atom_indices = np.array([
740
+ chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms)))
741
+ for chi_atoms in chi_angles_atom_indices])
742
+
743
+ # Mapping from (res_name, atom_name) pairs to the atom's chi group index
744
+ # and atom index within that group.
745
+ chi_groups_for_atom = collections.defaultdict(list)
746
+ for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
747
+ for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
748
+ for atom_i, atom in enumerate(chi_group):
749
+ chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))
750
+ chi_groups_for_atom = dict(chi_groups_for_atom)
751
+
752
+
753
def _make_rigid_transformation_4x4(ex, ey, translation):
  """Builds a homogeneous 4x4 rigid transform from two axes and an origin.

  Args:
    ex: 3-vector giving the direction of the frame's x axis (any length).
    ey: 3-vector used to fix the x-y plane; only its component perpendicular
      to `ex` is kept.
    translation: 3-vector placed in the last column of the matrix.

  Returns:
    A [4, 4] array whose first three columns are the orthonormal x, y and z
    axes, whose fourth column is `translation`, and whose last row is
    [0, 0, 0, 1].
  """
  x_axis = ex / np.linalg.norm(ex)

  # Gram-Schmidt step: strip the x component from ey, then rescale to unit
  # length.
  y_axis = ey - np.dot(ey, x_axis) * x_axis
  y_axis = y_axis / np.linalg.norm(y_axis)

  # The third axis completes the right-handed frame.
  z_axis = np.cross(x_axis, y_axis)

  top_rows = np.stack([x_axis, y_axis, z_axis, translation], axis=1)
  return np.concatenate([top_rows, [[0., 0., 0., 1.]]], axis=0)
767
+
768
+
769
# Lookup tables indexed by residue type (21 rows). They are allocated as zeros
# here and filled in place by `_make_rigid_group_constants()`:
#   * (restype, atomtype) --> rigid_group_idx, in the 37- and 14-atom layouts,
#   * (restype, atomtype) --> presence mask,
#   * (restype, atomtype, coord) --> atom position within its rigid group,
#   * (restype, group) --> 4x4 affine transform from one rigid group to the
#     previous group.
# The integer tables use the builtin `int` dtype: the `np.int` alias was
# deprecated in NumPy 1.20 and removed in 1.24, and named exactly this dtype.
restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
780
+
781
+
782
def _make_rigid_group_constants():
  """Fills the module-level rigid-group lookup tables in place.

  For every residue in `restypes` this writes, from
  `rigid_group_atom_positions`:
    * the rigid-group index, presence mask and group-local position of each
      atom, in both the 37-atom (`atom_order`) and 14-atom
      (`restype_name_to_atom14_names`) layouts;
    * the default 4x4 frame of each of the 8 rigid groups (0 backbone,
      1 pre-omega, 2 phi, 3 psi, 4-7 chi1-chi4), each expressed relative to
      the previous group as described in the comments below.
  Rows for residue types not in `restypes` are left untouched.
  """
  for res_idx, one_letter in enumerate(restypes):
    res_name = restype_1to3[one_letter]
    group_atoms = rigid_group_atom_positions[res_name]
    atom14_names = restype_name_to_atom14_names[res_name]

    # Per-atom tables, written once per layout.
    for atom_name, group, position in group_atoms:
      col37 = atom_order[atom_name]
      restype_atom37_to_rigid_group[res_idx, col37] = group
      restype_atom37_mask[res_idx, col37] = 1
      restype_atom37_rigid_group_positions[res_idx, col37, :] = position

      col14 = atom14_names.index(atom_name)
      restype_atom14_to_rigid_group[res_idx, col14] = group
      restype_atom14_mask[res_idx, col14] = 1
      restype_atom14_rigid_group_positions[res_idx, col14, :] = position

    coords = {atom_name: np.array(position)
              for atom_name, _, position in group_atoms}
    # Basic indexing yields a view, so writes below land in the global table.
    frames = restype_rigid_group_default_frame[res_idx]

    # backbone to backbone is the identity transform
    frames[0] = np.eye(4)

    # pre-omega-frame to backbone (currently dummy identity matrix)
    frames[1] = np.eye(4)

    # phi-frame to backbone
    frames[2] = _make_rigid_transformation_4x4(
        ex=coords['N'] - coords['CA'],
        ey=np.array([1., 0., 0.]),
        translation=coords['N'])

    # psi-frame to backbone
    frames[3] = _make_rigid_transformation_4x4(
        ex=coords['C'] - coords['CA'],
        ey=coords['CA'] - coords['N'],
        translation=coords['C'])

    # chi1-frame to backbone
    if chi_angles_mask[res_idx][0]:
      base = [coords[name] for name in chi_angles_atoms[res_name][0]]
      frames[4] = _make_rigid_transformation_4x4(
          ex=base[2] - base[1],
          ey=base[0] - base[1],
          translation=base[2])

    # chi2-frame to chi1-frame, chi3-frame to chi2-frame, chi4-frame to
    # chi3-frame: luckily all rotation axes for the next frame start at
    # (0,0,0) of the previous frame, so the axis end atom alone fixes both the
    # x direction and the translation.
    for chi_idx in (1, 2, 3):
      if not chi_angles_mask[res_idx][chi_idx]:
        continue
      axis_end = coords[chi_angles_atoms[res_name][chi_idx][2]]
      frames[4 + chi_idx] = _make_rigid_transformation_4x4(
          ex=axis_end,
          ey=np.array([-1., 0., 0.]),
          translation=axis_end)
848
+
849
+
850
+ _make_rigid_group_constants()
851
+
852
+
853
def make_atom14_dists_bounds(overlap_tolerance=1.5,
                             bond_length_tolerance_factor=15):
  """Computes per-residue distance bounds between atom14 slots.

  Args:
    overlap_tolerance: Amount subtracted from the sum of two atoms' van der
      Waals radii to obtain the lower bound of a non-bonded pair.
    bond_length_tolerance_factor: Number of standard deviations allowed on
      either side of the reference length of a bond or virtual bond.

  Returns:
    A dict with float32 arrays of shape (21, 14, 14) under the keys
    'lower_bound', 'upper_bound' and 'stddev'. Entries for empty atom slots,
    the diagonal, and residue types not in `restypes` stay zero.
  """
  lower_bound = np.zeros([21, 14, 14], np.float32)
  upper_bound = np.zeros([21, 14, 14], np.float32)
  stddev = np.zeros([21, 14, 14], np.float32)
  residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()

  for res_idx, one_letter in enumerate(restypes):
    res_name = restype_1to3[one_letter]
    atom14_names = restype_name_to_atom14_names[res_name]

    # Default for every pair of real atoms: a clash lower bound and an
    # effectively unbounded upper bound.
    for i, name_i in enumerate(atom14_names):
      if not name_i:
        continue
      radius_i = van_der_waals_radius[name_i[0]]
      for j, name_j in enumerate(atom14_names):
        if i == j or not name_j:
          continue
        radius_j = van_der_waals_radius[name_j[0]]
        clash_lower = radius_i + radius_j - overlap_tolerance
        for p, q in ((i, j), (j, i)):
          lower_bound[res_idx, p, q] = clash_lower
          upper_bound[res_idx, p, q] = 1e10

    # Bonds and virtual bonds (angles) overwrite the clash defaults with a
    # symmetric window around the reference length.
    for bond in residue_bonds[res_name] + residue_virtual_bonds[res_name]:
      i = atom14_names.index(bond.atom1_name)
      j = atom14_names.index(bond.atom2_name)
      slack = bond_length_tolerance_factor * bond.stddev
      for p, q in ((i, j), (j, i)):
        lower_bound[res_idx, p, q] = bond.length - slack
        upper_bound[res_idx, p, q] = bond.length + slack
        stddev[res_idx, p, q] = bond.stddev

  return {
      'lower_bound': lower_bound,  # shape (21,14,14)
      'upper_bound': upper_bound,  # shape (21,14,14)
      'stddev': stddev,  # shape (21,14,14)
  }
alphafold/alphafold/common/residue_constants_test.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Test that residue_constants generates correct values."""
16
+
17
+ from absl.testing import absltest
18
+ from absl.testing import parameterized
19
+ from alphafold.common import residue_constants
20
+ import numpy as np
21
+
22
+
23
+ class ResidueConstantsTest(parameterized.TestCase):
24
+
25
+ @parameterized.parameters(
26
+ ('ALA', 0),
27
+ ('CYS', 1),
28
+ ('HIS', 2),
29
+ ('MET', 3),
30
+ ('LYS', 4),
31
+ ('ARG', 4),
32
+ )
33
+ def testChiAnglesAtoms(self, residue_name, chi_num):
34
+ chi_angles_atoms = residue_constants.chi_angles_atoms[residue_name]
35
+ self.assertLen(chi_angles_atoms, chi_num)
36
+ for chi_angle_atoms in chi_angles_atoms:
37
+ self.assertLen(chi_angle_atoms, 4)
38
+
39
+ def testChiGroupsForAtom(self):
40
+ for k, chi_groups in residue_constants.chi_groups_for_atom.items():
41
+ res_name, atom_name = k
42
+ for chi_group_i, atom_i in chi_groups:
43
+ self.assertEqual(
44
+ atom_name,
45
+ residue_constants.chi_angles_atoms[res_name][chi_group_i][atom_i])
46
+
47
+ @parameterized.parameters(
48
+ ('ALA', 5), ('ARG', 11), ('ASN', 8), ('ASP', 8), ('CYS', 6), ('GLN', 9),
49
+ ('GLU', 9), ('GLY', 4), ('HIS', 10), ('ILE', 8), ('LEU', 8), ('LYS', 9),
50
+ ('MET', 8), ('PHE', 11), ('PRO', 7), ('SER', 6), ('THR', 7), ('TRP', 14),
51
+ ('TYR', 12), ('VAL', 7)
52
+ )
53
+ def testResidueAtoms(self, atom_name, num_residue_atoms):
54
+ residue_atoms = residue_constants.residue_atoms[atom_name]
55
+ self.assertLen(residue_atoms, num_residue_atoms)
56
+
57
+ def testStandardAtomMask(self):
58
+ with self.subTest('Check shape'):
59
+ self.assertEqual(residue_constants.STANDARD_ATOM_MASK.shape, (21, 37,))
60
+
61
+ with self.subTest('Check values'):
62
+ str_to_row = lambda s: [c == '1' for c in s] # More clear/concise.
63
+ np.testing.assert_array_equal(
64
+ residue_constants.STANDARD_ATOM_MASK,
65
+ np.array([
66
+ # NB This was defined by c+p but looks sane.
67
+ str_to_row('11111 '), # ALA
68
+ str_to_row('111111 1 1 11 1 '), # ARG
69
+ str_to_row('111111 11 '), # ASP
70
+ str_to_row('111111 11 '), # ASN
71
+ str_to_row('11111 1 '), # CYS
72
+ str_to_row('111111 1 11 '), # GLU
73
+ str_to_row('111111 1 11 '), # GLN
74
+ str_to_row('111 1 '), # GLY
75
+ str_to_row('111111 11 1 1 '), # HIS
76
+ str_to_row('11111 11 1 '), # ILE
77
+ str_to_row('111111 11 '), # LEU
78
+ str_to_row('111111 1 1 1 '), # LYS
79
+ str_to_row('111111 11 '), # MET
80
+ str_to_row('111111 11 11 1 '), # PHE
81
+ str_to_row('111111 1 '), # PRO
82
+ str_to_row('11111 1 '), # SER
83
+ str_to_row('11111 1 1 '), # THR
84
+ str_to_row('111111 11 11 1 1 11 '), # TRP
85
+ str_to_row('111111 11 11 11 '), # TYR
86
+ str_to_row('11111 11 '), # VAL
87
+ str_to_row(' '), # UNK
88
+ ]))
89
+
90
+ with self.subTest('Check row totals'):
91
+ # Check each row has the right number of atoms.
92
+ for row, restype in enumerate(residue_constants.restypes): # A, R, ...
93
+ long_restype = residue_constants.restype_1to3[restype] # ALA, ARG, ...
94
+ atoms_names = residue_constants.residue_atoms[
95
+ long_restype] # ['C', 'CA', 'CB', 'N', 'O'], ...
96
+ self.assertLen(atoms_names,
97
+ residue_constants.STANDARD_ATOM_MASK[row, :].sum(),
98
+ long_restype)
99
+
100
+ def testAtomTypes(self):
101
+ self.assertEqual(residue_constants.atom_type_num, 37)
102
+
103
+ self.assertEqual(residue_constants.atom_types[0], 'N')
104
+ self.assertEqual(residue_constants.atom_types[1], 'CA')
105
+ self.assertEqual(residue_constants.atom_types[2], 'C')
106
+ self.assertEqual(residue_constants.atom_types[3], 'CB')
107
+ self.assertEqual(residue_constants.atom_types[4], 'O')
108
+
109
+ self.assertEqual(residue_constants.atom_order['N'], 0)
110
+ self.assertEqual(residue_constants.atom_order['CA'], 1)
111
+ self.assertEqual(residue_constants.atom_order['C'], 2)
112
+ self.assertEqual(residue_constants.atom_order['CB'], 3)
113
+ self.assertEqual(residue_constants.atom_order['O'], 4)
114
+ self.assertEqual(residue_constants.atom_type_num, 37)
115
+
116
+ def testRestypes(self):
117
+ three_letter_restypes = [
118
+ residue_constants.restype_1to3[r] for r in residue_constants.restypes]
119
+ for restype, exp_restype in zip(
120
+ three_letter_restypes, sorted(residue_constants.restype_1to3.values())):
121
+ self.assertEqual(restype, exp_restype)
122
+ self.assertEqual(residue_constants.restype_num, 20)
123
+
124
+ def testSequenceToOneHotHHBlits(self):
125
+ one_hot = residue_constants.sequence_to_onehot(
126
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ-', residue_constants.HHBLITS_AA_TO_ID)
127
+ exp_one_hot = np.array(
128
+ [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
129
+ [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
130
+ [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
131
+ [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
132
+ [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
133
+ [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
134
+ [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
135
+ [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
136
+ [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
137
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
138
+ [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
139
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
140
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
141
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
142
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
143
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
144
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
145
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
146
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
147
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
148
+ [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
149
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
150
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
151
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
152
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
153
+ [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
154
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
155
+ np.testing.assert_array_equal(one_hot, exp_one_hot)
156
+
157
+ def testSequenceToOneHotStandard(self):
158
+ one_hot = residue_constants.sequence_to_onehot(
159
+ 'ARNDCQEGHILKMFPSTWYV', residue_constants.restype_order)
160
+ np.testing.assert_array_equal(one_hot, np.eye(20))
161
+
162
+ def testSequenceToOneHotUnknownMapping(self):
163
+ seq = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
164
+ expected_out = np.zeros([26, 21])
165
+ for row, position in enumerate(
166
+ [0, 20, 4, 3, 6, 13, 7, 8, 9, 20, 11, 10, 12, 2, 20, 14, 5, 1, 15, 16,
167
+ 20, 19, 17, 20, 18, 20]):
168
+ expected_out[row, position] = 1
169
+ aa_types = residue_constants.sequence_to_onehot(
170
+ sequence=seq,
171
+ mapping=residue_constants.restype_order_with_x,
172
+ map_unknown_to_x=True)
173
+ self.assertTrue((aa_types == expected_out).all())
174
+
175
+ @parameterized.named_parameters(
176
+ ('lowercase', 'aaa'), # Insertions in A3M.
177
+ ('gaps', '---'), # Gaps in A3M.
178
+ ('dots', '...'), # Gaps in A3M.
179
+ ('metadata', '>TEST'), # FASTA metadata line.
180
+ )
181
+ def testSequenceToOneHotUnknownMappingError(self, seq):
182
+ with self.assertRaises(ValueError):
183
+ residue_constants.sequence_to_onehot(
184
+ sequence=seq,
185
+ mapping=residue_constants.restype_order_with_x,
186
+ map_unknown_to_x=True)
187
+
188
+
189
+ if __name__ == '__main__':
190
+ absltest.main()
alphafold/alphafold/common/testdata/2rbg.pdb ADDED
The diff for this file is too large to render. See raw diff
 
alphafold/alphafold/data/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Data pipeline for model features."""
alphafold/alphafold/data/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (191 Bytes). View file
 
alphafold/alphafold/data/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (213 Bytes). View file
 
alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-36.pyc ADDED
Binary file (11.4 kB). View file
 
alphafold/alphafold/data/__pycache__/mmcif_parsing.cpython-38.pyc ADDED
Binary file (11.6 kB). View file
 
alphafold/alphafold/data/__pycache__/parsers.cpython-36.pyc ADDED
Binary file (9.79 kB). View file
 
alphafold/alphafold/data/__pycache__/parsers.cpython-38.pyc ADDED
Binary file (9.9 kB). View file
 
alphafold/alphafold/data/__pycache__/pipeline.cpython-36.pyc ADDED
Binary file (5.88 kB). View file
 
alphafold/alphafold/data/__pycache__/pipeline.cpython-38.pyc ADDED
Binary file (6.03 kB). View file
 
alphafold/alphafold/data/__pycache__/templates.cpython-36.pyc ADDED
Binary file (27.7 kB). View file
 
alphafold/alphafold/data/__pycache__/templates.cpython-38.pyc ADDED
Binary file (28 kB). View file
 
alphafold/alphafold/data/mmcif_parsing.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Parses the mmCIF file format."""
16
+ import collections
17
+ import dataclasses
18
+ import io
19
+ from typing import Any, Mapping, Optional, Sequence, Tuple
20
+
21
+ from absl import logging
22
+ from Bio import PDB
23
+ from Bio.Data import SCOPData
24
+
25
+ # Type aliases:
26
+ ChainId = str
27
+ PdbHeader = Mapping[str, Any]
28
+ PdbStructure = PDB.Structure.Structure
29
+ SeqRes = str
30
+ MmCIFDict = Mapping[str, Sequence[str]]
31
+
32
+
33
@dataclasses.dataclass(frozen=True)
class Monomer:
  """One monomer of a polymer sequence, as built by `_get_protein_chains`."""
  id: str  # Taken from `_entity_poly_seq.mon_id`.
  num: int  # Taken from `_entity_poly_seq.num`, converted to int.
37
+
38
+
39
# Note - mmCIF format provides no guarantees on the type of author-assigned
# sequence numbers. They need not be integers.
@dataclasses.dataclass(frozen=True)
class AtomSite:
  """One row of the mmCIF `_atom_site` loop, kept as raw strings.

  `_get_atom_site_list` fills every field straight from the parsed mmCIF dict
  without conversion, so all fields hold strings; `parse` converts the
  sequence numbers with `int(...)` and compares `model_num` against '1'.
  """
  residue_name: str  # `_atom_site.label_comp_id`
  author_chain_id: str  # `_atom_site.auth_asym_id`
  mmcif_chain_id: str  # `_atom_site.label_asym_id`
  author_seq_num: str  # `_atom_site.auth_seq_id`
  mmcif_seq_num: str  # `_atom_site.label_seq_id`
  insertion_code: str  # `_atom_site.pdbx_PDB_ins_code`
  hetatm_atom: str  # `_atom_site.group_PDB`
  model_num: str  # `_atom_site.pdbx_PDB_model_num`
51
+
52
+
53
# Used to map SEQRES index to a residue in the structure.
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
  """Author-assigned location of a residue, as built by `parse`."""
  chain_id: str  # Author chain id of the atom site.
  residue_number: int  # Author sequence number, converted with int().
  insertion_code: str  # Insertion code; ' ' when unset in the mmCIF data.
59
+
60
+
61
@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
  """A SEQRES entry together with its location in the structure, if any."""
  position: Optional[ResiduePosition]  # None for residues marked missing.
  name: str  # Residue name (atom site comp id, or monomer id if missing).
  is_missing: bool  # True when no atom site was seen for this SEQRES index.
  hetflag: str  # ' ' for ATOM records, 'W' for water, else 'H_' + name.
67
+
68
+
69
@dataclasses.dataclass(frozen=True)
class MmcifObject:
  """Representation of a parsed mmCIF file.

  Contains:
    file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
      files being processed.
    header: Header dict. As built by `parse`, it holds 'structure_method',
      'resolution' and, when available, 'release_date'.
    structure: Biopython structure. `parse` stores only the first model here.
    chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
      {'A': 'ABCDEFG'}
    seqres_to_structure: Dict; for each chain_id contains a mapping between
      SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
                                                        1: ResidueAtPosition,
                                                        ...}}
    raw_string: The raw data used to construct the MmcifObject. NOTE: despite
      the name, `parse` stores the parsed mmCIF dict here, not the input text.
  """
  file_id: str
  header: PdbHeader
  structure: PdbStructure
  chain_to_seqres: Mapping[ChainId, SeqRes]
  seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
  raw_string: Any
92
+
93
+
94
@dataclasses.dataclass(frozen=True)
class ParsingResult:
  """Returned by the parse function.

  Contains:
    mmcif_object: A MmcifObject, may be None if no chain could be successfully
      parsed.
    errors: A dict mapping (file_id, chain_id) to any exception generated.
      `parse` always uses '' as the chain_id, and stores a plain message
      string instead of an exception when no protein chains are found.
  """
  mmcif_object: Optional[MmcifObject]
  errors: Mapping[Tuple[str, str], Any]
105
+
106
+
107
+ class ParseError(Exception):
108
+ """An error indicating that an mmCIF file could not be parsed."""
109
+
110
+
111
def mmcif_loop_to_list(prefix: str,
                       parsed_info: MmCIFDict) -> Sequence[Mapping[str, str]]:
  """Extracts the loop that shares a prefix from mmCIF data, as a list of rows.

  Reference for loop_ in mmCIF:
    http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html

  Args:
    prefix: Prefix shared by each of the data items in the loop.
      e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
      _entity_poly_seq.mon_id. Should include the trailing period.
    parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
      parser.

  Returns:
    A list of dicts, one per loop entry, keyed by the full data item name.
    Empty when no key starts with `prefix`.

  Raises:
    AssertionError: If the matching columns do not all have the same length.
  """
  # Dicts preserve insertion order, so columns keep the order of parsed_info.
  columns = {key: values for key, values in parsed_info.items()
             if key.startswith(prefix)}
  cols = list(columns)
  data = list(columns.values())

  assert len({len(xs) for xs in data}) <= 1, (
      'mmCIF error: Not all loops are the same length: %s' % cols)

  rows = []
  for row_values in zip(*data):
    rows.append(dict(zip(cols, row_values)))
  return rows
139
+
140
+
141
def mmcif_loop_to_dict(prefix: str,
                       index: str,
                       parsed_info: MmCIFDict,
                       ) -> Mapping[str, Mapping[str, str]]:
  """Extracts the loop that shares a prefix from mmCIF data, keyed by a column.

  Args:
    prefix: Prefix shared by each of the data items in the loop.
      e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
      _entity_poly_seq.mon_id. Should include the trailing period.
    index: Which item of loop data should serve as the key.
    parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
      parser.

  Returns:
    A dict of dicts; each inner dict is one entry of the loop, stored under
    its value of the `index` column. If several entries share that value, the
    last one wins.
  """
  entries_by_key = {}
  for entry in mmcif_loop_to_list(prefix, parsed_info):
    entries_by_key[entry[index]] = entry
  return entries_by_key
161
+
162
+
163
def parse(*,
          file_id: str,
          mmcif_string: str,
          catch_all_errors: bool = True) -> ParsingResult:
  """Entry point, parses an mmcif_string.

  Only atom sites whose model number is '1' are used to build the mapping
  between sequence positions and structure residues.

  Args:
    file_id: A string identifier for this file. Should be unique within the
      collection of files being processed.
    mmcif_string: Contents of an mmCIF file.
    catch_all_errors: If True, all exceptions are caught and error messages are
      returned as part of the ParsingResult. If False exceptions will be allowed
      to propagate.

  Returns:
    A ParsingResult. Its mmcif_object is None when no protein chains are found
    or when an exception was caught.
  """
  errors = {}
  try:
    mmcif_parser = PDB.MMCIFParser(QUIET=True)
    full_structure = mmcif_parser.get_structure('', io.StringIO(mmcif_string))
    first_model_structure = _get_first_model(full_structure)
    # Extract the _mmcif_dict from the parser, which contains useful fields not
    # reflected in the Biopython structure.
    parsed_info = mmcif_parser._mmcif_dict  # pylint:disable=protected-access

    # Ensure all values are lists, even if singletons. Only existing keys are
    # reassigned, so mutating during iteration is safe.
    for key, value in parsed_info.items():
      if not isinstance(value, list):
        parsed_info[key] = [value]

    header = _get_header(parsed_info)

    # Determine the protein chains, and their start numbers according to the
    # internal mmCIF numbering scheme (likely but not guaranteed to be 1).
    valid_chains = _get_protein_chains(parsed_info=parsed_info)
    if not valid_chains:
      return ParsingResult(
          None, {(file_id, ''): 'No protein chains found in this file.'})
    seq_start_num = {}
    for chain_id, monomers in valid_chains.items():
      seq_start_num[chain_id] = min(monomer.num for monomer in monomers)

    # Loop over the atoms for which we have coordinates. Populate two mappings:
    # -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used
    # the authors / Biopython).
    # -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
    mmcif_to_author_chain_id = {}
    seq_to_structure_mappings = {}
    for atom in _get_atom_site_list(parsed_info):
      if atom.model_num != '1':
        # We only process the first model at the moment.
        continue

      mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id
      if atom.mmcif_chain_id not in valid_chains:
        continue

      # Water atoms are assigned a special hetflag of W in Biopython. We need
      # to do the same, so that this hetflag can be used to fetch a residue
      # from the Biopython structure by id.
      if atom.hetatm_atom != 'HETATM':
        hetflag = ' '
      elif atom.residue_name in ('HOH', 'WAT'):
        hetflag = 'W'
      else:
        hetflag = 'H_' + atom.residue_name

      insertion_code = (
          atom.insertion_code if _is_set(atom.insertion_code) else ' ')
      position = ResiduePosition(chain_id=atom.author_chain_id,
                                 residue_number=int(atom.author_seq_num),
                                 insertion_code=insertion_code)
      seq_idx = int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
      chain_mapping = seq_to_structure_mappings.setdefault(
          atom.author_chain_id, {})
      chain_mapping[seq_idx] = ResidueAtPosition(position=position,
                                                 name=atom.residue_name,
                                                 is_missing=False,
                                                 hetflag=hetflag)

    author_chain_to_sequence = {}
    for chain_id, monomers in valid_chains.items():
      author_chain = mmcif_to_author_chain_id[chain_id]

      # Add missing residue information to seq_to_structure_mappings.
      chain_mapping = seq_to_structure_mappings[author_chain]
      for idx, monomer in enumerate(monomers):
        if idx in chain_mapping:
          continue
        chain_mapping[idx] = ResidueAtPosition(position=None,
                                               name=monomer.id,
                                               is_missing=True,
                                               hetflag=' ')

      # One-letter sequence; anything without a single-letter code becomes X.
      letters = []
      for monomer in monomers:
        code = SCOPData.protein_letters_3to1.get(monomer.id, 'X')
        letters.append(code if len(code) == 1 else 'X')
      author_chain_to_sequence[author_chain] = ''.join(letters)

    mmcif_object = MmcifObject(
        file_id=file_id,
        header=header,
        structure=first_model_structure,
        chain_to_seqres=author_chain_to_sequence,
        seqres_to_structure=seq_to_structure_mappings,
        raw_string=parsed_info)

    return ParsingResult(mmcif_object=mmcif_object, errors=errors)
  except Exception as e:  # pylint:disable=broad-except
    errors[(file_id, '')] = e
    if not catch_all_errors:
      raise
    return ParsingResult(mmcif_object=None, errors=errors)
278
+
279
+
280
def _get_first_model(structure: PdbStructure) -> PdbStructure:
  """Returns the first model yielded by `structure.get_models()`."""
  model_iterator = structure.get_models()
  return next(model_iterator)
283
+
284
+ _MIN_LENGTH_OF_CHAIN_TO_BE_COUNTED_AS_PEPTIDE = 21
285
+
286
+
287
def get_release_date(parsed_info: MmCIFDict) -> str:
  """Returns the oldest revision date.

  The dates under '_pdbx_audit_revision_history.revision_date' are compared
  as strings; the smallest one is returned.
  """
  return min(parsed_info['_pdbx_audit_revision_history.revision_date'])
291
+
292
+
293
def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
  """Returns a basic header containing method, release date and resolution.

  Args:
    parsed_info: A dict of parsed mmCIF data whose values are lists.

  Returns:
    A dict with 'structure_method' (comma-joined, lower-cased `_exptl.method`
    values), 'release_date' (only when revision history is present) and
    'resolution' (0.0 unless one of the resolution fields parses as a float;
    when several parse, the last one in the checked order wins).
  """
  methods = [experiment['_exptl.method'].lower()
             for experiment in mmcif_loop_to_list('_exptl.', parsed_info)]
  header = {'structure_method': ','.join(methods)}

  # Note: The release_date here corresponds to the oldest revision. We prefer to
  # use this for dataset filtering over the deposition_date.
  if '_pdbx_audit_revision_history.revision_date' in parsed_info:
    header['release_date'] = get_release_date(parsed_info)
  else:
    logging.warning('Could not determine release_date: %s',
                    parsed_info['_entry.id'])

  header['resolution'] = 0.00
  resolution_keys = ('_refine.ls_d_res_high',
                     '_em_3d_reconstruction.resolution',
                     '_reflns.d_resolution_high')
  for res_key in resolution_keys:
    if res_key not in parsed_info:
      continue
    try:
      header['resolution'] = float(parsed_info[res_key][0])
    except ValueError:
      logging.warning('Invalid resolution format: %s', parsed_info[res_key])

  return header
320
+
321
+
322
def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
  """Returns list of atom sites; contains data not present in the structure.

  One AtomSite is built per row of the `_atom_site` loop. Rows are formed by
  zipping the columns, so the result is as long as the shortest column.
  """
  # Column order must match the field order of AtomSite.
  column_keys = (
      '_atom_site.label_comp_id',
      '_atom_site.auth_asym_id',
      '_atom_site.label_asym_id',
      '_atom_site.auth_seq_id',
      '_atom_site.label_seq_id',
      '_atom_site.pdbx_PDB_ins_code',
      '_atom_site.group_PDB',
      '_atom_site.pdbx_PDB_model_num',
  )
  columns = [parsed_info[key] for key in column_keys]
  return [AtomSite(*row) for row in zip(*columns)]
334
+
335
+
336
def _get_protein_chains(
    *, parsed_info: Mapping[str, Any]) -> Mapping[ChainId, Sequence[Monomer]]:
  """Extracts polymer information for protein chains only.

  Args:
    parsed_info: _mmcif_dict produced by the Biopython parser.

  Returns:
    A dict mapping mmcif chain id to a list of Monomers. Chains of the same
    entity share one list object.
  """
  # Polymer sequence of each entity in the structure.
  polymers = collections.defaultdict(list)
  for row in mmcif_loop_to_list('_entity_poly_seq.', parsed_info):
    polymers[row['_entity_poly_seq.entity_id']].append(
        Monomer(id=row['_entity_poly_seq.mon_id'],
                num=int(row['_entity_poly_seq.num'])))

  # Chemical compositions. Will allow us to identify which of these polymers
  # are proteins.
  chem_comps = mmcif_loop_to_dict('_chem_comp.', '_chem_comp.id', parsed_info)

  # Chains of each entity. Necessary so that we can return a dict keyed on
  # chain id rather than entity.
  entity_to_mmcif_chains = collections.defaultdict(list)
  for row in mmcif_loop_to_list('_struct_asym.', parsed_info):
    entity_to_mmcif_chains[row['_struct_asym.entity_id']].append(
        row['_struct_asym.id'])

  # Identify and return the valid protein chains.
  valid_chains = {}
  for entity_id, seq_info in polymers.items():
    chain_ids = entity_to_mmcif_chains[entity_id]

    # Reject polymers without any peptide-like components, such as DNA/RNA.
    # The list is built eagerly so that every monomer is looked up in
    # chem_comps, as before.
    peptide_flags = ['peptide' in chem_comps[monomer.id]['_chem_comp.type']
                     for monomer in seq_info]
    if not any(peptide_flags):
      continue
    valid_chains.update({chain_id: seq_info for chain_id in chain_ids})
  return valid_chains
380
+
381
+
382
def _is_set(data: str) -> bool:
  """Returns False if data is a special mmCIF character indicating 'unset'.

  The two 'unset' markers are '.' and '?'; every other value, including the
  empty string, counts as set.
  """
  is_unset = data == '.' or data == '?'
  return not is_unset
alphafold/alphafold/data/parsers.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for parsing various file formats."""
16
+ import collections
17
+ import dataclasses
18
+ import re
19
+ import string
20
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
21
+
22
# One row of integer deletion counts per aligned sequence, one count per
# residue position.
DeletionMatrix = Sequence[Sequence[int]]
23
+
24
+
25
@dataclasses.dataclass(frozen=True)
class TemplateHit:
  """Immutable record describing a single template hit."""
  # Number of the hit.
  index: int
  # Name of the hit.
  name: str
  # Number of aligned columns reported for the hit.
  aligned_cols: int
  # Sum of probabilities reported for the hit.
  sum_probs: float
  # Aligned query sequence.
  query: str
  # Aligned hit (template) sequence.
  hit_sequence: str
  # Residue indices for the columns of `query`.
  indices_query: List[int]
  # Residue indices for the columns of `hit_sequence`.
  indices_hit: List[int]
36
+
37
+
38
def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
  """Splits the contents of a FASTA file into sequences and descriptions.

  Arguments:
    fasta_string: The string contents of a FASTA file.

  Returns:
    A tuple of two lists:
    * The sequences, each one the concatenation of the (stripped) lines that
      follow its '>' line.
    * The descriptions, i.e. the '>' lines without the leading '>', in the
      same order as the sequences.
  """
  descriptions = []
  chunks_per_record = []
  for raw_line in fasta_string.splitlines():
    text = raw_line.strip()
    if not text:
      continue  # Blank lines carry no information.
    if text.startswith('>'):
      # A new record starts: remember its description, open an empty body.
      descriptions.append(text[1:])
      chunks_per_record.append([])
    else:
      # Sequence data belongs to the most recently opened record.
      chunks_per_record[-1].append(text)

  sequences = [''.join(chunks) for chunks in chunks_per_record]
  return sequences, descriptions
65
+
66
+
67
def parse_stockholm(
    stockholm_string: str
) -> Tuple[Sequence[str], DeletionMatrix, Sequence[str]]:
  """Extracts the MSA and its deletion matrix from a Stockholm alignment.

  Args:
    stockholm_string: The string contents of a stockholm file. The first
      sequence in the file should be the query sequence.

  Returns:
    A tuple of:
      * A list of sequences that have been aligned to the query, restricted to
        the columns where the query has a residue. These might contain
        duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
      * The names of the targets matched, in file order, including the
        jackhmmer subsequence suffix.
  """
  # Stitch together the pieces of every sequence; a name can show up once per
  # alignment block.
  rows = collections.OrderedDict()
  for raw_line in stockholm_string.splitlines():
    stripped = raw_line.strip()
    if stripped and not stripped.startswith(('#', '//')):
      name, chunk = stripped.split()
      rows[name] = rows.get(name, '') + chunk

  msa = []
  deletion_matrix = []
  query = ''
  keep_columns = []
  for row_number, row in enumerate(rows.values()):
    if row_number == 0:
      # The first row is the query: only its non-gap columns are kept.
      query = row
      keep_columns = [col for col, res in enumerate(query) if res != '-']

    msa.append(''.join(row[col] for col in keep_columns))

    # Residues sitting in query-gap columns are accumulated and charged to the
    # next column in which the query has a residue.
    deletions = []
    pending = 0
    for seq_res, query_res in zip(row, query):
      if seq_res == '-' and query_res == '-':
        continue
      if query_res == '-':
        pending += 1
        continue
      deletions.append(pending)
      pending = 0
    deletion_matrix.append(deletions)

  return msa, deletion_matrix, list(rows.keys())
125
+
126
+
127
def parse_a3m(a3m_string: str) -> Tuple[Sequence[str], DeletionMatrix]:
  """Extracts the MSA and its deletion matrix from an a3m alignment.

  Args:
    a3m_string: The string contents of a a3m file. The first sequence in the
      file should be the query sequence.

  Returns:
    A tuple of:
      * A list of sequences that have been aligned to the query, with the
        lowercase letters removed. These might contain duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
  """
  sequences, _ = parse_fasta(a3m_string)

  deletion_matrix = []
  for row in sequences:
    row_deletions = []
    pending = 0
    for residue in row:
      if residue.islower():
        # Lowercase letters are counted against the next remaining column.
        pending += 1
        continue
      row_deletions.append(pending)
      pending = 0
    deletion_matrix.append(row_deletions)

  # Dropping every lowercase letter leaves the aligned (deletion-free) rows.
  strip_lowercase = str.maketrans('', '', string.ascii_lowercase)
  aligned_sequences = [row.translate(strip_lowercase) for row in sequences]
  return aligned_sequences, deletion_matrix
159
+
160
+
161
+ def _convert_sto_seq_to_a3m(
162
+ query_non_gaps: Sequence[bool], sto_seq: str) -> Iterable[str]:
163
+ for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq):
164
+ if is_query_res_non_gap:
165
+ yield sequence_res
166
+ elif sequence_res != '-':
167
+ yield sequence_res.lower()
168
+
169
+
170
def convert_stockholm_to_a3m(stockholm_format: str,
                             max_sequences: Optional[int] = None) -> str:
  """Converts MSA in Stockholm format to the A3M format.

  Args:
    stockholm_format: The string contents of a Stockholm file. The first
      sequence is taken to be the query.
    max_sequences: If set (and non-zero), sequences whose name is first seen
      after this many distinct names have been collected are dropped.

  Returns:
    The alignment as FASTA-style text: one '>name description' line followed
    by the a3m row per kept sequence, with a terminating newline.
  """
  all_lines = stockholm_format.splitlines()

  # First pass: stitch together the alignment rows of every kept sequence.
  sequences = {}
  reached_max_sequences = False
  for line in all_lines:
    reached_max_sequences = max_sequences and len(sequences) >= max_sequences
    # Blank lines, markup and end symbols are not alignment rows.
    if not line.strip() or line.startswith(('#', '//')):
      continue
    seqname, aligned_seq = line.split(maxsplit=1)
    if seqname in sequences:
      sequences[seqname] += aligned_seq
    elif not reached_max_sequences:
      sequences[seqname] = aligned_seq

  # Second pass: pick up the 'DE' descriptions. Example row:
  # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
  descriptions = {}
  for line in all_lines:
    if line[:4] != '#=GS':
      continue
    columns = line.split(maxsplit=3)
    seqname, feature = columns[1:3]
    value = columns[3] if len(columns) == 4 else ''
    if feature != 'DE':
      continue
    if reached_max_sequences and seqname not in sequences:
      continue
    descriptions[seqname] = value
    if len(descriptions) == len(sequences):
      break

  # The query is assumed to be the first sequence; its gap pattern decides
  # which columns survive in the a3m rows.
  query_sequence = next(iter(sequences.values()))
  query_non_gaps = [res != '-' for res in query_sequence]
  a3m_sequences = {
      seqname: ''.join(_convert_sto_seq_to_a3m(query_non_gaps, sto_sequence))
      for seqname, sto_sequence in sequences.items()
  }

  fasta_chunks = (f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
                  for k in a3m_sequences)
  return '\n'.join(fasta_chunks) + '\n'  # Include terminating newline.
216
+
217
+
218
+ def _get_hhr_line_regex_groups(
219
+ regex_pattern: str, line: str) -> Sequence[Optional[str]]:
220
+ match = re.match(regex_pattern, line)
221
+ if match is None:
222
+ raise RuntimeError(f'Could not parse query line {line}')
223
+ return match.groups()
224
+
225
+
226
+ def _update_hhr_residue_indices_list(
227
+ sequence: str, start_index: int, indices_list: List[int]):
228
+ """Computes the relative indices for each residue with respect to the original sequence."""
229
+ counter = start_index
230
+ for symbol in sequence:
231
+ if symbol == '-':
232
+ indices_list.append(-1)
233
+ else:
234
+ indices_list.append(counter)
235
+ counter += 1
236
+
237
+
238
def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
  """Parses the detailed HMM HMM comparison section for a single Hit.

  This works on .hhr files generated from both HHBlits and HHSearch.

  Args:
    detailed_lines: A list of lines from a single comparison section between 2
      sequences (which each have their own HMM's). Line 0 must end with the
      hit number, line 1 holds the hit name after its first character and
      line 2 is the 'Probab=... Template_Neff=...' summary line.

  Returns:
    A TemplateHit with the information from that detailed comparison section.

  Raises:
    RuntimeError: If a certain line cannot be processed
  """
  # The first two lines carry the hit number and the hit name.
  hit_number = int(detailed_lines[0].split()[-1])
  hit_name = detailed_lines[1][1:]

  # The third line is the summary of the comparison.
  summary_line = detailed_lines[2]
  summary_pattern = (
      'Probab=(.*)[\t ]*'
      'E-value=(.*)[\t ]*'
      'Score=(.*)[\t ]*'
      'Aligned_cols=(.*)[\t ]*'
      'Identities=(.*)%[\t ]*'
      'Similarity=(.*)[\t ]*'
      'Sum_probs=(.*)[\t ]*'
      'Template_Neff=(.*)')
  summary = re.match(summary_pattern, summary_line)
  if summary is None:
    raise RuntimeError(
        'Could not parse section: %s. Expected this: \n%s to contain summary.' %
        (detailed_lines, summary_line))
  # All eight fields have to be numeric, but only Aligned_cols (position 3)
  # and Sum_probs (position 6) end up in the result.
  summary_values = [float(field) for field in summary.groups()]
  aligned_cols = summary_values[3]
  sum_probs = summary_values[6]

  # The remaining lines hold the alignment in a 'human readable' format made
  # of fixed-length blocks. Each block is assumed to start with the query
  # sequence row, which is parsed with a regexp in order to deduce the length
  # used for that block.
  query = ''
  hit_sequence = ''
  indices_query = []
  indices_hit = []
  length_block = None

  # Annotation rows that share the 'Q '/'T ' prefix but hold no sequence.
  query_annotation_rows = ('Q ss_dssp', 'Q ss_pred', 'Q Consensus')
  hit_annotation_rows = ('T ss_dssp', 'T ss_pred', 'T Consensus')

  for line in detailed_lines[3:]:
    if line.startswith('Q ') and not line.startswith(query_annotation_rows):
      # The first 17 characters must be 'Q <query_name> ', and everything
      # after that is parsed as:
      #              start    sequence       end       total_sequence_length
      row_pattern = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)'
      groups = _get_hhr_line_regex_groups(row_pattern, line[17:])

      # The block length implied by the start and end indices has to agree
      # with the actual length of the row.
      start = int(groups[0]) - 1  # Make index zero based.
      delta_query = groups[1]
      end = int(groups[2])
      length_block = end - start + delta_query.count('-')
      assert length_block == len(delta_query)

      # Extend the query sequence and its indices.
      query += delta_query
      _update_hhr_residue_indices_list(delta_query, start, indices_query)

    elif line.startswith('T ') and not line.startswith(hit_annotation_rows):
      # The first 17 characters must be 'T <hit_name> ', and everything after
      # that is parsed as:
      #              start    sequence       end     total_sequence_length
      row_pattern = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)'
      groups = _get_hhr_line_regex_groups(row_pattern, line[17:])
      start = int(groups[0]) - 1  # Make index zero based.
      delta_hit_sequence = groups[1]
      assert length_block == len(delta_hit_sequence)

      # Extend the hit sequence and its indices.
      hit_sequence += delta_hit_sequence
      _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit)

  return TemplateHit(
      index=hit_number,
      name=hit_name,
      aligned_cols=int(aligned_cols),
      sum_probs=sum_probs,
      query=query,
      hit_sequence=hit_sequence,
      indices_query=indices_query,
      indices_hit=indices_hit,
  )
332
+
333
+
334
def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
  """Parses the content of an entire HHR file.

  Args:
    hhr_string: The string contents of an .hhr file.

  Returns:
    One TemplateHit per hit paragraph, in file order; an empty list when the
    file holds no hit paragraph.
  """
  lines = hhr_string.splitlines()

  # Each .hhr file starts with a results table, then has a sequence of hit
  # "paragraphs", each paragraph starting with a line 'No <hit number>'.
  block_starts = [i for i, line in enumerate(lines) if line.startswith('No ')]
  if not block_starts:
    return []

  # A paragraph runs up to the start of the next one; the last one runs up to
  # the end of the file.
  block_ends = block_starts[1:] + [len(lines)]
  return [_parse_hhr_hit(lines[begin:end])
          for begin, end in zip(block_starts, block_ends)]
350
+
351
+
352
def parse_e_values_from_tblout(tblout: str) -> Dict[str, float]:
  """Parses a target name to E-value mapping from a Jackhmmer tblout string.

  Args:
    tblout: The string contents of a Jackhmmer tblout file.

  Returns:
    A dict mapping every target name to its full-sequence E-value. It always
    contains the entry 'query' with value 0.

  Raises:
    IndexError: If a data line has fewer than five fields.
    ValueError: If the E-value field of a data line is not a number.
  """
  e_values = {'query': 0}
  for line in tblout.splitlines():
    stripped = line.strip()
    # Blank lines carry no data (indexing their first character would raise
    # IndexError), and lines starting with '#' are comments.
    if not stripped or stripped.startswith('#'):
      continue
    # As per http://eddylab.org/software/hmmer/Userguide.pdf fields are
    # space-delimited. Relevant fields are (1) target name and
    # (5) E-value (full sequence) (numbering from 1).
    fields = stripped.split()
    e_values[fields[0]] = float(fields[4])
  return e_values
alphafold/alphafold/data/pipeline.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for building the input features for the AlphaFold model."""
16
+
17
+ import os
18
+ from typing import Mapping, Optional, Sequence
19
+ from absl import logging
20
+ from alphafold.common import residue_constants
21
+ from alphafold.data import parsers
22
+ from alphafold.data import templates
23
+ from alphafold.data.tools import hhblits
24
+ from alphafold.data.tools import hhsearch
25
+ from alphafold.data.tools import jackhmmer
26
+ import numpy as np
27
+
28
+ # Internal import (7716).
29
+
30
# Feature name -> numpy array holding that feature.
FeatureDict = Mapping[str, np.ndarray]
31
+
32
+
33
def make_sequence_features(
    sequence: str, description: str, num_res: int) -> FeatureDict:
  """Constructs a feature dict of sequence features.

  Args:
    sequence: The input sequence.
    description: Description of the sequence; stored UTF-8 encoded under
      'domain_name'.
    num_res: Length used for the per-residue features.

  Returns:
    A dict with the keys 'aatype', 'between_segment_residues', 'domain_name',
    'residue_index', 'seq_length' and 'sequence'.
  """
  aatype = residue_constants.sequence_to_onehot(
      sequence=sequence,
      mapping=residue_constants.restype_order_with_x,
      map_unknown_to_x=True)
  return {
      'aatype': aatype,
      'between_segment_residues': np.zeros((num_res,), dtype=np.int32),
      'domain_name': np.array([description.encode('utf-8')],
                              dtype=np.object_),
      # 0, 1, ..., num_res - 1.
      'residue_index': np.arange(num_res, dtype=np.int32),
      # The length itself, repeated once per residue.
      'seq_length': np.full(num_res, num_res, dtype=np.int32),
      'sequence': np.array([sequence.encode('utf-8')], dtype=np.object_),
  }
48
+
49
+
50
def make_msa_features(
    msas: Sequence[Sequence[str]],
    deletion_matrices: Sequence[parsers.DeletionMatrix]) -> FeatureDict:
  """Constructs a feature dict of MSA features.

  Sequences are taken from the MSAs in order; a sequence that was already
  seen (in this or an earlier MSA) is dropped together with its deletion row.

  Args:
    msas: The MSAs, each a non-empty sequence of aligned sequences.
    deletion_matrices: One deletion matrix per MSA, indexed like the MSA.

  Returns:
    A dict with the keys 'deletion_matrix_int', 'msa' and 'num_alignments'.

  Raises:
    ValueError: If `msas` is empty or one of the MSAs has no sequence.
  """
  if not msas:
    raise ValueError('At least one MSA must be provided.')

  encoded_rows = []
  kept_deletion_rows = []
  already_seen = set()
  for msa_index, msa in enumerate(msas):
    if not msa:
      raise ValueError(f'MSA {msa_index} must contain at least one sequence.')
    for sequence_index, sequence in enumerate(msa):
      if sequence not in already_seen:
        already_seen.add(sequence)
        encoded_rows.append(
            [residue_constants.HHBLITS_AA_TO_ID[res] for res in sequence])
        kept_deletion_rows.append(
            deletion_matrices[msa_index][sequence_index])

  # The very first sequence defines the number of residues.
  num_res = len(msas[0][0])
  num_alignments = len(encoded_rows)
  return {
      'deletion_matrix_int': np.array(kept_deletion_rows, dtype=np.int32),
      'msa': np.array(encoded_rows, dtype=np.int32),
      'num_alignments': np.array([num_alignments] * num_res, dtype=np.int32),
  }
79
+
80
+
81
class DataPipeline:
  """Runs the alignment tools and assembles the input features."""

  def __init__(self,
               jackhmmer_binary_path: str,
               hhblits_binary_path: str,
               hhsearch_binary_path: str,
               uniref90_database_path: str,
               mgnify_database_path: str,
               bfd_database_path: Optional[str],
               uniclust30_database_path: Optional[str],
               small_bfd_database_path: Optional[str],
               pdb70_database_path: str,
               template_featurizer: templates.TemplateHitFeaturizer,
               use_small_bfd: bool,
               mgnify_max_hits: int = 501,
               uniref_max_hits: int = 10000):
    """Creates the alignment tool runners used by `process`.

    Args:
      jackhmmer_binary_path: Binary handed to every Jackhmmer runner.
      hhblits_binary_path: Binary handed to the HHBlits runner.
      hhsearch_binary_path: Binary handed to the HHSearch runner.
      uniref90_database_path: Database searched by the UniRef90 Jackhmmer
        runner.
      mgnify_database_path: Database searched by the MGnify Jackhmmer runner.
      bfd_database_path: First HHBlits database; only used when
        `use_small_bfd` is False.
      uniclust30_database_path: Second HHBlits database; only used when
        `use_small_bfd` is False.
      small_bfd_database_path: Database searched with Jackhmmer instead of
        HHBlits; only used when `use_small_bfd` is True.
      pdb70_database_path: Database searched by the HHSearch runner.
      template_featurizer: Object whose `get_templates` turns the HHSearch
        hits into template features.
      use_small_bfd: Selects the small-BFD Jackhmmer search over the
        BFD/Uniclust30 HHBlits search.
      mgnify_max_hits: Number of leading MGnify MSA rows that are kept.
      uniref_max_hits: Maximum number of UniRef90 sequences passed on to
        HHSearch.
    """
    self._use_small_bfd = use_small_bfd
    self.jackhmmer_uniref90_runner = jackhmmer.Jackhmmer(
        binary_path=jackhmmer_binary_path,
        database_path=uniref90_database_path)
    # Only the runner for the selected BFD flavour is created.
    if use_small_bfd:
      self.jackhmmer_small_bfd_runner = jackhmmer.Jackhmmer(
          binary_path=jackhmmer_binary_path,
          database_path=small_bfd_database_path)
    else:
      self.hhblits_bfd_uniclust_runner = hhblits.HHBlits(
          binary_path=hhblits_binary_path,
          databases=[bfd_database_path, uniclust30_database_path])
    self.jackhmmer_mgnify_runner = jackhmmer.Jackhmmer(
        binary_path=jackhmmer_binary_path,
        database_path=mgnify_database_path)
    self.hhsearch_pdb70_runner = hhsearch.HHSearch(
        binary_path=hhsearch_binary_path,
        databases=[pdb70_database_path])
    self.template_featurizer = template_featurizer
    self.mgnify_max_hits = mgnify_max_hits
    self.uniref_max_hits = uniref_max_hits

  def process(self, input_fasta_path: str, msa_output_dir: str) -> FeatureDict:
    """Runs alignment tools on the input sequence and creates features.

    Args:
      input_fasta_path: Path of a FASTA file holding exactly one sequence.
      msa_output_dir: Existing directory into which the raw tool outputs
        ('uniref90_hits.sto', 'mgnify_hits.sto', 'pdb70_hits.hhr' and the BFD
        hits) are written.

    Returns:
      The sequence, MSA and template features merged into a single dict.

    Raises:
      ValueError: If the FASTA file holds no sequence or more than one.
    """
    with open(input_fasta_path) as f:
      input_fasta_str = f.read()
    input_seqs, input_descs = parsers.parse_fasta(input_fasta_str)
    # Report the two failure modes separately so that an empty file is not
    # described as holding too many sequences.
    if not input_seqs:
      raise ValueError(f'No input sequence found in {input_fasta_path}.')
    if len(input_seqs) != 1:
      raise ValueError(
          f'More than one input sequence found in {input_fasta_path}.')
    input_sequence = input_seqs[0]
    input_description = input_descs[0]
    num_res = len(input_sequence)

    jackhmmer_uniref90_result = self.jackhmmer_uniref90_runner.query(
        input_fasta_path)[0]
    jackhmmer_mgnify_result = self.jackhmmer_mgnify_runner.query(
        input_fasta_path)[0]

    # The template search is seeded with the (capped) UniRef90 alignment.
    uniref90_msa_as_a3m = parsers.convert_stockholm_to_a3m(
        jackhmmer_uniref90_result['sto'], max_sequences=self.uniref_max_hits)
    hhsearch_result = self.hhsearch_pdb70_runner.query(uniref90_msa_as_a3m)

    # Keep the raw tool outputs on disk.
    uniref90_out_path = os.path.join(msa_output_dir, 'uniref90_hits.sto')
    with open(uniref90_out_path, 'w') as f:
      f.write(jackhmmer_uniref90_result['sto'])

    mgnify_out_path = os.path.join(msa_output_dir, 'mgnify_hits.sto')
    with open(mgnify_out_path, 'w') as f:
      f.write(jackhmmer_mgnify_result['sto'])

    pdb70_out_path = os.path.join(msa_output_dir, 'pdb70_hits.hhr')
    with open(pdb70_out_path, 'w') as f:
      f.write(hhsearch_result)

    uniref90_msa, uniref90_deletion_matrix, _ = parsers.parse_stockholm(
        jackhmmer_uniref90_result['sto'])
    mgnify_msa, mgnify_deletion_matrix, _ = parsers.parse_stockholm(
        jackhmmer_mgnify_result['sto'])
    hhsearch_hits = parsers.parse_hhr(hhsearch_result)
    mgnify_msa = mgnify_msa[:self.mgnify_max_hits]
    mgnify_deletion_matrix = mgnify_deletion_matrix[:self.mgnify_max_hits]

    if self._use_small_bfd:
      jackhmmer_small_bfd_result = self.jackhmmer_small_bfd_runner.query(
          input_fasta_path)[0]

      # NOTE(review): the file is named '.a3m' but receives the Stockholm
      # ('sto') output; the name is kept so that existing consumers of the
      # output directory are not broken.
      bfd_out_path = os.path.join(msa_output_dir, 'small_bfd_hits.a3m')
      with open(bfd_out_path, 'w') as f:
        f.write(jackhmmer_small_bfd_result['sto'])

      bfd_msa, bfd_deletion_matrix, _ = parsers.parse_stockholm(
          jackhmmer_small_bfd_result['sto'])
    else:
      hhblits_bfd_uniclust_result = self.hhblits_bfd_uniclust_runner.query(
          input_fasta_path)

      bfd_out_path = os.path.join(msa_output_dir, 'bfd_uniclust_hits.a3m')
      with open(bfd_out_path, 'w') as f:
        f.write(hhblits_bfd_uniclust_result['a3m'])

      bfd_msa, bfd_deletion_matrix = parsers.parse_a3m(
          hhblits_bfd_uniclust_result['a3m'])

    templates_result = self.template_featurizer.get_templates(
        query_sequence=input_sequence,
        query_pdb_code=None,
        query_release_date=None,
        hits=hhsearch_hits)

    sequence_features = make_sequence_features(
        sequence=input_sequence,
        description=input_description,
        num_res=num_res)

    msa_features = make_msa_features(
        msas=(uniref90_msa, bfd_msa, mgnify_msa),
        deletion_matrices=(uniref90_deletion_matrix,
                           bfd_deletion_matrix,
                           mgnify_deletion_matrix))

    logging.info('Uniref90 MSA size: %d sequences.', len(uniref90_msa))
    logging.info('BFD MSA size: %d sequences.', len(bfd_msa))
    logging.info('MGnify MSA size: %d sequences.', len(mgnify_msa))
    logging.info('Final (deduplicated) MSA size: %d sequences.',
                 msa_features['num_alignments'][0])
    logging.info('Total number of templates (NB: this can include bad '
                 'templates and is later filtered to top 4): %d.',
                 templates_result.features['template_domain_names'].shape[0])

    return {**sequence_features, **msa_features, **templates_result.features}
alphafold/alphafold/data/templates.py ADDED
@@ -0,0 +1,922 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Functions for getting templates and calculating template features."""
16
+ import dataclasses
17
+ import datetime
18
+ import glob
19
+ import os
20
+ import re
21
+ from typing import Any, Dict, Mapping, Optional, Sequence, Tuple
22
+
23
+ from absl import logging
24
+ from alphafold.common import residue_constants
25
+ from alphafold.data import mmcif_parsing
26
+ from alphafold.data import parsers
27
+ from alphafold.data.tools import kalign
28
+ import numpy as np
29
+
30
+ # Internal import (7716).
31
+
32
+
33
class Error(Exception):
  """Root of the template processing exceptions defined below."""


class NoChainsError(Error):
  """Raised when the template mmCIF has no chains at all."""


class SequenceNotInTemplateError(Error):
  """Raised when the sequence is not found in the template mmCIF."""


class NoAtomDataInTemplateError(Error):
  """Raised when the template mmCIF holds no atom positions."""


class TemplateAtomMaskAllZerosError(Error):
  """Raised when every atom position of the template mmCIF is masked."""


class QueryToTemplateAlignError(Error):
  """Raised when the query cannot be aligned to the template."""


class CaDistanceError(Error):
  """Raised when a CA atom distance is above a threshold."""


class MultipleChainsError(Error):
  """Raised when more than one chain is found for a given ID."""
63
+
64
+
65
+ # Prefilter exceptions.
66
class PrefilterError(Exception):
  """Root of the exceptions raised by the template prefilter."""


class DateError(PrefilterError):
  """Raised when the hit date is later than the max allowed date."""


class PdbIdError(PrefilterError):
  """Raised when the hit has the same PDB ID as the query."""


class AlignRatioError(PrefilterError):
  """Raised when the align ratio of the hit to the query is too small."""


class DuplicateError(PrefilterError):
  """Raised when the hit is an exact subsequence of the query."""


class LengthError(PrefilterError):
  """Raised when the hit is too short."""
88
+
89
+
90
# Names of the template features and the numpy dtype of each one.
# The builtin `object` is used for the string-valued features: `np.object` was
# only a deprecated alias for it and no longer exists in NumPy >= 1.24, where
# referencing it makes this module fail at import time.
TEMPLATE_FEATURES = {
    'template_aatype': np.float32,
    'template_all_atom_masks': np.float32,
    'template_all_atom_positions': np.float32,
    'template_domain_names': object,
    'template_sequence': object,
    'template_sum_probs': np.float32,
}
98
+
99
+
100
+ def _get_pdb_id_and_chain(hit: parsers.TemplateHit) -> Tuple[str, str]:
101
+ """Returns PDB id and chain id for an HHSearch Hit."""
102
+ # PDB ID: 4 letters. Chain ID: 1+ alphanumeric letters or "." if unknown.
103
+ id_match = re.match(r'[a-zA-Z\d]{4}_[a-zA-Z0-9.]+', hit.name)
104
+ if not id_match:
105
+ raise ValueError(f'hit.name did not start with PDBID_chain: {hit.name}')
106
+ pdb_id, chain_id = id_match.group(0).split('_')
107
+ return pdb_id.lower(), chain_id
108
+
109
+
110
+ def _is_after_cutoff(
111
+ pdb_id: str,
112
+ release_dates: Mapping[str, datetime.datetime],
113
+ release_date_cutoff: Optional[datetime.datetime]) -> bool:
114
+ """Checks if the template date is after the release date cutoff.
115
+
116
+ Args:
117
+ pdb_id: 4 letter pdb code.
118
+ release_dates: Dictionary mapping PDB ids to their structure release dates.
119
+ release_date_cutoff: Max release date that is valid for this query.
120
+
121
+ Returns:
122
+ True if the template release date is after the cutoff, False otherwise.
123
+ """
124
+ if release_date_cutoff is None:
125
+ raise ValueError('The release_date_cutoff must not be None.')
126
+ if pdb_id in release_dates:
127
+ return release_dates[pdb_id] > release_date_cutoff
128
+ else:
129
+ # Since this is just a quick prefilter to reduce the number of mmCIF files
130
+ # we need to parse, we don't have to worry about returning True here.
131
+ logging.warning('Template structure not in release dates dict: %s', pdb_id)
132
+ return False
133
+
134
+
135
+ def _parse_obsolete(obsolete_file_path: str) -> Mapping[str, Optional[str]]:
136
+ """Parses the data file from PDB that lists which pdb_ids are obsolete."""
137
+ with open(obsolete_file_path) as f:
138
+ result = {}
139
+ for line in f:
140
+ line = line.strip()
141
+ # Format: Date From To
142
+ # 'OBSLTE 06-NOV-19 6G9Y' - Removed, rare
143
+ # 'OBSLTE 31-JUL-94 116L 216L' - Replaced, common
144
+ # 'OBSLTE 26-SEP-06 2H33 2JM5 2OWI' - Replaced by multiple, rare
145
+ if line.startswith('OBSLTE'):
146
+ if len(line) > 30:
147
+ # Replaced by at least one structure.
148
+ from_id = line[20:24].lower()
149
+ to_id = line[29:33].lower()
150
+ result[from_id] = to_id
151
+ elif len(line) == 24:
152
+ # Removed.
153
+ from_id = line[20:24].lower()
154
+ result[from_id] = None
155
+ return result
156
+
157
+
158
+ def _parse_release_dates(path: str) -> Mapping[str, datetime.datetime]:
159
+ """Parses release dates file, returns a mapping from PDBs to release dates."""
160
+ if path.endswith('txt'):
161
+ release_dates = {}
162
+ with open(path, 'r') as f:
163
+ for line in f:
164
+ pdb_id, date = line.split(':')
165
+ date = date.strip()
166
+ # Python 3.6 doesn't have datetime.date.fromisoformat() which is about
167
+ # 90x faster than strptime. However, splitting the string manually is
168
+ # about 10x faster than strptime.
169
+ release_dates[pdb_id.strip()] = datetime.datetime(
170
+ year=int(date[:4]), month=int(date[5:7]), day=int(date[8:10]))
171
+ return release_dates
172
+ else:
173
+ raise ValueError('Invalid format of the release date file %s.' % path)
174
+
175
+
176
def _assess_hhsearch_hit(
    hit: parsers.TemplateHit,
    hit_pdb_code: str,
    query_sequence: str,
    query_pdb_code: Optional[str],
    release_dates: Mapping[str, datetime.datetime],
    release_date_cutoff: datetime.datetime,
    max_subsequence_ratio: float = 0.95,
    min_align_ratio: float = 0.1) -> bool:
  """Prefilters a template hit without parsing the template mmcif file.

  The checks run in a fixed order (date, PDB ID, align ratio, duplicate,
  length); the first one that fails raises its exception.

  Args:
    hit: HhrHit for the template.
    hit_pdb_code: The 4 letter pdb code of the template hit. This might be
      different from the value in the actual hit since the original pdb might
      have become obsolete.
    query_sequence: Amino acid sequence of the query.
    query_pdb_code: 4 letter pdb code of the query.
    release_dates: Dictionary mapping pdb codes to their structure release
      dates.
    release_date_cutoff: Max release date that is valid for this query.
    max_subsequence_ratio: Exclude any exact matches with this much overlap.
    min_align_ratio: Minimum overlap between the template and query.

  Returns:
    True if the hit passed the prefilter. Raises an exception otherwise.

  Raises:
    DateError: If the hit date was after the max allowed date.
    PdbIdError: If the hit PDB ID was identical to the query.
    AlignRatioError: If the hit align ratio to the query was too small.
    DuplicateError: If the hit was an exact subsequence of the query.
    LengthError: If the hit was too short.
  """
  # Fraction of the query covered by aligned columns.
  align_ratio = hit.aligned_cols / len(query_sequence)

  # Template residues only; gap characters do not count towards its length.
  template_sequence = hit.hit_sequence.replace('-', '')
  template_length = len(template_sequence)
  length_ratio = float(template_length) / len(query_sequence)

  if _is_after_cutoff(hit_pdb_code, release_dates, release_date_cutoff):
    raise DateError(f'Date ({release_dates[hit_pdb_code]}) > max template date '
                    f'({release_date_cutoff}).')

  same_entry_as_query = (
      query_pdb_code is not None and
      query_pdb_code.lower() == hit_pdb_code.lower())
  if same_entry_as_query:
    raise PdbIdError('PDB code identical to Query PDB code.')

  if align_ratio <= min_align_ratio:
    raise AlignRatioError('Proportion of residues aligned to query too small. '
                          f'Align ratio: {align_ratio}.')

  # A template that is a large exact subsequence of the query is treated as a
  # duplicate. This can happen due to duplicate entries in the PDB database.
  if (length_ratio > max_subsequence_ratio and
      template_sequence in query_sequence):
    raise DuplicateError('Template is an exact subsequence of query with large '
                         f'coverage. Length ratio: {length_ratio}.')

  if template_length < 10:
    raise LengthError(f'Template too short. Length: {template_length}.')

  return True
241
+
242
+
243
def _find_template_in_pdb(
    template_chain_id: str,
    template_sequence: str,
    mmcif_object: mmcif_parsing.MmcifObject) -> Tuple[str, str, int]:
  """Locates the template sequence in one of the chains of the mmCIF object.

  Three strategies are attempted, and the first success is returned:
    1. The requested chain ID contains the template as an exact substring.
    2. Any chain contains the template as an exact substring.
    3. Any chain fuzzy-matches the template, where 'X' on either side acts as
       a wildcard.

  Args:
    template_chain_id: The template chain ID.
    template_sequence: The template chain sequence.
    mmcif_object: The PDB object to search for the template in.

  Returns:
    A tuple with:
    * The chain sequence that was found to match the template in the PDB object.
    * The ID of the chain that is being returned.
    * The offset where the template sequence starts in the chain sequence.

  Raises:
    SequenceNotInTemplateError: If none of the three strategies finds a match.
  """
  pdb_id = mmcif_object.file_id
  chains = mmcif_object.chain_to_seqres

  # 1. Exact (sub)sequence match in the requested chain.
  requested_sequence = chains.get(template_chain_id)
  if requested_sequence:
    offset = requested_sequence.find(template_sequence)
    if offset != -1:
      logging.info(
          'Found an exact template match %s_%s.', pdb_id, template_chain_id)
      return requested_sequence, template_chain_id, offset

  # 2. Exact (sub)sequence match in any chain.
  for chain_id, chain_sequence in chains.items():
    if not chain_sequence:
      continue
    offset = chain_sequence.find(template_sequence)
    if offset != -1:
      logging.info('Found a sequence-only match %s_%s.', pdb_id, chain_id)
      return chain_sequence, chain_id, offset

  # 3. Fuzzy match (X = wildcard) in any chain.
  # Make parentheses unnamed groups (?:_) to avoid the 100 named groups limit.
  pattern = re.compile(''.join(
      '.' if aa == 'X' else '(?:%s|X)' % aa for aa in template_sequence))
  for chain_id, chain_sequence in chains.items():
    fuzzy_match = pattern.search(chain_sequence)
    if fuzzy_match:
      logging.info('Found a fuzzy sequence-only match %s_%s.', pdb_id, chain_id)
      return chain_sequence, chain_id, fuzzy_match.start()

  # No hits, raise an error.
  raise SequenceNotInTemplateError(
      'Could not find the template sequence in %s_%s. Template sequence: %s, '
      'chain_to_seqres: %s' % (pdb_id, template_chain_id, template_sequence,
                               mmcif_object.chain_to_seqres))
305
+
306
+
307
def _realign_pdb_template_to_query(
    old_template_sequence: str,
    template_chain_id: str,
    mmcif_object: mmcif_parsing.MmcifObject,
    old_mapping: Mapping[int, int],
    kalign_binary_path: str) -> Tuple[str, Mapping[int, int]]:
  """Aligns template from the mmcif_object to the query.

  In case PDB70 contains a different version of the template sequence, we need
  to perform a realignment to the actual sequence that is in the mmCIF file.
  This method performs such realignment, but returns the new sequence and
  mapping only if the sequence in the mmCIF file is 90% identical to the old
  sequence.

  Note that the old_template_sequence comes from the hit, and contains only that
  part of the chain that matches with the query while the new_template_sequence
  is the full chain.

  Args:
    old_template_sequence: The template sequence that was returned by the PDB
      template search (typically done using HHSearch).
    template_chain_id: The template chain id was returned by the PDB template
      search (typically done using HHSearch). This is used to find the right
      chain in the mmcif_object chain_to_seqres mapping.
    mmcif_object: A mmcif_object which holds the actual template data.
    old_mapping: A mapping from the query sequence to the template sequence.
      This mapping will be used to compute the new mapping from the query
      sequence to the actual mmcif_object template sequence by aligning the
      old_template_sequence and the actual template sequence.
    kalign_binary_path: The path to a kalign executable.

  Returns:
    A tuple (new_template_sequence, new_query_to_template_mapping) where:
    * new_template_sequence is the actual template sequence that was found in
      the mmcif_object.
    * new_query_to_template_mapping is the new mapping from the query to the
      actual template found in the mmcif_object. Query positions whose old
      template residue has no counterpart in the new sequence map to -1.

  Raises:
    QueryToTemplateAlignError:
    * If the chain cannot be found and the file holds more than one chain.
    * If there was an error thrown by the alignment tool.
    * Or if the actual template sequence differs by more than 10% from the
      old_template_sequence.
  """
  aligner = kalign.Kalign(binary_path=kalign_binary_path)
  new_template_sequence = mmcif_object.chain_to_seqres.get(
      template_chain_id, '')

  # Sometimes the template chain id is unknown. But if there is only a single
  # sequence within the mmcif_object, it is safe to assume it is that one.
  if not new_template_sequence:
    if len(mmcif_object.chain_to_seqres) == 1:
      logging.info('Could not find %s in %s, but there is only 1 sequence, so '
                   'using that one.',
                   template_chain_id,
                   mmcif_object.file_id)
      new_template_sequence = list(mmcif_object.chain_to_seqres.values())[0]
    else:
      raise QueryToTemplateAlignError(
          f'Could not find chain {template_chain_id} in {mmcif_object.file_id}. '
          'If there are no mmCIF parsing errors, it is possible it was not a '
          'protein chain.')

  try:
    (old_aligned_template, new_aligned_template), _ = parsers.parse_a3m(
        aligner.align([old_template_sequence, new_template_sequence]))
  except Exception as e:
    # Any aligner/parser failure is reported as an alignment error; the
    # original exception is kept as the explicit cause for debugging.
    raise QueryToTemplateAlignError(
        'Could not align old template %s to template %s (%s_%s). Error: %s' %
        (old_template_sequence, new_template_sequence, mmcif_object.file_id,
         template_chain_id, str(e))) from e

  logging.info('Old aligned template: %s\nNew aligned template: %s',
               old_aligned_template, new_aligned_template)

  # Walk both aligned strings in lockstep. The indices count residues (not
  # alignment columns), so they only advance on non-gap characters.
  old_to_new_template_mapping = {}
  old_template_index = -1
  new_template_index = -1
  num_same = 0
  for old_template_aa, new_template_aa in zip(
      old_aligned_template, new_aligned_template):
    if old_template_aa != '-':
      old_template_index += 1
    if new_template_aa != '-':
      new_template_index += 1
    if old_template_aa != '-' and new_template_aa != '-':
      old_to_new_template_mapping[old_template_index] = new_template_index
      if old_template_aa == new_template_aa:
        num_same += 1

  # Require at least 90 % sequence identity wrt to the shorter of the sequences.
  if float(num_same) / min(
      len(old_template_sequence), len(new_template_sequence)) < 0.9:
    raise QueryToTemplateAlignError(
        'Insufficient similarity of the sequence in the database: %s to the '
        'actual sequence in the mmCIF file %s_%s: %s. We require at least '
        '90 %% similarity wrt to the shorter of the sequences. This is not a '
        'problem unless you think this is a template that should be included.' %
        (old_template_sequence, mmcif_object.file_id, template_chain_id,
         new_template_sequence))

  # Compose query -> old template -> new template; -1 marks "no counterpart".
  new_query_to_template_mapping = {}
  for query_index, old_template_index in old_mapping.items():
    new_query_to_template_mapping[query_index] = (
        old_to_new_template_mapping.get(old_template_index, -1))

  new_template_sequence = new_template_sequence.replace('-', '')

  return new_template_sequence, new_query_to_template_mapping
416
+
417
+
418
def _check_residue_distances(all_positions: np.ndarray,
                             all_positions_mask: np.ndarray,
                             max_ca_ca_distance: float):
  """Checks if the distance between unmasked neighbor residues is ok.

  Only directly consecutive residues that both have an unmasked CA atom are
  compared.

  Args:
    all_positions: Per-residue atom coordinates, indexed by residue and then
      by atom type.
    all_positions_mask: Per-residue atom mask with matching indexing.
    max_ca_ca_distance: Largest allowed CA-CA distance between neighbors.

  Raises:
    CaDistanceError: If two neighboring CA atoms are further apart than
      max_ca_ca_distance.
  """
  ca_index = residue_constants.atom_order['CA']
  # CA coordinates of the immediately preceding residue, or None if that
  # residue has no unmasked CA (or there is no preceding residue).
  previous_ca = None
  for i, (coords, mask) in enumerate(zip(all_positions, all_positions_mask)):
    if not mask[ca_index]:
      previous_ca = None
      continue
    current_ca = coords[ca_index]
    if previous_ca is not None:
      distance = np.linalg.norm(current_ca - previous_ca)
      if distance > max_ca_ca_distance:
        raise CaDistanceError(
            'The distance between residues %d and %d is %f > limit %f.' % (
                i, i + 1, distance, max_ca_ca_distance))
    previous_ca = current_ca
437
+
438
+
439
def _get_atom_positions(
    mmcif_object: mmcif_parsing.MmcifObject,
    auth_chain_id: str,
    max_ca_ca_distance: float) -> Tuple[np.ndarray, np.ndarray]:
  """Builds per-residue atom coordinates and masks for one chain.

  Residues flagged as missing keep all-zero coordinates and mask. For MSE
  residues the selenium atom is stored in the 'SD' (sulphur) slot.

  Args:
    mmcif_object: Parsed mmCIF object holding the structure.
    auth_chain_id: ID of the chain to extract.
    max_ca_ca_distance: Passed to _check_residue_distances to validate the
      extracted coordinates.

  Returns:
    A tuple (all_positions, all_positions_mask) with one row per residue of
    the chain's seqres and one column per atom type.

  Raises:
    MultipleChainsError: If the structure does not contain exactly one chain
      with the given ID.
  """
  num_res = len(mmcif_object.chain_to_seqres[auth_chain_id])

  matching_chains = [chain for chain in mmcif_object.structure.get_chains()
                     if chain.id == auth_chain_id]
  if len(matching_chains) != 1:
    raise MultipleChainsError(
        f'Expected exactly one chain in structure with id {auth_chain_id}.')
  chain = matching_chains[0]

  atom_order = residue_constants.atom_order
  num_atom_types = residue_constants.atom_type_num
  seqres_to_structure = mmcif_object.seqres_to_structure[auth_chain_id]

  all_positions = np.zeros([num_res, num_atom_types, 3])
  all_positions_mask = np.zeros([num_res, num_atom_types], dtype=np.int64)
  for res_index in range(num_res):
    res_at_position = seqres_to_structure[res_index]
    if res_at_position.is_missing:
      # Rows are already zero-initialised; nothing to fill in.
      continue

    # Per-residue float32 buffers, copied into the output rows afterwards.
    pos = np.zeros([num_atom_types, 3], dtype=np.float32)
    mask = np.zeros([num_atom_types], dtype=np.float32)
    res = chain[(res_at_position.hetflag,
                 res_at_position.position.residue_number,
                 res_at_position.position.insertion_code)]
    for atom in res.get_atoms():
      atom_name = atom.get_name()
      x, y, z = atom.get_coord()
      if atom_name in atom_order:
        slot = atom_order[atom_name]
      elif atom_name.upper() == 'SE' and res.get_resname() == 'MSE':
        # Put the coordinates of the selenium atom in the sulphur column.
        slot = atom_order['SD']
      else:
        continue
      pos[slot] = [x, y, z]
      mask[slot] = 1.0

    all_positions[res_index] = pos
    all_positions_mask[res_index] = mask

  _check_residue_distances(
      all_positions, all_positions_mask, max_ca_ca_distance)
  return all_positions, all_positions_mask
480
+
481
+
482
def _extract_template_features(
    mmcif_object: mmcif_parsing.MmcifObject,
    pdb_id: str,
    mapping: Mapping[int, int],
    template_sequence: str,
    query_sequence: str,
    template_chain_id: str,
    kalign_binary_path: str) -> Tuple[Dict[str, Any], Optional[str]]:
  """Parses atom positions in the target structure and aligns with the query.

  Atoms for each residue in the template structure are indexed to coincide
  with their corresponding residue in the query sequence, according to the
  alignment mapping provided.

  Args:
    mmcif_object: mmcif_parsing.MmcifObject representing the template.
    pdb_id: PDB code for the template.
    mapping: Dictionary mapping indices in the query sequence to indices in
      the template sequence.
    template_sequence: String describing the amino acid sequence for the
      template protein.
    query_sequence: String describing the amino acid sequence for the query
      protein.
    template_chain_id: String ID describing which chain in the structure proto
      should be used.
    kalign_binary_path: The path to a kalign executable used for template
      realignment.

  Returns:
    A tuple with:
    * A dictionary containing the extra features derived from the template
      protein structure.
    * A warning message if the hit was realigned to the actual mmCIF sequence.
      Otherwise None.

  Raises:
    NoChainsError: If the mmcif object doesn't contain any chains.
    SequenceNotInTemplateError: If the given chain id / sequence can't
      be found in the mmcif object.
    QueryToTemplateAlignError: If the actual template in the mmCIF file
      can't be aligned to the query.
    NoAtomDataInTemplateError: If the mmcif object doesn't contain
      atom positions.
    TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
      unmasked residues.
  """
  if mmcif_object is None or not mmcif_object.chain_to_seqres:
    raise NoChainsError('No chains in PDB: %s_%s' % (pdb_id, template_chain_id))

  warning = None
  try:
    seqres, chain_id, mapping_offset = _find_template_in_pdb(
        template_chain_id=template_chain_id,
        template_sequence=template_sequence,
        mmcif_object=mmcif_object)
  except SequenceNotInTemplateError:
    # If PDB70 contains a different version of the template, we use the sequence
    # from the mmcif_object.
    chain_id = template_chain_id
    warning = (
        f'The exact sequence {template_sequence} was not found in '
        f'{pdb_id}_{chain_id}. Realigning the template to the actual sequence.')
    logging.warning(warning)
    # This throws an exception if it fails to realign the hit.
    seqres, mapping = _realign_pdb_template_to_query(
        old_template_sequence=template_sequence,
        template_chain_id=template_chain_id,
        mmcif_object=mmcif_object,
        old_mapping=mapping,
        kalign_binary_path=kalign_binary_path)
    logging.info('Sequence in %s_%s: %s successfully realigned to %s',
                 pdb_id, chain_id, template_sequence, seqres)
    # The template sequence changed.
    template_sequence = seqres
    # No mapping offset, the query is aligned to the actual sequence.
    mapping_offset = 0

  try:
    # Essentially set to infinity - we don't want to reject templates unless
    # they're really really bad.
    all_atom_positions, all_atom_mask = _get_atom_positions(
        mmcif_object, chain_id, max_ca_ca_distance=150.0)
  except (CaDistanceError, KeyError) as ex:
    raise NoAtomDataInTemplateError(
        'Could not get atom data (%s_%s): %s' % (pdb_id, chain_id, str(ex))
        ) from ex

  # Start with "not in the template" for every query residue: zero
  # coordinates, zero mask and a gap character.
  num_query_res = len(query_sequence)
  output_templates_sequence = ['-'] * num_query_res
  templates_all_atom_positions = [
      np.zeros((residue_constants.atom_type_num, 3))
      for _ in range(num_query_res)]
  templates_all_atom_masks = [
      np.zeros(residue_constants.atom_type_num) for _ in range(num_query_res)]

  for k, v in mapping.items():
    if v < 0:
      # A realigned mapping uses -1 for query residues without a template
      # counterpart. Using it as an index would silently pick the last
      # template residue, so such positions are left as gaps.
      continue
    template_index = v + mapping_offset
    templates_all_atom_positions[k] = all_atom_positions[template_index]
    templates_all_atom_masks[k] = all_atom_mask[template_index]
    output_templates_sequence[k] = template_sequence[v]

  # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
  if np.sum(templates_all_atom_masks) < 5:
    # The mapping may be empty or hold only unmapped entries; fall back to -1
    # so that building the message cannot mask the intended error.
    mapped_indices = [v + mapping_offset for v in mapping.values() if v >= 0]
    raise TemplateAtomMaskAllZerosError(
        'Template all atom mask was all zeros: %s_%s. Residue range: %d-%d' %
        (pdb_id, chain_id, min(mapped_indices, default=-1),
         max(mapped_indices, default=-1)))

  output_templates_sequence = ''.join(output_templates_sequence)

  templates_aatype = residue_constants.sequence_to_onehot(
      output_templates_sequence, residue_constants.HHBLITS_AA_TO_ID)

  return (
      {
          'template_all_atom_positions': np.array(templates_all_atom_positions),
          'template_all_atom_masks': np.array(templates_all_atom_masks),
          'template_sequence': output_templates_sequence.encode(),
          'template_aatype': np.array(templates_aatype),
          'template_domain_names': f'{pdb_id.lower()}_{chain_id}'.encode(),
      },
      warning)
610
+
611
+
612
def _build_query_to_hit_index_mapping(
    hit_query_sequence: str,
    hit_sequence: str,
    indices_hit: Sequence[int],
    indices_query: Sequence[int],
    original_query_sequence: str) -> Mapping[int, int]:
  """Gets mapping from indices in original query sequence to indices in the hit.

  hit_query_sequence and hit_sequence are two aligned sequences containing gap
  characters. hit_query_sequence contains only the part of the original query
  sequence that matched the hit. When interpreting the indices from the .hhr, we
  need to correct for this to recover a mapping from original query sequence to
  the hit sequence.

  Args:
    hit_query_sequence: The portion of the query sequence that is in the .hhr
      hit
    hit_sequence: The portion of the hit sequence that is in the .hhr
    indices_hit: The indices for each aminoacid relative to the hit sequence
    indices_query: The indices for each aminoacid relative to the original query
      sequence
    original_query_sequence: String describing the original query sequence.

  Returns:
    Dictionary with indices in the original query sequence as keys and indices
    in the hit sequence as values. Empty if the hit is empty or if either index
    list contains only gap markers (-1).
  """
  # If the hit is empty (no aligned residues), return empty mapping
  if not hit_query_sequence:
    return {}

  # Index of -1 used for gap characters. Find the min index ignoring gaps.
  min_hit_idx = min((x for x in indices_hit if x > -1), default=None)
  min_query_idx = min((x for x in indices_query if x > -1), default=None)
  if min_hit_idx is None or min_query_idx is None:
    # Only gaps on one side: nothing can be mapped.
    return {}

  # Remove gaps and find the offset of hit.query relative to original query.
  # NOTE(review): find() returns -1 when the gapless hit query is not a
  # substring of the original query; that case is not handled specially.
  hhsearch_query_sequence = hit_query_sequence.replace('-', '')
  hhsearch_query_offset = original_query_sequence.find(hhsearch_query_sequence)
  ungapped_hit_length = len(hit_sequence.replace('-', ''))

  # Rebase both index lists so that their first aligned residue is 0.
  fixed_indices_hit = [
      x - min_hit_idx if x > -1 else -1 for x in indices_hit]
  fixed_indices_query = [
      x - min_query_idx if x > -1 else -1 for x in indices_query]

  # Zip the corrected indices, ignore columns where either side is a gap and
  # pairs that fall outside the hit or the original query.
  mapping = {}
  for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit):
    if q_t == -1 or q_i == -1:
      continue
    if (q_t >= ungapped_hit_length or
        q_i + hhsearch_query_offset >= len(original_query_sequence)):
      continue
    mapping[q_i + hhsearch_query_offset] = q_t

  return mapping
667
+
668
+
669
@dataclasses.dataclass(frozen=True)
class SingleHitResult:
  """Immutable outcome of processing one template hit."""
  # Extracted template features, or None if the hit was skipped or failed.
  features: Optional[Mapping[str, Any]]
  # Error message for this hit, or None.
  error: Optional[str]
  # Warning message for this hit, or None.
  warning: Optional[str]
674
+
675
+
676
def _process_single_hit(
    query_sequence: str,
    query_pdb_code: Optional[str],
    hit: parsers.TemplateHit,
    mmcif_dir: str,
    max_template_date: datetime.datetime,
    release_dates: Mapping[str, datetime.datetime],
    obsolete_pdbs: Mapping[str, Optional[str]],
    kalign_binary_path: str,
    strict_error_check: bool = False) -> SingleHitResult:
  """Tries to extract template features from a single HHSearch hit.

  Args:
    query_sequence: Amino acid sequence of the query.
    query_pdb_code: PDB code of the query, if any.
    hit: The template hit to process.
    mmcif_dir: Directory containing '<pdb_code>.cif' files.
    max_template_date: Templates released after this date are rejected.
    release_dates: Mapping from PDB codes to release dates.
    obsolete_pdbs: Mapping from obsolete PDB codes to their replacement, or to
      None when the entry was removed.
    kalign_binary_path: Path to a kalign executable used for realignment.
    strict_error_check: If True, date / PDB ID / duplicate prefilter failures
      and missing-data failures are reported as errors instead of being
      skipped or reported as warnings.

  Returns:
    A SingleHitResult holding the features on success, otherwise an optional
    error or warning message.
  """
  # Fail hard if we can't get the PDB ID and chain name from the hit.
  hit_pdb_code, hit_chain_id = _get_pdb_id_and_chain(hit)

  # This hit has been removed (obsoleted) from PDB, skip it.
  if hit_pdb_code in obsolete_pdbs and obsolete_pdbs[hit_pdb_code] is None:
    return SingleHitResult(
        features=None, error=None, warning=f'Hit {hit_pdb_code} is obsolete.')

  # An entry without a known release date that has been superseded is looked
  # up under its replacement code from here on.
  if hit_pdb_code not in release_dates and hit_pdb_code in obsolete_pdbs:
    hit_pdb_code = obsolete_pdbs[hit_pdb_code]

  # Pass hit_pdb_code since it might have changed due to the pdb being obsolete.
  try:
    _assess_hhsearch_hit(
        hit=hit,
        hit_pdb_code=hit_pdb_code,
        query_sequence=query_sequence,
        query_pdb_code=query_pdb_code,
        release_dates=release_dates,
        release_date_cutoff=max_template_date)
  except PrefilterError as e:
    msg = f'hit {hit_pdb_code}_{hit_chain_id} did not pass prefilter: {str(e)}'
    logging.info('%s: %s', query_pdb_code, msg)
    # In strict mode we treat some prefilter cases as errors.
    is_strict_failure = strict_error_check and isinstance(
        e, (DateError, PdbIdError, DuplicateError))
    return SingleHitResult(
        features=None, error=msg if is_strict_failure else None, warning=None)

  mapping = _build_query_to_hit_index_mapping(
      hit.query, hit.hit_sequence, hit.indices_hit, hit.indices_query,
      query_sequence)

  # The mapping is from the query to the actual hit sequence, so we need to
  # remove gaps (which regardless have a missing confidence score).
  template_sequence = hit.hit_sequence.replace('-', '')

  cif_path = os.path.join(mmcif_dir, hit_pdb_code + '.cif')
  logging.info('Reading PDB entry from %s. Query: %s, template: %s',
               cif_path, query_sequence, template_sequence)
  # Fail if we can't find the mmCIF file.
  with open(cif_path, 'r') as cif_file:
    cif_string = cif_file.read()

  parsing_result = mmcif_parsing.parse(
      file_id=hit_pdb_code, mmcif_string=cif_string)
  mmcif_object = parsing_result.mmcif_object

  # Re-check the release date against the one stored in the mmCIF header.
  if mmcif_object is not None:
    hit_release_date = datetime.datetime.strptime(
        mmcif_object.header['release_date'], '%Y-%m-%d')
    if hit_release_date > max_template_date:
      error = ('Template %s date (%s) > max template date (%s).' %
               (hit_pdb_code, hit_release_date, max_template_date))
      if strict_error_check:
        return SingleHitResult(features=None, error=error, warning=None)
      logging.warning(error)
      return SingleHitResult(features=None, error=None, warning=None)

  def _describe_failure(exc: Exception) -> str:
    """Formats a feature extraction failure for this hit."""
    return ('%s_%s (sum_probs: %.2f, rank: %d): feature extracting errors: '
            '%s, mmCIF parsing errors: %s'
            % (hit_pdb_code, hit_chain_id, hit.sum_probs, hit.index,
               str(exc), parsing_result.errors))

  try:
    features, realign_warning = _extract_template_features(
        mmcif_object=mmcif_object,
        pdb_id=hit_pdb_code,
        mapping=mapping,
        template_sequence=template_sequence,
        query_sequence=query_sequence,
        template_chain_id=hit_chain_id,
        kalign_binary_path=kalign_binary_path)
    features['template_sum_probs'] = [hit.sum_probs]

    # It is possible there were some errors when parsing the other chains in the
    # mmCIF file, but the template features for the chain we want were still
    # computed. In such case the mmCIF parsing errors are not relevant.
    return SingleHitResult(
        features=features, error=None, warning=realign_warning)
  except (NoChainsError, NoAtomDataInTemplateError,
          TemplateAtomMaskAllZerosError) as e:
    # These 3 errors indicate missing mmCIF experimental data rather than a
    # problem with the template search, so turn them into warnings.
    message = _describe_failure(e)
    if strict_error_check:
      return SingleHitResult(features=None, error=message, warning=None)
    return SingleHitResult(features=None, error=None, warning=message)
  except Error as e:
    return SingleHitResult(
        features=None, error=_describe_failure(e), warning=None)
782
+
783
+
784
@dataclasses.dataclass(frozen=True)
class TemplateSearchResult:
  """Immutable outcome of a template search over a set of hits."""
  # Template features keyed by feature name.
  features: Mapping[str, Any]
  # Error messages collected from the processed hits.
  errors: Sequence[str]
  # Warning messages collected from the processed hits.
  warnings: Sequence[str]
789
+
790
+
791
class TemplateHitFeaturizer:
  """A class for turning hhr hits to template features."""

  def __init__(
      self,
      mmcif_dir: str,
      max_template_date: str,
      max_hits: int,
      kalign_binary_path: str,
      release_dates_path: Optional[str],
      obsolete_pdbs_path: Optional[str],
      strict_error_check: bool = False):
    """Initializes the Template Search.

    Args:
      mmcif_dir: Path to a directory with mmCIF structures. Once a template ID
        is found by HHSearch, this directory is used to retrieve the template
        data.
      max_template_date: The maximum date permitted for template structures. No
        template with date higher than this date will be returned. In ISO8601
        date format, YYYY-MM-DD.
      max_hits: The maximum number of templates that will be returned.
      kalign_binary_path: The path to a kalign executable used for template
        realignment.
      release_dates_path: An optional path to a file with a mapping from PDB IDs
        to their release dates. Thanks to this we don't have to redundantly
        parse mmCIF files to get that information.
      obsolete_pdbs_path: An optional path to a file containing a mapping from
        obsolete PDB IDs to the PDB IDs of their replacements.
      strict_error_check: If True, then the following will be treated as errors:
        * If any template date is after the max_template_date.
        * If any template has identical PDB ID to the query.
        * If any template is a duplicate of the query.
        * Any feature computation errors.

    Raises:
      ValueError: If mmcif_dir contains no '*.cif' files, or if
        max_template_date is not in YYYY-MM-DD format.
    """
    self._mmcif_dir = mmcif_dir
    # Only existence matters, so stop at the first match instead of listing
    # every CIF file in the directory.
    cif_pattern = os.path.join(self._mmcif_dir, '*.cif')
    if next(glob.iglob(cif_pattern), None) is None:
      logging.error('Could not find CIFs in %s', self._mmcif_dir)
      raise ValueError(f'Could not find CIFs in {self._mmcif_dir}')

    try:
      self._max_template_date = datetime.datetime.strptime(
          max_template_date, '%Y-%m-%d')
    except ValueError as e:
      raise ValueError(
          'max_template_date must be set and have format YYYY-MM-DD.') from e
    self._max_hits = max_hits
    self._kalign_binary_path = kalign_binary_path
    self._strict_error_check = strict_error_check

    if release_dates_path:
      logging.info('Using precomputed release dates %s.', release_dates_path)
      self._release_dates = _parse_release_dates(release_dates_path)
    else:
      self._release_dates = {}

    if obsolete_pdbs_path:
      logging.info('Using precomputed obsolete pdbs %s.', obsolete_pdbs_path)
      self._obsolete_pdbs = _parse_obsolete(obsolete_pdbs_path)
    else:
      self._obsolete_pdbs = {}

  def get_templates(
      self,
      query_sequence: str,
      query_pdb_code: Optional[str],
      query_release_date: Optional[datetime.datetime],
      hits: Sequence[parsers.TemplateHit]) -> TemplateSearchResult:
    """Computes the templates for given query sequence.

    Hits are processed in order of decreasing sum_probs until max_hits of them
    have produced features.

    Args:
      query_sequence: Amino acid sequence of the query.
      query_pdb_code: PDB code of the query, if any.
      query_release_date: Release date of the query, if any. When given, the
        template cutoff is the earlier of the configured max template date and
        this date minus 60 days.
      hits: The template hits to turn into features.

    Returns:
      A TemplateSearchResult with the stacked features of the accepted hits
      (empty arrays if there are none) and the collected errors and warnings.
    """
    logging.info('Searching for template for: %s', query_pdb_code)

    template_features = {name: [] for name in TEMPLATE_FEATURES}

    # Always use a max_template_date. Set to query_release_date minus 60 days
    # if that's earlier.
    template_cutoff_date = self._max_template_date
    if query_release_date:
      delta = datetime.timedelta(days=60)
      if query_release_date - delta < template_cutoff_date:
        template_cutoff_date = query_release_date - delta
      assert template_cutoff_date < query_release_date
    assert template_cutoff_date <= self._max_template_date

    num_hits = 0
    errors = []
    warnings = []

    for hit in sorted(hits, key=lambda x: x.sum_probs, reverse=True):
      # We got all the templates we wanted, stop processing hits.
      if num_hits >= self._max_hits:
        break

      result = _process_single_hit(
          query_sequence=query_sequence,
          query_pdb_code=query_pdb_code,
          hit=hit,
          mmcif_dir=self._mmcif_dir,
          max_template_date=template_cutoff_date,
          release_dates=self._release_dates,
          obsolete_pdbs=self._obsolete_pdbs,
          strict_error_check=self._strict_error_check,
          kalign_binary_path=self._kalign_binary_path)

      if result.error:
        errors.append(result.error)

      # There could be an error even if there are some results, e.g. thrown by
      # other unparsable chains in the same mmCIF file.
      if result.warning:
        warnings.append(result.warning)

      if result.features is None:
        logging.info('Skipped invalid hit %s, error: %s, warning: %s',
                     hit.name, result.error, result.warning)
      else:
        # Increment the hit counter, since we got features out of this hit.
        num_hits += 1
        for k in template_features:
          template_features[k].append(result.features[k])

    for name in template_features:
      if num_hits > 0:
        template_features[name] = np.stack(
            template_features[name], axis=0).astype(TEMPLATE_FEATURES[name])
      else:
        # Make sure the feature has correct dtype even if empty.
        template_features[name] = np.array([], dtype=TEMPLATE_FEATURES[name])

    return TemplateSearchResult(
        features=template_features, errors=errors, warnings=warnings)
alphafold/alphafold/data/tools/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Python wrappers for third party tools."""
alphafold/alphafold/data/tools/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (202 Bytes). View file
 
alphafold/alphafold/data/tools/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (224 Bytes). View file
 
alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-36.pyc ADDED
Binary file (4.44 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/hhblits.cpython-38.pyc ADDED
Binary file (4.52 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-36.pyc ADDED
Binary file (2.52 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/hhsearch.cpython-38.pyc ADDED
Binary file (2.58 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-36.pyc ADDED
Binary file (5.23 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/jackhmmer.cpython-38.pyc ADDED
Binary file (5.34 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/kalign.cpython-36.pyc ADDED
Binary file (3.04 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/kalign.cpython-38.pyc ADDED
Binary file (3.09 kB). View file
 
alphafold/alphafold/data/tools/__pycache__/utils.cpython-36.pyc ADDED
Binary file (929 Bytes). View file
 
alphafold/alphafold/data/tools/__pycache__/utils.cpython-38.pyc ADDED
Binary file (959 Bytes). View file
 
alphafold/alphafold/data/tools/hhblits.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Library to run HHblits from Python."""
16
+
17
+ import glob
18
+ import os
19
+ import subprocess
20
+ from typing import Any, Mapping, Optional, Sequence
21
+
22
+ from absl import logging
23
+ from alphafold.data.tools import utils
24
+ # Internal import (7716).
25
+
26
+
27
# HHblits' own defaults for -p and -Z; the flags are only passed to the binary
# when the caller asks for something different.
_HHBLITS_DEFAULT_P = 20
_HHBLITS_DEFAULT_Z = 500


class HHBlits:
  """Python wrapper of the HHblits binary."""

  def __init__(self,
               *,
               binary_path: str,
               databases: Sequence[str],
               n_cpu: int = 4,
               n_iter: int = 3,
               e_value: float = 0.001,
               maxseq: int = 1_000_000,
               realign_max: int = 100_000,
               maxfilt: int = 100_000,
               min_prefilter_hits: int = 1000,
               all_seqs: bool = False,
               alt: Optional[int] = None,
               p: int = _HHBLITS_DEFAULT_P,
               z: int = _HHBLITS_DEFAULT_Z):
    """Initializes the Python HHblits wrapper.

    Args:
      binary_path: The path to the HHblits executable.
      databases: A sequence of HHblits database paths. This should be the
        common prefix for the database files (i.e. up to but not including
        _hhm.ffindex etc.)
      n_cpu: The number of CPUs to give HHblits.
      n_iter: The number of HHblits iterations.
      e_value: The E-value, see HHblits docs for more details.
      maxseq: The maximum number of rows in an input alignment. Note that this
        parameter is only supported in HHBlits version 3.1 and higher.
      realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
      maxfilt: Max number of hits allowed to pass the 2nd prefilter.
        HHblits default: 20000.
      min_prefilter_hits: Min number of hits to pass prefilter.
        HHblits default: 100.
      all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
        HHblits default: False.
      alt: Show up to this many alternative alignments. A falsy value (None or
        0) leaves the flag off the command line.
      p: Minimum Prob for a hit to be included in the output hhr file.
        HHblits default: 20.
      z: Hard cap on number of hits reported in the hhr file.
        HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.

    Raises:
      ValueError: If no file matching `<database path>_*` exists for one of
        the given databases.
    """
    self.binary_path = binary_path
    self.databases = databases

    # A database is a family of files sharing a prefix, so look for at least
    # one file of the form `<prefix>_*`.
    for database_path in self.databases:
      if not glob.glob(database_path + '_*'):
        logging.error('Could not find HHBlits database %s', database_path)
        raise ValueError(f'Could not find HHBlits database {database_path}')

    self.n_cpu = n_cpu
    self.n_iter = n_iter
    self.e_value = e_value
    self.maxseq = maxseq
    self.realign_max = realign_max
    self.maxfilt = maxfilt
    self.min_prefilter_hits = min_prefilter_hits
    self.all_seqs = all_seqs
    self.alt = alt
    self.p = p
    self.z = z

  @staticmethod
  def _decode_output(data: bytes, max_bytes: Optional[int] = None) -> str:
    """Decodes subprocess output as UTF-8 without ever raising.

    Truncating raw bytes can split a multi-byte character, and tool output is
    not guaranteed to be valid UTF-8 in the first place. Undecodable bytes are
    replaced with U+FFFD so that a decoding problem never masks the real error
    being reported.

    Args:
      data: Raw bytes captured from the subprocess.
      max_bytes: If given, only the first `max_bytes` bytes are decoded.

    Returns:
      The decoded text.
    """
    if max_bytes is not None:
      data = data[:max_bytes]
    return data.decode('utf-8', errors='replace')

  def query(self, input_fasta_path: str) -> Mapping[str, Any]:
    """Queries the database using HHblits.

    Args:
      input_fasta_path: Path of the FASTA file passed to HHblits via `-i`.

    Returns:
      A mapping with keys `a3m` (contents of the produced A3M file, str),
      `output` (raw stdout bytes), `stderr` (raw stderr bytes), `n_iter` and
      `e_value` (the settings used for the run).

    Raises:
      RuntimeError: If HHblits exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      a3m_path = os.path.join(query_tmp_dir, 'output.a3m')

      db_cmd = []
      for db_path in self.databases:
        db_cmd.extend(['-d', db_path])

      cmd = [
          self.binary_path,
          '-i', input_fasta_path,
          '-cpu', str(self.n_cpu),
          '-oa3m', a3m_path,
          '-o', '/dev/null',
          '-n', str(self.n_iter),
          '-e', str(self.e_value),
          '-maxseq', str(self.maxseq),
          '-realign_max', str(self.realign_max),
          '-maxfilt', str(self.maxfilt),
          '-min_prefilter_hits', str(self.min_prefilter_hits)]
      if self.all_seqs:
        cmd += ['-all']
      if self.alt:
        cmd += ['-alt', str(self.alt)]
      # -p and -Z are only added when they differ from the HHblits defaults.
      if self.p != _HHBLITS_DEFAULT_P:
        cmd += ['-p', str(self.p)]
      if self.z != _HHBLITS_DEFAULT_Z:
        cmd += ['-Z', str(self.z)]
      cmd += db_cmd

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

      with utils.timing('HHblits query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Logs have a 15k character limit, so log HHblits error line by line.
        logging.error('HHblits failed. HHblits stderr begin:')
        for error_line in self._decode_output(stderr).splitlines():
          if error_line.strip():
            logging.error(error_line.strip())
        logging.error('HHblits stderr end')
        raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
            self._decode_output(stdout),
            self._decode_output(stderr, max_bytes=500_000)))

      with open(a3m_path, encoding='utf-8') as f:
        a3m = f.read()

    raw_output = dict(
        a3m=a3m,
        output=stdout,
        stderr=stderr,
        n_iter=self.n_iter,
        e_value=self.e_value)
    return raw_output
alphafold/alphafold/data/tools/hhsearch.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Library to run HHsearch from Python."""
16
+
17
+ import glob
18
+ import os
19
+ import subprocess
20
+ from typing import Sequence
21
+
22
+ from absl import logging
23
+
24
+ from alphafold.data.tools import utils
25
+ # Internal import (7716).
26
+
27
+
28
class HHSearch:
  """Python wrapper of the HHsearch binary."""

  def __init__(self,
               *,
               binary_path: str,
               databases: Sequence[str],
               maxseq: int = 1_000_000):
    """Initializes the Python HHsearch wrapper.

    Args:
      binary_path: The path to the HHsearch executable.
      databases: A sequence of HHsearch database paths. This should be the
        common prefix for the database files (i.e. up to but not including
        _hhm.ffindex etc.)
      maxseq: The maximum number of rows in an input alignment. Note that this
        parameter is only supported in HHBlits version 3.1 and higher.

    Raises:
      ValueError: If no file matching `<database path>_*` exists for one of
        the given databases.
    """
    self.binary_path = binary_path
    self.databases = databases
    self.maxseq = maxseq

    # A database is a family of files sharing a prefix, so look for at least
    # one file of the form `<prefix>_*`.
    for database_path in self.databases:
      if not glob.glob(database_path + '_*'):
        logging.error('Could not find HHsearch database %s', database_path)
        raise ValueError(f'Could not find HHsearch database {database_path}')

  @staticmethod
  def _decode_output(data: bytes, max_bytes: Optional[int] = None) -> str:
    """Decodes subprocess output as UTF-8 without ever raising.

    Truncating raw bytes can split a multi-byte character, and tool output is
    not guaranteed to be valid UTF-8 in the first place. Undecodable bytes are
    replaced with U+FFFD so that a decoding problem never masks the real error
    being reported.

    Args:
      data: Raw bytes captured from the subprocess.
      max_bytes: If given, only the first `max_bytes` bytes are decoded.

    Returns:
      The decoded text.
    """
    if max_bytes is not None:
      data = data[:max_bytes]
    return data.decode('utf-8', errors='replace')

  def query(self, a3m: str) -> str:
    """Queries the database using HHsearch using a given a3m.

    Args:
      a3m: The query alignment in A3M format; it is written to a temporary
        file and passed to HHsearch via `-i`.

    Returns:
      The contents of the hhr file written by HHsearch.

    Raises:
      RuntimeError: If HHsearch exits with a non-zero return code.
    """
    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      input_path = os.path.join(query_tmp_dir, 'query.a3m')
      hhr_path = os.path.join(query_tmp_dir, 'output.hhr')
      with open(input_path, 'w', encoding='utf-8') as f:
        f.write(a3m)

      db_cmd = []
      for db_path in self.databases:
        db_cmd.extend(['-d', db_path])

      cmd = [self.binary_path,
             '-i', input_path,
             '-o', hhr_path,
             '-maxseq', str(self.maxseq)
             ] + db_cmd

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      with utils.timing('HHsearch query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Stderr is truncated to prevent proto size errors in Beam.
        raise RuntimeError(
            'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                self._decode_output(stdout),
                self._decode_output(stderr, max_bytes=100_000)))

      with open(hhr_path, encoding='utf-8') as f:
        hhr = f.read()
    return hhr
alphafold/alphafold/data/tools/hmmbuild.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 DeepMind Technologies Limited
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """A Python wrapper for hmmbuild - construct HMM profiles from MSA."""
16
+
17
+ import os
18
+ import re
19
+ import subprocess
20
+
21
+ from absl import logging
22
+ from alphafold.data.tools import utils
23
+ # Internal import (7716).
24
+
25
+
26
class Hmmbuild(object):
  """Python wrapper of the hmmbuild binary."""

  def __init__(self,
               *,
               binary_path: str,
               singlemx: bool = False):
    """Initializes the Python hmmbuild wrapper.

    The binary path is only stored here; it is not checked until a profile is
    actually built.

    Args:
      binary_path: The path to the hmmbuild executable.
      singlemx: Whether to use --singlemx flag. If True, it forces HMMBuild to
        just use a common substitution score matrix.
    """
    self.binary_path = binary_path
    self.singlemx = singlemx

  def build_profile_from_sto(self, sto: str, model_construction='fast') -> str:
    """Builds a HMM for the aligned sequences given as a Stockholm string.

    Args:
      sto: A string with the aligned sequences in the Stockholm format.
      model_construction: Whether to use reference annotation in the msa to
        determine consensus columns ('hand') or default ('fast').

    Returns:
      A string with the profile in the HMM format.

    Raises:
      RuntimeError: If hmmbuild fails.
      ValueError: If model_construction is neither 'hand' nor 'fast'.
    """
    return self._build_profile(sto, model_construction=model_construction)

  def build_profile_from_a3m(self, a3m: str) -> str:
    """Builds a HMM for the aligned sequences given as an A3M string.

    Args:
      a3m: A string with the aligned sequences in the A3M format.

    Returns:
      A string with the profile in the HMM format.

    Raises:
      RuntimeError: If hmmbuild fails.
    """
    lines = []
    for line in a3m.splitlines():
      if not line.startswith('>'):
        line = re.sub('[a-z]+', '', line)  # Remove inserted residues.
      lines.append(line + '\n')
    msa = ''.join(lines)
    return self._build_profile(msa, model_construction='fast')

  def _build_profile(self, msa: str, model_construction: str = 'fast') -> str:
    """Builds a HMM for the aligned sequences given as an MSA string.

    Args:
      msa: A string with the aligned sequences, in A3M or STO format.
      model_construction: Whether to use reference annotation in the msa to
        determine consensus columns ('hand') or default ('fast').

    Returns:
      A string with the profile in the HMM format.

    Raises:
      RuntimeError: If hmmbuild fails.
      ValueError: If model_construction is neither 'hand' nor 'fast'.
    """
    if model_construction not in {'hand', 'fast'}:
      raise ValueError(f'Invalid model_construction {model_construction} - '
                       'only hand and fast supported.')

    with utils.tmpdir_manager(base_dir='/tmp') as query_tmp_dir:
      input_query = os.path.join(query_tmp_dir, 'query.msa')
      output_hmm_path = os.path.join(query_tmp_dir, 'output.hmm')

      with open(input_query, 'w') as f:
        f.write(msa)

      cmd = [self.binary_path]
      # If adding flags, we have to do so before the output and input:

      # 'fast' needs no flag; only 'hand' is passed explicitly.
      if model_construction == 'hand':
        cmd.append(f'--{model_construction}')
      if self.singlemx:
        cmd.append('--singlemx')
      cmd.extend([
          '--amino',
          output_hmm_path,
          input_query,
      ])

      logging.info('Launching subprocess %s', cmd)
      process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

      with utils.timing('hmmbuild query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()
        # Decode leniently: the output is logged even on success, so invalid
        # UTF-8 from the tool must not abort an otherwise successful run.
        stdout_text = stdout.decode('utf-8', errors='replace')
        stderr_text = stderr.decode('utf-8', errors='replace')
        logging.info('hmmbuild stdout:\n%s\n\nstderr:\n%s\n',
                     stdout_text, stderr_text)

      if retcode:
        raise RuntimeError('hmmbuild failed\nstdout:\n%s\n\nstderr:\n%s\n'
                           % (stdout_text, stderr_text))

      with open(output_hmm_path, encoding='utf-8') as f:
        hmm = f.read()

    return hmm