---
license: mit
tags:
- protein-protein interactions
- paired proteins encoding
- protein language model
---

# PLM-interact model
PLM-interact: extending protein language models to predict protein-protein interactions

The preprint is available at [PLM-interact](https://www.biorxiv.org/content/10.1101/2024.11.05.622169v1) and the code is available in the [PLM-interact GitHub repository](https://github.com/liudan111/PLM-interact).

This model is trained on human PPIs from STRING V12. For the PPI preprocessing details, see the Methods section of the preprint.
## Model description

PLM-interact goes beyond a single protein, jointly encoding protein pairs to learn their relationships, analogous to the next-sentence prediction task from natural language processing. This approach yields a significant improvement in performance: trained on human-human PPIs, PLM-interact predicts mouse, fly, worm, E. coli and yeast PPIs, with 16-28% improvements in AUPR compared with state-of-the-art PPI models. Additionally, it can detect changes that disrupt or cause PPIs, and it can be applied to virus-host PPI prediction.
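The pair encoding mirrors how BERT packs two sentences for next-sentence prediction: both proteins go into a single input separated by special tokens, so attention flows across the pair. Below is a minimal sketch (not part of the PLM-interact code) of what the ESM2 tokenizer produces for a pair; the short sequences are toy placeholders, not real proteins.

```python
# Sketch: how two proteins are packed into one input for joint encoding.
# The toy sequences are illustrative only; the exact special-token layout
# may vary with the tokenizer version.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
pair = tokenizer("MKTAYIAK", "MGQSQSGG", return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(pair["input_ids"][0].tolist()))
# -> ['<cls>', 'M', 'K', ..., '<eos>', 'M', 'G', ..., '<eos>']
```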
![PLM-interact model overview](PLM-interact.png)

### An example to predict interaction probability between proteins
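The example below loads the checkpoint from a local folder. If you have not downloaded it yet, one way is `huggingface_hub` (a minimal sketch; the `local_dir` value is an arbitrary choice that must match `folder_huggingface_download` in the example):

```python
# Sketch: fetch the checkpoint files from the Hugging Face Hub.
# local_dir is arbitrary; it must match folder_huggingface_download below.
from huggingface_hub import snapshot_download

snapshot_download(repo_id="danliu1226/PLM-interact-650M-humanV11",
                  local_dir="download_huggingface_folder/")
```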
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForMaskedLM, AutoTokenizer


class PLMinteract(nn.Module):
    def __init__(self, model_name, num_labels, embedding_size):
        super().__init__()
        self.esm_mask = AutoModelForMaskedLM.from_pretrained(model_name)
        self.embedding_size = embedding_size
        self.classifier = nn.Linear(embedding_size, 1)
        self.num_labels = num_labels

    def forward_test(self, features):
        # Encode the concatenated pair, then classify from the CLS embedding.
        embedding_output = self.esm_mask.base_model(**features, return_dict=True)
        embedding = embedding_output.last_hidden_state[:, 0, :]  # CLS token
        embedding = F.relu(embedding)
        logits = self.classifier(embedding).view(-1)
        probability = torch.sigmoid(logits)
        return probability


# folder_huggingface_download: local folder holding the checkpoint downloaded
#   from Hugging Face, e.g. from "danliu1226/PLM-interact-650M-humanV11"
# model_name: the ESM2 model that PLM-interact was trained from
# embedding_size: the embedding size of the ESM2 model
folder_huggingface_download = 'download_huggingface_folder/'
model_name = 'facebook/esm2_t33_650M_UR50D'
embedding_size = 1280

protein1 = "EGCVSNLMVCNLAYSGKLEELKESILADKSLATRTDQDSRTALHWACSAGHTEIVEFLLQLGVPVNDKDDAGWSPLHIAASAGRDEIVKALLGKGAQVNAVNQNGCTPLHYAASKNRHEIAVMLLEGGANPDAKDHYEATAMHRAAAKGNLKMIHILLYYKASTNIQDTEGNTPLHLACDEERVEEAKLLVSQGASIYIENKEEKTPLQVAKGGLGLILKRMVEG"

protein2 = "MGQSQSGGHGPGGGKKDDKDKKKKYEPPVPTRVGKKKKKTKGPDAASKLPLVTPHTQCRLKLLKLERIKDYLLMEEEFIRNQEQMKPLEEKQEEERSKVDDLRGTPMSVGTLEEIIDDNHAIVSTSVGSEHYVSILSFVDKDLLEPGCSVLLNHKVHAVIGVLMDDTDPLVTVMKVEKAPQETYADIGGLDNQIQEIKESVELPLTHPEYYEEMGIKPPKGVILYGPPGTGKTLLAKAVANQTSATFLRVVGSELIQKYLGDGPKLVRELFRVAEEHAPSIVFIDEIDAIGTKRYDSNSGGEREIQRTMLELLNQLDGFDSRGDVKVIMATNRIETLDPALIRPGRIDRKIEFPLPDEKTKKRIFQIHTSRMTLADDVTLDDLIMAKDDLSGADIKAICTEAGLMALRERRMKVTNEDFKKSKENVLYKKQEGTPEGLYL"

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name)
PLMinter = PLMinteract(model_name, 1, embedding_size)
# map_location lets the checkpoint load on CPU-only machines as well
state_dict = torch.load(f"{folder_huggingface_download}pytorch_model.bin", map_location="cpu")
PLMinter.load_state_dict(state_dict)

# protein2 is passed as the text pair, so both sequences share one input
tokenized = tokenizer(protein1, protein2, padding=True, truncation='longest_first',
                      return_tensors="pt", max_length=1603)
tokenized = tokenized.to(DEVICE)

PLMinter.eval()
PLMinter.to(DEVICE)
with torch.no_grad():
    probability = PLMinter.forward_test(tokenized)
    print(probability.item())
```
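The same model can score several pairs at once by passing lists to the tokenizer. A hedged sketch of batched usage, reusing the objects defined above (the repeated pair is a placeholder for a real list of candidates):

```python
# Sketch: batched scoring, reusing tokenizer, PLMinter and DEVICE from above.
# The duplicated pair is a placeholder for a real list of candidate pairs.
left = [protein1, protein1]
right = [protein2, protein2]
batch = tokenizer(left, right, padding=True, truncation='longest_first',
                  return_tensors="pt", max_length=1603).to(DEVICE)
with torch.no_grad():
    probabilities = PLMinter.forward_test(batch)  # one probability per pair
print(probabilities.tolist())
```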
## Training dataset

This model checkpoint is trained on the benchmarking human PPIs from https://d-script.readthedocs.io/en/stable/data.html