Update README.md
Browse files
README.md
CHANGED
@@ -7,6 +7,9 @@ metrics:
|
|
7 |
model-index:
|
8 |
- name: output_v3
|
9 |
results: []
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
@@ -25,7 +28,42 @@ More information needed
|
|
25 |
|
26 |
## Intended uses & limitations
|
27 |
|
28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
## Training and evaluation data
|
31 |
|
|
|
7 |
model-index:
|
8 |
- name: output_v3
|
9 |
results: []
|
10 |
+
widget:
|
11 |
+
- text: >-
|
12 |
+
<|endoftext|>MAADGYLPDWLEDNLSEGIREWWALKPGAPQPKANQQHQDNARGLVLPGYKYLGPGNGL
|
13 |
---
|
14 |
|
15 |
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
|
|
28 |
|
29 |
## Intended uses & limitations
|
30 |
|
31 |
+
### Generate novel sequences for viral capsid proteins
|
32 |
+
|
33 |
+
### Calculate the perplexity of a protein sequence
|
34 |
+
|
35 |
+
```python
|
36 |
+
def calculatePerplexity(sequence, model, tokenizer):
    """Return the perplexity of ``sequence`` under a causal language model.

    Args:
        sequence: Text to score; it is passed to ``tokenizer.encode`` as-is.
        model: Callable model that accepts ``(input_ids, labels=input_ids)``
            and returns an indexable output whose first element is the loss.
        tokenizer: Object exposing ``encode(text)`` returning token ids.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    input_ids = torch.tensor(tokenizer.encode(sequence)).unsqueeze(0)

    # Put the inputs on the device the model already lives on, rather than
    # relying on a module-level ``device`` variable being defined elsewhere.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    # Scoring only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    # The model is called with labels, so the first output is the loss.
    loss = outputs[0]
    return math.exp(loss.item())
|
43 |
+
|
44 |
+
def split_sequence(sequence):
    """Wrap ``sequence`` into 60-character lines framed by ``<|endoftext|>``.

    The sequence is cut into consecutive 60-character lines joined with
    newlines. The first line is prefixed with ``<|endoftext|>`` and loses its
    final character; ``<|endoftext|>`` is appended directly after the last
    line (no newline in between). An empty sequence yields just
    ``<|endoftext|>``.

    Args:
        sequence: Raw sequence string.

    Returns:
        The formatted string.
    """
    line_width = 60
    lines = [
        sequence[start:start + line_width]
        for start in range(0, len(sequence), line_width)
    ]

    if lines:
        # NOTE(review): the first line drops its last character, so that
        # residue does not appear in the output. Presumably this mirrors the
        # format the model was trained on -- confirm before changing.
        lines[0] = '<|endoftext|>' + lines[0][:-1]

    # NOTE(review): an earlier check ``max_i + 61 == len(sequence)`` selected a
    # "\n<|endoftext|>" terminator, but the last line never holds more than 60
    # characters, so that branch could not run. The terminator is therefore
    # always appended without a preceding newline.
    return '\n'.join(lines) + "<|endoftext|>"
|
62 |
+
|
63 |
+
# Example: format a capsid protein sequence and print its perplexity
# (two decimal places). `model` and `tokenizer` must already be loaded.
raw_sequence = "MAADGYLPDWLEDNLSEGIREWWALKPGAPQPKANQQHQDNARGLVLPGYKYLGPGNGLDKGEPVNAADAAALEHDKAYDQQLKAGDNPYLKYNHADAEFQERLKEDTSFGGNLGRAVFQAKKRLLEPLGLVEEAAKTAPGKKRPVEQSPQEPDSSAGIGKSGAQPAKKRLNFGQTGDTESVPDPQPIGEPPAAPSGVGSLTMASGGGAPVADNNEGADGVGSSSGNWHCDSQWLGDRVITTSTRTWALPTYNNHLYKQISNSTSGGSSNDNAYFGYSTPWGYFDFNRFHCHFSPRDWQRLINNNWGFRPKRLNFKLFNIQVKEVTDNNGVKTIANNLTSTVQVFTDSDYQLPYVLGSAHEGCLPPFPADVFMIPQYGYLTLNDGSQAVGRSSFYCLEYFPSQMLRTGNNFQFSYEFENVPFHSSYAHSQSLDRLMNPLIDQYLYYLSKTINGSGQNQQTLKFSVAGPSNMAVQGRNYIPGPSYRQQRVSTTVTQNNNSEFAWPGASSWALNGRNSLMNPGPAMASHKEGEDRFFPLSGSLIFGKQGTGRDNVDADKVMITNEEEIKTTNPVATESYGQVATNHQSAQAQAQTGWVQNQGILPGMVWQDRDVYLQGPIWAKIPHTDGNFHPSPLMGGFGMKHPPPQILIKNTPVPADPPTAFNKDKLNSFITQYSTGQVSVEIEWELQKENSKRWNPEIQYTSNYYKSNNVEFAVNTEGVYSEPRPIGTRYLTRNL"
seq = split_sequence(raw_sequence)
perplexity = calculatePerplexity(seq, model, tokenizer)
print(f"{perplexity:.2f}")
|
66 |
+
```
|
67 |
|
68 |
## Training and evaluation data
|
69 |
|