Commit 7216e1f
Parent(s): 3558c2e
Create utils.py
utils.py ADDED
@@ -0,0 +1,28 @@
+import torch
+
+def build_vocab(sentences):
+    vocab = {"[PAD]": 0, "[UNK]": 1}
+    for sentence in sentences:
+        for word in sentence.split():
+            if word not in vocab:
+                vocab[word] = len(vocab)
+    return vocab
+
+def sentence_to_token_ids(sentence, vocab):
+    return [vocab.get(word, vocab["[UNK]"]) for word in sentence.split()]
+
+def get_embedding_tensor(sentence, vocab, embedding_layer):
+    token_ids = sentence_to_token_ids(sentence, vocab)
+    seq = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension
+    seg = torch.zeros_like(seq)
+    embedding_tensor = embedding_layer(seq, seg)
+    return embedding_tensor
+
+def get_input_tensors(sentence, vocab):
+    def sentence_to_token_ids(sentence, vocab):
+        return [vocab.get(word, vocab["[UNK]"]) for word in sentence.split()]
+    token_ids = sentence_to_token_ids(sentence, vocab)
+    seq = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)  # Add batch dimension
+    seg = torch.zeros_like(seq)
+
+    return seq, seg
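
A minimal usage sketch for these helpers, assuming embedding_layer is a BERT-style module whose forward takes token ids and segment ids, matching the embedding_layer(seq, seg) call in get_embedding_tensor. The toy corpus and the ToyEmbedding module below are hypothetical stand-ins for illustration, not part of the commit:

import torch
import torch.nn as nn

from utils import build_vocab, get_embedding_tensor, get_input_tensors

# Hypothetical toy corpus (not from the commit).
sentences = [
    "the cat sat on the mat",
    "the dog chased the cat",
]

vocab = build_vocab(sentences)  # {"[PAD]": 0, "[UNK]": 1, "the": 2, ...}

# Hypothetical stand-in for a BERT-style embedding layer that takes
# (token ids, segment ids), matching how get_embedding_tensor calls it.
class ToyEmbedding(nn.Module):
    def __init__(self, vocab_size, dim=32):
        super().__init__()
        self.word = nn.Embedding(vocab_size, dim)
        self.segment = nn.Embedding(2, dim)

    def forward(self, seq, seg):
        return self.word(seq) + self.segment(seg)

embedding_layer = ToyEmbedding(len(vocab))

seq, seg = get_input_tensors("the cat sat", vocab)
print(seq.shape, seg.shape)  # torch.Size([1, 3]) torch.Size([1, 3])

emb = get_embedding_tensor("the cat sat", vocab, embedding_layer)
print(emb.shape)  # torch.Size([1, 3, 32])

Note that get_input_tensors re-defines sentence_to_token_ids locally, so it returns the same (seq, seg) pair that get_embedding_tensor builds internally before the embedding lookup; words missing from the vocabulary fall back to the [UNK] id.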