sahilnishad committed on
Commit
7216e1f
·
1 Parent(s): 3558c2e

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +28 -0
utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def build_vocab(sentences):
    """Build a word-to-id vocabulary from whitespace-tokenised sentences.

    The vocabulary always starts with the reserved entries ``"[PAD]" -> 0``
    and ``"[UNK]" -> 1``. Every further distinct word receives the next free
    integer id, in order of first appearance.

    Args:
        sentences: Iterable of strings; each one is split with ``str.split()``
            (any run of whitespace separates words).

    Returns:
        A ``dict`` mapping each word to its integer id.
    """
    vocab = {"[PAD]": 0, "[UNK]": 1}
    for sentence in sentences:
        for word in sentence.split():
            # len(vocab) is evaluated before insertion, so a new word gets the
            # next unused id; a word already present keeps its existing id.
            vocab.setdefault(word, len(vocab))
    return vocab
10
+
11
def sentence_to_token_ids(sentence, vocab):
    """Convert a sentence into a list of vocabulary ids.

    Args:
        sentence: String that is split on whitespace with ``str.split()``.
        vocab: Mapping from word to integer id. It must contain the key
            ``"[UNK]"`` whenever the sentence has at least one word.

    Returns:
        A list with one id per word; words missing from ``vocab`` map to
        ``vocab["[UNK]"]``. An empty or all-whitespace sentence yields ``[]``.

    Raises:
        KeyError: If the sentence is non-empty and ``vocab`` has no
            ``"[UNK]"`` entry.
    """
    words = sentence.split()
    if not words:
        # Nothing to look up, so the "[UNK]" entry is never consulted.
        return []
    unk_id = vocab["[UNK]"]  # resolved once instead of once per word
    return [vocab.get(word, unk_id) for word in words]
13
+
14
def get_embedding_tensor(sentence, vocab, embedding_layer):
    """Run ``embedding_layer`` on the token ids of a single sentence.

    Args:
        sentence: String tokenised by ``sentence_to_token_ids``.
        vocab: Word-to-id mapping passed through to ``sentence_to_token_ids``.
        embedding_layer: Callable invoked as ``embedding_layer(seq, seg)``.
            NOTE(review): presumably a module taking token ids and segment
            ids -- confirm against the caller.

    Returns:
        Whatever ``embedding_layer(seq, seg)`` returns.
    """
    token_ids = sentence_to_token_ids(sentence, vocab)
    # unsqueeze(0) adds a leading batch dimension: shape (1, num_tokens).
    seq = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    # All-zero tensor with the same shape and dtype as ``seq``.
    seg = torch.zeros_like(seq)
    return embedding_layer(seq, seg)
20
+
21
def get_input_tensors(sentence, vocab):
    """Build the ``(seq, seg)`` input tensors for a single sentence.

    Args:
        sentence: String that is split on whitespace with ``str.split()``.
        vocab: Mapping from word to integer id; unknown words fall back to
            ``vocab["[UNK]"]``.

    Returns:
        A tuple ``(seq, seg)`` of ``torch.long`` tensors, each of shape
        ``(1, num_tokens)``. ``seq`` holds the token ids and ``seg`` is all
        zeros.

    Raises:
        KeyError: If the sentence is non-empty and ``vocab`` has no
            ``"[UNK]"`` entry.
    """
    # The lookup is done inline rather than through a nested function that
    # duplicated (and shadowed) the module-level helper of the same name.
    token_ids = [vocab.get(word, vocab["[UNK]"]) for word in sentence.split()]
    # unsqueeze(0) adds a leading batch dimension: shape (1, num_tokens).
    seq = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    # All-zero tensor with the same shape and dtype as ``seq``.
    seg = torch.zeros_like(seq)
    return seq, seg