import json

import numpy as np
import tensorflow as tf
from hazm import Normalizer, word_tokenize
from scipy.spatial import distance
from transformers import Pipeline


class PreTrainedPipeline():
    def __init__(self, path):
        self.model_dir = "saved_model"
        self.t2id_path = "t2id.json"
        self.stopwords_path = "stopwords.txt"
        self.id2h_path = "id2h.json"
        # Assumed filename: the original code read self.comparison_matrix_path
        # without ever defining it, so this path is a guess.
        self.comparison_matrix_path = "comparison_matrix.npz"

        # token -> id and id -> headword lookup tables
        self.t2id = json.load(open(self.t2id_path, encoding="utf8"))
        self.id2h = json.load(open(self.id2h_path, encoding="utf8"))

        self.stopwords = set(line.strip() for line in open(self.stopwords_path, encoding="utf8"))
        self.comparisons = np.load(self.comparison_matrix_path)['arr_0']

        self.model = tf.saved_model.load(self.model_dir)

    def __call__(self, inputs: str):

        # Preprocess the input sentence
        sentence = Normalizer().normalize(inputs)
        tokens = word_tokenize(sentence)
        tokens = [t for t in tokens if t not in self.stopwords]
        # Map tokens to ids, padding/truncating to the model's max length of 20
        input_ids = np.zeros((1, 20))
        for i, token in enumerate(tokens):
            if i >= 20:
                break
            input_ids[0, i] = self.t2id.get(token, self.t2id['UNK'])

        # Call the model on the input ids to get a 300-dimensional sentence embedding
        embeddings = self.model(tf.constant(input_ids, dtype=tf.int32)).numpy()

        # Rank candidate headwords by cosine distance to the sentence embedding
        distances = distance.cdist(embeddings.reshape((1, 300)), self.comparisons, "cosine")[0]
        top_indices = distances.argsort()[:10]
        top_words = [self.id2h[str(idx)] for idx in top_indices]
        
        
        # Return the top candidates; the score is the cosine similarity
        # (1 - cosine distance) between the query embedding and the candidate.
        return [
            [
                {'label': top_words[i], 'score': float(1 - distances[top_indices[i]])}
                for i in range(4)
            ]
        ]

        
        # return [
        #     [ # Sample output, call the model here TODO
        #         {'label': 'POSITIVE', 'score': 0.05},
        #         {'label': 'NEGATIVE', 'score': 0.03},
        #         {'label': 'معنی', 'score': 0.92},
        #         {'label': f'{inputs}', 'score': 0},
        #     ]
        # ]
 
    # def RevDict(sent,flag,model): 
    #     """ 
    #     This function receives a sentence from the user and returns the top_10 (for flag=0) or top_100 (for flag=1) predictions. 
    #     The input sentence is normalized, and stop words are removed. 
    #     """ 
        
    #     normalizer = Normalizer() 
    #     X_Normalized = normalizer.normalize(sent) 
    #     X_Tokens = word_tokenize(X_Normalized) 
    #     stopwords = [normalizer.normalize(x.strip()) for x in codecs.open(r"stopwords.txt",'r','utf-8').readlines()] 
    #     X_Tokens = [t for t in X_Tokens if t not in stopwords] 
    #     preprocessed = [' '.join(X_Tokens)][0] 
    #     sent_ids = sent2id([preprocessed]) 
    #     output=np.array((model.predict(sent_ids.reshape((1,20))).tolist()[0])) 
    #     distances=distance.cdist(output.reshape((1,300)), comparison_matrix, "cosine")[0] 
    #     min_index_100 = distances.argsort()[:100] 
    #     min_index_10 = distances.argsort()[:10] 
    
    #     temp=[] 
    #     if flag == 0: 
    #         for i in range(10): 
    #             temp.append(id2h[str(min_index_10[i])]) 
    #     elif flag == 1: 
    #         for i in range(100): 
    #             temp.append(id2h[str(min_index_100[i])]) 
        
    #     for i in range(len(temp)): 
    #         print(temp[i])

    # def sent2id(sents): 
    #     sents_id=np.zeros((len(sents),20)) 
    #     for j in tqdm(range(len(sents))): 
    #         for i,word in enumerate(sents[j].split()): 
    #             try: 
    #                 sents_id[j,i] = t2id[word] 
    #             except: 
    #                 sents_id[j,i] = t2id['UNK'] 
    #             if i==19: 
    #                 break 
    #     return sents_id
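

if __name__ == "__main__":
    # A minimal usage sketch, assuming saved_model/, t2id.json, id2h.json,
    # stopwords.txt and the comparison-matrix file sit in the working
    # directory. The `path` argument and the example sentence below are
    # illustrative only (the class does not use `path` itself).
    pipeline = PreTrainedPipeline(path=".")
    results = pipeline("جایی که کتاب نگهداری می‌شود")  # "a place where books are kept"
    for candidate in results[0]:
        print(candidate['label'], candidate['score'])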