Stanford-TH committed on
Commit
8d3380d
·
verified ·
1 Parent(s): 31c045f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
EmotionClassifier/EmotionPredictor.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import pipeline
4
+ from huggingface_hub import PyTorchModelHubMixin
5
+
6
class EmotionPredictor(nn.Module, PyTorchModelHubMixin):
    """
    Zero-shot emotion scorer built on the `facebook/bart-large-mnli` pipeline.

    The input text is cut into token windows, every window is scored against a
    fixed list of emotion labels, and the per-window scores are merged into a
    single vector using weights proportional to each window's word count.
    """

    # Candidate labels in alphabetical order; `forward` returns its scores in
    # exactly this order. ('surprise' was previously misspelled 'suprise',
    # which fed a non-word into the zero-shot hypothesis.)
    EMOTIONS = ('anger', 'disgust', 'fear', 'inspiration', 'joy', 'love',
                'neutral', 'sadness', 'surprise')

    def __init__(self):
        """
        Builds the zero-shot classification pipeline on GPU when available,
        otherwise on CPU, and keeps a handle on its tokenizer.
        """
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli", device=self.device)
        self.tokenizer = self.classifier.tokenizer

    def forward(self, payload):
        """
        Scores a text against every label in `EMOTIONS`.

        Parameters:
        payload (str): The text to classify.

        Returns:
        numpy.ndarray: One score per label, ordered like `EMOTIONS`; the
        word-count-weighted average of the per-window multi-label scores.
        """
        # NOTE(review): max_length=1096 looks larger than the usual 1024-token
        # BART limit; the windows are decoded back to text and re-tokenized by
        # the pipeline, so this is kept as-is - confirm the intended size.
        tokens = self.tokenizer.encode(
            payload,
            return_tensors="pt",
            return_overflowing_tokens=True,
            stride=10,
            max_length=1096,
            truncation=True,
            padding=True,
        )

        special_tokens = {'<s>', '</s>', self.tokenizer.pad_token}
        sentences = []
        word_counts = []
        for window_ids in tokens:
            pieces = self.tokenizer.convert_ids_to_tokens(window_ids)
            window_text = self.tokenizer.convert_tokens_to_string(
                [piece for piece in pieces if piece not in special_tokens]
            )
            sentences.append(window_text)
            word_counts.append(len(window_text.split()))

        word_counts = torch.tensor(word_counts, dtype=torch.float32)
        total_words = word_counts.sum()
        if total_words > 0:
            weights = word_counts / total_words
        else:
            # No words at all (e.g. empty payload): weight windows uniformly
            # instead of dividing by zero and returning NaNs.
            weights = torch.full_like(word_counts, 1.0 / max(len(sentences), 1))

        labels = list(self.EMOTIONS)
        predictions = self.classifier(sentences, labels, multi_label=True)
        if isinstance(predictions, dict):
            # Some pipeline versions unwrap a single-item batch.
            predictions = [predictions]

        score_rows = []
        for prediction in predictions:
            # The pipeline orders labels by score; map them back to our order.
            by_label = dict(zip(prediction['labels'], prediction['scores']))
            score_rows.append([by_label[label] for label in labels])
        scores = torch.tensor(score_rows)

        # Everything above lives on the CPU, so no device transfer is needed.
        weighted_scores = (weights.unsqueeze(1) * scores).sum(dim=0)
        return weighted_scores.numpy()
EmotionClassifier/__init__.py ADDED
File without changes
EmotionClassifier/__pycache__/EmotionPredictor.cpython-311.pyc ADDED
Binary file (4.47 kB). View file
 
EmotionClassifier/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (183 Bytes). View file
 
NER_Wrapper/NER_Wrapper.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ from NER_Wrapper.NameExtractors import NERWrapper,NextPassNERWrapper
3
+
4
class FullNERPipeline:
    """
    Runs text through two named-entity passes: `NERWrapper.evaluate_text`
    followed by `NextPassNERWrapper.process_sentences`.
    """

    def __init__(self, model_path='models/NER_Models/torch_model.pth',
                 idx2tag_path='models/NER_Models/idx2tag.json'):
        """
        Initializes the FullNERPipeline with paths to the model and idx2tag configuration.

        Parameters:
        model_path (str): Path to the pre-trained NER model checkpoint.
        idx2tag_path (str): Path to the index-to-tag mapping file.
        """
        # First pass: the locally stored model and its tag mapping.
        self.ner_wrapper = NERWrapper(model_path, idx2tag_path)

        # Second pass: NextPassNERWrapper loads its own pre-trained model.
        self.next_ner_wrapper = NextPassNERWrapper()

    def process_text(self, text):
        """
        Processes the input text through two stages of NER processing.

        Parameters:
        text (str): The input text to be processed for named entity recognition.

        Returns:
        str: The processed sentences joined with single spaces; an empty
        string when the text contains no non-blank sentence.
        """
        # Sentences are split on '.'; blank segments (such as the one after a
        # trailing period) are dropped so the models never see empty input.
        sentences = [segment for segment in (text or "").split('.') if segment.strip()]
        if not sentences:
            return ""

        # First, evaluate the text using the initial NER model.
        evaluated_text = self.ner_wrapper.evaluate_text(sentences)

        # Next, process the sentences through the second NER pass.
        ner_text = self.next_ner_wrapper.process_sentences(evaluated_text)

        # Manually collect garbage to manage memory when dealing with large models or data.
        gc.collect()

        return " ".join(ner_text)
NER_Wrapper/NameExtractors.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import DistilBertForTokenClassification, AutoTokenizer, AutoModelForTokenClassification
4
+ from torch.utils.data import Dataset, DataLoader, TensorDataset
5
+ import json
6
+ import gc
7
+
8
class BertNER(nn.Module):
    """
    A custom PyTorch Module for Named Entity Recognition (NER) using DistilBertForTokenClassification.
    """
    def __init__(self, token_dims):
        """
        Initializes the BertNER model.

        Parameters:
        token_dims (int): The number of unique labels in the NER task.

        Raises:
        TypeError: If token_dims is not an integer (booleans are rejected).
        ValueError: If token_dims is smaller than 1.
        """
        super().__init__()
        # bool is a subclass of int, so it is excluded explicitly.
        if not isinstance(token_dims, int) or isinstance(token_dims, bool):
            raise TypeError("Token Dimensions should be an integer")
        if token_dims <= 0:
            raise ValueError("Dimension should be at least 1")

        self.pretrained_model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=token_dims)

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Forward pass of the model.

        Parameters:
        input_ids (torch.Tensor): Tensor of token ids to be fed to DistilBERT.
        attention_mask (torch.Tensor): Tensor indicating which tokens should be attended to by the model.
        labels (torch.Tensor, optional): Tensor of actual labels, forwarded to the wrapped model.

        Returns:
        The wrapped model's output object, as returned for the given labels argument.
        """
        # A single call covers both cases: the wrapped model receives
        # labels=None when no labels are given, so it never runs twice.
        return self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
45
+
46
class SentenceDataset(TensorDataset):
    """
    Custom Dataset class for sentences, handling tokenization and preparing inputs for the NER model.
    """
    def __init__(self, sentences, tokenizer, max_length=256):
        """
        Initializes the SentenceDataset.

        Parameters:
        sentences (list of str): The list of sentences to be processed.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer for converting sentences to model inputs;
            `__getitem__` relies on its `word_ids` support.
        max_length (int): Maximum length of the tokenized output.
        """
        # Each sentence is split on whitespace into a list of words.
        self.sentences = [sentence.split() for sentence in sentences]
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Tokenize the word lists once for the whole dataset. The unsplit
        # strings must not be passed here: with is_split_into_words=True each
        # whole sentence would be treated as a single "word".
        if self.sentences:
            self.text = self.tokenizer(self.sentences, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt", is_split_into_words=True)
        else:
            self.text = None

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Retrieves an item from the dataset by index.

        Parameters:
        idx (int): Index of the item to retrieve.

        Returns:
        A dictionary containing input_ids, attention_mask, word_ids (with None replaced by -1),
        and the word lists of every sentence in the dataset.
        """
        # word_ids are returned as a plain list because they are not part of
        # the tensors and would otherwise be lost when going through a DataLoader.
        word_ids = [-1 if word_id is None else word_id for word_id in self.text.word_ids(batch_index=idx)]
        # NOTE(review): "sentences" deliberately stays the full dataset list to
        # keep the item format callers already index into; returning only
        # self.sentences[idx] would presumably break default batching of
        # ragged word lists - confirm before changing.
        return {
            "input_ids": self.text.input_ids[idx],
            "attention_mask": self.text.attention_mask[idx],
            'word_ids': word_ids,
            "sentences": self.sentences,
        }
81
+
82
class NERWrapper:
    """
    A wrapper class for the Named Entity Recognition (NER) model, simplifying the process of model loading,
    prediction, and utility functions.
    """
    def __init__(self, model_path, idx2tag_path, tokenizer_path='distilbert-base-uncased', token_dims=17):
        """
        Initializes the NERWrapper.

        Parameters:
        model_path (str): Path to the pre-trained NER model.
        idx2tag_path (str): Path to the index-to-tag mapping file, for decoding model predictions.
        tokenizer_path (str): Path or identifier for the tokenizer to be used.
        token_dims (int): The number of unique labels in the NER task.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
        self.model = BertNER(token_dims=token_dims)
        self.idx2tag = self.load_idx2tag(idx2tag_path)
        self.load_model(model_path)

    def load_model(self, model_path):
        """
        Loads the model weights from a specified path and switches the model to evaluation mode.

        Parameters:
        model_path (str): Path to a checkpoint containing a 'model_state_dict' entry.
        """
        map_location = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(security): torch.load unpickles the file, so only load
        # checkpoints from a trusted source. Consider weights_only=True once
        # the checkpoint is confirmed to contain nothing but tensors.
        checkpoint = torch.load(model_path, map_location=map_location)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

    def load_idx2tag(self, idx2tag_path):
        """
        Loads the index-to-tag mapping from a specified path.

        Parameters:
        idx2tag_path (str): Path to the index-to-tag mapping JSON file.

        Returns:
        dict: A dictionary mapping integer indices to tags. If the file does not hold a JSON object,
        the parsed content is returned unchanged.
        """
        with open(idx2tag_path, 'r', encoding='utf-8') as file:
            idx2tag = json.load(file)
        # JSON object keys are always strings; convert them back to ints.
        if isinstance(idx2tag, dict):
            return {int(key): tag for key, tag in idx2tag.items()}
        return idx2tag

    def align_word_ids(self, texts, input_tensor, label_all_tokens=False):
        """
        Builds, for every row of word IDs, a mask-like list that marks which token positions carry a label.

        Parameters:
        texts: Unused; kept for backward compatibility with existing callers.
        input_tensor (torch.Tensor): 2-D tensor of word IDs, one row per sentence, with -1 for tokens
            that belong to no word.
        label_all_tokens (bool): Whether to mark all tokens or only the first token of each word.

        Returns:
        list of list of int: For each row, 1 for marked tokens and -100 for ignored ones.
        """
        all_label_ids = []
        for word_ids in input_tensor:
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Convert tensor element to a Python int for comparison.
                word_idx = int(word_idx)
                if word_idx == -1:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # First token of a new word.
                    label_ids.append(1)
                else:
                    # Later token of the same word.
                    label_ids.append(1 if label_all_tokens else -100)
                previous_word_idx = word_idx
            all_label_ids.append(label_ids)
        return all_label_ids

    def evaluate_text(self, sentences):
        """
        Evaluates texts using the NER model, returning the prediction results.

        Parameters:
        sentences (list of str): List of sentences to evaluate.

        Returns:
        list of str: The modified sentences with identified person entities replaced with <PER>.
        """
        if not sentences:
            return []

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        self.model.eval()
        dataset = SentenceDataset(sentences, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
        predictions = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for data in dataloader:
                mask = data['attention_mask'].to(device)
                input_id = data['input_ids'].to(device)
                # The DataLoader hands word_ids over as one tensor per token
                # position; stack and transpose to get one row per sentence.
                word_ids = torch.stack(data['word_ids']).t()
                label_ids = torch.tensor(self.align_word_ids(data['sentences'][0], word_ids), device=device)
                logits = self.model(input_id, mask, None).logits
                for row_logits, row_labels in zip(logits, label_ids):
                    # Keep only the logits of the first token of each word,
                    # then take the most likely label for each of them.
                    kept_logits = row_logits[row_labels != -100]
                    predictions.append(kept_logits.argmax(dim=1).tolist())
        gc.collect()
        torch.cuda.empty_cache()
        prediction_label = [[self.idx2tag[i] for i in prediction] for prediction in predictions]

        return self.replace_sentence_with_tokens([sentence.split() for sentence in sentences], prediction_label)

    def replace_sentence_with_tokens(self, sentences, prediction_labels):
        """
        Replaces identified person entities in sentences with the <PER> token based on the model's predictions.

        Parameters:
        sentences (list of list of str): Tokenized sentences.
        prediction_labels (list of list of str): Labels predicted by the model for each word.

        Returns:
        list of str: Modified sentences. A 'B-per' word becomes '<PER>', 'I-per' words are dropped,
        every other word is kept.
        """
        modified_sentences = []
        for words, tags in zip(sentences, prediction_labels):
            modified_sentence = []
            # NOTE: zip stops at the shorter sequence, so words without a
            # predicted tag (e.g. beyond the truncation length) are left out.
            for word, tag in zip(words, tags):
                if tag == 'B-per':
                    modified_sentence.append('<PER>')
                elif tag != 'I-per':
                    modified_sentence.append(word)
                # 'I-per' continues a name and is dropped.
            modified_sentences.append(" ".join(modified_sentence))

        return modified_sentences
236
+
237
class NextPassNERWrapper:
    """
    This class wraps around a pretrained BERT model for Named Entity Recognition (NER) tasks,
    simplifying the process of sentence processing, entity recognition, and sentence reconstruction
    with entity tags.
    """
    def __init__(self):
        """
        Initializes the wrapper by loading the "dslim/bert-base-NER" tokenizer and model, moving the
        model to the GPU if available (otherwise CPU), and establishing a mapping from model output
        indices to entity types.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)
        self.model.eval()
        self.entity_map = {
            0: "O",
            1: "B-MISC",
            2: "I-MISC",
            3: "B-PER",
            4: "I-PER",
            5: "B-ORG",
            6: "I-ORG",
            7: "B-LOC",
            8: "I-LOC",
        }

    def process_sentences(self, sentences):
        """
        Processes input sentences to identify named entities and reconstructs the sentences,
        replacing tokens predicted as persons with <PER>.

        Parameters:
        sentences (list of str): The sentences to be processed for named entity recognition.

        Returns:
        list of str: One reconstructed sentence per input sentence.
        """
        if not sentences:
            return []

        dataset = SentenceDataset(sentences, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
        paragraph = []
        for data in dataloader:
            input_ids = data['input_ids'].to(self.device)
            attention_mask = data['attention_mask'].to(self.device)
            with torch.no_grad():
                logits = self.model(input_ids, attention_mask=attention_mask).logits

            # The DataLoader hands word_ids over as one tensor per token
            # position; stack and transpose to get one row per sentence.
            word_ids = torch.stack(data['word_ids']).t().tolist()
            predictions = torch.argmax(logits, dim=2).tolist()
            for row_word_ids, row_input_ids, row_predictions in zip(word_ids, input_ids.tolist(), predictions):
                tokens = self.tokenizer.convert_ids_to_tokens(row_input_ids)
                paragraph.append(self._rebuild_sentence(row_word_ids, tokens, row_predictions))
        return paragraph

    def _rebuild_sentence(self, word_ids, tokens, predictions):
        """
        Rebuilds one sentence from its word-piece tokens, masking person entities.

        Parameters:
        word_ids (list of int): Word index per token, -1 for tokens belonging to no word.
        tokens (list of str): Word-piece tokens of the sentence.
        predictions (list of int): Predicted label index per token (keys of entity_map).

        Returns:
        str: The reconstructed sentence, tokens joined with single spaces.
        """
        rebuilt = []
        # Set after re-assembling a "<...>" tag to swallow its closing ">".
        # Kept per sentence so it can never leak into the next one.
        skip_closing = False
        for word_id, token, prediction in zip(word_ids, tokens, predictions):
            if word_id == -1 or token in ("[CLS]", "[SEP]", "[PAD]"):
                skip_closing = False
                continue
            if skip_closing:
                skip_closing = False
                # Only the closing bracket is swallowed; any other token is
                # real content and must be kept.
                if token == ">":
                    continue

            entity = self.entity_map[prediction]

            if entity in ("B-PER", "I-PER"):
                # Consecutive person tokens collapse into a single <PER>.
                if not rebuilt or rebuilt[-1] != "<PER>":
                    rebuilt.append("<PER>")
            elif token.startswith("##"):
                if not rebuilt or rebuilt[-1] == "<PER>":
                    # Continuation of a word that is already masked (or with
                    # nothing to attach to): drop it rather than emit
                    # something like "<PER>son".
                    continue
                if len(rebuilt) > 1 and rebuilt[-2] == '<':
                    # Pieces such as '<', 'P', '##ER' are glued back into one
                    # bracketed tag, e.g. '<PER>'.
                    rebuilt[-1] = '<' + rebuilt[-1] + token[2:] + '>'
                    rebuilt.pop(-2)
                    skip_closing = True
                else:
                    # Ordinary word-piece continuation: append to the previous token.
                    rebuilt[-1] = rebuilt[-1] + token[2:]
            else:
                rebuilt.append(token.strip())

        return " ".join(rebuilt)
NER_Wrapper/__init__.py ADDED
File without changes
NER_Wrapper/__pycache__/NER_Wrapper.cpython-311.pyc ADDED
Binary file (2.13 kB). View file
 
NER_Wrapper/__pycache__/NameExtractors.cpython-311.pyc ADDED
Binary file (19.9 kB). View file
 
NER_Wrapper/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (195 Bytes). View file
 
README.md CHANGED
@@ -1,13 +1,13 @@
1
- ---
2
- title: EmotionPredictor
3
- emoji: 📊
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 4.29.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: EmotionPredictor
3
+ emoji: 📊
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.29.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from EmotionClassifier.EmotionPredictor import EmotionPredictor
3
+ from NER_Wrapper.NER_Wrapper import FullNERPipeline
4
+ import warnings
5
+ import pandas as pd
6
+
7
# Suppress every warning raised from here on.
warnings.filterwarnings('ignore')

# Both models are built once at import time and reused by every request;
# the emotion predictor is constructed before the NER pipeline.
emotion_predictor = EmotionPredictor()
ner_pipe = FullNERPipeline()
10
+
11
def predict(description):
    """
    Runs the NER pipeline on a description and scores the result for emotions.

    Parameters:
    description (str): Free text entered by the user.

    Returns:
    pandas.DataFrame: The emotion scores produced by `emotion_predictor`;
    an empty DataFrame when the description is missing or blank.
    """
    # Nothing to analyse: skip both models instead of feeding them blank text.
    if not description or not description.strip():
        return pd.DataFrame()

    ner_text = ner_pipe.process_text(description)
    emotions = emotion_predictor(ner_text)
    return pd.DataFrame(emotions)
15
+
16
# Gradio UI: a text area in, a dataframe of emotion scores out.
iface = gr.Interface(
    fn=predict,
    inputs="textarea",
    outputs="dataframe",
    live=False,
    title="Emotion Prediction",
)

# The interface was built but never started; launch it when the file is run
# as a script, while keeping plain imports free of side effects.
if __name__ == "__main__":
    iface.launch()
models/NER_Models/idx2tag.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": "O", "1": "I-tim", "2": "I-per", "3": "I-org", "4": "I-nat", "5": "I-gpe", "6": "I-geo", "7": "I-eve", "8": "I-art", "9": "B-tim", "10": "B-per", "11": "B-org", "12": "B-nat", "13": "B-gpe", "14": "B-geo", "15": "B-eve", "16": "B-art"}
models/NER_Models/torch_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d25daee929a47864a3a6161c45fb201ce51d061d9974d8aff7127efa9471023d
3
+ size 531083290
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ transformers==4.39.3
2
+ torch==2.1.1
3
+ gradio==4.29.0
4
+ pandas==2.1.1