Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +35 -35
- EmotionClassifier/EmotionPredictor.py +40 -0
- EmotionClassifier/__init__.py +0 -0
- EmotionClassifier/__pycache__/EmotionPredictor.cpython-311.pyc +0 -0
- EmotionClassifier/__pycache__/__init__.cpython-311.pyc +0 -0
- NER_Wrapper/NER_Wrapper.py +38 -0
- NER_Wrapper/NameExtractors.py +315 -0
- NER_Wrapper/__init__.py +0 -0
- NER_Wrapper/__pycache__/NER_Wrapper.cpython-311.pyc +0 -0
- NER_Wrapper/__pycache__/NameExtractors.cpython-311.pyc +0 -0
- NER_Wrapper/__pycache__/__init__.cpython-311.pyc +0 -0
- README.md +13 -13
- app.py +23 -0
- models/NER_Models/idx2tag.json +1 -0
- models/NER_Models/torch_model.pth +3 -0
- requirements.txt +4 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
EmotionClassifier/EmotionPredictor.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from transformers import pipeline
|
4 |
+
from huggingface_hub import PyTorchModelHubMixin
|
5 |
+
|
6 |
+
class EmotionPredictor(nn.Module, PyTorchModelHubMixin):
    """Zero-shot emotion scorer built on the ``facebook/bart-large-mnli`` pipeline.

    The input text is cut into overlapping token windows, every window is
    scored against a fixed set of emotion labels, and the per-window scores
    are combined into one vector using each window's whitespace word count
    as its weight.
    """

    # Candidate labels in alphabetical order; ``forward`` returns scores in this order.
    EMOTIONS = ('anger', 'disgust', 'fear', 'inspiration', 'joy', 'love',
                'neutral', 'sadness', 'surprise')
    # Largest window the caller is willing to use, before the model limit is applied.
    MAX_CHUNK_TOKENS = 1096
    # Tokens kept free in every window for the hypothesis sentence and the
    # special tokens the zero-shot pipeline adds when it re-tokenizes a window.
    HYPOTHESIS_RESERVE = 32
    # Number of tokens shared by two consecutive windows.
    CHUNK_STRIDE = 10

    def __init__(self):
        """Create the zero-shot pipeline on GPU when available, otherwise on CPU."""
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli", device=self.device)
        self.tokenizer = self.classifier.tokenizer

    def _chunk_token_limit(self):
        """Return the window size in tokens, capped by the tokenizer's model limit.

        A window longer than the model's maximum input would be truncated (or
        rejected) once the pipeline appends the hypothesis, so room for the
        hypothesis is reserved up front.
        """
        model_limit = getattr(self.tokenizer, "model_max_length", self.MAX_CHUNK_TOKENS)
        return min(self.MAX_CHUNK_TOKENS, model_limit - self.HYPOTHESIS_RESERVE)

    def forward(self, payload):
        """Score ``payload`` against every label in ``EMOTIONS``.

        Parameters:
            payload (str): The text to classify.

        Returns:
            numpy.ndarray: One float32 score per label, ordered like ``EMOTIONS``.
        """
        encoded = self.tokenizer.encode(
            payload,
            return_tensors="pt",
            return_overflowing_tokens=True,
            stride=self.CHUNK_STRIDE,
            max_length=self._chunk_token_limit(),
            truncation=True,
            padding=True,
        )

        # Turn every window back into plain text without BART's special tokens.
        special_tokens = {'<s>', '</s>', self.tokenizer.pad_token}
        sentences = []
        word_counts = []
        for chunk_ids in encoded:
            pieces = [token for token in self.tokenizer.convert_ids_to_tokens(chunk_ids) if token not in special_tokens]
            chunk_text = self.tokenizer.convert_tokens_to_string(pieces)
            sentences.append(chunk_text)
            word_counts.append(len(chunk_text.split()))

        labels = list(self.EMOTIONS)
        if not sentences:
            return torch.zeros(len(labels)).numpy()

        counts = torch.tensor(word_counts, dtype=torch.float32)
        total_words = counts.sum()
        if total_words > 0:
            weights = counts / total_words
        else:
            # No words at all (e.g. empty input): weight the windows equally
            # instead of dividing by zero and returning NaN scores.
            weights = torch.full_like(counts, 1.0 / len(word_counts))

        predictions = self.classifier(sentences, labels, multi_label=True)
        if isinstance(predictions, dict):
            # Defensive: a single result may come back unwrapped.
            predictions = [predictions]

        # The pipeline orders labels by score; re-order every row to ``labels``.
        rows = []
        for prediction in predictions:
            score_by_label = dict(zip(prediction['labels'], prediction['scores']))
            rows.append([score_by_label[label] for label in labels])

        weighted_scores = (weights.unsqueeze(1) * torch.tensor(rows)).sum(dim=0)
        return weighted_scores.cpu().numpy()
|
EmotionClassifier/__init__.py
ADDED
File without changes
|
EmotionClassifier/__pycache__/EmotionPredictor.cpython-311.pyc
ADDED
Binary file (4.47 kB). View file
|
|
EmotionClassifier/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (183 Bytes). View file
|
|
NER_Wrapper/NER_Wrapper.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gc
|
2 |
+
from NER_Wrapper.NameExtractors import NERWrapper,NextPassNERWrapper
|
3 |
+
|
4 |
+
class FullNERPipeline:
    """Two-pass named-entity pipeline: a custom NER model followed by a second NER pass."""

    def __init__(self, model_path='models/NER_Models/torch_model.pth', idx2tag_path='models/NER_Models/idx2tag.json'):
        """
        Initializes both NER stages.

        Parameters:
        model_path (str): Path to the checkpoint loaded by the first-pass NERWrapper.
        idx2tag_path (str): Path to the index-to-tag mapping file used by the first pass.
        """
        # First pass: the locally stored model plus its index-to-tag mapping.
        self.ner_wrapper = NERWrapper(model_path, idx2tag_path)

        # Second pass: NextPassNERWrapper loads its own pre-trained model.
        self.next_ner_wrapper = NextPassNERWrapper()

    def process_text(self, text):
        """
        Runs the text through both NER passes and returns the processed text.

        Parameters:
        text (str): The input text; it is split into sentences on '.'.

        Returns:
        str: The processed sentences joined with single spaces ("" when the
        text contains no non-blank sentence).
        """
        # Splitting on '.' leaves blank fragments (e.g. after the final period);
        # there is nothing to recognise in them, so they are not sent to the models.
        fragments = [fragment for fragment in text.split('.') if fragment.strip()]
        if not fragments:
            return ""

        # First, evaluate the sentences using the initial NER model.
        evaluated_text = self.ner_wrapper.evaluate_text(fragments)

        # Next, process the sentences through the second NER pass.
        ner_text = self.next_ner_wrapper.process_sentences(evaluated_text)

        # Manually collect garbage to manage memory when dealing with large models or data.
        gc.collect()

        return " ".join(ner_text)
|
NER_Wrapper/NameExtractors.py
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from transformers import DistilBertForTokenClassification, AutoTokenizer, AutoModelForTokenClassification
|
4 |
+
from torch.utils.data import Dataset, DataLoader, TensorDataset
|
5 |
+
import json
|
6 |
+
import gc
|
7 |
+
|
8 |
+
class BertNER(nn.Module):
    """
    A custom PyTorch Module for Named Entity Recognition (NER) using DistilBertForTokenClassification.
    """
    def __init__(self, token_dims):
        """
        Initializes the BertNER model.

        Parameters:
        token_dims (int): The number of labels of the NER task; must be positive.

        Raises:
        TypeError: If token_dims is not an integer.
        ValueError: If token_dims is not positive.
        """
        # bool is a subclass of int but is not a meaningful label count.
        if isinstance(token_dims, bool) or not isinstance(token_dims, int):
            raise TypeError("Token Dimensions should be an integer")
        if token_dims <= 0:
            raise ValueError("Token Dimensions should be a positive integer")
        super().__init__()

        self.pretrained_model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=token_dims)

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Forward pass of the model.

        Parameters:
        input_ids (torch.Tensor): Tensor of token ids to be fed to DistilBERT.
        attention_mask (torch.Tensor): Tensor indicating which tokens should be attended to by the model.
        labels (torch.Tensor, optional): Tensor of actual labels, forwarded to the wrapped model.

        Returns:
        The wrapped model's output, which varies depending on whether labels are provided.
        """
        # One call covers both cases: labels is forwarded as-is, including None.
        # (Previously the model was run twice when no labels were given.)
        return self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
|
45 |
+
|
46 |
+
class SentenceDataset(TensorDataset):
    """
    Custom Dataset class for sentences, handling tokenization and preparing inputs for the NER model.
    """
    def __init__(self, sentences, tokenizer, max_length=256):
        """
        Initializes the SentenceDataset.

        Parameters:
        sentences (list of str): The list of sentences to be processed.
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer for converting sentences to model inputs.
        max_length (int): Maximum length of the tokenized output.
        """
        # Each sentence is stored as a list of whitespace-separated words.
        self.sentences = [sentence.split() for sentence in sentences]
        self.tokenizer = tokenizer
        self.max_length = max_length
        # is_split_into_words=True expects word lists, so the split sentences are
        # passed here (the raw strings would be read as one pre-split sentence).
        self.text = self.tokenizer(self.sentences, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt", is_split_into_words=True)

    def __len__(self):
        """Returns the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """
        Retrieves an item from the dataset by index.

        Parameters:
        idx (int): Index of the item to retrieve.

        Returns:
        A dictionary with:
        input_ids / attention_mask: the encoded sentence with the leading batch axis squeezed away,
        word_ids: the word index of every token, with None replaced by -1,
        sentences: the complete list of split sentences (not only the one at idx).
        """
        sentence = self.sentences[idx]
        # The sentence is tokenized on its own here so word_ids() is available for it.
        encoded_sentence = self.tokenizer(sentence, padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt", is_split_into_words=True)
        # None cannot be collated into a tensor, so it is mapped to -1.
        word_ids = [-1 if x is None else x for x in encoded_sentence.word_ids()]
        return {
            "input_ids": encoded_sentence.input_ids.squeeze(0),
            "attention_mask": encoded_sentence.attention_mask.squeeze(0),
            'word_ids': word_ids,
            "sentences": self.sentences,
        }
|
81 |
+
|
82 |
+
class NERWrapper:
    """
    A wrapper class for the Named Entity Recognition (NER) model, simplifying the process of model loading,
    prediction, and utility functions.
    """
    def __init__(self, model_path, idx2tag_path, tokenizer_path='distilbert-base-uncased', token_dims=17):
        """
        Initializes the NERWrapper.

        Parameters:
        model_path (str): Path to the pre-trained NER model checkpoint.
        idx2tag_path (str): Path to the index-to-tag mapping file, for decoding model predictions.
        tokenizer_path (str): Path or identifier for the tokenizer to be used.
        token_dims (int): The number of labels in the NER task.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
        self.model = BertNER(token_dims=token_dims)
        self.idx2tag = self.load_idx2tag(idx2tag_path)
        self.load_model(model_path)

    def load_model(self, model_path):
        """
        Loads the model weights from a checkpoint and switches the model to evaluation mode.

        Parameters:
        model_path (str): Path to a checkpoint holding the weights under 'model_state_dict'.
        """
        map_location = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
        checkpoint = torch.load(model_path, map_location=map_location)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        # This wrapper is only used for prediction.
        self.model.eval()

    def load_idx2tag(self, idx2tag_path):
        """
        Loads the index-to-tag mapping from a JSON file.

        Parameters:
        idx2tag_path (str): Path to the index-to-tag mapping file.

        Returns:
        dict: A dictionary mapping integer indices to tags. JSON object keys are
        strings, so they are converted to int; non-dict content is returned unchanged.
        """
        with open(idx2tag_path, 'r', encoding='utf-8') as file:
            idx2tag = json.load(file)
        if isinstance(idx2tag, dict):
            return {int(k): v for k, v in idx2tag.items()}
        return idx2tag

    def align_word_ids(self, texts, input_tensor, label_all_tokens=False):
        """
        Builds a keep/ignore marker for every token from its word id.

        Parameters:
        texts: Unused; kept for interface compatibility.
        input_tensor (torch.Tensor): Word ids, one row per sentence; -1 marks tokens without a word.
        label_all_tokens (bool): Whether to mark all tokens of a word or only its first token.

        Returns:
        list of list of int: Per token, 1 when the token is kept and -100 when it is ignored.
        """
        all_label_ids = []
        for word_ids in input_tensor:
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Plain int for comparison (works for tensor elements and ints).
                word_idx = int(word_idx)
                if word_idx == -1:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    # First token of a word.
                    label_ids.append(1)
                else:
                    # Continuation token of the same word.
                    label_ids.append(1 if label_all_tokens else -100)
                previous_word_idx = word_idx
            all_label_ids.append(label_ids)
        return all_label_ids

    def evaluate_text(self, sentences):
        """
        Evaluates texts using the NER model, returning the prediction results.

        Parameters:
        sentences (list of str): List of sentences to evaluate.

        Returns:
        list of str: The modified sentences with identified person entities replaced with <PER>.
        """
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)
        dataset = SentenceDataset(sentences, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
        predictions = []
        # Inference only: without no_grad every batch would keep an autograd graph alive.
        with torch.no_grad():
            for data in dataloader:
                # Load the attention mask and the input ids.
                mask = data['attention_mask'].to(device)
                input_id = data['input_ids'].to(device)
                # The default collate yields one tensor per token position;
                # stacking and transposing gives one row of word ids per sentence.
                batch_word_ids = torch.stack((data['word_ids'])).t()
                label_ids = torch.tensor(self.align_word_ids(data['sentences'][0], batch_word_ids), device=device)
                logits = self.model(input_id, mask, None).logits
                for i in range(logits.shape[0]):
                    # Keep only the logits of tokens marked as kept (first token of each word).
                    logits_clean = logits[i][label_ids[i] != -100]
                    # Most likely label index per kept token.
                    predictions.append(logits_clean.argmax(dim=1).tolist())
                del mask, input_id, label_ids
                gc.collect()
                torch.cuda.empty_cache()
        prediction_label = [[self.idx2tag[i] for i in prediction] for prediction in predictions]

        return self.replace_sentence_with_tokens([sentence.split() for sentence in sentences], prediction_label)

    def replace_sentence_with_tokens(self, sentences, prediction_labels):
        """
        Replaces identified person entities in sentences with <PER> based on the model's predictions.

        Parameters:
        sentences (list of list of str): Sentences split into words.
        prediction_labels (list of list of str): Labels predicted by the model for each word.

        Returns:
        list of str: Modified sentences. A 'B-per' word becomes '<PER>', an 'I-per'
        word is dropped, every other word is kept. Words without a label (labels
        shorter than the sentence) are left out, because words and labels are zipped.
        """
        modified_sentences = []
        for words, tags in zip(sentences, prediction_labels):
            modified_sentence = []
            for word, tag in zip(words, tags):
                if tag == 'B-per':
                    modified_sentence.append('<PER>')
                elif tag != 'I-per':
                    modified_sentence.append(word)
                # 'I-per' words continue a name and are dropped.
            modified_sentences.append(" ".join(modified_sentence))

        return modified_sentences
|
236 |
+
|
237 |
+
class NextPassNERWrapper:
    """
    This class wraps around a pretrained BERT model for Named Entity Recognition (NER) tasks,
    simplifying the process of sentence processing, entity recognition, and sentence reconstruction
    with entity tags.
    """
    def __init__(self):
        """
        Loads the "dslim/bert-base-NER" tokenizer and model, moves the model to the GPU when
        one is available (otherwise CPU) and sets up the mapping from model output indices
        to entity types.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model.to(self.device)
        # This wrapper is only used for prediction.
        self.model.eval()
        self.entity_map = {
            0: "O",
            1: "B-MISC",
            2: "I-MISC",
            3: "B-PER",
            4: "I-PER",
            5: "B-ORG",
            6: "I-ORG",
            7: "B-LOC",
            8: "I-LOC",
        }

    def _reconstruct_sentence(self, word_ids, tokens, predictions):
        """
        Rebuilds one sentence from its tokens, replacing person entities with <PER>.

        Parameters:
        word_ids: Word index per token; -1 marks tokens that belong to no word.
        tokens (list of str): Token strings of the sentence.
        predictions: Predicted label index per token.

        Returns:
        str: The rebuilt tokens joined with single spaces.
        """
        person_tags = ("B-PER", "I-PER")
        special_tokens = ("[CLS]", "[SEP]", "[PAD]")
        reconstructed_tokens = []
        skip_next = False
        for word_id_token, token, prediction_token in zip(word_ids, tokens, predictions):
            if int(word_id_token) == -1 or token in special_tokens or skip_next:
                skip_next = False
                continue

            entity = self.entity_map[int(prediction_token)]

            if entity in person_tags:
                # Consecutive person tokens collapse into a single <PER>.
                if not reconstructed_tokens or reconstructed_tokens[-1] != "<PER>":
                    reconstructed_tokens.append("<PER>")
            elif token.startswith("##"):
                if len(reconstructed_tokens) > 1 and reconstructed_tokens[-2] == '<':
                    # '<', 'X', '##Y' is folded back into '<XY>' and the token after it is
                    # skipped - presumably the closing '>' of a placeholder such as <PER>
                    # that the tokenizer split apart (TODO confirm).
                    reconstructed_tokens[-1] = '<' + reconstructed_tokens[-1] + token[2:] + '>'
                    reconstructed_tokens.pop(-2)
                    skip_next = True
                elif reconstructed_tokens:
                    # Glue the sub-word piece onto the previous token.
                    reconstructed_tokens[-1] = reconstructed_tokens[-1] + token[2:]
                else:
                    # Nothing to glue onto; keep the piece instead of raising IndexError.
                    reconstructed_tokens.append(token[2:])
            else:
                reconstructed_tokens.append(token.strip())

        return " ".join(reconstructed_tokens)

    def process_sentences(self, sentences):
        """
        Processes input sentences to identify named entities and reconstructs the sentences
        by tagging person entities or merging tokens based on the model's predictions. It uses
        SentenceDataset and a DataLoader for batch processing.

        Parameters:
        sentences (list of str): The sentences to be processed for named entity recognition.

        Returns:
        list of str: The list of processed sentences with entities tagged or tokens modified.
        """
        dataset = SentenceDataset(sentences, self.tokenizer)
        dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
        paragraph = []
        for data in dataloader:
            input_ids = data['input_ids'].to(self.device)
            attention_mask = data['attention_mask'].to(self.device)
            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask).logits

            # The default collate yields one tensor per token position;
            # stacking and transposing gives one row of word ids per sentence.
            word_ids = torch.stack((data['word_ids'])).t()
            tokens = [self.tokenizer.convert_ids_to_tokens(X) for X in input_ids.cpu().numpy()]
            predictions = torch.argmax(outputs, dim=2).cpu().numpy()
            for word_id, tokens_single, prediction in zip(word_ids, tokens, predictions):
                paragraph.append(self._reconstruct_sentence(word_id, tokens_single, prediction))
        return paragraph
|
NER_Wrapper/__init__.py
ADDED
File without changes
|
NER_Wrapper/__pycache__/NER_Wrapper.cpython-311.pyc
ADDED
Binary file (2.13 kB). View file
|
|
NER_Wrapper/__pycache__/NameExtractors.cpython-311.pyc
ADDED
Binary file (19.9 kB). View file
|
|
NER_Wrapper/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (195 Bytes). View file
|
|
README.md
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
---
|
2 |
-
title: EmotionPredictor
|
3 |
-
emoji: π
|
4 |
-
colorFrom: green
|
5 |
-
colorTo: gray
|
6 |
-
sdk: gradio
|
7 |
-
sdk_version: 4.29.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
-
|
13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
---
|
2 |
+
title: EmotionPredictor
|
3 |
+
emoji: π
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: gray
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.29.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: mit
|
11 |
+
---
|
12 |
+
|
13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from EmotionClassifier.EmotionPredictor import EmotionPredictor
|
3 |
+
from NER_Wrapper.NER_Wrapper import FullNERPipeline
|
4 |
+
import warnings
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
# Silence all Python warnings for the lifetime of the app.
warnings.filterwarnings('ignore')

# Both models are created once at import time and shared by every request.
emotion_predictor, ner_pipe = EmotionPredictor(), FullNERPipeline()


def predict(description):
    """Run the NER pipeline on ``description`` and score the result for emotions.

    Parameters:
        description (str): Free text entered in the UI.

    Returns:
        pandas.DataFrame: The scores returned by the emotion predictor, wrapped
        in a DataFrame for the "dataframe" output component.
    """
    ner_text = ner_pipe.process_text(description)
    emotions = emotion_predictor(ner_text)
    return pd.DataFrame(emotions)


iface = gr.Interface(
    fn=predict,
    inputs="textarea",
    outputs="dataframe",
    live=False,
    title="Emotion Prediction"
)

# Start the web server when the file is run as a script (e.g. `python app.py`);
# importing the module only builds the interface.
if __name__ == "__main__":
    iface.launch()
|
23 |
+
|
models/NER_Models/idx2tag.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"0": "O", "1": "I-tim", "2": "I-per", "3": "I-org", "4": "I-nat", "5": "I-gpe", "6": "I-geo", "7": "I-eve", "8": "I-art", "9": "B-tim", "10": "B-per", "11": "B-org", "12": "B-nat", "13": "B-gpe", "14": "B-geo", "15": "B-eve", "16": "B-art"}
|
models/NER_Models/torch_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d25daee929a47864a3a6161c45fb201ce51d061d9974d8aff7127efa9471023d
|
3 |
+
size 531083290
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers==4.39.3
|
2 |
+
torch==2.1.1
|
3 |
+
gradio==4.29.0
|
4 |
+
pandas==2.1.1
|