Shanks0465 committed on
Commit
d9ae7cd
1 Parent(s): 33f37c9

Added app.py

Browse files
Files changed (2) hide show
  1. app.py +44 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
4
+
5
# Both the tokenizer and the token-classification head come from the same
# Hugging Face checkpoint, so the identifier is spelled out only once.
MODEL_CHECKPOINT = "ai4bharat/IndicNER"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT)
8
+
9
+
10
def get_ner(sentence):
    """Tag every whitespace-separated word of ``sentence`` with an NER label.

    The sentence is split on whitespace, the resulting words are run through
    the module-level ``tokenizer`` and ``model``, and each word receives the
    predicted class of its first sub-token.

    Args:
        sentence: Text to tag. ``None`` or blank text is accepted.

    Returns:
        A list of ``(word, label)`` tuples in input order. ``label`` is the
        entry of ``model.config.id2label`` predicted for the word's first
        sub-token, or ``None`` when the tokenizer produced no token for that
        word (for example because it was truncated away). An empty list is
        returned for ``None``/blank input.
    """
    # Nothing to tag for None / empty / whitespace-only input; returning early
    # also avoids indexing into an empty label list.
    words = sentence.split() if sentence else []
    if not words:
        return []

    # Tokenize the pre-split words so that word_ids() indexes directly into
    # `words`. Tokenizing the raw string and separately splitting it on ' '
    # pairs labels with the wrong words whenever the tokenizer's own word
    # boundaries differ from spaces (e.g. punctuation attached to a word).
    # truncation=True keeps over-long input within the tokenizer's limit.
    tok_sentence = tokenizer(
        words, is_split_into_words=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        predictions = model(**tok_sentence).logits.argmax(-1)
    predicted_tokens_classes = [
        model.config.id2label[t.item()] for t in predictions[0]]

    # Keep only the label of the first sub-token of each word. Tokens whose
    # word id is None (special tokens) carry no word and are skipped.
    word_labels = {}
    for token_index, word_id in enumerate(tok_sentence.word_ids()):
        if word_id is not None and word_id not in word_labels:
            word_labels[word_id] = predicted_tokens_classes[token_index]

    return [(word, word_labels.get(index)) for index, word in enumerate(words)]
36
+
37
+
38
# Sample sentences offered to the user as ready-made inputs.
demo_examples = [
    'लगातार हमलावर हो रहे शिवपाल और राजभर को सपा की दो टूक, चिट्ठी जारी कर कहा- जहां जाना चाहें जा सकते हैं',
    'ಶರಣ್ ರ ನೀವು ನೋಡಲೇಬೇಕಾದ ಟಾಪ್ 5 ಕಾಮಿಡಿ ಚಲನಚಿತ್ರಗಳು',
]

# Descriptive text passed to the interface as its `article`.
demo_article = 'IndicNER is a model trained to complete the task of identifying named entities from sentences in Indian languages. Our model is specifically fine-tuned to the 11 Indian languages mentioned above over millions of sentences. The model is then benchmarked over a human annotated testset and multiple other publicly available Indian NER datasets. The 11 languages covered by IndicNER are: Assamese, Bengali, Gujarati, Hindi, Kannada, Malayalam, Marathi, Oriya, Punjabi, Tamil, Telugu.'

# Wire get_ner to a textbox input and a highlighted-text output.
iface = gr.Interface(
    fn=get_ner,
    inputs=gr.Textbox(placeholder="Enter sentence here..."),
    outputs=["highlight"],
    examples=demo_examples,
    title='IndicNER',
    article=demo_article,
)

iface.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ transformers
2
+ torch
3
+ sentencepiece==0.1.95
4
+ datasets
5
+ seqeval