Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import subprocess
|
4 |
+
import sys
|
5 |
+
|
6 |
+
def install(package):
    """Install *package* into the current interpreter's environment via pip."""
    pip_cmd = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_cmd)
|
8 |
+
|
9 |
+
# Runtime-install the third-party dependencies this Space needs, in the
# same order as before (numpy first, unidecode last).
for _pkg in ("numpy", "torch", "transformers", "unidecode"):
    install(_pkg)
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
import torch
|
16 |
+
from transformers import AutoTokenizer
|
17 |
+
from transformers import BertForTokenClassification
|
18 |
+
from collections import Counter
|
19 |
+
from unidecode import unidecode
|
20 |
+
import string
|
21 |
+
import re
|
22 |
+
|
23 |
+
# Hugging Face auth token for the model repository.
# NOTE(review): an env var named with a hyphen ("AUTH-TOKEN") cannot be set
# from most shells — confirm this matches the Space's configured secret name.
auth_token = os.environ.get("AUTH-TOKEN")

# NOTE(review): `use_auth_token` is deprecated in newer transformers releases
# in favour of `token` — confirm against the pinned transformers version.
tokenizer = AutoTokenizer.from_pretrained("osiria/bert-base-cased-ner-en", use_auth_token=auth_token)
model = BertForTokenClassification.from_pretrained("osiria/bert-base-cased-ner-en", num_labels = 5, use_auth_token=auth_token)
# Inference runs on CPU only.
device = torch.device("cpu")
model = model.to(device)
# Switch to evaluation mode (disables dropout) for deterministic inference.
model.eval()

from transformers import pipeline
# device=-1 keeps the pipeline on CPU, matching the model placement above.
ner = pipeline('ner', model=model, tokenizer=tokenizer, device=-1)
|
33 |
+
|
34 |
+
|
35 |
+
header = '''--------------------------------------------------------------------------------------------------
|
36 |
+
<style>
|
37 |
+
.vertical-text {
|
38 |
+
writing-mode: vertical-lr;
|
39 |
+
text-orientation: upright;
|
40 |
+
background-color:red;
|
41 |
+
}
|
42 |
+
</style>
|
43 |
+
<center>
|
44 |
+
<body>
|
45 |
+
<span class="vertical-text" style="background-color:lightgreen;border-radius: 3px;padding: 3px;"> </span>
|
46 |
+
<span class="vertical-text" style="background-color:orange;border-radius: 3px;padding: 3px;"> D</span>
|
47 |
+
<span class="vertical-text" style="background-color:lightblue;border-radius: 3px;padding: 3px;"> E</span>
|
48 |
+
<span class="vertical-text" style="background-color:tomato;border-radius: 3px;padding: 3px;"> M</span>
|
49 |
+
<span class="vertical-text" style="background-color:lightgrey;border-radius: 3px;padding: 3px;"> O</span>
|
50 |
+
<span class="vertical-text" style="background-color:#CF9FFF;border-radius: 3px;padding: 3px;"> </span>
|
51 |
+
</body>
|
52 |
+
</center>
|
53 |
+
<br>
|
54 |
+
<center>(BETA)</center>
|
55 |
+
'''
|
56 |
+
|
57 |
+
# Map model entity-group labels to the display tags used by extract()
# ("O" means no entity).
maps = {"O": "NONE", "PER": "PER", "LOC": "LOC", "ORG": "ORG", "MISC": "MISC", "DATE": "DATE"}

# Month names (Italian and English) for the date-matching regex.
reg_month = "(?:gennaio|febbraio|marzo|aprile|maggio|giugno|luglio|agosto|settembre|ottobre|novembre|dicembre|january|february|march|april|may|june|july|august|september|october|november|december)"

# Alternation of supported date formats, built incrementally:
#   "1° march 2020" / "primo gennaio 2020" / "1º march 2020"
reg_date = r"(?:\d{1,2}\°{0,1}|primo|\d{1,2}\º{0,1})" + " " + reg_month + " " + r"\d{4}|"
#   "march 2020"
reg_date = reg_date + reg_month + " " + r"\d{4}|"
#   "12 march" — BUG FIX: this alternative previously lacked its trailing "|",
#   which fused it with the numeric branch below and corrupted both patterns
#   (neither "12 march" nor "12/03/2020" could ever match).
reg_date = reg_date + r"\d{1,2}" + " " + reg_month + "|"
#   "12/03/2020" or "12.03.2020"
reg_date = reg_date + r"\d{1,2}" + r"(?:\/|\.)\d{1,2}(?:\/|\.)" + r"\d{4}|"
#   bare 4-digit years following Italian prepositions, e.g. "nel 1780"
reg_date = reg_date + r"(?<=dal )\d{4}|(?<=al )\d{4}|(?<=nel )\d{4}|(?<=anno )\d{4}|(?<=del )\d{4}|"
#   "753 a.c." / "1492 d.c."
reg_date = reg_date + r"\d{1,5} a\.c\.|\d{1,5} d\.c\."

# Typographic punctuation normalised to ASCII before tokenization.
map_punct = {"’": "'", "«": '"', "»": '"', "”": '"', "“": '"', "–": "-", "$": ""}
# Token id treated as the tokenizer's unknown token.
# NOTE(review): assumed to equal this tokenizer's [UNK] id — confirm.
unk_tok = 9005
# Entity-merging thresholds used by extract(): entities scoring below
# merge_th_1 may be merged into an adjacent one; a merged span keeps score
# merge_th_1 only if the absorbed score exceeded merge_th_2; min_th filters
# the final entity list.
merge_th_1 = 0.8
merge_th_2 = 0.4
min_th = 0.5
71 |
+
def extract(text):
    """Run NER over *text* and return an HTML string with highlighted entities.

    The input is normalised, split into newline-separated paragraphs, tagged
    with the global `ner` pipeline plus a regex-based date matcher, and each
    entity span is wrapped in a colour-coded HTML <span>.  Entity words that
    recur are appended as a "Tags" line; a warning line is appended when the
    tokenizer produced unknown tokens.
    """

    text = text.strip()
    # Normalise typographic punctuation (curly quotes, dashes, ...) to ASCII.
    for mp in map_punct:
        text = text.replace(mp, map_punct[mp])
    # Strip Wikipedia-style citation markers such as "[3]".
    text = re.sub("\[\d+\]", "", text)

    # Set when the tokenizer emits the unknown-token id anywhere in the input.
    warn_flag = False

    res_total = []
    out_text = ""

    # Process the input one newline-separated paragraph at a time.
    for p_text in text.split("\n"):

        if p_text:

            # NOTE(review): unk_tok is assumed to be this tokenizer's [UNK]
            # id — confirm against the tokenizer's vocabulary.
            toks = tokenizer.encode(p_text)
            if unk_tok in toks:
                warn_flag = True

            res_orig = ner(p_text, aggregation_strategy = "first")
            # Drop entity spans of a single character.
            res_orig = [el for r, el in enumerate(res_orig) if len(el["word"].strip()) > 1]
            res = []

            # Merge low-confidence entities into an adjacent, higher-scoring
            # neighbour (previous kept entity first, else the next raw one).
            # A merged span's score becomes merge_th_1 if the absorbed score
            # exceeded merge_th_2, otherwise 0 (dropped by the filter below).
            for r, ent in enumerate(res_orig):
                if r > 0 and ent["score"] < merge_th_1 and ent["start"] <= res[-1]["end"] + 1 and ent["score"] <= res[-1]["score"]:
                    res[-1]["word"] = res[-1]["word"] + " " + ent["word"]
                    res[-1]["score"] = merge_th_1*(res[-1]["score"] > merge_th_2)
                    res[-1]["end"] = ent["end"]
                elif r < len(res_orig) - 1 and ent["score"] < merge_th_1 and res_orig[r+1]["start"] <= ent["end"] + 1 and res_orig[r+1]["score"] > ent["score"]:
                    res_orig[r+1]["word"] = ent["word"] + " " + res_orig[r+1]["word"]
                    res_orig[r+1]["score"] = merge_th_1*(res_orig[r+1]["score"] > merge_th_2)
                    res_orig[r+1]["start"] = ent["start"]
                else:
                    res.append(ent)

            # Keep only entities whose (possibly merged) score clears min_th.
            res = [el for r, el in enumerate(res) if el["score"] >= min_th]

            # Add regex-detected dates as synthetic DATE entities, score 1.0.
            dates = [{"entity_group": "DATE", "score": 1.0, "word": p_text[el.span()[0]:el.span()[1]], "start": el.span()[0], "end": el.span()[1]} for el in re.finditer(reg_date, p_text, flags = re.IGNORECASE)]
            res.extend(dates)
            res = sorted(res, key = lambda t: t["start"])
            res_total.extend(res)

            # Build (entity_text, paragraph_slice_up_to_entity_end, end_offset,
            # tag) chunks covering the paragraph; the sentinel first element is
            # discarded below.
            chunks = [("", "", 0, "NONE")]

            for el in res:
                if maps[el["entity_group"]] != "NONE":
                    tag = maps[el["entity_group"]]
                    chunks.append((p_text[el["start"]: el["end"]], p_text[chunks[-1][2]:el["end"]], el["end"], tag))

            # Append any trailing text after the last entity.
            if chunks[-1][2] < len(p_text):
                chunks.append(("END", p_text[chunks[-1][2]:], -1, "NONE"))
            chunks = chunks[1:]

            n_text = []

            for i, chunk in enumerate(chunks):

                rep = chunk[0]

                # Wrap the entity text in a colour-coded <span> per entity type.
                if chunk[3] == "PER":
                    rep = '<span style="background-color:lightgreen;border-radius: 3px;padding: 3px;"><b>ᴘᴇʀ</b> ' + chunk[0] + '</span>'
                elif chunk[3] == "LOC":
                    rep = '<span style="background-color:orange;border-radius: 3px;padding: 3px;"><b>ʟᴏᴄ</b> ' + chunk[0] + '</span>'
                elif chunk[3] == "ORG":
                    rep = '<span style="background-color:lightblue;border-radius: 3px;padding: 3px;"><b>ᴏʀɢ</b> ' + chunk[0] + '</span>'
                elif chunk[3] == "MISC":
                    rep = '<span style="background-color:tomato;border-radius: 3px;padding: 3px;"><b>ᴍɪsᴄ</b> ' + chunk[0] + '</span>'
                elif chunk[3] == "DATE":
                    rep = '<span style="background-color:lightgrey;border-radius: 3px;padding: 3px;"><b>ᴅᴀᴛᴇ</b> ' + chunk[0] + '</span>'

                # NOTE(review): str.replace substitutes every occurrence of the
                # entity text within the chunk, not only the final span — a
                # repeated word could get wrapped twice; verify intended.
                n_text.append(chunk[1].replace(chunk[0], rep))

            n_text = "".join(n_text)
            # Join rendered paragraphs with <br> in the accumulated output.
            if out_text:
                out_text = out_text + "<br>" + n_text
            else:
                out_text = n_text


    # Collect entity words (dates excluded) occurring more than once, ordered
    # by frequency weighted by how early the word first appears in the text.
    tags = [el["word"] for el in res_total if el["entity_group"] not in ['DATE', None]]
    cnt = Counter(tags)
    tags = sorted(list(set([el for el in tags if cnt[el] > 1])), key = lambda t: cnt[t]*np.exp(-tags.index(t)))[::-1]
    # ASCII-fold each tag and strip any remaining punctuation.
    tags = [" ".join(re.sub("[^A-Za-z0-9\s]", "", unidecode(tag)).split()) for tag in tags]
    tags = ['<span style="background-color:#CF9FFF;border-radius: 3px;padding: 3px;"><b>ᴛᴀɢ </b> ' + el + '</span>' for el in tags]
    tags = " ".join(tags)

    if tags:
        out_text = out_text + "<br><br><b>Tags:</b> " + tags

    if warn_flag:
        out_text = out_text + "<br><br><b>Warning ⚠️:</b> Unknown tokens detected in text. The model might behave erratically"

    return out_text
|
165 |
+
|
166 |
+
|
167 |
+
|
168 |
+
# Default demo input shown in the textbox: three paragraphs about the
# American Academy of Arts and Sciences.
init_text = '''The American Academy of Arts and Sciences (AAA&S) is one of the oldest learned societies in the United States. It was founded in 1780 during the American Revolution by John Adams, John Hancock, James Bowdoin, Andrew Oliver, and other Founding Fathers of the United States. It is headquartered in Cambridge, Massachusetts.
Membership in the academy is achieved through a thorough petition, review, and election process. The academy's quarterly journal, Dædalus, is published by the MIT Press on behalf of the academy. The academy also conducts multidisciplinary public policy research.
The Academy was established by the Massachusetts legislature on May 4, 1780, charted in order "to cultivate every art and science which may tend to advance the interest, honor, dignity, and happiness of a free, independent, and virtuous people." The sixty-two incorporating fellows represented varying interests and high standing in the political, professional, and commercial sectors of the state. The first class of new members, chosen by the Academy in 1781, included Benjamin Franklin and George Washington as well as several international honorary members.
'''

# Pre-computed extraction so the demo page loads with an example result.
init_output = extract(init_text)
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
# Gradio UI: banner, input textbox (pre-filled), Extract button, and a
# Markdown pane that renders the HTML produced by extract().
with gr.Blocks(css="footer {visibility: hidden}", theme=gr.themes.Default(text_size="lg", spacing_size="lg")) as interface:

    with gr.Row():
        gr.Markdown(header)
    with gr.Row():
        text = gr.Text(label="Extract entities", lines = 10, value = init_text)
    with gr.Row():
        with gr.Column():
            # NOTE(review): Component.style() only exists in Gradio 3.x and
            # was removed in 4.x — confirm the pinned gradio version.
            button = gr.Button("Extract").style(full_width=False)
    with gr.Row():
        with gr.Column():
            # Rendered output, seeded with the pre-computed example result.
            entities = gr.Markdown(init_output)

    with gr.Row():
        with gr.Column():
            gr.Markdown("<center>The input examples in this demo are extracted from https://it.wikipedia.org</center>")

    # Wire the button: textbox in, rendered Markdown out.
    button.click(extract, inputs=[text], outputs = [entities])


interface.launch()
|