harshildarji committed on
Commit
fd2d5c8
1 Parent(s): b7424f7
Files changed (1) hide show
  1. app.py +243 -0
app.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import matplotlib.colors as mcolors
4
+ import matplotlib.pyplot as plt
5
+ from transformers import (
6
+ AutoModelForTokenClassification,
7
+ AutoTokenizer,
8
+ logging,
9
+ pipeline,
10
+ )
11
+
12
+ import streamlit as st
13
+
14
# Global CSS injected into the page (typography, header bar, buttons, headings).
_PAGE_CSS = """
<style>
body {
font-family: 'Poppins', sans-serif;
background-color: #f4f4f8;
}
.header {
background-color: rgba(220, 219, 219, 0.25);
color: #000;
padding: 5px 0;
text-align: center;
border-radius: 7px;
margin-bottom: 13px;
border-bottom: 2px solid #333;
}
#logo {
width: auto;
height: 75px;
margin-top: -15px;
margin-bottom: 15px;
}
.container {
background-color: #fff;
padding: 30px;
border-radius: 10px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
width: 100%;
max-width: 1000px;
margin: 0 auto;
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
}
.btn-primary {
background-color: #5477d1;
border: none;
transition: background-color 0.3s, transform 0.2s;
border-radius: 25px;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
}
.btn-primary:hover {
background-color: #4c6cbe;
transform: translateY(-1px);
}
h2 {
font-weight: 600;
font-size: 24px;
margin-bottom: 20px;
}
h4 {
font-weight: 500;
font-size: 15px;
margin-top: 15px;
margin-bottom: 15px;
}
label {
font-weight: 500;
}
</style>
"""

# Header banner: research-group logo plus a link to the group's website.
_HEADER_HTML = """
<div class="header">
<img src="https://raw.githubusercontent.com/ca-roll/ca-roll.github.io/release/images/logopic/caroll.png" alt="Research Group Logo" id="logo">
<h4>Demonstrating <a href="https://ca-roll.github.io/" target="_blank">CAROLL Research Group</a>'s Language Models</h4>
</div>
"""

# Ignore every warning category and limit transformers logging to errors.
warnings.simplefilter(action="ignore", category=Warning)
logging.set_verbosity(logging.ERROR)

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(page_title="CAROLL Language Models - Demo", layout="wide")

# Raw HTML/CSS is only rendered when unsafe_allow_html is enabled.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
st.markdown(_HEADER_HTML, unsafe_allow_html=True)
93
+
94
@st.cache_resource
def _load_ner(model_name):
    """Load the tokenizer, model and token-classification pipeline for a checkpoint.

    The result is cached by Streamlit, so the weights are loaded once per
    server process instead of on every script rerun (Streamlit re-executes
    the script on each widget interaction).

    Args:
        model_name: Hugging Face Hub identifier of the checkpoint to load.

    Returns:
        A ``(tokenizer, model, ner_pipeline)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    return tokenizer, model, pipeline("ner", model=model, tokenizer=tokenizer)


# Initialization for Legal NER
tokenizer_legal, model_legal, ner_legal = _load_ner("PaDaS-Lab/gbert-legal-ner")

# Initialization for GDPR Privacy Policy NER
tokenizer_gdpr, model_gdpr, ner_gdpr = _load_ner(
    "PaDaS-Lab/gdpr-privacy-policy-ner"
)
107
+
108
# Label code -> human-readable description for the Legal NER model.
# The description is what the UI shows as the hover tooltip of an entity.
classes_legal = {
    "AN": "Lawyer",
    "EUN": "European legal norm",
    "GRT": "Court",
    "GS": "Law",
    "INN": "Institution",
    "LD": "Country",
    "LDS": "Landscape",
    "LIT": "Legal literature",
    "MRK": "Brand",
    "ORG": "Organization",
    "PER": "Person",
    "RR": "Judge",
    "RS": "Court decision",
    "ST": "City",
    "STR": "Street",
    "UN": "Company",
    "VO": "Ordinance",
    "VS": "Regulation",
    "VT": "Contract",
}

# Label code -> human-readable description for the GDPR Privacy Policy NER model.
classes_gdpr = {
    "DC": "Data Controller",
    "DP": "Data Processor",
    "DPO": "Data Protection Officer",
    "R": "Recipient",
    "TP": "Third Party",
    "A": "Authority",
    "DS": "Data Subject",
    "DSO": "Data Source",
    "RP": "Required Purpose",
    "NRP": "Not-Required Purpose",
    "P": "Processing",
    "NPD": "Non-Personal Data",
    "PD": "Personal Data",
    "OM": "Organisational Measure",
    "TM": "Technical Measure",
    "LB": "Legal Basis",
    "CONS": "Consent",
    "CONT": "Contract",
    "LI": "Legitimate Interest",
    "ADM": "Automated Decision Making",
    "RET": "Retention",
    "SEU": "Scale EU",
    "SNEU": "Scale Non-EU",
    "RI": "Right",
    "DSR15": "Art. 15 Right of access by the data subject",
    "DSR16": "Art. 16 Right to rectification",
    "DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
    "DSR18": "Art. 18 Right to restriction of processing",
    "DSR19": "Art. 19 Notification obligation regarding rectification or erasure of personal data or restriction of processing",
    "DSR20": "Art. 20 Right to data portability",
    "DSR21": "Art. 21 Right to object",
    "DSR22": "Art. 22 Automated individual decision-making, including profiling",
    "LC": "Lodge Complaint",
}

# Ordered label codes per model; a label's position later selects its color.
ner_labels_legal = list(classes_legal)
ner_labels_gdpr = list(classes_gdpr)
169
+
170
+
171
def generate_colors(num_colors):
    """Return ``num_colors`` hex color strings sampled from the "tab20" colormap.

    The colormap is queried at the fractions ``i / num_colors`` for
    ``i = 0 .. num_colors - 1``; each RGBA result is converted to a hex
    string with ``mcolors.rgb2hex``. An empty list is returned for 0.
    """
    palette = plt.get_cmap("tab20")
    hex_codes = []
    for idx in range(num_colors):
        fraction = 1.0 * idx / num_colors
        hex_codes.append(mcolors.rgb2hex(palette(fraction)))
    return hex_codes
176
+
177
+
178
def color_substrings(input_string, model_output, ner_labels, current_classes):
    """Render ``input_string`` as HTML with every entity span colored.

    Args:
        input_string: The raw text that was analyzed.
        model_output: Iterable of dicts with ``"start"``, ``"end"`` and
            ``"label"`` keys; the offsets index into ``input_string``.
        ner_labels: Ordered label names; a label's position selects its color.
        current_classes: Mapping from label to the description placed in the
            span's ``title`` attribute (shown as a hover tooltip).

    Returns:
        An HTML string in which each entity is wrapped in a bold, colored
        ``<span>``. All text taken from ``input_string`` and all tooltips are
        HTML-escaped, so user-supplied markup is displayed literally instead
        of being interpreted by the browser.
    """
    # Local import keeps this function self-contained.
    import html

    colors = generate_colors(len(ner_labels))
    # generate_colors returns one color per label, so a plain zip pairs them.
    label_to_color = dict(zip(ner_labels, colors))

    pieces = []
    last_end = 0

    for entity in sorted(model_output, key=lambda item: item["start"]):
        start, end, label = entity["start"], entity["end"], entity["label"]
        if end <= last_end:
            # Span lies entirely inside text that was already emitted.
            continue
        # Clip partially overlapping spans so no character is emitted twice.
        start = max(start, last_end)

        pieces.append(html.escape(input_string[last_end:start]))
        tooltip = html.escape(current_classes.get(label, ""), quote=True)
        # Unknown labels keep the surrounding text color rather than "None".
        color = label_to_color.get(label, "inherit")
        pieces.append(
            f'<span style="color: {color}; font-weight: bold;" '
            f'title="{tooltip}">{html.escape(input_string[start:end])}</span>'
        )
        last_end = end

    pieces.append(html.escape(input_string[last_end:]))

    return "".join(pieces)
198
+
199
+
200
# --- Main UI: text input, model selection, and rendering of the NER result ---
st.title("CAROLL Language Models - Demo")
st.markdown("<hr>", unsafe_allow_html=True)

test_sentence = st.text_area("Enter Text:", height=200)
model_choice = st.selectbox(
    "Choose a model:", ["Legal NER", "GDPR Privacy Policy NER"], index=0
)

if st.button("Analyze"):
    # Imported here so this section stays self-contained.
    import html

    if not test_sentence.strip():
        # Nothing to analyze; avoid running the model on empty input.
        st.warning("Please enter some text to analyze.")
    else:
        # Pick the pipeline, tooltip descriptions and label order for the model.
        if model_choice == "Legal NER":
            ner_model = ner_legal
            current_classes = classes_legal
            current_ner_labels = ner_labels_legal
        else:
            ner_model = ner_gdpr
            current_classes = classes_gdpr
            current_ner_labels = ner_labels_gdpr

        # The model runs on the raw text: its start/end offsets refer to it.
        results = ner_model(test_sentence)
        processed_results = [
            {
                "start": result["start"],
                "end": result["end"],
                # Keep only the part after the last "-" of the entity tag.
                "label": result["entity"].split("-")[-1],
            }
            for result in results
        ]

        colored_html = color_substrings(
            test_sentence, processed_results, current_ner_labels, current_classes
        )

        # Escape the user's text before embedding it in HTML so that any
        # markup it contains is shown literally rather than rendered.
        st.markdown(
            "<strong>- Original text -</strong><br><br>{}".format(
                html.escape(test_sentence)
            ),
            unsafe_allow_html=True,
        )
        st.markdown(
            "<strong>- Analyzed text -</strong><br><br>{}".format(colored_html),
            unsafe_allow_html=True,
        )
        st.markdown(
            "<mark><strong>Tip:</strong> Hover over the colored words to see its class.</mark>",
            unsafe_allow_html=True,
        )