ProfessorLeVesseur committed on
Commit
6c9d6da
1 Parent(s): 7ab40c0

Upload 01_Parts of Speech Annotation.py

Browse files
pages/01_Parts of Speech Annotation.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #------------------------------------------------------------------------
2
+ # Import Modules
3
+ #------------------------------------------------------------------------
4
+
5
+ import streamlit as st
6
+ import spacy
7
+ import string
8
+ from annotated_text import annotated_text
9
+ from PIL import Image
10
+
11
# Load the English NLP model
@st.cache_resource(show_spinner=False)
def _load_nlp_model():
    """Load the spaCy English pipeline once and share it across reruns.

    Streamlit re-executes this script on every widget interaction; caching
    the pipeline avoids reloading the model each time. The spinner is
    disabled so that no element is emitted before ``st.set_page_config``.
    """
    return spacy.load("en_core_web_sm")


nlp = _load_nlp_model()
13
+
14
+ #------------------------------------------------------------------------
15
+ # Configurations
16
+ #------------------------------------------------------------------------
17
+
18
# Streamlit page setup
# The icon used to be opened from a single absolute path inside one
# developer's home directory, which raises FileNotFoundError on any other
# machine. Look for it next to the app first and keep the original path
# only as a last resort.
from pathlib import Path

_ICON_NAME = "MTSS.ai_Icon.png"
_ICON_CANDIDATES = (
    Path(_ICON_NAME),  # relative to the working directory
    Path(__file__).resolve().parent / _ICON_NAME,  # next to this page
    Path(__file__).resolve().parent.parent / _ICON_NAME,  # one level above this page
    Path("/Users/cheynelevesseur/Desktop/Python_Code/LLM_Projects/LLM_Prxmpting/MTSS.ai_Icon.png"),
)

# Fall back to no custom icon rather than crashing when the image is missing.
icon = None
for _icon_path in _ICON_CANDIDATES:
    if _icon_path.is_file():
        icon = Image.open(_icon_path)
        break

st.set_page_config(
    page_title="Kaleidoscope | Text Annotation",
    page_icon=icon,
    layout="centered",
    initial_sidebar_state="auto",
    menu_items={
        'About': "### *This application was created by* \n### LeVesseur Ph.D | MTSS.ai"
    },
)
30
+
31
+ #------------------------------------------------------------------------
32
+ # Header
33
+ #------------------------------------------------------------------------
34
+
35
+ # st.image('MTSS.ai_Logo.png', width=300)
36
+
37
st.title('MTSS:grey[.ai]')
st.header('Kaleidoscope:grey[ | Parts of Speech Annotation]')

#------------------------------------------------------------------------
# Sidebar
#------------------------------------------------------------------------

# Toggle that reveals the author's contact details in the sidebar.
contact = st.sidebar.toggle('Handmade by \n**LeVesseur** :grey[ PhD] \n| :grey[MTSS.ai]')
if contact:
    # NOTE(review): the address below looks like a redaction placeholder —
    # confirm the intended contact address.
    st.sidebar.write('Inquiries: [[email protected]](mailto:[email protected]) \nProfile: [levesseur.com](http://levesseur.com) \nCheck out: [InkQA | Dynamic PDFs](http://www.inkqa.com)')

# Color options: picker label -> default hex value
colors = {
    "Green (DAF1E7)": "#DAF1E7",
    "Blue (BDE5FF)": "#BDE5FF",
    "Navy (D1DBE9)": "#D1DBE9",
    "Teal (D6EAED)": "#D6EAED",
    "Iceburg (E4EEF6)": "#E4EEF6",
    "Vermillion (F6DCDD)": "#F6DCDD",
}

# Everything inside this context is rendered in the sidebar.
with st.sidebar:
    st.divider()
    # Sidebar display (Option 1: Color blocks with hex)
    st.header("Recommended Colors")

    for color_name, hex_code in colors.items():
        st.color_picker(color_name, hex_code)
65
+
66
st.subheader("Example")

# Highlight colors used by the static example below.
_EX_PRONOUN = "#F6DCDD"
_EX_VERB = "#DAF1E7"
_EX_ADJ = "#BDE5FF"
_EX_NOUN = "#D1DBE9"

# The example sentence, one entry per word; tuples are highlighted words
# given as (text, label, color), plain strings are rendered as-is.
_EXAMPLE_PIECES = [
    ("I", "Pronoun", _EX_PRONOUN),
    "really",
    ("appreciate", "Verb", _EX_VERB),
    ("all", "Pronoun", _EX_PRONOUN),
    ("that", "Pronoun", _EX_PRONOUN),
    "the",
    ("social", "Adj", _EX_ADJ),
    "committee",
    "has",
    ("done", "Verb", _EX_VERB),
    "to",
    ("keep", "Verb", _EX_VERB),
    ("us", "Pronoun", _EX_PRONOUN),
    ("feeling", "Verb", _EX_VERB),
    ("connected", "Adj", _EX_ADJ),
    ".",
    "I",
    "also",
    "really",
    ("value", "Verb", _EX_VERB),
    ("our", "Pronoun", _EX_PRONOUN),
    "in",
    "-person",
    ("meetings", "Noun", _EX_NOUN),
    "and",
    "the",
    "social",
    ("opportunities", "Noun", _EX_NOUN),
    ("built", "Verb", _EX_VERB),
    "into",
    "these",
    "meetings",
    ".",
]

# Put a single space between consecutive pieces (none after the last one).
_example_args = []
for _piece in _EXAMPLE_PIECES:
    if _example_args:
        _example_args.append(" ")
    _example_args.append(_piece)

annotated_text(*_example_args)

st.divider()
137
+
138
st.subheader("Directions for Using the Text Annotation Tool")

# The detail bullets are indented under their numbered step so that Markdown
# nests them inside the step instead of starting a separate top-level list.
directions = """
1. **Enter Your Text**:
   - Type the text you want to annotate in the text area provided.

2. **Select Parts of Speech**:
   - Choose which parts of speech you want to include in the annotation by checking the corresponding boxes (e.g., Verbs, Adjectives, Nouns, Pronouns).

3. **Submit Your Text**:
   - Click the "Submit Text" button to process your input. The app will automatically label and color the words based on the selected parts of speech.

4. **Review the Annotations**:
   - The annotated text will be displayed, showing the parts of speech labels and colors applied to the words.

5. **Adjust Annotations (Optional)**:
   - You can manually adjust the labels and colors for each word if needed.

6. **Generate Annotated Text**:
   - After reviewing and adjusting the annotations, click the "Generate Annotated Text" button.
   - The final annotated text will be displayed.

7. **Take a Screenshot**:
   - To use the annotated text, take a screenshot of the displayed text.

8. **Adjust Text Width** (Optional):
   - If you want to adjust the width of the sentences for a better screenshot, minimize or resize your browser window accordingly before taking the screenshot.
"""

st.markdown(directions)
168
+
169
+ #------------------------------------------------------------------------
170
+ # Functions: Parts of Speech
171
+ #------------------------------------------------------------------------
172
+
173
+ # # Function to split text into words
174
+ # def split_text(text):
175
+ # # Add a space before punctuation marks
176
+ # for char in string.punctuation:
177
+ # text = text.replace(char, f" {char}")
178
+ # return text.split()
179
+
180
+ # # Function to automatically label and color words based on parts of speech
181
+ # def auto_label_and_color_words(doc, words):
182
+ # labels = [""] * len(words)
183
+ # colors = ["#FFFFFF"] * len(words)
184
+ # word_positions = {i: word for i, word in enumerate(words)}
185
+
186
+ # for token in doc:
187
+ # # Match token with the words from the original text
188
+ # for index, word in word_positions.items():
189
+ # if token.text == word:
190
+ # if token.pos_ == "VERB":
191
+ # labels[index] = "Verb"
192
+ # colors[index] = "#DAF1E7"
193
+ # elif token.pos_ == "ADJ":
194
+ # labels[index] = "Adj"
195
+ # colors[index] = "#BDE5FF"
196
+ # elif token.pos_ == "NOUN":
197
+ # labels[index] = "Noun"
198
+ # colors[index] = "#D1DBE9"
199
+ # elif token.pos_ == "PRON":
200
+ # labels[index] = "Pronoun"
201
+ # colors[index] = "#F6DCDD"
202
+ # break # Exit loop once the word is found and processed
203
+ # return labels, colors
204
+
205
+ # # Main Streamlit application
206
+ # st.title("Text Annotation Tool")
207
+
208
+ # # Initialize session state to store text and annotations
209
+ # if 'user_text' not in st.session_state:
210
+ # st.session_state.user_text = ""
211
+ # if 'words' not in st.session_state:
212
+ # st.session_state.words = []
213
+ # if 'labels' not in st.session_state:
214
+ # st.session_state.labels = []
215
+ # if 'colors' not in st.session_state:
216
+ # st.session_state.colors = []
217
+ # if 'extracted_pos' not in st.session_state:
218
+ # st.session_state.extracted_pos = {}
219
+
220
+ # # User input for the text
221
+ # user_text = st.text_area("Enter the text you want to annotate:", value=st.session_state.user_text, height=100)
222
+
223
+ # # Button to process the text
224
+ # if st.button("Submit Text"):
225
+ # st.session_state.user_text = user_text
226
+ # st.session_state.words = split_text(user_text)
227
+
228
+ # # Process the text with spaCy
229
+ # doc = nlp(user_text)
230
+
231
+ # # Automatically label and color words based on parts of speech
232
+ # st.session_state.labels, st.session_state.colors = auto_label_and_color_words(doc, st.session_state.words)
233
+
234
+ # # Extract parts of speech
235
+ # st.session_state.extracted_pos = {
236
+ # "verbs": [token.text for token in doc if token.pos_ == "VERB"],
237
+ # "adjectives": [token.text for token in doc if token.pos_ == "ADJ"],
238
+ # "nouns": [token.text for token in doc if token.pos_ == "NOUN"],
239
+ # "pronouns": [token.text for token in doc if token.pos_ == "PRON"]
240
+ # }
241
+
242
+ # # Display extracted parts of speech
243
+ # if st.session_state.extracted_pos:
244
+ # st.subheader("Extracted Parts of Speech")
245
+ # st.write("**Verbs:**", st.session_state.extracted_pos.get("verbs", []))
246
+ # st.write("**Adjectives:**", st.session_state.extracted_pos.get("adjectives", []))
247
+ # st.write("**Nouns:**", st.session_state.extracted_pos.get("nouns", []))
248
+ # st.write("**Pronouns:**", st.session_state.extracted_pos.get("pronouns", []))
249
+
250
+ # # Collect annotation inputs for each word
251
+ # if st.session_state.words:
252
+ # for i, word in enumerate(st.session_state.words):
253
+ # st.write(f"Annotate the word: {word}")
254
+ # st.session_state.labels[i] = st.selectbox(
255
+ # f"Label for '{word}'", ["", "Verb", "Adj", "Noun", "Pronoun"],
256
+ # key=f"label_{i}", index=["", "Verb", "Adj", "Noun", "Pronoun"].index(st.session_state.labels[i])
257
+ # )
258
+ # st.session_state.colors[i] = st.color_picker(
259
+ # f"Color for '{word}'",
260
+ # value=st.session_state.colors[i],
261
+ # key=f"color_{i}"
262
+ # )
263
+
264
+ # # Generate button to process the annotations
265
+ # if st.button("Generate Annotated Text"):
266
+ # annotated_elements = []
267
+ # for i, word in enumerate(st.session_state.words):
268
+ # if st.session_state.labels[i] and st.session_state.colors[i] != "#FFFFFF":
269
+ # annotated_elements.append((word, st.session_state.labels[i], st.session_state.colors[i]))
270
+ # else:
271
+ # annotated_elements.append(word)
272
+ # annotated_elements.append(" ") # Add space between words
273
+
274
+ # # Remove the last extra space added
275
+ # if annotated_elements and annotated_elements[-1] == " ":
276
+ # annotated_elements.pop()
277
+
278
+ # # Display the annotated text using the `annotated_text` function
279
+ # st.subheader("Annotated Text:")
280
+ # annotated_text(*annotated_elements)
281
+
282
+ # # Print the code for the annotated text
283
+ # st.subheader("Generated Code:")
284
+ # code_str = 'annotated_text(\n'
285
+ # for elem in annotated_elements:
286
+ # if isinstance(elem, tuple):
287
+ # code_str += f' ("{elem[0]}", "{elem[1]}", "{elem[2]}"),\n'
288
+ # else:
289
+ # code_str += f' "{elem}",\n'
290
+ # code_str += ')'
291
+ # st.code(code_str, language='python')
292
+
293
+
294
+ #------------------------------------------------------------------------
295
+ # Functions: Parts of Speech + Buttons
296
+ #------------------------------------------------------------------------
297
+
298
+ # Function to split text into words
299
# Maps every ASCII punctuation character to itself preceded by a space, so
# the whole text can be rewritten in one pass instead of one pass per character.
_PUNCTUATION_SPACING = str.maketrans(
    {char: f" {char}" for char in string.punctuation}
)


def split_text(text):
    """Split *text* into whitespace-separated words, detaching punctuation.

    A space is inserted before every character in ``string.punctuation``,
    so punctuation starts a new word: ``"Hi, all!"`` becomes
    ``["Hi", ",", "all", "!"]`` and ``"in-person"`` becomes
    ``["in", "-person"]``.

    Args:
        text: The raw input string.

    Returns:
        A list of word strings; empty when *text* is empty or only whitespace.
    """
    return text.translate(_PUNCTUATION_SPACING).split()
304
+
305
+ # Function to automatically label and color words based on parts of speech
306
def auto_label_and_color_words(doc, words, include_verbs, include_adjectives, include_nouns, include_pronouns):
    """Assign a part-of-speech label and highlight color to each word.

    Tokens are paired with words by text, in order of appearance: the first
    token reading "run" is paired with the first word "run", the second
    with the second, and so on. This keeps repeated words independent;
    matching every token against the first equal word would leave later
    occurrences unlabelled and let later tokens overwrite the first one.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_`` attributes.
        words: Word strings in display order.
        include_verbs: Label tokens whose ``pos_`` is ``"VERB"``.
        include_adjectives: Label tokens whose ``pos_`` is ``"ADJ"``.
        include_nouns: Label tokens whose ``pos_`` is ``"NOUN"``.
        include_pronouns: Label tokens whose ``pos_`` is ``"PRON"``.

    Returns:
        A ``(labels, colors)`` pair of lists parallel to *words*. Words that
        are not labelled keep ``""`` and ``"#FFFFFF"``.
    """
    from collections import defaultdict, deque

    # Only the enabled parts of speech get a (label, color) style.
    styles = {}
    if include_verbs:
        styles["VERB"] = ("Verb", "#DAF1E7")
    if include_adjectives:
        styles["ADJ"] = ("Adj", "#BDE5FF")
    if include_nouns:
        styles["NOUN"] = ("Noun", "#D1DBE9")
    if include_pronouns:
        styles["PRON"] = ("Pronoun", "#F6DCDD")

    labels = [""] * len(words)
    colors = ["#FFFFFF"] * len(words)

    # For each distinct word text, the positions still waiting for a token.
    pending = defaultdict(deque)
    for index, word in enumerate(words):
        pending[word].append(index)

    for token in doc:
        positions = pending.get(token.text)
        if not positions:
            # Token has no (remaining) counterpart among the split words.
            continue
        # Consume the position even when the tag is disabled, so the next
        # token with the same text is paired with the next occurrence.
        index = positions.popleft()
        style = styles.get(token.pos_)
        if style is not None:
            labels[index], colors[index] = style
    return labels, colors
329
+
330
+ # Initialize session state to store text and annotations
331
# Seed every session-state entry this page relies on, leaving values from
# earlier reruns untouched.
for _state_key, _initial_value in (
    ('user_text', ""),
    ('words', []),
    ('labels', []),
    ('colors', []),
    ('extracted_pos', {}),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
341
+
342
+ # User input for the text
343
# Text to annotate, pre-filled with the last submitted text.
user_text = st.text_area(
    "Enter the text you want to annotate:",
    value=st.session_state.user_text,
    height=100,
)

# One checkbox per part of speech that may be labelled automatically.
include_verbs = st.checkbox("Include Verbs", value=True)
include_adjectives = st.checkbox("Include Adjectives", value=True)
include_nouns = st.checkbox("Include Nouns", value=True)
include_pronouns = st.checkbox("Include Pronouns", value=True)

# On submit: store the text, split it, tag it and store the results.
if st.button("Submit Text"):
    st.session_state.user_text = user_text
    st.session_state.words = split_text(user_text)

    # Run the spaCy pipeline over the raw text.
    doc = nlp(user_text)

    # Pre-fill labels and colors from the tags of the selected categories.
    auto_labels, auto_colors = auto_label_and_color_words(
        doc,
        st.session_state.words,
        include_verbs,
        include_adjectives,
        include_nouns,
        include_pronouns,
    )
    st.session_state.labels = auto_labels
    st.session_state.colors = auto_colors

    # Group token texts by part of speech (all four groups, regardless of
    # the checkboxes above).
    pos_groups = {"VERB": "verbs", "ADJ": "adjectives", "NOUN": "nouns", "PRON": "pronouns"}
    extracted = {group: [] for group in pos_groups.values()}
    for token in doc:
        group = pos_groups.get(token.pos_)
        if group is not None:
            extracted[group].append(token.text)
    st.session_state.extracted_pos = extracted
370
+
371
+ # Display extracted parts of speech
372
# Show the word lists gathered by the last submission, if any.
_extracted = st.session_state.extracted_pos
if _extracted:
    st.subheader("Extracted Parts of Speech")
    for _heading, _group in (
        ("**Verbs:**", "verbs"),
        ("**Adjectives:**", "adjectives"),
        ("**Nouns:**", "nouns"),
        ("**Pronouns:**", "pronouns"),
    ):
        st.write(_heading, _extracted.get(_group, []))
378
+
379
+ # Collect annotation inputs for each word
380
# Per-word controls: a label selector and a color picker for every word,
# each pre-set to the stored value and written back after rendering.
if st.session_state.words:
    label_options = ["", "Verb", "Adj", "Noun", "Pronoun"]
    for position, term in enumerate(st.session_state.words):
        st.write(f"Annotate the word: {term}")

        selected_label = st.selectbox(
            f"Label for '{term}'",
            label_options,
            key=f"label_{position}",
            index=label_options.index(st.session_state.labels[position]),
        )
        st.session_state.labels[position] = selected_label

        selected_color = st.color_picker(
            f"Color for '{term}'",
            value=st.session_state.colors[position],
            key=f"color_{position}",
        )
        st.session_state.colors[position] = selected_color
392
+
393
+ # Generate button to process the annotations
394
def _build_annotated_elements(words, labels, colors):
    """Return the ``annotated_text`` arguments for the given words.

    A word with a non-empty label and a color other than ``"#FFFFFF"``
    becomes a ``(word, label, color)`` tuple; any other word stays a plain
    string. Consecutive words are separated by a single ``" "`` element.
    """
    elements = []
    for i, word in enumerate(words):
        if elements:
            elements.append(" ")  # separator between words, none at the end
        if labels[i] and colors[i] != "#FFFFFF":
            elements.append((word, labels[i], colors[i]))
        else:
            elements.append(word)
    return elements


def _quote_literal(value):
    """Return *value* as a double-quoted string literal with escapes.

    Quotes and backslashes inside the word are escaped, so the generated
    snippet stays valid Python when the text contains ``"`` or ``\\``.
    """
    import json

    return json.dumps(value, ensure_ascii=False)


def _render_annotation_code(elements):
    """Return source code for an ``annotated_text(...)`` call with *elements*."""
    lines = ['annotated_text(']
    for elem in elements:
        if isinstance(elem, tuple):
            quoted = ", ".join(_quote_literal(part) for part in elem)
            lines.append(f' ({quoted}),')
        else:
            lines.append(f' {_quote_literal(elem)},')
    lines.append(')')
    return "\n".join(lines)


# Generate button to process the annotations
if st.button("Generate Annotated Text", type="primary"):
    annotated_elements = _build_annotated_elements(
        st.session_state.words,
        st.session_state.labels,
        st.session_state.colors,
    )

    # Display the annotated text using the `annotated_text` function
    st.subheader("Annotated Text:")
    annotated_text(*annotated_elements)

    # Print the code for the annotated text
    st.subheader("Generated Code:")
    code_str = _render_annotation_code(annotated_elements)
    st.code(code_str, language='python')