Spaces:

ProfessorLeVesseur
/

Kaleidoscope

Sleeping

App Files Files Community

Kaleidoscope / pages /01_Parts of Speech Annotation.py

ProfessorLeVesseur

Update pages/01_Parts of Speech Annotation.py

8f06cb3 verified about 1 year ago

raw

history blame contribute delete

15.5 kB

	#------------------------------------------------------------------------
	# Import Modules
	#------------------------------------------------------------------------

	import streamlit as st
	import spacy
	import string
	from annotated_text import annotated_text
	from PIL import Image

	@st.cache_resource(show_spinner=False)
	def load_nlp_model(model_name="en_core_web_sm"):
	    """Return the spaCy pipeline named *model_name*, installing it if needed.

	    The result is cached by Streamlit, so the pipeline is built once instead
	    of on every script run. The model package is downloaded only when
	    ``spacy.load`` cannot find it.

	    ``show_spinner=False`` keeps this call from emitting a UI element, since
	    it runs before ``st.set_page_config``.
	    """
	    try:
	        return spacy.load(model_name)
	    except OSError:
	        # spaCy reports a missing model package as OSError: install, then retry.
	        spacy.cli.download(model_name)
	        return spacy.load(model_name)


	# Load the English NLP model
	nlp = load_nlp_model()

	#------------------------------------------------------------------------
	# Configurations
	#------------------------------------------------------------------------

	# Page-level Streamlit settings: tab title, icon, layout and the About text.
	# NOTE(review): "\|" is not a recognised Python escape sequence, so the
	# backslash stays in the string; the literals are reproduced unchanged.
	icon = Image.open("MTSS.ai_Icon.png")
	about_text = "### This application was created by \n### LeVesseur Ph.D \| MTSS.ai"
	page_settings = dict(
	    page_title="Kaleidoscope \| Text Annotation",
	    page_icon=icon,
	    layout="centered",
	    initial_sidebar_state="auto",
	    menu_items={'About': about_text},
	)
	st.set_page_config(**page_settings)

	#------------------------------------------------------------------------
	# Header
	#------------------------------------------------------------------------

	# The logo image is currently disabled:
	# st.image('MTSS.ai_Logo.png', width=300)

	# Page title followed by the section header.
	st.title('MTSS:grey[.ai]')
	st.header('Kaleidoscope:grey[ \| Parts of Speech Annotation]')

	#------------------------------------------------------------------------
	# Sidebar
	#------------------------------------------------------------------------

	# Sidebar toggle; switching it on writes the contact links beneath it.
	show_contact = st.sidebar.toggle('Handmade by \nLeVesseur :grey[ PhD] \n\| :grey[MTSS.ai]')
	if show_contact:
	    contact_lines = (
	        'Inquiries: [[email protected]](mailto:[email protected]) ',
	        'Profile: [levesseur.com](http://levesseur.com) ',
	        'Check out: [InkQA \| Dynamic PDFs](http://www.inkqa.com)',
	    )
	    st.sidebar.write('\n'.join(contact_lines))

	# Recommended highlight colours, keyed by the caption shown on each picker.
	colors = {
	    "Green (DAF1E7)": "#DAF1E7",
	    "Blue (BDE5FF)": "#BDE5FF",
	    "Navy (D1DBE9)": "#D1DBE9",
	    "Teal (D6EAED)": "#D6EAED",
	    "Iceburg (E4EEF6)": "#E4EEF6",
	    "Vermillion (F6DCDD)": "#F6DCDD",
	}

	# Divider, heading and one colour picker per palette entry, all in the sidebar.
	with st.sidebar:
	    st.divider()
	    st.header("Recommended Colors")
	    for swatch_caption, swatch_hex in colors.items():
	        st.color_picker(swatch_caption, swatch_hex)

	st.subheader("Example")

	# Pieces of the sample passage, in order. A tuple is (text, label, background
	# colour); a bare string is shown without annotation.
	example_words = [
	    ("I", "Pronoun", "#F6DCDD"),
	    "really",
	    ("appreciate", "Verb", "#DAF1E7"),
	    ("all", "Pronoun", "#F6DCDD"),
	    ("that", "Pronoun", "#F6DCDD"),
	    "the",
	    ("social", "Adj", "#BDE5FF"),
	    "committee",
	    "has",
	    ("done", "Verb", "#DAF1E7"),
	    "to",
	    ("keep", "Verb", "#DAF1E7"),
	    ("us", "Pronoun", "#F6DCDD"),
	    ("feeling", "Verb", "#DAF1E7"),
	    ("connected", "Adj", "#BDE5FF"),
	    ".",
	    "I",
	    "also",
	    "really",
	    ("value", "Verb", "#DAF1E7"),
	    ("our", "Pronoun", "#F6DCDD"),
	    "in",
	    "-person",
	    ("meetings", "Noun", "#D1DBE9"),
	    "and",
	    "the",
	    "social",
	    ("opportunities", "Noun", "#D1DBE9"),
	    ("built", "Verb", "#DAF1E7"),
	    "into",
	    "these",
	    "meetings",
	    ".",
	]

	# Put a single space between consecutive pieces (none after the last one).
	example_args = []
	for piece in example_words:
	    if example_args:
	        example_args.append(" ")
	    example_args.append(piece)

	annotated_text(*example_args)

	st.divider()

	st.subheader("Directions for Using the Text Annotation Tool")

	# User-facing instructions for the tool, passed to st.markdown below.
	# NOTE(review): the whitespace inside this literal is part of the text that
	# gets rendered -- confirm the leading indentation of each line is intended.
	directions = """
	1. Enter Your Text:
	- Type the text you want to annotate in the text area provided.

	2. Select Parts of Speech:
	- Choose which parts of speech you want to include in the annotation by checking the corresponding boxes (e.g., Verbs, Adjectives, Nouns, Pronouns).

	3. Submit Your Text:
	- Click the "Submit Text" button to process your input. The app will automatically label and color the words based on the selected parts of speech.

	4. Review the Annotations:
	- The annotated text will be displayed, showing the parts of speech labels and colors applied to the words.

	5. Adjust Annotations (Optional):
	- You can manually adjust the labels and colors for each word if needed.

	6. Generate Annotated Text:
	- After reviewing and adjusting the annotations, click the "Generate Annotated Text" button.
	- The final annotated text will be displayed.

	7. Take a Screenshot:
	- To use the annotated text, take a screenshot of the displayed text.

	8. Adjust Text Width (Optional):
	- If you want to adjust the width of the sentences for a better screenshot, minimize or resize your browser window accordingly before taking the screenshot.
	"""

	# Render the instructions on the page.
	st.markdown(directions)

	#------------------------------------------------------------------------
	# Functions: Parts of Speech
	#------------------------------------------------------------------------

	# # Function to split text into words
	# def split_text(text):
	# # Add a space before punctuation marks
	# for char in string.punctuation:
	# text = text.replace(char, f" {char}")
	# return text.split()

	# # Function to automatically label and color words based on parts of speech
	# def auto_label_and_color_words(doc, words):
	# labels = [""] * len(words)
	# colors = ["#FFFFFF"] * len(words)
	# word_positions = {i: word for i, word in enumerate(words)}

	# for token in doc:
	# # Match token with the words from the original text
	# for index, word in word_positions.items():
	# if token.text == word:
	# if token.pos_ == "VERB":
	# labels[index] = "Verb"
	# colors[index] = "#DAF1E7"
	# elif token.pos_ == "ADJ":
	# labels[index] = "Adj"
	# colors[index] = "#BDE5FF"
	# elif token.pos_ == "NOUN":
	# labels[index] = "Noun"
	# colors[index] = "#D1DBE9"
	# elif token.pos_ == "PRON":
	# labels[index] = "Pronoun"
	# colors[index] = "#F6DCDD"
	# break # Exit loop once the word is found and processed
	# return labels, colors

	# # Main Streamlit application
	# st.title("Text Annotation Tool")

	# # Initialize session state to store text and annotations
	# if 'user_text' not in st.session_state:
	# st.session_state.user_text = ""
	# if 'words' not in st.session_state:
	# st.session_state.words = []
	# if 'labels' not in st.session_state:
	# st.session_state.labels = []
	# if 'colors' not in st.session_state:
	# st.session_state.colors = []
	# if 'extracted_pos' not in st.session_state:
	# st.session_state.extracted_pos = {}

	# # User input for the text
	# user_text = st.text_area("Enter the text you want to annotate:", value=st.session_state.user_text, height=100)

	# # Button to process the text
	# if st.button("Submit Text"):
	# st.session_state.user_text = user_text
	# st.session_state.words = split_text(user_text)

	# # Process the text with spaCy
	# doc = nlp(user_text)

	# # Automatically label and color words based on parts of speech
	# st.session_state.labels, st.session_state.colors = auto_label_and_color_words(doc, st.session_state.words)

	# # Extract parts of speech
	# st.session_state.extracted_pos = {
	# "verbs": [token.text for token in doc if token.pos_ == "VERB"],
	# "adjectives": [token.text for token in doc if token.pos_ == "ADJ"],
	# "nouns": [token.text for token in doc if token.pos_ == "NOUN"],
	# "pronouns": [token.text for token in doc if token.pos_ == "PRON"]
	# }

	# # Display extracted parts of speech
	# if st.session_state.extracted_pos:
	# st.subheader("Extracted Parts of Speech")
	# st.write("Verbs:", st.session_state.extracted_pos.get("verbs", []))
	# st.write("Adjectives:", st.session_state.extracted_pos.get("adjectives", []))
	# st.write("Nouns:", st.session_state.extracted_pos.get("nouns", []))
	# st.write("Pronouns:", st.session_state.extracted_pos.get("pronouns", []))

	# # Collect annotation inputs for each word
	# if st.session_state.words:
	# for i, word in enumerate(st.session_state.words):
	# st.write(f"Annotate the word: {word}")
	# st.session_state.labels[i] = st.selectbox(
	# f"Label for '{word}'", ["", "Verb", "Adj", "Noun", "Pronoun"],
	# key=f"label_{i}", index=["", "Verb", "Adj", "Noun", "Pronoun"].index(st.session_state.labels[i])
	# )
	# st.session_state.colors[i] = st.color_picker(
	# f"Color for '{word}'",
	# value=st.session_state.colors[i],
	# key=f"color_{i}"
	# )

	# # Generate button to process the annotations
	# if st.button("Generate Annotated Text"):
	# annotated_elements = []
	# for i, word in enumerate(st.session_state.words):
	# if st.session_state.labels[i] and st.session_state.colors[i] != "#FFFFFF":
	# annotated_elements.append((word, st.session_state.labels[i], st.session_state.colors[i]))
	# else:
	# annotated_elements.append(word)
	# annotated_elements.append(" ") # Add space between words

	# # Remove the last extra space added
	# if annotated_elements and annotated_elements[-1] == " ":
	# annotated_elements.pop()

	# # Display the annotated text using the `annotated_text` function
	# st.subheader("Annotated Text:")
	# annotated_text(*annotated_elements)

	# # Print the code for the annotated text
	# st.subheader("Generated Code:")
	# code_str = 'annotated_text(\n'
	# for elem in annotated_elements:
	# if isinstance(elem, tuple):
	# code_str += f' ("{elem[0]}", "{elem[1]}", "{elem[2]}"),\n'
	# else:
	# code_str += f' "{elem}",\n'
	# code_str += ')'
	# st.code(code_str, language='python')


	#------------------------------------------------------------------------
	# Functions: Parts of Speech + Buttons
	#------------------------------------------------------------------------

	# Function to split text into words
	def split_text(text):
	    """Split *text* on whitespace after detaching punctuation.

	    A single space is inserted in front of every character found in
	    ``string.punctuation``, so each mark begins a new piece (and stays
	    attached to whatever follows it).

	    Returns:
	        The list of whitespace-separated pieces; empty for blank input.
	    """
	    spacer = {ord(mark): f" {mark}" for mark in string.punctuation}
	    return text.translate(spacer).split()

	# Function to automatically label and color words based on parts of speech
	def auto_label_and_color_words(doc, words, include_verbs, include_adjectives, include_nouns, include_pronouns):
	    """Assign a part-of-speech label and highlight colour to each word.

	    Tokens are paired with words by exact text, in order of appearance: the
	    k-th token with a given text is paired with the k-th word with that
	    text. Pairing every token with the *first* matching word would leave
	    repeated words unannotated after their first occurrence.

	    Args:
	        doc: Iterable of tokens exposing ``text`` and ``pos_`` attributes.
	        words: Sequence of word strings to annotate.
	        include_verbs: Annotate tokens whose ``pos_`` is ``"VERB"``.
	        include_adjectives: Annotate tokens whose ``pos_`` is ``"ADJ"``.
	        include_nouns: Annotate tokens whose ``pos_`` is ``"NOUN"``.
	        include_pronouns: Annotate tokens whose ``pos_`` is ``"PRON"``.

	    Returns:
	        ``(labels, colors)``: two lists parallel to ``words``. Words that
	        are unmatched, excluded, or of another part of speech keep ``""``
	        and ``"#FFFFFF"``.
	    """
	    labels = [""] * len(words)
	    colors = ["#FFFFFF"] * len(words)

	    # pos_ tag -> (label, colour, whether the caller asked for it)
	    styles = {
	        "VERB": ("Verb", "#DAF1E7", include_verbs),
	        "ADJ": ("Adj", "#BDE5FF", include_adjectives),
	        "NOUN": ("Noun", "#D1DBE9", include_nouns),
	        "PRON": ("Pronoun", "#F6DCDD", include_pronouns),
	    }

	    # word text -> indices still unclaimed, stored in descending order so
	    # that pop() hands out the earliest remaining occurrence.
	    pending = {}
	    for index in range(len(words) - 1, -1, -1):
	        pending.setdefault(words[index], []).append(index)

	    for token in doc:
	        slots = pending.get(token.text)
	        if not slots:
	            # No (remaining) word has this text, e.g. the two tokenisations
	            # split the input differently.
	            continue
	        # Claim the position even if it ends up unlabelled, so the next
	        # identical token moves on to the next occurrence.
	        index = slots.pop()
	        style = styles.get(token.pos_)
	        if style is None:
	            continue
	        label, color, enabled = style
	        if enabled:
	            labels[index] = label
	            colors[index] = color
	    return labels, colors

	# Give each session-state entry used by this page an empty starting value,
	# but only when the entry does not exist yet.
	session_defaults = {
	    'user_text': "",
	    'words': [],
	    'labels': [],
	    'colors': [],
	    'extracted_pos': {},
	}
	for state_key, initial_value in session_defaults.items():
	    if state_key not in st.session_state:
	        st.session_state[state_key] = initial_value

	# Text box, pre-filled with the text stored at the last submission.
	user_text = st.text_area(
	    "Enter the text you want to annotate:",
	    value=st.session_state.user_text,
	    height=100,
	)

	# One checkbox per part of speech; all are ticked by default.
	include_verbs = st.checkbox("Include Verbs", value=True)
	include_adjectives = st.checkbox("Include Adjectives", value=True)
	include_nouns = st.checkbox("Include Nouns", value=True)
	include_pronouns = st.checkbox("Include Pronouns", value=True)

	# On submit: store the text, split it into words, tag it with spaCy and
	# keep the automatic annotations in session state.
	submitted = st.button("Submit Text")
	if submitted:
	    st.session_state.user_text = user_text
	    st.session_state.words = split_text(user_text)

	    doc = nlp(user_text)

	    auto_labels, auto_colors = auto_label_and_color_words(
	        doc,
	        st.session_state.words,
	        include_verbs,
	        include_adjectives,
	        include_nouns,
	        include_pronouns,
	    )
	    st.session_state.labels = auto_labels
	    st.session_state.colors = auto_colors

	    # Token texts grouped by part of speech (not filtered by the checkboxes).
	    pos_buckets = (
	        ("verbs", "VERB"),
	        ("adjectives", "ADJ"),
	        ("nouns", "NOUN"),
	        ("pronouns", "PRON"),
	    )
	    st.session_state.extracted_pos = {
	        bucket: [token.text for token in doc if token.pos_ == tag]
	        for bucket, tag in pos_buckets
	    }

	# Show the stored word lists once a submission has filled them in.
	extracted = st.session_state.extracted_pos
	if extracted:
	    st.subheader("Extracted Parts of Speech")
	    pos_rows = (
	        ("Verbs:", "verbs"),
	        ("Adjectives:", "adjectives"),
	        ("Nouns:", "nouns"),
	        ("Pronouns:", "pronouns"),
	    )
	    for heading, bucket in pos_rows:
	        st.write(heading, extracted.get(bucket, []))

	# Per-word editors: a label selectbox and a colour picker for every word.
	# Whatever each widget returns is written back into session state.
	label_options = ["", "Verb", "Adj", "Noun", "Pronoun"]
	if st.session_state.words:
	    for position, current_word in enumerate(st.session_state.words):
	        st.write(f"Annotate the word: {current_word}")

	        current_label = st.session_state.labels[position]
	        st.session_state.labels[position] = st.selectbox(
	            f"Label for '{current_word}'",
	            label_options,
	            key=f"label_{position}",
	            index=label_options.index(current_label),
	        )

	        current_color = st.session_state.colors[position]
	        st.session_state.colors[position] = st.color_picker(
	            f"Color for '{current_word}'",
	            value=current_color,
	            key=f"color_{position}",
	        )

	# Generate button to process the annotations
	def _build_annotated_elements(words, labels, colors):
	    """Return the argument list for ``annotated_text``.

	    A word becomes a ``(word, label, color)`` tuple when it has a label and
	    a colour other than ``"#FFFFFF"``; otherwise it stays a plain string.
	    Consecutive entries are separated by a single ``" "``.
	    """
	    elements = []
	    for word, label, color in zip(words, labels, colors):
	        if elements:
	            elements.append(" ")
	        if label and color != "#FFFFFF":
	            elements.append((word, label, color))
	        else:
	            elements.append(word)
	    return elements


	def _render_annotation_code(elements):
	    """Return Python source for an ``annotated_text(...)`` call over *elements*.

	    Each string is written with ``json.dumps`` so that quotes and
	    backslashes inside a word are escaped and the generated code stays
	    valid; ordinary words still come out as plain double-quoted literals.
	    """
	    import json  # local import: only this helper needs it

	    def quote(value):
	        return json.dumps(value, ensure_ascii=False)

	    lines = ['annotated_text(']
	    for elem in elements:
	        if isinstance(elem, tuple):
	            lines.append(' (' + ', '.join(quote(part) for part in elem) + '),')
	        else:
	            lines.append(f' {quote(elem)},')
	    lines.append(')')
	    return '\n'.join(lines)


	if st.button("Generate Annotated Text", type="primary"):
	    annotated_elements = _build_annotated_elements(
	        st.session_state.words,
	        st.session_state.labels,
	        st.session_state.colors,
	    )

	    # Display the annotated text using the `annotated_text` function
	    st.subheader("Annotated Text:")
	    annotated_text(*annotated_elements)

	    # Print the code for the annotated text
	    st.subheader("Generated Code:")
	    st.code(_render_annotation_code(annotated_elements), language='python')