"""Kaleidoscope: a Streamlit app that annotates text with parts of speech.

The user enters text, spaCy tags it, and verbs / adjectives / nouns /
pronouns are pre-labelled and coloured. Labels and colours can then be
adjusted per word before the final annotated text is rendered.
"""

#------------------------------------------------------------------------
# Import Modules
#------------------------------------------------------------------------

import string
from collections import defaultdict, deque

import spacy
import streamlit as st
from annotated_text import annotated_text
from PIL import Image

#------------------------------------------------------------------------
# Configurations
#------------------------------------------------------------------------

# Name of the spaCy pipeline used for part-of-speech tagging.
SPACY_MODEL_NAME = "en_core_web_sm"

# Colour meaning "no highlight" for a word.
DEFAULT_COLOR = "#FFFFFF"

# Choices offered in the per-word label selectbox ("" means unlabelled).
LABEL_OPTIONS = ["", "Verb", "Adj", "Noun", "Pronoun"]

# spaCy coarse POS tag -> (display label, highlight colour).
POS_STYLES = {
    "VERB": ("Verb", "#DAF1E7"),
    "ADJ": ("Adj", "#BDE5FF"),
    "NOUN": ("Noun", "#D1DBE9"),
    "PRON": ("Pronoun", "#F6DCDD"),
}

# spaCy coarse POS tag -> key used in the "extracted parts of speech" dict.
POS_GROUP_KEYS = {
    "VERB": "verbs",
    "ADJ": "adjectives",
    "NOUN": "nouns",
    "PRON": "pronouns",
}

# One-pass translation table that inserts a space before each punctuation mark.
_PUNCTUATION_SPACING = str.maketrans(
    {char: f" {char}" for char in string.punctuation}
)

# Streamlit page setup (kept ahead of every other Streamlit call).
icon = Image.open("MTSS.ai_Icon.png")

st.set_page_config(
    page_title="Kaleidoscope | Text Annotation",
    page_icon=icon,
    layout="centered",
    initial_sidebar_state="auto",
    menu_items={
        'About': "### *This application was created by* \n### LeVesseur Ph.D | MTSS.ai"
    }
)


@st.cache_resource(show_spinner=False)
def load_nlp_model(model_name=SPACY_MODEL_NAME):
    """Return the spaCy pipeline *model_name*, downloading it only if missing.

    Streamlit re-executes this script on every widget interaction, so the
    pipeline is cached as a shared resource instead of being downloaded and
    loaded again on each rerun.

    Args:
        model_name: Name of the spaCy pipeline package to load.

    Returns:
        The loaded spaCy ``Language`` object.
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # spaCy raises OSError when the pipeline package is not installed.
        spacy.cli.download(model_name)
        return spacy.load(model_name)


# Load the English NLP model
nlp = load_nlp_model()

#------------------------------------------------------------------------
# Header
#------------------------------------------------------------------------

st.title('MTSS:grey[.ai]')
st.header('Kaleidoscope:grey[ | Parts of Speech Annotation]')

#------------------------------------------------------------------------
# Sidebar
#------------------------------------------------------------------------

contact = st.sidebar.toggle('Handmade by \n**LeVesseur** :grey[ PhD] \n| :grey[MTSS.ai]')
if contact:
    st.sidebar.write('Inquiries: [info@mtss.ai](mailto:info@mtss.ai) \nProfile: [levesseur.com](http://levesseur.com) \nCheck out: [InkQA | Dynamic PDFs](http://www.inkqa.com)')

# Color options
colors = {
    "Green (DAF1E7)": "#DAF1E7",
    "Blue (BDE5FF)": "#BDE5FF",
    "Navy (D1DBE9)": "#D1DBE9",
    "Teal (D6EAED)": "#D6EAED",
    "Iceburg (E4EEF6)": "#E4EEF6",
    "Vermillion (F6DCDD)": "#F6DCDD",
}

# NOTE(review): the extent of this ``with`` block was reconstructed from a
# whitespace-collapsed source; confirm that the example and the directions
# are meant to render in the sidebar rather than on the main page.
with st.sidebar:
    st.divider()

    # Sidebar display (Option 1: Color blocks with hex)
    st.sidebar.header("Recommended Colors")
    for color_name, hex_code in colors.items():
        st.sidebar.color_picker(color_name, hex_code)

    st.subheader("Example")
    annotated_text(
        ("I", "Pronoun", "#F6DCDD"),
        " ",
        "really",
        " ",
        ("appreciate", "Verb", "#DAF1E7"),
        " ",
        ("all", "Pronoun", "#F6DCDD"),
        " ",
        ("that", "Pronoun", "#F6DCDD"),
        " ",
        "the",
        " ",
        ("social", "Adj", "#BDE5FF"),
        " ",
        "committee",
        " ",
        "has",
        " ",
        ("done", "Verb", "#DAF1E7"),
        " ",
        "to",
        " ",
        ("keep", "Verb", "#DAF1E7"),
        " ",
        ("us", "Pronoun", "#F6DCDD"),
        " ",
        ("feeling", "Verb", "#DAF1E7"),
        " ",
        ("connected", "Adj", "#BDE5FF"),
        " ",
        ".",
        " ",
        "I",
        " ",
        "also",
        " ",
        "really",
        " ",
        ("value", "Verb", "#DAF1E7"),
        " ",
        ("our", "Pronoun", "#F6DCDD"),
        " ",
        "in",
        " ",
        "-person",
        " ",
        ("meetings", "Noun", "#D1DBE9"),
        " ",
        "and",
        " ",
        "the",
        " ",
        "social",
        " ",
        ("opportunities", "Noun", "#D1DBE9"),
        " ",
        ("built", "Verb", "#DAF1E7"),
        " ",
        "into",
        " ",
        "these",
        " ",
        "meetings",
        " ",
        ".",
    )

    st.divider()

    st.subheader("Directions for Using the Text Annotation Tool")
    directions = """
1. **Enter Your Text**:
   - Type the text you want to annotate in the text area provided.

2. **Select Parts of Speech**:
   - Choose which parts of speech you want to include in the annotation by checking the corresponding boxes (e.g., Verbs, Adjectives, Nouns, Pronouns).

3. **Submit Your Text**:
   - Click the "Submit Text" button to process your input. The app will automatically label and color the words based on the selected parts of speech.

4. **Review the Annotations**:
   - The annotated text will be displayed, showing the parts of speech labels and colors applied to the words.

5. **Adjust Annotations (Optional)**:
   - You can manually adjust the labels and colors for each word if needed.

6. **Generate Annotated Text**:
   - After reviewing and adjusting the annotations, click the "Generate Annotated Text" button.
   - The final annotated text will be displayed.

7. **Take a Screenshot**:
   - To use the annotated text, take a screenshot of the displayed text.

8. **Adjust Text Width** (Optional):
   - If you want to adjust the width of the sentences for a better screenshot, minimize or resize your browser window accordingly before taking the screenshot.
"""
    st.markdown(directions)

#------------------------------------------------------------------------
# Functions: Parts of Speech + Buttons
#------------------------------------------------------------------------


def split_text(text):
    """Split *text* into whitespace-separated words, detaching punctuation.

    A space is inserted before every character in ``string.punctuation``, so
    each punctuation mark starts a new word (``"in-person."`` becomes
    ``["in", "-person", "."]``).

    Args:
        text: The raw user text.

    Returns:
        A list of word strings; empty when *text* is empty or whitespace.
    """
    return text.translate(_PUNCTUATION_SPACING).split()


def auto_label_and_color_words(doc, words, include_verbs, include_adjectives, include_nouns, include_pronouns):
    """Pre-label and colour *words* from the POS tags of the tokens in *doc*.

    Tokens are paired with words by exact text. Each token claims the
    earliest position of its text that no earlier token has claimed, so a
    word that appears several times is labelled at every occurrence instead
    of only at the first one.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_`` attributes.
        words: Word strings as produced by ``split_text``.
        include_verbs: Whether tokens tagged ``VERB`` are labelled.
        include_adjectives: Whether tokens tagged ``ADJ`` are labelled.
        include_nouns: Whether tokens tagged ``NOUN`` are labelled.
        include_pronouns: Whether tokens tagged ``PRON`` are labelled.

    Returns:
        A ``(labels, colors)`` pair of lists, each as long as *words*.
        Unlabelled positions hold ``""`` and ``"#FFFFFF"``.
    """
    enabled = {
        "VERB": include_verbs,
        "ADJ": include_adjectives,
        "NOUN": include_nouns,
        "PRON": include_pronouns,
    }
    labels = [""] * len(words)
    word_colors = [DEFAULT_COLOR] * len(words)

    # Word text -> queue of positions not yet claimed by a token.
    unclaimed = defaultdict(deque)
    for position, word in enumerate(words):
        unclaimed[word].append(position)

    for token in doc:
        positions = unclaimed.get(token.text)
        if not positions:
            # spaCy's tokenisation can differ from split_text; skip tokens
            # with no matching (or no remaining) word.
            continue
        # Claim the position even when this POS is not labelled, so later
        # tokens with the same text line up with later occurrences.
        position = positions.popleft()
        if enabled.get(token.pos_):
            labels[position], word_colors[position] = POS_STYLES[token.pos_]

    return labels, word_colors


def extract_parts_of_speech(doc):
    """Group the token texts of *doc* into verbs, adjectives, nouns, pronouns.

    Args:
        doc: Iterable of tokens exposing ``text`` and ``pos_`` attributes.

    Returns:
        A dict with the keys ``"verbs"``, ``"adjectives"``, ``"nouns"`` and
        ``"pronouns"``, each mapping to a list of token texts in document
        order.
    """
    extracted = {group_key: [] for group_key in POS_GROUP_KEYS.values()}
    for token in doc:
        group_key = POS_GROUP_KEYS.get(token.pos_)
        if group_key is not None:
            extracted[group_key].append(token.text)
    return extracted


def build_annotated_elements(words, labels, word_colors):
    """Build the argument list for ``annotated_text``.

    A word becomes a ``(word, label, color)`` tuple when it has a label and a
    non-white colour; otherwise it stays a plain string. Consecutive words
    are separated by a single ``" "`` element.

    Args:
        words: Word strings.
        labels: Label per word (``""`` for none).
        word_colors: Hex colour per word.

    Returns:
        A list of strings and 3-tuples, without a trailing separator.
    """
    elements = []
    for word, label, color in zip(words, labels, word_colors):
        if elements:
            elements.append(" ")  # Add space between words
        # Compare case-insensitively so white is treated as "no highlight"
        # whatever hex casing the colour value arrives in.
        if label and color.upper() != DEFAULT_COLOR:
            elements.append((word, label, color))
        else:
            elements.append(word)
    return elements


def build_annotation_code(elements):
    """Return Python source for an ``annotated_text(...)`` call.

    ``repr`` is used for every element so that words containing quotes or
    backslashes still yield valid Python literals.

    Args:
        elements: Strings and 3-tuples as built by ``build_annotated_elements``.

    Returns:
        The source code as a single string.
    """
    lines = ["annotated_text("]
    lines.extend(f"    {element!r}," for element in elements)
    lines.append(")")
    return "\n".join(lines)


# Initialize session state to store text and annotations
if 'user_text' not in st.session_state:
    st.session_state.user_text = ""
if 'words' not in st.session_state:
    st.session_state.words = []
if 'labels' not in st.session_state:
    st.session_state.labels = []
if 'colors' not in st.session_state:
    st.session_state.colors = []
if 'extracted_pos' not in st.session_state:
    st.session_state.extracted_pos = {}

# User input for the text
user_text = st.text_area("Enter the text you want to annotate:", value=st.session_state.user_text, height=100)

# Checkboxes for parts of speech to include
include_verbs = st.checkbox("Include Verbs", value=True)
include_adjectives = st.checkbox("Include Adjectives", value=True)
include_nouns = st.checkbox("Include Nouns", value=True)
include_pronouns = st.checkbox("Include Pronouns", value=True)

# Button to process the text
if st.button("Submit Text"):
    st.session_state.user_text = user_text
    st.session_state.words = split_text(user_text)

    # Process the text with spaCy
    doc = nlp(user_text)

    # Automatically label and color words based on parts of speech
    st.session_state.labels, st.session_state.colors = auto_label_and_color_words(
        doc,
        st.session_state.words,
        include_verbs,
        include_adjectives,
        include_nouns,
        include_pronouns,
    )

    # Extract parts of speech
    st.session_state.extracted_pos = extract_parts_of_speech(doc)

# Display extracted parts of speech
if st.session_state.extracted_pos:
    st.subheader("Extracted Parts of Speech")
    st.write("**Verbs:**", st.session_state.extracted_pos.get("verbs", []))
    st.write("**Adjectives:**", st.session_state.extracted_pos.get("adjectives", []))
    st.write("**Nouns:**", st.session_state.extracted_pos.get("nouns", []))
    st.write("**Pronouns:**", st.session_state.extracted_pos.get("pronouns", []))

# Collect annotation inputs for each word
if st.session_state.words:
    for i, word in enumerate(st.session_state.words):
        st.write(f"Annotate the word: {word}")
        st.session_state.labels[i] = st.selectbox(
            f"Label for '{word}'",
            LABEL_OPTIONS,
            key=f"label_{i}",
            index=LABEL_OPTIONS.index(st.session_state.labels[i]),
        )
        st.session_state.colors[i] = st.color_picker(
            f"Color for '{word}'",
            value=st.session_state.colors[i],
            key=f"color_{i}",
        )

    # Generate button to process the annotations
    # NOTE(review): nesting reconstructed from a whitespace-collapsed source;
    # the button is shown only once there are words to annotate. Confirm.
    if st.button("Generate Annotated Text", type="primary"):
        annotated_elements = build_annotated_elements(
            st.session_state.words,
            st.session_state.labels,
            st.session_state.colors,
        )

        # Display the annotated text using the `annotated_text` function
        st.subheader("Annotated Text:")
        annotated_text(*annotated_elements)

        # Print the code for the annotated text
        st.subheader("Generated Code:")
        st.code(build_annotation_code(annotated_elements), language='python')