"""Gradio demo serving a pickled text classifier alongside a flair NER tagger.

The script loads a pickled model and vectorizer plus a flair
``SequenceTagger`` at import time, exposes helper functions that run each
model on raw text, and wires both into a single ``gr.Interface``.
"""

# Imports
import re
import string

import gradio as gr
import nltk
# NOTE(review): pickle5 is a backport package; the standard-library ``pickle``
# may be sufficient on newer Python versions -- confirm before switching.
import pickle5 as pickle
from flair.data import Sentence
from flair.models import SequenceTagger
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

nltk.download('stopwords')


def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards.

    SECURITY: unpickling can execute arbitrary code, so ``path`` must only
    ever point at a trusted, locally shipped artifact.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load pickled model and vectorizer
model = 'lr_021823.pkl'
model_loaded = _load_pickle(model)

vectorizer = 'vectorizer_021823.pkl'
vectorizer_loaded = _load_pickle(vectorizer)

# Process input text, including removing stopwords, converting to lowercase,
# and removing punctuation
stop = stopwords.words('english')
# Set form of ``stop`` so each membership test is O(1) instead of a list scan.
_STOP_WORDS = frozenset(stop)
# Compiled once: matches any single ASCII punctuation character.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")


def process_text(text: str) -> str:
    """Normalize ``text`` for the vectorizer.

    Steps, in order: drop whitespace-separated tokens that are English
    stopwords, lowercase, replace punctuation with spaces, and collapse runs
    of whitespace into single spaces.

    Args:
        text: Raw input text.

    Returns:
        The cleaned, lowercased, single-space-separated text.
    """
    # NOTE(review): stopwords are filtered BEFORE lowercasing, so capitalized
    # stopwords such as "The" are kept. This order is preserved on purpose
    # because the pickled vectorizer was presumably fitted on text processed
    # the same way -- confirm against the training pipeline before changing.
    kept = [word for word in text.split() if word not in _STOP_WORDS]
    # Join the tokens explicitly rather than str()-ing the list, which relied
    # on the punctuation pass to erase the list repr and leaked escape
    # sequences (e.g. "u200b") for non-printable characters.
    lowered = " ".join(kept).lower()
    without_punctuation = _PUNCTUATION_RE.sub(" ", lowered)
    return " ".join(without_punctuation.split())


# Vectorize text
def vectorize_text(text):
    """Clean ``text`` with ``process_text`` and transform it with the loaded vectorizer.

    Args:
        text: Raw input text.

    Returns:
        The result of ``vectorizer_loaded.transform`` on the single cleaned
        document.
    """
    return vectorizer_loaded.transform([process_text(text)])


# Valid input for the model so number of features match
def class_predict(text):
    """Predict the classification label for ``text``.

    The text goes through the same cleaning and vectorizing path as
    ``vectorize_text`` so the features line up with what the model expects.

    Args:
        text: Raw input text.

    Returns:
        Whatever ``model_loaded.predict`` returns for the single input
        document (presumably an array holding one label -- verify).
    """
    return model_loaded.predict(vectorize_text(text))


# Specify NER model
tagger = SequenceTagger.load('best-model.pt')


# Runs NER on input text
def run_ner(input_text):
    """Run the NER tagger on ``input_text``.

    Args:
        input_text: Raw input text.

    Returns:
        A dict with the original ``"text"`` and an ``"entities"`` list; each
        entity is a dict with ``entity`` (label value), ``word`` (span text),
        ``start`` and ``end`` (span positions), which is the structure passed
        to the ``gr.HighlightedText`` output below.
    """
    sentence = Sentence(input_text)
    tagger.predict(sentence)
    entities = [
        {
            'entity': span.get_label('ner').value,
            'word': span.text,
            'start': span.start_position,
            'end': span.end_position,
        }
        for span in sentence.get_spans('ner')
    ]
    return {"text": input_text, "entities": entities}


# Run both models, and return a tuple of their results
def run_models(input_text):
    """Run the classifier and the NER tagger on ``input_text``.

    Args:
        input_text: Raw input text from the interface.

    Returns:
        A ``(prediction, entities)`` tuple matching the two interface outputs.
    """
    prediction = class_predict(input_text)
    entities = run_ner(input_text)
    return prediction, entities


# Define interface
demo = gr.Interface(
    fn=run_models,
    title="Text Classification & Named Entity Recognition Demo",
    description="This is a demo of a text classification model using logistic regression as well as a named entity recognition model. Enter in some text or use one of the provided examples. Note that common named entity recognition tags include **geo** (geographical entity), **org** (organization), **per** (person), and **tim** (time).",
    article='*This demo is based on Logistic Regression and Named Entity Recognition models trained by Curtis Pond and Julia Nickerson as part of their FourthBrain capstone project. For more information, check out their [GitHub repo](https://github.com/nickersonj/glg-capstone).*',
    inputs=gr.Textbox(lines=10, placeholder='Input text here...', label="Input Text"),
    outputs=[
        gr.Textbox(
            label="Predicted Classification Label: Other: 0, Healthcare: 1, Technology: 2",
            lines=2,
            placeholder='Predicted label will appear here...',
        ),
        gr.HighlightedText(label='Named Entity Recognition Results'),
    ],
    # These examples are just placeholders; once the LR model is working, we
    # can use longer example text such as paragraphs
    examples=[
        'The indictments were announced Tuesday by the Justice Department in Cairo.',
        "In 2019, the men's singles winner was Novak Djokovic who defeated Roger Federer in a tournament taking place in the United Kingdom.",
        'In a study published by the American Heart Association on January 18, researchers at the Johns Hopkins School of Medicine found that meal timing did not impact weight.',
    ],
    allow_flagging='never',
)

# Only start the server when executed as a script, so importing this module
# (e.g. to reuse the helpers) does not launch the app.
if __name__ == "__main__":
    demo.launch()