# File size: 2,980 Bytes
# Revision: c4aa562
# (The line-number gutter 1-84 copied from the web file viewer is not part of the program.)
import streamlit as st
import spacy
import numpy as np
from gensim import corpora, models
from utils import window, get_depths, get_local_maxima, compute_threshold, get_threshold_segments
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
# Load the `en_core_web_sm` spaCy pipeline once at module level. The script
# below calls it on each input line and reads the resulting sentences plus the
# per-token lemma, stop-word and punctuation attributes.
# NOTE(review): Streamlit presumably re-executes this script on every widget
# interaction, so this load may repeat on each rerun -- consider caching it;
# confirm against the Streamlit version in use.
nlp = spacy.load('en_core_web_sm')
def print_list(lst):
    """Write each string of *lst* to the page as its own Markdown bullet.

    Every element is prefixed with "- " and passed to ``st.markdown`` in
    order, one call per element.
    """
    for item in lst:
        bullet = "- " + item
        st.markdown(bullet)
st.subheader("Topic Modeling with Segmentation")

# Input option 1: upload a plain-text file. Its decoded contents are stored
# under the session-state key "text", which is also the key given to the text
# area below -- presumably so the uploaded text appears in that widget.
uploaded_file = st.file_uploader("choose a text file", type=["txt"])
if uploaded_file is not None:
    try:
        # "utf-8-sig" decodes UTF-8 and additionally drops a leading
        # byte-order mark, which plain "utf-8" would keep as part of the text.
        st.session_state["text"] = uploaded_file.getvalue().decode("utf-8-sig")
    except UnicodeDecodeError:
        # Report the problem on the page instead of letting the exception
        # abort the script run.
        st.error("Could not read the uploaded file: it is not valid UTF-8 text.")

st.write("OR")

# Input option 2: type or paste the text directly.
input_text = st.text_area(
    label="Enter text separated by newlines",
    value="",
    key="text",
    height=150,
)

# The value returned by st.button is checked by the processing step below.
button = st.button('Get Segments')
# Tunable parameters of the segmentation pipeline.
MIN_LENGTH = 3     # tokens with fewer characters than this are dropped
N_TOPICS = 5       # number of LDA topics
N_PASSES = 5       # LDA training passes over the corpus
THRESHOLD = 0.05   # minimum probability for a topic to be reported
TOP_K = 3          # topics kept per sentence, most probable first
WINDOW_SIZE = 3    # `n` passed to window() when grouping sentence topics
RANDOM_SEED = 123  # seed for NumPy's global RNG, set before LDA training


def _split_sentences(text):
    """Return the spaCy sentence spans found in the non-blank lines of *text*."""
    # splitlines() also handles "\r\n" line endings (e.g. uploaded Windows
    # files); whitespace-only lines are skipped so they never reach the
    # pipeline.
    lines = [line.strip() for line in text.splitlines()]
    sentences = []
    for doc in nlp.pipe(line for line in lines if line):
        sentences.extend(doc.sents)
    return sentences


def _content_lemmas(sent):
    """Return the lower-cased lemmas of the tokens of *sent* worth modeling.

    Stop words, punctuation, whitespace-only tokens and tokens shorter than
    MIN_LENGTH characters are left out.
    """
    return [
        token.lemma_.lower()
        for token in sent
        if not token.is_stop
        and not token.is_punct
        and token.text.strip()
        and len(token) >= MIN_LENGTH
    ]


def _segment_starts(sentence_topics):
    """Return the sentence indices at which a new segment begins.

    *sentence_topics* holds, per sentence, the ids of its top topics. Returns
    an empty list when fewer than two windows can be formed, because there is
    then no pair of neighbouring windows to compare.
    """
    # Union of the topic ids inside each window produced by utils.window().
    merged = [
        list(set(chain.from_iterable(group)))
        for group in window(sentence_topics, n=WINDOW_SIZE)
    ]
    if len(merged) < 2:
        return []

    # One 0/1 indicator row per window, one column per topic id.
    encoded = MultiLabelBinarizer(classes=range(N_TOPICS)).fit_transform(merged)
    sims = [
        cosine_similarity([left], [right])[0][0]
        for left, right in zip(encoded, encoded[1:])
    ]

    # The helpers below come from utils and are opaque from this file.
    # NOTE(review): presumably they turn the similarity curve into depth
    # scores, keep local maxima and select those above a data-driven
    # threshold -- confirm against utils.
    depths = get_depths(sims)
    peaks = get_local_maxima(depths, order=1)
    cutoff = compute_threshold(peaks)
    # The selected ids are shifted by WINDOW_SIZE before being used as
    # sentence indices. NOTE(review): presumably this maps a similarity index
    # to the first sentence of the new segment -- confirm against utils.
    starts = get_threshold_segments(peaks, cutoff) + WINDOW_SIZE
    return starts.tolist()


def _run_segmentation(text):
    """Segment *text* by topic and render the segments on the page."""
    sents = _split_sentences(text)
    tokenized_sents = [_content_lemmas(sent) for sent in sents]

    dictionary = corpora.Dictionary(tokenized_sents)
    if len(dictionary) == 0:
        # LdaModel raises ValueError when the vocabulary is empty, which
        # happens when the input holds only stop words, punctuation or very
        # short words.
        st.warning(
            "No usable words found in the text. Please enter text containing "
            "words of at least %d characters that are not stop words." % MIN_LENGTH
        )
        return

    st.write("Modeling topics:")
    np.random.seed(RANDOM_SEED)
    bow = [dictionary.doc2bow(sent) for sent in tokenized_sents]
    topic_model = models.LdaModel(
        corpus=bow, id2word=dictionary, num_topics=N_TOPICS, passes=N_PASSES
    )

    st.write("inferring topics ...")
    doc_topics = topic_model.get_document_topics(bow, minimum_probability=THRESHOLD)
    # Keep only the ids of the TOP_K most probable topics of each sentence.
    top_k_topics = [
        [topic_id for topic_id, _ in
         sorted(sent_topics, key=lambda pair: pair[1], reverse=True)[:TOP_K]]
        for sent_topics in doc_topics
    ]

    st.write("generating segments ...")
    # Segment i spans sentences [segment_ids[i], segment_ids[i + 1]).
    segment_ids = [0] + _segment_starts(top_k_topics) + [len(sents)]
    segments = [sents[start:stop] for start, stop in zip(segment_ids, segment_ids[1:])]

    # Bullet list per segment, with a horizontal rule between segments.
    for index, segment in enumerate(segments):
        if index:
            st.markdown("""---""")
        print_list([sent.text for sent in segment])


if button and input_text:
    _run_segmentation(input_text)