import streamlit as st
import spacy
import numpy as np
from gensim import corpora, models
from utils import window, get_depths, get_local_maxima, compute_threshold, get_threshold_segments
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
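
# Streamlit app: split input text into topically coherent segments.
# Rough pipeline (a TextTiling-style approach): spaCy sentence splitting ->
# per-sentence LDA topic inference with gensim -> sliding-window topic sets ->
# cosine similarity between adjacent windows -> depth-score boundary detection
# using the helpers imported from utils.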

nlp = spacy.load('en_core_web_sm')

def print_list(lst):
    """Render a list of strings as Markdown bullet points."""
    for e in lst:
        st.markdown("- " + e)

st.subheader("Topic Modeling with Segmentation")
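# Input comes either from an uploaded .txt file or from the text area below.
# Uploaded contents are written to st.session_state["text"], the key shared with
# the text_area widget, so they pre-fill the text box.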
uploaded_file = st.file_uploader("Choose a text file", type=["txt"])
if uploaded_file is not None:
    st.session_state["text"] = uploaded_file.getvalue().decode('utf-8')

st.write("OR")

input_text = st.text_area(
    label="Enter text separated by newlines",
    value="",
    key="text",
    height=150
)

button = st.button('Get Segments')
if button and input_text != "":
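    # Split the input into sentences with spaCy, treating each line as its own doc.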
    texts = input_text.split('\n')
    sents = []
    for text in texts:
        doc = nlp(text)
        for sent in doc.sents:
            sents.append(sent)
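    # Tokenize each sentence: keep lowercase lemmas, dropping stop words,
    # punctuation, whitespace-only tokens, and tokens shorter than MIN_LENGTH.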
    MIN_LENGTH = 3
    tokenized_sents = [[token.lemma_.lower() for token in sent if 
                        not token.is_stop and not token.is_punct and token.text.strip() and len(token) >= MIN_LENGTH] 
                        for sent in sents]
    st.write("Modeling topics:")


    # Seed numpy's global RNG so the LDA fit is reproducible across runs.
    np.random.seed(123)

    N_TOPICS = 5
    N_PASSES = 5

    # Build a gensim dictionary and bag-of-words corpus over the tokenized
    # sentences, then fit an LDA topic model on them.
    dictionary = corpora.Dictionary(tokenized_sents)
    bow = [dictionary.doc2bow(sent) for sent in tokenized_sents]
    topic_model = models.LdaModel(corpus=bow, id2word=dictionary, num_topics=N_TOPICS, passes=N_PASSES)
    st.write("inferring topics ...")
    # Infer a topic distribution for every sentence, keeping only topics whose
    # probability exceeds THRESHOLD.
    THRESHOLD = 0.05
    doc_topics = list(topic_model.get_document_topics(bow, minimum_probability=THRESHOLD))
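    # Keep the k most probable topic ids for each sentence.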
    k = 3
    top_k_topics = [[t[0] for t in sorted(sent_topics, key=lambda x: x[1], reverse=True)][:k] 
                    for sent_topics in doc_topics]
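    # Group the per-sentence topic lists into sliding windows of WINDOW_SIZE
    # sentences (utils.window) and merge each window's topics into one set.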
    WINDOW_SIZE = 3
    window_topics = window(top_k_topics, n=WINDOW_SIZE)
    window_topics = [list(set(chain.from_iterable(window))) for window in window_topics]

    # One-hot encode each window's topic set so adjacent windows can be compared.
    binarizer = MultiLabelBinarizer(classes=range(N_TOPICS))
    encoded_topic = binarizer.fit_transform(window_topics)
    st.write("generating segments ...")
    # Cosine similarity between consecutive windows; a dip suggests a topic shift.
    sims_topic = [cosine_similarity([pair[0]], [pair[1]])[0][0] for pair in zip(encoded_topic, encoded_topic[1:])]
    # Boundary detection over the similarity curve (TextTiling-style): compute
    # depth scores, keep local maxima, and retain only those above a threshold
    # derived from the scores themselves (helpers from utils).
    depths_topic = get_depths(sims_topic)
    filtered_topic = get_local_maxima(depths_topic, order=1)
    threshold_topic = compute_threshold(filtered_topic)
    threshold_segments_topic = get_threshold_segments(filtered_topic, threshold_topic)

    # Shift the boundary indices by WINDOW_SIZE to map window positions back to
    # sentence positions, build (start, end) slices covering every sentence, and
    # cut the sentence list into segments.
    segment_ids = threshold_segments_topic + WINDOW_SIZE
    segment_ids = [0] + segment_ids.tolist() + [len(sents)]
    slices = list(zip(segment_ids[:-1], segment_ids[1:]))

    segmented = [sents[s[0]: s[1]] for s in slices]

    # Render each segment as a bulleted list, separated by horizontal rules.
    for segment in segmented[:-1]:
        print_list([s.text for s in segment])
        st.markdown("---")

    print_list([s.text for s in segmented[-1]])