File size: 3,252 Bytes
4df3ec6
4354680
fe021fb
cf53b75
0c2753a
4df3ec6
4b21134
 
fe021fb
f3505bb
 
 
 
cf53b75
4b21134
4065f3f
 
4354680
4b21134
 
4065f3f
 
cf53b75
e36f01a
 
 
f39343a
4b21134
 
 
4354680
f39343a
fe021fb
f39343a
fe021fb
 
 
4354680
fe021fb
4354680
4b21134
4df3ec6
 
4354680
 
 
 
4df3ec6
f39343a
4df3ec6
 
 
b916752
4354680
4df3ec6
4b21134
 
 
 
6f0c363
fe021fb
4df3ec6
4b21134
 
 
 
f3505bb
 
4354680
0c2753a
f3505bb
 
 
 
 
 
4354680
 
 
 
 
 
 
 
 
 
 
4b21134
 
4df3ec6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import torch
import nltk
import validators
import streamlit as st
from transformers import T5Tokenizer, T5ForConditionalGeneration

# local modules
from extractive_summarizer.model_processors import Summarizer
from src.utils import clean_text, fetch_article_text
from src.abstractive_summarizer import (
    abstractive_summarizer,
    preprocess_text_for_abstractive_summarization,
)

# abstractive summarizer model
@st.cache(allow_output_mutation=True)
def load_abs_model():
    """Load and cache the pretrained T5 tokenizer and model for abstractive summarization.

    Returns:
        tuple: ``(T5Tokenizer, T5ForConditionalGeneration)`` both loaded from
        the "t5-base" checkpoint.
    """
    # allow_output_mutation=True: st.cache cannot reliably hash/copy a torch
    # model, and the returned objects may be mutated after load (e.g. moved to
    # a device). Without this flag Streamlit either raises a caching error or
    # re-downloads/re-loads the model on every rerun.
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    return tokenizer, model


if __name__ == "__main__":
    # ---------------------------------
    # Main Application
    # ---------------------------------
    st.title("Text Summarizer πŸ“")
    summarize_type = st.sidebar.selectbox(
        "Summarization type", options=["Extractive", "Abstractive"]
    )
    # sentence-tokenizer data required by the summarization helpers
    nltk.download("punkt")

    inp_text = st.text_input("Enter text or a url here")

    is_url = validators.url(inp_text)
    if is_url:
        # complete text, chunks to summarize (list of sentences for long docs)
        text, clean_txt = fetch_article_text(url=inp_text)
    else:
        clean_txt = clean_text(inp_text)

    # let the user inspect the cleaned input before summarizing
    with st.expander("View input text"):
        if is_url:
            # for urls, clean_txt is a list of chunks; show the first one
            st.write(clean_txt[0])
        else:
            st.write(clean_txt)
    summarize = st.button("Summarize")

    # called on toggle button [summarize]
    if summarize:
        if summarize_type == "Extractive":
            if is_url:
                # url input arrives as a list of chunks; re-join for the
                # extractive model, which takes one string
                text_to_summarize = " ".join(clean_txt)
            else:
                # BUG FIX: text_to_summarize was never assigned for plain-text
                # input, raising NameError at the ext_model(...) call below
                text_to_summarize = clean_txt

            with st.spinner(
                text="Creating extractive summary. This might take a few seconds ..."
            ):
                ext_model = Summarizer()
                summarized_text = ext_model(text_to_summarize, num_sentences=6)

        elif summarize_type == "Abstractive":
            with st.spinner(
                text="Creating abstractive summary. This might take a few seconds ..."
            ):
                text_to_summarize = clean_txt
                abs_tokenizer, abs_model = load_abs_model()
                if not is_url:
                    # plain text must be chunked to fit the model's input
                    # window; url input is already a list of chunks
                    text_to_summarize = preprocess_text_for_abstractive_summarization(
                        tokenizer=abs_tokenizer, text=clean_txt
                    )
                summarized_text = abstractive_summarizer(
                    abs_tokenizer, abs_model, text_to_summarize
                )

        # final summarized output
        st.subheader("Summarized text")
        st.info(summarized_text)