from typing import Union

import streamlit as st
from transformers import AutoTokenizer, pipeline

from qa.qa import parse_file

def summarization_model(
    model_name: str = "facebook/bart-large-cnn",
    custom_tokenizer: Union[AutoTokenizer, bool] = False,
):
    """Load a Hugging Face summarization pipeline for the given model name."""
    summarizer = pipeline(
        task="summarization",
        model=model_name,
        # use the model's own tokenizer unless a custom one is provided
        tokenizer=custom_tokenizer if custom_tokenizer else model_name,
    )
    return summarizer
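
# Illustrative usage (a sketch, not part of the app flow; the model weights
# are downloaded on first use):
#   summarizer = summarization_model()
#   summarizer("Some long article ...", max_length=200, min_length=30)
#   # -> [{"summary_text": "..."}]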

def split_string_into_token_chunks(s: str, _tokenizer: AutoTokenizer, chunk_size: int):
    """Split a string into chunks of at most `chunk_size` tokens each."""
    # Tokenize the entire string
    token_ids = _tokenizer.encode(s)
    # Split the token ids into chunks of the desired size
    chunks = [token_ids[i:i + chunk_size] for i in range(0, len(token_ids), chunk_size)]
    # Decode each chunk back into a string
    return [_tokenizer.decode(chunk) for chunk in chunks]
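
# Illustrative example (exact token counts depend on the tokenizer): a document
# that encodes to 2000 tokens with chunk_size=901 yields three strings of
# roughly 901, 901, and 198 tokens, each summarized independently further below.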

def summarization_main():
    st.markdown("<h2 style='text-align: center'>Text Summarization</h2>", unsafe_allow_html=True)
    st.markdown("<h3 style='text-align: left'><b>What is text summarization about?</b></h3>", unsafe_allow_html=True)
    st.write("""
        Text summarization is a common NLP task concerned with producing a shorter
        version of a given text while preserving the important information it contains.
    """)
    OPTION_1 = "I want to input some text"
    OPTION_2 = "I want to upload a file"
    option = st.radio("How would you like to start? Choose an option below", [OPTION_1, OPTION_2])

    # greenlight to summarize
    text_is_given = False
    text = ""
    if option == OPTION_1:
        sample_text = ""
        text = st.text_area(
            "Input a text in English (10,000 characters max)",
            value=sample_text,
            max_chars=10_000,
            height=330,
        )
        # greenlight to summarize once the user has entered some text
        if text != sample_text:
            text_is_given = True
    elif option == OPTION_2:
        uploaded_file = st.file_uploader(
            "Upload a pdf, docx, or txt file (scanned documents not supported)",
            type=["pdf", "docx", "txt"],
            help="Scanned documents are not supported yet 🥲",
        )
        if uploaded_file is not None:
            # parse the file with the custom parsers and concatenate the pieces for the summarizer
            text = " ".join(parse_file(uploaded_file))
            # greenlight to summarize
            text_is_given = True
    if text_is_given:
        # bounds on the number of tokens in the summary
        min_length, max_length = 30, 200
        user_max_length = max_length
        # user_max_length = st.slider(
        #     label="Maximal number of tokens in the summary",
        #     min_value=min_length,
        #     max_value=max_length,
        #     value=150,
        #     step=10,
        # )
        summarizer_downloaded = False
        # load the tokenizer, used to split the input document into feasible chunks
        model_name = "facebook/bart-large-cnn"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # the maximum number of tokens the model can handle is model-dependent;
        # keep 88% of it as the chunk size to leave room for tokens added by the tokenizer
        chunk_size = int(0.88 * tokenizer.model_max_length)
        # load the summarization model
        with st.spinner(text="Loading summarization model..."):
            summarizer = summarization_model(model_name=model_name)
            summarizer_downloaded = True
        if summarizer_downloaded:
            button = st.button("Summarize!")
            if button:
                with st.spinner(text="Summarizing text..."):
                    # summarize each chunk of the input text separately to avoid
                    # exceeding the model's maximum number of tokens
                    chunks = split_string_into_token_chunks(text, tokenizer, chunk_size)
                    summary = "\n".join(
                        summarizer(chunk, max_length=user_max_length, min_length=min_length)[0]["summary_text"]
                        for chunk in chunks
                    )
                st.markdown("<h3 style='text-align: left'><b>Summary</b></h3>", unsafe_allow_html=True)
                st.write(summary)
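
# Run the app when the script is executed directly (e.g. `streamlit run app.py`);
# a minimal sketch, assuming this module is the Space's entry point rather than
# being imported from a multipage launcher.
if __name__ == "__main__":
    summarization_main()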