"""Streamlit app that summarizes pasted text or an uploaded document.

Text typed into the text area takes priority over an uploaded file. The
source text is split into sentence-aligned chunks, each chunk is summarized
with the ``facebook/bart-large-cnn`` pipeline, and the partial summaries are
joined into one result.
"""

from concurrent.futures import ThreadPoolExecutor

import fitz  # PyMuPDF for handling PDFs
import pypandoc
import streamlit as st
from docx import Document
from transformers import pipeline


@st.cache_resource
def load_summarizer():
    """Build the summarization pipeline once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the model would be loaded again each time.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")


# Initialize the summarization pipeline
pipe = load_summarizer()

# Title of the app
st.title("Text Summarizer")

# Input text box
input_text = st.text_area("Enter the text you want to summarize", height=200)

# File uploader for PDF, TXT, DOC, and DOCX files
uploaded_file = st.file_uploader(
    "Upload a PDF, TXT, DOC, or DOCX file",
    type=["pdf", "txt", "doc", "docx"],
)


def extract_text_from_pdf(file):
    """Return the concatenated text of every page of a PDF file object."""
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)


def extract_text_from_txt(file):
    """Return the contents of a plain-text file object decoded as UTF-8.

    Raises:
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    return file.read().decode("utf-8")


def extract_text_from_doc(file):
    """Return the plain text of a legacy ``.doc`` file object via pandoc."""
    # NOTE(review): pandoc does not appear to list "doc" among its input
    # formats (only "docx"), so this call may raise at runtime -- confirm
    # against the installed pandoc before relying on .doc uploads.
    return pypandoc.convert_text(file.read(), 'plain', format='doc')


def extract_text_from_docx(file):
    """Return the paragraphs of a ``.docx`` file object joined by newlines."""
    doc = Document(file)
    return '\n'.join(para.text for para in doc.paragraphs)


# Maps the MIME type reported by the uploader to the matching extractor.
_EXTRACTORS = {
    "application/pdf": extract_text_from_pdf,
    "text/plain": extract_text_from_txt,
    "application/msword": extract_text_from_doc,
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        extract_text_from_docx,
}

_SENTENCE_SEPARATOR = ". "
_SENTENCE_ENDINGS = (".", "!", "?")


def _join_sentences(sentences):
    """Join sentences back into one chunk that ends with punctuation."""
    chunk = _SENTENCE_SEPARATOR.join(sentences)
    # The final sentence of the text usually keeps its own full stop, so
    # only add one when it is missing (avoids producing "..").
    if not chunk.endswith(_SENTENCE_ENDINGS):
        chunk += "."
    return chunk


def chunk_text(text, max_len=1024):
    """Split ``text`` into sentence-aligned chunks of about ``max_len`` chars.

    Sentences are detected by splitting on ``". "``. Consecutive sentences
    are packed into a chunk until adding the next one (including its
    separator) would exceed ``max_len`` characters.

    Args:
        text: The text to split.
        max_len: Maximum chunk length, measured in characters.

    Returns:
        A list of chunk strings. Empty or whitespace-only input yields an
        empty list. A single sentence longer than ``max_len`` is kept whole
        as its own chunk rather than being cut mid-sentence.
    """
    sentences = [
        part.strip()
        for part in text.split(_SENTENCE_SEPARATOR)
        if part.strip()
    ]

    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        # Count the separator that joining will insert before this sentence.
        added_length = len(sentence)
        if current_chunk:
            added_length += len(_SENTENCE_SEPARATOR)

        if current_chunk and current_length + added_length > max_len:
            chunks.append(_join_sentences(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += added_length

    if current_chunk:
        chunks.append(_join_sentences(current_chunk))
    return chunks


def summarize_chunk(chunk):
    """Return the model's summary text for a single chunk."""
    # truncation=True asks the tokenizer to clip a chunk that is longer than
    # the model accepts (e.g. one very long sentence) instead of failing.
    return pipe(chunk, truncation=True)[0]['summary_text']


def summarize_text(text):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    Returns an empty string when ``text`` contains nothing to summarize.
    """
    chunks = chunk_text(text)
    if not chunks:
        return ""
    with ThreadPoolExecutor() as executor:
        summaries = list(executor.map(summarize_chunk, chunks))
    return ' '.join(summaries)


# Summarize button
if st.button("Summarize"):
    source_text = None

    if input_text.strip():
        source_text = input_text
    elif uploaded_file is not None:
        extractor = _EXTRACTORS.get(uploaded_file.type)
        if extractor is None:
            # Previously an unrecognised MIME type left the text variable
            # unbound and crashed with a NameError.
            st.error(f"Unsupported file type: {uploaded_file.type}")
        else:
            extracted = extractor(uploaded_file)
            if extracted.strip():
                source_text = extracted
            else:
                st.warning("No text could be extracted from the uploaded file.")
    else:
        st.write("Please enter some text or upload a file to summarize.")

    if source_text is not None:
        st.subheader("Summary")
        st.write(summarize_text(source_text))