import streamlit as st
from transformers import pipeline
import fitz  # PyMuPDF for handling PDFs
from docx import Document
import pypandoc
from concurrent.futures import ThreadPoolExecutor

# Initialize the summarization pipeline
@st.cache_resource
def _load_summarizer():
    """Build the BART summarization pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction, so an
    uncached ``pipeline(...)`` call would rebuild the model each time.
    ``st.cache_resource`` keeps a single shared instance per server process.
    NOTE(review): ``st.cache_resource`` needs a reasonably recent Streamlit
    release -- confirm the deployed version provides it.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")


pipe = _load_summarizer()

# Title of the app
st.title("Text Summarizer")

# Input text box
input_text = st.text_area("Enter the text you want to summarize", height=200)

# File uploader for PDF, TXT, DOC, and DOCX files
uploaded_file = st.file_uploader("Upload a PDF, TXT, DOC, or DOCX file", type=["pdf", "txt", "doc", "docx"])

def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    ``file`` must offer ``read()`` returning the raw PDF bytes; the document
    is opened from that in-memory buffer and closed again before returning.
    """
    pdf_bytes = file.read()
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

def extract_text_from_txt(file):
    """Decode an uploaded plain-text file into a ``str``.

    Args:
        file: Binary file-like object whose ``read()`` returns ``bytes``.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark is dropped, and
        bytes that are not valid UTF-8 become U+FFFD rather than raising
        ``UnicodeDecodeError``.
    """
    raw = file.read()
    # "utf-8-sig" reads ordinary UTF-8 too, but strips the BOM some editors
    # prepend; errors="replace" keeps one stray byte from failing the upload.
    return raw.decode("utf-8-sig", errors="replace")

def extract_text_from_doc(file):
    """Convert an uploaded legacy Word (.doc) file to plain text via pypandoc.

    ``file`` must offer ``read()``; its full content is handed to
    ``pypandoc.convert_text`` with input format ``'doc'`` and output ``'plain'``.
    """
    # NOTE(review): pandoc's documented input formats do not appear to include
    # 'doc' (only 'docx') -- confirm this conversion works in deployment.
    raw = file.read()
    return pypandoc.convert_text(raw, 'plain', format='doc')

def extract_text_from_docx(file):
    """Return the paragraph texts of a .docx file, one paragraph per line.

    ``file`` is passed straight to ``Document``; the ``text`` of each entry in
    ``paragraphs`` is joined with newline characters.
    """
    paragraphs = Document(file).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)

def chunk_text(text, max_len=1024):
    """Group the sentences of *text* into chunks of at most *max_len* characters.

    The text is split on ``'. '``; consecutive sentences are packed into one
    chunk as long as the re-joined chunk (separators and closing period
    included) still fits in *max_len*.

    Args:
        text: The text to split.
        max_len: Maximum chunk length in characters.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input yields ``[]``. A single sentence longer than *max_len* is not
        split; it is returned as its own, oversize chunk.
    """
    separator = '. '
    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # len(separator.join(pending))

    def finish(sentences):
        joined = separator.join(sentences)
        # split() removed the sentence-ending period; restore it unless the
        # chunk already ends with closing punctuation (avoids "..").
        if not joined.endswith(('.', '!', '?')):
            joined += '.'
        return joined

    for sentence in text.split(separator):
        sentence = sentence.strip()
        if not sentence:
            # Blank fragments (empty input, trailing ". ") carry no content.
            continue

        if pending:
            projected = pending_len + len(separator) + len(sentence)
        else:
            projected = len(sentence)

        # The +1 reserves room for the period finish() may append.
        if pending and projected + 1 > max_len:
            chunks.append(finish(pending))
            pending = [sentence]
            pending_len = len(sentence)
        else:
            pending.append(sentence)
            pending_len = projected

    if pending:
        chunks.append(finish(pending))

    return chunks

def summarize_chunk(chunk):
    """Summarize one chunk of text with the module-level ``pipe``.

    Args:
        chunk: The text to summarize.

    Returns:
        The ``summary_text`` field of the first result the pipeline returns.
    """
    # truncation=True has the tokenizer cut an input that exceeds the model's
    # maximum length instead of feeding it an over-long sequence, which can
    # happen when a chunk is a single very long sentence.
    results = pipe(chunk, truncation=True)
    return results[0]['summary_text']

# Summarize button
if st.button("Summarize"):
    # Extractor for each MIME type the uploader is expected to report.
    extractors = {
        "application/pdf": extract_text_from_pdf,
        "text/plain": extract_text_from_txt,
        "application/msword": extract_text_from_doc,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }

    # Typed text takes precedence over an uploaded file.
    source_text = None
    if input_text:
        source_text = input_text
    elif uploaded_file is not None:
        extractor = extractors.get(uploaded_file.type)
        if extractor is None:
            # Without this branch an unrecognised MIME type would leave the
            # text undefined and crash with a NameError further down.
            st.error(f"Unsupported file type: {uploaded_file.type}")
        else:
            source_text = extractor(uploaded_file)
    else:
        st.write("Please enter some text or upload a file to summarize.")

    if source_text is not None:
        if not source_text.strip():
            # Nothing meaningful to send to the model.
            st.warning("No text found to summarize.")
        else:
            chunks = chunk_text(source_text)
            # Summarize the chunks concurrently; map() preserves chunk order.
            with ThreadPoolExecutor() as executor:
                summaries = list(executor.map(summarize_chunk, chunks))
            st.subheader("Summary")
            st.write(' '.join(summaries))