"""Gradio app that extracts the text of an uploaded PDF and summarizes it."""

from collections.abc import Iterator

import gradio as gr
import pdfplumber
import PyPDF2
from transformers import pipeline

# Load the summarization pipeline once at import so every request reuses it.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Number of whitespace-separated words handed to the model per chunk.
# chunk_text counts words, not model tokens, and a word commonly maps to more
# than one token, so this is kept below the model's 1024-token input window.
# NOTE(review): 700 is a heuristic; truncation=True in summarize_pdf is the
# hard guard against over-long inputs.
_WORDS_PER_CHUNK = 700


def extract_text_from_pdf(pdf_file) -> str:
    """Extract text from a PDF using PyPDF2 with a fallback to pdfplumber.

    Args:
        pdf_file: Whatever the caller hands over for the PDF; it is passed
            unchanged to ``PyPDF2.PdfReader`` and, on failure, to
            ``pdfplumber.open``.

    Returns:
        The text of all pages joined with newlines. Pages for which the
        extractor returns nothing contribute an empty string.

    Raises:
        Exception: Whatever pdfplumber raises if the fallback also fails.
    """
    try:
        # First try with PyPDF2.
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        # Deliberately broad: any PyPDF2 failure should trigger the fallback.
        print(f"PyPDF2 failed: {e}")
        # PyPDF2 may have consumed part of a stream before failing; rewind so
        # pdfplumber starts from the beginning of the document.
        if hasattr(pdf_file, "seek"):
            pdf_file.seek(0)
        # Rebuild the page list from scratch so pages PyPDF2 already read
        # before failing are not duplicated in the result.
        with pdfplumber.open(pdf_file) as pdf:
            page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last word of one page and the first word of
    # the next are not fused together.
    return "\n".join(page_texts)


def chunk_text(text: str, max_chunk_size: int = 1024) -> Iterator[str]:
    """Split text into consecutive chunks of at most ``max_chunk_size`` words.

    Words are whitespace-separated tokens from ``str.split``; this is a word
    count, not a model-token count.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of words per chunk; must be positive.

    Yields:
        Each chunk as a single space-joined string. Nothing is yielded for
        empty or whitespace-only text.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive (raised when the
            generator is first iterated).
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    words = text.split()
    for start in range(0, len(words), max_chunk_size):
        yield " ".join(words[start:start + max_chunk_size])


def summarize_pdf(pdf_file) -> str:
    """Extract text from a PDF, chunk it, and summarize each chunk.

    Args:
        pdf_file: The value supplied by the Gradio file input, or ``None``
            when nothing was uploaded.

    Returns:
        The per-chunk summaries joined by blank lines, or a user-facing
        message starting with "❌" when no file was given, no text could be
        extracted, or any step raised an exception.
    """
    if pdf_file is None:
        return "❌ No file was uploaded. Please upload a PDF document."

    try:
        # Extract text from the PDF.
        text = extract_text_from_pdf(pdf_file)
        if not text.strip():
            return "❌ Could not extract any text from the PDF. Please upload a readable document."

        summaries = []
        for chunk in chunk_text(text, _WORDS_PER_CHUNK):
            # truncation=True makes the pipeline cut an over-long chunk down
            # to the model's input limit instead of failing on it.
            result = summarizer(
                chunk,
                max_length=200,
                min_length=50,
                do_sample=False,
                truncation=True,
            )
            summaries.append(result[0]["summary_text"])

        # Combine all summaries into one.
        return "\n\n".join(summaries)
    except Exception as e:
        # Top-level UI boundary: report the failure to the user rather than
        # letting the request crash.
        return f"❌ An error occurred: {e}"


# Gradio Interface
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Summary"),
    title="PDF Summarizer",
    description="Upload a PDF file to extract and summarize its content using state-of-the-art AI.",
)

if __name__ == "__main__":
    # Only start the web server when run as a script, not on import.
    interface.launch()