Spaces:

cstr
/

PDF-Summarizer

Running

App Files Files Community

cstr committed on Dec 6, 2024

Commit

9d8df86

verified ·

1 Parent(s): 19b0276

Update app.py

Browse files

Files changed (1) hide show

app.py +322 -63

app.py CHANGED Viewed

@@ -1,75 +1,334 @@
 import gradio as gr
-from functions import extract_text_from_pdf, format_content, split_into_snippets, build_prompts
-def process_inputs(pdf_file, model_choice, output_format, oauth_token: gr.OAuthToken | None = None):
-    """Process PDF and generate summary"""
-    if oauth_token is None:
-        return "### Please log in to use this service"
-    if not pdf_file:
-        return "### Please upload a PDF file"
     try:
-        text = extract_text_from_pdf(pdf_file.name)
-        return f"### Processing successful with {model_choice}!"
     except Exception as e:
-        return f"### Error: {str(e)}"
-# Define core interface components
-iface = gr.Interface(
-    fn=process_inputs,
-    inputs=[
-        gr.File(
-            label="Upload PDF",
-            file_types=[".pdf"]
-        ),
-        gr.Dropdown(
-            choices=[
-                "GPT-3.5",
-                "GPT-4",
-                "Claude-3",
-                "Mistral"
-            ],
-            label="Model",
-            value="GPT-3.5"
-        ),
-        gr.Radio(
-            choices=["TXT", "MD", "HTML"],
-            label="Format",
-            value="TXT"
         )
-    ],
-    outputs=gr.Markdown(
-        label="Output",
-        value="### Upload your PDF to begin"
-    ),
-    flagging_mode="never",
-    css="""
-        .gradio-container {
-            max-width: 800px !important;
-            margin: 0 auto !important;
-        }
-        .container {
-            max-width: 800px !important;
-            margin: 0 auto !important;
-            padding: 2rem !important;
-        }
-    """
-)
-# Create main app
 with gr.Blocks(theme=gr.themes.Default()) as demo:
-    gr.Markdown("## 🚀 PDF to LLM Summarizer")
     with gr.Row():
-        with gr.Column():
-            gr.Markdown("📄 Extract and summarize text from PDFs using state-of-the-art language models")
-        with gr.Column():
-            gr.LoginButton(min_width=200)
-    iface.render()
-    gr.Markdown("Made with Gradio")
 if __name__ == "__main__":
-    demo.launch()

+import os
+import re
+import tempfile
+import requests
 import gradio as gr
+from PyPDF2 import PdfReader
+import openai
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# Initialize Hugging Face models
+HUGGINGFACE_MODELS = {
+    "Phi-3 Mini 128k Instruct by EswardiVI": "eswardivi/Phi-3-mini-128k-instruct",
+    "Phi-3 Mini 128k Instruct by TaufiqDP": "taufiqdp/phi-3-mini-128k-instruct"
+}
+# Utility Functions
+def extract_text_from_pdf(pdf_path):
+    """Return the concatenated text of every page of a PDF.
+
+    ``pdf_path`` is handed straight to ``PdfReader``.  Each page's text is
+    followed by a newline; pages without extractable text are logged and
+    skipped.  Failures are not raised: they come back in-band as a string
+    that starts with ``"Error"``.
+    """
+    try:
+        collected = []
+        document = PdfReader(pdf_path)
+        for page_num, page in enumerate(document.pages, start=1):
+            page_text = page.extract_text()
+            if not page_text:
+                logging.warning(f"No text found on page {page_num}.")
+                continue
+            collected.append(page_text + "\n")
+        text = "".join(collected)
+        if text.strip():
+            return text
+        return "Error: No extractable text found in the PDF."
+    except Exception as e:
+        logging.error(f"Error reading PDF file: {e}")
+        return f"Error reading PDF file: {e}"
+def format_content(text, format_type):
+    """Format extracted text according to the requested output format.
+
+    Args:
+        text: The extracted text.
+        format_type: One of ``'txt'``, ``'md'`` or ``'html'``.
+
+    Returns:
+        For ``'txt'`` and ``'md'`` the text unchanged; for ``'html'`` one
+        ``<p>`` element per non-blank, blank-line-separated paragraph with
+        the paragraph text HTML-escaped.  Any other ``format_type`` is
+        logged and reported in-band as ``"Unsupported format: ..."``.
+    """
+    # Local import keeps this block self-contained.
+    from html import escape
+
+    if format_type in ('txt', 'md'):
+        # Markdown needs no transformation: paragraphs are already separated
+        # by blank lines in the extracted text.
+        return text
+    if format_type == 'html':
+        paragraphs = (para.strip() for para in text.split('\n\n'))
+        # Escape markup characters so text such as "a < b" or "R&D" cannot
+        # break (or inject into) the generated HTML.
+        return ''.join(
+            f'<p>{escape(para, quote=False)}</p>' for para in paragraphs if para
+        )
+    logging.error(f"Unsupported format: {format_type}")
+    return f"Unsupported format: {format_type}"
+def split_into_snippets(text, context_size):
+    """Split text into snippets of at most ``context_size`` characters.
+
+    The text is cut at sentence boundaries (``.``, ``!`` or ``?`` followed
+    by one or more spaces) and sentences are packed greedily into each
+    snippet.  A single sentence longer than ``context_size`` is hard-split
+    into fixed-size pieces so that no snippet exceeds the limit.
+
+    Args:
+        text: The text to split.
+        context_size: Maximum snippet length in characters.  Non-integer
+            numbers are truncated towards zero.
+
+    Returns:
+        A list of non-empty, whitespace-stripped snippets.
+
+    Raises:
+        ValueError: If ``context_size`` is smaller than 1.
+    """
+    limit = int(context_size)
+    if limit < 1:
+        raise ValueError(f"context_size must be at least 1, got {context_size!r}")
+    sentences = re.split(r'(?<=[.!?]) +', text)
+    snippets = []
+    current_snippet = ""
+    for sentence in sentences:
+        if len(current_snippet) + len(sentence) + 1 > limit:
+            # The sentence does not fit: flush what has been collected so
+            # far (never emitting an empty snippet) and start afresh.
+            if current_snippet.strip():
+                snippets.append(current_snippet.strip())
+            current_snippet = ""
+            # A sentence that cannot fit even on its own is cut into
+            # limit-sized pieces; the remainder seeds the next snippet.
+            while len(sentence) + 1 > limit:
+                piece, sentence = sentence[:limit], sentence[limit:]
+                if piece.strip():
+                    snippets.append(piece.strip())
+            if sentence:
+                current_snippet = sentence + " "
+        else:
+            current_snippet += sentence + " "
+    if current_snippet.strip():
+        snippets.append(current_snippet.strip())
+    return snippets
+def build_prompts(snippets, prompt_instruction, custom_prompt):
+    """Wrap each snippet in a numbered, delimited prompt.
+
+    ``custom_prompt`` is used as the instruction when it is non-empty,
+    otherwise ``prompt_instruction``.  Returns one framed prompt string per
+    snippet, numbered from 1.
+    """
+    instruction = custom_prompt or prompt_instruction
+    return [
+        f"---\nPart {part_no} of {len(snippets)}:\n{instruction}\n\n{body}\n\nEnd of Part {part_no}.\n---"
+        for part_no, body in enumerate(snippets, start=1)
+    ]
+def send_to_huggingface(prompt, model_name):
+    """Send a prompt to a Hugging Face inference endpoint.
+
+    Args:
+        prompt: The text sent as the ``inputs`` field of the JSON payload.
+        model_name: Model repository id appended to the inference URL.
+
+    Returns:
+        The ``generated_text`` of the first result, or an in-band error
+        string starting with ``"Error from Hugging Face model"`` or
+        ``"Error interacting with Hugging Face model"``.  Nothing is raised.
+    """
+    # NOTE(review): no Authorization header is sent with the request.
+    try:
+        response = requests.post(
+            f"https://api-inference.huggingface.co/models/{model_name}",
+            json={"inputs": prompt},
+            # Without a timeout a stalled endpoint blocks the caller forever.
+            timeout=120,
+        )
+        if response.status_code == 200:
+            data = response.json()
+            # The success payload is expected to be a list of result dicts;
+            # tolerate an empty list or a bare dict instead of raising.
+            first = data[0] if isinstance(data, list) and data else data
+            if isinstance(first, dict):
+                return first.get('generated_text', 'No generated text found.')
+            return 'No generated text found.'
+        try:
+            error_message = response.json().get('error', 'Unknown error occurred.')
+        except (ValueError, AttributeError):
+            # Error body was not a JSON object (e.g. an HTML gateway page).
+            error_message = f"HTTP {response.status_code}"
+        logging.error(f"Error from Hugging Face model: {error_message}")
+        return f"Error from Hugging Face model: {error_message}"
+    except Exception as e:
+        logging.error(f"Error interacting with Hugging Face model: {e}")
+        return f"Error interacting with Hugging Face model: {e}"
+def authenticate_openai(api_key):
+    """Check an OpenAI API key by listing the available models.
+
+    Args:
+        api_key: The key entered by the user; may be empty or ``None``.
+
+    Returns:
+        A human-readable status string: success, the error reported by the
+        OpenAI client, or a notice that no key was provided.
+    """
+    # Keys pasted into a text box often carry stray spaces or a newline;
+    # a whitespace-only value counts as "no key".
+    if isinstance(api_key, str):
+        api_key = api_key.strip()
+    if not api_key:
+        return "No OpenAI API key provided."
+    try:
+        # NOTE(review): this sets the key on the shared ``openai`` module, so
+        # it is visible to every request handled by this process.
+        openai.api_key = api_key
+        # NOTE(review): ``openai.Model`` looks like the pre-1.0 SDK
+        # interface - confirm the pinned ``openai`` version provides it.
+        openai.Model.list()
+        return "OpenAI Authentication Successful!"
+    except Exception as e:
+        logging.error(f"OpenAI API Key Error: {e}")
+        return f"OpenAI API Key Error: {e}"
+# Main Interface
 with gr.Blocks(theme=gr.themes.Default()) as demo:
+    # Header
+    gr.Markdown("# 📄 Smart PDF Summarizer")
+    gr.Markdown("Upload a PDF document and get AI-powered summaries using OpenAI or Hugging Face models.")
+    # Authentication Section
     with gr.Row():
+        with gr.Column(scale=1):
+            openai_api_key = gr.Textbox(
+                label="🔑 OpenAI API Key",
+                type="password",
+                placeholder="Enter your OpenAI API key (optional)"
+            )
+            auth_status = gr.Textbox(
+                label="Authentication Status",
+                interactive=False
+            )
+            auth_button = gr.Button("🔓 Authenticate", variant="primary")
+    # Main Content
+    with gr.Row():
+        # Left Column - Input Options
+        with gr.Column(scale=1):
+            pdf_input = gr.File(
+                label="📁 Upload PDF",
+                file_types=[".pdf"]
+            )
+            with gr.Row():
+                format_type = gr.Radio(
+                    choices=["txt", "md", "html"],
+                    value="txt",
+                    label="📝 Output Format"
+                )
+            context_size = gr.Slider(
+                minimum=4000,
+                maximum=128000,
+                step=4000,
+                value=32000,
+                label="📏 Context Window Size"
+            )
+            snippet_number = gr.Number(
+                label="🔢 Snippet Number (Optional)",
+                value=None,
+                precision=0
+            )
+            custom_prompt = gr.Textbox(
+                label="✍️ Custom Prompt",
+                placeholder="Enter your custom prompt here...",
+                lines=2
+            )
+            model_choice = gr.Radio(
+                choices=["OpenAI ChatGPT", "Hugging Face Model"],
+                value="OpenAI ChatGPT",
+                label="🤖 Model Selection"
+            )
+            hf_model = gr.Dropdown(
+                choices=list(HUGGINGFACE_MODELS.keys()),
+                label="🔧 Hugging Face Model",
+                visible=False
+            )
+        # Right Column - Output
+        with gr.Column(scale=1):
+            with gr.Row():
+                process_button = gr.Button("🚀 Process PDF", variant="primary")
+            progress_status = gr.Textbox(
+                label="📊 Progress",
+                interactive=False
+            )
+            generated_prompt = gr.Textbox(
+                label="📋 Generated Prompt",
+                lines=10
+            )
+            summary_output = gr.Textbox(
+                label="📝 Summary",
+                lines=15
+            )
+            with gr.Row():
+                download_prompt = gr.File(
+                    label="📥 Download Prompt"
+                )
+                download_summary = gr.File(
+                    label="📥 Download Summary"
+                )
+    # Event Handlers
+    def toggle_hf_model(choice):
+        """Show the Hugging Face model dropdown only when that backend is selected."""
+        wants_hf = choice == "Hugging Face Model"
+        return gr.update(visible=wants_hf)
+    def handle_authentication(api_key):
+        """Click handler: validate the entered key and return the status text."""
+        status_message = authenticate_openai(api_key)
+        return status_message
+    def process_pdf(pdf, fmt, ctx_size, snippet_num, prompt, model_selection, hf_model_choice, api_key):
+        """Run the extract -> format -> split -> prompt -> summarize pipeline.
+
+        Returns:
+            A 5-tuple matching the click handler's outputs: (status message,
+            generated prompt, summary, prompt file path, summary file path).
+            On failure the status message carries the error and both file
+            paths are ``None``.
+        """
+        try:
+            if not pdf:
+                return "Please upload a PDF file.", "", "", None, None
+            # Extract text (errors come back in-band as "Error..." strings)
+            text = extract_text_from_pdf(pdf.name)
+            if text.startswith("Error"):
+                return text, "", "", None, None
+            # Format content
+            formatted_text = format_content(text, fmt)
+            # Split into snippets
+            snippets = split_into_snippets(formatted_text, ctx_size)
+            # Process specific snippet or all
+            if snippet_num is not None:
+                # The UI may hand over a float; list indices must be ints.
+                snippet_index = int(snippet_num)
+                if 1 <= snippet_index <= len(snippets):
+                    selected_snippets = [snippets[snippet_index - 1]]
+                else:
+                    return f"Invalid snippet number. Please choose between 1 and {len(snippets)}.", "", "", None, None
+            else:
+                selected_snippets = snippets
+            # Build prompts
+            default_prompt = "Summarize the following text:"
+            prompts = build_prompts(selected_snippets, default_prompt, prompt)
+            full_prompt = "\n".join(prompts)
+            # Generate summary
+            if model_selection == "OpenAI ChatGPT":
+                if not api_key:
+                    return "OpenAI API key required.", full_prompt, "", None, None
+                try:
+                    openai.api_key = api_key
+                    response = openai.ChatCompletion.create(
+                        model="gpt-3.5-turbo",
+                        messages=[{"role": "user", "content": full_prompt}]
+                    )
+                    # Guard against a missing content field so the file
+                    # write below never receives None.
+                    summary = response.choices[0].message.content or ""
+                except Exception as e:
+                    return f"OpenAI API error: {str(e)}", full_prompt, "", None, None
+            else:
+                # The dropdown has no default value, so nothing may be
+                # selected; report that instead of failing on a KeyError.
+                model_id = HUGGINGFACE_MODELS.get(hf_model_choice)
+                if model_id is None:
+                    return "Please select a Hugging Face model.", full_prompt, "", None, None
+                summary = send_to_huggingface(full_prompt, model_id)
+                # send_to_huggingface reports failures in-band; surface them
+                # as a failed run rather than as a successful summary.
+                if summary.startswith((
+                    "Error from Hugging Face model",
+                    "Error interacting with Hugging Face model",
+                )):
+                    return summary, full_prompt, "", None, None
+            # Save files for download; explicit UTF-8 so non-ASCII text does
+            # not depend on the platform's default encoding.
+            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as prompt_file:
+                prompt_file.write(full_prompt)
+                prompt_path = prompt_file.name
+            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt', encoding='utf-8') as summary_file:
+                summary_file.write(summary)
+                summary_path = summary_file.name
+            return "Processing complete!", full_prompt, summary, prompt_path, summary_path
+        except Exception as e:
+            logging.error(f"Error processing PDF: {e}")
+            return f"Error processing PDF: {str(e)}", "", "", None, None
+    # Connect event handlers
+    model_choice.change(
+        toggle_hf_model,
+        inputs=[model_choice],
+        outputs=[hf_model]
+    )
+    auth_button.click(
+        handle_authentication,
+        inputs=[openai_api_key],
+        outputs=[auth_status]
+    )
+    process_button.click(
+        process_pdf,
+        inputs=[
+            pdf_input,
+            format_type,
+            context_size,
+            snippet_number,
+            custom_prompt,
+            model_choice,
+            hf_model,
+            openai_api_key
+        ],
+        outputs=[
+            progress_status,
+            generated_prompt,
+            summary_output,
+            download_prompt,
+            download_summary
+        ]
+    )
+    # Instructions
+    gr.Markdown("""
+    ### 📌 Instructions:
+    1. (Optional) Enter your OpenAI API key and authenticate
+    2. Upload a PDF document
+    3. Choose output format and context window size
+    4. Optionally specify a snippet number or custom prompt
+    5. Select between OpenAI ChatGPT or Hugging Face model
+    6. Click 'Process PDF' to generate summary
+    7. Download the generated prompt and summary as needed
+    ### ⚙️ Features:
+    - Support for multiple PDF formats
+    - Flexible text formatting options
+    - Custom prompt creation
+    - Multiple AI model options
+    - Snippet-based processing
+    - Downloadable outputs
+    """)
+# Launch the interface
 if __name__ == "__main__":
+    demo.launch(share=False, debug=True)