Spaces:

cstr
/

PDF-Summarizer

Running

App Files Files Community

PDF-Summarizer / app.py

cstr

Update app.py

9d8df86 verified 8 months ago

raw

history blame

11.7 kB

	import os
	import re
	import tempfile
	import requests
	import gradio as gr
	from PyPDF2 import PdfReader
	import openai
	import logging

	# Configure logging once at import time: INFO and above, each record
	# prefixed with its timestamp and level name.
	logging.basicConfig(
	    format='%(asctime)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)

	# Label offered in the model dropdown -> model id used in the inference URL.
	HUGGINGFACE_MODELS = {
	    "Phi-3 Mini 128k Instruct by EswardiVI": "eswardivi/Phi-3-mini-128k-instruct",
	    "Phi-3 Mini 128k Instruct by TaufiqDP": "taufiqdp/phi-3-mini-128k-instruct",
	}

	# Utility Functions
	def extract_text_from_pdf(pdf_path):
	    """Read every page of a PDF and return the combined text.

	    Each page that yields text contributes that text followed by a newline;
	    pages that yield nothing are logged as warnings and skipped.

	    Args:
	        pdf_path: Path handed to ``PdfReader``.

	    Returns:
	        The concatenated page text. On failure no exception is propagated;
	        instead a message starting with ``"Error"`` is returned (either the
	        "no extractable text" message or the text of the caught exception).
	    """
	    try:
	        page_chunks = []
	        for page_num, page in enumerate(PdfReader(pdf_path).pages, start=1):
	            page_text = page.extract_text()
	            if not page_text:
	                logging.warning(f"No text found on page {page_num}.")
	                continue
	            page_chunks.append(page_text + "\n")

	        text = "".join(page_chunks)
	        if text.strip():
	            return text
	        return "Error: No extractable text found in the PDF."
	    except Exception as e:
	        logging.error(f"Error reading PDF file: {e}")
	        return f"Error reading PDF file: {e}"

	def format_content(text, format_type):
	    """Format extracted text according to the requested output format.

	    Args:
	        text: Plain text extracted from the document.
	        format_type: ``'txt'``, ``'md'`` or ``'html'``.

	    Returns:
	        * ``'txt'`` and ``'md'``: the text unchanged. Paragraphs separated by
	          blank lines already form Markdown paragraphs.
	        * ``'html'``: every non-empty, blank-line-separated paragraph is
	          stripped, HTML-escaped and wrapped in ``<p>...</p>``.
	        * anything else: an ``"Unsupported format: ..."`` message, which is
	          also logged as an error.
	    """
	    # Function-scope import of a stdlib helper, so the block is
	    # self-contained and the file's import section needs no change.
	    from html import escape

	    if format_type in ('txt', 'md'):
	        # Splitting on blank lines and re-joining with blank lines is an
	        # identity operation, so 'md' returns the text as-is as well.
	        return text

	    if format_type == 'html':
	        paragraphs = (para.strip() for para in text.split('\n\n'))
	        # Escape &, < and > so document text cannot break the markup.
	        return ''.join(
	            f'<p>{escape(para, quote=False)}</p>'
	            for para in paragraphs
	            if para
	        )

	    logging.error(f"Unsupported format: {format_type}")
	    return f"Unsupported format: {format_type}"

	def split_into_snippets(text, context_size):
	    """Split text into snippets of at most ``context_size`` characters.

	    The text is cut into sentences after ``.``, ``!`` or ``?`` followed by
	    spaces. Sentences are packed greedily into a snippet until the next one
	    would not fit. A single sentence that is too long for one snippet is cut
	    into ``context_size``-sized pieces, so no snippet exceeds the limit.

	    Args:
	        text: The text to split.
	        context_size: Maximum snippet length in characters. Floats (for
	            example from a slider) are truncated to an integer.

	    Returns:
	        A list of non-empty, whitespace-stripped snippets. Empty or
	        whitespace-only input gives an empty list.

	    Raises:
	        ValueError: If ``context_size`` is smaller than 1.
	    """
	    limit = int(context_size)
	    if limit < 1:
	        raise ValueError("context_size must be a positive number of characters")

	    snippets = []
	    current_snippet = ""

	    for sentence in re.split(r'(?<=[.!?]) +', text):
	        # The "+ 1" accounts for the separating space added after a sentence.
	        if len(current_snippet) + len(sentence) + 1 <= limit:
	            current_snippet += sentence + " "
	            continue

	        # The sentence does not fit: close the snippet built so far.
	        finished = current_snippet.strip()
	        if finished:
	            snippets.append(finished)
	        current_snippet = ""

	        if len(sentence) + 1 <= limit:
	            current_snippet = sentence + " "
	            continue

	        # A sentence too long for any snippet is cut into fixed-size pieces
	        # rather than emitted whole, which would break the size limit.
	        oversized = sentence.strip()
	        for start in range(0, len(oversized), limit):
	            piece = oversized[start:start + limit].strip()
	            if piece:
	                snippets.append(piece)

	    remainder = current_snippet.strip()
	    if remainder:
	        snippets.append(remainder)

	    return snippets

	def build_prompts(snippets, prompt_instruction, custom_prompt):
	    """Wrap each snippet in a numbered "Part i of n" prompt frame.

	    Args:
	        snippets: Text snippets, in order.
	        prompt_instruction: Default instruction placed above each snippet.
	        custom_prompt: When non-empty, used instead of ``prompt_instruction``.

	    Returns:
	        One framed prompt string per snippet, in the same order.
	    """
	    # A non-empty custom prompt takes precedence for every part.
	    current_prompt = custom_prompt or prompt_instruction
	    return [
	        f"---\nPart {idx} of {len(snippets)}:\n{current_prompt}\n\n{snippet}\n\nEnd of Part {idx}.\n---"
	        for idx, snippet in enumerate(snippets, start=1)
	    ]

	def send_to_huggingface(prompt, model_name, timeout=120):
	    """Send a prompt to the Hugging Face Inference API and return its text.

	    Args:
	        prompt: Text sent as the ``inputs`` field of the JSON payload.
	        model_name: Model id appended to the inference API URL.
	        timeout: Seconds to wait for the HTTP request before giving up.
	            Without a timeout a stalled connection would block forever.

	    Returns:
	        The ``generated_text`` of the first result on HTTP 200, or
	        ``'No generated text found.'`` when the response carries none.
	        Failures never raise: a message starting with
	        ``"Error from Hugging Face model:"`` (non-200 response) or
	        ``"Error interacting with Hugging Face model:"`` (request failed)
	        is returned instead.
	    """
	    try:
	        response = requests.post(
	            f"https://api-inference.huggingface.co/models/{model_name}",
	            json={"inputs": prompt},
	            timeout=timeout,
	        )

	        if response.status_code == 200:
	            result = response.json()
	            # The expected payload is a list whose first item holds
	            # 'generated_text'; tolerate an empty list or a bare dict.
	            if isinstance(result, list):
	                result = result[0] if result else {}
	            if isinstance(result, dict):
	                return result.get('generated_text', 'No generated text found.')
	            return 'No generated text found.'

	        try:
	            error_message = response.json().get('error', 'Unknown error occurred.')
	        except (ValueError, AttributeError):
	            # The error body is not a JSON object; report the status code.
	            error_message = f"HTTP {response.status_code}"
	        logging.error(f"Error from Hugging Face model: {error_message}")
	        return f"Error from Hugging Face model: {error_message}"
	    except Exception as e:
	        logging.error(f"Error interacting with Hugging Face model: {e}")
	        return f"Error interacting with Hugging Face model: {e}"

	def authenticate_openai(api_key):
	    """Check an OpenAI API key by storing it and listing the models.

	    Args:
	        api_key: The key to verify; may be empty or ``None``.

	    Returns:
	        A status message: success, the text of the error raised by the
	        verification call, or a notice that no key was supplied.
	    """
	    if not api_key:
	        return "No OpenAI API key provided."

	    try:
	        openai.api_key = api_key
	        # Any request that needs valid credentials would do; listing models
	        # is used as the probe.
	        openai.Model.list()
	    except Exception as e:
	        logging.error(f"OpenAI API Key Error: {e}")
	        return f"OpenAI API Key Error: {e}"
	    return "OpenAI Authentication Successful!"

	# Main Interface
	# Declare the whole UI inside a gr.Blocks context bound to `demo`, which is
	# launched at the bottom of the file.
	# NOTE(review): indentation has been flattened in this copy of the file; the
	# nesting of the `with` blocks below must be restored before it can run.
	with gr.Blocks(theme=gr.themes.Default()) as demo:
	# Header
	gr.Markdown("# 📄 Smart PDF Summarizer")
	gr.Markdown("Upload a PDF document and get AI-powered summaries using OpenAI or Hugging Face models.")

	# Authentication Section
	with gr.Row():
	with gr.Column(scale=1):
	# Key is read by both the Authenticate and the Process PDF click handlers.
	openai_api_key = gr.Textbox(
	label="🔑 OpenAI API Key",
	type="password",
	placeholder="Enter your OpenAI API key (optional)"
	)
	# Read-only field that receives the message returned by handle_authentication.
	auth_status = gr.Textbox(
	label="Authentication Status",
	interactive=False
	)
	auth_button = gr.Button("🔓 Authenticate", variant="primary")

	# Main Content
	with gr.Row():
	# Left Column - Input Options
	with gr.Column(scale=1):
	pdf_input = gr.File(
	label="📁 Upload PDF",
	file_types=[".pdf"]
	)

	with gr.Row():
	# Choices match the format names handled by format_content.
	format_type = gr.Radio(
	choices=["txt", "md", "html"],
	value="txt",
	label="📝 Output Format"
	)

	# Passed to split_into_snippets, which compares it against character
	# counts (len) of the accumulated text.
	context_size = gr.Slider(
	minimum=4000,
	maximum=128000,
	step=4000,
	value=32000,
	label="📏 Context Window Size"
	)

	# None means process_pdf uses every snippet; otherwise it is treated as a
	# 1-based snippet index.
	snippet_number = gr.Number(
	label="🔢 Snippet Number (Optional)",
	value=None,
	precision=0
	)

	# When non-empty, replaces the default "Summarize the following text:"
	# instruction in build_prompts.
	custom_prompt = gr.Textbox(
	label="✍️ Custom Prompt",
	placeholder="Enter your custom prompt here...",
	lines=2
	)

	model_choice = gr.Radio(
	choices=["OpenAI ChatGPT", "Hugging Face Model"],
	value="OpenAI ChatGPT",
	label="🤖 Model Selection"
	)

	# Hidden initially; toggle_hf_model makes it visible when
	# "Hugging Face Model" is selected.
	# NOTE(review): no initial value is set, so process_pdf may receive None
	# for this input if the user never picks a model — confirm.
	hf_model = gr.Dropdown(
	choices=list(HUGGINGFACE_MODELS.keys()),
	label="🔧 Hugging Face Model",
	visible=False
	)

	# Right Column - Output
	with gr.Column(scale=1):
	with gr.Row():
	process_button = gr.Button("🚀 Process PDF", variant="primary")

	# The five components below receive, in order, the five values returned
	# by process_pdf.
	progress_status = gr.Textbox(
	label="📊 Progress",
	interactive=False
	)

	generated_prompt = gr.Textbox(
	label="📋 Generated Prompt",
	lines=10
	)

	summary_output = gr.Textbox(
	label="📝 Summary",
	lines=15
	)

	with gr.Row():
	download_prompt = gr.File(
	label="📥 Download Prompt"
	)
	download_summary = gr.File(
	label="📥 Download Summary"
	)

	# Event Handlers
	def toggle_hf_model(choice):
	    """Return an update that shows the Hugging Face dropdown only when that backend is selected."""
	    show_dropdown = (choice == "Hugging Face Model")
	    return gr.update(visible=show_dropdown)

	def handle_authentication(api_key):
	    """Click handler for the Authenticate button; returns the status text from authenticate_openai."""
	    status_message = authenticate_openai(api_key)
	    return status_message

	def process_pdf(pdf, fmt, ctx_size, snippet_num, prompt, model_selection, hf_model_choice, api_key):
	    """Extract, split, prompt and summarize an uploaded PDF.

	    Args:
	        pdf: The uploaded file, either a path string or an object whose
	            ``name`` attribute is the path.
	        fmt: Output format passed to ``format_content``.
	        ctx_size: Snippet size limit passed to ``split_into_snippets``.
	        snippet_num: 1-based snippet to process, or ``None`` for all.
	        prompt: Custom instruction; empty means the default one is used.
	        model_selection: ``"OpenAI ChatGPT"`` selects OpenAI; any other
	            value selects the Hugging Face backend.
	        hf_model_choice: Key into ``HUGGINGFACE_MODELS``.
	        api_key: OpenAI API key (required for the OpenAI backend only).

	    Returns:
	        A 5-tuple ``(status, prompt_text, summary, prompt_path,
	        summary_path)``. On any failure ``summary`` is ``""`` and both
	        paths are ``None``; ``status`` carries the error message.
	    """
	    def _failure(message, prompt_text=""):
	        # Common shape of every unsuccessful result.
	        return message, prompt_text, "", None, None

	    try:
	        if not pdf:
	            return _failure("Please upload a PDF file.")

	        # Depending on the Gradio version/configuration the upload arrives
	        # as a plain path string or as a file wrapper exposing `.name`.
	        pdf_path = pdf if isinstance(pdf, str) else pdf.name

	        # Extract text. Failures come back as one of two known messages;
	        # match those specifically so a document whose text merely starts
	        # with "Error" is not rejected.
	        text = extract_text_from_pdf(pdf_path)
	        if (text == "Error: No extractable text found in the PDF."
	                or text.startswith("Error reading PDF file:")):
	            return _failure(text)

	        formatted_text = format_content(text, fmt)
	        snippets = split_into_snippets(formatted_text, ctx_size)

	        # Process one specific snippet or all of them.
	        if snippet_num is not None:
	            # The number widget may deliver a float; list indices need int.
	            snippet_index = int(snippet_num)
	            if not 1 <= snippet_index <= len(snippets):
	                return _failure(
	                    f"Invalid snippet number. Please choose between 1 and {len(snippets)}."
	                )
	            selected_snippets = [snippets[snippet_index - 1]]
	        else:
	            selected_snippets = snippets

	        default_prompt = "Summarize the following text:"
	        prompts = build_prompts(selected_snippets, default_prompt, prompt)
	        # NOTE(review): all parts are joined and sent as ONE request, so the
	        # combined prompt can exceed ctx_size when several snippets are
	        # selected — confirm whether per-part requests are intended.
	        full_prompt = "\n".join(prompts)

	        # Generate the summary with the selected backend.
	        if model_selection == "OpenAI ChatGPT":
	            if not api_key:
	                return _failure("OpenAI API key required.", full_prompt)
	            try:
	                openai.api_key = api_key
	                response = openai.ChatCompletion.create(
	                    model="gpt-3.5-turbo",
	                    messages=[{"role": "user", "content": full_prompt}]
	                )
	                summary = response.choices[0].message.content
	            except Exception as e:
	                logging.error(f"OpenAI API error: {e}")
	                return _failure(f"OpenAI API error: {str(e)}", full_prompt)
	        else:
	            # The dropdown has no initial value, so nothing may be selected.
	            if hf_model_choice not in HUGGINGFACE_MODELS:
	                return _failure("Please select a Hugging Face model.", full_prompt)
	            summary = send_to_huggingface(full_prompt, HUGGINGFACE_MODELS[hf_model_choice])
	            # send_to_huggingface reports failures as prefixed strings;
	            # surface them as errors, as is done for OpenAI failures.
	            if summary.startswith((
	                "Error from Hugging Face model:",
	                "Error interacting with Hugging Face model:",
	            )):
	                return _failure(summary, full_prompt)

	        # Save files for download. UTF-8 is set explicitly so non-ASCII
	        # document text does not depend on the platform default encoding.
	        saved_paths = []
	        for content in (full_prompt, summary):
	            with tempfile.NamedTemporaryFile(
	                delete=False, mode='w', suffix='.txt', encoding='utf-8'
	            ) as out_file:
	                out_file.write(content)
	                saved_paths.append(out_file.name)
	        prompt_path, summary_path = saved_paths

	        return "Processing complete!", full_prompt, summary, prompt_path, summary_path

	    except Exception as e:
	        logging.error(f"Error processing PDF: {e}")
	        return _failure(f"Error processing PDF: {str(e)}")

	# Connect event handlers
	# Show or hide the Hugging Face dropdown whenever the backend choice changes.
	model_choice.change(toggle_hf_model, inputs=[model_choice], outputs=[hf_model])

	# Check the entered OpenAI key and display the resulting status message.
	auth_button.click(handle_authentication, inputs=[openai_api_key], outputs=[auth_status])

	# Inputs are listed in process_pdf's parameter order; outputs are listed in
	# the order of the five values it returns.
	process_button.click(
	    process_pdf,
	    inputs=[
	        pdf_input, format_type, context_size, snippet_number,
	        custom_prompt, model_choice, hf_model, openai_api_key,
	    ],
	    outputs=[
	        progress_status, generated_prompt, summary_output,
	        download_prompt, download_summary,
	    ],
	)

	# Instructions: static help text (usage steps and a feature list) rendered
	# as Markdown. The text is shown to users verbatim.
	gr.Markdown("""
	### 📌 Instructions:
	1. (Optional) Enter your OpenAI API key and authenticate
	2. Upload a PDF document
	3. Choose output format and context window size
	4. Optionally specify a snippet number or custom prompt
	5. Select between OpenAI ChatGPT or Hugging Face model
	6. Click 'Process PDF' to generate summary
	7. Download the generated prompt and summary as needed

	### ⚙️ Features:
	- Support for multiple PDF formats
	- Flexible text formatting options
	- Custom prompt creation
	- Multiple AI model options
	- Snippet-based processing
	- Downloadable outputs
	""")

	# Launch the interface
	if __name__ == "__main__":
	    # Start the app only when run as a script: debug mode on, no share link.
	    demo.launch(debug=True, share=False)