cstr committed on
Commit
9d8df86
Β·
verified Β·
1 Parent(s): 19b0276

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +322 -63
app.py CHANGED
@@ -1,75 +1,334 @@
 
 
 
 
1
  import gradio as gr
2
- from functions import extract_text_from_pdf, format_content, split_into_snippets, build_prompts
 
 
3
 
4
- def process_inputs(pdf_file, model_choice, output_format, oauth_token: gr.OAuthToken | None = None):
5
- """Process PDF and generate summary"""
6
- if oauth_token is None:
7
- return "### Please log in to use this service"
8
-
9
- if not pdf_file:
10
- return "### Please upload a PDF file"
11
-
 
 
 
 
12
  try:
13
- text = extract_text_from_pdf(pdf_file.name)
14
- return f"### Processing successful with {model_choice}!"
 
 
 
 
 
 
 
 
 
15
  except Exception as e:
16
- return f"### Error: {str(e)}"
17
-
18
- # Define core interface components
19
- iface = gr.Interface(
20
- fn=process_inputs,
21
- inputs=[
22
- gr.File(
23
- label="Upload PDF",
24
- file_types=[".pdf"]
25
- ),
26
- gr.Dropdown(
27
- choices=[
28
- "GPT-3.5",
29
- "GPT-4",
30
- "Claude-3",
31
- "Mistral"
32
- ],
33
- label="Model",
34
- value="GPT-3.5"
35
- ),
36
- gr.Radio(
37
- choices=["TXT", "MD", "HTML"],
38
- label="Format",
39
- value="TXT"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  )
41
- ],
42
- outputs=gr.Markdown(
43
- label="Output",
44
- value="### Upload your PDF to begin"
45
- ),
46
- flagging_mode="never",
47
- css="""
48
- .gradio-container {
49
- max-width: 800px !important;
50
- margin: 0 auto !important;
51
- }
52
- .container {
53
- max-width: 800px !important;
54
- margin: 0 auto !important;
55
- padding: 2rem !important;
56
- }
57
- """
58
- )
59
-
60
- # Create main app
 
 
 
 
61
  with gr.Blocks(theme=gr.themes.Default()) as demo:
62
- gr.Markdown("## πŸš€ PDF to LLM Summarizer")
 
 
63
 
 
64
  with gr.Row():
65
- with gr.Column():
66
- gr.Markdown("πŸ“„ Extract and summarize text from PDFs using state-of-the-art language models")
67
- with gr.Column():
68
- gr.LoginButton(min_width=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- iface.render()
 
 
 
 
71
 
72
- gr.Markdown("Made with Gradio")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
 
74
  if __name__ == "__main__":
75
- demo.launch()
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+ import requests
5
  import gradio as gr
6
+ from PyPDF2 import PdfReader
7
+ import openai
8
+ import logging
9
 
10
# Set up logging: configure the root logger once at import time so every
# module-level log call shares the same timestamped format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Initialize Hugging Face models: display name -> Hub repo id. The keys feed
# the model dropdown in the UI; the values are passed to the Inference API.
HUGGINGFACE_MODELS = {
    "Phi-3 Mini 128k Instruct by EswardiVI": "eswardivi/Phi-3-mini-128k-instruct",
    "Phi-3 Mini 128k Instruct by TaufiqDP": "taufiqdp/phi-3-mini-128k-instruct"
}
18
+
19
+ # Utility Functions
20
def extract_text_from_pdf(pdf_path):
    """Extract the text content of the PDF at *pdf_path*.

    Returns the concatenated page text (a newline appended after each
    non-empty page), or an ``"Error..."`` string when the file cannot be
    read or contains no extractable text. Callers detect failure with
    ``str.startswith("Error")``.
    """
    try:
        chunks = []
        for page_num, page in enumerate(PdfReader(pdf_path).pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Image-only / scanned pages have no text layer to extract.
                logging.warning(f"No text found on page {page_num}.")
                continue
            chunks.append(page_text + "\n")
        text = "".join(chunks)
        if not text.strip():
            return "Error: No extractable text found in the PDF."
        return text
    except Exception as e:
        logging.error(f"Error reading PDF file: {e}")
        return f"Error reading PDF file: {e}"
37
+
38
def format_content(text, format_type):
    """Render *text* in the requested output format.

    *format_type* is one of ``'txt'``, ``'md'`` or ``'html'``; any other
    value logs an error and returns an "Unsupported format" message string.
    """
    if format_type == 'txt':
        return text
    if format_type == 'md':
        # Splitting and rejoining on the same separator leaves the text
        # unchanged; markdown output is currently a pass-through.
        return '\n\n'.join(text.split('\n\n'))
    if format_type == 'html':
        # One <p> element per non-empty paragraph; blank runs are dropped.
        rendered = []
        for para in text.split('\n\n'):
            stripped = para.strip()
            if stripped:
                rendered.append(f'<p>{stripped}</p>')
        return ''.join(rendered)
    logging.error(f"Unsupported format: {format_type}")
    return f"Unsupported format: {format_type}"
51
+
52
def split_into_snippets(text, context_size):
    """Chunk *text* into snippets that each fit within *context_size* chars.

    Sentences (split after ``.``, ``!`` or ``?`` followed by spaces) are
    accumulated greedily; a sentence that would overflow the current snippet
    starts a new one. A single sentence longer than *context_size* is
    emitted whole as its own snippet.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    snippets = []
    buffer = ""

    for sentence in sentences:
        fits = len(buffer) + len(sentence) + 1 <= context_size
        if fits:
            buffer += sentence + " "
        elif buffer:
            # Flush the filled snippet and start a fresh one with this sentence.
            snippets.append(buffer.strip())
            buffer = sentence + " "
        else:
            # Oversized lone sentence: emit it directly.
            snippets.append(sentence.strip())
            buffer = ""

    if buffer.strip():
        snippets.append(buffer.strip())

    return snippets
73
+
74
def build_prompts(snippets, prompt_instruction, custom_prompt):
    """Frame each snippet with an instruction header and part markers.

    *custom_prompt*, when non-empty, overrides *prompt_instruction* for
    every part. Returns one framed prompt string per snippet, numbered
    "Part i of n".
    """
    # The chosen instruction cannot change between snippets; hoist the
    # selection out of the loop instead of re-evaluating it per iteration.
    instruction = custom_prompt if custom_prompt else prompt_instruction
    total = len(snippets)
    return [
        f"---\nPart {idx} of {total}:\n{instruction}\n\n{snippet}\n\nEnd of Part {idx}.\n---"
        for idx, snippet in enumerate(snippets, start=1)
    ]
82
+
83
def send_to_huggingface(prompt, model_name):
    """Run *prompt* through the Hugging Face Inference API for *model_name*.

    Returns the generated text on success, or an ``"Error..."`` message
    string on any HTTP or connection failure; never raises.
    """
    try:
        payload = {"inputs": prompt}
        # BUG FIX: requests.post() without a timeout can block forever on a
        # stalled endpoint, freezing the whole Gradio request. Cap the wait.
        response = requests.post(
            f"https://api-inference.huggingface.co/models/{model_name}",
            json=payload,
            timeout=120
        )
        if response.status_code == 200:
            return response.json()[0].get('generated_text', 'No generated text found.')
        else:
            # A non-JSON error body raises here and falls through to the
            # generic handler below.
            error_info = response.json()
            error_message = error_info.get('error', 'Unknown error occurred.')
            logging.error(f"Error from Hugging Face model: {error_message}")
            return f"Error from Hugging Face model: {error_message}"
    except Exception as e:
        logging.error(f"Error interacting with Hugging Face model: {e}")
        return f"Error interacting with Hugging Face model: {e}"
101
+
102
def authenticate_openai(api_key):
    """Validate *api_key* against the OpenAI API and report the outcome.

    Returns a human-readable status string; never raises. A falsy key is
    rejected without any network call.
    """
    if not api_key:
        return "No OpenAI API key provided."
    try:
        openai.api_key = api_key
        # Cheap authenticated call (openai<1.0 style API, consistent with
        # ChatCompletion usage elsewhere in this file) to prove the key works.
        openai.Model.list()
        return "OpenAI Authentication Successful!"
    except Exception as e:
        logging.error(f"OpenAI API Key Error: {e}")
        return f"OpenAI API Key Error: {e}"
113
+
114
# Main Interface: declarative Gradio Blocks layout plus the event handlers
# wired to it. NOTE(review): the scrape stripped indentation; nesting below
# is reconstructed from the `with` structure — verify against the original.
with gr.Blocks(theme=gr.themes.Default()) as demo:
    # Header
    gr.Markdown("# πŸ“„ Smart PDF Summarizer")
    gr.Markdown("Upload a PDF document and get AI-powered summaries using OpenAI or Hugging Face models.")

    # Authentication Section
    with gr.Row():
        with gr.Column(scale=1):
            openai_api_key = gr.Textbox(
                label="πŸ”‘ OpenAI API Key",
                type="password",
                placeholder="Enter your OpenAI API key (optional)"
            )
            auth_status = gr.Textbox(
                label="Authentication Status",
                interactive=False
            )
            auth_button = gr.Button("πŸ”“ Authenticate", variant="primary")

    # Main Content
    with gr.Row():
        # Left Column - Input Options
        with gr.Column(scale=1):
            pdf_input = gr.File(
                label="πŸ“ Upload PDF",
                file_types=[".pdf"]
            )

            with gr.Row():
                format_type = gr.Radio(
                    choices=["txt", "md", "html"],
                    value="txt",
                    label="πŸ“ Output Format"
                )

            # Character budget used by split_into_snippets().
            context_size = gr.Slider(
                minimum=4000,
                maximum=128000,
                step=4000,
                value=32000,
                label="πŸ“ Context Window Size"
            )

            # None (the default) means "process all snippets".
            snippet_number = gr.Number(
                label="πŸ”’ Snippet Number (Optional)",
                value=None,
                precision=0
            )

            custom_prompt = gr.Textbox(
                label="✍️ Custom Prompt",
                placeholder="Enter your custom prompt here...",
                lines=2
            )

            model_choice = gr.Radio(
                choices=["OpenAI ChatGPT", "Hugging Face Model"],
                value="OpenAI ChatGPT",
                label="πŸ€– Model Selection"
            )

            # Hidden until "Hugging Face Model" is selected (see toggle_hf_model).
            hf_model = gr.Dropdown(
                choices=list(HUGGINGFACE_MODELS.keys()),
                label="πŸ”§ Hugging Face Model",
                visible=False
            )

        # Right Column - Output
        with gr.Column(scale=1):
            with gr.Row():
                process_button = gr.Button("πŸš€ Process PDF", variant="primary")

            progress_status = gr.Textbox(
                label="πŸ“Š Progress",
                interactive=False
            )

            generated_prompt = gr.Textbox(
                label="πŸ“‹ Generated Prompt",
                lines=10
            )

            summary_output = gr.Textbox(
                label="πŸ“ Summary",
                lines=15
            )

            with gr.Row():
                download_prompt = gr.File(
                    label="πŸ“₯ Download Prompt"
                )
                download_summary = gr.File(
                    label="πŸ“₯ Download Summary"
                )

    # Event Handlers
    def toggle_hf_model(choice):
        """Show the HF model dropdown only when the HF backend is selected."""
        return gr.update(visible=choice == "Hugging Face Model")

    def handle_authentication(api_key):
        """Thin click-handler wrapper around authenticate_openai()."""
        return authenticate_openai(api_key)

    def process_pdf(pdf, fmt, ctx_size, snippet_num, prompt, model_selection, hf_model_choice, api_key):
        """Run the full pipeline: extract, format, split, prompt, summarize.

        Returns a 5-tuple matching the click outputs:
        (status message, full prompt, summary, prompt file path, summary file path).
        Error paths return a message in the first slot and None file paths.
        """
        try:
            if not pdf:
                return "Please upload a PDF file.", "", "", None, None

            # Extract text
            text = extract_text_from_pdf(pdf.name)
            # extract_text_from_pdf signals failure via an "Error..." string.
            if text.startswith("Error"):
                return text, "", "", None, None

            # Format content
            formatted_text = format_content(text, fmt)

            # Split into snippets
            snippets = split_into_snippets(formatted_text, ctx_size)

            # Process specific snippet or all (snippet numbers are 1-based).
            if snippet_num is not None:
                if 1 <= snippet_num <= len(snippets):
                    selected_snippets = [snippets[snippet_num - 1]]
                else:
                    return f"Invalid snippet number. Please choose between 1 and {len(snippets)}.", "", "", None, None
            else:
                selected_snippets = snippets

            # Build prompts; a non-empty custom prompt overrides the default.
            default_prompt = "Summarize the following text:"
            prompts = build_prompts(selected_snippets, default_prompt, prompt)
            full_prompt = "\n".join(prompts)

            # Generate summary
            if model_selection == "OpenAI ChatGPT":
                if not api_key:
                    return "OpenAI API key required.", full_prompt, "", None, None
                try:
                    openai.api_key = api_key
                    # openai<1.0 style API, consistent with authenticate_openai().
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[{"role": "user", "content": full_prompt}]
                    )
                    summary = response.choices[0].message.content
                except Exception as e:
                    return f"OpenAI API error: {str(e)}", full_prompt, "", None, None
            else:
                summary = send_to_huggingface(full_prompt, HUGGINGFACE_MODELS[hf_model_choice])

            # Save files for download. delete=False keeps the temp files
            # alive so Gradio can serve them after this handler returns.
            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt') as prompt_file:
                prompt_file.write(full_prompt)
                prompt_path = prompt_file.name

            with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.txt') as summary_file:
                summary_file.write(summary)
                summary_path = summary_file.name

            return "Processing complete!", full_prompt, summary, prompt_path, summary_path

        except Exception as e:
            logging.error(f"Error processing PDF: {e}")
            return f"Error processing PDF: {str(e)}", "", "", None, None

    # Connect event handlers
    model_choice.change(
        toggle_hf_model,
        inputs=[model_choice],
        outputs=[hf_model]
    )

    auth_button.click(
        handle_authentication,
        inputs=[openai_api_key],
        outputs=[auth_status]
    )

    process_button.click(
        process_pdf,
        inputs=[
            pdf_input,
            format_type,
            context_size,
            snippet_number,
            custom_prompt,
            model_choice,
            hf_model,
            openai_api_key
        ],
        outputs=[
            progress_status,
            generated_prompt,
            summary_output,
            download_prompt,
            download_summary
        ]
    )

    # Instructions
    gr.Markdown("""
    ### πŸ“Œ Instructions:
    1. (Optional) Enter your OpenAI API key and authenticate
    2. Upload a PDF document
    3. Choose output format and context window size
    4. Optionally specify a snippet number or custom prompt
    5. Select between OpenAI ChatGPT or Hugging Face model
    6. Click 'Process PDF' to generate summary
    7. Download the generated prompt and summary as needed

    ### βš™οΈ Features:
    - Support for multiple PDF formats
    - Flexible text formatting options
    - Custom prompt creation
    - Multiple AI model options
    - Snippet-based processing
    - Downloadable outputs
    """)
331
 
332
# Launch the interface only when run as a script (not when imported).
if __name__ == "__main__":
    # share=False keeps the app local; debug=True surfaces tracebacks in the console.
    demo.launch(share=False, debug=True)