jackkuo committed on
Commit
900c0a5
·
verified ·
1 Parent(s): 0ad8950

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -59
app.py CHANGED
@@ -1,8 +1,10 @@
 
1
  import gradio as gr
2
- import base64
 
 
3
  import os
4
- from openai import OpenAI
5
- import fitz
6
 
7
  api_key = os.getenv('API_KEY')
8
  base_url = os.getenv("BASE_URL")
@@ -14,20 +16,21 @@ client = OpenAI(
14
 
15
 
16
  def extract_pdf_pypdf(pdf_dir):
17
- path = pdf_dir
18
-
19
  try:
20
- doc = fitz.open(path)
21
- except:
22
- print("can not read pdf")
23
  return None
24
 
25
  page_count = doc.page_count
26
  file_content = ""
27
  for page in range(page_count):
28
- text = doc.load_page(page).get_text("text")
29
- # 防止目录中包含References
30
- file_content += text + "\n\n"
 
 
 
31
 
32
  return file_content
33
 
@@ -39,26 +42,13 @@ def openai_api(messages):
39
  messages=messages,
40
  temperature=0.1,
41
  max_tokens=8192,
42
- # timeout=300,
43
  stream=True
44
  )
 
 
 
45
  except Exception as ex:
46
- print("api 出现如下异常%s" % ex)
47
- return None
48
-
49
- if completion:
50
- try:
51
- response_2_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in
52
- completion]
53
- print("response tokens:", len(response_2_list))
54
-
55
- response_2_content = ''.join(response_2_list)
56
- return response_2_content
57
- except Exception as ex:
58
- print("第二轮 出现如下异常%s" % ex)
59
- return None
60
- else:
61
- print("第二轮出现异常")
62
  return None
63
 
64
 
@@ -83,29 +73,30 @@ def predict(input_text, pdf_file):
83
  return extract_result or "Too many users. Please wait a moment!"
84
 
85
 
86
- def view_pdf(pdf_file, max_pages=3):
87
- if pdf_file is None:
88
- return "Please upload a PDF file to view."
89
 
90
- try:
91
- # Open the PDF file
92
- doc = fitz.open(pdf_file.name)
93
 
94
- # Only read up to `max_pages` pages to reduce size for large PDFs
95
- preview_pdf = fitz.open() # Create an empty PDF for the preview
96
- for page_num in range(min(max_pages, doc.page_count)):
97
- preview_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
 
 
 
98
 
99
- # Save the preview as a temporary in-memory file
100
- pdf_data = preview_pdf.tobytes()
101
 
102
- # Encode as base64 for embedding in HTML
103
- b64_data = base64.b64encode(pdf_data).decode('utf-8')
104
- return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
105
 
106
- except Exception as e:
107
- print(f"Error displaying PDF: {e}")
108
- return "Error displaying PDF. Please try re-uploading."
 
109
 
110
 
111
  en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
@@ -120,22 +111,20 @@ examples = [[en_1], [en_2]]
120
 
121
  with gr.Blocks(title="PaperExtractGPT") as demo:
122
  gr.Markdown(
123
- '''<p align="center">
124
- <h1 align="center"> Paper Extract GPT </h1>
125
- <p> How to use:
126
- <br> <strong>1</strong>: Upload your PDF.
127
- <br> <strong>2</strong>: Click "View PDF" to preview it.
128
- <br> <strong>3</strong>: Enter your extraction prompt in the input box.
129
- <br> <strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
130
- </p>
131
- '''
132
  )
133
  with gr.Row():
134
  with gr.Column():
135
- gr.Markdown('## Upload PDF')
136
  file_input = gr.File(label="Upload your PDF", type="filepath")
 
137
  viewer_button = gr.Button("View PDF")
138
- file_out = gr.HTML(label="PDF Preview")
139
 
140
  with gr.Column():
141
  model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
@@ -143,13 +132,13 @@ with gr.Blocks(title="PaperExtractGPT") as demo:
143
  with gr.Row():
144
  gen = gr.Button("Generate")
145
  clr = gr.Button("Clear")
146
- outputs = gr.Markdown(label='Output', show_label=True, value="""| Title | Journal | Year | Author | Institution | Email |
147
  |---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
148
  | Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
149
  """)
150
 
151
  gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
152
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
153
- viewer_button.click(view_pdf, inputs=file_input, outputs=file_out)
154
 
155
  demo.launch()
 
1
+ from openai import OpenAI
2
  import gradio as gr
3
+ import fitz # PyMuPDF
4
+ from PIL import Image
5
+ from pathlib import Path
6
  import os
7
+
 
8
 
9
  api_key = os.getenv('API_KEY')
10
  base_url = os.getenv("BASE_URL")
 
16
 
17
 
18
  def extract_pdf_pypdf(pdf_dir):
 
 
19
  try:
20
+ doc = fitz.open(pdf_dir)
21
+ except Exception as e:
22
+ print(f"Error opening PDF: {e}")
23
  return None
24
 
25
  page_count = doc.page_count
26
  file_content = ""
27
  for page in range(page_count):
28
+ try:
29
+ text = doc.load_page(page).get_text("text")
30
+ file_content += text + "\n\n"
31
+ except Exception as e:
32
+ print(f"Error reading page {page}: {e}")
33
+ continue
34
 
35
  return file_content
36
 
 
42
  messages=messages,
43
  temperature=0.1,
44
  max_tokens=8192,
 
45
  stream=True
46
  )
47
+ response = ''.join(
48
+ [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in completion])
49
+ return response
50
  except Exception as ex:
51
+ print("API error:", ex)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  return None
53
 
54
 
 
73
  return extract_result or "Too many users. Please wait a moment!"
74
 
75
 
76
def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    """Render each page of a PDF to a PNG file.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.
        image_folder: Directory that receives the PNG files; created if it
            does not exist. Files are named ``page_<n>.png`` (1-based), so
            output from an earlier call in the same folder is overwritten.
        dpi: Rendering resolution forwarded to ``get_pixmap``.

    Returns:
        List of the written image paths (as strings), in page order.
    """
    # Create the folder that stores the rendered images.
    os.makedirs(image_folder, exist_ok=True)
    output_dir = Path(image_folder)
    image_paths = []

    pdf_document = fitz.open(pdf_path)
    try:
        # Render every page at the requested DPI and save it as a PNG.
        for page_number in range(len(pdf_document)):
            pix = pdf_document[page_number].get_pixmap(dpi=dpi)
            image_path = output_dir / f"page_{page_number + 1}.png"
            # NOTE(review): "RGB" assumes get_pixmap's default output
            # (RGB colorspace, no alpha channel) - confirm against PyMuPDF docs.
            Image.frombytes("RGB", [pix.width, pix.height], pix.samples).save(image_path)
            image_paths.append(str(image_path))
    finally:
        # Close the document even when rendering or saving a page fails.
        pdf_document.close()

    return image_paths
94
 
 
 
 
95
 
96
def display_pdf_images(file):
    """Convert an uploaded PDF into page images for display.

    Args:
        file: Path of the uploaded PDF, or ``None``/empty when nothing has
            been uploaded yet.

    Returns:
        List of image file paths, one per page; an empty list when no file
        was supplied.
    """
    # Nothing uploaded: show an empty result instead of invoking the converter.
    if not file:
        return []
    return convert_pdf_to_images(file)
100
 
101
 
102
  en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
 
111
 
112
  with gr.Blocks(title="PaperExtractGPT") as demo:
113
  gr.Markdown(
114
+ '''<h1 align="center"> Paper Extract GPT </h1>
115
+ <p>How to use:
116
+ <br><strong>1</strong>: Upload your PDF.
117
+ <br><strong>2</strong>: Click "View PDF" to preview it.
118
+ <br><strong>3</strong>: Enter your extraction prompt in the input box.
119
+ <br><strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
120
+ </p>'''
 
 
121
  )
122
  with gr.Row():
123
  with gr.Column():
 
124
  file_input = gr.File(label="Upload your PDF", type="filepath")
125
+ example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
126
  viewer_button = gr.Button("View PDF")
127
+ file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
128
 
129
  with gr.Column():
130
  model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
 
132
  with gr.Row():
133
  gen = gr.Button("Generate")
134
  clr = gr.Button("Clear")
135
+ outputs = gr.Markdown(label='Output', value="""| Title | Journal | Year | Author | Institution | Email |
136
  |---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
137
  | Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
138
  """)
139
 
140
  gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
141
  clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
142
+ viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
143
 
144
  demo.launch()