Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,10 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
-
import
|
|
|
|
|
3 |
import os
|
4 |
-
|
5 |
-
import fitz
|
6 |
|
7 |
api_key = os.getenv('API_KEY')
|
8 |
base_url = os.getenv("BASE_URL")
|
@@ -14,20 +16,21 @@ client = OpenAI(
|
|
14 |
|
15 |
|
16 |
def extract_pdf_pypdf(pdf_dir):
|
17 |
-
path = pdf_dir
|
18 |
-
|
19 |
try:
|
20 |
-
doc = fitz.open(
|
21 |
-
except:
|
22 |
-
print("
|
23 |
return None
|
24 |
|
25 |
page_count = doc.page_count
|
26 |
file_content = ""
|
27 |
for page in range(page_count):
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
31 |
|
32 |
return file_content
|
33 |
|
@@ -39,26 +42,13 @@ def openai_api(messages):
|
|
39 |
messages=messages,
|
40 |
temperature=0.1,
|
41 |
max_tokens=8192,
|
42 |
-
# timeout=300,
|
43 |
stream=True
|
44 |
)
|
|
|
|
|
|
|
45 |
except Exception as ex:
|
46 |
-
print("
|
47 |
-
return None
|
48 |
-
|
49 |
-
if completion:
|
50 |
-
try:
|
51 |
-
response_2_list = [chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in
|
52 |
-
completion]
|
53 |
-
print("response tokens:", len(response_2_list))
|
54 |
-
|
55 |
-
response_2_content = ''.join(response_2_list)
|
56 |
-
return response_2_content
|
57 |
-
except Exception as ex:
|
58 |
-
print("第二轮 出现如下异常%s" % ex)
|
59 |
-
return None
|
60 |
-
else:
|
61 |
-
print("第二轮出现异常")
|
62 |
return None
|
63 |
|
64 |
|
@@ -83,29 +73,30 @@ def predict(input_text, pdf_file):
|
|
83 |
return extract_result or "Too many users. Please wait a moment!"
|
84 |
|
85 |
|
86 |
-
def
|
87 |
-
|
88 |
-
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
|
|
|
|
|
|
98 |
|
99 |
-
|
100 |
-
|
101 |
|
102 |
-
# Encode as base64 for embedding in HTML
|
103 |
-
b64_data = base64.b64encode(pdf_data).decode('utf-8')
|
104 |
-
return f"<embed src='data:application/pdf;base64,{b64_data}' type='application/pdf' width='100%' height='700px' />"
|
105 |
|
106 |
-
|
107 |
-
|
108 |
-
|
|
|
109 |
|
110 |
|
111 |
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
|
@@ -120,22 +111,20 @@ examples = [[en_1], [en_2]]
|
|
120 |
|
121 |
with gr.Blocks(title="PaperExtractGPT") as demo:
|
122 |
gr.Markdown(
|
123 |
-
'''<
|
124 |
-
<
|
125 |
-
<
|
126 |
-
<br
|
127 |
-
<br
|
128 |
-
<br
|
129 |
-
|
130 |
-
</p>
|
131 |
-
'''
|
132 |
)
|
133 |
with gr.Row():
|
134 |
with gr.Column():
|
135 |
-
gr.Markdown('## Upload PDF')
|
136 |
file_input = gr.File(label="Upload your PDF", type="filepath")
|
|
|
137 |
viewer_button = gr.Button("View PDF")
|
138 |
-
file_out = gr.
|
139 |
|
140 |
with gr.Column():
|
141 |
model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
@@ -143,13 +132,13 @@ with gr.Blocks(title="PaperExtractGPT") as demo:
|
|
143 |
with gr.Row():
|
144 |
gen = gr.Button("Generate")
|
145 |
clr = gr.Button("Clear")
|
146 |
-
outputs = gr.Markdown(label='Output',
|
147 |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
|
148 |
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
|
149 |
""")
|
150 |
|
151 |
gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
|
152 |
clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
|
153 |
-
viewer_button.click(
|
154 |
|
155 |
demo.launch()
|
|
|
1 |
+
from openai import OpenAI
|
2 |
import gradio as gr
|
3 |
+
import fitz # PyMuPDF
|
4 |
+
from PIL import Image
|
5 |
+
from pathlib import Path
|
6 |
import os
|
7 |
+
|
|
|
8 |
|
9 |
api_key = os.getenv('API_KEY')
|
10 |
base_url = os.getenv("BASE_URL")
|
|
|
16 |
|
17 |
|
18 |
def extract_pdf_pypdf(pdf_dir):
    """Return the plain text of every page of a PDF as one string.

    Args:
        pdf_dir: Path of the PDF file to read (passed straight to
            ``fitz.open``).

    Returns:
        The text of all readable pages, each followed by a blank line,
        or ``None`` when the document cannot be opened.
    """
    try:
        doc = fitz.open(pdf_dir)
    except Exception as e:
        print(f"Error opening PDF: {e}")
        return None

    page_texts = []
    try:
        for page in range(doc.page_count):
            try:
                text = doc.load_page(page).get_text("text")
            except Exception as e:
                # Best effort: one unreadable page must not discard the rest.
                print(f"Error reading page {page}: {e}")
                continue
            page_texts.append(text + "\n\n")
    finally:
        # Release the file handle even if something unexpected is raised.
        doc.close()

    return "".join(page_texts)
|
36 |
|
|
|
42 |
messages=messages,
|
43 |
temperature=0.1,
|
44 |
max_tokens=8192,
|
|
|
45 |
stream=True
|
46 |
)
|
47 |
+
response = ''.join(
|
48 |
+
[chunk.choices[0].delta.content if chunk.choices[0].delta.content else "" for chunk in completion])
|
49 |
+
return response
|
50 |
except Exception as ex:
|
51 |
+
print("API error:", ex)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
return None
|
53 |
|
54 |
|
|
|
73 |
return extract_result or "Too many users. Please wait a moment!"
|
74 |
|
75 |
|
76 |
+
def convert_pdf_to_images(pdf_path, image_folder="pdf_images", dpi=300):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF to render (passed to ``fitz.open``).
        image_folder: Directory the PNGs are written to; created if missing.
        dpi: Resolution used when rasterising each page.

    Returns:
        A list of image paths (as strings), one per page, in page order.
        Files are named ``page_<n>.png`` with ``n`` starting at 1.
    """
    # Make sure the output directory exists before rendering anything.
    output_dir = Path(image_folder)
    os.makedirs(output_dir, exist_ok=True)

    pdf_document = fitz.open(pdf_path)
    image_paths = []
    try:
        # Rasterise each page at the requested DPI and save it as a PNG.
        for page_number, page in enumerate(pdf_document, start=1):
            pix = page.get_pixmap(dpi=dpi)
            image_path = output_dir / f"page_{page_number}.png"
            Image.frombytes(
                "RGB", [pix.width, pix.height], pix.samples
            ).save(image_path)
            image_paths.append(str(image_path))
    finally:
        # Close the document even when rendering or saving a page fails.
        pdf_document.close()

    return image_paths
|
94 |
|
|
|
|
|
|
|
95 |
|
96 |
+
def display_pdf_images(file):
    """Convert the given PDF to page images and return their paths.

    Delegates to ``convert_pdf_to_images`` with its default folder and DPI;
    the returned list of image paths is what the caller displays.
    """
    return convert_pdf_to_images(file)
|
100 |
|
101 |
|
102 |
en_1 = """Could you please help me extract the information of 'title'/'journal'/'year'/'author'/'institution'/'email' from the previous content in a markdown table format?
|
|
|
111 |
|
112 |
with gr.Blocks(title="PaperExtractGPT") as demo:
|
113 |
gr.Markdown(
|
114 |
+
'''<h1 align="center"> Paper Extract GPT </h1>
|
115 |
+
<p>How to use:
|
116 |
+
<br><strong>1</strong>: Upload your PDF.
|
117 |
+
<br><strong>2</strong>: Click "View PDF" to preview it.
|
118 |
+
<br><strong>3</strong>: Enter your extraction prompt in the input box.
|
119 |
+
<br><strong>4</strong>: Click "Generate" to extract, and the extracted information will display below.
|
120 |
+
</p>'''
|
|
|
|
|
121 |
)
|
122 |
with gr.Row():
|
123 |
with gr.Column():
|
|
|
124 |
file_input = gr.File(label="Upload your PDF", type="filepath")
|
125 |
+
example = gr.Examples(examples=[["./sample.pdf"]], inputs=file_input)
|
126 |
viewer_button = gr.Button("View PDF")
|
127 |
+
file_out = gr.Gallery(label="PDF Viewer", columns=1, height="auto", object_fit="contain")
|
128 |
|
129 |
with gr.Column():
|
130 |
model_input = gr.Textbox(lines=7, placeholder='Enter your extraction prompt here', label='Input Prompt')
|
|
|
132 |
with gr.Row():
|
133 |
gen = gr.Button("Generate")
|
134 |
clr = gr.Button("Clear")
|
135 |
+
outputs = gr.Markdown(label='Output', value="""| Title | Journal | Year | Author | Institution | Email |
|
136 |
|---------------------------------------------|--------------------|------|-----------------------------------------------|-------------------------------------------------------|-----------------------|
|
137 |
| Paleomagnetic Study of Deccan Traps from Jabalpur to Amarkantak, Central India | J. Geomag. Geoelectr. | 1973 | R. K. VERMA, G. PULLAIAH, G.R. ANJANEYULU, P. K. MALLIK | National Geophysical Research Institute, Hyderabad, and Indian School o f Mines, Dhanbad | "" |
|
138 |
""")
|
139 |
|
140 |
gen.click(fn=predict, inputs=[model_input, file_input], outputs=outputs)
|
141 |
clr.click(fn=lambda: [gr.update(value=""), gr.update(value="")], inputs=None, outputs=[model_input, outputs])
|
142 |
+
viewer_button.click(display_pdf_images, inputs=file_input, outputs=file_out)
|
143 |
|
144 |
demo.launch()
|