Spaces:
Running
Running
wangrongsheng
committed on
Commit
·
d95123f
1
Parent(s):
824d864
Upload 2 files
Browse files- app.py +642 -0
- requirements.txt +9 -0
app.py
ADDED
@@ -0,0 +1,642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
import datetime
|
5 |
+
import arxiv
|
6 |
+
import openai, tenacity
|
7 |
+
import base64, requests
|
8 |
+
import argparse
|
9 |
+
import configparser
|
10 |
+
import fitz, io, os
|
11 |
+
from PIL import Image
|
12 |
+
import gradio
|
13 |
+
import markdown
|
14 |
+
|
15 |
+
class Paper:
    """A paper stored as a PDF file, with its text split into named sections.

    Attributes set by ``__init__``:
        url: link to the article.
        path: location of the PDF, handed to ``fitz.open``.
        sl: section names to look for in the PDF text.
        title: the given title, or the one extracted by ``get_title``.
        authers: author names (spelling kept for compatibility).
        abs: abstract text.
        roman_num / digit_num: numbering prefixes accepted by
            ``get_chapter_names``.

    Attributes set by ``parse_pdf``:
        text_list: text of each page.
        all_text: all page texts joined with a space.
        section_page_dict: section name -> index of the page it was found on.
        section_text_dict: section name -> section text, plus a ``"title"``
            entry.
    """

    def __init__(self, path, title='', url='', abs='', authers=None, sl=None):
        """Create the paper; parse the PDF immediately when no title is given.

        Args:
            path: PDF location.
            title: known title; when empty, the title is extracted from the
                PDF and ``parse_pdf`` is run right away.
            url: link to the article.
            abs: abstract text.
            authers: list of author names; defaults to a new empty list.
            sl: list of section names to search for; defaults to a new empty
                list.
        """
        self.url = url  # link to the article
        self.path = path  # PDF location
        # None stands in for "no list given" so instances never share one
        # default list object.
        self.sl = [] if sl is None else sl
        self.section_names = []  # section titles
        self.section_texts = {}  # section contents
        if title == '':
            # get_title reads self.pdf; close this handle afterwards because
            # parse_pdf opens (and closes) its own.
            self.pdf = fitz.open(self.path)
            try:
                self.title = self.get_title()
            finally:
                self.pdf.close()
            self.parse_pdf()
        else:
            self.title = title
        self.authers = [] if authers is None else authers
        self.abs = abs
        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
        self.digit_num = [str(d + 1) for d in range(10)]
        self.first_image = ''

    def parse_pdf(self):
        """Read the PDF and fill the text and section attributes.

        Opens ``self.path`` into ``self.pdf``, builds ``text_list``,
        ``all_text``, ``section_page_dict`` and ``section_text_dict`` (with an
        extra ``"title"`` entry), and closes the document even when parsing
        fails.
        """
        self.pdf = fitz.open(self.path)
        try:
            self.text_list = [page.get_text() for page in self.pdf]
            self.all_text = ' '.join(self.text_list)
            # section name -> page index
            self.section_page_dict = self._get_all_page_index()
            print("section_page_dict", self.section_page_dict)
            # section name -> section text
            self.section_text_dict = self._get_all_page()
            self.section_text_dict.update({"title": self.title})
        finally:
            self.pdf.close()

    def get_image_path(self, image_path=''):
        """Save the largest embedded image of the PDF, downscaled, to disk.

        The image with the greatest pixel area (the first one on ties) is
        resized so its longer side is 480 pixels and written as
        ``image.<ext>`` inside ``image_path``.

        Args:
            image_path: directory in which the image file is written.

        Returns:
            ``(file_path, ext)`` of the saved image, or ``(None, None)`` when
            the PDF contains no image with a non-zero area.
        """
        largest_area = 0
        largest_image = None
        largest_ext = None
        with fitz.Document(self.path) as pdf_doc:
            for page in pdf_doc:
                for image_info in page.get_images():
                    # The first entry of each image record is its xref.
                    extracted = pdf_doc.extract_image(image_info[0])
                    candidate = Image.open(io.BytesIO(extracted["image"]))
                    area = candidate.size[0] * candidate.size[1]
                    if area > largest_area:
                        largest_area = area
                        largest_image = candidate
                        # Remember the extension of *this* image so the file
                        # name matches the data that is saved.
                        largest_ext = extracted["ext"]

        if largest_image is None:
            return None, None

        im_path = os.path.join(image_path, f"image.{largest_ext}")
        print("im_path:", im_path)

        max_pix = 480
        width, height = largest_image.size
        # Scale the longer side to max_pix and keep the aspect ratio; never
        # let the shorter side collapse to zero pixels.
        if width > height:
            new_size = (max_pix, max(1, int(height * (max_pix / width))))
        else:
            new_size = (max(1, int(width * (max_pix / height))), max_pix)
        resized = largest_image.resize(new_size)

        # Saving by path lets PIL open and close the output file itself.
        resized.save(im_path)
        return im_path, largest_ext

    def get_chapter_names(self,):
        """Return the text lines that look like numbered chapter headings.

        A line qualifies when it contains a dot, splits into 2-4 pieces on
        spaces and into 2-4 pieces on dots, and the text before the first dot
        is one of ``self.roman_num`` or ``self.digit_num``.
        """
        with fitz.open(self.path) as doc:
            all_text = ''.join(page.get_text() for page in doc)

        chapter_names = []
        for line in all_text.split('\n'):
            if '.' not in line:
                continue
            point_split_list = line.split('.')
            space_split_list = line.split(' ')
            if not 1 < len(space_split_list) < 5:
                continue
            prefix = point_split_list[0]
            if 1 < len(point_split_list) < 5 and (prefix in self.roman_num or prefix in self.digit_num):
                print("line:", line)
                chapter_names.append(line)

        return chapter_names

    def get_title(self):
        """Guess the title from the largest fonts used in ``self.pdf``.

        Only the first span of the first line of each text block is examined.
        Spans whose font size is within 0.3 of either of the two largest
        recorded sizes, are longer than 4 characters and do not contain
        ``"arXiv"`` are joined with spaces.

        Returns:
            The assembled title with newlines replaced by spaces; an empty
            string when nothing qualifies.
        """
        # (text, size) of the leading span of every text block, in page order.
        first_spans = []
        for page in self.pdf:
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # 0 marks a text block
                    continue
                lines = block["lines"]
                if not lines or not lines[0]["spans"]:
                    continue
                span = lines[0]["spans"][0]
                first_spans.append((span["text"], span["size"]))

        # The leading 0 keeps index -2 valid when only one span was found.
        font_sizes = sorted([0] + [size for _, size in first_spans])
        print("max_font_sizes", font_sizes[-10:])

        title_parts = []
        for text, size in first_spans:
            near_top = abs(size - font_sizes[-1]) < 0.3 or abs(size - font_sizes[-2]) < 0.3
            if near_top and len(text) > 4 and "arXiv" not in text:
                title_parts.append(text)
        return ' '.join(title_parts).replace('\n', ' ')

    def _get_all_page_index(self):
        """Map each section name in ``self.sl`` to the page it appears on.

        ``"Abstract"`` matches anywhere in a page's text; every other name
        (and ``"Abstract"`` when the plain match fails) must be followed by a
        newline, either as given or upper-cased. When a name matches on
        several pages the last page wins.

        Returns:
            dict of section name -> zero-based page index, keyed in the order
            the names were first found.
        """
        section_page_dict = {}
        for page_index, page in enumerate(self.pdf):
            cur_text = page.get_text()
            for section_name in self.sl:
                if section_name == "Abstract" and section_name in cur_text:
                    section_page_dict[section_name] = page_index
                elif (section_name + '\n' in cur_text
                      or section_name.upper() + '\n' in cur_text):
                    section_page_dict[section_name] = page_index
        return section_page_dict

    @staticmethod
    def _find_heading(page_text, heading):
        """Return the offset of ``heading`` in ``page_text``, trying the upper-case form second; -1 if absent."""
        position = page_text.find(heading)
        if position == -1:
            position = page_text.find(heading.upper())
        return position

    def _get_all_page(self):
        """Collect the text of every located section except the first one.

        Sections are taken in the key order of ``self.section_page_dict``.
        Each section runs from its own heading up to the heading of the next
        section in that order, or to the end of the document for the last
        one. The first located section is skipped.

        Returns:
            dict of section name -> text, with ``'-\\n'`` removed and
            remaining newlines replaced by spaces.
        """
        text_list = [page.get_text() for page in self.pdf]
        section_names = list(self.section_page_dict)
        last_index = len(section_names) - 1
        section_dict = {}

        for sec_index, sec_name in enumerate(section_names):
            print(sec_index, sec_name, self.section_page_dict[sec_name])
            if sec_index <= 0:
                continue

            start_page = self.section_page_dict[sec_name]
            has_next = sec_index < last_index
            if has_next:
                next_sec = section_names[sec_index + 1]
                end_page = self.section_page_dict[next_sec]
            else:
                next_sec = None
                end_page = len(text_list)
            print("start_page, end_page:", start_page, end_page)

            cur_sec_text = ''
            if end_page == start_page:
                # Both headings sit on the same page: take the text between.
                if has_next:
                    page_text = text_list[start_page]
                    start_i = self._find_heading(page_text, sec_name)
                    end_i = self._find_heading(page_text, next_sec)
                    cur_sec_text += page_text[start_i:end_i]
            elif end_page > start_page:
                first_page = text_list[start_page]
                cur_sec_text += first_page[self._find_heading(first_page, sec_name):]
                for page_i in range(start_page + 1, end_page):
                    cur_sec_text += text_list[page_i]
                if has_next:
                    # The section continues on the page where the next one
                    # starts; keep everything before that next heading.
                    closing_page = text_list[end_page]
                    end_i = self._find_heading(closing_page, next_sec)
                    if end_i > 0:
                        cur_sec_text += closing_page[:end_i]
            # When end_page < start_page the located pages are out of order
            # and the section text stays empty.
            section_dict[sec_name] = cur_sec_text.replace('-\n', '').replace('\n', ' ')
        return section_dict
|
263 |
+
|
264 |
+
# 定义Reader类
|
265 |
+
class Reader:
    """Searches arXiv, downloads papers and summarises them with ChatGPT.

    NOTE(review): ``self.config`` is read by ``__init__`` (only when
    ``save_image`` is true) and by ``upload_gitee``, but nothing in this file
    assigns it. The Gitee upload path therefore cannot work as written and is
    effectively disabled because ``save_image`` is hard-coded to ``False``.
    """

    def __init__(self, key_word='', query='', filter_keys='',
                 root_path='./',
                 gitee_key='',
                 sort=arxiv.SortCriterion.SubmittedDate, user_name='defualt', language='cn', key=''):
        """Store the reader's settings.

        Args:
            key_word: research field inserted into the ChatGPT system prompts.
            query: arXiv search query.
            filter_keys: space-separated words that must all occur in an
                abstract for ``filter_arxiv`` to keep the paper.
            root_path: prefix of the download directory.
            gitee_key: accepted for compatibility; the stored ``gitee_key``
                attribute does not use it (see the class note).
            sort: arXiv sort criterion.
            user_name: name of the reader.
            language: language chosen by the reader.
            key: OpenAI API key.
        """
        self.key = str(key)  # OpenAI key
        self.user_name = user_name  # reader's name
        self.key_word = key_word  # field the reader is interested in
        self.query = query  # search query entered by the reader
        self.sort = sort  # chosen sort order
        self.language = language  # chosen language
        self.filter_keys = filter_keys  # words used to filter abstracts
        self.root_path = root_path
        self.file_format = 'md'  # or 'txt'; must be 'md' when images are embedded
        self.save_image = False
        if self.save_image:
            self.gitee_key = self.config.get('Gitee', 'api')
        else:
            self.gitee_key = ''

    def get_arxiv(self, max_results=30):
        """Build the arXiv search for ``self.query``, in descending ``self.sort`` order."""
        return arxiv.Search(
            query=self.query,
            max_results=max_results,
            sort_by=self.sort,
            sort_order=arxiv.SortOrder.Descending,
        )

    def filter_arxiv(self, max_results=30):
        """Return the search results whose abstract contains every filter word.

        Matching is case-insensitive. ``self.filter_keys`` is split on single
        spaces. The full result list and the filtered list are printed.
        """
        search = self.get_arxiv(max_results=max_results)
        # Materialise once so the listing and the filtering share one pass
        # over the results (presumably each results() call would query arXiv
        # again - confirm against the arxiv package).
        results = list(search.results())
        print("all search:")
        for index, result in enumerate(results):
            print(index, result.title, result.updated)

        print("filter_keys:", self.filter_keys)
        wanted = [f_key.lower() for f_key in self.filter_keys.split(" ")]
        filter_results = []
        # A paper is kept only when every word is found in its abstract.
        for result in results:
            abs_text = result.summary.replace('-\n', '-').replace('\n', ' ').lower()
            if all(f_key in abs_text for f_key in wanted):
                filter_results.append(result)

        print("filter_results:", len(filter_results))
        print("filter_papers:")
        for index, result in enumerate(filter_results):
            print(index, result.title, result.updated)
        return filter_results

    def validateTitle(self, title):
        """Replace characters that are unsafe in file names with underscores."""
        rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
        return re.sub(rstr, "_", title)

    def download_pdf(self, filter_results):
        """Download and parse each result; return the papers that succeeded.

        Files go to ``<root_path>pdf_files/<query prefix>-<date-hour>``. A
        failure for one paper is printed and that paper is skipped.
        """
        date_str = str(datetime.datetime.now())[:13].replace(' ', '-')
        path = self.root_path + 'pdf_files/' + self.query.replace('au: ', '').replace('title: ', '').replace('ti: ', '').replace(':', ' ')[:25] + '-' + date_str
        try:
            os.makedirs(path, exist_ok=True)
        except OSError as e:
            # Best effort: the per-paper handler below reports what follows.
            print("makedirs_error:", e)
        print("All_paper:", len(filter_results))

        paper_list = []
        for result in filter_results:
            try:
                pdf_name = self.validateTitle(result.title) + '.pdf'
                self.try_download_pdf(result, path, pdf_name)
                paper_path = os.path.join(path, pdf_name)
                print("paper_path:", paper_path)
                paper = Paper(path=paper_path,
                              url=result.entry_id,
                              title=result.title,
                              abs=result.summary.replace('-\n', '-').replace('\n', ' '),
                              authers=[str(aut) for aut in result.authors],
                              )
                # A title was supplied, so parsing has to be triggered here.
                paper.parse_pdf()
                paper_list.append(paper)
            except Exception as e:
                print("download_error:", e)
        return paper_list

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def try_download_pdf(self, result, path, pdf_name):
        """Download one result's PDF into ``path``; retried up to 5 attempts."""
        result.download_pdf(path, filename=pdf_name)

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def upload_gitee(self, image_path, image_name='', ext='png'):
        """Upload an image file to Gitee and return its URL.

        Args:
            image_path: local file to upload.
            image_name: prefix of the remote file name.
            ext: extension of the remote file name.

        Returns:
            The ``download_url`` reported by Gitee, or the request URL when
            the response has no ``content`` entry.

        NOTE(review): relies on ``self.config``, which this file never sets.
        """
        with open(image_path, 'rb') as f:
            base64_content = base64.b64encode(f.read()).decode()

        date_str = str(datetime.datetime.now())[:19].replace(':', '-').replace(' ', '-') + '.' + ext
        path = image_name + '-' + date_str

        owner = self.config.get('Gitee', 'owner')
        repo = self.config.get('Gitee', 'repo')
        folder = self.config.get('Gitee', 'path')
        payload = {
            "access_token": self.gitee_key,
            "owner": owner,
            "repo": repo,
            "path": folder,
            "content": base64_content,
            "message": "upload image"
        }
        # Account, repository and folder come from the 'Gitee' config section.
        url = 'https://gitee.com/api/v5/repos/' + owner + '/' + repo + '/contents/' + folder + '/' + path
        rep = requests.post(url, json=payload).json()
        print("rep:", rep)
        if 'content' in rep:
            return rep['content']['download_url']
        return url

    def summary_with_chat(self, paper_list):
        """Summarise the papers with ChatGPT and return the result as HTML.

        For each paper three requests are made: an overall summary, a method
        summary (only when a section name contains "method" or "approach"),
        and a conclusion. The collected markdown is converted with
        ``markdown.markdown`` once all papers are processed.

        Args:
            paper_list: parsed ``Paper`` objects.

        Returns:
            HTML string; empty when ``paper_list`` is empty.
        """
        htmls = []
        # Character cap applied to every prompt (presumably ~2500 tokens at
        # 4 characters each - confirm).
        max_token = 2500 * 4
        for paper in paper_list:
            # Step 1: summarise from title, url, abstract and the first
            # stored section.
            text = 'Title:' + paper.title
            text += 'Url:' + paper.url
            text += 'Abstrat:' + paper.abs
            text += next(iter(paper.section_text_dict.values()))
            chat_summary_text = self.chat_summary(text=text[:max_token])
            htmls.append(chat_summary_text)

            # TODO: choose the inserted figure more cleverly than "largest".
            # The image is only extracted when it can actually be uploaded.
            if self.gitee_key != '':
                first_image, ext = paper.get_image_path()
                if first_image is not None:
                    image_title = self.validateTitle(paper.title)
                    image_url = self.upload_gitee(image_path=first_image, image_name=image_title, ext=ext)
                    htmls.append("\n")
                    htmls.append("![Fig](" + image_url + ")")
                    htmls.append("\n")

            # Step 2: summarise the method section.
            # TODO: section names that are algorithm names are missed by this
            # keyword match.
            method_key = next(
                (name for name in paper.section_text_dict
                 if 'method' in name.lower() or 'approach' in name.lower()),
                '',
            )
            if method_key != '':
                text = "<summary>" + chat_summary_text + "\n <Methods>:\n" + paper.section_text_dict[method_key]
                chat_method_text = self.chat_method(text=text[:max_token])
                htmls.append(chat_method_text)
            else:
                chat_method_text = ''
            htmls.append("\n")

            # Step 3: overall conclusion.
            conclusion_key = next(
                (name for name in paper.section_text_dict if 'conclu' in name.lower()),
                '',
            )
            text = "<summary>" + chat_summary_text + "\n <Method summary>:\n" + chat_method_text
            if conclusion_key != '':
                text += "\n <Conclusion>:\n" + paper.section_text_dict[conclusion_key]
            chat_conclusion_text = self.chat_conclusion(text=text[:max_token])
            htmls.append(chat_conclusion_text)
            htmls.append("\n")

        return markdown.markdown("\n".join(htmls))

    def _request_chat(self, messages, log_label):
        """Send ``messages`` to gpt-3.5-turbo, print and return the joined reply text."""
        openai.api_key = self.key
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
        )
        result = ''.join(choice.message.content for choice in response.choices)
        print(log_label, result)
        return result

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_conclusion(self, text):
        """Ask ChatGPT for the conclusion part (item 8); retried up to 5 attempts.

        Args:
            text: the earlier summaries plus, when available, the paper's
                conclusion section.
        """
        # TODO: rewrite the prompts in English to use fewer tokens.
        messages = [
            # role given to ChatGPT
            {"role": "system", "content": "你是一个[" + self.key_word + "]领域的审稿人,你需要严格评审这篇文章"},
            # background material
            {"role": "assistant", "content": "这是一篇英文文献的<summary>和<conclusion>部分内容,其中<summary>你已经总结好了,但是<conclusion>部分,我需要你帮忙归纳下面问题:" + text},
            {"role": "user", "content": """
8. 做出如下总结:
- (1):这篇工作的意义如何?
- (2):从创新点、性能、工作量这三个维度,总结这篇文章的优点和缺点。
.......
按照后面的格式输出:
8. Conclusion:
- (1):xxx;
- (2):创新点: xxx; 性能: xxx; 工作量: xxx;

务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,.......代表按照实际需求填写,如果没有可以不用写.
"""},
        ]
        return self._request_chat(messages, "conclusion_result:\n")

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_method(self, text):
        """Ask ChatGPT for the method part (item 7); retried up to 5 attempts.

        Args:
            text: the overall summary followed by the paper's method section.
        """
        messages = [
            # role given to ChatGPT
            {"role": "system", "content": "你是一个[" + self.key_word + "]领域的科研人员,善于使用精炼的语句总结论文"},
            # background material
            {"role": "assistant", "content": "这是一篇英文文献的<summary>和<Method>部分内容,其中<summary>你已经总结好了,但是<Methods>部分,我需要你帮忙阅读并归纳下面问题:" + text},
            {"role": "user", "content": """
7. 详细描述这篇文章的方法思路。比如说它的步骤是:
- (1):...
- (2):...
- (3):...
- .......
按照后面的格式输出:
7. Methods:
- (1):xxx;
- (2):xxx;
- (3):xxx;
.......

务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行,.......代表按照实际需求填写,如果没有可以不用写.
"""},
        ]
        return self._request_chat(messages, "method_result:\n")

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_summary(self, text):
        """Ask ChatGPT for the overall summary (items 1-6); retried up to 5 attempts.

        Args:
            text: title, url, abstract and first section of the paper.
        """
        messages = [
            # role given to ChatGPT
            {"role": "system", "content": "你是一个[" + self.key_word + "]领域的科研人员,善于使用精炼的语句总结论文"},
            # background material
            {"role": "assistant", "content": "这是一篇英文文献的标题,作者,链接,Abstract和Introduction部分内容,我需要你帮忙阅读并归纳下面问题:" + text},
            {"role": "user", "content": """
1. 标记出这篇文献的标题(加上中文翻译)
2. 列举所有的作者姓名 (使用英文)
3. 标记第一作者的单位(只输出中文翻译)
4. 标记出这篇文章的关键词(使用英文)
5. 论文链接,Github代码链接(如果有的话,没有的话请填写Github:None)
6. 按照下面四个点进行总结:
- (1):这篇文章的研究背景是什么?
- (2):过去的方法有哪些?它们存在什么问题?本文和过去的研究有哪些本质的区别?Is the approach well motivated?
- (3):本文提出的研究方法是什么?
- (4):本文方法在什么任务上,取得了什么性能?性能能否支持他们的目标?
按照后面的格式输出:
1. Title: xxx
2. Authors: xxx
3. Affiliation: xxx
4. Keywords: xxx
5. Urls: xxx or xxx , xxx
6. Summary:
- (1):xxx;
- (2):xxx;
- (3):xxx;
- (4):xxx.

务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要有太多重复的信息,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行.
"""},
        ]
        return self._request_chat(messages, "summary_result:\n")

    def export_to_markdown(self, text, file_name, mode='w'):
        """Write ``text`` unchanged to ``file_name`` as UTF-8 using ``mode``."""
        with open(file_name, mode, encoding="utf-8") as f:
            f.write(text)

    def show_info(self):
        """Print the reader's key word, query and sort order."""
        print(f"Key word: {self.key_word}")
        print(f"Query: {self.query}")
        print(f"Sort: {self.sort}")
|
612 |
+
|
613 |
+
def upload_pdf(key, text, file):
    """Gradio callback: summarise an uploaded PDF and return HTML.

    Args:
        key: OpenAI API key typed by the user.
        text: comma-separated section names to look for in the paper.
        file: uploaded file object; its ``name`` attribute must end in
            ``.pdf`` (case-insensitive).

    Returns:
        The HTML summary, or a message string when an input is missing or
        the upload is not a PDF.
    """
    # All three inputs are required.
    if not key or not text or not file:
        return "两个输入都不能为空,请输入字符并上传 PDF 文件!"
    # Only PDF uploads are accepted.
    if file.name.split(".")[-1].lower() != "pdf":
        return '请勿上传非 PDF 文件!'

    # Drop surrounding blanks and stray quote characters so each entry can
    # match a heading in the PDF text exactly; skip empty entries.
    cleaned_names = (name.strip().strip("'\"").strip() for name in text.split(','))
    section_list = [name for name in cleaned_names if name]
    paper_list = [Paper(path=file, sl=section_list)]
    # The key goes to the Reader, which hands it to the OpenAI client;
    # summary_with_chat itself takes no key argument.
    reader = Reader(key=key)
    return reader.summary_with_chat(paper_list=paper_list)
|
627 |
+
|
628 |
+
# Page title
title = "ChatPaper"
# Page description (HTML)
description = "<div align='center'>帮助您快速阅读论文</div>"
# Inputs of the Gradio interface: API key, section-name list, PDF upload.
# The default section list carries no surrounding quote characters, so the
# first and last names stay usable after splitting on commas.
ip = [
    gradio.inputs.Textbox(label="请输入你的API-key", default=""),
    gradio.inputs.Textbox(label="请输入论文大标题索引,(用【,】隔开)", default="Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References"),
    gradio.inputs.File(label="上传论文(必须为PDF)")
]

interface = gradio.Interface(fn=upload_pdf, inputs=ip, outputs="html", title=title, description=description)

# Start the Gradio application
interface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
arxiv==1.4.3
|
2 |
+
PyMuPDF==1.21.1
|
3 |
+
requests==2.26.0
|
4 |
+
tiktoken==0.2.0
|
5 |
+
tenacity==8.2.2
|
6 |
+
pybase64==1.2.3
|
7 |
+
Pillow==9.4.0
|
8 |
+
openai==0.27.0
|
9 |
+
markdown
|