Euclid-Jie committed on
Commit
285fa46
·
1 Parent(s): 380bfe6

feature(read pdf paper then write summary):

Browse files

Add a function called readPdf to toolbox, which reads a PDF paper and returns its text as a list of text blocks. The caller then joins the blocks into a single str and uses bs4.BeautifulSoup to clean the content.

crazy_functions/读文章写摘要.py CHANGED
@@ -1,14 +1,19 @@
1
  from predict import predict_no_ui
2
- from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
3
  fast_debug = False
 
4
 
5
 
6
  def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
7
  import time, glob, os
8
  print('begin analysis on:', file_manifest)
9
  for index, fp in enumerate(file_manifest):
10
- with open(fp, 'r', encoding='utf-8') as f:
11
- file_content = f.read()
 
 
 
 
12
 
13
  prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
14
  i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
@@ -17,7 +22,7 @@ def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, hist
17
  print('[1] yield chatbot, history')
18
  yield chatbot, history, '正常'
19
 
20
- if not fast_debug:
21
  msg = '正常'
22
  # ** gpt request **
23
  gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[]) # 带超时倒计时
@@ -35,7 +40,7 @@ def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, hist
35
  chatbot.append((i_say, "[Local Message] waiting gpt response."))
36
  yield chatbot, history, '正常'
37
 
38
- if not fast_debug:
39
  msg = '正常'
40
  # ** gpt request **
41
  gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say, chatbot, top_p, temperature, history=history) # 带超时倒计时
@@ -60,11 +65,12 @@ def 读文章写摘要(txt, top_p, temperature, chatbot, history, systemPromptTx
60
  report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
61
  yield chatbot, history, '正常'
62
  return
63
- file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] # + \
 
64
  # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
65
  # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
66
  if len(file_manifest) == 0:
67
- report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex文件: {txt}")
68
  yield chatbot, history, '正常'
69
  return
70
  yield from 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)
 
1
  from predict import predict_no_ui
2
+ from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down, readPdf
3
  fast_debug = False
4
+ from bs4 import BeautifulSoup
5
 
6
 
7
  def 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
8
  import time, glob, os
9
  print('begin analysis on:', file_manifest)
10
  for index, fp in enumerate(file_manifest):
11
+ if ".tex" in fp:
12
+ with open(fp, 'r', encoding='utf-8') as f:
13
+ file_content = f.read()
14
+ if ".pdf" in fp:
15
+ file_content = readPdf(fp)
16
+ file_content = BeautifulSoup(''.join(file_content), features="lxml").body.text.encode('gbk', 'ignore').decode('gbk')
17
 
18
  prefix = "接下来请你逐文件分析下面的论文文件,概括其内容" if index==0 else ""
19
  i_say = prefix + f'请对下面的文章片段用中文做一个概述,文件名是{os.path.relpath(fp, project_folder)},文章内容是 ```{file_content}```'
 
22
  print('[1] yield chatbot, history')
23
  yield chatbot, history, '正常'
24
 
25
+ if not fast_debug:
26
  msg = '正常'
27
  # ** gpt request **
28
  gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[]) # 带超时倒计时
 
40
  chatbot.append((i_say, "[Local Message] waiting gpt response."))
41
  yield chatbot, history, '正常'
42
 
43
+ if not fast_debug:
44
  msg = '正常'
45
  # ** gpt request **
46
  gpt_say = yield from predict_no_ui_but_counting_down(i_say, i_say, chatbot, top_p, temperature, history=history) # 带超时倒计时
 
65
  report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
66
  yield chatbot, history, '正常'
67
  return
68
+ file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.tex', recursive=True)] + \
69
+ [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)] # + \
70
  # [f for f in glob.glob(f'{project_folder}/**/*.cpp', recursive=True)] + \
71
  # [f for f in glob.glob(f'{project_folder}/**/*.c', recursive=True)]
72
  if len(file_manifest) == 0:
73
+ report_execption(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.tex或pdf文件: {txt}")
74
  yield chatbot, history, '正常'
75
  return
76
  yield from 解析Paper(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)
functional_crazy.py CHANGED
@@ -30,7 +30,7 @@ def get_crazy_functionals():
30
  "Color": "stop", # 按钮颜色
31
  "Function": 解析一个C项目
32
  },
33
- "读tex论文写摘要": {
34
  "Color": "stop", # 按钮颜色
35
  "Function": 读文章写摘要
36
  },
 
30
  "Color": "stop", # 按钮颜色
31
  "Function": 解析一个C项目
32
  },
33
+ "读tex or pdf论文写摘要": {
34
  "Color": "stop", # 按钮颜色
35
  "Function": 读文章写摘要
36
  },
requirements.txt CHANGED
@@ -1,3 +1,10 @@
1
  gradio>=3.23
2
- requests[socks]
3
- mdtex2html
 
 
 
 
 
 
 
 
1
  gradio>=3.23
2
+ requests[socks]~=2.28.2
3
+ mdtex2html~=1.2.0
4
+ Markdown~=3.4.3
5
+ latex2mathml~=3.75.1
6
+ bs4~=0.0.1
7
+ lxml~=4.6.4
8
+ beautifulsoup4~=4.12.0
9
+ numpy~=1.24.2
10
+ pdfminer.six
toolbox.py CHANGED
@@ -1,6 +1,14 @@
1
  import markdown, mdtex2html, threading, importlib, traceback
2
  from show_math import convert as convert_math
3
  from functools import wraps
 
 
 
 
 
 
 
 
4
 
5
  def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
6
  """
@@ -235,4 +243,52 @@ def clear_line_break(txt):
235
  txt = txt.replace('\n', ' ')
236
  txt = txt.replace(' ', ' ')
237
  txt = txt.replace(' ', ' ')
238
- return txt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import markdown, mdtex2html, threading, importlib, traceback
2
  from show_math import convert as convert_math
3
  from functools import wraps
4
+ import pdfminer
5
+ from pdfminer.pdfparser import PDFParser
6
+ from pdfminer.pdfdocument import PDFDocument
7
+ from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
8
+ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
9
+ from pdfminer.pdfdevice import PDFDevice
10
+ from pdfminer.layout import LAParams
11
+ from pdfminer.converter import PDFPageAggregator
12
 
13
  def predict_no_ui_but_counting_down(i_say, i_say_show_user, chatbot, top_p, temperature, history=[], sys_prompt=''):
14
  """
 
243
  txt = txt.replace('\n', ' ')
244
  txt = txt.replace(' ', ' ')
245
  txt = txt.replace(' ', ' ')
246
+ return txt
247
+
248
def readPdf(pdfPath):
    """
    Read a PDF file and return its text content.

    Args:
        pdfPath: Path of the PDF file to read.

    Returns:
        A list of strings, one per horizontal text box found by the layout
        analysis, appended page by page. Callers that need a single string
        should join the list themselves.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that text
            extraction is not permitted.
    """
    # Parameters for the layout analysis.
    laparams = LAParams(
        char_margin=10.0,
        line_margin=0.2,
        boxes_flow=0.2,
        all_texts=False,
    )

    outTextList = []
    # The whole extraction stays inside the `with` block: pages are pulled
    # from the file object while iterating, so it must remain open until the
    # loop finishes, and it is closed even if an exception is raised.
    with open(pdfPath, 'rb') as fp:
        # Parser bound to the file object, and the document structure built
        # on top of it (a password could be passed as the 2nd argument).
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        # Abort early if the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the aggregator device and interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            # Render the page into a layout object, then keep only the
            # horizontal text boxes.
            interpreter.process_page(page)
            layout = device.get_result()
            # Iterate the layout container itself rather than reaching into
            # its private `_objs` attribute.
            for obj in layout:
                if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                    outTextList.append(obj.get_text())

    return outTextList