wangrongsheng committed on
Commit
a04bf88
1 Parent(s): 2ccd2a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -1
app.py CHANGED
@@ -60,6 +60,7 @@ class Paper:
60
  self.section_names = [] # 段落标题
61
  self.section_texts = {} # 段落内容
62
  self.abs = abs
 
63
  if title == '':
64
  self.pdf = fitz.open(self.path) # pdf文档
65
  self.title = self.get_title()
@@ -79,7 +80,17 @@ class Paper:
79
  print("section_page_dict", self.section_page_dict)
80
  self.section_text_dict = self._get_all_page() # 段落与内容的对应字典
81
  self.section_text_dict.update({"title": self.title})
82
- self.pdf.close()
 
 
 
 
 
 
 
 
 
 
83
 
84
  def get_image_path(self, image_path=''):
85
  """
@@ -195,6 +206,7 @@ class Paper:
195
  cur_title += cur_string
196
  else:
197
  cur_title += ' ' + cur_string
 
198
  # break
199
  title = cur_title.replace('\n', ' ')
200
  return title
@@ -433,6 +445,7 @@ class Reader:
433
  text += 'Title:' + paper.title
434
  text += 'Url:' + paper.url
435
  text += 'Abstrat:' + paper.abs
 
436
  # intro
437
  text += list(paper.section_text_dict.values())[0]
438
  #max_token = 2500 * 4
 
60
  self.section_names = [] # 段落标题
61
  self.section_texts = {} # 段落内容
62
  self.abs = abs
63
+ self.title_page = 0
64
  if title == '':
65
  self.pdf = fitz.open(self.path) # pdf文档
66
  self.title = self.get_title()
 
80
  print("section_page_dict", self.section_page_dict)
81
  self.section_text_dict = self._get_all_page() # 段落与内容的对应字典
82
  self.section_text_dict.update({"title": self.title})
83
+ self.pdf.close()
84
+
85
def get_paper_info(self):
    """Return the title page's text with the abstract and introduction removed.

    Reads the text of page ``self.title_page`` from ``self.pdf`` and strips
    out the abstract and the introduction, leaving whatever else is on that
    page.

    The abstract is taken from ``self.section_text_dict['Abstract']`` when
    that key exists, otherwise from ``self.abs``. The introduction is taken
    from ``self.section_text_dict['Introduction']`` when present. A section
    that is missing, empty, or not a string is skipped instead of raising.

    Returns:
        str: the title-page text without the abstract/introduction bodies.
    """
    # NOTE(review): the constructor appears to call self.pdf.close() once
    # parsing is done -- confirm the document is still open when this runs.
    first_page_text = self.pdf[self.title_page].get_text()

    sections = self.section_text_dict
    # Keep the original precedence: a parsed "Abstract" section wins over
    # the externally supplied abstract, even when the parsed one is empty.
    abstract_text = sections['Abstract'] if 'Abstract' in sections else self.abs
    # Not every paper has a section literally named "Introduction".
    introduction_text = sections.get('Introduction')

    for section_text in (abstract_text, introduction_text):
        # str.replace would raise on None; an empty needle is a no-op anyway.
        if isinstance(section_text, str) and section_text:
            first_page_text = first_page_text.replace(section_text, "")
    return first_page_text
94
 
95
  def get_image_path(self, image_path=''):
96
  """
 
206
  cur_title += cur_string
207
  else:
208
  cur_title += ' ' + cur_string
209
+ self.title_page = page_index
210
  # break
211
  title = cur_title.replace('\n', ' ')
212
  return title
 
445
  text += 'Title:' + paper.title
446
  text += 'Url:' + paper.url
447
  text += 'Abstrat:' + paper.abs
448
+ text += 'Paper_info:' + paper.section_text_dict['paper_info']
449
  # intro
450
  text += list(paper.section_text_dict.values())[0]
451
  #max_token = 2500 * 4