zmbfeng committed on
Commit
bd247ef
1 Parent(s): 3aa8d73

pdf to paragraph list refactored

Browse files
Files changed (1) hide show
  1. app.py +44 -36
app.py CHANGED
@@ -27,7 +27,7 @@ def is_new_file_upload(uploaded_file):
27
  # st.write("This is the first file upload detected.")
28
  st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
29
  return True
30
- def combined_similarity(similarity, sentence, query):
31
  # Tokenize both the sentence and the query
32
  # sentence_words = set(sentence.split())
33
  # query_words = set(query.split())
@@ -126,6 +126,43 @@ big_text = """
126
  # Display the styled text
127
  st.markdown(big_text, unsafe_allow_html=True)
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  uploaded_pdf_file = st.file_uploader("Upload a PDF file",
130
  type=['pdf'])
131
  st.markdown(
@@ -152,39 +189,10 @@ if uploaded_pdf_file is not None:
152
  st.session_state.uploaded_path=os.path.join(save_path, uploaded_pdf_file.name)
153
  # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
154
  # print("page_count=",st.session_state.page_count)
 
155
  doc = fitz.open(st.session_state.uploaded_path)
156
- sentence_endings = ('.', '!', '?')
157
- start_page = 1
158
- st.session_state.restored_paragraphs = []
159
- for page_num in range(start_page - 1, len(doc)): # start_page - 1 to adjust for 0-based index
160
- page = doc.load_page(page_num)
161
- blocks = page.get_text("blocks")
162
-
163
- block_index = 1
164
- for block in blocks:
165
- x0, y0, x1, y1, text, block_type, flags = block
166
- if text.strip() != "":
167
- text = text.strip()
168
- text = re.sub(r'\n\s+\n', '\n\n', text)
169
- list_pattern = re.compile(r'^\s*((?:\d+\.|[a-zA-Z]\.|[*-])\s+.+)', re.MULTILINE)
170
- match = list_pattern.search(text)
171
- containsList = False
172
- if match:
173
- containsList = True
174
- # print ("list detected")
175
- paragraph = ""
176
- if bool(re.search(r'\n{2,}', text)):
177
- substrings = re.split(r'\n{2,}', text)
178
- for substring in substrings:
179
- if substring.strip() != "":
180
- paragraph = substring
181
- st.session_state.restored_paragraphs.append(
182
- {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": text});
183
- # print(f"<substring> {substring} </substring>")
184
- else:
185
- paragraph = text
186
- st.session_state.restored_paragraphs.append(
187
- {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": None});
188
  if isinstance(st.session_state.restored_paragraphs, list):
189
  # Count the restored_paragraphs of top-level elements
190
  st.session_state.list_count = len(st.session_state.restored_paragraphs)
@@ -217,9 +225,9 @@ if 'paragraph_sentence_encodings' in st.session_state:
217
  for sentence_encoding in paragraph_sentence_encoding[1]:
218
  if sentence_encoding:
219
  similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
220
- combined_score, similarity_score, commonality_score = combined_similarity(similarity,
221
- sentence_encoding[0],
222
- query)
223
  sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
224
  sentence_scores.append((combined_score, sentence_encoding[0]))
225
 
 
27
  # st.write("This is the first file upload detected.")
28
  st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
29
  return True
30
+ def add_commonality_to_similarity_score(similarity, sentence, query):
31
  # Tokenize both the sentence and the query
32
  # sentence_words = set(sentence.split())
33
  # query_words = set(query.split())
 
126
  # Display the styled text
127
  st.markdown(big_text, unsafe_allow_html=True)
128
 
129
# Matches a line that begins with a list marker ("1.", "a.", "*" or "-")
# followed by whitespace and some text.
_LIST_ITEM_PATTERN = re.compile(r'^\s*((?:\d+\.|[a-zA-Z]\.|[*-])\s+.+)', re.MULTILINE)
# Matches two newlines separated only by whitespace, so that a "blank" line
# containing spaces is treated the same as a truly empty line.
_BLANK_LINE_PATTERN = re.compile(r'\n\s+\n')
# Matches the run of two or more newlines that separates paragraphs.
_PARAGRAPH_BREAK_PATTERN = re.compile(r'\n{2,}')


def convert_pdf_to_paragraph_list(doc, start_page: int = 1) -> list:
    """Split the text blocks of a PDF document into a flat list of paragraphs.

    Args:
        doc: An opened document that supports ``len(doc)`` and
            ``doc.load_page(index)``; each page must provide
            ``get_text("blocks")`` returning sequences whose element at
            index 4 is the block's text.
        start_page: 1-based number of the first page to process. Values
            below 1 are treated as 1. Defaults to 1 (the whole document).

    Returns:
        A list of dicts, in reading order, each with the keys:

        * ``"paragraph"``: the paragraph text.
        * ``"containsList"``: ``True`` when the block the paragraph came
          from contains a line that looks like a list item.
        * ``"page_num"``: 0-based index of the page the block is on.
        * ``"text"``: the full (normalised) block text when the block was
          split into several paragraphs, otherwise ``None``.
    """
    paragraphs = []
    first_index = max(start_page - 1, 0)  # convert to a 0-based page index

    for page_num in range(first_index, len(doc)):
        page = doc.load_page(page_num)

        for block in page.get_text("blocks"):
            # Only the text slot is needed; indexing instead of unpacking
            # avoids depending on the exact length of the block tuple.
            # NOTE(review): image blocks, if the extractor emits them, are
            # not filtered out here -- confirm against PyMuPDF's flags.
            text = block[4].strip()
            if not text:
                continue

            text = _BLANK_LINE_PATTERN.sub('\n\n', text)
            contains_list = _LIST_ITEM_PATTERN.search(text) is not None

            pieces = _PARAGRAPH_BREAK_PATTERN.split(text)
            if len(pieces) > 1:
                # The block holds several paragraphs: emit each non-blank
                # piece and keep the whole block text alongside it.
                for piece in pieces:
                    if piece.strip():
                        paragraphs.append({
                            "paragraph": piece,
                            "containsList": contains_list,
                            "page_num": page_num,
                            "text": text,
                        })
            else:
                paragraphs.append({
                    "paragraph": text,
                    "containsList": contains_list,
                    "page_num": page_num,
                    "text": None,
                })

    return paragraphs
165
+
166
  uploaded_pdf_file = st.file_uploader("Upload a PDF file",
167
  type=['pdf'])
168
  st.markdown(
 
189
  st.session_state.uploaded_path=os.path.join(save_path, uploaded_pdf_file.name)
190
  # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
191
  # print("page_count=",st.session_state.page_count)
192
+
193
  doc = fitz.open(st.session_state.uploaded_path)
194
+
195
+ st.session_state.restored_paragraphs=convert_pdf_to_paragraph_list(doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  if isinstance(st.session_state.restored_paragraphs, list):
197
  # Count the restored_paragraphs of top-level elements
198
  st.session_state.list_count = len(st.session_state.restored_paragraphs)
 
225
  for sentence_encoding in paragraph_sentence_encoding[1]:
226
  if sentence_encoding:
227
  similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
228
+ combined_score, similarity_score, commonality_score = add_commonality_to_similarity_score(similarity,
229
+ sentence_encoding[0],
230
+ query)
231
  sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
232
  sentence_scores.append((combined_score, sentence_encoding[0]))
233