dtcda

Sleeping

App Files Files Community

zmbfeng committed on Sep 20

Commit

bd247ef

•

1 Parent(s): 3aa8d73

pdf to paragraph list refactored

Browse files

Files changed (1) hide show

app.py +44 -36

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ def is_new_file_upload(uploaded_file):
         # st.write("This is the first file upload detected.")
         st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
         return True
-def combined_similarity(similarity, sentence, query):
     # Tokenize both the sentence and the query
     # sentence_words = set(sentence.split())
     # query_words = set(query.split())
@@ -126,6 +126,43 @@ big_text = """
     # Display the styled text
 st.markdown(big_text, unsafe_allow_html=True)
 uploaded_pdf_file = st.file_uploader("Upload a PDF file",
                                      type=['pdf'])
 st.markdown(
@@ -152,39 +189,10 @@ if uploaded_pdf_file is not None:
             st.session_state.uploaded_path=os.path.join(save_path, uploaded_pdf_file.name)
             # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
             # print("page_count=",st.session_state.page_count)
         doc = fitz.open(st.session_state.uploaded_path)
-        sentence_endings = ('.', '!', '?')
-        start_page = 1
-        st.session_state.restored_paragraphs = []
-        for page_num in range(start_page - 1, len(doc)):  # start_page - 1 to adjust for 0-based index
-            page = doc.load_page(page_num)
-            blocks = page.get_text("blocks")
-            block_index = 1
-            for block in blocks:
-                x0, y0, x1, y1, text, block_type, flags = block
-                if text.strip() != "":
-                    text = text.strip()
-                    text = re.sub(r'\n\s+\n', '\n\n', text)
-                    list_pattern = re.compile(r'^\s*((?:\d+\.|[a-zA-Z]\.|[*-])\s+.+)', re.MULTILINE)
-                    match = list_pattern.search(text)
-                    containsList = False
-                    if match:
-                        containsList = True
-                        # print ("list detected")
-                    paragraph = ""
-                    if bool(re.search(r'\n{2,}', text)):
-                        substrings = re.split(r'\n{2,}', text)
-                        for substring in substrings:
-                            if substring.strip() != "":
-                                paragraph = substring
-                                st.session_state.restored_paragraphs.append(
-                                    {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": text});
-                                # print(f"<substring> {substring} </substring>")
-                    else:
-                        paragraph = text
-                        st.session_state.restored_paragraphs.append(
-                            {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": None});
         if isinstance(st.session_state.restored_paragraphs, list):
             # Count the restored_paragraphs of top-level elements
             st.session_state.list_count = len(st.session_state.restored_paragraphs)
@@ -217,9 +225,9 @@ if 'paragraph_sentence_encodings' in st.session_state:
                 for sentence_encoding in paragraph_sentence_encoding[1]:
                     if sentence_encoding:
                         similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
-                        combined_score, similarity_score, commonality_score = combined_similarity(similarity,
-                                                                                                  sentence_encoding[0],
-                                                                                                  query)
                         sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
                         sentence_scores.append((combined_score, sentence_encoding[0]))

         # st.write("This is the first file upload detected.")
         st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
         return True
+def add_commonality_to_similarity_score(similarity, sentence, query):
     # Tokenize both the sentence and the query
     # sentence_words = set(sentence.split())
     # query_words = set(query.split())
     # Display the styled text
 st.markdown(big_text, unsafe_allow_html=True)
+def convert_pdf_to_paragraph_list(doc):  # Walk every page of `doc`, split each non-empty text block into paragraphs, and return them as a list of dicts.
+    paragraphs = []  # one dict per paragraph with keys "paragraph", "containsList", "page_num", "text"
+    sentence_endings = ('.', '!', '?')  # NOTE(review): not read anywhere in this function
+    start_page = 1  # 1-based number of the first page to process
+    for page_num in range(start_page - 1, len(doc)):  # start_page - 1 to adjust for 0-based index; page_num stored below is therefore 0-based
+        page = doc.load_page(page_num)
+        blocks = page.get_text("blocks")
+        block_index = 1  # NOTE(review): assigned but never read in this function
+        for block in blocks:
+            x0, y0, x1, y1, text, block_type, flags = block  # only `text` is used below; NOTE(review): if `doc` is a PyMuPDF document, the last two items are presumably block_no, block_type — confirm the names
+            if text.strip() != "":  # skip blocks that contain only whitespace
+                text = text.strip()
+                text = re.sub(r'\n\s+\n', '\n\n', text)  # rewrite newline + whitespace run + newline as exactly two newlines
+                list_pattern = re.compile(r'^\s*((?:\d+\.|[a-zA-Z]\.|[*-])\s+.+)', re.MULTILINE)  # a line that begins with digits + ".", one letter + ".", "*" or "-", then whitespace and text
+                match = list_pattern.search(text)
+                containsList = False
+                if match:
+                    containsList = True  # computed once per block, so every paragraph split from this block carries the same flag
+                    # print ("list detected")
+                paragraph = ""
+                if bool(re.search(r'\n{2,}', text)):  # block holds at least one run of two or more newlines
+                    substrings = re.split(r'\n{2,}', text)
+                    for substring in substrings:
+                        if substring.strip() != "":  # drop pieces that are empty or whitespace-only
+                            paragraph = substring
+                            paragraphs.append(
+                                {"paragraph": paragraph, "containsList": containsList, "page_num": page_num,
+                                 "text": text});  # "text" keeps the full stripped block this paragraph was split from
+                            # print(f"<substring> {substring} </substring>")
+                else:
+                    paragraph = text  # no split needed: the whole block is a single paragraph
+                    paragraphs.append(
+                        {"paragraph": paragraph, "containsList": containsList, "page_num": page_num, "text": None});  # "text" is None when the block was not split
+    return  paragraphs
 uploaded_pdf_file = st.file_uploader("Upload a PDF file",
                                      type=['pdf'])
 st.markdown(
             st.session_state.uploaded_path=os.path.join(save_path, uploaded_pdf_file.name)
             # st.session_state.page_count = utils.get_pdf_page_count(st.session_state.uploaded_pdf_path)
             # print("page_count=",st.session_state.page_count)
         doc = fitz.open(st.session_state.uploaded_path)
+        st.session_state.restored_paragraphs=convert_pdf_to_paragraph_list(doc)
         if isinstance(st.session_state.restored_paragraphs, list):
             # Count the restored_paragraphs of top-level elements
             st.session_state.list_count = len(st.session_state.restored_paragraphs)
                 for sentence_encoding in paragraph_sentence_encoding[1]:
                     if sentence_encoding:
                         similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
+                        combined_score, similarity_score, commonality_score = add_commonality_to_similarity_score(similarity,
+                                                                                                                  sentence_encoding[0],
+                                                                                                                  query)
                         sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
                         sentence_scores.append((combined_score, sentence_encoding[0]))