dtcda

Sleeping

App Files Files Community

zmbfeng committed on Sep 20

Commit

49fa3c4

•

1 Parent(s): 7cb50e9

get similarity scores refactored

Browse files

Files changed (1) hide show

app.py +48 -43

app.py CHANGED Viewed

@@ -199,6 +199,51 @@ if uploaded_pdf_file is not None:
             st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
         st.rerun()
 if 'paragraph_sentence_encodings' in st.session_state:
     query = st.text_input("Enter your query")
@@ -209,55 +254,15 @@ if 'paragraph_sentence_encodings' in st.session_state:
             query_encoding = encode_sentence(query)
             paragraph_scores = []
-            sentence_scores = []
             total_count = len(st.session_state.paragraph_sentence_encodings)
             processing_progress_bar = st.progress(0)
-            for index, paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
-                progress_percentage = index / (total_count - 1)
-                processing_progress_bar.progress(progress_percentage)
-                sentence_similarities = []
-                for sentence_encoding in paragraph_sentence_encoding[1]:
-                    if sentence_encoding:
-                        similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
-                        combined_score, similarity_score, commonality_score = add_commonality_to_similarity_score(similarity,
-                                                                                                                  sentence_encoding[0],
-                                                                                                                  query)
-                        sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
-                        sentence_scores.append((combined_score, sentence_encoding[0]))
-                sentence_similarities.sort(reverse=True, key=lambda x: x[0])
-                # print(sentence_similarities)
-                if len(sentence_similarities) >= 3:
-                    top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
-                    top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
-                    top_three_sentences = sentence_similarities[:3]
-                elif sentence_similarities:
-                    top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities])
-                    top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities])
-                    top_three_sentences = sentence_similarities
-                else:
-                    top_three_avg_similarity = 0
-                    top_three_avg_commonality = 0
-                    top_three_sentences = []
-                # print(f"top_three_sentences={top_three_sentences}")
-                # top_three_texts = [s[1] for s in top_three_sentences]
-                # remaining_texts = [s[0] for s in paragraph_sentence_encoding[1] if s and s[0] not in top_three_texts]
-                # reordered_paragraph = top_three_texts + remaining_texts
-                #
-                # original_paragraph = ' '.join([s[0] for s in paragraph_sentence_encoding[1] if s])
-                # modified_paragraph = ' '.join(reordered_paragraph)
-                paragraph_scores.append(
-                    (top_three_avg_similarity, top_three_avg_commonality,
-                     {'top_three_sentences': top_three_sentences, 'original_text': paragraph_sentence_encoding[0]})
-                )
-            sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
             st.session_state.paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)
         if 'paragraph_scores' in st.session_state:

             st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
         st.rerun()
def find_sentences_scores(paragraph_sentence_encodings, query_encoding, processing_progress_bar, total_count):
    """Score every sentence of every paragraph against the query encoding.

    For each paragraph, every truthy sentence encoding is compared to
    ``query_encoding`` with ``cosine_similarity`` and the result is passed
    through ``add_commonality_to_similarity_score``. The (up to) three
    best-scoring sentences of the paragraph are averaged and appended to the
    module-level ``paragraph_scores`` list.

    NOTE(review): this function reads the module-level name ``query`` and
    appends to the module-level list ``paragraph_scores``; both must exist
    before it is called. Consider passing them in explicitly.

    Args:
        paragraph_sentence_encodings: Iterable of paragraph entries. Element
            ``[0]`` of an entry is stored as ``'original_text'``; element
            ``[1]`` is an iterable of sentence encodings. A truthy sentence
            encoding provides the sentence at ``[0]`` and its encoding at
            ``[1]``; falsy sentence encodings are skipped.
        query_encoding: Encoding of the query, passed to ``cosine_similarity``.
        processing_progress_bar: Object whose ``progress(value)`` method is
            called once per paragraph with a fraction in ``[0, 1]``.
        total_count: Number of paragraphs, used to compute the progress
            fraction.

    Returns:
        A list of ``(combined_score, sentence)`` tuples covering every scored
        sentence, sorted by ``combined_score`` in descending order.
    """
    sentence_scores = []
    for index, paragraph_sentence_encoding in enumerate(paragraph_sentence_encodings):
        # With a single paragraph ``total_count - 1`` is zero; report the bar
        # as complete instead of dividing by zero.
        progress_percentage = index / (total_count - 1) if total_count > 1 else 1.0
        processing_progress_bar.progress(progress_percentage)

        sentence_similarities = []
        for sentence_encoding in paragraph_sentence_encoding[1]:
            if not sentence_encoding:
                continue
            similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
            combined_score, _, commonality_score = add_commonality_to_similarity_score(
                similarity, sentence_encoding[0], query)
            sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
            sentence_scores.append((combined_score, sentence_encoding[0]))
        sentence_similarities.sort(reverse=True, key=lambda x: x[0])

        # Slicing covers all cases: three or more sentences, one or two
        # sentences (slice is the whole list), and none (empty slice).
        top_three_sentences = sentence_similarities[:3]
        if top_three_sentences:
            top_three_avg_similarity = np.mean([s[0] for s in top_three_sentences])
            top_three_avg_commonality = np.mean([s[2] for s in top_three_sentences])
        else:
            top_three_avg_similarity = 0
            top_three_avg_commonality = 0

        paragraph_scores.append(
            (top_three_avg_similarity, top_three_avg_commonality,
             {'top_three_sentences': top_three_sentences, 'original_text': paragraph_sentence_encoding[0]})
        )

    # The caller assigns the result, so the sorted list must be returned.
    return sorted(sentence_scores, key=lambda x: x[0], reverse=True)
 if 'paragraph_sentence_encodings' in st.session_state:
     query = st.text_input("Enter your query")
             query_encoding = encode_sentence(query)
             paragraph_scores = []
             total_count = len(st.session_state.paragraph_sentence_encodings)
             processing_progress_bar = st.progress(0)
+            sentence_scores = find_sentences_scores(
+                st.session_state.paragraph_sentence_encodings, query_encoding, processing_progress_bar,total_count)
             st.session_state.paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)
         if 'paragraph_scores' in st.session_state: