dtcda

Sleeping

App Files Files Community

zmbfeng committed on Sep 17

Commit

1d745e5

•

1 Parent(s): 2917b3d

preliminary finding similarities working

Browse files

Files changed (1) hide show

app.py +9 -51

app.py CHANGED Viewed

@@ -41,33 +41,7 @@ def combined_similarity(similarity, sentence, query):
     return combined_score,similarity,(common_words / max(len(query_words), 1))
-def paraphrase(sentence):
-  text =  "paraphrase: " + sentence + " </s>"
-  encoding = st.session_state.paraphrase_tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
-  input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
-  outputs = st.session_state.paraphrase_model.generate(
-      input_ids=input_ids, attention_mask=attention_masks,
-      max_length=256,
-      do_sample=True,
-      top_k=120,
-      top_p=0.95,
-      #early_stopping=True,
-      early_stopping=False,
-      #num_return_sequences=5,
-      num_return_sequences=1,
-      repetition_penalty=1.5
-  )
-  # print(f"outputs = {outputs}")
-  results=[]
-  for output in outputs:
-    print("*")
-    line = st.session_state.paraphrase_tokenizer.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
-    #results.append(line)
-  return line
 if 'is_initialized' not in st.session_state:
     st.session_state['is_initialized'] = True
@@ -81,9 +55,8 @@ if 'is_initialized' not in st.session_state:
     st.session_state.stop_words = set(stop_words_list)
     st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
     st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
-    st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
-    st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')
-    print(str(st.session_state.paraphrase_model ))
 if 'list_count' in st.session_state:
     st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
     if 'paragraph_sentence_encodings' not in st.session_state:
@@ -98,7 +71,8 @@ if 'list_count' in st.session_state:
             read_progress_bar.progress(progress_percentage)
             sentence_encodings = []
-            sentences = sent_tokenize(paragraph['paragraph'])
             for sentence in sentences:
                 if sentence.strip().endswith('?'):
                     sentence_encodings.append(None)
@@ -134,8 +108,6 @@ st.markdown("sample queries for above file: <br/> what does nontechnical manager
 if uploaded_json_file is not None:
     if is_new_file_upload(uploaded_json_file):
         print("is new file uploaded")
-        if 'paraphrased_paragrpahs' in st.session_state:
-            del st.session_state['paraphrased_paragrpahs']
         if 'prev_query' in st.session_state:
             del st.session_state['prev_query']
         if 'paragraph_sentence_encodings' in st.session_state:
@@ -170,8 +142,6 @@ if 'paragraph_sentence_encodings' in st.session_state:
     if query:
         if 'prev_query' not in st.session_state or st.session_state.prev_query != query:
             st.session_state.prev_query = query
-            if 'paraphrased_paragrpahs' in st.session_state:
-                del st.session_state['paraphrased_paragrpahs']
             query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
                 'cuda')
             with torch.no_grad():  # Disable gradient calculation for inference
@@ -198,7 +168,7 @@ if 'paragraph_sentence_encodings' in st.session_state:
                         sentence_scores.append((combined_score, sentence_encoding[0]))
                 sentence_similarities.sort(reverse=True, key=lambda x: x[0])
                 if len(sentence_similarities) >= 3:
                     top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
                     top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
@@ -232,24 +202,12 @@ if 'paragraph_sentence_encodings' in st.session_state:
         if 'paragraph_scores' in st.session_state:
-            if "paraphrased_paragrpahs" not in st.session_state:
-                st.session_state.paraphrased_paragrpahs = []
-                processing_progress_bar=st.progress(0)
-                for i, (similarity_score, commonality_score, paragraph) in enumerate(st.session_state.paragraph_scores[:5]):
-                    output_1 = paraphrase(paragraph['modified_text'])
-                    # print(output_1)
-                    output_2 = paraphrase(output_1)
-                    # print(output_2)
-                    st.session_state.paraphrased_paragrpahs.append(output_2)
-                    processing_progress_bar.progress(i / (len(st.session_state.paragraph_scores[:5]) - 1))
             st.write("Top scored paragraphs and their scores:")
             for i, (similarity_score, commonality_score, paragraph) in enumerate(
                     st.session_state.paragraph_scores[:5]):
-                st.write("Paraphrased Paragraph: ", st.session_state.paraphrased_paragrpahs[i])
-                if st.button(f"Show Original Paragraph {i + 1}", key=f"button_{i}"):
-                    st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
-                    st.write("Original Paragraph: ", paragraph['original_text'])
                 # st.write("Modified Paragraph: ", paragraph['modified_text'])

     return combined_score,similarity,(common_words / max(len(query_words), 1))
 if 'is_initialized' not in st.session_state:
     st.session_state['is_initialized'] = True
     st.session_state.stop_words = set(stop_words_list)
     st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
     st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
 if 'list_count' in st.session_state:
     st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
     if 'paragraph_sentence_encodings' not in st.session_state:
             read_progress_bar.progress(progress_percentage)
             sentence_encodings = []
+            paragraph_without_newline= paragraph['paragraph'].replace("\n", "")
+            sentences = sent_tokenize(paragraph_without_newline)
             for sentence in sentences:
                 if sentence.strip().endswith('?'):
                     sentence_encodings.append(None)
 if uploaded_json_file is not None:
     if is_new_file_upload(uploaded_json_file):
         print("is new file uploaded")
         if 'prev_query' in st.session_state:
             del st.session_state['prev_query']
         if 'paragraph_sentence_encodings' in st.session_state:
     if query:
         if 'prev_query' not in st.session_state or st.session_state.prev_query != query:
             st.session_state.prev_query = query
             query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
                 'cuda')
             with torch.no_grad():  # Disable gradient calculation for inference
                         sentence_scores.append((combined_score, sentence_encoding[0]))
                 sentence_similarities.sort(reverse=True, key=lambda x: x[0])
+                # print(sentence_similarities)
                 if len(sentence_similarities) >= 3:
                     top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
                     top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
         if 'paragraph_scores' in st.session_state:
             st.write("Top scored paragraphs and their scores:")
             for i, (similarity_score, commonality_score, paragraph) in enumerate(
                     st.session_state.paragraph_scores[:5]):
+                st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
+                st.write("Original Paragraph: ", paragraph['original_text'])
+                #Member will be considered Actively at Work if he or she is able and available for active
                 # st.write("Modified Paragraph: ", paragraph['modified_text'])