zmbfeng committed on
Commit
1d745e5
1 Parent(s): 2917b3d

preliminary finding similarities working

Browse files
Files changed (1) hide show
  1. app.py +9 -51
app.py CHANGED
@@ -41,33 +41,7 @@ def combined_similarity(similarity, sentence, query):
41
  return combined_score,similarity,(common_words / max(len(query_words), 1))
42
 
43
 
44
- def paraphrase(sentence):
45
- text = "paraphrase: " + sentence + " </s>"
46
-
47
- encoding = st.session_state.paraphrase_tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
48
- input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
49
-
50
-
51
- outputs = st.session_state.paraphrase_model.generate(
52
- input_ids=input_ids, attention_mask=attention_masks,
53
- max_length=256,
54
- do_sample=True,
55
- top_k=120,
56
- top_p=0.95,
57
- #early_stopping=True,
58
- early_stopping=False,
59
- #num_return_sequences=5,
60
- num_return_sequences=1,
61
- repetition_penalty=1.5
62
-
63
- )
64
- # print(f"outputs = {outputs}")
65
- results=[]
66
- for output in outputs:
67
- print("*")
68
- line = st.session_state.paraphrase_tokenizer.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
69
- #results.append(line)
70
- return line
71
 
72
  if 'is_initialized' not in st.session_state:
73
  st.session_state['is_initialized'] = True
@@ -81,9 +55,8 @@ if 'is_initialized' not in st.session_state:
81
  st.session_state.stop_words = set(stop_words_list)
82
  st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
83
  st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
84
- st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
85
- st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')
86
- print(str(st.session_state.paraphrase_model ))
87
  if 'list_count' in st.session_state:
88
  st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
89
  if 'paragraph_sentence_encodings' not in st.session_state:
@@ -98,7 +71,8 @@ if 'list_count' in st.session_state:
98
  read_progress_bar.progress(progress_percentage)
99
 
100
  sentence_encodings = []
101
- sentences = sent_tokenize(paragraph['paragraph'])
 
102
  for sentence in sentences:
103
  if sentence.strip().endswith('?'):
104
  sentence_encodings.append(None)
@@ -134,8 +108,6 @@ st.markdown("sample queries for above file: <br/> what does nontechnical manager
134
  if uploaded_json_file is not None:
135
  if is_new_file_upload(uploaded_json_file):
136
  print("is new file uploaded")
137
- if 'paraphrased_paragrpahs' in st.session_state:
138
- del st.session_state['paraphrased_paragrpahs']
139
  if 'prev_query' in st.session_state:
140
  del st.session_state['prev_query']
141
  if 'paragraph_sentence_encodings' in st.session_state:
@@ -170,8 +142,6 @@ if 'paragraph_sentence_encodings' in st.session_state:
170
  if query:
171
  if 'prev_query' not in st.session_state or st.session_state.prev_query != query:
172
  st.session_state.prev_query = query
173
- if 'paraphrased_paragrpahs' in st.session_state:
174
- del st.session_state['paraphrased_paragrpahs']
175
  query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
176
  'cuda')
177
  with torch.no_grad(): # Disable gradient calculation for inference
@@ -198,7 +168,7 @@ if 'paragraph_sentence_encodings' in st.session_state:
198
  sentence_scores.append((combined_score, sentence_encoding[0]))
199
 
200
  sentence_similarities.sort(reverse=True, key=lambda x: x[0])
201
-
202
  if len(sentence_similarities) >= 3:
203
  top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
204
  top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
@@ -232,24 +202,12 @@ if 'paragraph_sentence_encodings' in st.session_state:
232
 
233
  if 'paragraph_scores' in st.session_state:
234
 
235
- if "paraphrased_paragrpahs" not in st.session_state:
236
- st.session_state.paraphrased_paragrpahs = []
237
- processing_progress_bar=st.progress(0)
238
- for i, (similarity_score, commonality_score, paragraph) in enumerate(st.session_state.paragraph_scores[:5]):
239
-
240
 
241
- output_1 = paraphrase(paragraph['modified_text'])
242
- # print(output_1)
243
- output_2 = paraphrase(output_1)
244
- # print(output_2)
245
- st.session_state.paraphrased_paragrpahs.append(output_2)
246
- processing_progress_bar.progress(i / (len(st.session_state.paragraph_scores[:5]) - 1))
247
  st.write("Top scored paragraphs and their scores:")
248
  for i, (similarity_score, commonality_score, paragraph) in enumerate(
249
  st.session_state.paragraph_scores[:5]):
250
- st.write("Paraphrased Paragraph: ", st.session_state.paraphrased_paragrpahs[i])
251
- if st.button(f"Show Original Paragraph {i + 1}", key=f"button_{i}"):
252
- st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
253
- st.write("Original Paragraph: ", paragraph['original_text'])
254
  # st.write("Modified Paragraph: ", paragraph['modified_text'])
255
 
 
41
  return combined_score,similarity,(common_words / max(len(query_words), 1))
42
 
43
 
44
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  if 'is_initialized' not in st.session_state:
47
  st.session_state['is_initialized'] = True
 
55
  st.session_state.stop_words = set(stop_words_list)
56
  st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
57
  st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
58
+
59
+
 
60
  if 'list_count' in st.session_state:
61
  st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
62
  if 'paragraph_sentence_encodings' not in st.session_state:
 
71
  read_progress_bar.progress(progress_percentage)
72
 
73
  sentence_encodings = []
74
+ paragraph_without_newline= paragraph['paragraph'].replace("\n", "")
75
+ sentences = sent_tokenize(paragraph_without_newline)
76
  for sentence in sentences:
77
  if sentence.strip().endswith('?'):
78
  sentence_encodings.append(None)
 
108
  if uploaded_json_file is not None:
109
  if is_new_file_upload(uploaded_json_file):
110
  print("is new file uploaded")
 
 
111
  if 'prev_query' in st.session_state:
112
  del st.session_state['prev_query']
113
  if 'paragraph_sentence_encodings' in st.session_state:
 
142
  if query:
143
  if 'prev_query' not in st.session_state or st.session_state.prev_query != query:
144
  st.session_state.prev_query = query
 
 
145
  query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
146
  'cuda')
147
  with torch.no_grad(): # Disable gradient calculation for inference
 
168
  sentence_scores.append((combined_score, sentence_encoding[0]))
169
 
170
  sentence_similarities.sort(reverse=True, key=lambda x: x[0])
171
+ # print(sentence_similarities)
172
  if len(sentence_similarities) >= 3:
173
  top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
174
  top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
 
202
 
203
  if 'paragraph_scores' in st.session_state:
204
 
 
 
 
 
 
205
 
 
 
 
 
 
 
206
  st.write("Top scored paragraphs and their scores:")
207
  for i, (similarity_score, commonality_score, paragraph) in enumerate(
208
  st.session_state.paragraph_scores[:5]):
209
+ st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
210
+ st.write("Original Paragraph: ", paragraph['original_text'])
211
+ #Member will be considered Actively at Work if he or she is able and available for active
 
212
  # st.write("Modified Paragraph: ", paragraph['modified_text'])
213