Preliminary similarity finding is working
Browse files
app.py
CHANGED
@@ -41,33 +41,7 @@ def combined_similarity(similarity, sentence, query):
|
|
41 |
return combined_score,similarity,(common_words / max(len(query_words), 1))
|
42 |
|
43 |
|
44 |
-
|
45 |
-
text = "paraphrase: " + sentence + " </s>"
|
46 |
-
|
47 |
-
encoding = st.session_state.paraphrase_tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
|
48 |
-
input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")
|
49 |
-
|
50 |
-
|
51 |
-
outputs = st.session_state.paraphrase_model.generate(
|
52 |
-
input_ids=input_ids, attention_mask=attention_masks,
|
53 |
-
max_length=256,
|
54 |
-
do_sample=True,
|
55 |
-
top_k=120,
|
56 |
-
top_p=0.95,
|
57 |
-
#early_stopping=True,
|
58 |
-
early_stopping=False,
|
59 |
-
#num_return_sequences=5,
|
60 |
-
num_return_sequences=1,
|
61 |
-
repetition_penalty=1.5
|
62 |
-
|
63 |
-
)
|
64 |
-
# print(f"outputs = {outputs}")
|
65 |
-
results=[]
|
66 |
-
for output in outputs:
|
67 |
-
print("*")
|
68 |
-
line = st.session_state.paraphrase_tokenizer.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
|
69 |
-
#results.append(line)
|
70 |
-
return line
|
71 |
|
72 |
if 'is_initialized' not in st.session_state:
|
73 |
st.session_state['is_initialized'] = True
|
@@ -81,9 +55,8 @@ if 'is_initialized' not in st.session_state:
|
|
81 |
st.session_state.stop_words = set(stop_words_list)
|
82 |
st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
|
83 |
st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
|
84 |
-
|
85 |
-
|
86 |
-
print(str(st.session_state.paraphrase_model ))
|
87 |
if 'list_count' in st.session_state:
|
88 |
st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
|
89 |
if 'paragraph_sentence_encodings' not in st.session_state:
|
@@ -98,7 +71,8 @@ if 'list_count' in st.session_state:
|
|
98 |
read_progress_bar.progress(progress_percentage)
|
99 |
|
100 |
sentence_encodings = []
|
101 |
-
|
|
|
102 |
for sentence in sentences:
|
103 |
if sentence.strip().endswith('?'):
|
104 |
sentence_encodings.append(None)
|
@@ -134,8 +108,6 @@ st.markdown("sample queries for above file: <br/> what does nontechnical manager
|
|
134 |
if uploaded_json_file is not None:
|
135 |
if is_new_file_upload(uploaded_json_file):
|
136 |
print("is new file uploaded")
|
137 |
-
if 'paraphrased_paragrpahs' in st.session_state:
|
138 |
-
del st.session_state['paraphrased_paragrpahs']
|
139 |
if 'prev_query' in st.session_state:
|
140 |
del st.session_state['prev_query']
|
141 |
if 'paragraph_sentence_encodings' in st.session_state:
|
@@ -170,8 +142,6 @@ if 'paragraph_sentence_encodings' in st.session_state:
|
|
170 |
if query:
|
171 |
if 'prev_query' not in st.session_state or st.session_state.prev_query != query:
|
172 |
st.session_state.prev_query = query
|
173 |
-
if 'paraphrased_paragrpahs' in st.session_state:
|
174 |
-
del st.session_state['paraphrased_paragrpahs']
|
175 |
query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
|
176 |
'cuda')
|
177 |
with torch.no_grad(): # Disable gradient calculation for inference
|
@@ -198,7 +168,7 @@ if 'paragraph_sentence_encodings' in st.session_state:
|
|
198 |
sentence_scores.append((combined_score, sentence_encoding[0]))
|
199 |
|
200 |
sentence_similarities.sort(reverse=True, key=lambda x: x[0])
|
201 |
-
|
202 |
if len(sentence_similarities) >= 3:
|
203 |
top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
|
204 |
top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
|
@@ -232,24 +202,12 @@ if 'paragraph_sentence_encodings' in st.session_state:
|
|
232 |
|
233 |
if 'paragraph_scores' in st.session_state:
|
234 |
|
235 |
-
if "paraphrased_paragrpahs" not in st.session_state:
|
236 |
-
st.session_state.paraphrased_paragrpahs = []
|
237 |
-
processing_progress_bar=st.progress(0)
|
238 |
-
for i, (similarity_score, commonality_score, paragraph) in enumerate(st.session_state.paragraph_scores[:5]):
|
239 |
-
|
240 |
|
241 |
-
output_1 = paraphrase(paragraph['modified_text'])
|
242 |
-
# print(output_1)
|
243 |
-
output_2 = paraphrase(output_1)
|
244 |
-
# print(output_2)
|
245 |
-
st.session_state.paraphrased_paragrpahs.append(output_2)
|
246 |
-
processing_progress_bar.progress(i / (len(st.session_state.paragraph_scores[:5]) - 1))
|
247 |
st.write("Top scored paragraphs and their scores:")
|
248 |
for i, (similarity_score, commonality_score, paragraph) in enumerate(
|
249 |
st.session_state.paragraph_scores[:5]):
|
250 |
-
st.write("
|
251 |
-
|
252 |
-
|
253 |
-
st.write("Original Paragraph: ", paragraph['original_text'])
|
254 |
# st.write("Modified Paragraph: ", paragraph['modified_text'])
|
255 |
|
|
|
41 |
return combined_score,similarity,(common_words / max(len(query_words), 1))
|
42 |
|
43 |
|
44 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
if 'is_initialized' not in st.session_state:
|
47 |
st.session_state['is_initialized'] = True
|
|
|
55 |
st.session_state.stop_words = set(stop_words_list)
|
56 |
st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", )
|
57 |
st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased", ).to('cuda')
|
58 |
+
|
59 |
+
|
|
|
60 |
if 'list_count' in st.session_state:
|
61 |
st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count }')
|
62 |
if 'paragraph_sentence_encodings' not in st.session_state:
|
|
|
71 |
read_progress_bar.progress(progress_percentage)
|
72 |
|
73 |
sentence_encodings = []
|
74 |
+
paragraph_without_newline= paragraph['paragraph'].replace("\n", "")
|
75 |
+
sentences = sent_tokenize(paragraph_without_newline)
|
76 |
for sentence in sentences:
|
77 |
if sentence.strip().endswith('?'):
|
78 |
sentence_encodings.append(None)
|
|
|
108 |
if uploaded_json_file is not None:
|
109 |
if is_new_file_upload(uploaded_json_file):
|
110 |
print("is new file uploaded")
|
|
|
|
|
111 |
if 'prev_query' in st.session_state:
|
112 |
del st.session_state['prev_query']
|
113 |
if 'paragraph_sentence_encodings' in st.session_state:
|
|
|
142 |
if query:
|
143 |
if 'prev_query' not in st.session_state or st.session_state.prev_query != query:
|
144 |
st.session_state.prev_query = query
|
|
|
|
|
145 |
query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(
|
146 |
'cuda')
|
147 |
with torch.no_grad(): # Disable gradient calculation for inference
|
|
|
168 |
sentence_scores.append((combined_score, sentence_encoding[0]))
|
169 |
|
170 |
sentence_similarities.sort(reverse=True, key=lambda x: x[0])
|
171 |
+
# print(sentence_similarities)
|
172 |
if len(sentence_similarities) >= 3:
|
173 |
top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
|
174 |
top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
|
|
|
202 |
|
203 |
if 'paragraph_scores' in st.session_state:
|
204 |
|
|
|
|
|
|
|
|
|
|
|
205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
st.write("Top scored paragraphs and their scores:")
|
207 |
for i, (similarity_score, commonality_score, paragraph) in enumerate(
|
208 |
st.session_state.paragraph_scores[:5]):
|
209 |
+
st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")
|
210 |
+
st.write("Original Paragraph: ", paragraph['original_text'])
|
211 |
+
#Member will be considered Actively at Work if he or she is able and available for active
|
|
|
212 |
# st.write("Modified Paragraph: ", paragraph['modified_text'])
|
213 |
|