import streamlit as st
import os
import json

from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,
    BertTokenizer, BertModel,
    T5Tokenizer, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForSeq2SeqLM,
)

import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

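# This Streamlit app implements a small retrieve-and-rewrite pipeline:
#   1. the user uploads a pre-processed JSON file of paragraphs,
#   2. every sentence is embedded with BERT (the [CLS] vector),
#   3. a query is embedded the same way and scored against each sentence,
#   4. the best-scoring paragraphs are reordered and paraphrased with T5.
#
# The upload is expected to be a JSON list of objects that each carry a
# "text" field (only "text" is read below), for example:
#   [{"text": "First paragraph ..."}, {"text": "Second paragraph ..."}]
#
# Launch with Streamlit (the file name here is only an example):
#   streamlit run knowledge_extraction_app.py
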
def is_new_file_upload(uploaded_file):
    if 'last_uploaded_file' in st.session_state:
        if (uploaded_file.name != st.session_state.last_uploaded_file['name'] or
                uploaded_file.size != st.session_state.last_uploaded_file['size']):
            st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
            return True
        else:
            return False
    else:
        st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
        return True


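# combined_similarity() mixes two signals: the BERT cosine similarity and a
# word-overlap ("commonality") term, the fraction of non-stopword query words
# that also appear in the sentence. For example (illustrative numbers only):
# a cosine similarity of 0.62 with 2 of 4 query words present gives
# 0.62 + 2/4 = 1.12 as the combined score.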
def combined_similarity(similarity, sentence, query):
    sentence_words = set(word for word in sentence.split() if word.lower() not in st.session_state.stop_words)
    query_words = set(word for word in query.split() if word.lower() not in st.session_state.stop_words)

    common_words = len(sentence_words.intersection(query_words))
    commonality = common_words / max(len(query_words), 1)

    combined_score = similarity + commonality
    return combined_score, similarity, commonality


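# paraphrase() rewrites a passage with the Vamsi/T5_Paraphrase_Paws
# sequence-to-sequence model. Generation uses sampling (do_sample=True with
# top-k/top-p), so repeated calls on the same input can return different text.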
def paraphrase(sentence):
    text = "paraphrase: " + sentence + " </s>"

    encoding = st.session_state.paraphrase_tokenizer.encode_plus(text, padding=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")

    outputs = st.session_state.paraphrase_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=256,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=False,
        num_return_sequences=1,
        repetition_penalty=1.5
    )

    # Only one sequence is requested, so decode and return the first output.
    return st.session_state.paraphrase_tokenizer.decode(outputs[0], skip_special_tokens=True,
                                                        clean_up_tokenization_spaces=True)


big_text = """
<div style='text-align: center;'>
    <h1 style='font-size: 30px;'>Knowledge Extraction A</h1>
</div>
"""

st.markdown(big_text, unsafe_allow_html=True)

uploaded_json_file = st.file_uploader("Upload a pre-processed file", type=['json'])
st.markdown(
    '<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1: download, then upload it above</a>',
    unsafe_allow_html=True)
st.markdown("Sample queries for the above file:<br/> What is death? What is a lucid dream? What is the seat of consciousness?", unsafe_allow_html=True)
st.markdown(
    '<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2: download, then upload it above</a>',
    unsafe_allow_html=True)
st.markdown("Sample queries for the above file:<br/> What do nontechnical managers worry about? What if you put all the knowledge, frameworks, and tips from this book to full use? Tell me about AI agents.", unsafe_allow_html=True)

if uploaded_json_file is not None:
    if is_new_file_upload(uploaded_json_file):
        print("new file uploaded")
        save_path = './uploaded_files'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with open(os.path.join(save_path, uploaded_json_file.name), "wb") as f:
            f.write(uploaded_json_file.getbuffer())
        st.success(f'Saved file {uploaded_json_file.name} in {save_path}')
        st.session_state.uploaded_path = os.path.join(save_path, uploaded_json_file.name)

        content = uploaded_json_file.read()
        try:
            st.session_state.restored_paragraphs = json.loads(content)
            if isinstance(st.session_state.restored_paragraphs, list):
                st.session_state.list_count = len(st.session_state.restored_paragraphs)
                st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
            else:
                st.write('The JSON content is not a list.')
        except json.JSONDecodeError:
            st.write('Invalid JSON file.')
        st.rerun()

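# One-time initialization: NLTK data and the models are loaded once and cached
# in st.session_state so Streamlit reruns do not reload them. Both BERT and
# the paraphrase model are moved to 'cuda', so a CUDA-capable GPU is assumed
# throughout this script.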
if 'is_initialized' not in st.session_state:
    st.session_state['is_initialized'] = True

    nltk.download('punkt')
    nltk.download('punkt_tab')  # sent_tokenize also needs this on newer NLTK releases
    nltk.download('stopwords')
    st.session_state.stop_words = set(stopwords.words('english'))
    st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased").to('cuda')
    st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
    st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')

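# Embedding pass: each uploaded paragraph is split into sentences, and every
# sentence (except questions and fragments shorter than 4 characters) is
# encoded as the BERT [CLS] vector of its last hidden state.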
if 'list_count' in st.session_state:
    st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
    if 'paragraph_sentence_encodings' not in st.session_state:
        print("start embedding paragraphs")
        read_progress_bar = st.progress(0)
        st.session_state.paragraph_sentence_encodings = []
        for index, paragraph in enumerate(st.session_state.restored_paragraphs):
            progress_percentage = index / max(st.session_state.list_count - 1, 1)
            read_progress_bar.progress(progress_percentage)

            sentence_encodings = []
            sentences = sent_tokenize(paragraph['text'])
            for sentence in sentences:
                # Skip questions and very short fragments; keep a None placeholder
                # so the sentence order within the paragraph is preserved.
                if sentence.strip().endswith('?'):
                    sentence_encodings.append(None)
                    continue
                if len(sentence.strip()) < 4:
                    sentence_encodings.append(None)
                    continue
                sentence_tokens = st.session_state.bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to('cuda')
                with torch.no_grad():
                    sentence_encoding = st.session_state.bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
                sentence_encodings.append([sentence, sentence_encoding])

            st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
        st.rerun()

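# Query pass: the query is embedded the same way, every stored sentence is
# scored with combined_similarity(), and each paragraph is summarized by the
# average of its top three sentence scores. The best paragraphs are shown
# reordered (top sentences first) and paraphrased twice with the T5 model.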
if 'paragraph_sentence_encodings' in st.session_state:
    query = st.text_input("Enter your query")

    if query:
        query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to('cuda')
        with torch.no_grad():
            query_encoding = st.session_state.bert_model(**query_tokens).last_hidden_state[:, 0, :].cpu().numpy()

        paragraph_scores = []
        sentence_scores = []
        total_count = len(st.session_state.paragraph_sentence_encodings)
        processing_progress_bar = st.progress(0)

        for index, paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
            progress_percentage = index / max(total_count - 1, 1)
            processing_progress_bar.progress(progress_percentage)

            sentence_similarities = []
            for sentence_encoding in paragraph_sentence_encoding[1]:
                if sentence_encoding:
                    similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
                    combined_score, similarity_score, commonality_score = combined_similarity(similarity,
                                                                                              sentence_encoding[0],
                                                                                              query)
                    sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
                    sentence_scores.append((combined_score, sentence_encoding[0]))

            sentence_similarities.sort(reverse=True, key=lambda x: x[0])

            if len(sentence_similarities) >= 3:
                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
                top_three_sentences = sentence_similarities[:3]
            elif sentence_similarities:
                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities])
                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities])
                top_three_sentences = sentence_similarities
            else:
                top_three_avg_similarity = 0
                top_three_avg_commonality = 0
                top_three_sentences = []

            # Move the top-scoring sentences to the front of the paragraph.
            top_three_texts = [s[1] for s in top_three_sentences]
            remaining_texts = [s[0] for s in paragraph_sentence_encoding[1] if s and s[0] not in top_three_texts]
            reordered_paragraph = top_three_texts + remaining_texts

            original_paragraph = ' '.join([s[0] for s in paragraph_sentence_encoding[1] if s])
            modified_paragraph = ' '.join(reordered_paragraph)

            paragraph_scores.append(
                (top_three_avg_similarity, top_three_avg_commonality,
                 {'modified_text': modified_paragraph, 'original_text': original_paragraph})
            )

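        # Rank sentences and paragraphs by combined score and report the five best
        # paragraphs, each paraphrased twice alongside its reordered and original text.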
        sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
        paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)

        st.write("Top scored paragraphs and their scores:")
        for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
            st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")

            output_1 = paraphrase(paragraph['modified_text'])
            print(output_1)

            output_2 = paraphrase(output_1)
            print(output_2)
            st.write("Paraphrased Paragraph: ", output_2)
            st.write("Modified Paragraph: ", paragraph['modified_text'])
            st.write("Original Paragraph: ", paragraph['original_text'])