import streamlit as st
import os
import json

from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,
    BertTokenizer, BertModel,
    T5Tokenizer, T5ForConditionalGeneration,
    AutoTokenizer, AutoModelForSeq2SeqLM,
)

import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

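# This Streamlit app implements a small retrieve-and-rewrite pipeline:
#   1. the user uploads a pre-processed JSON file of paragraphs,
#   2. every sentence is embedded with BERT (the [CLS] vector),
#   3. a query is embedded the same way and scored against each sentence,
#   4. the best-scoring paragraphs are reordered and paraphrased with T5.
#
# The upload is expected to be a JSON list of objects that each carry a
# "text" field (only "text" is read below), for example:
#   [{"text": "First paragraph ..."}, {"text": "Second paragraph ..."}]
#
# Launch with Streamlit (the file name here is only an example):
#   streamlit run knowledge_extraction_app.py
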
def is_new_file_upload(uploaded_file):
    if 'last_uploaded_file' in st.session_state:
        if (uploaded_file.name != st.session_state.last_uploaded_file['name'] or
                uploaded_file.size != st.session_state.last_uploaded_file['size']):
            st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
            return True
        else:
            return False
    else:
        st.session_state.last_uploaded_file = {'name': uploaded_file.name, 'size': uploaded_file.size}
        return True


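# combined_similarity() mixes two signals: the BERT cosine similarity and a
# word-overlap ("commonality") term, the fraction of non-stopword query words
# that also appear in the sentence. For example (illustrative numbers only):
# a cosine similarity of 0.62 with 2 of 4 query words present gives
# 0.62 + 2/4 = 1.12 as the combined score.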
def combined_similarity(similarity, sentence, query):
    sentence_words = set(word for word in sentence.split() if word.lower() not in st.session_state.stop_words)
    query_words = set(word for word in query.split() if word.lower() not in st.session_state.stop_words)

    common_words = len(sentence_words.intersection(query_words))
    commonality = common_words / max(len(query_words), 1)

    combined_score = similarity + commonality
    return combined_score, similarity, commonality


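# paraphrase() rewrites a passage with the Vamsi/T5_Paraphrase_Paws
# sequence-to-sequence model. Generation uses sampling (do_sample=True with
# top-k/top-p), so repeated calls on the same input can return different text.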
def paraphrase(sentence):
    text = "paraphrase: " + sentence + " </s>"

    encoding = st.session_state.paraphrase_tokenizer.encode_plus(text, padding=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to("cuda"), encoding["attention_mask"].to("cuda")

    outputs = st.session_state.paraphrase_model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=256,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=False,
        num_return_sequences=1,
        repetition_penalty=1.5
    )

    # Only one sequence is requested, so decode and return the first output.
    return st.session_state.paraphrase_tokenizer.decode(outputs[0], skip_special_tokens=True,
                                                        clean_up_tokenization_spaces=True)


big_text = """
<div style='text-align: center;'>
    <h1 style='font-size: 30px;'>Knowledge Extraction A</h1>
</div>
"""

st.markdown(big_text, unsafe_allow_html=True)

uploaded_json_file = st.file_uploader("Upload a pre-processed file", type=['json'])
st.markdown(
    '<a href="https://ikmtechnology.github.io/ikmtechnology/untethered_extracted_paragraphs.json" target="_blank">Sample 1: download, then upload it above</a>',
    unsafe_allow_html=True)
st.markdown("Sample queries for the above file:<br/> What is death? What is a lucid dream? What is the seat of consciousness?", unsafe_allow_html=True)
st.markdown(
    '<a href="https://ikmtechnology.github.io/ikmtechnology/the_business_case_for_ai_extracted_paragraphs.json" target="_blank">Sample 2: download, then upload it above</a>',
    unsafe_allow_html=True)
st.markdown("Sample queries for the above file:<br/> What do nontechnical managers worry about? What if you put all the knowledge, frameworks, and tips from this book to full use? Tell me about AI agents.", unsafe_allow_html=True)

if uploaded_json_file is not None:
    if is_new_file_upload(uploaded_json_file):
        print("new file uploaded")
        save_path = './uploaded_files'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with open(os.path.join(save_path, uploaded_json_file.name), "wb") as f:
            f.write(uploaded_json_file.getbuffer())
        st.success(f'Saved file {uploaded_json_file.name} in {save_path}')
        st.session_state.uploaded_path = os.path.join(save_path, uploaded_json_file.name)

        content = uploaded_json_file.read()
        try:
            st.session_state.restored_paragraphs = json.loads(content)
            if isinstance(st.session_state.restored_paragraphs, list):
                st.session_state.list_count = len(st.session_state.restored_paragraphs)
                st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
            else:
                st.write('The JSON content is not a list.')
        except json.JSONDecodeError:
            st.write('Invalid JSON file.')
        st.rerun()

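# One-time initialization: NLTK data and the models are loaded once and cached
# in st.session_state so Streamlit reruns do not reload them. Both BERT and
# the paraphrase model are moved to 'cuda', so a CUDA-capable GPU is assumed
# throughout this script.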
if 'is_initialized' not in st.session_state:
    st.session_state['is_initialized'] = True

    nltk.download('punkt')
    nltk.download('punkt_tab')  # sent_tokenize also needs this on newer NLTK releases
    nltk.download('stopwords')
    st.session_state.stop_words = set(stopwords.words('english'))
    st.session_state.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    st.session_state.bert_model = BertModel.from_pretrained("bert-base-uncased").to('cuda')
    st.session_state.paraphrase_tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
    st.session_state.paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to('cuda')

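# Embedding pass: each uploaded paragraph is split into sentences, and every
# sentence (except questions and fragments shorter than 4 characters) is
# encoded as the BERT [CLS] vector of its last hidden state.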
if 'list_count' in st.session_state:
    st.write(f'The number of elements at the top level of the hierarchy: {st.session_state.list_count}')
    if 'paragraph_sentence_encodings' not in st.session_state:
        print("start embedding paragraphs")
        read_progress_bar = st.progress(0)
        st.session_state.paragraph_sentence_encodings = []
        for index, paragraph in enumerate(st.session_state.restored_paragraphs):
            progress_percentage = index / max(st.session_state.list_count - 1, 1)
            read_progress_bar.progress(progress_percentage)

            sentence_encodings = []
            sentences = sent_tokenize(paragraph['text'])
            for sentence in sentences:
                # Skip questions and very short fragments; keep a None placeholder
                # so the sentence order within the paragraph is preserved.
                if sentence.strip().endswith('?'):
                    sentence_encodings.append(None)
                    continue
                if len(sentence.strip()) < 4:
                    sentence_encodings.append(None)
                    continue
                sentence_tokens = st.session_state.bert_tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to('cuda')
                with torch.no_grad():
                    sentence_encoding = st.session_state.bert_model(**sentence_tokens).last_hidden_state[:, 0, :].cpu().numpy()
                sentence_encodings.append([sentence, sentence_encoding])

            st.session_state.paragraph_sentence_encodings.append([paragraph, sentence_encodings])
        st.rerun()

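# Query pass: the query is embedded the same way, every stored sentence is
# scored with combined_similarity(), and each paragraph is summarized by the
# average of its top three sentence scores. The best paragraphs are shown
# reordered (top sentences first) and paraphrased twice with the T5 model.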
if 'paragraph_sentence_encodings' in st.session_state:
    query = st.text_input("Enter your query")

    if query:
        query_tokens = st.session_state.bert_tokenizer(query, return_tensors="pt", padding=True, truncation=True).to('cuda')
        with torch.no_grad():
            query_encoding = st.session_state.bert_model(**query_tokens).last_hidden_state[:, 0, :].cpu().numpy()

        paragraph_scores = []
        sentence_scores = []
        total_count = len(st.session_state.paragraph_sentence_encodings)
        processing_progress_bar = st.progress(0)

        for index, paragraph_sentence_encoding in enumerate(st.session_state.paragraph_sentence_encodings):
            progress_percentage = index / max(total_count - 1, 1)
            processing_progress_bar.progress(progress_percentage)

            sentence_similarities = []
            for sentence_encoding in paragraph_sentence_encoding[1]:
                if sentence_encoding:
                    similarity = cosine_similarity(query_encoding, sentence_encoding[1])[0][0]
                    combined_score, similarity_score, commonality_score = combined_similarity(similarity,
                                                                                              sentence_encoding[0],
                                                                                              query)
                    sentence_similarities.append((combined_score, sentence_encoding[0], commonality_score))
                    sentence_scores.append((combined_score, sentence_encoding[0]))

            sentence_similarities.sort(reverse=True, key=lambda x: x[0])

            if len(sentence_similarities) >= 3:
                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities[:3]])
                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities[:3]])
                top_three_sentences = sentence_similarities[:3]
            elif sentence_similarities:
                top_three_avg_similarity = np.mean([s[0] for s in sentence_similarities])
                top_three_avg_commonality = np.mean([s[2] for s in sentence_similarities])
                top_three_sentences = sentence_similarities
            else:
                top_three_avg_similarity = 0
                top_three_avg_commonality = 0
                top_three_sentences = []

            # Move the top-scoring sentences to the front of the paragraph.
            top_three_texts = [s[1] for s in top_three_sentences]
            remaining_texts = [s[0] for s in paragraph_sentence_encoding[1] if s and s[0] not in top_three_texts]
            reordered_paragraph = top_three_texts + remaining_texts

            original_paragraph = ' '.join([s[0] for s in paragraph_sentence_encoding[1] if s])
            modified_paragraph = ' '.join(reordered_paragraph)

            paragraph_scores.append(
                (top_three_avg_similarity, top_three_avg_commonality,
                 {'modified_text': modified_paragraph, 'original_text': original_paragraph})
            )

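        # Rank sentences and paragraphs by combined score and report the five best
        # paragraphs, each paraphrased twice alongside its reordered and original text.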
        sentence_scores = sorted(sentence_scores, key=lambda x: x[0], reverse=True)
        paragraph_scores = sorted(paragraph_scores, key=lambda x: x[0], reverse=True)

        st.write("Top scored paragraphs and their scores:")
        for similarity_score, commonality_score, paragraph in paragraph_scores[:5]:
            st.write(f"Similarity Score: {similarity_score}, Commonality Score: {commonality_score}")

            output_1 = paraphrase(paragraph['modified_text'])
            print(output_1)

            output_2 = paraphrase(output_1)
            print(output_2)
            st.write("Paraphrased Paragraph: ", output_2)
            st.write("Modified Paragraph: ", paragraph['modified_text'])
            st.write("Original Paragraph: ", paragraph['original_text'])