# chat-with-pdf / embeddings.py
# PDF-to-embeddings pipeline: extracts text from a PDF, splits it into
# sentence chunks, and encodes the chunks with a sentence-transformers model.
import re

import fitz  # PyMuPDF
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from spacy.lang.en import English
from tqdm import tqdm


class Embeddings:
    """Extract text from a PDF, group it into sentence chunks, and embed the chunks."""

    def __init__(self, pdf_file_path: str):
        self.pdf_file_path = pdf_file_path
        self.embedding_model_name = "all-mpnet-base-v2"
        self.device = self.get_device()

    def get_device(self) -> str:
        """Return 'cuda' when a GPU is available, otherwise 'cpu'."""
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def text_formatter(self, text: str) -> str:
        """Replace newline characters ('\\n') with spaces and trim whitespace."""
        return text.replace('\n', ' ').strip()

    def count_and_split_sentence(self, text: str) -> tuple[int, list[str]]:
        """Count and split the sentences in the given text."""
        # A blank English pipeline with a rule-based sentencizer is enough for
        # sentence boundaries; no trained spaCy model is required.
        nlp = English()
        nlp.add_pipe("sentencizer")
        list_of_sentences = [str(sentence) for sentence in nlp(text).sents]
        return len(list_of_sentences), list_of_sentences
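
    # Example (hypothetical input):
    #   count_and_split_sentence("It works. It is fast.")
    #   -> (2, ["It works.", "It is fast."])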

    def open_pdf(self) -> list[dict]:
        """Read the PDF and return one dict of text and simple stats per page."""
        doc = fitz.open(self.pdf_file_path)
        data = []
        print("[INFO] Converting the pdf into a list of page dicts")
        for page_number, page in tqdm(enumerate(doc)):
            text = self.text_formatter(text=page.get_text())
            sentence_count, sentences = self.count_and_split_sentence(text)
            data.append(
                {
                    "page_number": page_number,
                    "char_count": len(text),
                    "word_count": len(text.split(" ")),
                    "sentence_count": sentence_count,
                    "token_count": len(text) / 4,  # rough estimate: 1 token ~ 4 chars
                    "sentence": sentences,
                    "text": text,
                }
            )
        return data

    def split_the_array(self, array_list: list,
                        chunk_length: int) -> list[list[str]]:
        """Split a list of sentences into consecutive chunks of at most chunk_length items."""
        return [array_list[i:i + chunk_length] for i in range(0, len(array_list), chunk_length)]
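
    # Example (hypothetical input): split_the_array(list("abcdefghijk"), 4)
    # -> [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h'], ['i', 'j', 'k']]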

    def convert_to_chunk(self, chunk_size: int = 10) -> list[dict]:
        """Group each page's sentences into chunks and collect per-chunk stats."""
        pages_and_texts = self.open_pdf()
        pages_and_chunks = []
        # group the sentences of every page into chunks of chunk_size
        print("[INFO] Splitting the sentences ")
        for item in tqdm(pages_and_texts):
            item["sentence_chunks"] = self.split_the_array(item["sentence"], chunk_size)
            item["chunk_count"] = len(item["sentence_chunks"])
        # join every chunk back into a single string and record its stats
        print("[INFO] Joining the sentence chunks ")
        for item in tqdm(pages_and_texts):
            for chunks in item["sentence_chunks"]:
                d = {}
                joined_sentence = "".join(chunks).replace("  ", " ").strip()
                joined_sentence = re.sub(r'\.([A-Z])', r'. \1', joined_sentence)  # ".A" -> ". A": restore the space after a sentence end
                # keep only chunks longer than ~30 tokens; shorter ones carry little context
                if len(joined_sentence) / 4 > 30:
                    d["page_number"] = item["page_number"]
                    d["sentence_chunk"] = joined_sentence
                    # stats
                    d["char_count"] = len(joined_sentence)
                    d["word_count"] = len(joined_sentence.split(" "))
                    d["token_count"] = len(joined_sentence) / 4  # rough estimate: 1 token ~ 4 chars
                    pages_and_chunks.append(d)
        return pages_and_chunks

    def convert_to_embedds(self, chunk_size: int = 10) -> list[dict]:
        """Encode every sentence chunk with the sentence-transformers model."""
        data = self.convert_to_chunk(chunk_size)
        embedding_model = SentenceTransformer(model_name_or_path=self.embedding_model_name, device=self.device)
        print("[INFO] Converting into embeddings ")
        for item in tqdm(data):
            item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor=True)
        return data

    def save_the_embeddings(self, filename: str = "embeddings.csv", data: list[dict] = None):
        """Write the chunk records, including their embeddings, to a CSV file."""
        if data is None:
            data = self.convert_to_embedds()
        dataframe = pd.DataFrame(data)
        dataframe.to_csv(filename, index=False)
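

if __name__ == "__main__":
    # Minimal usage sketch (assumption: "document.pdf" is a placeholder path,
    # not a file shipped with this repo).
    embedder = Embeddings(pdf_file_path="document.pdf")
    embedder.save_the_embeddings(filename="embeddings.csv")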