# chat-with-pdf / embeddings.py
# PDF-to-embeddings pipeline: extracts text from a PDF, splits it into
# sentence chunks, and encodes the chunks with a sentence-transformers model.
import re

import fitz  # PyMuPDF
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from spacy.lang.en import English
from tqdm import tqdm


class Embeddings:
    """Extract text from a PDF, group it into sentence chunks, and embed the chunks."""

    def __init__(self, pdf_file_path: str):
        self.pdf_file_path = pdf_file_path
        self.embedding_model_name = "all-mpnet-base-v2"
        self.device = self.get_device()

    def get_device(self) -> str:
        """Return 'cuda' when a GPU is available, otherwise 'cpu'."""
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def text_formatter(self, text: str) -> str:
        """Replace newline characters ('\\n') with spaces and trim whitespace."""
        return text.replace('\n', ' ').strip()

    def count_and_split_sentence(self, text: str) -> tuple[int, list[str]]:
        """Count and split the sentences in the given text."""
        # A blank English pipeline with a rule-based sentencizer is enough for
        # sentence boundaries; no trained spaCy model is required.
        nlp = English()
        nlp.add_pipe("sentencizer")
        list_of_sentences = [str(sentence) for sentence in nlp(text).sents]
        return len(list_of_sentences), list_of_sentences
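
    # Example (hypothetical input):
    #   count_and_split_sentence("It works. It is fast.")
    #   -> (2, ["It works.", "It is fast."])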

    def open_pdf(self) -> list[dict]:
        """Read the PDF and return one dict of text and simple stats per page."""
        doc = fitz.open(self.pdf_file_path)
        data = []
        print("[INFO] Converting the pdf into a list of page dicts")
        for page_number, page in tqdm(enumerate(doc)):
            text = self.text_formatter(text=page.get_text())
            sentence_count, sentences = self.count_and_split_sentence(text)
            data.append(
                {
                    "page_number": page_number,
                    "char_count": len(text),
                    "word_count": len(text.split(" ")),
                    "sentence_count": sentence_count,
                    "token_count": len(text) / 4,  # rough estimate: 1 token ~ 4 chars
                    "sentence": sentences,
                    "text": text,
                }
            )
        return data

    def split_the_array(self, array_list: list,
                        chunk_length: int) -> list[list[str]]:
        """Split a list of sentences into consecutive chunks of at most chunk_length items."""
        return [array_list[i:i + chunk_length] for i in range(0, len(array_list), chunk_length)]
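
    # Example (hypothetical input): split_the_array(list("abcdefghijk"), 4)
    # -> [['a', 'b', 'c', 'd'], ['e', 'f', 'g', 'h'], ['i', 'j', 'k']]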

    def convert_to_chunk(self, chunk_size: int = 10) -> list[dict]:
        """Group each page's sentences into chunks and collect per-chunk stats."""
        pages_and_texts = self.open_pdf()
        pages_and_chunks = []
        # group the sentences of every page into chunks of chunk_size
        print("[INFO] Splitting the sentences ")
        for item in tqdm(pages_and_texts):
            item["sentence_chunks"] = self.split_the_array(item["sentence"], chunk_size)
            item["chunk_count"] = len(item["sentence_chunks"])
        # join every chunk back into a single string and record its stats
        print("[INFO] Joining the sentence chunks ")
        for item in tqdm(pages_and_texts):
            for chunks in item["sentence_chunks"]:
                d = {}
                joined_sentence = "".join(chunks).replace("  ", " ").strip()
                joined_sentence = re.sub(r'\.([A-Z])', r'. \1', joined_sentence)  # ".A" -> ". A": restore the space after a sentence end
                # keep only chunks longer than ~30 tokens; shorter ones carry little context
                if len(joined_sentence) / 4 > 30:
                    d["page_number"] = item["page_number"]
                    d["sentence_chunk"] = joined_sentence
                    # stats
                    d["char_count"] = len(joined_sentence)
                    d["word_count"] = len(joined_sentence.split(" "))
                    d["token_count"] = len(joined_sentence) / 4  # rough estimate: 1 token ~ 4 chars
                    pages_and_chunks.append(d)
        return pages_and_chunks

    def convert_to_embedds(self, chunk_size: int = 10) -> list[dict]:
        """Encode every sentence chunk with the sentence-transformers model."""
        data = self.convert_to_chunk(chunk_size)
        embedding_model = SentenceTransformer(model_name_or_path=self.embedding_model_name, device=self.device)
        print("[INFO] Converting into embeddings ")
        for item in tqdm(data):
            item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor=True)
        return data

    def save_the_embeddings(self, filename: str = "embeddings.csv", data: list[dict] = None):
        """Write the chunk records, including their embeddings, to a CSV file."""
        if data is None:
            data = self.convert_to_embedds()
        dataframe = pd.DataFrame(data)
        dataframe.to_csv(filename, index=False)
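

if __name__ == "__main__":
    # Minimal usage sketch (assumption: "document.pdf" is a placeholder path,
    # not a file shipped with this repo).
    embedder = Embeddings(pdf_file_path="document.pdf")
    embedder.save_the_embeddings(filename="embeddings.csv")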