# This module implements the full pipeline from PDF text extraction to sentence embeddings.
import os
import re

from tqdm import tqdm
from spacy.lang.en import English
import fitz
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer

class Embeddings:

    def __init__(self, pdf_file_path: str):
        self.pdf_file_path = pdf_file_path
        self.embedding_model_name = "all-mpnet-base-v2"
        self.device = self.get_device()
        # Build the sentencizer pipeline once instead of re-creating it for every page.
        self.nlp = English()
        self.nlp.add_pipe("sentencizer")

    def get_device(self) -> str:
        """Return 'cuda' if a GPU is available, otherwise 'cpu'."""
        return 'cuda' if torch.cuda.is_available() else 'cpu'

    def text_formatter(self, text: str) -> str:
        """Replace newline characters ('\\n') with spaces and strip surrounding whitespace."""
        return text.replace('\n', ' ').strip()

    def count_and_split_sentence(self, text: str) -> tuple[int, list[str]]:
        """Count and split the sentences in the given text."""
        list_of_sentences = [str(sentence) for sentence in self.nlp(text).sents]
        return len(list_of_sentences), list_of_sentences

    def open_pdf(self) -> list[dict]:
        """Read the PDF and collect per-page text along with simple statistics."""
        doc = fitz.open(self.pdf_file_path)
        data = []

        print("[INFO] Converting the pdf into a list of dicts")
        for page_number, page in tqdm(enumerate(doc)):
            text = self.text_formatter(text=page.get_text())
            sentence_count, sentences = self.count_and_split_sentence(text)

            data.append(
                {
                    "page_number": page_number,
                    "char_count": len(text),
                    "word_count": len(text.split()),
                    "sentence_count": sentence_count,
                    "token_count": len(text) / 4,  # rough estimate: 1 token ~ 4 characters
                    "sentences": sentences,
                    "text": text
                }
            )

        return data

    def split_the_array(self, array_list: list,
                        chunk_length: int) -> list[list[str]]:
        """Split a list of sentences into chunks of at most chunk_length sentences."""
        return [array_list[i:i + chunk_length] for i in range(0, len(array_list), chunk_length)]

    def convert_to_chunk(self, chunk_size: int = 10) -> list[dict]:
        """Group each page's sentences into chunks and collect per-chunk statistics."""
        pages_and_texts = self.open_pdf()
        pages_and_chunks = []

        # group the sentences of each page into chunks
        print("[INFO] Splitting the sentences into chunks")
        for item in tqdm(pages_and_texts):
            item["sentence_chunks"] = self.split_the_array(item["sentences"], chunk_size)
            item["chunk_count"] = len(item["sentence_chunks"])

        # flatten the chunks into one record per chunk
        print("[INFO] Collecting chunk statistics")
        for item in tqdm(pages_and_texts):
            for chunks in item["sentence_chunks"]:
                d = {}

                joined_sentence = " ".join(chunks).replace("  ", " ").strip()
                joined_sentence = re.sub(r'\.([A-Z])', r'. \1', joined_sentence)  # ".A" -> ". A": restore the space after a sentence end

                # keep only chunks whose estimated token count exceeds 30 (drops very short fragments)
                if len(joined_sentence) / 4 > 30:
                    d["page_number"] = item["page_number"]
                    d["sentence_chunk"] = joined_sentence
                    # stats
                    d["char_count"] = len(joined_sentence)
                    d["word_count"] = len(joined_sentence.split(" "))
                    d["token_count"] = len(joined_sentence) / 4  # rough estimate: 1 token ~ 4 characters

                    pages_and_chunks.append(d)

        return pages_and_chunks

    def convert_to_embeddings(self, chunk_size: int = 10) -> list[dict]:
        """Encode every chunk with the sentence-transformer model."""
        data = self.convert_to_chunk(chunk_size)

        embedding_model = SentenceTransformer(model_name_or_path=self.embedding_model_name, device=self.device)
        print("[INFO] Converting into embeddings")
        for item in tqdm(data):
            item["embeddings"] = embedding_model.encode(item["sentence_chunk"], convert_to_tensor=True)

        return data

    def save_the_embeddings(self, filename: str = "embeddings.csv", data: list[dict] = None):
        """Save the chunk records (embeddings included) to a CSV file.

        Note: tensors are serialised as their string representation, so they
        must be parsed back into arrays when the CSV is reloaded.
        """
        if data is None:
            data = self.convert_to_embeddings()
        dataframe = pd.DataFrame(data)
        dataframe.to_csv(filename, index=False)
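

# A minimal usage sketch. "example.pdf" is a hypothetical local file, not part
# of this module; swap in any PDF path on disk.
if __name__ == "__main__":
    embedder = Embeddings(pdf_file_path="example.pdf")
    # Runs the whole pipeline (extract -> chunk -> embed) and writes the CSV.
    embedder.save_the_embeddings(filename="embeddings.csv")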