File size: 4,275 Bytes
9151071
 
1ff6584
39b560e
683c59a
1ff6584
 
 
 
 
b5f36b8
 
 
794ae55
 
39b560e
63a6e05
78efab9
9151071
bdc84e2
 
 
5e8fd8b
4c88907
f4f9ced
 
 
4c88907
b5f36b8
0fbf3f1
1ff6584
bdc84e2
78efab9
9151071
5529817
9151071
 
 
1ff6584
bdc84e2
9151071
 
 
b5f36b8
 
889a629
b5f36b8
5529817
 
b5808ba
 
b5f36b8
 
 
 
 
 
bdc84e2
881c0e5
5c1d000
b5f36b8
 
881c0e5
5e8fd8b
8b90c15
 
b5f36b8
bdc84e2
1ff6584
4c88907
881c0e5
f4f9ced
 
1ff6584
bdc84e2
f4f9ced
1ff6584
35f7fe0
 
b7a35f3
 
1ff6584
bdc84e2
f4f9ced
5c1d000
1ff6584
 
 
 
bdc84e2
5c1d000
bdc84e2
1ff6584
f4f9ced
1ff6584
 
f1cf709
5e8fd8b
2d12855
 
ce047d8
2d12855
b5f36b8
d04ea2c
bdc84e2
2d12855
fd6a7b2
5e8fd8b
 
bc3c00d
bc8f854
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
import logging
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    Settings,
    get_response_synthesizer)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode, MetadataMode
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from llama_index.readers.file.docs.base import DocxReader, HWPReader, PDFReader

# Configure logging at import time so the INFO-level progress messages
# emitted by this module are visible, then grab a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory (under the current user's home) handed to the local Qdrant
# client as its on-disk storage path.
store_dir = os.path.expanduser("~/wtp_be_store/")

class ChatPDF:
    """Small retrieval-augmented chat helper backed by a local Qdrant store.

    The constructor wires up a sentence splitter, a local (path-based)
    Qdrant collection named ``rag_documents``, a FastEmbed embedding model
    and a llama.cpp LLM, and publishes them through the global llama-index
    ``Settings`` object.  ``ingest`` loads a directory of documents into the
    vector store, ``ask`` queries it, and ``clear`` removes the nodes this
    instance has ingested.

    Attributes:
        pdf_count: Number of documents loaded by ``ingest`` so far.
        text_chunks: Every text chunk produced by the splitter so far.
        doc_ids: For each entry of ``text_chunks``, the index of its source
            document within the ``ingest`` call that produced it.
        nodes: The non-empty ``TextNode`` objects sent to the vector store.
        query_engine: Streaming query engine built by the most recent
            ``ingest`` call, or ``None`` before the first one.
    """

    # Immutable class-level default only.  The list-valued state is created
    # per instance in ``_reset_state`` so instances never share (and mutate)
    # the same list objects.
    pdf_count = 0

    def __init__(self):
        self._reset_state()
        self.query_engine = None

        self.text_parser = SentenceSplitter(chunk_size=512, chunk_overlap=24)

        logger.info("initializing the vector store related objects")
        self.client = QdrantClient(path=store_dir)
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name="rag_documents",
        )

        logger.info("initializing the FastEmbedEmbedding")
        self.embed_model = FastEmbedEmbedding()

        llm = LlamaCPP(
            model_url="https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GGUF/resolve/main/qwen2-0_5b-instruct-fp16.gguf",
            temperature=0.1,
            max_new_tokens=256,
            generate_kwargs={"max_tokens": 256, "temperature": 0.1, "top_k": 3},
            verbose=True,
        )

        logger.info("initializing the global settings")
        Settings.text_splitter = self.text_parser
        Settings.embed_model = self.embed_model
        Settings.llm = llm
        Settings.transformations = [self.text_parser]

    def _reset_state(self):
        """(Re)initialise the per-instance ingestion bookkeeping."""
        self.pdf_count = 0
        self.text_chunks = []
        self.doc_ids = []
        self.nodes = []

    def ingest(self, files_dir: str):
        """Load every document under *files_dir* into the vector store.

        Documents are split into chunks, each non-empty chunk becomes an
        embedded ``TextNode``, and the new nodes are indexed into the Qdrant
        collection.  Only the chunks produced by *this* call are turned into
        nodes, so calling ``ingest`` repeatedly does not re-insert chunks
        from earlier calls.  A fresh streaming query engine is stored on
        ``self.query_engine`` afterwards.

        Args:
            files_dir: Directory handed to ``SimpleDirectoryReader``.
        """
        docs = SimpleDirectoryReader(input_dir=files_dir).load_data()

        logger.info("enumerating docs")
        new_chunks = []
        for doc_idx, doc in enumerate(docs):
            self.pdf_count += 1
            doc_chunks = self.text_parser.split_text(doc.text)
            new_chunks.extend(doc_chunks)
            self.doc_ids.extend([doc_idx] * len(doc_chunks))
        self.text_chunks.extend(new_chunks)

        logger.info("enumerating text_chunks")
        new_nodes = []
        for text_chunk in new_chunks:
            node = TextNode(text=text_chunk)
            # Skip chunks that have nothing to embed.
            if node.get_content(metadata_mode=MetadataMode.EMBED):
                new_nodes.append(node)

        logger.info("enumerating nodes")
        for node in new_nodes:
            node.embedding = self.embed_model.get_text_embedding(
                node.get_content(metadata_mode=MetadataMode.ALL)
            )
        # Remember the nodes so ``clear`` can delete them from the store.
        self.nodes.extend(new_nodes)

        logger.info("initializing the storage context")
        storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
        logger.info("indexing the nodes in VectorStoreIndex")
        # Only the new nodes are inserted; queries run against the shared
        # vector store, which already holds nodes from earlier calls.
        index = VectorStoreIndex(
            nodes=new_nodes,
            storage_context=storage_context,
            transformations=Settings.transformations,
        )

        self.query_engine = index.as_query_engine(
            streaming=True,
            similarity_top_k=3,
        )

    def ask(self, query: str):
        """Run *query* through the query engine and return its response.

        Args:
            query: The user's question.

        Returns:
            Whatever ``self.query_engine.query`` returns (the engine is
            created with ``streaming=True``).

        Raises:
            RuntimeError: If ``ingest`` has not been called yet, so no query
                engine exists.
        """
        if getattr(self, "query_engine", None) is None:
            raise RuntimeError(
                "No documents have been ingested yet; call ingest() before ask()."
            )
        logger.info("retrieving the response to the query")
        return self.query_engine.query(query)

    def clear(self):
        """Delete this instance's nodes from the store and reset its state."""
        if self.nodes:
            # ``delete_nodes`` expects node ids, not the node objects.
            self.vector_store.delete_nodes([node.node_id for node in self.nodes])
        # Reset unconditionally: counters may be non-zero even when every
        # ingested chunk was empty and therefore produced no node.
        self._reset_state()