File size: 3,806 Bytes
9151071
 
1ff6584
39b560e
683c59a
1ff6584
4c354df
1ff6584
 
b5f36b8
 
 
794ae55
 
39b560e
63a6e05
78efab9
9151071
bdc84e2
 
 
4c354df
 
5e8fd8b
4c88907
f4f9ced
 
 
4c88907
b5f36b8
0fbf3f1
1ff6584
bdc84e2
78efab9
9151071
5529817
4c354df
9151071
1ff6584
bdc84e2
4c354df
b5f36b8
 
4c354df
b5f36b8
4c354df
5529817
4c354df
 
b5f36b8
 
 
bdc84e2
881c0e5
5c1d000
b5f36b8
881c0e5
5e8fd8b
8b90c15
 
b5f36b8
bdc84e2
1ff6584
4c88907
881c0e5
f4f9ced
 
1ff6584
bdc84e2
4c354df
1ff6584
b7a35f3
 
1ff6584
bdc84e2
f4f9ced
5c1d000
1ff6584
 
 
 
bdc84e2
5c1d000
bdc84e2
1ff6584
f4f9ced
1ff6584
4c354df
f1cf709
5e8fd8b
2d12855
 
ce047d8
2d12855
b5f36b8
d04ea2c
bdc84e2
4c354df
 
fd6a7b2
5e8fd8b
 
4c354df
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import os
import logging
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    Settings)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode, MetadataMode
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from llama_index.readers.file.docs.base import DocxReader, HWPReader, PDFReader

# On-disk location (under the current user's home directory) that ChatPDF
# passes to QdrantClient(path=...).
store_dir = os.path.expanduser("~/wtp_be_store/")

# Configure root logging at INFO level at import time and create the
# module-level logger used by ChatPDF.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# URL of the GGUF model file handed to LlamaCPP(model_url=...) in
# ChatPDF.__init__.
model_url = "https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GGUF/resolve/main/qwen2-0_5b-instruct-q4_k_m.gguf"

class ChatPDF:
    """Question answering over a directory of documents using a vector index.

    Documents are split into chunks, embedded with FastEmbed, stored in a
    Qdrant collection and queried through a llama.cpp-backed LLM.

    Instance attributes:
        pdf_count: number of ``Document`` objects loaded so far.
        text_chunks: every text chunk produced by ``ingest``.
        doc_ids: for each chunk, the index of its source document within the
            ``ingest`` call that produced it (indices restart at 0 per call).
        nodes: every ``TextNode`` that has been embedded and indexed.
        query_engine: engine built by the latest ``ingest``; ``None`` until
            documents are ingested and again after ``clear``.
    """

    # Only immutable defaults live on the class. The bookkeeping lists are
    # created per instance in __init__ so separate ChatPDF objects never share
    # (and mutate) the same list objects.
    pdf_count = 0
    query_engine = None

    # Implicit string concatenation keeps source indentation out of the prompt
    # (a backslash continuation inside the literal would embed it).
    _PROMPT_PREFIX = (
        "You are an assistant for question-answering tasks. Use three "
        "sentences only and keep the answer concise.\n\n"
    )

    def __init__(self):
        """Create the splitter, vector store, embedding model and LLM.

        Also installs the splitter, embedding model and LLM into the global
        llama-index ``Settings`` object.
        """
        self.pdf_count = 0
        self.text_chunks = []
        self.doc_ids = []
        self.nodes = []
        self.query_engine = None

        self.text_parser = SentenceSplitter(chunk_size=512, chunk_overlap=24)

        logger.info("initializing the vector store related objects")
        self.client = QdrantClient(path=store_dir)
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name="rag_documents"
        )

        logger.info("initializing the FastEmbedEmbedding")
        self.embed_model = FastEmbedEmbedding()

        llm = LlamaCPP(
            model_url=model_url,
            temperature=0.1,
            model_path=None,
            max_new_tokens=256,
            context_window=29440,
            generate_kwargs={},
            verbose=True,
        )

        logger.info("initializing the global settings")
        Settings.text_splitter = self.text_parser
        Settings.embed_model = self.embed_model
        Settings.llm = llm
        Settings.transformations = [self.text_parser]

    @staticmethod
    def _build_prompt(query: str) -> str:
        """Return *query* prefixed with the fixed answering instructions."""
        return ChatPDF._PROMPT_PREFIX + query

    def ingest(self, files_dir: str) -> None:
        """Load, chunk, embed and index every document found in *files_dir*.

        Only the chunks produced by this call are turned into nodes, embedded
        and added to the vector store, so calling ``ingest`` repeatedly does
        not re-embed or duplicate previously ingested content.

        Args:
            files_dir: directory handed to ``SimpleDirectoryReader``.
        """
        docs = SimpleDirectoryReader(input_dir=files_dir).load_data()

        logger.info("enumerating docs")
        new_chunks = []
        for doc_idx, doc in enumerate(docs):
            self.pdf_count += 1
            doc_chunks = self.text_parser.split_text(doc.text)
            new_chunks.extend(doc_chunks)
            self.doc_ids.extend([doc_idx] * len(doc_chunks))
        self.text_chunks.extend(new_chunks)

        logger.info("enumerating text_chunks")
        candidates = (TextNode(text=chunk) for chunk in new_chunks)
        # Skip nodes whose embeddable content is empty.
        new_nodes = [
            node
            for node in candidates
            if node.get_content(metadata_mode=MetadataMode.EMBED)
        ]

        logger.info("enumerating nodes")
        for node in new_nodes:
            node.embedding = self.embed_model.get_text_embedding(
                node.get_content(metadata_mode=MetadataMode.ALL)
            )
        self.nodes.extend(new_nodes)

        logger.info("initializing the storage context")
        storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
        logger.info("indexing the nodes in VectorStoreIndex")
        # Only the new nodes are passed in; nodes from earlier calls were
        # already written to the same vector store.
        index = VectorStoreIndex(
            nodes=new_nodes,
            storage_context=storage_context,
            transformations=Settings.transformations
        )

        # Stream answers and retrieve the 3 most similar chunks per query.
        self.query_engine = index.as_query_engine(
            streaming=True,
            similarity_top_k=3,
        )

    def ask(self, query: str):
        """Answer *query* using the indexed documents.

        Args:
            query: the user's question.

        Returns:
            The object returned by ``query_engine.query`` (the engine is
            created with ``streaming=True``).

        Raises:
            RuntimeError: if no documents have been ingested yet, or the
                store was cleared since the last ``ingest``.
        """
        if self.query_engine is None:
            raise RuntimeError(
                "No documents have been ingested; call ingest() before ask()."
            )
        logger.info("retrieving the response to the query")
        return self.query_engine.query(self._build_prompt(query))

    def clear(self) -> None:
        """Clear the vector store and reset all ingestion bookkeeping."""
        self.vector_store.clear()
        self.pdf_count = 0
        self.text_chunks = []
        self.doc_ids = []
        self.nodes = []
        # The previous engine was built over the store that was just cleared.
        self.query_engine = None