Spaces:
Runtime error
Runtime error
File size: 4,239 Bytes
9151071 1ff6584 39b560e 683c59a 1ff6584 b5f36b8 794ae55 39b560e 63a6e05 78efab9 9151071 bdc84e2 5e8fd8b 4c88907 f4f9ced 4c88907 b5f36b8 0fbf3f1 1ff6584 bdc84e2 78efab9 9151071 5529817 9151071 1ff6584 bdc84e2 9151071 b5f36b8 889a629 b5f36b8 5529817 b5808ba b5f36b8 bdc84e2 881c0e5 5c1d000 b5f36b8 881c0e5 5e8fd8b 8b90c15 b5f36b8 bdc84e2 1ff6584 4c88907 881c0e5 f4f9ced 1ff6584 bdc84e2 f4f9ced 1ff6584 35f7fe0 b7a35f3 1ff6584 bdc84e2 f4f9ced 5c1d000 1ff6584 bdc84e2 5c1d000 bdc84e2 1ff6584 f4f9ced 1ff6584 f1cf709 5e8fd8b 2d12855 ce047d8 2d12855 b5f36b8 d04ea2c bdc84e2 2d12855 fd6a7b2 5e8fd8b bc8f854 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import os
import logging
from llama_index.core import (
SimpleDirectoryReader,
VectorStoreIndex,
StorageContext,
Settings,
get_response_synthesizer)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode, MetadataMode
from llama_index.core.vector_stores import VectorStoreQuery
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from llama_index.readers.file.docs.base import DocxReader, HWPReader, PDFReader
# Logging setup: request INFO-level output on the root logger and create the
# logger used throughout this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory passed to the local Qdrant client as its storage path; the
# leading "~" is expanded to the current user's home directory.
store_dir = os.path.expanduser("~/wtp_be_store/")
class ChatPDF:
    """Ingest a directory of documents into Qdrant and answer questions over them.

    The constructor wires together a sentence splitter, a local Qdrant
    collection (``rag_documents``), a FastEmbed embedding model and a
    llama.cpp LLM, and registers them on the global LlamaIndex ``Settings``.
    ``ingest`` loads and indexes files, ``ask`` queries the resulting index,
    and ``clear`` removes the ingested nodes again.
    """

    # Class-level defaults are restricted to immutable values. The mutable
    # bookkeeping lists are created per instance in ``__init__`` so that two
    # ``ChatPDF`` objects never share (and corrupt) each other's state.
    pdf_count = 0  # incremented once per document returned by the reader
    query_engine = None  # set by ``ingest``; ``None`` until then

    def __init__(self):
        """Create the splitter, vector store, embedding model and LLM.

        Side effects: opens the on-disk Qdrant store at ``store_dir`` and
        overwrites the global ``Settings`` (text splitter, embedding model,
        LLM and transformations).
        """
        # Per-instance bookkeeping (see the note on the class attributes).
        self.pdf_count = 0
        self.text_chunks = []  # every text chunk produced by ``ingest``
        self.doc_ids = []  # per chunk: index of its source doc within its ingest call
        self.nodes = []  # embedded nodes that were written to the vector store
        self.query_engine = None

        self.text_parser = SentenceSplitter(chunk_size=512, chunk_overlap=24)

        logger.info("initializing the vector store related objects")
        self.client = QdrantClient(path=store_dir)
        self.vector_store = QdrantVectorStore(
            client=self.client,
            collection_name="rag_documents",
        )

        logger.info("initializing the FastEmbedEmbedding")
        # No model name is given, so the library's default model is used.
        self.embed_model = FastEmbedEmbedding()

        llm = LlamaCPP(
            model_url="https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GGUF/resolve/main/qwen2-0_5b-instruct-fp16.gguf",
            temperature=0.1,
            max_new_tokens=256,
            generate_kwargs={"max_tokens": 256, "temperature": 0.1, "top_k": 3},
            verbose=True,
        )

        logger.info("initializing the global settings")
        Settings.text_splitter = self.text_parser
        Settings.embed_model = self.embed_model
        Settings.llm = llm
        Settings.transformations = [self.text_parser]

    def ingest(self, files_dir: str):
        """Load every file under ``files_dir``, embed it and add it to the index.

        Args:
            files_dir: Directory handed to ``SimpleDirectoryReader``.

        Only the chunks produced by *this* call are turned into nodes,
        embedded and inserted. Rebuilding nodes for chunks from earlier calls
        would give them fresh ids and store the same text again.

        After the call ``self.query_engine`` is a streaming query engine
        returning the 3 most similar chunks per query.
        """
        docs = SimpleDirectoryReader(input_dir=files_dir).load_data()

        logger.info("enumerating docs")
        new_chunks = []
        for doc_idx, doc in enumerate(docs):
            self.pdf_count += 1
            curr_text_chunks = self.text_parser.split_text(doc.text)
            new_chunks.extend(curr_text_chunks)
            # doc_idx is relative to this call's ``docs`` list.
            self.doc_ids.extend([doc_idx] * len(curr_text_chunks))
        self.text_chunks.extend(new_chunks)

        logger.info("enumerating text_chunks")
        new_nodes = []
        for text_chunk in new_chunks:
            node = TextNode(text=text_chunk)
            # Skip chunks whose embeddable content is empty.
            if node.get_content(metadata_mode=MetadataMode.EMBED):
                new_nodes.append(node)

        logger.info("enumerating nodes")
        for node in new_nodes:
            node.embedding = self.embed_model.get_text_embedding(
                node.get_content(metadata_mode=MetadataMode.ALL)
            )
        self.nodes.extend(new_nodes)

        logger.info("initializing the storage context")
        storage_context = StorageContext.from_defaults(vector_store=self.vector_store)

        logger.info("indexing the nodes in VectorStoreIndex")
        # Nodes from earlier calls were already written to the same vector
        # store, so only the new ones are inserted here.
        index = VectorStoreIndex(
            nodes=new_nodes,
            storage_context=storage_context,
            transformations=Settings.transformations,
        )
        self.query_engine = index.as_query_engine(
            streaming=True,
            similarity_top_k=3,
        )

    def ask(self, query: str):
        """Run ``query`` against the index and return the streaming response.

        Args:
            query: The user's question.

        Returns:
            Whatever ``query_engine.query`` returns (the engine is created
            with ``streaming=True``).

        Raises:
            RuntimeError: If ``ingest`` has not been called yet, so there is
                no query engine to ask.
        """
        if self.query_engine is None:
            raise RuntimeError(
                "No documents have been ingested yet; call ingest() before ask()."
            )
        logger.info("retrieving the response to the query")
        return self.query_engine.query(query)

    def clear(self):
        """Delete this instance's nodes from the vector store and reset its counters.

        The query engine itself is left in place.
        """
        if self.nodes:
            # ``delete_nodes`` expects node ids, not node objects.
            self.vector_store.delete_nodes(
                node_ids=[node.node_id for node in self.nodes]
            )
        self.pdf_count = 0
        self.text_chunks = []
        self.doc_ids = []
        self.nodes = []
|