File size: 2,167 Bytes
5e8fd8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5402d5
f1cf709
5e8fd8b
 
acf6d8f
 
 
5e8fd8b
acf6d8f
5e8fd8b
 
 
 
 
 
 
 
 
f1cf709
 
 
 
 
 
 
5e8fd8b
 
 
 
 
 
 
 
 
 
b1f6e10
5e8fd8b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores.utils import filter_complex_metadata


class ChatPDF:
    """Answer questions about ingested PDF files with a local Ollama chat model.

    Typical use: call :meth:`ingest` with a PDF path, then :meth:`ask` with a
    question. :meth:`clear` discards the indexed document so another one can
    be loaded.
    """

    # Class-level defaults so the attributes exist (as ``None``) before the
    # first call to ``ingest``.
    vector_store = None  # Chroma store built by ``ingest``
    retriever = None  # retriever view over ``vector_store``
    chain = None  # retriever -> prompt -> model -> string pipeline

    # Questions are truncated to this many characters before being sent on.
    MAX_QUERY_CHARS = 512

    def __init__(self, model_name: str = "qwen:0.5b"):
        """Create the chat model, the text splitter and the prompt template.

        Args:
            model_name: Name of the Ollama model passed to ``ChatOllama``.
        """
        self.model = ChatOllama(model=model_name)
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=96)
        self.prompt = PromptTemplate.from_template(
            """
            You are an assistant for question-answering tasks. Use the following pieces of context 
            to answer the question. If you don't know the answer, just say that you don't know. 
            Question: {question} 
            Context: {context} 
            Answer: 
            """
        )

    def ingest(self, pdf_file_path: str) -> None:
        """Load a PDF, index its chunks and (re)build the question-answering chain.

        Args:
            pdf_file_path: Path of the PDF file to load with ``PyMuPDFLoader``.
        """
        docs = PyMuPDFLoader(file_path=pdf_file_path).load()
        chunks = self.text_splitter.split_documents(docs)
        chunks = filter_complex_metadata(chunks)

        # Keep the store on the instance: ``clear`` needs the handle to drop
        # the indexed data (previously it was a local and was lost on return).
        self.vector_store = Chroma.from_documents(documents=chunks, embedding=FastEmbedEmbeddings())
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={
                "k": 7,
                "score_threshold": 0.1,
            },
        )

        # The raw question is passed both to the retriever (to fetch context)
        # and, unchanged, into the prompt's ``question`` slot.
        self.chain = ({"context": self.retriever, "question": RunnablePassthrough()}
                      | self.prompt
                      | self.model
                      | StrOutputParser())

    def ask(self, query: str) -> str:
        """Answer ``query`` using the ingested document.

        Args:
            query: The user's question; only its first ``MAX_QUERY_CHARS``
                characters are used.

        Returns:
            The chain's output, or a fixed hint message when no document has
            been ingested yet.
        """
        if not self.chain:
            return "Please, add a PDF document first."

        return self.chain.invoke(query[:self.MAX_QUERY_CHARS])

    def clear(self) -> None:
        """Forget the ingested document and reset the store, retriever and chain."""
        if self.vector_store is not None:
            # NOTE(review): Chroma appears to reuse its default collection
            # across ``from_documents`` calls within one process, so without
            # deleting it, chunks from earlier PDFs could still be retrieved
            # after a clear -- confirm against the installed chromadb version.
            self.vector_store.delete_collection()
        self.vector_store = None
        self.retriever = None
        self.chain = None