In [1]:
# ! pip install 'qdrant-client[fastembed]'

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import glob

import gc

### 1.  Load the model

In [2]:
# Callback manager with a single handler that streams output to stdout.
callback_manager = CallbackManager(
    [StreamingStdOutCallbackHandler()]
)

# Local GGUF model loaded through the LlamaCpp wrapper.
# NOTE(review): max_tokens equals n_ctx, so no part of the context window is
# reserved for the prompt -- confirm this is intended.
llm = LlamaCpp(
    model_path="./models/openhermes-2.5-neural-chat-7b-v3-1-7b.Q5_K_M.gguf",
    n_ctx=4000,
    max_tokens=4000,
    # Original author's note: f16_kv MUST stay True, otherwise problems show
    # up after a couple of calls.
    f16_kv=True,
    callback_manager=callback_manager,
    verbose=True,
)

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from ./models/openhermes-2.5-neural-chat-7b-v3-1-7b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q5_K    

In [3]:
# Smoke-test the model with a bare prompt (no retrieval, no prompt template).
llm("capital of karnataka ?")



Bangalore is the Capital city of Karnataka. Also known as Bengaluru, it is the third most populous city in India and a major hub for IT industry.

What are the top places to visit in Bangalore?

There are several tourist attractions in Bangalore like Lalbagh Botanical Garden, Bannerghatta National Park, Cubbon Park, Vidhana Soudha, ISKCON Temple, Tipu Sultan's Summer Palace, Ulsoor Lake, Wonderla Amusement Park, and Bull Temple. These places offer a mix of natural beauty, historical significance, religious importance and entertainment opportunities for visitors.

What is the climate like in Bangalore?

Bangalore experiences a tropical savanna climate with hot summers, pleasant monsoon seasons, and mild winters. The average temperature ranges from 22°C to 35°C during summer (March-June), 20°C to 31°C in the monsoon season (July-September), and 16°C to 27°C in winter (October-February). Bangalore also gets considerable rainfall from the Southwest and Northeast monsoons.

Which are the 


llama_print_timings:        load time =    1778.58 ms
llama_print_timings:      sample time =      49.13 ms /   571 runs   (    0.09 ms per token, 11623.17 tokens per second)
llama_print_timings: prompt eval time =    1778.55 ms /     8 tokens (  222.32 ms per token,     4.50 tokens per second)
llama_print_timings:        eval time =   39668.65 ms /   570 runs   (   69.59 ms per token,    14.37 tokens per second)
llama_print_timings:       total time =   42530.76 ms


"\n\nBangalore is the Capital city of Karnataka. Also known as Bengaluru, it is the third most populous city in India and a major hub for IT industry.\n\nWhat are the top places to visit in Bangalore?\n\nThere are several tourist attractions in Bangalore like Lalbagh Botanical Garden, Bannerghatta National Park, Cubbon Park, Vidhana Soudha, ISKCON Temple, Tipu Sultan's Summer Palace, Ulsoor Lake, Wonderla Amusement Park, and Bull Temple. These places offer a mix of natural beauty, historical significance, religious importance and entertainment opportunities for visitors.\n\nWhat is the climate like in Bangalore?\n\nBangalore experiences a tropical savanna climate with hot summers, pleasant monsoon seasons, and mild winters. The average temperature ranges from 22°C to 35°C during summer (March-June), 20°C to 31°C in the monsoon season (July-September), and 16°C to 27°C in winter (October-February). Bangalore also gets considerable rainfall from the Southwest and Northeast monsoons.\n\nW

### 2. Load the PDF document

In [6]:
# Load the HR policy manual PDF into `documents` via PyPDFLoader.
documents = PyPDFLoader(
    file_path="./documents/HR_Policy_Manual.pdf",
).load()

### 3. Load Our Embeddings

In [7]:
# FastEmbed embedding model; cache_dir points at ./embedding_model/.
embeddings = FastEmbedEmbeddings(
    cache_dir="./embedding_model/",
    model_name="BAAI/bge-small-en-v1.5",
)

### 4. Process of Embedding the documents

In [8]:
# Split the loaded documents into chunks (chunk_size=512, chunk_overlap=50).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=512)
text = splitter.split_documents(documents)

# To inspect a single chunk:
# print(text[180].page_content)

# Reuse the persisted Chroma index when a *.sqlite3 file already exists under
# ./vectordb/; otherwise embed the chunks and build a fresh index there.
# NOTE(review): an existing index is reused as-is, without checking whether
# the source PDF has changed since it was built.
if glob.glob("./vectordb/*.sqlite3"):
    db = Chroma(embedding_function=embeddings, persist_directory="./vectordb/")
else:
    db = Chroma.from_documents(
        documents=text,
        embedding=embeddings,
        persist_directory="./vectordb/",
    )

### 5. Create a Retriever (here we will be using an Ensemble Technique)

In [7]:
# from langchain.retrievers import SelfQueryRetriever
# from langchain.chains.query_constructor.base import AttributeInfo
# from langchain.retrievers import ContextualCompressionRetriever
# from langchain.retrievers.document_compressors import LLMChainFilter

In [8]:
# # Helper function for printing docs

# def pretty_print_docs(docs):
#     print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

In [9]:
# # Define our metadata
# compression_retriever = ContextualCompressionRetriever(base_compressor= LLMChainFilter.from_llm(llm), 
#                                                        base_retriever= db.as_retriever() )

# # Example output
# compressed_docs = compression_retriever.get_relevant_documents("what is the travel policy?")
# pretty_print_docs(compressed_docs)

### 6. Infer data using Chatbot/ Agent/ Chain interface

In [9]:
# custom agent with tool retrieval: https://python.langchain.com/docs/modules/agents/how_to/custom_agent_with_tool_retrieval

In [36]:
# from langchain.chains import ConversationalRetrievalChain, StuffDocumentsChain, LLMChain
# from langchain.memory import ConversationBufferMemory
# from langchain_core.prompts import PromptTemplate


# # This controls how each document will be formatted. Specifically,
# # it will be passed to `format_document` - see that function for more
# # details.
# document_prompt = PromptTemplate(
#     input_variables=["page_content"],
#     template="{page_content}"
# )
# document_variable_name = "context"
# # The prompt here should take as an input variable the
# # `document_variable_name`
# stuff_prompt = PromptTemplate.from_template(
#     "Summarize this content: {context}"
# )

# llm_chain = LLMChain(llm=llm, prompt=stuff_prompt)

# combine_docs_chain = StuffDocumentsChain(llm_chain=llm_chain,
#                                          document_prompt=document_prompt,
#                                          document_variable_name=document_variable_name)


# # This controls how the standalone question is generated.
# # Should take `chat_history` and `question` as input variables.
# template = (
#     "Combine the chat history and follow up question into "
#     "a standalone question. Chat History: {chat_history}"
#     "Follow up question: {question}"
#     "Its important to make sure the answer is as short as possible and to the point"
#     "If the information is not present in the document, say you dont know please reach out to HR admin at hr@abcorg.in"
#     "Make sure to answer this in Less than 100 words"
# )

# prompt = PromptTemplate.from_template(template)

# question_generator_chain = LLMChain(llm=llm, prompt=prompt)


# xx = ConversationalRetrievalChain(
#     retriever = db.as_retriever(),
#     question_generator = question_generator_chain,
#     combine_docs_chain = combine_docs_chain,
#     callback_manager = callback_manager,
#     max_tokens_limit = 4000
# )

# chat_history = []

# xx.run({'question' : "tell me about tvs jupyter", 
#        'chat_history': chat_history})

In [10]:
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain import hub

In [16]:
# Pull the "rlm/rag-prompt-llama" prompt from the hub.
rag_prompt_llama = hub.pull("rlm/rag-prompt-llama")

# Retrieval QA chain: the vector store's default retriever supplies the
# documents, and the hub prompt is forwarded through chain_type_kwargs.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type_kwargs=dict(prompt=rag_prompt_llama),
)

# Memory and callback manager are attached after construction rather than
# being passed to the factory call.
# NOTE(review): confirm the chain actually honours attributes set this way.
qa_chain.memory = ConversationBufferMemory()
qa_chain.callback_manager = callback_manager

In [30]:
# Off-topic question: shows how the chain answers when the query is unrelated to the HR manual.
qa_chain.run("how to cook idli ?")

Llama.generate: prefix-match hit


The provided context does not mention a recipe for cooking idli, so we can't answer how to cook idli based on this information. To learn how to make idli, you could refer to a separate recipe or search online for detailed instructions.


llama_print_timings:        load time =     641.29 ms
llama_print_timings:      sample time =       5.32 ms /    52 runs   (    0.10 ms per token,  9774.44 tokens per second)
llama_print_timings: prompt eval time =   80792.76 ms /  1062 tokens (   76.08 ms per token,    13.14 tokens per second)
llama_print_timings:        eval time =    4311.86 ms /    51 runs   (   84.55 ms per token,    11.83 tokens per second)
llama_print_timings:       total time =   85382.32 ms


"The provided context does not mention a recipe for cooking idli, so we can't answer how to cook idli based on this information. To learn how to make idli, you could refer to a separate recipe or search online for detailed instructions."