from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate
from llama_index.core import Settings
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from dotenv import load_dotenv
import os
import gradio as gr

# Load environment variables (e.g. the Hugging Face API token) from a .env file
load_dotenv()

# Configure the LLM served through the Hugging Face Inference API
Settings.llm = HuggingFaceInferenceAPI(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
    context_window=3000,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.1},
)

# Configure the local embedding model used to index the documents
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# Directory containing the source documents (current directory by default)
DATA_DIR = "."

# Ensure data directory exists
# os.makedirs(DATA_DIR, exist_ok=True)

# Load all PDF documents from the data directory
documents = SimpleDirectoryReader(DATA_DIR, required_exts=[".pdf"]).load_data()
print(documents[0])  # debug: inspect the first loaded document

# Create the vector index over the loaded documents
index = VectorStoreIndex.from_documents(documents)

chat_text_qa_msgs = [
    (
        "user",
        """You are a Q&A assistant named PEDEEP. Your main goal is to provide answers as accurately as possible, based on the instructions and context you have been given. If a question does not match the provided context or is outside the scope of the document, kindly advise the user to ask questions within the context of the document.
Context: {context_str}
Question: {query_str}
"""
    )
]
text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)

# Initialize Chat Memory Buffer for conversation memory
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Create the chat engine on top of the index
chat_engine = index.as_chat_engine(
    text_qa_template=text_qa_template,
    memory=memory,
    chat_mode="condense_question",  # choose the chat mode that suits your use case
)


### Gradio Interface ###
def chat_with_llm(message, history):
    # debug: print conversation memory
    # print(memory.get_all())
    if not history:
        print("# cleared history, resetting chatbot state")
        chat_engine.reset()
    # HuggingFaceInferenceAPI has not implemented streaming yet,
    # so return the full response in one piece
    return chat_engine.chat(message).response


chatbot = gr.ChatInterface(
    chat_with_llm,
    title="(UUD45) Document-Based Chatbot with LLM",
)
chatbot.launch()
# chatbot.launch(server_name="xx.xx.xx.xx", server_port=7860)  # set IP and port for deployment
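
# --- Optional: streaming responses (sketch, not used above) ---
# The Hugging Face Inference API backend did not support streaming at the time
# of writing, so the handler above returns the complete response at once. With
# a streaming-capable LLM backend, a generator version could look like the
# untested sketch below: gr.ChatInterface accepts generator functions, and
# llama_index chat engines expose stream_chat(), whose response_gen yields
# tokens as they arrive.
#
# def chat_stream(message, history):
#     if not history:
#         chat_engine.reset()
#     streaming_response = chat_engine.stream_chat(message)
#     partial = ""
#     for token in streaming_response.response_gen:
#         partial += token
#         yield partial  # Gradio re-renders the growing string incrementally
#
# chatbot = gr.ChatInterface(chat_stream, title="(UUD45) Document-Based Chatbot with LLM")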