from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate
from llama_index.core import Settings
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from dotenv import load_dotenv
import os
import gradio as gr

# Load environment variables (e.g. the Hugging Face API token) from a .env file
load_dotenv()

# Configure the LLM served through the Hugging Face Inference API
Settings.llm = HuggingFaceInferenceAPI(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
    context_window=3000,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.1},
)

# Configure the local embedding model used to index the documents
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

# Directory containing the source documents (current directory by default)
DATA_DIR = "."

# Ensure data directory exists
# os.makedirs(DATA_DIR, exist_ok=True)

# Load all PDF documents from the data directory
documents = SimpleDirectoryReader(DATA_DIR, required_exts=[".pdf"]).load_data()
print(documents[0])  # debug: inspect the first loaded document

# Create the vector index over the loaded documents
index = VectorStoreIndex.from_documents(documents)

chat_text_qa_msgs = [
    (
        "user",
        """You are a Q&A assistant named PEDEEP. Your main goal is to provide answers as accurately as possible, based on the instructions and context you have been given. If a question does not match the provided context or is outside the scope of the document, kindly advise the user to ask questions within the context of the document.
Context: {context_str}
Question: {query_str}
"""
    )
]
text_qa_template = ChatPromptTemplate.from_messages(chat_text_qa_msgs)

# Initialize Chat Memory Buffer for conversation memory
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

# Create the chat engine on top of the index
chat_engine = index.as_chat_engine(
    text_qa_template=text_qa_template,
    memory=memory,
    chat_mode="condense_question",  # choose the chat mode that suits your use case
)


### Gradio Interface ###
def chat_with_llm(message, history):
    # debug: print conversation memory
    # print(memory.get_all())
    if not history:
        print("# cleared history, resetting chatbot state")
        chat_engine.reset()
    # HuggingFaceInferenceAPI has not implemented streaming yet,
    # so return the full response in one piece
    return chat_engine.chat(message).response


chatbot = gr.ChatInterface(
    chat_with_llm,
    title="(UUD45) Document-Based Chatbot with LLM",
)
chatbot.launch()
# chatbot.launch(server_name="xx.xx.xx.xx", server_port=7860)  # set IP and port for deployment
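
# --- Optional: streaming responses (sketch, not used above) ---
# The Hugging Face Inference API backend did not support streaming at the time
# of writing, so the handler above returns the complete response at once. With
# a streaming-capable LLM backend, a generator version could look like the
# untested sketch below: gr.ChatInterface accepts generator functions, and
# llama_index chat engines expose stream_chat(), whose response_gen yields
# tokens as they arrive.
#
# def chat_stream(message, history):
#     if not history:
#         chat_engine.reset()
#     streaming_response = chat_engine.stream_chat(message)
#     partial = ""
#     for token in streaming_response.response_gen:
#         partial += token
#         yield partial  # Gradio re-renders the growing string incrementally
#
# chatbot = gr.ChatInterface(chat_stream, title="(UUD45) Document-Based Chatbot with LLM")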