# %% [markdown]
# # Changes:
#     - long short response tooltip feature swap option
#     - Definitions for difficult words are now provied at the end of each response

# %% [markdown]
# # All imports and inits

# %%
import gradio as gr
import os
import time
from dotenv import load_dotenv
from transformers import AutoTokenizer

# from typing import List, Tuple
from openai import OpenAI
from pinecone import Pinecone
from groq import Groq

load_dotenv()

DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")
# PINECONE_ENV = os.getenv("PINECONE_ENV")

NVIDIA_API = os.getenv("NVIDIA_API")
NVIDIA_BASE_URL = os.getenv("NVIDIA_BASE_URL")

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_CHAT_URL = os.getenv("GROQ_CHAT_URL")
# Configure headers for Groq API requests
GROQ_HEADERS = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json",
}
# LLM_MODEL = "llama3-70b-8192"
LLM_MODEL = "llama-3.3-70b-versatile"


# NVidia Embedding import
client = OpenAI(
    api_key=NVIDIA_API,
    base_url=NVIDIA_BASE_URL,
)

"""
Input:
    - Context window: 128K
Ouput:
    - Output Max Tokens: 32,768

"""


def track_time(func):
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f"[Time Tracker] `{func.__name__}` took {end - start:.4f} seconds")
        return result

    return wrapper


# # EMBEDDING_MODEL = "llama3-405b-8192-embed"

# vo = voyageai.Client()


# %% [markdown]
# # Init Pinecone

# %%
pc = Pinecone(api_key=PINECONE_API)


# %% [markdown]
# # Embedding Function
#
#

# %%
# Connect to the index
# index = pc.Index("ai-coach")
# index = pc.Index("ahsan-400pg-pdf-doc-test")
# index = pc.Index("surgical-tech-complete")  # -- COMPLETE SURGICAL TECH BOOTCAMP
index = pc.Index("quick-start")  # -- QUICK START COURSE 1


# embedding_model = AutoModel.from_pretrained(
#     'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)


# # Function to generate embeddings without tokenization
# def get_embedding(data):
#     embeddings = embedding_model.encode(data).tolist()
#     return embeddings


@track_time
def get_embedding(text="None"):
    response = client.embeddings.create(
        input=text,
        model="nvidia/nv-embed-v1",
        encoding_format="float",
        extra_body={"input_type": "query", "truncate": "NONE"},
    )

    # print(response.data[0].embedding)
    # print(count_tokens(response.data[0].embedding))
    return response.data[0].embedding


# get_embedding("None")


# %% [markdown]
# # Query Pinecone
#


# %%
# Function to query Pinecone index using embeddings
@track_time
def query_pinecone(embedding):
    # Use keyword arguments to pass the embedding and other parameters
    result = index.query(vector=embedding, top_k=5, include_metadata=True)
    return result["matches"]


print(query_pinecone(get_embedding("Network Components")))


# %% [markdown]
# # Query Groq Inference

# %% [markdown]
# | Use Case | Recommended top_p | Notes |
# |----------|------------------|-------|
# | Factual Q&A | 0.1 - 0.3 | Lower values for more deterministic, factual responses |
# | Code Generation | 0.2 - 0.5 | Precision matters more than creativity |
# | Technical Writing | 0.5 - 0.7 | Balanced approach for technical accuracy with clarity |
# | General Conversation | 0.7 - 0.9 | Good balance for most chatbot applications |
# | Creative Writing | 0.9 - 1.0 | Higher values for more diverse and creative outputs |
#
# \n
# | Parameter Combination | Use Case |
# |----------------------|----------|
# | top_p=0.5, temperature=0.3 | Highly factual, consistent responses |
# | top_p=0.7, temperature=0.5 | Educational content with examples |
# | top_p=0.9, temperature=0.7 | Creative but coherent responses |


# %%
# Modified query_groq function with more explicit streaming handling
@track_time
def query_groq(user_prompt, sys_prompt):
    client = Groq(api_key=os.environ["GROQ_API_KEY"])

    # Always use streaming mode
    return client.chat.completions.create(
        model=LLM_MODEL,  # or whichever model you're using
        temperature=0.5,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_prompt},
        ],
        stream=True,
        # top_p=0.7,  # testing for better results
    )


# Print all tool calls
# print(completion.choices[0].message.executed_tools)


# Tokenizer to count number of tokens
"""
Putting tokenizer outside of the function to avoid reinitialization and optimize performance.
"""
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")


@track_time
def count_tokens(text: str) -> int:
    # Encode the text into tokens
    tokens = tokenizer.encode(text)
    return len(tokens)


# %% [markdown]
# # Process User Query

# %% [markdown]
# ## Groq and Gradio with Streaming Enabled
#
# - ### i.e. will start showing text as soon as it gets generated from groq inference
# - ### faster than optimized version
#
# ## Query:
# - pediatic surgery
# ## Response Time:
# User Query Tokens: 6
# [Time Tracker] `get_embedding` took 0.4752 seconds
# [Time Tracker] `query_pinecone` took 0.2222 seconds
# [Time Tracker] `query_groq` took 0.5060 seconds
#
# Total time: 1.19 seconds

# %%
# # Modified query_groq function with more explicit streaming handling
# @track_time
# def query_groq(prompt):
#     client = Groq(api_key=os.environ["GROQ_API_KEY"])

#     # Always use streaming mode
#     return client.chat.completions.create(
#         model="llama3-70b-8192",  # or whichever model you're using
#         messages=[{"role": "user", "content": prompt}],
#         stream=True,
#     )


# --------------------------------------------------------- ## Groq and Gradio with Streaming Enabled -----------------------------------------------------
# Modified process_user_query to properly yield streaming updates
@track_time
def process_user_query(user_query: str, conversation_history: list, response_type: str):
    print(f"User Query Tokens: {count_tokens(user_query)}")

    # Generate embedding and get relevant context
    embedding = get_embedding(user_query)
    relevant_chunks = query_pinecone(embedding)
    context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)

    # Format conversation history for the prompt
    history_str = "\n".join(
        f"User: {user}\nCoach: {response}" for user, response in conversation_history
    )

    system_prompt = f"""
    
    Conversation history:
    {history_str}
    
    learning materials:
    {context}
    
    You are an expert, knowledgeable, and friendly coach. Follow these **guidelines** carefully:

    - If the user requests a **long** response, provide a detailed, comprehensive, and in-depth explanation. Cover all relevant aspects, break down complex concepts step-by-step, include context, background, and reasoning, and use examples, analogies, and clarifications. Address possible follow-up questions and ensure the answer is thorough and educational.
    - If the user requests a **short** response, provide a concise, focused, and to-the-point answer. Summarize the key information in a clear and scannable way, using bullet points or brief sentences. Only elaborate if the user asks for more detail.
    - Provide clear, step-by-step explanations to ensure deep understanding.
    - Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
    - Ask guiding questions to encourage critical thinking.
    - Adapt your explanation to match the student's knowledge level.
    - Strictly use terminologies provided in the given context.
    - Provide short, ideal examples (2–3) to illustrate your points clearly.
    - Blend learning material with your own knowledge while ensuring answers stay within the healthcare context.
    - **Always provide all specific relevant sources with name from the context in your responses: URLs, video names, video timestamps, links, resources, *ebook names*, lesson names, lesson numbers. If the user query is not relevant to the context, do not provide any references and sources.**
    - Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
    - Must provide all relevant video timestamps with video name (start and end). If timestamps aren't available, tell the user to watch the whole video.
    - Provide a thoughtful and contextually accurate response.
    - No repetition in responses.
    - **If student asks something completely out of context, politely decline and ask them to ask a question related to their course. Do not provide any references or sources.**
    - Please avoid using "Bank Name".
    - When users request questions, answers, quizzes, or exams, generate high-quality educational assessments directly from the learning material. Use various question types (e.g., multiple-choice with 4–5 options, true/false, short answer, fill-in-the-blanks, essay), clear formatting, different cognitive levels (recall, comprehension, application, analysis), detailed answer keys with explanations, consistent formatting (numbered questions, lettered options, bold correct answers), difficulty adaptation, and step-by-step solutions.
    - Never generate fabricated information when providing references or sources. Only use facts, references, citations, lesson names, e-book titles, video names, and timestamps explicitly present in the provided learning materials.
    - Always provide all references and sources **only at the end of the response.**. Do **not** include fill-in-the-blanks, quizzes, or Q&A content in the references and sources section. Make sure you mention what type of reference / source it is.
    - Incorporate occasional follow-up questions or prompts (e.g., “Would you like to see an example?” or “Need a quick quiz to check your understanding?”) to promote engagement and deeper learning.
    - Default to concise, scannable answers. Use bullet points and bolding. Only give longer explanation and details if the user requests it or implies it.
    - Donot provide references or sources between text or at the end of a sentence, only under **References and Sources**.
    - For followup questions and queries , use previous conversation history and context to answer the user query.
    - Be more confident and assertive in your responses.
    - At the end of each response, provide definitions for any difficult words under the heading 'Difficult Words'.
    - ** If a user asks about anything related to certification, licensing, externships, or career pathways, provide clear, specific information. This includes queries that mention or imply:

        “certification”
        “certificate”
        “get certified”
        “license” / “licensing”
        “externship”
        “internship”
        “official requirement”
        “exam eligibility”
        “career path”
        “qualification”
        “accreditation”
        “approved training”
        “enrollment”
        “program approval”

        if such keywords are present then **ask user to contact official support for confirmation or further help,** while including the contact details below:

        Support Contact Information:
        📞 Phone: 1-800-555-HEAL (4325)
        📧 Email: support@healthylifehelp.org
        🌐 Website: www.healthylifehelp.org
        🏥 Address: 123 Wellness Blvd, Caretown, CA 90210
        🕒 Hours: Mon–Fri, 8 AM – 6 PM (PST)
    """

    # User prompt - contains the specific query and response type only (not in system prompt)
    user_prompt = f"""
    
    New student question:
    "{user_query}"

    Response type requested: {response_type}
    """

    # Then in your query_groq function:
    stream_response = query_groq(user_prompt, system_prompt)

    # The function now directly yields the stream chunks for the Gradio interface to use
    full_response = ""

    # First, yield a response with empty text to set up the message
    # This creates the user message immediately
    temp_history = conversation_history.copy()
    temp_history.append((user_query, ""))
    yield temp_history, context

    # Process the stream
    for chunk in stream_response:
        if (
            hasattr(chunk.choices[0].delta, "content")
            and chunk.choices[0].delta.content is not None
        ):
            content_chunk = chunk.choices[0].delta.content
            full_response += content_chunk

            # Create a temporary history with the current response
            temp_history = conversation_history.copy()
            temp_history.append((user_query, full_response))

            # Yield the updated history for display
            yield temp_history, context

    # Return the final history with the complete response
    final_history = conversation_history.copy()
    final_history.append((user_query, full_response))
    yield final_history, context


@track_time
def create_gradio_interface(conversation_history, response_type="default"):
    with gr.Blocks() as interface:
        gr.Markdown("# 🧑‍🏫 Quick Start AI Coaching Assistant")
        gr.Markdown("Welcome! I'm here to help you learn. Type your question below.")

        # State management
        chat_history = gr.State(conversation_history)

        with gr.Row():
            chatbot = gr.Chatbot(height=500)
            with gr.Column(scale=0.5):
                context_display = gr.Textbox(
                    label="Relevant Context", interactive=False
                )

        user_input = gr.Textbox(label="Your Question", placeholder="Type here...")

        with gr.Row():
            submit_btn = gr.Button("Submit", variant="primary")
            undo_btn = gr.Button("Undo Last")
            clear_btn = gr.Button("Clear History")

        def handle_submit(user_query, history):
            if not user_query.strip():
                return gr.update(), history, ""

            # Use the generator directly from process_user_query
            # This will yield incremental updates as they arrive
            response_generator = process_user_query(user_query, history, response_type)

            for updated_history, context in response_generator:
                # Directly update the chatbot with each streaming chunk
                yield "", updated_history, context, updated_history

        # Component interactions with streaming support
        submit_btn.click(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )

        # Add submit on Enter key press
        user_input.submit(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )

        undo_btn.click(
            lambda history: history[:-1] if history else [],
            [chat_history],
            [chat_history],
        ).then(lambda x: x, [chat_history], [chatbot])

        clear_btn.click(lambda: [], None, [chat_history]).then(
            lambda: ([], ""), None, [chatbot, context_display]
        )

    return interface


def main():
    """
    Main entry point for the application.

    Initializes the conversation history with a welcome message,
    creates the Gradio interface, and launches the web app.
    """
    # Initialize conversation history with welcome message
    welcome_message = "Hi there! I'm your AI coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?"
    initial_conversation_history = [("", welcome_message)]

    # Create and launch the interface
    interface = create_gradio_interface(initial_conversation_history, "long")
    interface.launch()


if __name__ == "__main__":
    main()