# %% [markdown] # # Changes: # - long short response tooltip feature swap option # - Definitions for difficult words are now provied at the end of each response # %% [markdown] # # All imports and inits # %% import gradio as gr import os import time from dotenv import load_dotenv from transformers import AutoTokenizer # from typing import List, Tuple from openai import OpenAI from pinecone import Pinecone from groq import Groq load_dotenv() DATA_PATH = os.getenv("DATA_PATH") PINECONE_API = os.getenv("PINECONE_API") # PINECONE_ENV = os.getenv("PINECONE_ENV") NVIDIA_API = os.getenv("NVIDIA_API") NVIDIA_BASE_URL = os.getenv("NVIDIA_BASE_URL") GROQ_API_KEY = os.getenv("GROQ_API_KEY") GROQ_CHAT_URL = os.getenv("GROQ_CHAT_URL") # Configure headers for Groq API requests GROQ_HEADERS = { "Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json", } # LLM_MODEL = "llama3-70b-8192" LLM_MODEL = "llama-3.3-70b-versatile" # NVidia Embedding import client = OpenAI( api_key=NVIDIA_API, base_url=NVIDIA_BASE_URL, ) """ Input: - Context window: 128K Ouput: - Output Max Tokens: 32,768 """ def track_time(func): def wrapper(*args, **kwargs): start = time.perf_counter() result = func(*args, **kwargs) end = time.perf_counter() print(f"[Time Tracker] `{func.__name__}` took {end - start:.4f} seconds") return result return wrapper # # EMBEDDING_MODEL = "llama3-405b-8192-embed" # vo = voyageai.Client() # %% [markdown] # # Init Pinecone # %% pc = Pinecone(api_key=PINECONE_API) # %% [markdown] # # Embedding Function # # # %% # Connect to the index # index = pc.Index("ai-coach") # index = pc.Index("ahsan-400pg-pdf-doc-test") # index = pc.Index("surgical-tech-complete") # -- COMPLETE SURGICAL TECH BOOTCAMP index = pc.Index("quick-start") # -- QUICK START COURSE 1 # embedding_model = AutoModel.from_pretrained( # 'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True) # # Function to generate embeddings without tokenization # def get_embedding(data): # embeddings = embedding_model.encode(data).tolist() # return embeddings @track_time def get_embedding(text="None"): response = client.embeddings.create( input=text, model="nvidia/nv-embed-v1", encoding_format="float", extra_body={"input_type": "query", "truncate": "NONE"}, ) # print(response.data[0].embedding) # print(count_tokens(response.data[0].embedding)) return response.data[0].embedding # get_embedding("None") # %% [markdown] # # Query Pinecone # # %% # Function to query Pinecone index using embeddings @track_time def query_pinecone(embedding): # Use keyword arguments to pass the embedding and other parameters result = index.query(vector=embedding, top_k=5, include_metadata=True) return result["matches"] print(query_pinecone(get_embedding("Network Components"))) # %% [markdown] # # Query Groq Inference # %% [markdown] # | Use Case | Recommended top_p | Notes | # |----------|------------------|-------| # | Factual Q&A | 0.1 - 0.3 | Lower values for more deterministic, factual responses | # | Code Generation | 0.2 - 0.5 | Precision matters more than creativity | # | Technical Writing | 0.5 - 0.7 | Balanced approach for technical accuracy with clarity | # | General Conversation | 0.7 - 0.9 | Good balance for most chatbot applications | # | Creative Writing | 0.9 - 1.0 | Higher values for more diverse and creative outputs | # # \n # | Parameter Combination | Use Case | # |----------------------|----------| # | top_p=0.5, temperature=0.3 | Highly factual, consistent responses | # | top_p=0.7, temperature=0.5 | Educational content with examples | # | top_p=0.9, temperature=0.7 | Creative but coherent responses | # %% # Modified query_groq function with more explicit streaming handling @track_time def query_groq(user_prompt, sys_prompt): client = Groq(api_key=os.environ["GROQ_API_KEY"]) # Always use streaming mode return client.chat.completions.create( model=LLM_MODEL, # or whichever model you're using temperature=0.5, messages=[ {"role": "system", "content": sys_prompt}, {"role": "user", "content": user_prompt}, ], stream=True, # top_p=0.7, # testing for better results ) # Print all tool calls # print(completion.choices[0].message.executed_tools) # Tokenizer to count number of tokens """ Putting tokenizer outside of the function to avoid reinitialization and optimize performance. """ tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en") @track_time def count_tokens(text: str) -> int: # Encode the text into tokens tokens = tokenizer.encode(text) return len(tokens) # %% [markdown] # # Process User Query # %% [markdown] # ## Groq and Gradio with Streaming Enabled # # - ### i.e. will start showing text as soon as it gets generated from groq inference # - ### faster than optimized version # # ## Query: # - pediatic surgery # ## Response Time: # User Query Tokens: 6 # [Time Tracker] `get_embedding` took 0.4752 seconds # [Time Tracker] `query_pinecone` took 0.2222 seconds # [Time Tracker] `query_groq` took 0.5060 seconds # # Total time: 1.19 seconds # %% # # Modified query_groq function with more explicit streaming handling # @track_time # def query_groq(prompt): # client = Groq(api_key=os.environ["GROQ_API_KEY"]) # # Always use streaming mode # return client.chat.completions.create( # model="llama3-70b-8192", # or whichever model you're using # messages=[{"role": "user", "content": prompt}], # stream=True, # ) # --------------------------------------------------------- ## Groq and Gradio with Streaming Enabled ----------------------------------------------------- # Modified process_user_query to properly yield streaming updates @track_time def process_user_query(user_query: str, conversation_history: list, response_type: str): print(f"User Query Tokens: {count_tokens(user_query)}") # Generate embedding and get relevant context embedding = get_embedding(user_query) relevant_chunks = query_pinecone(embedding) context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks) # Format conversation history for the prompt history_str = "\n".join( f"User: {user}\nCoach: {response}" for user, response in conversation_history ) system_prompt = f""" Conversation history: {history_str} learning materials: {context} You are an expert, knowledgeable, and friendly coach. Follow these **guidelines** carefully: - If the user requests a **long** response, provide a detailed, comprehensive, and in-depth explanation. Cover all relevant aspects, break down complex concepts step-by-step, include context, background, and reasoning, and use examples, analogies, and clarifications. Address possible follow-up questions and ensure the answer is thorough and educational. - If the user requests a **short** response, provide a concise, focused, and to-the-point answer. Summarize the key information in a clear and scannable way, using bullet points or brief sentences. Only elaborate if the user asks for more detail. - Provide clear, step-by-step explanations to ensure deep understanding. - Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding. - Ask guiding questions to encourage critical thinking. - Adapt your explanation to match the student's knowledge level. - Strictly use terminologies provided in the given context. - Provide short, ideal examples (2–3) to illustrate your points clearly. - Blend learning material with your own knowledge while ensuring answers stay within the healthcare context. - **Always provide all specific relevant sources with name from the context in your responses: URLs, video names, video timestamps, links, resources, *ebook names*, lesson names, lesson numbers. If the user query is not relevant to the context, do not provide any references and sources.** - Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively. - Must provide all relevant video timestamps with video name (start and end). If timestamps aren't available, tell the user to watch the whole video. - Provide a thoughtful and contextually accurate response. - No repetition in responses. - **If student asks something completely out of context, politely decline and ask them to ask a question related to their course. Do not provide any references or sources.** - Please avoid using "Bank Name". - When users request questions, answers, quizzes, or exams, generate high-quality educational assessments directly from the learning material. Use various question types (e.g., multiple-choice with 4–5 options, true/false, short answer, fill-in-the-blanks, essay), clear formatting, different cognitive levels (recall, comprehension, application, analysis), detailed answer keys with explanations, consistent formatting (numbered questions, lettered options, bold correct answers), difficulty adaptation, and step-by-step solutions. - Never generate fabricated information when providing references or sources. Only use facts, references, citations, lesson names, e-book titles, video names, and timestamps explicitly present in the provided learning materials. - Always provide all references and sources **only at the end of the response.**. Do **not** include fill-in-the-blanks, quizzes, or Q&A content in the references and sources section. Make sure you mention what type of reference / source it is. - Incorporate occasional follow-up questions or prompts (e.g., “Would you like to see an example?” or “Need a quick quiz to check your understanding?”) to promote engagement and deeper learning. - Default to concise, scannable answers. Use bullet points and bolding. Only give longer explanation and details if the user requests it or implies it. - Donot provide references or sources between text or at the end of a sentence, only under **References and Sources**. - For followup questions and queries , use previous conversation history and context to answer the user query. - Be more confident and assertive in your responses. - At the end of each response, provide definitions for any difficult words under the heading 'Difficult Words'. - ** If a user asks about anything related to certification, licensing, externships, or career pathways, provide clear, specific information. This includes queries that mention or imply: “certification” “certificate” “get certified” “license” / “licensing” “externship” “internship” “official requirement” “exam eligibility” “career path” “qualification” “accreditation” “approved training” “enrollment” “program approval” if such keywords are present then **ask user to contact official support for confirmation or further help,** while including the contact details below: Support Contact Information: 📞 Phone: 1-800-555-HEAL (4325) 📧 Email: support@healthylifehelp.org 🌐 Website: www.healthylifehelp.org 🏥 Address: 123 Wellness Blvd, Caretown, CA 90210 🕒 Hours: Mon–Fri, 8 AM – 6 PM (PST) """ # User prompt - contains the specific query and response type only (not in system prompt) user_prompt = f""" New student question: "{user_query}" Response type requested: {response_type} """ # Then in your query_groq function: stream_response = query_groq(user_prompt, system_prompt) # The function now directly yields the stream chunks for the Gradio interface to use full_response = "" # First, yield a response with empty text to set up the message # This creates the user message immediately temp_history = conversation_history.copy() temp_history.append((user_query, "")) yield temp_history, context # Process the stream for chunk in stream_response: if ( hasattr(chunk.choices[0].delta, "content") and chunk.choices[0].delta.content is not None ): content_chunk = chunk.choices[0].delta.content full_response += content_chunk # Create a temporary history with the current response temp_history = conversation_history.copy() temp_history.append((user_query, full_response)) # Yield the updated history for display yield temp_history, context # Return the final history with the complete response final_history = conversation_history.copy() final_history.append((user_query, full_response)) yield final_history, context @track_time def create_gradio_interface(conversation_history, response_type="default"): with gr.Blocks() as interface: gr.Markdown("# 🧑‍🏫 Quick Start AI Coaching Assistant") gr.Markdown("Welcome! I'm here to help you learn. Type your question below.") # State management chat_history = gr.State(conversation_history) with gr.Row(): chatbot = gr.Chatbot(height=500) with gr.Column(scale=0.5): context_display = gr.Textbox( label="Relevant Context", interactive=False ) user_input = gr.Textbox(label="Your Question", placeholder="Type here...") with gr.Row(): submit_btn = gr.Button("Submit", variant="primary") undo_btn = gr.Button("Undo Last") clear_btn = gr.Button("Clear History") def handle_submit(user_query, history): if not user_query.strip(): return gr.update(), history, "" # Use the generator directly from process_user_query # This will yield incremental updates as they arrive response_generator = process_user_query(user_query, history, response_type) for updated_history, context in response_generator: # Directly update the chatbot with each streaming chunk yield "", updated_history, context, updated_history # Component interactions with streaming support submit_btn.click( handle_submit, [user_input, chat_history], [user_input, chat_history, context_display, chatbot], ) # Add submit on Enter key press user_input.submit( handle_submit, [user_input, chat_history], [user_input, chat_history, context_display, chatbot], ) undo_btn.click( lambda history: history[:-1] if history else [], [chat_history], [chat_history], ).then(lambda x: x, [chat_history], [chatbot]) clear_btn.click(lambda: [], None, [chat_history]).then( lambda: ([], ""), None, [chatbot, context_display] ) return interface def main(): """ Main entry point for the application. Initializes the conversation history with a welcome message, creates the Gradio interface, and launches the web app. """ # Initialize conversation history with welcome message welcome_message = "Hi there! I'm your AI coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?" initial_conversation_history = [("", welcome_message)] # Create and launch the interface interface = create_gradio_interface(initial_conversation_history, "long") interface.launch() if __name__ == "__main__": main()