# %% [markdown]
# # Changes:
# - Option to swap between long and short responses (tooltip feature)
# - Definitions for difficult words are now provided at the end of each response
# %% [markdown]
# # All imports and inits
# %%
import gradio as gr
import os
import time
from dotenv import load_dotenv
from transformers import AutoTokenizer
# from typing import List, Tuple
from openai import OpenAI
from pinecone import Pinecone
from groq import Groq
load_dotenv()
DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")
# PINECONE_ENV = os.getenv("PINECONE_ENV")
NVIDIA_API = os.getenv("NVIDIA_API")
NVIDIA_BASE_URL = os.getenv("NVIDIA_BASE_URL")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_CHAT_URL = os.getenv("GROQ_CHAT_URL")
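# For local testing, a `.env` file might look roughly like the sketch below
# (placeholder values only; the variable names mirror the os.getenv calls above,
# while the actual values depend on your own Pinecone / NVIDIA / Groq accounts):
#
#   DATA_PATH=./data
#   PINECONE_API=pc-xxxxxxxxxxxxxxxx
#   NVIDIA_API=nvapi-xxxxxxxxxxxxxxxx
#   NVIDIA_BASE_URL=https://integrate.api.nvidia.com/v1
#   GROQ_API_KEY=gsk_xxxxxxxxxxxxxxxx
#   GROQ_CHAT_URL=https://api.groq.com/openai/v1/chat/completions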
# Configure headers for Groq API requests
GROQ_HEADERS = {
"Authorization": f"Bearer {GROQ_API_KEY}",
"Content-Type": "application/json",
}
# LLM_MODEL = "llama3-70b-8192"
LLM_MODEL = "llama-3.3-70b-versatile"
# NVidia Embedding import
client = OpenAI(
api_key=NVIDIA_API,
base_url=NVIDIA_BASE_URL,
)
"""
Input:
- Context window: 128K
Output:
- Output Max Tokens: 32,768
"""
def track_time(func):
def wrapper(*args, **kwargs):
start = time.perf_counter()
result = func(*args, **kwargs)
end = time.perf_counter()
print(f"[Time Tracker] `{func.__name__}` took {end - start:.4f} seconds")
return result
return wrapper
# # EMBEDDING_MODEL = "llama3-405b-8192-embed"
# vo = voyageai.Client()
# %% [markdown]
# # Init Pinecone
# %%
pc = Pinecone(api_key=PINECONE_API)
# %% [markdown]
# # Embedding Function
#
#
# %%
# Connect to the index
# index = pc.Index("ai-coach")
# index = pc.Index("ahsan-400pg-pdf-doc-test")
# index = pc.Index("surgical-tech-complete") # -- COMPLETE SURGICAL TECH BOOTCAMP
index = pc.Index("quick-start") # -- QUICK START COURSE 1
# embedding_model = AutoModel.from_pretrained(
# 'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
# # Function to generate embeddings without tokenization
# def get_embedding(data):
# embeddings = embedding_model.encode(data).tolist()
# return embeddings
@track_time
def get_embedding(text="None"):
response = client.embeddings.create(
input=text,
model="nvidia/nv-embed-v1",
encoding_format="float",
extra_body={"input_type": "query", "truncate": "NONE"},
)
# print(response.data[0].embedding)
# print(count_tokens(response.data[0].embedding))
return response.data[0].embedding
# get_embedding("None")
# %% [markdown]
# # Query Pinecone
#
# %%
# Function to query Pinecone index using embeddings
@track_time
def query_pinecone(embedding):
# Use keyword arguments to pass the embedding and other parameters
result = index.query(vector=embedding, top_k=5, include_metadata=True)
return result["matches"]
print(query_pinecone(get_embedding("Network Components")))
# %% [markdown]
# # Query Groq Inference
# %% [markdown]
# | Use Case | Recommended top_p | Notes |
# |----------|------------------|-------|
# | Factual Q&A | 0.1 - 0.3 | Lower values for more deterministic, factual responses |
# | Code Generation | 0.2 - 0.5 | Precision matters more than creativity |
# | Technical Writing | 0.5 - 0.7 | Balanced approach for technical accuracy with clarity |
# | General Conversation | 0.7 - 0.9 | Good balance for most chatbot applications |
# | Creative Writing | 0.9 - 1.0 | Higher values for more diverse and creative outputs |
#
#
# | Parameter Combination | Use Case |
# |----------------------|----------|
# | top_p=0.5, temperature=0.3 | Highly factual, consistent responses |
# | top_p=0.7, temperature=0.5 | Educational content with examples |
# | top_p=0.9, temperature=0.7 | Creative but coherent responses |
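# %% [markdown]
# The table above maps onto keyword arguments of `chat.completions.create`. Below is a
# minimal sketch of how those parameter combinations could be wired up; the preset names
# and this helper are illustrative only and are not used elsewhere in the app.
# %%
PARAM_PRESETS = {
    "factual": {"top_p": 0.5, "temperature": 0.3},      # highly factual, consistent
    "educational": {"top_p": 0.7, "temperature": 0.5},  # educational content with examples
    "creative": {"top_p": 0.9, "temperature": 0.7},     # creative but coherent
}


def query_groq_with_preset(user_prompt, sys_prompt, preset="educational"):
    """Same streaming call as `query_groq` (next cell), but with a named sampling preset."""
    groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
    return groq_client.chat.completions.create(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_prompt},
        ],
        stream=True,
        **PARAM_PRESETS[preset],
    )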
# %%
# Modified query_groq function with more explicit streaming handling
@track_time
def query_groq(user_prompt, sys_prompt):
client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Always use streaming mode
return client.chat.completions.create(
model=LLM_MODEL, # or whichever model you're using
temperature=0.5,
messages=[
{"role": "system", "content": sys_prompt},
{"role": "user", "content": user_prompt},
],
stream=True,
# top_p=0.7, # testing for better results
)
# Print all tool calls
# print(completion.choices[0].message.executed_tools)
# Tokenizer to count number of tokens
"""
Putting tokenizer outside of the function to avoid reinitialization and optimize performance.
"""
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")
@track_time
def count_tokens(text: str) -> int:
# Encode the text into tokens
tokens = tokenizer.encode(text)
return len(tokens)
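# %% [markdown]
# A small helper sketch (not wired into the app) showing how `count_tokens` could be used
# to sanity-check prompt size against the model limits noted near the top of the file
# (128K context window). The check is approximate, since `count_tokens` uses the Jina
# tokenizer rather than the Llama tokenizer used by the Groq model.
# %%
def fits_context_window(text: str, max_tokens: int = 128_000) -> bool:
    """Return True if the (approximate) token count of `text` fits the context window."""
    return count_tokens(text) <= max_tokens


# Example: fits_context_window(user_prompt) could be checked before calling query_groq.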
# %% [markdown]
# # Process User Query
# %% [markdown]
# ## Groq and Gradio with Streaming Enabled
#
# - ### i.e. text will start showing as soon as it is generated by the Groq inference endpoint
# - ### faster than the optimized version
#
# ## Query:
# - pediatric surgery
# ## Response Time:
# User Query Tokens: 6
# [Time Tracker] `get_embedding` took 0.4752 seconds
# [Time Tracker] `query_pinecone` took 0.2222 seconds
# [Time Tracker] `query_groq` took 0.5060 seconds
#
# Total time: 1.19 seconds
# %%
# # Modified query_groq function with more explicit streaming handling
# @track_time
# def query_groq(prompt):
# client = Groq(api_key=os.environ["GROQ_API_KEY"])
# # Always use streaming mode
# return client.chat.completions.create(
# model="llama3-70b-8192", # or whichever model you're using
# messages=[{"role": "user", "content": prompt}],
# stream=True,
# )
# --------------------------------------------------------- ## Groq and Gradio with Streaming Enabled -----------------------------------------------------
# Modified process_user_query to properly yield streaming updates
@track_time
def process_user_query(user_query: str, conversation_history: list, response_type: str):
print(f"User Query Tokens: {count_tokens(user_query)}")
# Generate embedding and get relevant context
embedding = get_embedding(user_query)
relevant_chunks = query_pinecone(embedding)
context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)
# Format conversation history for the prompt
history_str = "\n".join(
f"User: {user}\nCoach: {response}" for user, response in conversation_history
)
system_prompt = f"""
Conversation history:
{history_str}
learning materials:
{context}
You are an expert, knowledgeable, and friendly coach. Follow these **guidelines** carefully:
- If the user requests a **long** response, provide a detailed, comprehensive, and in-depth explanation. Cover all relevant aspects, break down complex concepts step-by-step, include context, background, and reasoning, and use examples, analogies, and clarifications. Address possible follow-up questions and ensure the answer is thorough and educational.
- If the user requests a **short** response, provide a concise, focused, and to-the-point answer. Summarize the key information in a clear and scannable way, using bullet points or brief sentences. Only elaborate if the user asks for more detail.
- Provide clear, step-by-step explanations to ensure deep understanding.
- Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
- Ask guiding questions to encourage critical thinking.
- Adapt your explanation to match the student's knowledge level.
- Strictly use terminologies provided in the given context.
- Provide short, ideal examples (2–3) to illustrate your points clearly.
- Blend learning material with your own knowledge while ensuring answers stay within the healthcare context.
- **Always provide all specific relevant sources with name from the context in your responses: URLs, video names, video timestamps, links, resources, *ebook names*, lesson names, lesson numbers. If the user query is not relevant to the context, do not provide any references and sources.**
- Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
- Must provide all relevant video timestamps with video name (start and end). If timestamps aren't available, tell the user to watch the whole video.
- Provide a thoughtful and contextually accurate response.
- No repetition in responses.
- **If student asks something completely out of context, politely decline and ask them to ask a question related to their course. Do not provide any references or sources.**
- Please avoid using "Bank Name".
- When users request questions, answers, quizzes, or exams, generate high-quality educational assessments directly from the learning material. Use various question types (e.g., multiple-choice with 4–5 options, true/false, short answer, fill-in-the-blanks, essay), clear formatting, different cognitive levels (recall, comprehension, application, analysis), detailed answer keys with explanations, consistent formatting (numbered questions, lettered options, bold correct answers), difficulty adaptation, and step-by-step solutions.
- Never generate fabricated information when providing references or sources. Only use facts, references, citations, lesson names, e-book titles, video names, and timestamps explicitly present in the provided learning materials.
- Always provide all references and sources **only at the end of the response**. Do **not** include fill-in-the-blanks, quizzes, or Q&A content in the references and sources section. Make sure you mention what type of reference / source it is.
- Incorporate occasional follow-up questions or prompts (e.g., β€œWould you like to see an example?” or β€œNeed a quick quiz to check your understanding?”) to promote engagement and deeper learning.
- Default to concise, scannable answers. Use bullet points and bolding. Only give longer explanation and details if the user requests it or implies it.
- Do not provide references or sources within the text or at the end of a sentence; list them only under **References and Sources**.
- For follow-up questions and queries, use the previous conversation history and context to answer the user query.
- Be more confident and assertive in your responses.
- At the end of each response, provide definitions for any difficult words under the heading 'Difficult Words'.
- **If a user asks about anything related to certification, licensing, externships, or career pathways, provide clear, specific information.** This includes queries that mention or imply:
β€œcertification”
β€œcertificate”
β€œget certified”
β€œlicense” / β€œlicensing”
β€œexternship”
β€œinternship”
β€œofficial requirement”
β€œexam eligibility”
β€œcareer path”
β€œqualification”
β€œaccreditation”
β€œapproved training”
β€œenrollment”
β€œprogram approval”
If such keywords are present, **ask the user to contact official support for confirmation or further help,** while including the contact details below:
Support Contact Information:
πŸ“ž Phone: 1-800-555-HEAL (4325)
πŸ“§ Email: [email protected]
🌐 Website: www.healthylifehelp.org
πŸ₯ Address: 123 Wellness Blvd, Caretown, CA 90210
πŸ•’ Hours: Mon–Fri, 8 AM – 6 PM (PST)
"""
# User prompt - contains the specific query and response type only (not in system prompt)
user_prompt = f"""
New student question:
"{user_query}"
Response type requested: {response_type}
"""
    # Query Groq (streaming) with the composed system and user prompts
stream_response = query_groq(user_prompt, system_prompt)
# The function now directly yields the stream chunks for the Gradio interface to use
full_response = ""
# First, yield a response with empty text to set up the message
# This creates the user message immediately
temp_history = conversation_history.copy()
temp_history.append((user_query, ""))
yield temp_history, context
# Process the stream
for chunk in stream_response:
if (
hasattr(chunk.choices[0].delta, "content")
and chunk.choices[0].delta.content is not None
):
content_chunk = chunk.choices[0].delta.content
full_response += content_chunk
# Create a temporary history with the current response
temp_history = conversation_history.copy()
temp_history.append((user_query, full_response))
# Yield the updated history for display
yield temp_history, context
# Return the final history with the complete response
final_history = conversation_history.copy()
final_history.append((user_query, full_response))
yield final_history, context
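# %% [markdown]
# Quick manual test of the streaming pipeline outside of Gradio. Kept commented out
# because it calls the embedding, Pinecone, and Groq APIs; the sample question is
# illustrative only.
# %%
# final_history = None
# for history, ctx in process_user_query("What are network components?", [], "short"):
#     final_history = history
# print(final_history[-1][1])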
@track_time
def create_gradio_interface(conversation_history, response_type="default"):
with gr.Blocks() as interface:
gr.Markdown("# πŸ§‘β€πŸ« Quick Start AI Coaching Assistant")
gr.Markdown("Welcome! I'm here to help you learn. Type your question below.")
# State management
chat_history = gr.State(conversation_history)
with gr.Row():
chatbot = gr.Chatbot(height=500)
with gr.Column(scale=0.5):
context_display = gr.Textbox(
label="Relevant Context", interactive=False
)
user_input = gr.Textbox(label="Your Question", placeholder="Type here...")
with gr.Row():
submit_btn = gr.Button("Submit", variant="primary")
undo_btn = gr.Button("Undo Last")
clear_btn = gr.Button("Clear History")
        def handle_submit(user_query, history):
            if not user_query.strip():
                # handle_submit is a generator, so an empty submission must yield a
                # full set of outputs (input box, history state, context, chatbot)
                # rather than returning values, which Gradio would ignore.
                yield gr.update(), history, "", history
                return
# Use the generator directly from process_user_query
# This will yield incremental updates as they arrive
response_generator = process_user_query(user_query, history, response_type)
for updated_history, context in response_generator:
# Directly update the chatbot with each streaming chunk
yield "", updated_history, context, updated_history
# Component interactions with streaming support
submit_btn.click(
handle_submit,
[user_input, chat_history],
[user_input, chat_history, context_display, chatbot],
)
# Add submit on Enter key press
user_input.submit(
handle_submit,
[user_input, chat_history],
[user_input, chat_history, context_display, chatbot],
)
undo_btn.click(
lambda history: history[:-1] if history else [],
[chat_history],
[chat_history],
).then(lambda x: x, [chat_history], [chatbot])
clear_btn.click(lambda: [], None, [chat_history]).then(
lambda: ([], ""), None, [chatbot, context_display]
)
return interface
def main():
"""
Main entry point for the application.
Initializes the conversation history with a welcome message,
creates the Gradio interface, and launches the web app.
"""
# Initialize conversation history with welcome message
welcome_message = "Hi there! I'm your AI coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?"
initial_conversation_history = [("", welcome_message)]
# Create and launch the interface
interface = create_gradio_interface(initial_conversation_history, "long")
interface.launch()
if __name__ == "__main__":
main()