# %% [markdown]
# # Changes:
# - Option to swap between long and short responses (tooltip feature)
# - Definitions for difficult words are now provided at the end of each response
# %% [markdown]
# # All imports and inits
# %%
import gradio as gr
import os
import time
from dotenv import load_dotenv
from transformers import AutoTokenizer
# from typing import List, Tuple
from openai import OpenAI
from pinecone import Pinecone
from groq import Groq
load_dotenv()
DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")
# PINECONE_ENV = os.getenv("PINECONE_ENV")
NVIDIA_API = os.getenv("NVIDIA_API")
NVIDIA_BASE_URL = os.getenv("NVIDIA_BASE_URL")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_CHAT_URL = os.getenv("GROQ_CHAT_URL")
# Configure headers for Groq API requests
GROQ_HEADERS = {
    "Authorization": f"Bearer {GROQ_API_KEY}",
    "Content-Type": "application/json",
}
# LLM_MODEL = "llama3-70b-8192"
LLM_MODEL = "llama-3.3-70b-versatile"
# NVIDIA embedding client (OpenAI-compatible endpoint)
client = OpenAI(
    api_key=NVIDIA_API,
    base_url=NVIDIA_BASE_URL,
)
"""
Input:
- Context window: 128K
Output:
- Output Max Tokens: 32,768
"""
def track_time(func):
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f"[Time Tracker] `{func.__name__}` took {end - start:.4f} seconds")
        return result
    return wrapper
# # EMBEDDING_MODEL = "llama3-405b-8192-embed"
# vo = voyageai.Client()
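# %% [markdown]
# A minimal usage sketch of the `track_time` decorator defined above. `slow_add` is
# purely illustrative and not part of the app; it only makes the timing output visible.
# %%
@track_time
def slow_add(a, b):
    # Simulate a slow operation so the timing output is visible
    time.sleep(0.2)
    return a + b

# slow_add(1, 2)  # prints e.g.: [Time Tracker] `slow_add` took 0.2003 seconds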
# %% [markdown]
# # Init Pinecone
# %%
pc = Pinecone(api_key=PINECONE_API)
# %% [markdown]
# # Embedding Function
# %%
# Connect to the index
# index = pc.Index("ai-coach")
# index = pc.Index("ahsan-400pg-pdf-doc-test")
# index = pc.Index("surgical-tech-complete")  # -- COMPLETE SURGICAL TECH BOOTCAMP
index = pc.Index("quick-start")  # -- QUICK START COURSE 1
# embedding_model = AutoModel.from_pretrained(
#     'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
# # Function to generate embeddings without tokenization
# def get_embedding(data):
#     embeddings = embedding_model.encode(data).tolist()
#     return embeddings
# Generate a query embedding with the NVIDIA nv-embed-v1 model
def get_embedding(text="None"):
    response = client.embeddings.create(
        input=text,
        model="nvidia/nv-embed-v1",
        encoding_format="float",
        extra_body={"input_type": "query", "truncate": "NONE"},
    )
    # print(response.data[0].embedding)
    # print(count_tokens(response.data[0].embedding))
    return response.data[0].embedding
# get_embedding("None")
# %% [markdown]
# # Query Pinecone
# %%
# Function to query the Pinecone index using embeddings
def query_pinecone(embedding):
    # Use keyword arguments to pass the embedding and other parameters
    result = index.query(vector=embedding, top_k=5, include_metadata=True)
    return result["matches"]
print(query_pinecone(get_embedding("Network Components")))
# %% [markdown]
# # Query Groq Inference
# %% [markdown]
# | Use Case | Recommended top_p | Notes |
# |----------|------------------|-------|
# | Factual Q&A | 0.1 - 0.3 | Lower values for more deterministic, factual responses |
# | Code Generation | 0.2 - 0.5 | Precision matters more than creativity |
# | Technical Writing | 0.5 - 0.7 | Balanced approach for technical accuracy with clarity |
# | General Conversation | 0.7 - 0.9 | Good balance for most chatbot applications |
# | Creative Writing | 0.9 - 1.0 | Higher values for more diverse and creative outputs |
#
# | Parameter Combination | Use Case |
# |----------------------|----------|
# | top_p=0.5, temperature=0.3 | Highly factual, consistent responses |
# | top_p=0.7, temperature=0.5 | Educational content with examples |
# | top_p=0.9, temperature=0.7 | Creative but coherent responses |
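# %% [markdown]
# A minimal sketch (not part of the original app) of how the parameter combinations
# from the table above could be applied to the Groq call. The preset names and the
# `sampling_params_for` helper are illustrative assumptions.
# %%
# Hypothetical presets taken from the table above
SAMPLING_PRESETS = {
    "factual": {"top_p": 0.5, "temperature": 0.3},       # highly factual, consistent
    "educational": {"top_p": 0.7, "temperature": 0.5},   # educational content with examples
    "creative": {"top_p": 0.9, "temperature": 0.7},      # creative but coherent
}

def sampling_params_for(use_case: str) -> dict:
    # Fall back to the "educational" preset if the use case is unknown
    return SAMPLING_PRESETS.get(use_case, SAMPLING_PRESETS["educational"])

# Example: unpack the preset into the chat completion call, e.g.
# client.chat.completions.create(model=LLM_MODEL, messages=..., **sampling_params_for("factual"))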
# %%
# Modified query_groq function with more explicit streaming handling
def query_groq(user_prompt, sys_prompt):
    client = Groq(api_key=os.environ["GROQ_API_KEY"])
    # Always use streaming mode
    return client.chat.completions.create(
        model=LLM_MODEL,  # or whichever model you're using
        temperature=0.5,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_prompt},
        ],
        stream=True,
        # top_p=0.7,  # testing for better results
    )
# Print all tool calls
# print(completion.choices[0].message.executed_tools)
# Tokenizer to count the number of tokens
"""
The tokenizer is initialized outside the function to avoid reinitialization and to optimize performance.
"""
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")
def count_tokens(text: str) -> int:
    # Encode the text into tokens
    tokens = tokenizer.encode(text)
    return len(tokens)
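# %% [markdown]
# A minimal sketch (an assumption, not part of the original app) showing how
# `count_tokens` could be used to sanity-check a prompt against the 128K context
# window noted earlier. `MAX_CONTEXT_TOKENS` and `fits_context_window` are
# illustrative names; since the tokenizer here is Jina's, the counts are only
# approximate for the Groq-hosted Llama model.
# %%
MAX_CONTEXT_TOKENS = 128_000  # from the model notes above

def fits_context_window(prompt: str, reserved_for_output: int = 4_000) -> bool:
    # Leave headroom for the model's reply when checking the prompt size
    return count_tokens(prompt) + reserved_for_output <= MAX_CONTEXT_TOKENS

# print(fits_context_window("Network Components"))  # True for short prompts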
# %% [markdown]
# # Process User Query
# %% [markdown]
# ## Groq and Gradio with Streaming Enabled
#
# - ### i.e. text is shown as soon as it is generated by the Groq inference endpoint
# - ### faster than the optimized version
#
# ## Query:
# - pediatric surgery
# ## Response Time:
# - User Query Tokens: 6
# - [Time Tracker] `get_embedding` took 0.4752 seconds
# - [Time Tracker] `query_pinecone` took 0.2222 seconds
# - [Time Tracker] `query_groq` took 0.5060 seconds
#
# Total time: 1.19 seconds
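# %% [markdown]
# A minimal console sketch (an assumption, not part of the app) illustrating the
# streaming behaviour described above: chunks from `query_groq` are printed as they
# arrive instead of waiting for the full completion. The system prompt string here
# is a placeholder.
# %%
def stream_to_console(question: str):
    # Print each streamed chunk as soon as it arrives from Groq
    for chunk in query_groq(question, "You are a helpful coach."):
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)
    print()

# stream_to_console("pediatric surgery")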
# %%
# # Modified query_groq function with more explicit streaming handling
# @track_time
# def query_groq(prompt):
#     client = Groq(api_key=os.environ["GROQ_API_KEY"])
#     # Always use streaming mode
#     return client.chat.completions.create(
#         model="llama3-70b-8192",  # or whichever model you're using
#         messages=[{"role": "user", "content": prompt}],
#         stream=True,
#     )
# --------------------------------------------------------- ## Groq and Gradio with Streaming Enabled -----------------------------------------------------
# Modified process_user_query to properly yield streaming updates
def process_user_query(user_query: str, conversation_history: list, response_type: str):
    print(f"User Query Tokens: {count_tokens(user_query)}")
    # Generate embedding and get relevant context
    embedding = get_embedding(user_query)
    relevant_chunks = query_pinecone(embedding)
    context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)
    # Format conversation history for the prompt
    history_str = "\n".join(
        f"User: {user}\nCoach: {response}" for user, response in conversation_history
    )
    system_prompt = f"""
    Conversation history:
    {history_str}
    Learning materials:
    {context}
    You are an expert, knowledgeable, and friendly coach. Follow these **guidelines** carefully:
    - If the user requests a **long** response, provide a detailed, comprehensive, and in-depth explanation. Cover all relevant aspects, break down complex concepts step-by-step, include context, background, and reasoning, and use examples, analogies, and clarifications. Address possible follow-up questions and ensure the answer is thorough and educational.
    - If the user requests a **short** response, provide a concise, focused, and to-the-point answer. Summarize the key information in a clear and scannable way, using bullet points or brief sentences. Only elaborate if the user asks for more detail.
    - Provide clear, step-by-step explanations to ensure deep understanding.
    - Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
    - Ask guiding questions to encourage critical thinking.
    - Adapt your explanation to match the student's knowledge level.
    - Strictly use terminologies provided in the given context.
    - Provide short, ideal examples (2-3) to illustrate your points clearly.
    - Blend learning material with your own knowledge while ensuring answers stay within the healthcare context.
    - **Always provide all specific relevant sources by name from the context in your responses: URLs, video names, video timestamps, links, resources, *ebook names*, lesson names, lesson numbers. If the user query is not relevant to the context, do not provide any references or sources.**
    - Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
    - Must provide all relevant video timestamps with the video name (start and end). If timestamps aren't available, tell the user to watch the whole video.
    - Provide a thoughtful and contextually accurate response.
    - No repetition in responses.
    - **If the student asks something completely out of context, politely decline and ask them to ask a question related to their course. Do not provide any references or sources.**
    - Please avoid using "Bank Name".
    - When users request questions, answers, quizzes, or exams, generate high-quality educational assessments directly from the learning material. Use various question types (e.g., multiple-choice with 4-5 options, true/false, short answer, fill-in-the-blanks, essay), clear formatting, different cognitive levels (recall, comprehension, application, analysis), detailed answer keys with explanations, consistent formatting (numbered questions, lettered options, bold correct answers), difficulty adaptation, and step-by-step solutions.
    - Never generate fabricated information when providing references or sources. Only use facts, references, citations, lesson names, e-book titles, video names, and timestamps explicitly present in the provided learning materials.
    - Always provide all references and sources **only at the end of the response.** Do **not** include fill-in-the-blanks, quizzes, or Q&A content in the references and sources section. Make sure you mention what type of reference / source it is.
    - Incorporate occasional follow-up questions or prompts (e.g., "Would you like to see an example?" or "Need a quick quiz to check your understanding?") to promote engagement and deeper learning.
    - Default to concise, scannable answers. Use bullet points and bolding. Only give longer explanations and details if the user requests or implies it.
    - Do not provide references or sources within the text or at the end of a sentence; list them only under **References and Sources**.
    - For follow-up questions and queries, use the previous conversation history and context to answer the user query.
    - Be more confident and assertive in your responses.
    - At the end of each response, provide definitions for any difficult words under the heading 'Difficult Words'.
    - **If a user asks about anything related to certification, licensing, externships, or career pathways, provide clear, specific information.** This includes queries that mention or imply:
      "certification"
      "certificate"
      "get certified"
      "license" / "licensing"
      "externship"
      "internship"
      "official requirement"
      "exam eligibility"
      "career path"
      "qualification"
      "accreditation"
      "approved training"
      "enrollment"
      "program approval"
      If such keywords are present, **ask the user to contact official support for confirmation or further help,** while including the contact details below:
      Support Contact Information:
      Phone: 1-800-555-HEAL (4325)
      Email: [email protected]
      Website: www.healthylifehelp.org
      Address: 123 Wellness Blvd, Caretown, CA 90210
      Hours: Mon-Fri, 8 AM - 6 PM (PST)
    """
    # User prompt - contains the specific query and response type only (not in the system prompt)
    user_prompt = f"""
    New student question:
    "{user_query}"
    Response type requested: {response_type}
    """
    # Call query_groq with the constructed prompts; it returns a streaming response
    stream_response = query_groq(user_prompt, system_prompt)
    # The function directly yields the stream chunks for the Gradio interface to use
    full_response = ""
    # First, yield a response with empty text to set up the message
    # This creates the user message immediately
    temp_history = conversation_history.copy()
    temp_history.append((user_query, ""))
    yield temp_history, context
    # Process the stream
    for chunk in stream_response:
        if (
            hasattr(chunk.choices[0].delta, "content")
            and chunk.choices[0].delta.content is not None
        ):
            content_chunk = chunk.choices[0].delta.content
            full_response += content_chunk
            # Create a temporary history with the current response
            temp_history = conversation_history.copy()
            temp_history.append((user_query, full_response))
            # Yield the updated history for display
            yield temp_history, context
    # Return the final history with the complete response
    final_history = conversation_history.copy()
    final_history.append((user_query, full_response))
    yield final_history, context
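# %% [markdown]
# A minimal sketch (an assumption, not part of the app) of consuming the
# `process_user_query` generator outside Gradio, e.g. for quick console testing.
# Only the final yielded history holds the complete response.
# %%
def run_query_in_console(question: str, response_type: str = "short"):
    # Drain the generator; keep only the last (complete) history/context pair
    final_history, final_context = None, None
    for history, context in process_user_query(question, [], response_type):
        final_history, final_context = history, context
    # The last appended tuple is (user_query, full_response)
    print(final_history[-1][1])
    return final_history, final_context

# run_query_in_console("Network Components")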
def create_gradio_interface(conversation_history, response_type="default"):
    with gr.Blocks() as interface:
        gr.Markdown("# Quick Start AI Coaching Assistant")
        gr.Markdown("Welcome! I'm here to help you learn. Type your question below.")
        # State management
        chat_history = gr.State(conversation_history)
        with gr.Row():
            chatbot = gr.Chatbot(height=500)
            with gr.Column(scale=0.5):
                context_display = gr.Textbox(
                    label="Relevant Context", interactive=False
                )
        user_input = gr.Textbox(label="Your Question", placeholder="Type here...")
        with gr.Row():
            submit_btn = gr.Button("Submit", variant="primary")
            undo_btn = gr.Button("Undo Last")
            clear_btn = gr.Button("Clear History")

        def handle_submit(user_query, history):
            if not user_query.strip():
                # Nothing to process for empty input; keep the current state unchanged
                yield "", history, gr.update(), history
                return
            # Use the generator directly from process_user_query
            # This will yield incremental updates as they arrive
            response_generator = process_user_query(user_query, history, response_type)
            for updated_history, context in response_generator:
                # Directly update the chatbot with each streaming chunk
                yield "", updated_history, context, updated_history

        # Component interactions with streaming support
        submit_btn.click(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )
        # Add submit on Enter key press
        user_input.submit(
            handle_submit,
            [user_input, chat_history],
            [user_input, chat_history, context_display, chatbot],
        )
        undo_btn.click(
            lambda history: history[:-1] if history else [],
            [chat_history],
            [chat_history],
        ).then(lambda x: x, [chat_history], [chatbot])
        clear_btn.click(lambda: [], None, [chat_history]).then(
            lambda: ([], ""), None, [chatbot, context_display]
        )
    return interface
def main():
    """
    Main entry point for the application.
    Initializes the conversation history with a welcome message,
    creates the Gradio interface, and launches the web app.
    """
    # Initialize conversation history with a welcome message
    welcome_message = "Hi there! I'm your AI coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?"
    initial_conversation_history = [("", welcome_message)]
    # Create and launch the interface
    interface = create_gradio_interface(initial_conversation_history, "long")
    interface.launch()
if __name__ == "__main__":
    main()