# %% [markdown]
# # Changes:
# - Option to swap between long and short responses (tooltip feature)
# - Definitions for difficult words are now provided at the end of each response
# %% [markdown]
# # All imports and inits
# %%
import gradio as gr
import os
import time
from dotenv import load_dotenv
from transformers import AutoTokenizer
# from typing import List, Tuple
from openai import OpenAI
from pinecone import Pinecone
from groq import Groq
load_dotenv()
DATA_PATH = os.getenv("DATA_PATH")
PINECONE_API = os.getenv("PINECONE_API")
# PINECONE_ENV = os.getenv("PINECONE_ENV")
NVIDIA_API = os.getenv("NVIDIA_API")
NVIDIA_BASE_URL = os.getenv("NVIDIA_BASE_URL")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GROQ_CHAT_URL = os.getenv("GROQ_CHAT_URL")
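# For local testing, a `.env` file might look roughly like the sketch below
# (placeholder values only; the variable names mirror the os.getenv calls above,
# while the actual values depend on your own Pinecone / NVIDIA / Groq accounts):
#
#   DATA_PATH=./data
#   PINECONE_API=pc-xxxxxxxxxxxxxxxx
#   NVIDIA_API=nvapi-xxxxxxxxxxxxxxxx
#   NVIDIA_BASE_URL=https://integrate.api.nvidia.com/v1
#   GROQ_API_KEY=gsk_xxxxxxxxxxxxxxxx
#   GROQ_CHAT_URL=https://api.groq.com/openai/v1/chat/completions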
# Configure headers for Groq API requests
GROQ_HEADERS = {
"Authorization": f"Bearer {GROQ_API_KEY}",
"Content-Type": "application/json",
}
# LLM_MODEL = "llama3-70b-8192"
LLM_MODEL = "llama-3.3-70b-versatile"
# NVidia Embedding import
client = OpenAI(
api_key=NVIDIA_API,
base_url=NVIDIA_BASE_URL,
)
"""
Input:
- Context window: 128K
Output:
- Output Max Tokens: 32,768
"""
def track_time(func):
def wrapper(*args, **kwargs):
start = time.perf_counter()
result = func(*args, **kwargs)
end = time.perf_counter()
print(f"[Time Tracker] `{func.__name__}` took {end - start:.4f} seconds")
return result
return wrapper
# # EMBEDDING_MODEL = "llama3-405b-8192-embed"
# vo = voyageai.Client()
# %% [markdown]
# # Init Pinecone
# %%
pc = Pinecone(api_key=PINECONE_API)
# %% [markdown]
# # Embedding Function
#
#
# %%
# Connect to the index
# index = pc.Index("ai-coach")
# index = pc.Index("ahsan-400pg-pdf-doc-test")
# index = pc.Index("surgical-tech-complete") # -- COMPLETE SURGICAL TECH BOOTCAMP
index = pc.Index("quick-start") # -- QUICK START COURSE 1
# embedding_model = AutoModel.from_pretrained(
# 'jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
# # Function to generate embeddings without tokenization
# def get_embedding(data):
# embeddings = embedding_model.encode(data).tolist()
# return embeddings
@track_time
def get_embedding(text="None"):
response = client.embeddings.create(
input=text,
model="nvidia/nv-embed-v1",
encoding_format="float",
extra_body={"input_type": "query", "truncate": "NONE"},
)
# print(response.data[0].embedding)
# print(count_tokens(response.data[0].embedding))
return response.data[0].embedding
# get_embedding("None")
# %% [markdown]
# # Query Pinecone
#
# %%
# Function to query Pinecone index using embeddings
@track_time
def query_pinecone(embedding):
# Use keyword arguments to pass the embedding and other parameters
result = index.query(vector=embedding, top_k=5, include_metadata=True)
return result["matches"]
print(query_pinecone(get_embedding("Network Components")))
# %% [markdown]
# # Query Groq Inference
# %% [markdown]
# | Use Case | Recommended top_p | Notes |
# |----------|------------------|-------|
# | Factual Q&A | 0.1 - 0.3 | Lower values for more deterministic, factual responses |
# | Code Generation | 0.2 - 0.5 | Precision matters more than creativity |
# | Technical Writing | 0.5 - 0.7 | Balanced approach for technical accuracy with clarity |
# | General Conversation | 0.7 - 0.9 | Good balance for most chatbot applications |
# | Creative Writing | 0.9 - 1.0 | Higher values for more diverse and creative outputs |
#
#
# | Parameter Combination | Use Case |
# |----------------------|----------|
# | top_p=0.5, temperature=0.3 | Highly factual, consistent responses |
# | top_p=0.7, temperature=0.5 | Educational content with examples |
# | top_p=0.9, temperature=0.7 | Creative but coherent responses |
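# %% [markdown]
# The table above maps onto keyword arguments of `chat.completions.create`. Below is a
# minimal sketch of how those parameter combinations could be wired up; the preset names
# and this helper are illustrative only and are not used elsewhere in the app.
# %%
PARAM_PRESETS = {
    "factual": {"top_p": 0.5, "temperature": 0.3},      # highly factual, consistent
    "educational": {"top_p": 0.7, "temperature": 0.5},  # educational content with examples
    "creative": {"top_p": 0.9, "temperature": 0.7},     # creative but coherent
}


def query_groq_with_preset(user_prompt, sys_prompt, preset="educational"):
    """Same streaming call as `query_groq` (next cell), but with a named sampling preset."""
    groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
    return groq_client.chat.completions.create(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_prompt},
        ],
        stream=True,
        **PARAM_PRESETS[preset],
    )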
# %%
# Modified query_groq function with more explicit streaming handling
@track_time
def query_groq(user_prompt, sys_prompt):
client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Always use streaming mode
return client.chat.completions.create(
model=LLM_MODEL, # or whichever model you're using
temperature=0.5,
messages=[
{"role": "system", "content": sys_prompt},
{"role": "user", "content": user_prompt},
],
stream=True,
# top_p=0.7, # testing for better results
)
# Print all tool calls
# print(completion.choices[0].message.executed_tools)
# Tokenizer to count number of tokens
"""
Putting tokenizer outside of the function to avoid reinitialization and optimize performance.
"""
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v2-base-en")
@track_time
def count_tokens(text: str) -> int:
# Encode the text into tokens
tokens = tokenizer.encode(text)
return len(tokens)
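# %% [markdown]
# A small helper sketch (not wired into the app) showing how `count_tokens` could be used
# to sanity-check prompt size against the model limits noted near the top of the file
# (128K context window). The check is approximate, since `count_tokens` uses the Jina
# tokenizer rather than the Llama tokenizer used by the Groq model.
# %%
def fits_context_window(text: str, max_tokens: int = 128_000) -> bool:
    """Return True if the (approximate) token count of `text` fits the context window."""
    return count_tokens(text) <= max_tokens


# Example: fits_context_window(user_prompt) could be checked before calling query_groq.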
# %% [markdown]
# # Process User Query
# %% [markdown]
# ## Groq and Gradio with Streaming Enabled
#
# - ### i.e. text will start showing as soon as it is generated by the Groq inference endpoint
# - ### faster than the optimized version
#
# ## Query:
# - pediatric surgery
# ## Response Time:
# User Query Tokens: 6
# [Time Tracker] `get_embedding` took 0.4752 seconds
# [Time Tracker] `query_pinecone` took 0.2222 seconds
# [Time Tracker] `query_groq` took 0.5060 seconds
#
# Total time: 1.19 seconds
# %%
# # Modified query_groq function with more explicit streaming handling
# @track_time
# def query_groq(prompt):
# client = Groq(api_key=os.environ["GROQ_API_KEY"])
# # Always use streaming mode
# return client.chat.completions.create(
# model="llama3-70b-8192", # or whichever model you're using
# messages=[{"role": "user", "content": prompt}],
# stream=True,
# )
# --------------------------------------------------------- ## Groq and Gradio with Streaming Enabled -----------------------------------------------------
# Modified process_user_query to properly yield streaming updates
@track_time
def process_user_query(user_query: str, conversation_history: list, response_type: str):
print(f"User Query Tokens: {count_tokens(user_query)}")
# Generate embedding and get relevant context
embedding = get_embedding(user_query)
relevant_chunks = query_pinecone(embedding)
context = "\n".join(chunk["metadata"]["text"] for chunk in relevant_chunks)
# Format conversation history for the prompt
history_str = "\n".join(
f"User: {user}\nCoach: {response}" for user, response in conversation_history
)
system_prompt = f"""
Conversation history:
{history_str}
learning materials:
{context}
You are an expert, knowledgeable, and friendly coach. Follow these **guidelines** carefully:
- If the user requests a **long** response, provide a detailed, comprehensive, and in-depth explanation. Cover all relevant aspects, break down complex concepts step-by-step, include context, background, and reasoning, and use examples, analogies, and clarifications. Address possible follow-up questions and ensure the answer is thorough and educational.
- If the user requests a **short** response, provide a concise, focused, and to-the-point answer. Summarize the key information in a clear and scannable way, using bullet points or brief sentences. Only elaborate if the user asks for more detail.
- Provide clear, step-by-step explanations to ensure deep understanding.
- Use chain-of-thought reasoning to thoroughly evaluate the provided context before responding.
- Ask guiding questions to encourage critical thinking.
- Adapt your explanation to match the student's knowledge level.
- Strictly use terminologies provided in the given context.
- Provide short, ideal examples (2–3) to illustrate your points clearly.
- Blend learning material with your own knowledge while ensuring answers stay within the healthcare context.
- **Always provide all specific relevant sources with name from the context in your responses: URLs, video names, video timestamps, links, resources, *ebook names*, lesson names, lesson numbers. If the user query is not relevant to the context, do not provide any references and sources.**
- Perform sentiment analysis based on conversation history and user queries to adapt your responses empathetically and effectively.
- Must provide all relevant video timestamps with video name (start and end). If timestamps aren't available, tell the user to watch the whole video.
- Provide a thoughtful and contextually accurate response.
- No repetition in responses.
- **If student asks something completely out of context, politely decline and ask them to ask a question related to their course. Do not provide any references or sources.**
- Please avoid using "Bank Name".
- When users request questions, answers, quizzes, or exams, generate high-quality educational assessments directly from the learning material. Use various question types (e.g., multiple-choice with 4–5 options, true/false, short answer, fill-in-the-blanks, essay), clear formatting, different cognitive levels (recall, comprehension, application, analysis), detailed answer keys with explanations, consistent formatting (numbered questions, lettered options, bold correct answers), difficulty adaptation, and step-by-step solutions.
- Never generate fabricated information when providing references or sources. Only use facts, references, citations, lesson names, e-book titles, video names, and timestamps explicitly present in the provided learning materials.
- Always provide all references and sources **only at the end of the response**. Do **not** include fill-in-the-blanks, quizzes, or Q&A content in the references and sources section. Make sure you mention what type of reference / source it is.
- Incorporate occasional follow-up questions or prompts (e.g., β€œWould you like to see an example?” or β€œNeed a quick quiz to check your understanding?”) to promote engagement and deeper learning.
- Default to concise, scannable answers. Use bullet points and bolding. Only give longer explanation and details if the user requests it or implies it.
- Do not provide references or sources within the text or at the end of a sentence; list them only under **References and Sources**.
- For follow-up questions and queries, use the previous conversation history and context to answer the user query.
- Be more confident and assertive in your responses.
- At the end of each response, provide definitions for any difficult words under the heading 'Difficult Words'.
- **If a user asks about anything related to certification, licensing, externships, or career pathways, provide clear, specific information.** This includes queries that mention or imply:
β€œcertification”
β€œcertificate”
β€œget certified”
β€œlicense” / β€œlicensing”
β€œexternship”
β€œinternship”
β€œofficial requirement”
β€œexam eligibility”
β€œcareer path”
β€œqualification”
β€œaccreditation”
β€œapproved training”
β€œenrollment”
β€œprogram approval”
If such keywords are present, **ask the user to contact official support for confirmation or further help,** while including the contact details below:
Support Contact Information:
πŸ“ž Phone: 1-800-555-HEAL (4325)
πŸ“§ Email: [email protected]
🌐 Website: www.healthylifehelp.org
πŸ₯ Address: 123 Wellness Blvd, Caretown, CA 90210
πŸ•’ Hours: Mon–Fri, 8 AM – 6 PM (PST)
"""
# User prompt - contains the specific query and response type only (not in system prompt)
user_prompt = f"""
New student question:
"{user_query}"
Response type requested: {response_type}
"""
    # Query Groq (streaming) with the composed system and user prompts
stream_response = query_groq(user_prompt, system_prompt)
# The function now directly yields the stream chunks for the Gradio interface to use
full_response = ""
# First, yield a response with empty text to set up the message
# This creates the user message immediately
temp_history = conversation_history.copy()
temp_history.append((user_query, ""))
yield temp_history, context
# Process the stream
for chunk in stream_response:
if (
hasattr(chunk.choices[0].delta, "content")
and chunk.choices[0].delta.content is not None
):
content_chunk = chunk.choices[0].delta.content
full_response += content_chunk
# Create a temporary history with the current response
temp_history = conversation_history.copy()
temp_history.append((user_query, full_response))
# Yield the updated history for display
yield temp_history, context
# Return the final history with the complete response
final_history = conversation_history.copy()
final_history.append((user_query, full_response))
yield final_history, context
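# %% [markdown]
# Quick manual test of the streaming pipeline outside of Gradio. Kept commented out
# because it calls the embedding, Pinecone, and Groq APIs; the sample question is
# illustrative only.
# %%
# final_history = None
# for history, ctx in process_user_query("What are network components?", [], "short"):
#     final_history = history
# print(final_history[-1][1])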
@track_time
def create_gradio_interface(conversation_history, response_type="default"):
with gr.Blocks() as interface:
gr.Markdown("# πŸ§‘β€πŸ« Quick Start AI Coaching Assistant")
gr.Markdown("Welcome! I'm here to help you learn. Type your question below.")
# State management
chat_history = gr.State(conversation_history)
with gr.Row():
chatbot = gr.Chatbot(height=500)
with gr.Column(scale=0.5):
context_display = gr.Textbox(
label="Relevant Context", interactive=False
)
user_input = gr.Textbox(label="Your Question", placeholder="Type here...")
with gr.Row():
submit_btn = gr.Button("Submit", variant="primary")
undo_btn = gr.Button("Undo Last")
clear_btn = gr.Button("Clear History")
        def handle_submit(user_query, history):
            if not user_query.strip():
                # handle_submit is a generator, so an empty submission must yield a
                # full set of outputs (input box, history state, context, chatbot)
                # rather than returning values, which Gradio would ignore.
                yield gr.update(), history, "", history
                return
# Use the generator directly from process_user_query
# This will yield incremental updates as they arrive
response_generator = process_user_query(user_query, history, response_type)
for updated_history, context in response_generator:
# Directly update the chatbot with each streaming chunk
yield "", updated_history, context, updated_history
# Component interactions with streaming support
submit_btn.click(
handle_submit,
[user_input, chat_history],
[user_input, chat_history, context_display, chatbot],
)
# Add submit on Enter key press
user_input.submit(
handle_submit,
[user_input, chat_history],
[user_input, chat_history, context_display, chatbot],
)
undo_btn.click(
lambda history: history[:-1] if history else [],
[chat_history],
[chat_history],
).then(lambda x: x, [chat_history], [chatbot])
clear_btn.click(lambda: [], None, [chat_history]).then(
lambda: ([], ""), None, [chatbot, context_display]
)
return interface
def main():
"""
Main entry point for the application.
Initializes the conversation history with a welcome message,
creates the Gradio interface, and launches the web app.
"""
# Initialize conversation history with welcome message
welcome_message = "Hi there! I'm your AI coach. I can help answer questions about your course materials, explain difficult concepts, and guide your learning journey. What would you like to know today?"
initial_conversation_history = [("", welcome_message)]
# Create and launch the interface
interface = create_gradio_interface(initial_conversation_history, "long")
interface.launch()
if __name__ == "__main__":
main()