Spaces:

SaisExperiments
/

Sad-Llama-3.2-3B

Running

App Files Files Community

Sad-Llama-3.2-3B / app.py

SaisExperiments

Update app.py

a8e97ac verified 2 months ago

raw

history blame

5.5 kB

	import gradio as gr
	from huggingface_hub import InferenceClient
	import os

	# --- Installation Note ---
	# Ensure you have the necessary libraries installed:
	# pip install gradio huggingface_hub

	# --- Hugging Face Hub Token ---
	# Some models or kinds of usage may need a Hugging Face Hub token. Provide one
	# through the HUGGING_FACE_HUB_TOKEN environment variable or by running
	# `huggingface-cli login`; a public model may work with no token at all.
	# To hand the token over explicitly instead, read it with
	# os.getenv("HUGGING_FACE_HUB_TOKEN") and pass it as `token=` below.
	_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"

	# Remains None when construction fails; respond() checks for that case and
	# reports it to the user instead of crashing the app at import time.
	client = None
	try:
	    client = InferenceClient(_MODEL_ID)
	    print("InferenceClient initialized successfully.")
	except Exception as init_error:
	    # Deliberately not re-raised: the UI should still start so the failure
	    # can be surfaced in the chat window.
	    print(f"Error initializing InferenceClient: {init_error}")
	    print("Please ensure the model identifier is correct and you have necessary permissions/token.")


	def respond(
	    message: str,
	    history: list[tuple[str, str]],
	    system_message: str = "You are a friendly Chatbot.",  # Used when the caller supplies none
	    max_tokens: int = 512,  # Same value as the UI slider default
	    temperature: float = 0.7,  # Same value as the UI slider default
	    top_p: float = 0.95,  # Same value as the UI slider default
	):
	    """Stream a chat reply for the Gradio interface.

	    Args:
	        message: The user's newest message.
	        history: Earlier turns. Either (user, assistant) pairs or
	            "messages"-style dicts carrying "role" and "content" keys;
	            None is treated as an empty history.
	        system_message: System prompt placed first in the conversation. An
	            empty value falls back to "You are a helpful assistant.".
	        max_tokens: Forwarded to ``client.chat_completion``.
	        temperature: Forwarded to ``client.chat_completion``.
	        top_p: Forwarded to ``client.chat_completion``.

	    Yields:
	        The reply accumulated so far, once per received token, or a single
	        error string when the client is missing, the message is empty, or
	        the API call fails.
	    """
	    # --- Client Check ---
	    if client is None:
	        yield "Error: InferenceClient could not be initialized. Please check server logs."
	        return  # Nothing can be generated without a client

	    # --- Input Validation (Basic) ---
	    if not message:
	        yield "Error: Please enter a message."
	        return
	    if not system_message:
	        system_message = "You are a helpful assistant."  # Fallback system message

	    messages = [{"role": "system", "content": system_message}]
	    messages.extend(_history_to_messages(history))
	    messages.append({"role": "user", "content": message})

	    response_text = ""

	    try:
	        # Stream the response
	        for message_chunk in client.chat_completion(
	            messages=messages,
	            max_tokens=max_tokens,
	            stream=True,
	            temperature=temperature,
	            top_p=top_p,
	        ):
	            # A chunk may arrive with no choices or no delta; skip it rather
	            # than let an IndexError/AttributeError abort the whole reply.
	            choices = getattr(message_chunk, "choices", None)
	            if not choices:
	                continue
	            delta = getattr(choices[0], "delta", None)
	            token = getattr(delta, "content", None)

	            if token is not None:
	                response_text += token
	                yield response_text  # Yield the accumulated response incrementally

	    except Exception as e:
	        print(f"Error during API call: {e}")
	        # Yield a user-friendly error message
	        yield f"An error occurred while generating the response: {e}"


	def _history_to_messages(history):
	    """Convert chat history (pairs or role/content dicts) to message dicts."""
	    converted = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            # "messages"-format entry: keep only the role and content fields.
	            role = turn.get("role")
	            content = turn.get("content")
	            if role and content:
	                converted.append({"role": role, "content": content})
	            continue
	        # Pair-format entry: (user_msg, assistant_msg); either side may be empty.
	        user_msg, assistant_msg = turn
	        if user_msg:
	            converted.append({"role": "user", "content": user_msg})
	        if assistant_msg:
	            converted.append({"role": "assistant", "content": assistant_msg})
	    return converted


	# --- Gradio Interface Definition ---
	# Each widget is built on its own, in the same order the interface receives
	# them, and then handed to gr.ChatInterface.
	_chat_window = gr.Chatbot(
	    height=500,
	    label="Zephyr 7B Beta",
	    show_label=True,
	    bubble_full_width=False,  # Optional: keep bubbles from spanning the full width
	)

	# Extra inputs are passed to respond() after (message, history), in this order:
	# system message, max tokens, temperature, top-p.
	_system_message_box = gr.Textbox(
	    value="You are a friendly and helpful chatbot.",  # Default system message
	    label="System Message",
	    info="The instruction given to the chatbot to guide its behavior.",
	)
	_max_tokens_slider = gr.Slider(
	    minimum=1,
	    maximum=2048,
	    value=512,  # Default max tokens
	    step=1,
	    label="Max New Tokens",
	    info="Maximum number of tokens to generate.",
	)
	_temperature_slider = gr.Slider(
	    minimum=0.1,
	    maximum=1.0,  # Capped at 1.0: higher temperatures often degrade quality
	    value=0.7,  # Default temperature
	    step=0.1,
	    label="Temperature",
	    info="Controls randomness. Lower values make output more focused, higher values make it more diverse.",
	)
	_top_p_slider = gr.Slider(
	    minimum=0.1,
	    maximum=1.0,
	    value=0.95,  # Default top-p
	    step=0.05,
	    label="Top-p (nucleus sampling)",
	    info="Considers only the most probable tokens with cumulative probability p. Helps prevent low-probability tokens.",
	)

	_example_prompts = [
	    ["Hello, how are you today?"],
	    ["What is the capital of France?"],
	    ["Explain the concept of large language models in simple terms."],
	    ["Write a short poem about the rain."],
	]

	# NOTE(review): newer Gradio releases appear to replace
	# `additional_inputs_accordion_name` with `additional_inputs_accordion` —
	# confirm against the Gradio version this Space pins before upgrading.
	demo = gr.ChatInterface(
	    respond,
	    chatbot=_chat_window,
	    title="🤖 Zephyr 7B Beta Chat",
	    description="Chat with the Zephyr 7B Beta model using the Hugging Face Inference API. \nEnter your message and adjust settings below.",
	    examples=_example_prompts,
	    cache_examples=False,  # Set to True to cache example results if desired
	    additional_inputs=[
	        _system_message_box,
	        _max_tokens_slider,
	        _temperature_slider,
	        _top_p_slider,
	    ],
	    additional_inputs_accordion_name="⚙️ Advanced Settings",  # Group settings
	)


	if __name__ == "__main__":
	    # Start the Gradio app with default launch settings. Optional keyword
	    # arguments that can be added to launch():
	    #   share=True                 temporary public link (use with caution)
	    #   server_name="0.0.0.0"      allow access from the local network
	    #   auth=("user", "password")  basic authentication
	    demo.launch()