DexterSptizu's picture
Create app.py
5101e06 verified
raw
history blame
2.29 kB
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Model setup. This runs once at import time so every request handled by
# generate_response reuses the same tokenizer and weights.
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

# Use the GPU when torch reports one is available; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the weights first, then move the module onto the selected device.
model = AutoModelForCausalLM.from_pretrained(checkpoint)
model = model.to(device)
def generate_response(prompt, max_tokens, temperature, top_p):
    """Generate the model's reply to a single user prompt.

    Args:
        prompt: User message text; sent to the model as a one-turn chat.
        max_tokens: Upper bound on newly generated tokens. UI sliders may
            deliver this as a float, so it is coerced to ``int``.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        The assistant's reply only (the prompt is not echoed back), or a
        string starting with ``"Error: "`` when the prompt is empty or
        generation raises.
    """
    if not prompt or not prompt.strip():
        return "Error: prompt is empty"

    try:
        messages = [{"role": "user", "content": prompt}]
        # add_generation_prompt opens the assistant turn in the rendered
        # text, so the model answers instead of continuing the user turn.
        input_text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # The rendered template already carries its special tokens, so do
        # not add them a second time. Calling the tokenizer (rather than
        # .encode) also yields the attention mask that generate() expects.
        encoded = tokenizer(
            input_text,
            return_tensors="pt",
            add_special_tokens=False,
        ).to(device)
        input_ids = encoded["input_ids"]
        prompt_length = input_ids.shape[-1]

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=encoded["attention_mask"],
            max_new_tokens=int(max_tokens),
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )

        # generate() returns prompt + continuation; keep only the new tokens
        # so the UI shows the answer rather than the templated conversation.
        new_token_ids = outputs[0][prompt_length:]
        return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    except Exception as e:
        # UI boundary: surface the failure as text in the output box
        # instead of letting the request crash.
        return f"Error: {e}"
# Build the Gradio UI. The four input components are passed, in order, as
# the four arguments of generate_response; its return value fills the
# single output box.
prompt_box = gr.Textbox(
    label="Enter your prompt",
    placeholder="What would you like to know?",
    lines=3,
)
max_tokens_slider = gr.Slider(
    minimum=10,
    maximum=200,
    value=50,
    step=10,
    label="Maximum Tokens",
)
temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.2,
    step=0.1,
    label="Temperature",
)
top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.9,
    step=0.1,
    label="Top P",
)
response_box = gr.Textbox(label="Generated Response", lines=5)

# Each example row is ordered like the inputs list:
# prompt, maximum tokens, temperature, top-p.
example_rows = [
    ["What is the capital of France?", 50, 0.2, 0.9],
    ["Explain quantum computing in simple terms.", 100, 0.3, 0.9],
    ["Write a short poem about nature.", 150, 0.7, 0.9],
]

iface = gr.Interface(
    fn=generate_response,
    inputs=[prompt_box, max_tokens_slider, temperature_slider, top_p_slider],
    outputs=response_box,
    title="SmolLM2-1.7B-Instruct Demo",
    description="Generate responses using the SmolLM2-1.7B-Instruct model",
    examples=example_rows,
)
# Script entry point: start the web app only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    # share=True is passed through to Gradio's launch() unchanged.
    launch_options = {"share": True}
    iface.launch(**launch_options)