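"""Gradio demo for comparing SmolLM2 instruct models of different sizes.

A dropdown selects one of three SmolLM2 checkpoints from the Hugging Face Hub;
the chosen model is loaded on demand and used to generate a response to the
user's prompt with adjustable max tokens, temperature, and top-p.
"""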
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
# Dictionary of available models
MODELS = {
"SmolLM2-135M-Instruct": "HuggingFaceTB/SmolLM2-135M-Instruct",
"SmolLM2-360M-Instruct": "HuggingFaceTB/SmolLM2-360M-Instruct",
"SmolLM2-1.7B-Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct"
}
class ModelHandler:
    def __init__(self):
        self.current_model = None
        self.current_tokenizer = None
        # Use the GPU when one is available, otherwise fall back to the CPU
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_name):
        try:
            checkpoint = MODELS[model_name]
            self.current_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.current_model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            return f"Successfully loaded {model_name}"
        except Exception as e:
            return f"Error loading model: {str(e)}"
model_handler = ModelHandler()
def generate_text(model_name, prompt, max_tokens, temperature, top_p):
    try:
        # Load model if it's different from the current one
        if model_handler.current_model is None or MODELS[model_name] != model_handler.current_model.name_or_path:
            load_status = model_handler.load_model(model_name)
            if "Error" in load_status:
                return load_status

        # Format input as chat message
        messages = [{"role": "user", "content": prompt}]
        input_text = model_handler.current_tokenizer.apply_chat_template(messages, tokenize=False)

        # Tokenize
        inputs = model_handler.current_tokenizer.encode(
            input_text,
            return_tensors="pt"
        ).to(model_handler.device)

        # Generate
        outputs = model_handler.current_model.generate(
            inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True
        )

        # Decode and return
        response = model_handler.current_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error during generation: {str(e)}"
# Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            label="Select Model",
            value="SmolLM2-360M-Instruct"
        ),
        gr.Textbox(
            label="Enter your prompt",
            placeholder="What would you like to know?",
            lines=3
        ),
        gr.Slider(
            minimum=10,
            maximum=500,
            value=50,
            step=10,
            label="Maximum Tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.2,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.1,
            label="Top P"
        )
    ],
    outputs=gr.Textbox(label="Generated Response", lines=5),
    title="SmolLM2 Model Comparison",
    description="""
    Compare different sizes of SmolLM2 models:
    - SmolLM2-135M-Instruct: Smallest and fastest
    - SmolLM2-360M-Instruct: Balanced size and performance
    - SmolLM2-1.7B-Instruct: Largest and most capable
    """,
    examples=[
        ["SmolLM2-360M-Instruct", "What is the capital of France?", 50, 0.2, 0.9],
        ["SmolLM2-360M-Instruct", "Explain quantum computing in simple terms.", 200, 0.3, 0.9],
        ["SmolLM2-360M-Instruct", "Write a short poem about nature.", 100, 0.7, 0.9]
    ]
)
# Launch the application
if __name__ == "__main__":
    iface.launch(share=True)