import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Dictionary of available models
MODELS = {
    "SmolLM2-135M-Instruct": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "SmolLM2-360M-Instruct": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2-1.7B-Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct"
}


class ModelHandler:
    def __init__(self):
        self.current_model = None
        self.current_tokenizer = None
        # Use the GPU when available, otherwise fall back to the CPU
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, model_name):
        try:
            checkpoint = MODELS[model_name]
            self.current_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.current_model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            return f"Successfully loaded {model_name}"
        except Exception as e:
            return f"Error loading model: {str(e)}"


model_handler = ModelHandler()


def generate_text(model_name, prompt, max_tokens, temperature, top_p):
    try:
        # Load the model if it's different from the current one
        if (
            model_handler.current_model is None
            or MODELS[model_name] != model_handler.current_model.name_or_path
        ):
            load_status = model_handler.load_model(model_name)
            if "Error" in load_status:
                return load_status

        # Format the input as a chat message and append the assistant turn prompt
        messages = [{"role": "user", "content": prompt}]
        input_text = model_handler.current_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Tokenize
        inputs = model_handler.current_tokenizer.encode(
            input_text, return_tensors="pt"
        ).to(model_handler.device)

        # Generate
        outputs = model_handler.current_model.generate(
            inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True
        )

        # Decode and return
        response = model_handler.current_tokenizer.decode(
            outputs[0], skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error during generation: {str(e)}"


# Create the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            label="Select Model",
            value="SmolLM2-360M-Instruct"
        ),
        gr.Textbox(
            label="Enter your prompt",
            placeholder="What would you like to know?",
            lines=3
        ),
        gr.Slider(
            minimum=10,
            maximum=500,
            value=50,
            step=10,
            label="Maximum Tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.2,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.1,
            label="Top P"
        )
    ],
    outputs=gr.Textbox(label="Generated Response", lines=5),
    title="SmolLM2 Model Comparison",
    description="""
    Compare different sizes of SmolLM2 models:
    - SmolLM2-135M-Instruct: Smallest and fastest
    - SmolLM2-360M-Instruct: Balanced size and performance
    - SmolLM2-1.7B-Instruct: Largest and most capable
    """,
    examples=[
        ["SmolLM2-360M-Instruct", "What is the capital of France?", 50, 0.2, 0.9],
        ["SmolLM2-360M-Instruct", "Explain quantum computing in simple terms.", 200, 0.3, 0.9],
        ["SmolLM2-360M-Instruct", "Write a short poem about nature.", 100, 0.7, 0.9]
    ]
)

# Launch the application
if __name__ == "__main__":
    iface.launch(share=True)