import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize model and tokenizer
checkpoint = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)


def generate_response(prompt, max_tokens, temperature, top_p):
    try:
        # Format the input as a chat message and append the assistant turn
        messages = [{"role": "user", "content": prompt}]
        input_text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

        # Encode and generate
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
        outputs = model.generate(
            inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
        )

        # Decode only the newly generated tokens, skipping the prompt
        response = tokenizer.decode(
            outputs[0][inputs.shape[-1]:], skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error: {str(e)}"


# Create Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(
            label="Enter your prompt",
            placeholder="What would you like to know?",
            lines=3,
        ),
        gr.Slider(
            minimum=10,
            maximum=200,
            value=50,
            step=10,
            label="Maximum Tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.2,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.1,
            label="Top P",
        ),
    ],
    outputs=gr.Textbox(label="Generated Response", lines=5),
    title="SmolLM2-1.7B-Instruct Demo",
    description="Generate responses using the SmolLM2-1.7B-Instruct model",
    examples=[
        ["What is the capital of France?", 50, 0.2, 0.9],
        ["Explain quantum computing in simple terms.", 100, 0.3, 0.9],
        ["Write a short poem about nature.", 150, 0.7, 0.9],
    ],
)

# Launch the application
if __name__ == "__main__":
    iface.launch(share=True)
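
# --- Optional: streaming chat variant (a minimal sketch, not part of the demo above) ---
# gr.ChatInterface accepts a generator function, and transformers' TextIteratorStreamer
# yields text as it is produced, so the response appears incrementally instead of all at
# once. The function name `stream_response` and the generation settings below are
# illustrative assumptions, not values from the original demo. To try it, comment out
# the launch call above and uncomment the lines below.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def stream_response(message, history):
#     messages = [{"role": "user", "content": message}]
#     input_text = tokenizer.apply_chat_template(
#         messages, tokenize=False, add_generation_prompt=True
#     )
#     inputs = tokenizer(input_text, return_tensors="pt").to(device)
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(
#         **inputs, streamer=streamer, max_new_tokens=200,
#         do_sample=True, temperature=0.2, top_p=0.9,
#     )
#     # Run generation in a background thread while the streamer yields partial output
#     Thread(target=model.generate, kwargs=generation_kwargs).start()
#     partial = ""
#     for new_text in streamer:
#         partial += new_text
#         yield partial
#
# chat_demo = gr.ChatInterface(stream_response, title="SmolLM2-1.7B-Instruct Streaming Chat")
# chat_demo.launch(share=True)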