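"""Gradio app for comparing SmolLM2 instruct models of different sizes."""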
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Dictionary of available models
MODELS = {
    "SmolLM2-135M-Instruct": "HuggingFaceTB/SmolLM2-135M-Instruct",
    "SmolLM2-360M-Instruct": "HuggingFaceTB/SmolLM2-360M-Instruct",
    "SmolLM2-1.7B-Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct"
}

class ModelHandler:
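    """Holds the currently loaded model and tokenizer so they can be reused across requests."""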
    def __init__(self):
        self.current_model = None
        self.current_tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
    
    def load_model(self, model_name):
        try:
            checkpoint = MODELS[model_name]
            self.current_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.current_model = AutoModelForCausalLM.from_pretrained(
                checkpoint,
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            return f"Successfully loaded {model_name}"
        except Exception as e:
            return f"Error loading model: {str(e)}"

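# Single shared handler so the loaded model is reused across requests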
model_handler = ModelHandler()

def generate_text(model_name, prompt, max_tokens, temperature, top_p):
    try:
        # Load model if it's different from the current one
        if model_handler.current_model is None or MODELS[model_name] != model_handler.current_model.name_or_path:
            load_status = model_handler.load_model(model_name)
            if "Error" in load_status:
                return load_status

        # Format input as chat message
        messages = [{"role": "user", "content": prompt}]
        input_text = model_handler.current_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        
        # Tokenize
        inputs = model_handler.current_tokenizer.encode(
            input_text, 
            return_tensors="pt"
        ).to(model_handler.device)
        
        # Generate
        outputs = model_handler.current_model.generate(
            inputs,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True
        )
        
        # Decode only the newly generated tokens, skipping the prompt
        response = model_handler.current_tokenizer.decode(
            outputs[0][inputs.shape[-1]:],
            skip_special_tokens=True
        )
        return response

    except Exception as e:
        return f"Error during generation: {str(e)}"

# Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            label="Select Model",
            value="SmolLM2-360M-Instruct"
        ),
        gr.Textbox(
            label="Enter your prompt",
            placeholder="What would you like to know?",
            lines=3
        ),
        gr.Slider(
            minimum=10,
            maximum=500,
            value=50,
            step=10,
            label="Maximum Tokens"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.2,
            step=0.1,
            label="Temperature"
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.1,
            label="Top P"
        )
    ],
    outputs=gr.Textbox(label="Generated Response", lines=5),
    title="SmolLM2 Model Comparison",
    description="""
    Compare different sizes of SmolLM2 models:
    - SmolLM2-135M-Instruct: Smallest and fastest
    - SmolLM2-360M-Instruct: Balanced size and performance
    - SmolLM2-1.7B-Instruct: Largest and most capable
    """,
    examples=[
        ["SmolLM2-360M-Instruct", "What is the capital of France?", 50, 0.2, 0.9],
        ["SmolLM2-360M-Instruct", "Explain quantum computing in simple terms.", 200, 0.3, 0.9],
        ["SmolLM2-360M-Instruct", "Write a short poem about nature.", 100, 0.7, 0.9]
    ]
)

# Launch the application
if __name__ == "__main__":
    iface.launch(share=True)