import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel, PeftConfig
import gc
import torch

# Global variables that track the currently loaded model and pipeline
current_model = None
current_pipe = None


def load_adapter_model(adapter_model_name):
    global current_model, current_pipe

    # If a model is already loaded, drop all references so it can be freed.
    # Rebinding to None (instead of `del`) keeps the globals defined for the
    # next call even when the fallback path below never reassigns them.
    if current_model is not None or current_pipe is not None:
        current_model = None
        current_pipe = None
        # Force garbage collection and release cached GPU memory
        gc.collect()
        torch.cuda.empty_cache()

    # The base model the adapters were trained on
    base_model_name = "unsloth/gemma-3-12b-it"

    # Load the tokenizer from the base model
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    try:
        # Method 1: try loading as a PEFT adapter on top of the base model
        print(f"Loading adapter model {adapter_model_name} on top of {base_model_name}...")

        # Loading the adapter config first validates that the repo actually
        # contains a PEFT adapter; if it does not, this raises and we fall
        # through to the other loading strategies
        PeftConfig.from_pretrained(adapter_model_name)

        # Load the base model
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            device_map="auto",
            torch_dtype="auto",
        )

        # Apply the adapter on top of the base model
        model = PeftModel.from_pretrained(base_model, adapter_model_name)
        current_model = model
    except Exception as e:
        print(f"PEFT loading failed: {e}")
        try:
            # Method 2: load directly, in case the weights are already merged
            # or stored in a different format
            print("Trying to load model directly...")
            model = AutoModelForCausalLM.from_pretrained(
                adapter_model_name,
                device_map="auto",
                torch_dtype="auto",
            )
            current_model = model
        except Exception as e2:
            print(f"Direct loading failed: {e2}")
            # Method 3: fall back to letting the pipeline resolve the name itself
            print("Falling back to using the model name in pipeline...")
            pipe = pipeline("text-generation", model=adapter_model_name)
            current_pipe = pipe
            return pipe

    # Build the pipeline from the loaded model and tokenizer
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    current_pipe = pipe
    return pipe


# Default adapter model
default_model = "Chan-Y/gemma3-12b-1204-seperate"

# Create the initial pipeline
pipe = load_adapter_model(default_model)
pipe.model_name = default_model  # Track the current model name on the pipeline


def generate_response(model_name, prompt, system_prompt, max_new_tokens, temperature, top_p, top_k):
    """Generate text using the model based on user input and advanced settings."""
    global pipe

    # Reload only when a different model is requested
    if model_name != getattr(pipe, "model_name", default_model):
        pipe = load_adapter_model(model_name)
        # Store the model name on the pipeline for tracking
        pipe.model_name = model_name

    # A batch containing one conversation, in the Gemma 3 chat format
    messages = [
        [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            },
        ],
    ]

    print("Generating response...")
    # do_sample=True is required; without it, temperature/top-p/top-k are ignored
    output = pipe(
        messages,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
    )

    # The last message of the returned conversation is the assistant's reply
    return output[0][0]["generated_text"][-1]["content"]


# Default system prompt in Turkish: it tells the model it is an assistant that
# answers in Turkish, asks it to think about the problem and show its working,
# then give the solution, and to use ONLY Turkish
default_system_prompt = """Sen kullanıcıların isteklerine Türkçe cevap veren bir asistansın ve sana bir problem verildi.
Problem hakkında düşün ve çalışmanı göster.
Çalışmanı ve arasına yerleştir.
Sonra, çözümünü ve arasına yerleştir.
Lütfen SADECE Türkçe kullan."""
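# Optional: reloading the 12B base model on every adapter switch is slow. If a
# single adapter is served long-term, it can be merged into the base weights
# once and saved as a plain model, which Method 2 above then loads directly.
# A minimal sketch, assuming the PEFT load path succeeded (merge_and_unload()
# is the standard peft API for this; the output directory name is just an
# example):
#
#   merged = current_model.merge_and_unload()       # fold LoRA deltas into the base weights
#   merged.save_pretrained("gemma3-12b-tr-merged")  # hypothetical output directory
#   AutoTokenizer.from_pretrained("unsloth/gemma-3-12b-it").save_pretrained("gemma3-12b-tr-merged")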
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Gemma 3 Reasoning Model Interface")
    gr.Markdown("Using Gemma 3 12B with Turkish reasoning adapters")

    with gr.Row():
        with gr.Column():
            # Model selection in an accordion
            with gr.Accordion("Model Selection", open=True):
                model_selector = gr.Dropdown(
                    choices=[
                        "Chan-Y/gemma3-12b-1204-seperate",
                    ],
                    value="Chan-Y/gemma3-12b-1204-seperate",
                    label="Select Model",
                    info="Choosing a new model will unload the current one to save memory",
                )

            prompt_input = gr.Textbox(
                lines=5,
                placeholder="Enter your prompt here...",
                label="Prompt",
            )

            # Advanced settings in a collapsed accordion
            with gr.Accordion("Advanced Settings", open=False):
                system_prompt = gr.Textbox(
                    lines=5,
                    value=default_system_prompt,
                    label="System Prompt",
                )
                temperature = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.75, step=0.1,
                    label="Temperature",
                )
                max_tokens = gr.Slider(
                    minimum=64, maximum=1024 * 4, value=512, step=16,
                    label="Max New Tokens",
                )
                top_p_value = gr.Slider(
                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    label="Top-p",
                )
                top_k_value = gr.Slider(
                    minimum=1, maximum=100, value=64, step=1,
                    label="Top-k",
                )

            submit_btn = gr.Button("Generate Response")

        with gr.Column():
            output_text = gr.Textbox(lines=15, label="Generated Response")

    # Wire the button to the generation function
    submit_btn.click(
        fn=generate_response,
        inputs=[
            model_selector,
            prompt_input,
            system_prompt,
            max_tokens,
            temperature,
            top_p_value,
            top_k_value,
        ],
        outputs=output_text,
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch()  # Pass share=True to create a public link
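# Usage note (a sketch, not part of the app above): long generations from a
# 12B model can outlast Gradio's default request handling, so enabling the
# request queue before launching is a common pattern. queue() and share=True
# are standard Gradio APIs; max_size=8 is an arbitrary example value:
#
#   demo.queue(max_size=8).launch(share=True)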