"""Gradio chat demo for the Zurich 1.5B GCv2-5m model.

Loads the model and tokenizer once at import time, exposes a ``generate``
callback for ``gr.ChatInterface`` and builds a small UI with sliders for the
generation settings.
"""

import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "rubenroy/Zurich-1.5B-GCv2-5m"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attribution to Qwen is not included to prevent hallucinations.
# NOTE(review): "helpul" looks like a typo for "helpful"; it is left unchanged
# because the prompt text is runtime behavior - confirm before editing.
SYSTEM_PROMPT = (
    "You are a helpul assistant named Zurich, a 1.5 billion parameter Large "
    "Language model, you were fine-tuned and trained by Ruben Roy. You have "
    "been trained with the GammaCorpus v2 dataset, a dataset filled with "
    "structured and filtered multi-turn conversations, this was also made by "
    "Ruben Roy."
)


def _content_to_text(content):
    """Return the plain-text part of a chat message's content, or ``""``.

    Accepts a plain string, or a list of content parts where text parts look
    like ``{"type": "text", "text": ...}``. Anything else (``None``, file
    attachments, unknown objects) yields an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        return "".join(
            part.get("text", "")
            for part in content
            if isinstance(part, dict) and part.get("type") == "text"
        )
    return ""


def _build_messages(message, chat_history):
    """Build the chat-template message list: system prompt, history, new turn.

    ``chat_history`` may be empty/``None``, a list of ``[user, assistant]``
    pairs, or a list of ``{"role": ..., "content": ...}`` dicts; entries
    without usable text are skipped.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    for turn in chat_history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _content_to_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            # Pair format: first element is the user turn, second the reply.
            for role, content in zip(("user", "assistant"), turn):
                text = _content_to_text(content)
                if text:
                    messages.append({"role": role, "content": text})

    messages.append({"role": "user", "content": message})
    return messages


@spaces.GPU
def generate(message, chat_history, temperature=0.7, top_p=0.9, top_k=50,
             max_new_tokens=512, repetition_penalty=1.1):
    """Generate the assistant's reply to ``message``.

    Args:
        message: The latest user message.
        chat_history: Earlier turns of the conversation as supplied by
            ``gr.ChatInterface``; they are included in the prompt so the
            model sees the whole conversation.
        temperature: Sampling temperature; a value of 0 (or less) switches
            to greedy decoding.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k cutoff (used only when sampling).
        max_new_tokens: Maximum number of tokens to generate.
        repetition_penalty: Repetition penalty passed to ``model.generate``.

    Returns:
        The decoded reply text with special tokens removed.
    """
    messages = _build_messages(message, chat_history)
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    temperature = float(temperature)
    do_sample = temperature > 0
    generation_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "repetition_penalty": float(repetition_penalty),
        "do_sample": do_sample,
    }
    if do_sample:
        # Sampling-only knobs are passed only when sampling, so a temperature
        # of 0 is never handed to the generation config.
        generation_kwargs.update(
            temperature=temperature,
            top_p=float(top_p),
            top_k=int(top_k),
        )

    generated_ids = model.generate(**model_inputs, **generation_kwargs)

    # The output starts with the prompt tokens; keep only the new ones.
    prompt_length = model_inputs.input_ids.shape[1]
    reply_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)


TITLE_HTML = """

Zurich

GammaCorpus v2-5m

Fine-tuned from Qwen 2.5 1.5B Instruct | Model: Zurich-1.5B-GCv2-5m | Training Dataset: GammaCorpus v2 5m

1.5B Models

Zurich 1.5B GCv2 5m Zurich 1.5B GCv2 1m Zurich 1.5B GCv2 500k Zurich 1.5B GCv2 100k Zurich 1.5B GCv2 50k Zurich 1.5B GCv2 10k

7B Models

Zurich 7B GCv2 5m Zurich 7B GCv2 1m Zurich 7B GCv2 500k Zurich 7B GCv2 100k Zurich 7B GCv2 50k Zurich 7B GCv2 10k

14B Models

Zurich 14B GCv2 5m Zurich 14B GCv2 1m Zurich 14B GCv2 500k Zurich 14B GCv2 100k Zurich 14B GCv2 50k Zurich 14B GCv2 10k

"""

# Example prompts shown under the chat box.
# NOTE(review): "intersting" in the last example looks like a typo; the text
# is user-facing runtime data and is left unchanged here.
examples = [
    ["Explain quantum computing in simple terms"],
    ["Write a short story about a time traveler"],
    ["Explain the process of photosynthesis"],
    ["Tell me an intersting fact about Palm trees"],
]

with gr.Blocks() as demo:
    gr.HTML(TITLE_HTML)

    with gr.Accordion("Generation Settings", open=False):
        with gr.Row():
            with gr.Column():
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Higher values make the output more random, lower values make it more deterministic",
                    interactive=True,
                )
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top P",
                    info="Controls the cumulative probability threshold for nucleus sampling",
                    interactive=True,
                )
                top_k = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=50,
                    step=1,
                    label="Top K",
                    info="Limits the number of tokens to consider for each generation step",
                    interactive=True,
                )
            with gr.Column():
                max_new_tokens = gr.Slider(
                    minimum=1,
                    maximum=2048,
                    value=512,
                    step=1,
                    label="Max New Tokens",
                    info="Maximum number of tokens to generate in the response",
                    interactive=True,
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty",
                    info="Higher values stop the model from repeating the same info",
                    interactive=True,
                )

    # The sliders are passed to ``generate`` positionally after
    # (message, chat_history), so their order must match its signature.
    chatbot = gr.ChatInterface(
        fn=generate,
        additional_inputs=[
            temperature,
            top_p,
            top_k,
            max_new_tokens,
            repetition_penalty,
        ],
        examples=examples,
    )

if __name__ == "__main__":
    demo.launch(share=True)