import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "CardinalOperations/ORLM-LLaMA-3-8B"
device_map = "cuda"
HF_TOKEN = os.environ.get("HF_TOKEN", None)


def load_model() -> AutoModelForCausalLM:
    # bfloat16 halves the memory footprint so the 8B model fits on a single GPU.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
        token=HF_TOKEN,
    )


def load_tokenizer() -> AutoTokenizer:
    return AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)


# Load the model and tokenizer once at startup rather than on every request.
model = load_model()
tokenizer = load_tokenizer()


def preprocess_messages(message: str, history: list, system_prompt: str) -> str:
    # Rebuild the conversation: system prompt, previous turns, then the new user message.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)


def generate_text(prompt: str, max_new_tokens: int, temperature: float) -> str:
    # The prompt must be tokenized before it is passed to model.generate().
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Offset keeps the temperature strictly positive, since do_sample=True
    # requires a non-zero temperature even when the slider sits at 0.
    temp = temperature + 0.1
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=temp,
        top_p=0.9,
    )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    return tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )


def chat_function(
    message: str,
    history: list,
    system_prompt: str,
    max_new_tokens: int,
    temperature: float,
) -> str:
    prompt = preprocess_messages(message, history, system_prompt)
    return generate_text(prompt, max_new_tokens, temperature)


gr.ChatInterface(
    chat_function,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(placeholder="Enter message here", container=False, scale=7),
    title="ORLM Chat",
    description="Chat with ORLM",
    theme="soft",
    additional_inputs=[
        gr.Textbox("You shall answer to all the questions as very smart AI", label="System Prompt"),
        gr.Slider(512, 4096, label="Max New Tokens"),
        gr.Slider(0, 1, label="Temperature"),
    ],
).launch(debug=True)