import gradio as gr import plotly.express as px import os import torch from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BlenderbotForConditionalGeneration # Check if CUDA is available and set device accordingly device = "cuda" if torch.cuda.is_available() else "cpu" # Set environment variables for GPU usage and memory allocation if CUDA is available if device == "cuda": os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' torch.cuda.empty_cache() torch.cuda.set_per_process_memory_fraction(0.8) # Adjust the fraction as needed # System message (placeholder, adjust as needed) system_message = "" # Load the model and tokenizer def hermes_model(): tokenizer = AutoTokenizer.from_pretrained("TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ") model = AutoModelForCausalLM.from_pretrained("TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ", low_cpu_mem_usage=True, device_map="auto") return model, tokenizer def blender_model(): model = BlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill") tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") return model, tokenizer model, tokenizer = blender_model() def chat_response(msg_prompt: str) -> str: try: inputs = tokenizer(msg_prompt, return_tensors="pt") reply_ids = model.generate(**inputs) outputs = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0] return outputs except Exception as e: return str(e) # Function to generate a response from the model def chat_responses(msg_prompt: str) -> str: """ Generates a response from the model given a prompt. Args: msg_prompt (str): The user's message prompt. Returns: str: The model's response. """ generation_params = { "do_sample": True, "temperature": 0.7, "top_p": 0.95, "top_k": 40, "max_new_tokens": 512, "repetition_penalty": 1.1, } pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_params) try: prompt_template = f'''system {system_message} user {msg_prompt} assistant ''' pipe_output = pipe(prompt_template)[0]['generated_text'] # Separate assistant's response from the output response_lines = pipe_output.split('assistant') assistant_response = response_lines[-1].strip() if len(response_lines) > 1 else pipe_output.strip() return assistant_response except Exception as e: return str(e) # Function to generate a random plot def random_plot(): df = px.data.iris() fig = px.scatter(df, x="sepal_width", y="sepal_length", color="species", size='petal_length', hover_data=['petal_width']) return fig # Function to handle likes/dislikes (for demonstration purposes) def print_like_dislike(x: gr.LikeData): print(x.index, x.value, x.liked) # Function to add messages to the chat history def add_message(history, message, files): for file in files: history.append(((file,), None)) if message is not None: history.append((message, None)) return history, gr.update(value=None, interactive=True) # Function to simulate the bot response def bot(history): user_message = history[-1][0] bot_response = chat_response(user_message) history[-1][1] = bot_response return history fig = random_plot() # Gradio interface setup with gr.Blocks(fill_height=True) as demo: chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, scale=1) with gr.Row(): chat_input = gr.Textbox(placeholder="Enter message...", show_label=False) file_input = gr.File(label="Upload file(s)", file_count="multiple") chat_msg = chat_input.submit(add_message, [chatbot, chat_input, file_input], [chatbot, chat_input]) bot_msg = chat_msg.then(bot, chatbot, chatbot) bot_msg.then(lambda: gr.update(interactive=True), None, [chat_input]) chatbot.like(print_like_dislike, None, None) demo.queue() demo.launch()