# https://raw.githubusercontent.com/rohan-paul/LLM-FineTuning-Large-Language-Models/refs/heads/main/Mixtral_Chatbot_with_Gradio/Mixtral_Chatbot_with_Gradio.py
from threading import Thread

import gradio as gr
import torch
import transformers
from transformers import AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer

# Run the entire app with `python run_mixtral.py`

"""
The messages list should be of the following format:

messages = [
    {"role": "user", "content": "User's first message"},
    {"role": "assistant", "content": "Assistant's first response"},
    {"role": "user", "content": "User's second message"},
    {"role": "assistant", "content": "Assistant's second response"},
    {"role": "user", "content": "User's third message"},
]
"""


def format_chat_history(history, tokenizer) -> str:
    """Format the dialogue history into a prompt for the Mixtral model.

    `history` is a list of [user_message, assistant_message] pairs as produced by
    `gr.Chatbot`; the assistant slot of the latest pair is still empty. The
    rendered prompt gives the model the full conversation context so it can
    generate an appropriate next response.
    """
    # Mixtral-8x7B-Instruct's chat template only accepts strictly alternating
    # user/assistant roles (it rejects a separate "system" role), so the system
    # instruction is folded into the first user message instead.
    system_prompt = "You are a helpful assistant."
    messages = []
    for i, (user_msg, assistant_msg) in enumerate(history):
        if user_msg:
            content = f"{system_prompt}\n\n{user_msg}" if i == 0 else user_msg
            messages.append({"role": "user", "content": content})
        if assistant_msg:  # The latest pair has no assistant message yet
            messages.append({"role": "assistant", "content": assistant_msg})
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )


def model_loading_pipeline():
    model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # skip_prompt=True so only newly generated tokens are streamed to the UI.
    # Raise the timeout if the first token takes longer on your hardware.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=5.0)

    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        # Quantize to 4-bit; use BitsAndBytesConfig(load_in_8bit=True) instead
        # if you prefer 8-bit.
        model_kwargs={"quantization_config": BitsAndBytesConfig(load_in_4bit=True)},
        device_map="auto",  # Automatically determine the best device placement
        streamer=streamer,
    )
    return pipeline, streamer


def launch_gradio_app(pipeline, streamer):
    with gr.Blocks() as demo:
        chatbot = gr.Chatbot()
        msg = gr.Textbox()
        clear = gr.Button("Clear")

        def user(user_message, history):
            # Append the new user message with an empty assistant slot
            return "", history + [[user_message, None]]

        def bot(history):
            prompt = format_chat_history(history, pipeline.tokenizer)
            history[-1][1] = ""
            kwargs = dict(
                text_inputs=prompt,
                max_new_tokens=2048,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
            )
            # Run generation in a background thread and stream tokens as they arrive
            thread = Thread(target=pipeline, kwargs=kwargs)
            thread.start()

            for token in streamer:
                history[-1][1] += token
                yield history

        msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot, chatbot, chatbot
        )
        clear.click(lambda: None, None, chatbot, queue=False)

    demo.queue()
    demo.launch(share=True, debug=True)


if __name__ == "__main__":
    pipeline, streamer = model_loading_pipeline()
    launch_gradio_app(pipeline, streamer)
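
# For reference, with the Mixtral-8x7B-Instruct chat template a two-turn history
# such as
#   [["Hello!", "Hi, how can I help?"], ["Tell me a joke.", None]]
# should be rendered by format_chat_history into roughly the following prompt
# (exact spacing and special tokens depend on the tokenizer version):
#
#   <s>[INST] You are a helpful assistant.
#
#   Hello! [/INST]Hi, how can I help?</s>[INST] Tell me a joke. [/INST]
#
# A quick way to inspect the rendered prompt without loading the full model is
# to use only the tokenizer:
#
#   tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
#   print(format_chat_history([["Tell me a joke.", None]], tokenizer))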