import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from txtai import Embeddings
from txtai.pipeline import LLM
# pip3 install git+https://github.com/neuml/txtai#egg=txtai[pipeline-llm]

# Wikipedia embeddings database used for retrieval
embeddings = Embeddings()
embeddings.load(provider="huggingface-hub", container="neuml/txtai-wikipedia")

# os.environ['OMP_NUM_THREADS'] = '6'

# Load model and tokenizer
model_path = "Josephgflowers/Qllama-.5B-RAG-1"

# Define the device (CPU or GPU)
device = torch.device("cuda")
# device = torch.device("cpu")

model = AutoModelForCausalLM.from_pretrained(
    model_path, ignore_mismatched_sizes=True, torch_dtype=torch.float16
).to(device)
print(model)

total_params = sum(p.numel() for p in model.parameters())
print("Total number of parameters: ", total_params)

sequence_length = 4096  # or whatever your specific sequence length is
embedding_size = 2048   # as per your model's definition

tokenizer = AutoTokenizer.from_pretrained(model_path)
stop_token = 128247  # token id used to stop generation


def chat_with_model(prompt_text, stop_token, model, tokenizer):
    # Encode the prompt text
    encoded_prompt = tokenizer.encode(
        prompt_text, add_special_tokens=False, return_tensors="pt"
    ).to(device)

    # Generate response
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        # max_length=len(encoded_prompt[0]) + 256,
        max_new_tokens=1024,
        temperature=0.1,
        repetition_penalty=1.2,
        top_k=20,
        top_p=0.9,
        do_sample=True,
        num_return_sequences=1,
        eos_token_id=stop_token,
    )

    # Decode the generated sequence and strip the prompt to keep only the response
    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    response_text = text[len(prompt_text):].strip()
    return response_text
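
# Optional helper (not called anywhere below): a minimal sketch for exercising the
# generation path on its own, using a hand-written prompt in the same
# <|system|>/<|data|>/<|user|>/<|assistant|> format the chat loop builds. The
# example question and context are made up for illustration.
def _generation_smoke_test():
    test_prompt = (
        "\n<|system|>\nYou will be given text as context to answer a user's question.\n"
        "<|data|>\nMount Everest is Earth's highest mountain above sea level.\n"
        "<|user|>\nWhat is the highest mountain on Earth?\n"
        "<|assistant|>\n"
    )
    print(chat_with_model(test_prompt, stop_token, model, tokenizer))
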
# Initialize conversation history
conversation_history = ''

# Get user's preference for input mode and character name
input_mode = 'text'  # input("Enter 'text' for text input or 'speech' for speech input: ").lower()
character_name = '<|user|>'  # input("Enter your character name (USER, JONAH, JOSEPH, KIMBERLY, etc.): ")

# Chat loop
num_chat = 1
while num_chat <= 20:
    if input_mode == 'text':
        # Get text input from the user and retrieve Wikipedia context for it
        question = input(f"{character_name}: ")
        user_input = question
        context = "\n".join([x["text"] for x in embeddings.search(question)])
    elif input_mode == 'speech':
        # Speech input requires the SpeechRecognition package (pip3 install SpeechRecognition)
        import speech_recognition as sr
        recognizer = sr.Recognizer()
        print("Press Enter and then speak your input.")
        input()  # Wait for user to hit Enter
        with sr.Microphone() as source:
            print("Listening...")
            audio_data = recognizer.listen(source, timeout=5, phrase_time_limit=10)
        try:
            user_input = recognizer.recognize_google(audio_data)
            print(f"USER: {user_input}")
        except sr.RequestError:
            print("API request failed. Please try again.")
            continue
        except sr.UnknownValueError:
            print("Could not understand the audio. Please try again.")
            continue
        question = user_input
        context = "\n".join([x["text"] for x in embeddings.search(question)])
    else:
        print("Invalid input mode. Please restart and choose a valid mode.")
        break

    # print('History: ' + conversation_history)
    prompt_text = f"""
<|system|>
You will be given text as context to answer a user's question.
<|data|>
{context}
<|user|>
{question}
<|assistant|>
"""
    response_text = chat_with_model(prompt_text, stop_token, model, tokenizer)
    # Keep only the first line of the completion as the assistant's reply
    response_text = response_text.split('\n', 1)[0]
    print(f"Assistant: {response_text}", "\n")

    # Update conversation history and trim it when it grows too long
    conversation_history += f"\n<|system|>\nYou are a helpful assistant.\n{character_name}\n{user_input}\n\n<|assistant|>\n {response_text}\n\n"
    if len(conversation_history) > 3092:
        conversation_history = conversation_history[2048:]
    num_chat += 1

# "You will receive a user question and context that should be used to answer the question."
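
# Optional helper (not called anywhere above): a minimal sketch for checking the
# retrieval step in isolation. Assumes the neuml/txtai-wikipedia index loaded above,
# where each search hit is a dict that includes "text" and "score" fields; the
# example query is made up for illustration.
def _retrieval_smoke_test(query="What is the tallest mountain on Earth?", limit=3):
    for hit in embeddings.search(query, limit):
        print(round(hit["score"], 3), hit["text"][:80])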