# pip3 install git+https://github.com/neuml/txtai#egg=txtai[pipeline-llm]
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from txtai import Embeddings
# Wikipedia Embeddings Database
embeddings = Embeddings()
embeddings.load(provider="huggingface-hub", container="neuml/txtai-wikipedia")
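# For reference, embeddings.search returns a list of result dicts whose "text" field is
# what gets stitched into the prompt below. Illustrative only; real ids and scores will differ:
#   embeddings.search("Roman Empire", 1)
#   # -> [{"id": "Roman Empire", "text": "The Roman Empire was ...", "score": 0.83}]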
#os.environ['OMP_NUM_THREADS'] = '6'
# Load model and tokenizer
model_path = "Josephgflowers/Qllama-.5B-RAG-1"
# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    ignore_mismatched_sizes=True,
    torch_dtype=torch.float16,
).to(device)
print(model)
total_params = sum(p.numel() for p in model.parameters())
print("Total number of parameters: ", total_params)
sequence_length = 4096  # maximum context length of the model
embedding_size = 2048   # hidden (embedding) size of the model
tokenizer = AutoTokenizer.from_pretrained(model_path)
stop_token = 128247  # token id that marks the end of an assistant turn for this model
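# If the model's configured end-of-sequence token is the intended stop token, the id
# could also be read from the tokenizer instead of hard-coding it (an assumption, not
# verified for this checkpoint):
#   stop_token = tokenizer.eos_token_id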
def chat_with_model(prompt_text, stop_token, model, tokenizer):
    # Encode the prompt text
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt").to(device)
    # Generate response
    output_sequences = model.generate(
        input_ids=encoded_prompt,
        #max_length=len(encoded_prompt[0]) + 256,
        max_new_tokens=1024,
        temperature=0.1,
        repetition_penalty=1.2,
        top_k=20,
        top_p=0.9,
        do_sample=True,
        num_return_sequences=1,
        eos_token_id=stop_token
    )
    # Decode only the newly generated tokens (everything after the prompt)
    generated_tokens = output_sequences[0][encoded_prompt.shape[1]:]
    response_text = tokenizer.decode(
        generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True
    ).strip()
    return response_text
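# Illustrative standalone call (the chat loop below builds the full
# <|system|>/<|data|>/<|user|>/<|assistant|> prompt from retrieved context):
#   demo_prompt = "<|system|>\nYou will be given text as context to answer a user's question.\n<|user|>\nWho wrote Hamlet?\n<|assistant|>\n"
#   print(chat_with_model(demo_prompt, stop_token, model, tokenizer))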
# Initialize conversation history
conversation_history = ''
# Get user's preference for input mode and character name
input_mode = 'text'  # or: input("Enter 'text' for text input or 'speech' for speech input: ").lower()
character_name = '<|user|>'  # or: input("Enter your character name (USER, JONAH, JOSEPH, KIMBERLY, etc.): ")
# Chat loop
num_chat = 1
while num_chat <= 20:
    if input_mode == 'text':
        question = input(f"{character_name}: ")
        user_input = question  # Get text input from user
        # Retrieve supporting passages from the Wikipedia embeddings index
        context = "\n".join([x["text"] for x in embeddings.search(question)])
    elif input_mode == 'speech':
        # Optional speech-input mode; requires the SpeechRecognition and PyAudio packages
        import speech_recognition as sr
        recognizer = sr.Recognizer()
        print("Press Enter and then speak your input.")
        input()  # Wait for user to hit Enter
        with sr.Microphone() as source:
            print("Listening...")
            audio_data = recognizer.listen(source, timeout=5, phrase_time_limit=10)
        try:
            user_input = recognizer.recognize_google(audio_data)
            print(f"USER: {user_input}")
        except sr.RequestError:
            print("API request failed. Please try again.")
            continue
        except sr.UnknownValueError:
            print("Could not understand the audio. Please try again.")
            continue
        question = user_input
        context = "\n".join([x["text"] for x in embeddings.search(question)])
    else:
        print("Invalid input mode. Please restart and choose a valid mode.")
        break
    #print('History: ' + conversation_history)
    prompt_text = f"""
<|system|>
You will be given text as context to answer a user's question.
<|data|>
{context}
<|user|>
{question}
<|assistant|>
"""
    response_text = chat_with_model(prompt_text, stop_token, model, tokenizer)
    # Keep only the first line of the assistant's reply
    response_text = response_text.split('\n', 1)[0]
    print(f"Assistant: {response_text}", "\n")
    # Update conversation history
    conversation_history += f"\n<|system|>\nYou are a helpful assistant.\n{character_name}\n{user_input}\n\n<|assistant|>\n{response_text}\n\n"
    # Trim the history so it does not grow without bound
    if len(conversation_history) > 3092:
        conversation_history = conversation_history[2048:]
    num_chat += 1
#"You will receive a user question and context that should be used to answer the question."