Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -1,79 +1,202 @@
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import torch
  import spaces
[… remaining removed lines of the previous 79-line app.py are collapsed in the diff view …]

# Import spaces first to ensure GPU resources are managed correctly
import spaces

# Import necessary libraries
import os
import json
import logging
import time
import torch
import bitsandbytes as bnb
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from peft import PeftModel, LoraConfig
from transformers import BitsAndBytesConfig

# Configure logging
logging.basicConfig(level=logging.INFO, filename='training_log.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s')
logging.info("Started the script")

# Load the Hugging Face API token from environment variables
HF_API_TOKEN = os.getenv('HF_API_TOKEN')
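# Note: os.getenv returns None if the HF_API_TOKEN secret is not set; the
# from_pretrained calls below would then fall back to any cached credentials
# or anonymous access.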

# Load the dataset
file_path = 'best_training_data.json'  # Adjust path as needed
logging.info(f"Loading dataset from {file_path}")
try:
    with open(file_path, 'r') as file:
        data = json.load(file)
    logging.info("Dataset loaded successfully")
except Exception as e:
    logging.error(f"Failed to load dataset: {e}")
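# Note: errors are only logged here; if loading fails, `data` stays undefined
# and the conversion step below will raise a NameError.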

# Convert the dataset to Hugging Face Dataset format
try:
    dataset = Dataset.from_dict({"text": [entry["text"] for entry in data]})
    logging.info("Dataset converted to Hugging Face Dataset format")
except Exception as e:
    logging.error(f"Failed to convert dataset: {e}")

# Initialize Tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained("SweatyCrayfish/llama-3-8b-quantized", token=HF_API_TOKEN)
    logging.info("Tokenizer loaded successfully")

    # Add padding token if not already present
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        logging.info("Padding token added to the tokenizer")

    tokenizer.save_pretrained('.')
except Exception as e:
    logging.error(f"Failed to load or configure tokenizer: {e}")
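# Note: adding a new [PAD] token grows the vocabulary, which is why
# resize_token_embeddings() is called after the model is loaded below.
# A common alternative that avoids resizing is to reuse the EOS token:
#     tokenizer.pad_token = tokenizer.eos_token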

# Tokenize the Dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=1024, return_tensors='pt')

try:
    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    logging.info("Dataset tokenized successfully")
except Exception as e:
    logging.error(f"Failed to tokenize the dataset: {e}")
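# Note: return_tensors='pt' is unnecessary inside Dataset.map(); mapped columns
# are stored as lists, and the data collator converts them to tensors later.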

# Setup Quantization Configuration
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)
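# This stores base weights in 4-bit NormalFloat (NF4), double quantization also
# compresses the quantization constants, and matmuls are computed in bfloat16.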

# Load the LLaMA 8B Model with Quantization
try:
    model = AutoModelForCausalLM.from_pretrained(
        "SweatyCrayfish/llama-3-8b-quantized",
        quantization_config=nf4_config,
        token=HF_API_TOKEN,
        device_map="auto"
    )

    model.resize_token_embeddings(len(tokenizer))
    model.gradient_checkpointing_enable()
    model.config.use_cache = False  # Disable use_cache when using gradient checkpointing
    logging.info("Model initialized and resized embeddings")

    # Set up LoRA
    lora_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
    )
    model = PeftModel(model, lora_config)
    logging.info("LoRA configuration applied to the model")
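    # Note: wrapping with PeftModel(model, lora_config) works, but the usual
    # training helper is peft.get_peft_model(model, lora_config), often paired
    # with peft.prepare_model_for_kbit_training(model) for 4-bit base models.

    # Note: the loop below re-enables gradients on every floating-point
    # parameter (embeddings, norms, and the LoRA adapters alike), so it trains
    # more than just the LoRA weights; the 4-bit base weights stay frozen
    # because their storage dtype is not floating point.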

    # Ensure only floating point parameters require gradients
    for param in model.parameters():
        if param.dtype in [torch.float16, torch.float32, torch.bfloat16, torch.complex64, torch.complex128]:
            param.requires_grad = True
    logging.info("Model parameters configured for gradient computation")
except Exception as e:
    logging.error(f"Failed to initialize the model: {e}")

# Setup Training Arguments
try:
    training_args = TrainingArguments(
        output_dir="training_results",
        evaluation_strategy="no",      # Disable evaluation
        save_strategy="epoch",         # Save only at the end of each epoch
        learning_rate=2e-4,
        per_device_train_batch_size=5,
        gradient_accumulation_steps=4,
        num_train_epochs=12,
        weight_decay=0.01,
        save_total_limit=1,
        logging_dir="training_logs",
        logging_steps=50,
        fp16=False,
        bf16=True,
        load_best_model_at_end=False,  # Do not load the best model
        greater_is_better=False,
        report_to="none"               # Disable reporting to external services
    )
    logging.info("Training arguments configured successfully")
except Exception as e:
    logging.error(f"Failed to configure training arguments: {e}")
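# With per_device_train_batch_size=5 and gradient_accumulation_steps=4, the
# effective batch size is 5 * 4 = 20 sequences per optimizer step.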

# Initialize the Trainer
try:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator
    )
    logging.info("Trainer initialized successfully")
except Exception as e:
    logging.error(f"Failed to initialize the Trainer: {e}")
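# mlm=False makes the collator produce causal-LM labels (a copy of input_ids
# with padding masked to -100), so no separate label column is needed.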

# Implementing 120-Second Segmented Training
@spaces.GPU(duration=120)
def segmented_train(trainer):
    start_time = time.time()
    while time.time() - start_time < 120:
        try:
            trainer.train()
        except torch.cuda.OutOfMemoryError as e:
            logging.error(f"Out of memory error: {e}")
            break
        except Exception as e:
            logging.error(f"Training error: {e}")
            break
        trainer.save_state()

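# Note: each trainer.train() call runs the full num_train_epochs schedule, so
# the 120-second condition is only re-checked between complete runs, and
# ZeroGPU ends the allocation once the decorated call exceeds its duration.
# Resuming across segments would typically use
# trainer.train(resume_from_checkpoint=True) instead.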
try:
    segmented_train(trainer)
    logging.info("Model training completed successfully")
except Exception as e:
    logging.error(f"Training failed: {e}")
    import traceback
    traceback.print_exc()

# Save the Model
try:
    model.save_pretrained("llama3-8b-chat-finetuned-final-version")
    tokenizer.save_pretrained("llama3-8b-chat-finetuned-final-version")
    logging.info("Final fine-tuned model and tokenizer saved successfully")
except Exception as e:
    logging.error(f"Failed to save the final fine-tuned model: {e}")
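# Since model is PEFT-wrapped, save_pretrained() writes only the LoRA adapter
# weights, not the full base model. A later session could reload them roughly
# like this (sketch, assuming the same base checkpoint):
#     base = AutoModelForCausalLM.from_pretrained("SweatyCrayfish/llama-3-8b-quantized", quantization_config=nf4_config)
#     model = PeftModel.from_pretrained(base, "llama3-8b-chat-finetuned-final-version")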

# Inference Function
@spaces.GPU
def generate_response(prompt, model, tokenizer, max_length=128, min_length=20, temperature=0.7, top_k=50, top_p=0.9):
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                max_length=max_length,
                min_length=min_length,
                do_sample=True,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=1.3,
                no_repeat_ngram_size=3,
                eos_token_id=tokenizer.eos_token_id
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        logging.error(f"Failed to generate response: {e}")
        return ""
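# Note: max_length caps the total sequence (prompt + completion); to bound only
# the generated text, generate() also accepts max_new_tokens.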

# Example Usage
prompt = "bro did u talk with DK today"
response = generate_response(prompt, model, tokenizer)
print(response)
logging.info(f"Generated response: {response}")