import spaces
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import gradio as gr
import torch
import logging
import sys
import os
from accelerate import infer_auto_device_map, init_empty_weights

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Get HuggingFace token from environment variable
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
if not hf_token:
    logger.error("HUGGINGFACE_TOKEN environment variable not set")
    raise ValueError("Please set the HUGGINGFACE_TOKEN environment variable")

# Define the model name
model_name = "meta-llama/Llama-2-7b-chat-hf"

try:
    logger.info("Starting model initialization...")

    # Check CUDA availability
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {device}")

    # Load tokenizer
    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        token=hf_token
    )
    tokenizer.pad_token = tokenizer.eos_token
    logger.info("Tokenizer loaded successfully")

    # Load model with basic configuration
    # Accelerate helps with automatic device mapping for large models
    logger.info("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        trust_remote_code=True,
        token=hf_token,
        device_map="auto"  # Accelerate automatically handles model placement across devices
    )
    logger.info("Model loaded successfully")

    # Create pipeline with improved parameters
    logger.info("Creating generation pipeline...")
    model_gen = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        repetition_penalty=1.2,
        device_map="auto"
    )
    logger.info("Pipeline created successfully")

except Exception as e:
    logger.error(f"Error during initialization: {str(e)}")
    raise


@spaces.GPU(duration=60)
@torch.inference_mode()
def clean_response(text):
    """Clean the model response by removing tags and unwanted text."""
    # Remove INST tags and Wikipedia references
    text = text.replace('[INST]', '').replace('[/INST]', '')
    text = text.replace('(You can find more about it at wikipedia)', '')

    # Drop any lines that start with "User:", "Assistant:", "Human:" or "AI:"
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
        if not line.strip().startswith(('User:', 'Assistant:', 'Human:', 'AI:')):
            cleaned_lines.append(line)
    return '\n'.join(cleaned_lines).strip()


@spaces.GPU(duration=60)
@torch.inference_mode()
def generate_response(user_input, chat_history):
    try:
        logger.info("Generating response for user input...")
        # total_water_consumption and calculate_water_consumption are defined
        # elsewhere in this app (module-level running total and helper)
        global total_water_consumption

        # Calculate water consumption for input
        input_water_consumption = calculate_water_consumption(user_input, True)
        total_water_consumption += input_water_consumption

        # Format conversation history without using INST tags
        formatted_history = ""
        if chat_history:
            for prev_input, prev_response in chat_history:
                formatted_history += f"Question: {prev_input}\nAnswer: {prev_response}\n\n"

        # Create prompt using a more natural format
        prompt = f"""
{system_message}

Previous conversation:
{formatted_history}

Question: {user_input}
Answer:"""

        logger.info("Generating model response...")
        outputs = model_gen(
            prompt,
            max_new_tokens=512,
            return_full_text=False,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1
        )

        # Clean and process the response
        assistant_response = outputs[0]['generated_text']
        assistant_response = clean_response(assistant_response)

        # If the response still contains unwanted text, try to extract only the relevant part
        if 'Question:' in assistant_response or 'Answer:' in assistant_response:
            parts = assistant_response.split('Answer:')
            if len(parts) > 1:
                assistant_response = parts[1].split('Question:')[0].strip()

        logger.info("Response cleaned and processed")

        # Calculate water consumption for output
        output_water_consumption = calculate_water_consumption(assistant_response, False)
        total_water_consumption += output_water_consumption

        # Update chat history
        chat_history.append([user_input, assistant_response])

        # Update water consumption display
        water_message = f"""
Welcome to AQuaBot - An AI assistant that helps raise awareness about water consumption in language models.
Water consumption calculations are based on the study:
Li, P. et al. (2023). Making AI Less Thirsty: Uncovering and Addressing the Secret Water Footprint of AI Models. arXiv preprint, https://arxiv.org/abs/2304.03271
Important note: This application uses the Meta Llama-2-7b model instead of GPT-3 for availability and cost reasons. However, the water consumption calculations per token (input/output) are based on the conclusions of the cited paper.
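For reference, generate_response above relies on a calculate_water_consumption helper and a module-level total_water_consumption counter that are not shown in this excerpt. The sketch below illustrates one way such a helper could look; the helper name and signature simply mirror how it is called above, and the per-token rates are illustrative placeholders, not values taken from Li et al. (2023).

# Minimal sketch of the water-accounting helper used by generate_response.
# The per-token rates below are illustrative placeholders, NOT figures from
# Li et al. (2023); replace them with estimates derived from the paper.
WATER_ML_PER_INPUT_TOKEN = 0.0003   # assumed placeholder (ml per input token)
WATER_ML_PER_OUTPUT_TOKEN = 0.0012  # assumed placeholder (ml per output token)

total_water_consumption = 0.0  # running total shown in the UI


def calculate_water_consumption(text, is_input=True):
    """Estimate water use (in ml) for a piece of text.

    The token count is approximated with a whitespace split so the helper
    does not depend on the tokenizer; use len(tokenizer.encode(text)) for
    a closer count.
    """
    num_tokens = len(text.split())
    rate = WATER_ML_PER_INPUT_TOKEN if is_input else WATER_ML_PER_OUTPUT_TOKEN
    return num_tokens * rate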
Created by Camilo Vega, AI Consultant LinkedIn Profile
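The Gradio interface construction is not included in this excerpt; only its display text appears above. As a rough orientation, a Blocks layout along the following lines could wire generate_response to a chatbot and the water-consumption display. The component names, the layout, and the assumption that generate_response returns the cleared textbox value, the updated chat history, and the water_message HTML are guesses, not the original code.

# Hypothetical wiring sketch, not the original interface code. It assumes
# generate_response returns (cleared_textbox_value, chat_history, water_message).
with gr.Blocks() as demo:
    gr.Markdown(
        "Welcome to AQuaBot - an AI assistant that helps raise awareness "
        "about water consumption in language models."
    )
    chatbot = gr.Chatbot()
    user_box = gr.Textbox(label="Your message")
    water_display = gr.HTML()

    user_box.submit(
        generate_response,
        inputs=[user_box, chatbot],
        outputs=[user_box, chatbot, water_display],
    )

demo.launch()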