import os
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from huggingface_hub import login


def generate_response(question):
    try:
        # Debug print to verify the token is available in the environment
        token = os.environ.get('HUGGINGFACE_TOKEN')
        print(f"Token available: {'Yes' if token else 'No'}")
        login(token)

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(
            "meta-llama/Llama-2-7b-hf",
            use_auth_token=True  # Changed to use environment token
        )

        print("Loading model...")
        base_model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-2-7b-hf",
            use_auth_token=True,  # Changed to use environment token
            load_in_8bit=True,
            device_map="auto"
        )
        # Attach the fine-tuned LoRA adapter to the base model
        model = PeftModel.from_pretrained(base_model, "Lukeam/llama-aa-fine-tuned")

        # Format prompt
        prompt = f"### Question: {question}\n\n### Answer:"
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Generate response
        outputs = model.generate(
            **inputs,
            max_length=512,
            do_sample=True,  # Sampling must be enabled for temperature to take effect
            temperature=0.7,
            num_return_sequences=1
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Return only the text after the answer marker
        return response.split("### Answer:")[1].strip()

    except Exception as e:
        return f"Error generating response: {str(e)}"
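

# Minimal Gradio wiring for generate_response. This is a sketch: the original
# snippet imports gradio but ends before any UI setup, so the labels, title,
# and launch() call below are assumptions rather than part of the original script.
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Question", lines=3),
    outputs=gr.Textbox(label="Answer"),
    title="Llama-2 fine-tuned Q&A",
)

if __name__ == "__main__":
    demo.launch()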