import torch
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# Path to the fine-tuned PEFT adapter on the Hugging Face Hub
MODEL_PATH = "sagar007/phi2_25k"
peft_config = PeftConfig.from_pretrained(MODEL_PATH)


def load_model():
    # Load the base Phi-2 model in half precision
    base_model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-2",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    # Attach the fine-tuned PEFT adapter and load its tokenizer
    model = PeftModel.from_pretrained(base_model, MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    return model, tokenizer


# Load once at startup rather than on every request
model, tokenizer = load_model()


# Zero-GPU Spaces allocate a GPU via the `spaces.GPU` decorator;
# `InferenceClient` has no `.gpu` decorator, so it is not used here.
@spaces.GPU(duration=120)
def generate_response(instruction, max_length=512):
    try:
        prompt = f"Instruction: {instruction}\nResponse:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
            )
        # Return only the text after the "Response:" marker
        return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Response:")[-1].strip()
    except Exception as e:
        print(f"Error: {e}")
        return "Sorry, I encountered an error. Please try again."


def chatbot(message, history):
    return generate_response(message)


demo = gr.ChatInterface(
    chatbot,
    title="Phi-2 Zero-GPU Chat",
    description="Fine-tuned Phi-2 model running on Hugging Face Zero-GPU Spaces",
    examples=[
        ["Explain quantum computing in simple terms"],
        ["Write a poem about artificial intelligence"],
        ["How do I make a perfect omelette?"],
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()