Spaces:

sagar007
/

phi2_finetune

Running

App Files Files Community

sagar007 committed 5 days ago

Commit

66c498d

verified ·

1 Parent(s): ffe537c

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -30

app.py CHANGED Viewed

@@ -1,65 +1,85 @@
-import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, PeftConfig
-from huggingface_hub import InferenceClient
-# Load configuration
-MODEL_PATH = "sagar007/phi2_25k"
-peft_config = PeftConfig.from_pretrained(MODEL_PATH)
-# Initialize client for Zero-GPU environment
-client = InferenceClient()
-def load_model():
-    # Load base model
     base_model = AutoModelForCausalLM.from_pretrained(
         "microsoft/phi-2",
-        torch_dtype=torch.float16,
-        device_map="auto",
         trust_remote_code=True
     )
-    # Load PEFT model
     model = PeftModel.from_pretrained(base_model, MODEL_PATH)
-    return model, AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
-@client.gpu(timeout=120)
 def generate_response(instruction, max_length=512):
     try:
-        model, tokenizer = load_model()
-        prompt = f"Instruction: {instruction}\nResponse:"
-        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
                 max_length=max_length,
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True
             )
-        return tokenizer.decode(outputs[0], skip_special_tokens=True).split("Response:")[-1].strip()
     except Exception as e:
-        print(f"Error: {str(e)}")
-        return "Sorry, I encountered an error. Please try again."
 def chatbot(message, history):
     response = generate_response(message)
     return response
 demo = gr.ChatInterface(
     chatbot,
-    title="Phi-2 Zero-GPU Chat",
-    description="Fine-tuned Phi-2 model running on Hugging Face Zero-GPU Spaces",
     examples=[
-        ["Explain quantum computing in simple terms"],
-        ["Write a poem about artificial intelligence"],
-        ["How do I make a perfect omelette?"]
     ],
-    cache_examples=False
 )
 if __name__ == "__main__":

 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, PeftConfig
+import torch
+# --- 1. Check CUDA Availability and Set Device ---
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print(f"Using device: {device} ({torch.cuda.get_device_name(0)})")
+else:
+    print("CUDA is not available. Falling back to CPU.")
+    device = torch.device("cpu")
+# --- 2. Load Tokenizer (with error handling) ---
+MODEL_PATH = "sagar007/phi2_25k"
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+    tokenizer.pad_token = tokenizer.eos_token
+except Exception as e:
+    print(f"Error loading tokenizer: {e}")
+    exit()
+# --- 3. Load Base Model (Optimized for GPU) ---
+try:
     base_model = AutoModelForCausalLM.from_pretrained(
         "microsoft/phi-2",
+        torch_dtype=torch.float16,  # Use float16 on GPU for efficiency
+        device_map="auto",  # Automatically distribute model across GPUs
         trust_remote_code=True
     )
+except Exception as e:
+    print(f"Error loading base model: {e}")
+    exit()
+# --- 4. Load PEFT Model (Optimized for GPU) ---
+try:
+    peft_config = PeftConfig.from_pretrained(MODEL_PATH)
     model = PeftModel.from_pretrained(base_model, MODEL_PATH)
+except Exception as e:
+    print(f"Error loading PEFT model: {e}")
+    exit()
+# Move model to the GPU
+model.to(device)
+model.eval()
+# --- 5. Generation Function (Optimized for GPU) ---
 def generate_response(instruction: str, max_length: int = 512) -> str:
     """Generate the model's reply to a single instruction.

     The instruction is wrapped in the "Instruction: ...\\nResponse:" prompt,
     sampled with temperature 0.7 / top-p 0.9, and only the newly generated
     text is returned.

     Args:
         instruction: The user's request, inserted verbatim into the prompt.
         max_length: Forwarded unchanged as ``max_length`` to
             ``model.generate``. Per the transformers documentation this
             limit counts the prompt tokens as well as the generated ones.

     Returns:
         The stripped reply text, or the fixed string
         "Error during response generation." if tokenization, generation or
         decoding raises.
     """
     prompt = f"Instruction: {instruction}\nResponse:"
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(device)
         prompt_token_count = inputs["input_ids"].shape[1]
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
                 max_length=max_length,
                 num_return_sequences=1,
                 temperature=0.7,
                 top_p=0.9,
                 do_sample=True,
                 # Pad with EOS explicitly, matching the tokenizer setup.
                 pad_token_id=tokenizer.eos_token_id,
             )
         # Decode only the tokens produced after the prompt. Splitting the
         # full decoded text on "Response:" returned part of the user's own
         # prompt whenever the instruction itself contained that marker.
         completion = tokenizer.decode(
             outputs[0][prompt_token_count:], skip_special_tokens=True
         )
         # As before, stop at the next "Response:" marker if the model goes on
         # to invent a further turn.
         return completion.split("Response:")[0].strip()
     except Exception as e:
         # UI boundary: log the error and give the chat a readable message
         # instead of letting the exception reach Gradio.
         print(f"Error during generation: {e}")
         return "Error during response generation."
+# --- 6. Gradio Interface ---
 def chatbot(message, history):
     """Chat callback: answer `message` via generate_response.

     `history` is accepted to match the callback signature but is not used,
     so each message is answered on its own.
     """
     return generate_response(message)
 # Example prompts passed to the chat interface.
 _EXAMPLE_PROMPTS = [
     "Explain the concept of machine learning.",
     "Write a short story about a robot learning to paint.",
     "What are some effective ways to reduce stress?",
 ]
 # Chat interface wired to the chatbot callback; example caching stays off.
 demo = gr.ChatInterface(
     fn=chatbot,
     title="Fine-tuned Phi-2 Chatbot (GPU)",
     description="This is a chatbot using a fine-tuned version of the Phi-2 model, running on GPU.",
     theme="default",
     examples=_EXAMPLE_PROMPTS,
     cache_examples=False,
 )
 if __name__ == "__main__":