"""Zero-shot classification of the Oxford Pets dataset with CLIP.

Classifies the first 100 images of the pcuenq/oxford-pets train split with
openai/clip-vit-large-patch14, compares predictions against the ground-truth
labels, and writes accuracy / precision / recall to zero_shot_results.md.
"""

from transformers import pipeline
from datasets import load_dataset
from PIL import Image
import io
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score
import os
import shutil
import sys

# Model checkpoint and evaluation settings.
CHECKPOINT = "openai/clip-vit-large-patch14"
DATASET_SPLIT = 'train[:100]'  # only first 100 images for faster testing
RESULTS_FILE = 'zero_shot_results.md'

# Candidate class names for the Oxford-IIIT Pets dataset.
LABELS_OXFORD_PETS = [
    'Siamese', 'Birman', 'shiba inu', 'staffordshire bull terrier',
    'basset hound', 'Bombay', 'japanese chin', 'chihuahua',
    'german shorthaired', 'pomeranian', 'beagle', 'english cocker spaniel',
    'american pit bull terrier', 'Ragdoll', 'Persian', 'Egyptian Mau',
    'miniature pinscher', 'Sphynx', 'Maine Coon', 'keeshond',
    'yorkshire terrier', 'havanese', 'leonberger', 'wheaten terrier',
    'american bulldog', 'english setter', 'boxer', 'newfoundland', 'Bengal',
    'samoyed', 'British Shorthair', 'great pyrenees', 'Abyssinian', 'pug',
    'saint bernard', 'Russian Blue', 'scottish terrier',
]


def _clear_dataset_cache():
    """Remove the Hugging Face datasets cache so data is re-downloaded.

    NOTE(review): this deletes the cache for *every* dataset on the machine
    and forces a full re-download on each run — confirm this is intentional.
    """
    cache_dir = os.path.expanduser("~/.cache/huggingface/datasets")
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)


def _load_eval_dataset():
    """Load the evaluation split, exiting with status 1 on failure."""
    print("Loading Oxford Pets dataset...")
    try:
        dataset = load_dataset('pcuenq/oxford-pets', split=DATASET_SPLIT)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        sys.exit(1)
    print(f"Loaded {len(dataset)} images")
    return dataset


def _classify_images(detector, dataset):
    """Run the zero-shot detector over every image in the dataset.

    Returns a ``(true_labels, predicted_labels)`` pair of parallel lists.
    Images that fail to decode or classify are reported and skipped, so the
    two lists stay aligned.
    """
    true_labels = []
    predicted_labels = []
    print("Processing images...")
    for i, example in enumerate(tqdm(dataset, desc="Processing images")):
        try:
            # Decode the raw bytes stored in the dataset into a PIL image.
            # Convert to RGB so grayscale/palette images don't break the
            # CLIP preprocessing step.
            raw = example['image']['bytes']
            image = Image.open(io.BytesIO(raw)).convert('RGB')

            scores = detector(image, candidate_labels=LABELS_OXFORD_PETS)
            # Top-1 prediction: pipeline returns one {label, score} dict per
            # candidate label.
            best = max(scores, key=lambda r: r['score'])

            true_labels.append(example['label'])
            predicted_labels.append(best['label'])
        except Exception as e:
            # Best-effort evaluation: report the failure and keep going.
            print(f"Error processing image {i}: {e}")
    return true_labels, predicted_labels


def main():
    """Entry point: evaluate CLIP on Oxford Pets and save metrics."""
    _clear_dataset_cache()

    print("Loading CLIP model...")
    detector = pipeline(model=CHECKPOINT, task="zero-shot-image-classification")

    dataset = _load_eval_dataset()
    true_labels, predicted_labels = _classify_images(detector, dataset)

    if not true_labels:
        # Every image failed — metrics would be undefined over empty lists.
        print("No images were classified successfully; cannot compute metrics.")
        sys.exit(1)

    # zero_division=0 keeps precision/recall defined when some of the 37
    # classes never appear in (or are never predicted for) the small sample.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(
        true_labels, predicted_labels,
        average='weighted', labels=LABELS_OXFORD_PETS, zero_division=0,
    )
    recall = recall_score(
        true_labels, predicted_labels,
        average='weighted', labels=LABELS_OXFORD_PETS, zero_division=0,
    )

    report = f"""
Zero-Shot Classification Results using CLIP (openai/clip-vit-large-patch14)
====================================================================
Accuracy: {accuracy:.4f}
Precision: {precision:.4f}
Recall: {recall:.4f}
"""
    print(report)
    with open(RESULTS_FILE, 'w') as f:
        f.write(report)


if __name__ == "__main__":
    main()