import gradio as gr
import os
import torch

from pathlib import Path

from model import create_effnetb3_model
from timeit import default_timer as timer
from typing import Tuple, Dict

# Human-readable labels for the classifier's outputs.
# Order matters: predict() pairs class_names[i] with the model's i-th
# output probability, so do not reorder without retraining/remapping.
class_names = [
    "Banh beo",
    "Banh bot loc",
    "Banh can",
    "Banh canh",
    "Banh chung",
    "Banh cuon",
    "Banh duc",
    "Banh gio",
    "Banh khot",
    "Banh mi",
    "Banh pia",
    "Banh tet",
    "Banh trang nuong",
    "Banh xeo",
    "Bun bo Hue",
    "Bun dau mam tom",
    "Bun mam",
    "Bun rieu",
    "Bun thit nuong",
    "Ca kho to",
    "Canh chua",
    "Cao lau",
    "Chao long",
    "Com tam",
    "Goi cuon",
    "Hu tieu",
    "Mi quang",
    "Nem chua",
    "Pho",
    "Xoi xeo",
]

# Build the model together with the preprocessing callable that predict()
# applies to incoming images. The number of output classes is derived from
# class_names so the label list and the classifier head cannot drift apart.
effnetb3, effnetb3_transforms = create_effnetb3_model(
    num_classes=len(class_names)
)

# Load the trained weights. map_location forces every tensor onto the CPU so
# the checkpoint loads even if it was saved from a different device.
# NOTE(review): torch.load unpickles the file - only ship checkpoints from a
# trusted source (consider weights_only=True if the pinned torch supports it).
effnetb3.load_state_dict(
    torch.load(
        f="./models/pretrained_effnetb3_vietnamese_food.pth",
        map_location=torch.device("cpu"),
    )
)

def predict(img) -> Tuple[Dict[str, float], float]:
    """Classify an image and report how long the prediction took.

    Args:
        img: The image to classify, in whatever form ``effnetb3_transforms``
            accepts (the Gradio input in this file is configured with
            ``type="pil"``).

    Returns:
        A tuple ``(pred_labels_and_probs, pred_time)``:
        ``pred_labels_and_probs`` maps each entry of ``class_names`` to the
        softmax probability at the same index of the model output, and
        ``pred_time`` is the elapsed wall-clock time in seconds, rounded to
        4 decimal places.

    Raises:
        ValueError: If ``img`` is ``None`` (no image was supplied).
    """
    # Fail fast with a clear message instead of an obscure error from deep
    # inside the transform pipeline.
    if img is None:
        raise ValueError("No image provided: please supply an image to classify.")

    start_time = timer()

    # Preprocess, then add a leading batch dimension of size 1.
    batch = effnetb3_transforms(img).unsqueeze(0)

    # Inference only: eval mode for the model, no autograd bookkeeping.
    effnetb3.eval()
    with torch.inference_mode():
        pred_probs = torch.softmax(effnetb3(batch), dim=1)

    # Convert the single row of probabilities to plain Python floats once,
    # rather than indexing the tensor element by element.
    probs = pred_probs[0].tolist()
    pred_labels_and_probs = {
        name: probs[idx] for idx, name in enumerate(class_names)
    }

    pred_time = round(timer() - start_time, 4)

    return pred_labels_and_probs, pred_time

# Text shown at the top of the demo page.
title = "Vietnamese food vision"
description = "An EfficientNetB3 feature extractor computer vision model"

# One single-element row per file in the "examples" directory (one value per
# input component). os.listdir returns entries in arbitrary order, so sort
# them to keep the example gallery stable across runs and machines.
example_list = [
    ["examples/" + example] for example in sorted(os.listdir("examples"))
]

# Input widget: hands the uploaded picture to predict() as a PIL image.
image_input = gr.Image(type="pil")

# Output widgets, in the same order as predict()'s return tuple:
# the label/probability mapping (top 3 shown) and the elapsed time.
prediction_output = gr.Label(num_top_classes=3, label="Prediction")
timing_output = gr.Number(label="Prediction time (s)")

demo = gr.Interface(
    fn=predict,
    inputs=image_input,
    outputs=[prediction_output, timing_output],
    examples=example_list,
    title=title,
    description=description,
)

# Start the web app; share=True additionally requests a public share link.
demo.launch(share=True)