"""Gradio demo: predict a score, an award flag and genre tags.

Three tabs are exposed: prediction from text (title + synopsis), from an
image, and from both combined by averaging the two models' raw outputs.
"""

import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
import torchvision.transforms.v2 as transforms
from matplotlib import cm
from PIL import Image
from transformers import AutoModel, AutoTokenizer

from model import ImageModel, TextModel

# Tokenizer / text backbone identifier passed to AutoTokenizer and TextModel.
MODEL_NAME = "distilbert/distilroberta-base"
class_names = ['Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy', 'Romance', 'Sci-Fi']

# Token budget for the "<title> <synopsis>" string (padded/truncated to this).
_MAX_TOKENS = 128
# Sigmoid cut-off used for both the award flag and each genre.
_THRESHOLD = 0.5

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
cp = torch.load(r"model_only.pt", map_location="cpu")

model_img = ImageModel(len(class_names))
model_img.load_state_dict(cp['w_i'])
# Modules start in training mode; switch to eval so dropout / batch-norm
# layers (if any) behave deterministically at inference time.
model_img.eval()

model_text = TextModel(MODEL_NAME, len(class_names))
model_text.load_state_dict(cp['w_t'])
model_text.eval()

image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])


def _run_text_model(title, synopsis):
    """Tokenize "<title> <synopsis>" and return the text model's outputs, batch dim squeezed."""
    encoded = tokenizer(
        f"{title} {synopsis}",
        add_special_tokens=True,
        max_length=_MAX_TOKENS,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        outputs = model_text((encoded['input_ids'], encoded['attention_mask']))
    return _squeeze_outputs(outputs)


def _run_image_model(img):
    """Preprocess a NumPy image and return the image model's outputs, batch dim squeezed."""
    if img is None:
        # Gradio passes None when no image was supplied; fail with a readable
        # message instead of an AttributeError on `None.astype`.
        raise gr.Error("Please provide an image.")
    # Let Pillow infer the mode from the array, then normalise to 3 channels.
    pil_img = Image.fromarray(img.astype('uint8')).convert('RGB')
    batch = image_transforms(pil_img).unsqueeze(0)  # add batch dimension
    with torch.no_grad():
        outputs = model_img(batch)
    return _squeeze_outputs(outputs)


def _squeeze_outputs(outputs):
    """Return the first three model outputs with dimension 0 squeezed."""
    return tuple(outputs[i].squeeze(0) for i in range(3))


def _decode(score, award_logit, genre_logits):
    """Turn (score, award logit, genre logits) into the values shown in the UI."""
    is_award = torch.sigmoid(award_logit) >= _THRESHOLD
    genre_probs = torch.sigmoid(genre_logits).tolist()
    genres = [
        name
        for prob, name in zip(genre_probs, class_names)
        if prob >= _THRESHOLD
    ]
    return round(score.item(), 2), is_award.item(), {"genres": genres}


def text_predictor(title: str, synopsis: str) -> tuple:
    """Predict from text only.

    Args:
        title: Title text; joined with ``synopsis`` by a single space.
        synopsis: Synopsis text.

    Returns:
        ``(score, is_award, {"genres": [...]})`` where ``score`` is rounded to
        two decimals, ``is_award`` is the thresholded award sigmoid and the
        genre list holds every class whose sigmoid is at least 0.5.
    """
    return _decode(*_run_text_model(title, synopsis))


def img_predictor(img: np.ndarray) -> tuple:
    """Predict from an image only.

    Args:
        img: Image as a NumPy array, as delivered by ``gr.Image``.

    Returns:
        ``(score, is_award, {"genres": [...]})``, same layout as
        ``text_predictor``.

    Raises:
        gr.Error: If no image was provided.
    """
    return _decode(*_run_image_model(img))


def combine_predictor(title: str, synopsis: str, img: np.ndarray) -> tuple:
    """Predict from text and image by averaging the two models' raw outputs.

    The score and the pre-sigmoid award / genre outputs of both models are
    averaged element-wise before thresholding.

    Args:
        title: Title text; joined with ``synopsis`` by a single space.
        synopsis: Synopsis text.
        img: Image as a NumPy array, as delivered by ``gr.Image``.

    Returns:
        ``(score, is_award, {"genres": [...]})``, same layout as
        ``text_predictor``.

    Raises:
        gr.Error: If no image was provided.
    """
    text_out = _run_text_model(title, synopsis)
    img_out = _run_image_model(img)
    averaged = [(i_out + t_out) / 2 for i_out, t_out in zip(img_out, text_out)]
    return _decode(*averaged)


def _text_inputs():
    """Build fresh title/synopsis text boxes for one interface."""
    return [
        gr.Text(placeholder="Input title here"),
        gr.Text(placeholder="Input synopsis here"),
    ]


def _outputs():
    """Build fresh output components (score, award flag, genres JSON) for one interface."""
    return [gr.Label(label='Score'), gr.Label(label='Is Winning Award?'), "json"]


iface_1 = gr.Interface(text_predictor, _text_inputs(), _outputs())
iface_2 = gr.Interface(img_predictor, gr.Image(height=224, width=224), _outputs())
iface_3 = gr.Interface(
    combine_predictor,
    _text_inputs() + [gr.Image(height=224, width=224)],
    _outputs(),
)

demo = gr.TabbedInterface(
    [iface_1, iface_2, iface_3],
    ["From Text", "From Image", "From Text and Image"],
)
demo.launch()  # Launches the mini app!