import clip
import gradio as gr
import numpy as np
import torch
from googletrans import Translator
from PIL import Image

TOP_N = 5


def match_texts(in_img: Image.Image) -> dict:
    """Return the candidate texts that best match the input image, with confidences."""
    # Prepare the model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)

    # Text preprocessing: translate the Japanese candidate sentences into
    # English (CLIP is trained on English text), keeping a mapping back to
    # the original Japanese for display.
    translator = Translator()
    trans_dict = {}
    with open("./sentences_ja.txt") as f:
        for line in f:
            ja_sentence = line.strip()
            if not ja_sentence:
                continue
            en_sentence = translator.translate(ja_sentence, dest="en", src="ja").text
            trans_dict[en_sentence] = ja_sentence
    en_sentences = list(trans_dict.keys())
    texts = clip.tokenize(en_sentences).to(device)

    # Image preprocessing
    # image: Tensor (3, 224, 224) -> (1, 3, 224, 224)
    image = preprocess(in_img).unsqueeze(0).to(device)

    # Run the CLIP model
    with torch.no_grad():
        logits_per_image, logits_per_text = model(image, texts)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    probs_per_image = probs.reshape(-1)
    sort_index = np.argsort(probs_per_image)[::-1]

    # Output the results (texts), sorted by descending confidence
    idxs = sort_index.tolist()
    # English output
    # confidences = {en_sentences[i]: float(probs_per_image[i]) for i in idxs}
    # Output mapped back to the original Japanese
    confidences = {trans_dict[en_sentences[i]]: float(probs_per_image[i]) for i in idxs}
    return confidences


if __name__ == "__main__":
    inputs = gr.Image(type="pil", label="Input image")
    outputs = gr.Label(num_top_classes=TOP_N, label=f"Top-{TOP_N} matching texts")
    gr.Interface(
        fn=match_texts,
        inputs=inputs,
        outputs=outputs,
        examples=["examples-01.jpg", "examples-02.jpg", "examples-03.jpg"],
        allow_flagging="never",
    ).launch(share=False)
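# --- Input-file sketch (assumption: the actual contents of sentences_ja.txt
# are not part of this listing). The loop above expects one Japanese sentence
# per line; blank lines are skipped. A hypothetical example:
#
#   犬の写真      (a photo of a dog)
#   猫の写真      (a photo of a cat)
#   車の写真      (a photo of a car)
#
# Dependency note: "clip" here is OpenAI's CLIP package, installed with
#   pip install git+https://github.com/openai/CLIP.git
# and the synchronous Translator.translate() call above matches googletrans
# releases in the 3.x / 4.0.0rc1 line.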