from huggingface_hub import from_pretrained_keras import numpy as np import pandas as pd import tensorflow as tf import tensorflow_hub as hub import tensorflow_text as text from tensorflow import keras import gradio as gr def make_bert_preprocessing_model(sentence_features, seq_length=128): """Returns Model mapping string features to BERT inputs. Args: sentence_features: A list with the names of string-valued features. seq_length: An integer that defines the sequence length of BERT inputs. Returns: A Keras Model that can be called on a list or dict of string Tensors (with the order or names, resp., given by sentence_features) and returns a dict of tensors for input to BERT. """ input_segments = [ tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft) for ft in sentence_features ] # tokenize the text to word pieces bert_preprocess = hub.load(bert_preprocess_path) tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name="tokenizer") segments = [tokenizer(s) for s in input_segments] truncated_segments = segments packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs, arguments=dict(seq_length=seq_length), name="packer") model_inputs = packer(truncated_segments) return keras.Model(input_segments, model_inputs) def preprocess_image(image_path, resize): extension = tf.strings.split(image_path)[-1] image = tf.io.read_file(image_path) if extension == b"jpg": image = tf.image.decode_jpeg(image, 3) else: image = tf.image.decode_png(image, 3) image = tf.image.resize(image, resize) return image def preprocess_text(text_1, text_2): text_1 = tf.convert_to_tensor([text_1]) text_2 = tf.convert_to_tensor([text_2]) output = bert_preprocess_model([text_1, text_2]) output = {feature: tf.squeeze(output[feature]) for feature in bert_input_features} return output def preprocess_text_and_image(sample, resize): image_1 = preprocess_image(sample['image_1_path'], resize) image_2 = preprocess_image(sample['image_2_path'], resize) text = preprocess_text(sample['text_1'], sample['text_2']) return {"image_1": image_1, "image_2": image_2, "text": text} def classify_info(image_1, text_1, image_2, text_2): sample = dict() sample['image_1_path'] = image_1 sample['image_2_path'] = image_2 sample['text_1'] = text_1 sample['text_2'] = text_2 dataframe = pd.DataFrame(sample, index=[0]) ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), [0])) ds = ds.map(lambda x, y: (preprocess_text_and_image(x, resize), y)).cache() batch_size = 1 auto = tf.data.AUTOTUNE ds = ds.batch(batch_size).prefetch(auto) output = model.predict(ds) label = np.argmax(output) return labels[label] model = from_pretrained_keras("keras-io/multimodal-entailment") resize = (128, 128) bert_input_features = ["input_word_ids", "input_type_ids", "input_mask"] bert_model_path = ("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1") bert_preprocess_path = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3" bert_preprocess_model = make_bert_preprocessing_model(['text_1', 'text_2']) labels = {0: "Contradictory", 1: "Implies", 2: "No Entailment"} resize = (128, 128) image_1 = gr.inputs.Image(type="filepath") image_2 = gr.inputs.Image(type="filepath") text_1 = gr.inputs.Textbox(lines=5) text_2 = gr.inputs.Textbox(lines=5) label = gr.outputs.Label() iface = gr.Interface(classify_info, inputs=[image_1, text_1, image_2, text_2],outputs=label) iface.launch()