Upload 7 files
- .gitattributes +1 -0
- Checkpoint.data-00000-of-00001 +3 -0
- Checkpoint.index +0 -0
- app.py +245 -0
- checkpoint +2 -0
- requirements.txt +4 -0
- text_data.npy +3 -0
- vocabulary.npy +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Checkpoint.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
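(The new rule is the kind that `git lfs track "Checkpoint.data-00000-of-00001"` appends to .gitattributes: it stores the ~200 MB checkpoint shard as a Git LFS pointer instead of committing the binary directly.)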
Checkpoint.data-00000-of-00001
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42c61a492b21231b78d69fbd350300e6ee5b685b8cc75cf72e3f60912fd1f5c3
size 201951653
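An LFS pointer records the SHA-256 of the real payload, so a pulled file can be verified against the oid above. A minimal sketch (hypothetical, not part of the commit):

import hashlib

# Hash the downloaded checkpoint shard; the digest should equal the
# pointer's oid.
h = hashlib.sha256()
with open("Checkpoint.data-00000-of-00001", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
print(h.hexdigest())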
Checkpoint.index
ADDED
Binary file (30 kB)
app.py
ADDED
@@ -0,0 +1,245 @@
import gradio as gr
from tensorflow import keras
from keras import layers
import tensorflow as tf
import numpy as np
import re

# Model hyperparameters; these must match the values used at training time
# so that the checkpoint weights fit the rebuilt architecture.
IMAGE_SIZE = (299, 299)
VOCAB_SIZE = 8800
SEQ_LENGTH = 25
EMBED_DIM = 512
FF_DIM = 512

# Random augmentations used during training; wired into the model below so
# the restored architecture matches the one that produced the checkpoint.
image_augmentation = keras.Sequential(
    [
        keras.layers.RandomFlip("horizontal"),
        keras.layers.RandomRotation(0.2),
        keras.layers.RandomContrast(0.3),
    ]
)

def get_cnn_model():
    # Frozen EfficientNetB0 backbone used as a feature extractor; its
    # spatial feature map is flattened into a sequence for the encoder.
    base_model = keras.applications.efficientnet.EfficientNetB0(
        input_shape=(*IMAGE_SIZE, 3),
        include_top=False,
        weights="imagenet",
    )
    base_model.trainable = False
    base_model_out = base_model.output
    base_model_out = layers.Reshape((-1, base_model_out.shape[-1]))(base_model_out)
    cnn_model = keras.models.Model(base_model.input, base_model_out)
    return cnn_model

class TransformerEncoderBlock(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.0
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.dense_1 = layers.Dense(embed_dim, activation="relu")

    def call(self, inputs, training):
        inputs = self.layernorm_1(inputs)
        inputs = self.dense_1(inputs)

        # Self-attention over the flattened image features, followed by a
        # residual connection and normalization.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            training=training,
        )
        out_1 = self.layernorm_2(inputs + attention_output_1)
        return out_1

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # mask_zero=True makes Keras propagate a padding mask for token id 0.
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim, mask_zero=True
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.add = layers.Add()

    def call(self, seq):
        seq = self.token_embeddings(seq)

        # Learned positional embeddings for positions 0..len-1, broadcast
        # across the batch and added to the token embeddings.
        x = tf.range(tf.shape(seq)[1])
        x = x[tf.newaxis, :]
        x = self.position_embeddings(x)

        return self.add([seq, x])

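# Note: given token ids of shape (batch, seq_len), this layer returns
# (batch, seq_len, EMBED_DIM): token embeddings plus a learned embedding of
# each position index, added elementwise.
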
class TransformerDecoderBlock(layers.Layer):
    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )
        self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
        self.ffn_layer_2 = layers.Dense(embed_dim)

        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

        self.embedding = PositionalEmbedding(
            embed_dim=EMBED_DIM,
            sequence_length=SEQ_LENGTH,
            vocab_size=VOCAB_SIZE,
        )
        self.out = layers.Dense(VOCAB_SIZE, activation="softmax")

        self.dropout_1 = layers.Dropout(0.3)
        self.dropout_2 = layers.Dropout(0.5)
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, training, mask=None):
        inputs = self.embedding(inputs)

        # Masked self-attention over the caption tokens; use_causal_mask
        # prevents each position from attending to later tokens.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            training=training,
            use_causal_mask=True,
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Cross-attention: caption queries attend over the encoded image.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            training=training,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        ffn_out = self.ffn_layer_1(out_2)
        ffn_out = self.dropout_1(ffn_out, training=training)
        ffn_out = self.ffn_layer_2(ffn_out)

        ffn_out = self.layernorm_3(ffn_out + out_2)
        ffn_out = self.dropout_2(ffn_out, training=training)
        preds = self.out(ffn_out)
        return preds

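# Note: unlike a textbook decoder stack, this single block also owns the
# token/positional embedding and the final softmax projection, so the decoder
# object carries the entire caption head.
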
class ImageCaptioningModel(keras.Model):
    def __init__(self, cnn_model, encoder, decoder, image_aug=None, **kwargs):
        super().__init__(**kwargs)
        self.cnn_model = cnn_model
        self.encoder = encoder
        self.decoder = decoder
        self.image_aug = image_aug

    def call(self, inputs, training):
        # Training-time forward pass; generate_caption below drives the
        # sub-models individually for autoregressive inference.
        img, caption = inputs
        if self.image_aug:
            img = self.image_aug(img)
        img_embed = self.cnn_model(img)
        encoder_out = self.encoder(img_embed, training=training)
        pred = self.decoder(caption, encoder_out, training=training)
        return pred


# Rebuild the architecture and restore the trained weights.
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)

loaded_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)

loaded_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=3e-4),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# "Checkpoint" is the prefix of the TF checkpoint files uploaded with this
# app (Checkpoint.index and Checkpoint.data-00000-of-00001).
loaded_model.load_weights("Checkpoint")

vocab = np.load("vocabulary.npy")
index_lookup = dict(zip(range(len(vocab)), vocab))
data_txt = np.load("text_data.npy").tolist()

max_decoded_sentence_length = SEQ_LENGTH - 1
strip_chars = "!\"#$%&'()*+,-./:;=?@[\\]^_`{|}~"


def custom_standardization(input_string):
    # Lowercase and strip punctuation via a regex character class.
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")


vectorization = keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQ_LENGTH,
    standardize=custom_standardization,
)

# Rebuild the tokenizer's vocabulary from the saved training captions.
vectorization.adapt(data_txt)

def generate_caption(image):
    # Gradio passes the uploaded image as a NumPy array.
    if isinstance(image, np.ndarray):
        image = tf.constant(image)
    img = tf.image.resize(image, IMAGE_SIZE)
    img = tf.image.convert_image_dtype(img, tf.float32)

    # Encode the image once, then decode greedily one token at a time,
    # feeding the growing caption back into the decoder.
    img = tf.expand_dims(img, 0)
    img = loaded_model.cnn_model(img)
    encoded_img = loaded_model.encoder(img, training=False)

    decoded_caption = "startseq "
    for i in range(max_decoded_sentence_length):
        tokenized_caption = vectorization([decoded_caption])
        mask = tf.math.not_equal(tokenized_caption, 0)
        predictions = loaded_model.decoder(
            tokenized_caption, encoded_img, training=False, mask=mask
        )
        # The output at position i is the distribution over token i+1.
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = index_lookup[sampled_token_index]
        if sampled_token == "endseq":
            break
        decoded_caption += " " + sampled_token

    decoded_caption = decoded_caption.replace("startseq ", "")
    decoded_caption = decoded_caption.replace(" endseq", "").strip()
    return decoded_caption

demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.components.Image(),
    outputs=[gr.components.Textbox(label="Generated Caption", lines=3)],
)

demo.launch(share=True, debug=True)
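The pipeline can be sanity-checked without the Gradio UI by calling generate_caption directly. A minimal sketch (hypothetical, not part of the commit):

import numpy as np

# A random RGB array exercises the resize -> CNN -> encoder -> greedy-decode
# path end to end; the resulting caption will of course be meaningless.
dummy = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
print(generate_caption(dummy))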
checkpoint
ADDED
@@ -0,0 +1,2 @@
model_checkpoint_path: "Checkpoint"
all_model_checkpoint_paths: "Checkpoint"
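This two-line manifest is what lets TensorFlow resolve the bare prefix passed to load_weights("Checkpoint") in app.py. A sketch, assuming the checkpoint files sit in the working directory:

import tensorflow as tf

# Reads the `checkpoint` manifest and returns the newest checkpoint prefix,
# here "Checkpoint", which pairs Checkpoint.index with its data shard.
print(tf.train.latest_checkpoint("."))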
requirements.txt
ADDED
@@ -0,0 +1,4 @@
tensorflow==2.10.1
keras==2.10.0
gradio==4.27.0
pandas==2.2.2
text_data.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bfaf3c0a0414615b9ba4902ef831882aaf3a9bf60b6708857ca5b7d07d7f2b8b
size 34629608
vocabulary.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8870ebd02d94911b407ed4a889f161bd0d467bbd123a93f4e4457950416c6ec9
size 774528