windy2612 committed on
Commit 8a341bb · verified · 1 Parent(s): 9c26227

Upload 7 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Checkpoint.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
Checkpoint.data-00000-of-00001 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42c61a492b21231b78d69fbd350300e6ee5b685b8cc75cf72e3f60912fd1f5c3
+size 201951653
Checkpoint.index ADDED
Binary file (30 kB)
 
app.py ADDED
@@ -0,0 +1,245 @@
+import re
+
+import gradio as gr
+from tensorflow import keras
+from keras import layers
+import tensorflow as tf
+import numpy as np
+
+IMAGE_SIZE = (299, 299)
+VOCAB_SIZE = 8800
+SEQ_LENGTH = 25
+EMBED_DIM = 512
+FF_DIM = 512
+
+image_augmentation = keras.Sequential(
+    [
+        keras.layers.RandomFlip("horizontal"),
+        keras.layers.RandomRotation(0.2),
+        keras.layers.RandomContrast(0.3),
+    ]
+)
+
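+# CNN feature extractor: a frozen, ImageNet-pretrained EfficientNetB0 whose 2D
+# feature map is reshaped into a sequence of feature vectors for the encoder.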
+def get_cnn_model():
+    base_model = keras.applications.efficientnet.EfficientNetB0(
+        input_shape=(*IMAGE_SIZE, 3),
+        include_top=False,
+        weights="imagenet"
+    )
+    base_model.trainable = False
+    base_model_out = base_model.output
+    base_model_out = layers.Reshape((-1, base_model_out.shape[-1]))(base_model_out)
+    cnn_model = keras.models.Model(base_model.input, base_model_out)
+    return cnn_model
+
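+# Encoder block: layer norm and a dense projection of the image features,
+# followed by a single self-attention layer with a residual connection.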
+class TransformerEncoderBlock(layers.Layer):
+    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.dense_dim = dense_dim
+        self.num_heads = num_heads
+        self.attention_1 = layers.MultiHeadAttention(
+            num_heads=num_heads, key_dim=embed_dim, dropout=0.0
+        )
+        self.layernorm_1 = layers.LayerNormalization()
+        self.layernorm_2 = layers.LayerNormalization()
+        self.dense_1 = layers.Dense(embed_dim, activation="relu")
+
+    def call(self, inputs, training):
+        inputs = self.layernorm_1(inputs)
+        inputs = self.dense_1(inputs)
+
+        attention_output_1 = self.attention_1(
+            query=inputs,
+            value=inputs,
+            key=inputs,
+            training=training,
+        )
+        out_1 = self.layernorm_2(inputs + attention_output_1)
+        return out_1
+
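+# Caption embedding: token embeddings plus learned position embeddings;
+# mask_zero=True lets padding tokens be masked downstream.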
+class PositionalEmbedding(layers.Layer):
+    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.token_embeddings = layers.Embedding(
+            input_dim=vocab_size, output_dim=embed_dim, mask_zero=True
+        )
+        self.position_embeddings = layers.Embedding(
+            input_dim=sequence_length, output_dim=embed_dim
+        )
+        self.sequence_length = sequence_length
+        self.vocab_size = vocab_size
+        self.embed_dim = embed_dim
+        self.add = layers.Add()
+
+    def call(self, seq):
+        seq = self.token_embeddings(seq)
+
+        x = tf.range(tf.shape(seq)[1])
+        x = x[tf.newaxis, :]
+        x = self.position_embeddings(x)
+
+        return self.add([seq, x])
+
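+# Decoder block: causal self-attention over the partial caption, cross-attention
+# over the encoded image, and a feed-forward head ending in a softmax over the vocabulary.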
+class TransformerDecoderBlock(layers.Layer):
+    def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.ff_dim = ff_dim
+        self.num_heads = num_heads
+        self.attention_1 = layers.MultiHeadAttention(
+            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
+        )
+        self.attention_2 = layers.MultiHeadAttention(
+            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
+        )
+        self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
+        self.ffn_layer_2 = layers.Dense(embed_dim)
+
+        self.layernorm_1 = layers.LayerNormalization()
+        self.layernorm_2 = layers.LayerNormalization()
+        self.layernorm_3 = layers.LayerNormalization()
+
+        self.embedding = PositionalEmbedding(
+            embed_dim=EMBED_DIM,
+            sequence_length=SEQ_LENGTH,
+            vocab_size=VOCAB_SIZE,
+        )
+        self.out = layers.Dense(VOCAB_SIZE, activation="softmax")
+
+        self.dropout_1 = layers.Dropout(0.3)
+        self.dropout_2 = layers.Dropout(0.5)
+        self.supports_masking = True
+
+    def call(self, inputs, encoder_outputs, training, mask=None):
+        inputs = self.embedding(inputs)
+
+        attention_output_1 = self.attention_1(
+            query=inputs,
+            value=inputs,
+            key=inputs,
+            training=training,
+            use_causal_mask=True,
+        )
+        out_1 = self.layernorm_1(inputs + attention_output_1)
+
+        attention_output_2 = self.attention_2(
+            query=out_1,
+            value=encoder_outputs,
+            key=encoder_outputs,
+            training=training,
+        )
+        out_2 = self.layernorm_2(out_1 + attention_output_2)
+
+        ffn_out = self.ffn_layer_1(out_2)
+        ffn_out = self.dropout_1(ffn_out, training=training)
+        ffn_out = self.ffn_layer_2(ffn_out)
+
+        ffn_out = self.layernorm_3(ffn_out + out_2, training=training)
+        ffn_out = self.dropout_2(ffn_out, training=training)
+        preds = self.out(ffn_out)
+        return preds
+
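+# End-to-end captioning model: optionally augment the image, extract CNN features,
+# encode them, then decode the caption tokens into next-token probabilities.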
+class ImageCaptioningModel(keras.Model):
+    def __init__(self, cnn_model, encoder, decoder, image_aug=None, **kwargs):
+        super().__init__(**kwargs)
+        self.cnn_model = cnn_model
+        self.encoder = encoder
+        self.decoder = decoder
+        self.image_aug = image_aug
+
+    def call(self, inputs, training):
+        img, caption = inputs
+        if self.image_aug:
+            img = self.image_aug(img)
+        img_embed = self.cnn_model(img)
+        encoder_out = self.encoder(img_embed, training=training)
+        pred = self.decoder(caption, encoder_out, training=training)
+        return pred
+
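+# Rebuild the architecture with the hyperparameters above, then restore the trained
+# weights from the "Checkpoint" files uploaded in this commit.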
+cnn_model = get_cnn_model()
+encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
+decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
+
+loaded_model = ImageCaptioningModel(
+    cnn_model=cnn_model,
+    encoder=encoder,
+    decoder=decoder,
+    image_aug=image_augmentation,
+)
+
+loaded_model.compile(
+    optimizer=keras.optimizers.Adam(learning_rate=3e-4),
+    loss="sparse_categorical_crossentropy",
+    metrics=["accuracy"],
+)
+
+loaded_model.load_weights("Checkpoint")
+
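+# Recreate the caption tokenizer: the saved vocabulary drives index-to-word lookup,
+# and the raw captions in text_data.npy are used to re-adapt the TextVectorization layer.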
+vocab = np.load("vocabulary.npy")
+index_lookup = dict(zip(range(len(vocab)), vocab))
+data_txt = np.load("text_data.npy").tolist()
+
+max_decoded_sentence_length = SEQ_LENGTH - 1
+strip_chars = "!\"#$%&'()*+,-./:;=?@[\]^_`{|}~"
+
+
+def custom_standardization(input_string):
+    lowercase = tf.strings.lower(input_string)
+    # Strip punctuation: the characters must form a regex character class ([...]);
+    # without the brackets, only the literal full punctuation string would ever match.
+    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")
+
+
+vectorization = keras.layers.TextVectorization(
+    max_tokens=VOCAB_SIZE,
+    output_mode="int",
+    output_sequence_length=SEQ_LENGTH,
+    standardize=custom_standardization,
+)
+
+vectorization.adapt(data_txt)
+
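+# Greedy decoding: starting from "startseq", repeatedly pick the most probable next
+# token until "endseq" or the maximum caption length is reached.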
+def generate_caption(image):
+    # Gradio hands the uploaded image to this function as a NumPy array by default.
+    if not isinstance(image, np.ndarray):
+        image = np.asarray(image)
+    img = tf.constant(image)
+    img = tf.image.resize(img, IMAGE_SIZE)
+    img = tf.image.convert_image_dtype(img, tf.float32)
+
+    img = tf.expand_dims(img, 0)
+    img = loaded_model.cnn_model(img)
+    encoded_img = loaded_model.encoder(img, training=False)
+
+    decoded_caption = "startseq "
+    for i in range(max_decoded_sentence_length):
+        tokenized_caption = vectorization([decoded_caption])
+        mask = tf.math.not_equal(tokenized_caption, 0)
+        predictions = loaded_model.decoder(
+            tokenized_caption, encoded_img, training=False, mask=mask
+        )
+        sampled_token_index = np.argmax(predictions[0, i, :])
+        sampled_token = index_lookup[sampled_token_index]
+        if sampled_token == "endseq":
+            break
+        decoded_caption += " " + sampled_token
+
+    decoded_caption = decoded_caption.replace("startseq ", "")
+    decoded_caption = decoded_caption.replace(" endseq", "").strip()
+    return decoded_caption
+
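+# Minimal Gradio UI: an image input mapped to the generated caption.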
+demo = gr.Interface(
+    fn=generate_caption,
+    inputs=gr.components.Image(),
+    outputs=[gr.components.Textbox(label="Generated Caption", lines=3)],
+)
+
+demo.launch(share=True, debug=True)
checkpoint ADDED
@@ -0,0 +1,2 @@
+model_checkpoint_path: "Checkpoint"
+all_model_checkpoint_paths: "Checkpoint"
requirements.txt ADDED
@@ -0,0 +1,4 @@
+tensorflow==2.10.1
+keras==2.10.0
+gradio==4.27.0
+pandas==2.2.2
text_data.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfaf3c0a0414615b9ba4902ef831882aaf3a9bf60b6708857ca5b7d07d7f2b8b
+size 34629608
vocabulary.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8870ebd02d94911b407ed4a889f161bd0d467bbd123a93f4e4457950416c6ec9
+size 774528