Commit 57d947c · verified · 0 parents · committed by dixyes

initial commit

Files changed (4):
  1. .gitattributes +2 -0
  2. Readme.md +5 -0
  3. convert-pt-to-ggml.py +390 -0
  4. ggml-model.bin +3 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.bin filter=lfs diff=lfs merge=lfs -text
+
Readme.md ADDED
@@ -0,0 +1,5 @@
+ # Belle-whisper-large-v3-zh GGML version
+
+ This is the GGML version of [BELLE-2/Belle-whisper-large-v3-zh](https://huggingface.co/BELLE-2/Belle-whisper-large-v3-zh).
+
+ It was converted using convert-pt-to-ggml.py.
convert-pt-to-ggml.py ADDED
@@ -0,0 +1,390 @@
+
+ # modified from https://github.com/ggerganov/whisper.cpp/blob/2cdfc4e0251d77604b3879713ae403fe694d74e4/models/convert-pt-to-ggml.py
+ # hard coded for large-v3 models
+
+ # Convert Whisper transformer model from PyTorch to ggml format
+ #
+ # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+ #
+ # You need to clone the original repo in ~/path/to/repo/whisper/
+ #
+ #   git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+ #
+ # It provides various assets needed by the algorithm:
+ #
+ #   - tokenizer
+ #   - mel filters
+ #
+ # Also, you need to have the original models in ~/.cache/whisper/
+ # See the original repo for more details.
+ #
+ # This script loads the specified model and whisper assets and saves them in ggml format.
+ # The output is a single binary file containing the following information:
+ #
+ #   - hparams
+ #   - mel filters
+ #   - tokenizer vocab
+ #   - model variables
+ #
+ # For each variable, write the following:
+ #
+ #   - Number of dimensions (int)
+ #   - Name length (int)
+ #   - Dimensions (int[n_dims])
+ #   - Name (char[name_length])
+ #   - Data (float[n_elements])
+ #
+
+ import io
+ import os
+ import sys
+ import struct
+ import json
+ import code
+ import torch
+ import numpy as np
+ import base64
+ from pathlib import Path
+ #from transformers import GPTJForCausalLM
+ #from transformers import GPT2TokenizerFast
+
+ # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
+ #LANGUAGES = {
+ #    "en": "english",
+ #    "zh": "chinese",
+ #    "de": "german",
+ #    "es": "spanish",
+ #    "ru": "russian",
+ #    "ko": "korean",
+ #    "fr": "french",
+ #    "ja": "japanese",
+ #    "pt": "portuguese",
+ #    "tr": "turkish",
+ #    "pl": "polish",
+ #    "ca": "catalan",
+ #    "nl": "dutch",
+ #    "ar": "arabic",
+ #    "sv": "swedish",
+ #    "it": "italian",
+ #    "id": "indonesian",
+ #    "hi": "hindi",
+ #    "fi": "finnish",
+ #    "vi": "vietnamese",
+ #    "iw": "hebrew",
+ #    "uk": "ukrainian",
+ #    "el": "greek",
+ #    "ms": "malay",
+ #    "cs": "czech",
+ #    "ro": "romanian",
+ #    "da": "danish",
+ #    "hu": "hungarian",
+ #    "ta": "tamil",
+ #    "no": "norwegian",
+ #    "th": "thai",
+ #    "ur": "urdu",
+ #    "hr": "croatian",
+ #    "bg": "bulgarian",
+ #    "lt": "lithuanian",
+ #    "la": "latin",
+ #    "mi": "maori",
+ #    "ml": "malayalam",
+ #    "cy": "welsh",
+ #    "sk": "slovak",
+ #    "te": "telugu",
+ #    "fa": "persian",
+ #    "lv": "latvian",
+ #    "bn": "bengali",
+ #    "sr": "serbian",
+ #    "az": "azerbaijani",
+ #    "sl": "slovenian",
+ #    "kn": "kannada",
+ #    "et": "estonian",
+ #    "mk": "macedonian",
+ #    "br": "breton",
+ #    "eu": "basque",
+ #    "is": "icelandic",
+ #    "hy": "armenian",
+ #    "ne": "nepali",
+ #    "mn": "mongolian",
+ #    "bs": "bosnian",
+ #    "kk": "kazakh",
+ #    "sq": "albanian",
+ #    "sw": "swahili",
+ #    "gl": "galician",
+ #    "mr": "marathi",
+ #    "pa": "punjabi",
+ #    "si": "sinhala",
+ #    "km": "khmer",
+ #    "sn": "shona",
+ #    "yo": "yoruba",
+ #    "so": "somali",
+ #    "af": "afrikaans",
+ #    "oc": "occitan",
+ #    "ka": "georgian",
+ #    "be": "belarusian",
+ #    "tg": "tajik",
+ #    "sd": "sindhi",
+ #    "gu": "gujarati",
+ #    "am": "amharic",
+ #    "yi": "yiddish",
+ #    "lo": "lao",
+ #    "uz": "uzbek",
+ #    "fo": "faroese",
+ #    "ht": "haitian creole",
+ #    "ps": "pashto",
+ #    "tk": "turkmen",
+ #    "nn": "nynorsk",
+ #    "mt": "maltese",
+ #    "sa": "sanskrit",
+ #    "lb": "luxembourgish",
+ #    "my": "myanmar",
+ #    "bo": "tibetan",
+ #    "tl": "tagalog",
+ #    "mg": "malagasy",
+ #    "as": "assamese",
+ #    "tt": "tatar",
+ #    "haw": "hawaiian",
+ #    "ln": "lingala",
+ #    "ha": "hausa",
+ #    "ba": "bashkir",
+ #    "jw": "javanese",
+ #    "su": "sundanese",
+ #}
+
+ ## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
+ #def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
+ #    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ #    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
+ #    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+ #
+ #    specials = [
+ #        "<|startoftranscript|>",
+ #        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+ #        "<|translate|>",
+ #        "<|transcribe|>",
+ #        "<|startoflm|>",
+ #        "<|startofprev|>",
+ #        "<|nocaptions|>",
+ #        "<|notimestamps|>",
+ #    ]
+ #
+ #    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+ #    return tokenizer
+
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+ def bytes_to_unicode():
+     """
+     Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+     The reversible bpe codes work on unicode strings.
+     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+     This is a significant percentage of your normal, say, 32K bpe vocab.
+     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+     This also avoids mapping to whitespace/control characters the bpe code barfs on.
+     """
+     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+     cs = bs[:]
+     n = 0
+     for b in range(2**8):
+         if b not in bs:
+             bs.append(b)
+             cs.append(2**8+n)
+             n += 1
+     cs = [chr(n) for n in cs]
+     return dict(zip(bs, cs))
+
+
+ # https://github.com/openai/whisper/discussions/830
+ def hf_to_whisper_states(text):
+     return (text
+         .replace("model.", "")
+         .replace("layers", "blocks")
+         .replace("fc1", "mlp.0")
+         .replace("fc2", "mlp.2")
+         .replace("final_layer_norm", "mlp_ln")
+         .replace(".self_attn.q_proj", ".attn.query")
+         .replace(".self_attn.k_proj", ".attn.key")
+         .replace(".self_attn.v_proj", ".attn.value")
+         .replace(".self_attn_layer_norm", ".attn_ln")
+         .replace(".self_attn.out_proj", ".attn.out")
+         .replace(".encoder_attn.q_proj", ".cross_attn.query")
+         .replace(".encoder_attn.k_proj", ".cross_attn.key")
+         .replace(".encoder_attn.v_proj", ".cross_attn.value")
+         .replace(".encoder_attn_layer_norm", ".cross_attn_ln")
+         .replace(".encoder_attn.out_proj", ".cross_attn.out")
+         .replace("decoder.layer_norm.", "decoder.ln.")
+         .replace("encoder.layer_norm.", "encoder.ln_post.")
+         .replace("embed_tokens", "token_embedding")
+         .replace("encoder.embed_positions.weight", "encoder.positional_embedding")
+         .replace("decoder.embed_positions.weight", "decoder.positional_embedding")
+         .replace("layer_norm", "ln_post")
+     )
+
+ if len(sys.argv) < 4:
+     print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
+     sys.exit(1)
+
+ fname_inp = Path(sys.argv[1])
+ dir_whisper = Path(sys.argv[2])
+ dir_out = Path(sys.argv[3])
+
+ # try to load PyTorch binary data
+ try:
+     model_bytes = open(fname_inp, "rb").read()
+     with io.BytesIO(model_bytes) as fp:
+         checkpoint = torch.load(fp, map_location="cpu")
+ except Exception:
+     print("Error: failed to load PyTorch model file:", fname_inp)
+     sys.exit(1)
+
+ # hparams = checkpoint["dims"]
+ # same as large-v3
+ hparams = {
+     'n_mels': 128,
+     'n_vocab': 51866,
+     'n_audio_ctx': 1500,
+     'n_audio_state': 1280,
+     'n_audio_head': 20,
+     'n_audio_layer': 32,
+     'n_text_ctx': 448,
+     'n_text_state': 1280,
+     'n_text_head': 20,
+     'n_text_layer': 32
+ }
+ print("hparams:", hparams)
+
+ list_vars = checkpoint
+
+ #print(list_vars['encoder.positional_embedding'])
+ #print(list_vars['encoder.conv1.weight'])
+ #print(list_vars['encoder.conv1.weight'].shape)
+
+ # load mel filters
+ n_mels = hparams["n_mels"]
+ with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
+     filters = torch.from_numpy(f[f"mel_{n_mels}"])
+ #print (filters)
+
+ #code.interact(local=locals())
+
+ # load tokenizer
+ # for backwards compatibility, also check for older hf_transformers format tokenizer files
+ # old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
+ # new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
+ multilingual = hparams["n_vocab"] >= 51865
+ tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
+ tokenizer_type = "tiktoken"
+ if not tokenizer.is_file():
+     tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual" or "gpt2") / "vocab.json"
+     tokenizer_type = "hf_transformers"
+     if not tokenizer.is_file():
+         print("Error: failed to find either tiktoken or hf_transformers tokenizer file:", tokenizer)
+         sys.exit(1)
+
+ byte_encoder = bytes_to_unicode()
+ byte_decoder = {v:k for k, v in byte_encoder.items()}
+
+ if tokenizer_type == "tiktoken":
+     with open(tokenizer, "rb") as f:
+         contents = f.read()
+         tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
+ elif tokenizer_type == "hf_transformers":
+     with open(tokenizer, "r", encoding="utf8") as f:
+         _tokens_raw = json.load(f)
+         if '<|endoftext|>' in _tokens_raw:
+             # ensures exact same model as tokenizer_type == tiktoken
+             # details: https://github.com/ggerganov/whisper.cpp/pull/725
+             del _tokens_raw['<|endoftext|>']
+         tokens = {bytes([byte_decoder[c] for c in token]): int(idx) for token, idx in _tokens_raw.items()}
+
+ # output in the same directory as the model
+ fname_out = dir_out / "ggml-model.bin"
+
+ # use 16-bit or 32-bit floats
+ use_f16 = True
+ if len(sys.argv) > 4:
+     use_f16 = False
+     fname_out = dir_out / "ggml-model-f32.bin"
+
+ fout = fname_out.open("wb")
+
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+ fout.write(struct.pack("i", hparams["n_vocab"]))
+ fout.write(struct.pack("i", hparams["n_audio_ctx"]))
+ fout.write(struct.pack("i", hparams["n_audio_state"]))
+ fout.write(struct.pack("i", hparams["n_audio_head"]))
+ fout.write(struct.pack("i", hparams["n_audio_layer"]))
+ fout.write(struct.pack("i", hparams["n_text_ctx"]))
+ fout.write(struct.pack("i", hparams["n_text_state"]))
+ fout.write(struct.pack("i", hparams["n_text_head"]))
+ fout.write(struct.pack("i", hparams["n_text_layer"]))
+ fout.write(struct.pack("i", hparams["n_mels"]))
+ fout.write(struct.pack("i", use_f16))
+
+ # write mel filters
+ fout.write(struct.pack("i", filters.shape[0]))
+ fout.write(struct.pack("i", filters.shape[1]))
+ for i in range(filters.shape[0]):
+     for j in range(filters.shape[1]):
+         fout.write(struct.pack("f", filters[i][j]))
+
+ # write tokenizer
+ fout.write(struct.pack("i", len(tokens)))
+
+ for key in tokens:
+     fout.write(struct.pack("i", len(key)))
+     fout.write(key)
+
+ for name in sorted(list_vars.keys(), key=hf_to_whisper_states):
+     if name == 'proj_out.weight':
+         continue
+     data = list_vars[name].squeeze().numpy()
+     name = hf_to_whisper_states(name)
+     print("Processing variable: ", name, " with shape: ", data.shape)
+
+     # reshape conv bias from [n] to [n, 1]
+     if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
+         data = data.reshape(data.shape[0], 1)
+         print(f"  Reshaped variable: {name} to shape: ", data.shape)
+
+     n_dims = len(data.shape)
+
+     # looks like the whisper models are in f16 by default
+     # so we need to convert the small tensors to f32 until we fully support f16 in ggml
+     # ftype == 0 -> float32, ftype == 1 -> float16
+     ftype = 1
+     if use_f16:
+         if n_dims < 2 or \
+                 name == "encoder.conv1.bias" or \
+                 name == "encoder.conv2.bias" or \
+                 name == "encoder.positional_embedding" or \
+                 name == "decoder.positional_embedding":
+             print("  Converting to float32")
+             data = data.astype(np.float32)
+             ftype = 0
+         else:
+             data = data.astype(np.float16)
+     else:
+         data = data.astype(np.float32)
+         ftype = 0
+
+     #if name.startswith("encoder"):
+     #    if name.endswith("mlp.0.weight") or \
+     #       name.endswith("mlp.2.weight"):
+     #        print("  Transposing")
+     #        data = data.transpose()
+
+     # header
+     str_ = name.encode('utf-8')
+     fout.write(struct.pack("iii", n_dims, len(str_), ftype))
+     for i in range(n_dims):
+         fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+     fout.write(str_)
+
+     # data
+     data.tofile(fout)
+
+ fout.close()
+
+ print("Done. Output file: ", fname_out)
+ print("")
ggml-model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e06ab64ed655e8e7823871a25ba5b5fcb0f99ff74c82940df2974d9702895a1
+ size 3095033483