Commit 57d947c · verified · 0 parents · committed by dixyes

initial commit

Files changed (4):
  1. .gitattributes +2 -0
  2. Readme.md +5 -0
  3. convert-pt-to-ggml.py +390 -0
  4. ggml-model.bin +3 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.bin filter=lfs diff=lfs merge=lfs -text
+
Readme.md ADDED
@@ -0,0 +1,5 @@
+ # Belle-whisper-large-v3-zh GGML version
+
+ This is the GGML version of [BELLE-2/Belle-whisper-large-v3-zh](https://huggingface.co/BELLE-2/Belle-whisper-large-v3-zh).
+
+ It was converted using convert-pt-to-ggml.py.
convert-pt-to-ggml.py ADDED
@@ -0,0 +1,390 @@
+
+ # modified from https://github.com/ggerganov/whisper.cpp/blob/2cdfc4e0251d77604b3879713ae403fe694d74e4/models/convert-pt-to-ggml.py
+ # hard coded for large-v3 models
+
+ # Convert Whisper transformer model from PyTorch to ggml format
+ #
+ # Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+ #
+ # You need to clone the original repo in ~/path/to/repo/whisper/
+ #
+ #   git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+ #
+ # It provides various assets needed by the algorithm:
+ #
+ #   - tokenizer
+ #   - mel filters
+ #
+ # Also, you need to have the original models in ~/.cache/whisper/
+ # See the original repo for more details.
+ #
+ # This script loads the specified model and whisper assets and saves them in ggml format.
+ # The output is a single binary file containing the following information:
+ #
+ #   - hparams
+ #   - mel filters
+ #   - tokenizer vocab
+ #   - model variables
+ #
+ # For each variable, write the following:
+ #
+ #   - Number of dimensions (int)
+ #   - Name length (int)
+ #   - Dimensions (int[n_dims])
+ #   - Name (char[name_length])
+ #   - Data (float[n_elements])
+ #
+
+ import io
+ import os
+ import sys
+ import struct
+ import json
+ import code
+ import torch
+ import numpy as np
+ import base64
+ from pathlib import Path
+ #from transformers import GPTJForCausalLM
+ #from transformers import GPT2TokenizerFast
+
+ # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
+ #LANGUAGES = {
+ #    "en": "english",
+ #    "zh": "chinese",
+ #    "de": "german",
+ #    "es": "spanish",
+ #    "ru": "russian",
+ #    "ko": "korean",
+ #    "fr": "french",
+ #    "ja": "japanese",
+ #    "pt": "portuguese",
+ #    "tr": "turkish",
+ #    "pl": "polish",
+ #    "ca": "catalan",
+ #    "nl": "dutch",
+ #    "ar": "arabic",
+ #    "sv": "swedish",
+ #    "it": "italian",
+ #    "id": "indonesian",
+ #    "hi": "hindi",
+ #    "fi": "finnish",
+ #    "vi": "vietnamese",
+ #    "iw": "hebrew",
+ #    "uk": "ukrainian",
+ #    "el": "greek",
+ #    "ms": "malay",
+ #    "cs": "czech",
+ #    "ro": "romanian",
+ #    "da": "danish",
+ #    "hu": "hungarian",
+ #    "ta": "tamil",
+ #    "no": "norwegian",
+ #    "th": "thai",
+ #    "ur": "urdu",
+ #    "hr": "croatian",
+ #    "bg": "bulgarian",
+ #    "lt": "lithuanian",
+ #    "la": "latin",
+ #    "mi": "maori",
+ #    "ml": "malayalam",
+ #    "cy": "welsh",
+ #    "sk": "slovak",
+ #    "te": "telugu",
+ #    "fa": "persian",
+ #    "lv": "latvian",
+ #    "bn": "bengali",
+ #    "sr": "serbian",
+ #    "az": "azerbaijani",
+ #    "sl": "slovenian",
+ #    "kn": "kannada",
+ #    "et": "estonian",
+ #    "mk": "macedonian",
+ #    "br": "breton",
+ #    "eu": "basque",
+ #    "is": "icelandic",
+ #    "hy": "armenian",
+ #    "ne": "nepali",
+ #    "mn": "mongolian",
+ #    "bs": "bosnian",
+ #    "kk": "kazakh",
+ #    "sq": "albanian",
+ #    "sw": "swahili",
+ #    "gl": "galician",
+ #    "mr": "marathi",
+ #    "pa": "punjabi",
+ #    "si": "sinhala",
+ #    "km": "khmer",
+ #    "sn": "shona",
+ #    "yo": "yoruba",
+ #    "so": "somali",
+ #    "af": "afrikaans",
+ #    "oc": "occitan",
+ #    "ka": "georgian",
+ #    "be": "belarusian",
+ #    "tg": "tajik",
+ #    "sd": "sindhi",
+ #    "gu": "gujarati",
+ #    "am": "amharic",
+ #    "yi": "yiddish",
+ #    "lo": "lao",
+ #    "uz": "uzbek",
+ #    "fo": "faroese",
+ #    "ht": "haitian creole",
+ #    "ps": "pashto",
+ #    "tk": "turkmen",
+ #    "nn": "nynorsk",
+ #    "mt": "maltese",
+ #    "sa": "sanskrit",
+ #    "lb": "luxembourgish",
+ #    "my": "myanmar",
+ #    "bo": "tibetan",
+ #    "tl": "tagalog",
+ #    "mg": "malagasy",
+ #    "as": "assamese",
+ #    "tt": "tatar",
+ #    "haw": "hawaiian",
+ #    "ln": "lingala",
+ #    "ha": "hausa",
+ #    "ba": "bashkir",
+ #    "jw": "javanese",
+ #    "su": "sundanese",
+ #}
+
+ ## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
+ #def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
+ #    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ #    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
+ #    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+ #
+ #    specials = [
+ #        "<|startoftranscript|>",
+ #        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+ #        "<|translate|>",
+ #        "<|transcribe|>",
+ #        "<|startoflm|>",
+ #        "<|startofprev|>",
+ #        "<|nocaptions|>",
+ #        "<|notimestamps|>",
+ #    ]
+ #
+ #    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+ #    return tokenizer
+
+ # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+ def bytes_to_unicode():
+     """
+     Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+     The reversible bpe codes work on unicode strings.
+     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+     This is a significant percentage of your normal, say, 32K bpe vocab.
+     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+     This also avoids mapping to whitespace/control characters the bpe code barfs on.
+     """
+     bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+     cs = bs[:]
+     n = 0
+     for b in range(2**8):
+         if b not in bs:
+             bs.append(b)
+             cs.append(2**8+n)
+             n += 1
+     cs = [chr(n) for n in cs]
+     return dict(zip(bs, cs))
+
+
+ # https://github.com/openai/whisper/discussions/830
+ def hf_to_whisper_states(text):
+     return (text
+         .replace("model.", "")
+         .replace("layers", "blocks")
+         .replace("fc1", "mlp.0")
+         .replace("fc2", "mlp.2")
+         .replace("final_layer_norm", "mlp_ln")
+         .replace(".self_attn.q_proj", ".attn.query")
+         .replace(".self_attn.k_proj", ".attn.key")
+         .replace(".self_attn.v_proj", ".attn.value")
+         .replace(".self_attn_layer_norm", ".attn_ln")
+         .replace(".self_attn.out_proj", ".attn.out")
+         .replace(".encoder_attn.q_proj", ".cross_attn.query")
+         .replace(".encoder_attn.k_proj", ".cross_attn.key")
+         .replace(".encoder_attn.v_proj", ".cross_attn.value")
+         .replace(".encoder_attn_layer_norm", ".cross_attn_ln")
+         .replace(".encoder_attn.out_proj", ".cross_attn.out")
+         .replace("decoder.layer_norm.", "decoder.ln.")
+         .replace("encoder.layer_norm.", "encoder.ln_post.")
+         .replace("embed_tokens", "token_embedding")
+         .replace("encoder.embed_positions.weight", "encoder.positional_embedding")
+         .replace("decoder.embed_positions.weight", "decoder.positional_embedding")
+         .replace("layer_norm", "ln_post")
+     )
+
+ if len(sys.argv) < 4:
+     print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
+     sys.exit(1)
+
+ fname_inp = Path(sys.argv[1])
+ dir_whisper = Path(sys.argv[2])
+ dir_out = Path(sys.argv[3])
+
+ # try to load PyTorch binary data
+ try:
+     model_bytes = open(fname_inp, "rb").read()
+     with io.BytesIO(model_bytes) as fp:
+         checkpoint = torch.load(fp, map_location="cpu")
+ except Exception:
+     print("Error: failed to load PyTorch model file:", fname_inp)
+     sys.exit(1)
+
+ # hparams = checkpoint["dims"]
+ # same as large-v3
+ hparams = {
+     'n_mels': 128,
+     'n_vocab': 51866,
+     'n_audio_ctx': 1500,
+     'n_audio_state': 1280,
+     'n_audio_head': 20,
+     'n_audio_layer': 32,
+     'n_text_ctx': 448,
+     'n_text_state': 1280,
+     'n_text_head': 20,
+     'n_text_layer': 32
+ }
+ print("hparams:", hparams)
+
+ list_vars = checkpoint
+
+ #print(list_vars['encoder.positional_embedding'])
+ #print(list_vars['encoder.conv1.weight'])
+ #print(list_vars['encoder.conv1.weight'].shape)
+
+ # load mel filters
+ n_mels = hparams["n_mels"]
+ with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
+     filters = torch.from_numpy(f[f"mel_{n_mels}"])
+ #print (filters)
+
+ #code.interact(local=locals())
+
+ # load tokenizer
+ # for backwards compatibility, also check for older hf_transformers format tokenizer files
+ # old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
+ # new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
+ multilingual = hparams["n_vocab"] >= 51865
+ tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
+ tokenizer_type = "tiktoken"
+ if not tokenizer.is_file():
+     tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual" or "gpt2") / "vocab.json"
+     tokenizer_type = "hf_transformers"
+     if not tokenizer.is_file():
+         print("Error: failed to find either tiktoken or hf_transformers tokenizer file:", tokenizer)
+         sys.exit(1)
+
+ byte_encoder = bytes_to_unicode()
+ byte_decoder = {v:k for k, v in byte_encoder.items()}
+
+ if tokenizer_type == "tiktoken":
+     with open(tokenizer, "rb") as f:
+         contents = f.read()
+         tokens = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line)}
+ elif tokenizer_type == "hf_transformers":
+     with open(tokenizer, "r", encoding="utf8") as f:
+         _tokens_raw = json.load(f)
+         if '<|endoftext|>' in _tokens_raw:
+             # ensures exact same model as tokenizer_type == tiktoken
+             # details: https://github.com/ggerganov/whisper.cpp/pull/725
+             del _tokens_raw['<|endoftext|>']
+         tokens = {bytes([byte_decoder[c] for c in token]): int(idx) for token, idx in _tokens_raw.items()}
+
+ # output in the same directory as the model
+ fname_out = dir_out / "ggml-model.bin"
+
+ # use 16-bit or 32-bit floats
+ use_f16 = True
+ if len(sys.argv) > 4:
+     use_f16 = False
+     fname_out = dir_out / "ggml-model-f32.bin"
+
+ fout = fname_out.open("wb")
+
+ fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+ fout.write(struct.pack("i", hparams["n_vocab"]))
+ fout.write(struct.pack("i", hparams["n_audio_ctx"]))
+ fout.write(struct.pack("i", hparams["n_audio_state"]))
+ fout.write(struct.pack("i", hparams["n_audio_head"]))
+ fout.write(struct.pack("i", hparams["n_audio_layer"]))
+ fout.write(struct.pack("i", hparams["n_text_ctx"]))
+ fout.write(struct.pack("i", hparams["n_text_state"]))
+ fout.write(struct.pack("i", hparams["n_text_head"]))
+ fout.write(struct.pack("i", hparams["n_text_layer"]))
+ fout.write(struct.pack("i", hparams["n_mels"]))
+ fout.write(struct.pack("i", use_f16))
+
+ # write mel filters
+ fout.write(struct.pack("i", filters.shape[0]))
+ fout.write(struct.pack("i", filters.shape[1]))
+ for i in range(filters.shape[0]):
+     for j in range(filters.shape[1]):
+         fout.write(struct.pack("f", filters[i][j]))
+
+ # write tokenizer
+ fout.write(struct.pack("i", len(tokens)))
+
+ for key in tokens:
+     fout.write(struct.pack("i", len(key)))
+     fout.write(key)
+
+ for name in sorted(list_vars.keys(), key=hf_to_whisper_states):
+     if name == 'proj_out.weight':
+         continue
+     data = list_vars[name].squeeze().numpy()
+     name = hf_to_whisper_states(name)
+     print("Processing variable: ", name, " with shape: ", data.shape)
+
+     # reshape conv bias from [n] to [n, 1]
+     if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
+         data = data.reshape(data.shape[0], 1)
+         print(f"  Reshaped variable: {name} to shape: ", data.shape)
+
+     n_dims = len(data.shape)
+
+     # looks like the whisper models are in f16 by default
+     # so we need to convert the small tensors to f32 until we fully support f16 in ggml
+     # ftype == 0 -> float32, ftype == 1 -> float16
+     ftype = 1
+     if use_f16:
+         if n_dims < 2 or \
+                 name == "encoder.conv1.bias" or \
+                 name == "encoder.conv2.bias" or \
+                 name == "encoder.positional_embedding" or \
+                 name == "decoder.positional_embedding":
+             print("  Converting to float32")
+             data = data.astype(np.float32)
+             ftype = 0
+         else:
+             data = data.astype(np.float16)
+     else:
+         data = data.astype(np.float32)
+         ftype = 0
+
+     #if name.startswith("encoder"):
+     #    if name.endswith("mlp.0.weight") or \
+     #       name.endswith("mlp.2.weight"):
+     #        print("  Transposing")
+     #        data = data.transpose()
+
+     # header
+     str_ = name.encode('utf-8')
+     fout.write(struct.pack("iii", n_dims, len(str_), ftype))
+     for i in range(n_dims):
+         fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+     fout.write(str_)
+
+     # data
+     data.tofile(fout)
+
+ fout.close()
+
+ print("Done. Output file: ", fname_out)
+ print("")
ggml-model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e06ab64ed655e8e7823871a25ba5b5fcb0f99ff74c82940df2974d9702895a1
+ size 3095033483