DuyTa committed
Commit 540fe46 · verified · 1 Parent(s): 9919379

Delete whisper_pipeline

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. whisper_pipeline/HA1.wav +0 -3
  2. whisper_pipeline/api.py +0 -42
  3. whisper_pipeline/check.py +0 -6
  4. whisper_pipeline/dockerfile +0 -51
  5. whisper_pipeline/faster-whisper-main/.github/workflows/ci.yml +0 -90
  6. whisper_pipeline/faster-whisper-main/.gitignore +0 -15
  7. whisper_pipeline/faster-whisper-main/CONTRIBUTING.md +0 -31
  8. whisper_pipeline/faster-whisper-main/LICENSE +0 -21
  9. whisper_pipeline/faster-whisper-main/MANIFEST.in +0 -4
  10. whisper_pipeline/faster-whisper-main/README.md +0 -319
  11. whisper_pipeline/faster-whisper-main/benchmark/benchmark.m4a +0 -3
  12. whisper_pipeline/faster-whisper-main/benchmark/memory_benchmark.py +0 -94
  13. whisper_pipeline/faster-whisper-main/benchmark/normalizer.json +0 -1742
  14. whisper_pipeline/faster-whisper-main/benchmark/requirements.benchmark.txt +0 -6
  15. whisper_pipeline/faster-whisper-main/benchmark/speed_benchmark.py +0 -31
  16. whisper_pipeline/faster-whisper-main/benchmark/utils.py +0 -39
  17. whisper_pipeline/faster-whisper-main/benchmark/wer_benchmark.py +0 -64
  18. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/__init__.py +0 -14
  19. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/__init__.py +0 -0
  20. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/pyannote_vad_model.bin +0 -3
  21. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/silero_vad.onnx +0 -3
  22. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/audio.py +0 -58
  23. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/feature_extractor.py +0 -114
  24. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/tokenizer.py +0 -314
  25. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/transcribe.py +0 -2170
  26. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/utils.py +0 -157
  27. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/vad.py +0 -596
  28. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/version.py +0 -3
  29. whisper_pipeline/faster-whisper-main/docker/Dockerfile +0 -6
  30. whisper_pipeline/faster-whisper-main/docker/infer.py +0 -7
  31. whisper_pipeline/faster-whisper-main/docker/jfk.flac +0 -3
  32. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/PKG-INFO +0 -347
  33. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/SOURCES.txt +0 -25
  34. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/dependency_links.txt +0 -1
  35. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/requires.txt +0 -17
  36. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/top_level.txt +0 -1
  37. whisper_pipeline/faster-whisper-main/faster_whisper/__init__.py +0 -14
  38. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/__init__.cpython-310.pyc +0 -0
  39. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/audio.cpython-310.pyc +0 -0
  40. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/feature_extractor.cpython-310.pyc +0 -0
  41. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/tokenizer.cpython-310.pyc +0 -0
  42. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/transcribe.cpython-310.pyc +0 -0
  43. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/utils.cpython-310.pyc +0 -0
  44. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/vad.cpython-310.pyc +0 -0
  45. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/version.cpython-310.pyc +0 -0
  46. whisper_pipeline/faster-whisper-main/faster_whisper/assets/__init__.py +0 -0
  47. whisper_pipeline/faster-whisper-main/faster_whisper/assets/pyannote_vad_model.bin +0 -3
  48. whisper_pipeline/faster-whisper-main/faster_whisper/assets/silero_vad.onnx +0 -3
  49. whisper_pipeline/faster-whisper-main/faster_whisper/audio.py +0 -58
  50. whisper_pipeline/faster-whisper-main/faster_whisper/feature_extractor.py +0 -114
whisper_pipeline/HA1.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:87fd3e947f85de5aeeae4d2f34a4774370541acf92e0f3317686e3c70572aa6a
- size 1242438
 
whisper_pipeline/api.py DELETED
@@ -1,42 +0,0 @@
- from fastapi import FastAPI, UploadFile, File
- from fastapi.responses import JSONResponse
- from pathlib import Path
- import os
- from gector import GecBERTModel
- from faster_whisper import WhisperModel, BatchedInferencePipeline
- from transformers.models.whisper.english_normalizer import BasicTextNormalizer
- from text_processing.inverse_normalize import InverseNormalizer
- import shutil
- import uvicorn
-
- # Initialize the FastAPI app
- app = FastAPI()
-
- # Initialize models and normalizer
- current_dir = Path(__file__).parent.as_posix()
- inverse_normalizer = InverseNormalizer('vi')
- whisper_model = WhisperModel("pho_distill_q8", device="auto", compute_type="auto")
- batched_model = BatchedInferencePipeline(model=whisper_model, use_vad_model=True, chunk_length=15)
- gector_model = GecBERTModel(
-     vocab_path=os.path.join(current_dir, "gector/vocabulary"),
-     model_paths=[os.path.join(current_dir, "gector/Model_GECTOR")],
-     split_chunk=True
- )
- normalizer = BasicTextNormalizer()
-
- @app.post("/transcriptions")
- async def transcribe_audio(file: UploadFile = File(...)):
-     # Save the uploaded file temporarily
-     temp_file_path = Path(f"temp_{file.filename}")
-     with open(temp_file_path, "wb") as buffer:
-         shutil.copyfileobj(file.file, buffer)
-     segments, info = batched_model.transcribe(str(temp_file_path), language="vi", batch_size=32)
-     os.remove(temp_file_path)
-     transcriptions = [segment.text for segment in segments]
-     normalized_transcriptions = [inverse_normalizer.inverse_normalize(normalizer(text)) for text in transcriptions]
-     corrected_texts = gector_model(normalized_transcriptions)
-     return JSONResponse({"text": ' '.join(corrected_texts)})
-
-
- if __name__ == "__main__":
-     uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
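For context, the endpoint deleted above accepted a multipart audio upload on `/transcriptions` and returned JSON of the form `{"text": ...}`. A minimal client sketch (assuming the server is running locally on port 8000, as in the `uvicorn.run` call; the helper name is illustrative) could look like:

```python
# Hypothetical client for the /transcriptions endpoint defined in api.py above.
# Assumes the FastAPI app is reachable at http://localhost:8000 (see uvicorn.run).
import requests

def transcribe(path: str, url: str = "http://localhost:8000/transcriptions") -> str:
    # The endpoint reads a multipart upload from the "file" field
    # and responds with JSON of the form {"text": "..."}.
    with open(path, "rb") as audio:
        response = requests.post(url, files={"file": audio})
    response.raise_for_status()
    return response.json()["text"]

if __name__ == "__main__":
    print(transcribe("HA1.wav"))
```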
 
whisper_pipeline/check.py DELETED
@@ -1,6 +0,0 @@
- from text_processing.inverse_normalize import InverseNormalizer
- import time
- normalizer = InverseNormalizer('vi')
- start = time.time()
- print(normalizer.inverse_normalize("mười hai ki lô gram"))
- print(time.time()- start)
 
whisper_pipeline/dockerfile DELETED
@@ -1,51 +0,0 @@
- # Use Python 3.11-slim-bookworm as base
- FROM python:3.11-slim-bookworm AS base
-
- # Use args
- ARG USE_CUDA
- ARG USE_CUDA_VER
-
- ## Basis ##
- ENV ENV=prod \
-     PORT=5056 \
-     # pass build args to the build
-     USE_CUDA_DOCKER=${USE_CUDA} \
-     USE_CUDA_DOCKER_VER=${USE_CUDA_VER}
-
- # Install GCC and build tools
- RUN apt-get update && \
-     apt-get install -y gcc build-essential curl git pkg-config libicu-dev && \
-     apt-get clean && \
-     rm -rf /var/lib/apt/lists/*
-
- # Set working directory
- WORKDIR /app
-
- # Install the necessary dependencies from the requirements.txt file
- COPY ./requirements.txt .
- RUN pip3 install uv && \
-     if [ "$USE_CUDA" = "true" ]; then \
-     pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/$USE_CUDA_DOCKER_VER --no-cache-dir; \
-     else \
-     pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir; \
-     fi
- # Copy faster-whisper-main folder (which includes the setup file) and install
- COPY ./faster-whisper-main ./faster-whisper-main
- RUN pip3 install ./faster-whisper-main
-
- RUN uv pip install --system -r requirements.txt --no-cache-dir
-
-
-
- # Copy the remaining application code
- COPY . .
-
- # Expose the API port
- EXPOSE 5056
-
- # Set the environment variables
- ENV HOST="0.0.0.0"
- ENV PORT="5056"
-
- # Set entrypoint to run the FastAPI server
- ENTRYPOINT [ "bash", "start.sh" ]
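The Dockerfile deleted above builds either a CPU or a CUDA image depending on the `USE_CUDA` build argument and serves the API on port 5056. As a rough sketch only (the image tag, build-arg values, and directory path are illustrative assumptions, not part of the deleted files), it could be built and started from Python with the Docker SDK:

```python
# Hypothetical build-and-run sketch using the Docker SDK for Python (docker-py).
# Build args and the published port mirror the Dockerfile above; the tag is arbitrary.
import docker

client = docker.from_env()

# Build the image from the whisper_pipeline directory (ARG USE_CUDA / USE_CUDA_VER).
image, _build_logs = client.images.build(
    path="whisper_pipeline",
    tag="whisper-pipeline:cpu",
    buildargs={"USE_CUDA": "false", "USE_CUDA_VER": "cpu"},
)

# Start the container and publish the API port declared by EXPOSE 5056.
container = client.containers.run(
    "whisper-pipeline:cpu", detach=True, ports={"5056/tcp": 5056}
)
print(container.short_id)
```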
 
whisper_pipeline/faster-whisper-main/.github/workflows/ci.yml DELETED
@@ -1,90 +0,0 @@
- name: CI
-
- on:
-   push:
-     branches:
-       - master
-     tags:
-       - v*
-   pull_request:
-     branches:
-       - master
-
- jobs:
-   check-code-format:
-     runs-on: ubuntu-latest
-
-     steps:
-       - uses: actions/checkout@v3
-
-       - name: Set up Python 3.8
-         uses: actions/setup-python@v4
-         with:
-           python-version: 3.8
-
-       - name: Install module
-         run: |
-           pip install wheel
-           pip install -e .[dev]
-
-       - name: Check code format with Black
-         run: |
-           black --check .
-
-       - name: Check imports order with isort
-         run: |
-           isort --check-only .
-
-       - name: Check code style with Flake8
-         if: ${{ always() }}
-         run: |
-           flake8 .
-
-
-   run-tests:
-     runs-on: ubuntu-latest
-
-     steps:
-       - uses: actions/checkout@v3
-
-       - name: Set up Python 3.8
-         uses: actions/setup-python@v4
-         with:
-           python-version: 3.8
-
-       - name: Install module
-         run: |
-           pip install wheel
-           pip install -e .[dev]
-
-       - name: Run pytest
-         run: |
-           pytest -v tests/
-
-
-   build-and-push-package:
-     runs-on: ubuntu-latest
-     needs: [check-code-format, run-tests]
-
-     steps:
-       - uses: actions/checkout@v3
-
-       - name: Set up Python 3.8
-         uses: actions/setup-python@v4
-         with:
-           python-version: 3.8
-
-       - name: Install dependencies
-         run: |
-           pip install wheel
-
-       - name: Build package
-         run: |
-           python3 setup.py sdist bdist_wheel
-
-       - name: Push package on PyPI
-         if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
-         uses: pypa/gh-action-pypi-publish@release/v1
-         with:
-           user: __token__
-           password: ${{ secrets.PYPI_API_TOKEN }}
 
whisper_pipeline/faster-whisper-main/.gitignore DELETED
@@ -1,15 +0,0 @@
- # Byte-compiled / Optimized / DLL Files
- *.pyc
- *.pyo
- *.pyd
- __pycache__/
-
- # Distribution / Packaging
- venv/
-
- # Unit Test
- .pytest_cache/
-
- # Ignore IDE, Editor Files
- .idea/
- .vscode/
 
whisper_pipeline/faster-whisper-main/CONTRIBUTING.md DELETED
@@ -1,31 +0,0 @@
- # Contributing to faster-whisper
-
- Contributions are welcome! Here are some pointers to help you install the library for development and validate your changes before submitting a pull request.
-
- ## Install the library for development
-
- We recommend installing the module in editable mode with the `dev` extra requirements:
-
- ```bash
- git clone https://github.com/SYSTRAN/faster-whisper.git
- cd faster-whisper/
- pip install -e .[dev]
- ```
-
- ## Validate the changes before creating a pull request
-
- 1. Make sure the existing tests are still passing (and consider adding new tests as well!):
-
- ```bash
- pytest tests/
- ```
-
- 2. Reformat and validate the code with the following tools:
-
- ```bash
- black .
- isort .
- flake8 .
- ```
-
- These steps are also run automatically in the CI when you open the pull request.
 
whisper_pipeline/faster-whisper-main/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2023 SYSTRAN
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
 
whisper_pipeline/faster-whisper-main/MANIFEST.in DELETED
@@ -1,4 +0,0 @@
- include faster_whisper/assets/silero_vad.onnx
- include requirements.txt
- include requirements.conversion.txt
- include faster_whisper/assets/pyannote_vad_model.bin
 
whisper_pipeline/faster-whisper-main/README.md DELETED
@@ -1,319 +0,0 @@
1
- [![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)
2
-
3
- # Faster Whisper transcription with CTranslate2
4
-
5
- **faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.
6
-
7
- This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
8
-
9
- ## Benchmark
10
-
11
- ### Whisper
12
-
13
- For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:
14
-
15
- * [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
16
- * [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
17
- * [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
18
-
19
- ### Large-v2 model on GPU
20
-
21
- | Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
22
- | --- | --- | --- | --- | --- | --- |
23
- | openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
24
- | faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
25
- | faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |
26
-
27
- *Executed with CUDA 11.7.1 on a NVIDIA Tesla V100S.*
28
-
29
- ### Small model on CPU
30
-
31
- | Implementation | Precision | Beam size | Time | Max. memory |
32
- | --- | --- | --- | --- | --- |
33
- | openai/whisper | fp32 | 5 | 10m31s | 3101MB |
34
- | whisper.cpp | fp32 | 5 | 17m42s | 1581MB |
35
- | whisper.cpp | fp16 | 5 | 12m39s | 873MB |
36
- | faster-whisper | fp32 | 5 | 2m44s | 1675MB |
37
- | faster-whisper | int8 | 5 | 2m04s | 995MB |
38
-
39
- *Executed with 8 threads on a Intel(R) Xeon(R) Gold 6226R.*
40
-
41
-
42
- ### Distil-whisper
43
-
44
- | Implementation | Precision | Beam size | Time | Gigaspeech WER |
45
- | --- | --- | --- | --- | --- |
46
- | distil-whisper/distil-large-v2 | fp16 | 4 |- | 10.36 |
47
- | [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 |
48
- | distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 |
49
- | [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 |
50
-
51
- *Executed with CUDA 11.4 on a NVIDIA 3090.*
52
-
53
- <details>
54
- <summary>testing details (click to expand)</summary>
55
-
56
- For `distil-whisper/distil-large-v2`, the WER is tested with code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). for `faster-distil-whisper`, the WER is tested with setting:
57
- ```python
58
- from faster_whisper import WhisperModel
59
-
60
- model_size = "distil-large-v2"
61
- # model_size = "distil-medium.en"
62
- # Run on GPU with FP16
63
- model = WhisperModel(model_size, device="cuda", compute_type="float16")
64
- segments, info = model.transcribe("audio.mp3", beam_size=5, language="en")
65
- ```
66
- </details>
67
-
68
- ## Requirements
69
-
70
- * Python 3.8 or greater
71
-
72
-
73
- ### GPU
74
-
75
- GPU execution requires the following NVIDIA libraries to be installed:
76
-
77
- * [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas)
78
- * [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn)
79
-
80
- **Note**: Latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (This can be done with `pip install --force-reinstall ctranslate2==3.24.0` or specifying the version in a `requirements.txt`).
81
-
82
- There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
83
-
84
- <details>
85
- <summary>Other installation methods (click to expand)</summary>
86
-
87
-
88
- **Note:** For all these methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below.
89
-
90
- #### Use Docker
91
-
92
- The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`.
93
-
94
- #### Install with `pip` (Linux only)
95
-
96
- On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python.
97
-
98
- ```bash
99
- pip install nvidia-cublas-cu12 nvidia-cudnn-cu12
100
-
101
- export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
102
- ```
103
-
104
- **Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8.
105
-
106
- #### Download the libraries from Purfview's repository (Windows & Linux)
107
-
108
- Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
109
-
110
- </details>
111
-
112
- ## Installation
113
-
114
- The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/):
115
-
116
- ```bash
117
- pip install faster-whisper
118
- ```
119
-
120
- <details>
121
- <summary>Other installation methods (click to expand)</summary>
122
-
123
- ### Install the master branch
124
-
125
- ```bash
126
- pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz"
127
- ```
128
-
129
- ### Install a specific commit
130
-
131
- ```bash
132
- pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz"
133
- ```
134
-
135
- </details>
136
-
137
- ## Usage
138
-
139
- ### Faster-whisper
140
-
141
- ```python
142
- from faster_whisper import WhisperModel
143
-
144
- model_size = "large-v3"
145
-
146
- # Run on GPU with FP16
147
- model = WhisperModel(model_size, device="cuda", compute_type="float16")
148
-
149
- # or run on GPU with INT8
150
- # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
151
- # or run on CPU with INT8
152
- # model = WhisperModel(model_size, device="cpu", compute_type="int8")
153
-
154
- segments, info = model.transcribe("audio.mp3", beam_size=5)
155
-
156
- print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
157
-
158
- for segment in segments:
159
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
160
- ```
161
-
162
- **Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop:
163
-
164
- ```python
165
- segments, _ = model.transcribe("audio.mp3")
166
- segments = list(segments) # The transcription will actually run here.
167
- ```
168
-
169
- ### multi-segment language detection
170
-
171
- To directly use the model for improved language detection, the following code snippet can be used:
172
-
173
- ```python
174
- from faster_whisper import WhisperModel
175
- model = WhisperModel("medium", device="cuda", compute_type="float16")
176
- language_info = model.detect_language_multi_segment("audio.mp3")
177
- ```
178
-
179
- ### Batched faster-whisper
180
-
181
-
182
- The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX) licensed under the BSD-2 Clause license and integrates its VAD model to this library. We modify this implementation and also replaced the feature extraction with a faster torch-based implementation. Batched version improves the speed upto 10-12x compared to openAI implementation and 3-4x compared to the sequential faster_whisper version. It works by transcribing semantically meaningful audio chunks as batches leading to faster inference.
183
-
184
- The following code snippet illustrates how to run inference with batched version on an example audio file. Please also refer to the test scripts of batched faster whisper.
185
-
186
- ```python
187
- from faster_whisper import WhisperModel, BatchedInferencePipeline
188
-
189
- model = WhisperModel("medium", device="cuda", compute_type="float16")
190
- batched_model = BatchedInferencePipeline(model=model)
191
- segments, info = batched_model.transcribe("audio.mp3", batch_size=16)
192
-
193
- for segment in segments:
194
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
195
- ```
196
-
197
- ### Faster Distil-Whisper
198
-
199
- The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3)
200
- checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet
201
- demonstrates how to run inference with distil-large-v3 on a specified audio file:
202
-
203
- ```python
204
- from faster_whisper import WhisperModel
205
-
206
- model_size = "distil-large-v3"
207
-
208
- model = WhisperModel(model_size, device="cuda", compute_type="float16")
209
- segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False)
210
-
211
- for segment in segments:
212
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
213
- ```
214
-
215
- For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3).
216
-
217
- ### Word-level timestamps
218
-
219
- ```python
220
- segments, _ = model.transcribe("audio.mp3", word_timestamps=True)
221
-
222
- for segment in segments:
223
- for word in segment.words:
224
- print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
225
- ```
226
-
227
- ### VAD filter
228
-
229
- The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech:
230
-
231
- ```python
232
- segments, _ = model.transcribe("audio.mp3", vad_filter=True)
233
- ```
234
-
235
- The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`:
236
-
237
- ```python
238
- segments, _ = model.transcribe(
239
- "audio.mp3",
240
- vad_filter=True,
241
- vad_parameters=dict(min_silence_duration_ms=500),
242
- )
243
- ```
244
-
245
- ### Logging
246
-
247
- The library logging level can be configured like this:
248
-
249
- ```python
250
- import logging
251
-
252
- logging.basicConfig()
253
- logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
254
- ```
255
-
256
- ### Going further
257
-
258
- See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation.
259
-
260
- ## Community integrations
261
-
262
- Here is a non exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
263
-
264
-
265
- * [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, supports streaming, and live transcription.
266
- * [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment
267
- * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
268
- * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
269
- * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS.
270
- * [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines.
271
- * [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT.
272
- * [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper, it can export word level transcript and the exported transcript then can be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor)
273
- * [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
274
- * [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
275
- * [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
276
- * [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.
277
-
278
- ## Model conversion
279
-
280
- When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
281
-
282
- We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
283
-
284
- For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
285
-
286
- ```bash
287
- pip install transformers[torch]>=4.23
288
-
289
- ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2
290
- --copy_files tokenizer.json preprocessor_config.json --quantization float16
291
- ```
292
-
293
- * The option `--model` accepts a model name on the Hub or a path to a model directory.
294
- * If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
295
-
296
- Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
297
-
298
- ### Load a converted model
299
-
300
- 1. Directly load the model from a local directory:
301
- ```python
302
- model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
303
- ```
304
-
305
- 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
306
- ```python
307
- model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
308
- ```
309
-
310
- ## Comparing performance against other implementations
311
-
312
- If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
313
-
314
- * Verify that the same transcription options are used, especially the same beam size. For example in openai/whisper, `model.transcribe` uses a default beam size of 1 but here we use a default beam size of 5.
315
- * When running on CPU, make sure to set the same number of threads. Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script:
316
-
317
- ```bash
318
- OMP_NUM_THREADS=4 python3 my_script.py
319
- ```
 
whisper_pipeline/faster-whisper-main/benchmark/benchmark.m4a DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:5dedec4f587a7940cfab93ff36e5014f155f80e10b7935f67d9eee8761663c34
- size 12935433
 
whisper_pipeline/faster-whisper-main/benchmark/memory_benchmark.py DELETED
@@ -1,94 +0,0 @@
- import argparse
- import time
-
- from typing import Callable
-
- import py3nvml.py3nvml as nvml
-
- from memory_profiler import memory_usage
- from utils import MyThread, get_logger, inference
-
- logger = get_logger("faster-whisper")
- parser = argparse.ArgumentParser(description="Memory benchmark")
- parser.add_argument(
-     "--gpu_memory", action="store_true", help="Measure GPU memory usage"
- )
- parser.add_argument("--device-index", type=int, default=0, help="GPU device index")
- parser.add_argument(
-     "--interval",
-     type=float,
-     default=0.5,
-     help="Interval at which measurements are collected",
- )
- args = parser.parse_args()
- device_idx = args.device_index
- interval = args.interval
-
-
- def measure_memory(func: Callable[[], None]):
-     if args.gpu_memory:
-         logger.info(
-             "Measuring maximum GPU memory usage on GPU device."
-             " Make sure to not have additional processes running on the same GPU."
-         )
-         # init nvml
-         nvml.nvmlInit()
-         handle = nvml.nvmlDeviceGetHandleByIndex(device_idx)
-         gpu_name = nvml.nvmlDeviceGetName(handle)
-         gpu_memory_limit = nvml.nvmlDeviceGetMemoryInfo(handle).total >> 20
-         gpu_power_limit = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0
-         info = {"gpu_memory_usage": [], "gpu_power_usage": []}
-
-         def _get_gpu_info():
-             while True:
-                 info["gpu_memory_usage"].append(
-                     nvml.nvmlDeviceGetMemoryInfo(handle).used >> 20
-                 )
-                 info["gpu_power_usage"].append(
-                     nvml.nvmlDeviceGetPowerUsage(handle) / 1000
-                 )
-                 time.sleep(interval)
-
-                 if stop:
-                     break
-
-             return info
-
-         stop = False
-         thread = MyThread(_get_gpu_info, params=())
-         thread.start()
-         func()
-         stop = True
-         thread.join()
-         result = thread.get_result()
-
-         # shutdown nvml
-         nvml.nvmlShutdown()
-         max_memory_usage = max(result["gpu_memory_usage"])
-         max_power_usage = max(result["gpu_power_usage"])
-         print("GPU name: %s" % gpu_name)
-         print("GPU device index: %s" % device_idx)
-         print(
-             "Maximum GPU memory usage: %dMiB / %dMiB (%.2f%%)"
-             % (
-                 max_memory_usage,
-                 gpu_memory_limit,
-                 (max_memory_usage / gpu_memory_limit) * 100,
-             )
-         )
-         print(
-             "Maximum GPU power usage: %dW / %dW (%.2f%%)"
-             % (
-                 max_power_usage,
-                 gpu_power_limit,
-                 (max_power_usage / gpu_power_limit) * 100,
-             )
-         )
-     else:
-         logger.info("Measuring maximum increase of memory usage.")
-         max_usage = memory_usage(func, max_usage=True, interval=interval)
-         print("Maximum increase of RAM memory usage: %d MiB" % max_usage)
-
-
- if __name__ == "__main__":
-     measure_memory(inference)
 
whisper_pipeline/faster-whisper-main/benchmark/normalizer.json DELETED
@@ -1,1742 +0,0 @@
1
- {
2
- "accessorise": "accessorize",
3
- "accessorised": "accessorized",
4
- "accessorises": "accessorizes",
5
- "accessorising": "accessorizing",
6
- "acclimatisation": "acclimatization",
7
- "acclimatise": "acclimatize",
8
- "acclimatised": "acclimatized",
9
- "acclimatises": "acclimatizes",
10
- "acclimatising": "acclimatizing",
11
- "accoutrements": "accouterments",
12
- "aeon": "eon",
13
- "aeons": "eons",
14
- "aerogramme": "aerogram",
15
- "aerogrammes": "aerograms",
16
- "aeroplane": "airplane",
17
- "aeroplanes": "airplanes",
18
- "aesthete": "esthete",
19
- "aesthetes": "esthetes",
20
- "aesthetic": "esthetic",
21
- "aesthetically": "esthetically",
22
- "aesthetics": "esthetics",
23
- "aetiology": "etiology",
24
- "ageing": "aging",
25
- "aggrandisement": "aggrandizement",
26
- "agonise": "agonize",
27
- "agonised": "agonized",
28
- "agonises": "agonizes",
29
- "agonising": "agonizing",
30
- "agonisingly": "agonizingly",
31
- "almanack": "almanac",
32
- "almanacks": "almanacs",
33
- "aluminium": "aluminum",
34
- "amortisable": "amortizable",
35
- "amortisation": "amortization",
36
- "amortisations": "amortizations",
37
- "amortise": "amortize",
38
- "amortised": "amortized",
39
- "amortises": "amortizes",
40
- "amortising": "amortizing",
41
- "amphitheatre": "amphitheater",
42
- "amphitheatres": "amphitheaters",
43
- "anaemia": "anemia",
44
- "anaemic": "anemic",
45
- "anaesthesia": "anesthesia",
46
- "anaesthetic": "anesthetic",
47
- "anaesthetics": "anesthetics",
48
- "anaesthetise": "anesthetize",
49
- "anaesthetised": "anesthetized",
50
- "anaesthetises": "anesthetizes",
51
- "anaesthetising": "anesthetizing",
52
- "anaesthetist": "anesthetist",
53
- "anaesthetists": "anesthetists",
54
- "anaesthetize": "anesthetize",
55
- "anaesthetized": "anesthetized",
56
- "anaesthetizes": "anesthetizes",
57
- "anaesthetizing": "anesthetizing",
58
- "analogue": "analog",
59
- "analogues": "analogs",
60
- "analyse": "analyze",
61
- "analysed": "analyzed",
62
- "analyses": "analyzes",
63
- "analysing": "analyzing",
64
- "anglicise": "anglicize",
65
- "anglicised": "anglicized",
66
- "anglicises": "anglicizes",
67
- "anglicising": "anglicizing",
68
- "annualised": "annualized",
69
- "antagonise": "antagonize",
70
- "antagonised": "antagonized",
71
- "antagonises": "antagonizes",
72
- "antagonising": "antagonizing",
73
- "apologise": "apologize",
74
- "apologised": "apologized",
75
- "apologises": "apologizes",
76
- "apologising": "apologizing",
77
- "appal": "appall",
78
- "appals": "appalls",
79
- "appetiser": "appetizer",
80
- "appetisers": "appetizers",
81
- "appetising": "appetizing",
82
- "appetisingly": "appetizingly",
83
- "arbour": "arbor",
84
- "arbours": "arbors",
85
- "archaeologically": "archeologically",
86
- "archaeologist": "archeologist",
87
- "archaeologists": "archeologists",
88
- "archaeology": "archeology</span>",
89
- "archeological": "archaeological",
90
- "ardour": "ardor",
91
- "armour": "armor",
92
- "armoured": "armored",
93
- "armourer": "armorer",
94
- "armourers": "armorers",
95
- "armouries": "armories",
96
- "armoury": "armory",
97
- "artefact": "artifact",
98
- "artefacts": "artifacts",
99
- "authorise": "authorize",
100
- "authorised": "authorized",
101
- "authorises": "authorizes",
102
- "authorising": "authorizing",
103
- "axe": "ax",
104
- "backpedalled": "backpedaled",
105
- "backpedalling": "backpedaling",
106
- "bannister": "banister",
107
- "bannisters": "banisters",
108
- "baptise": "baptize",
109
- "baptised": "baptized",
110
- "baptises": "baptizes",
111
- "baptising": "baptizing",
112
- "bastardise": "bastardize",
113
- "bastardised": "bastardized",
114
- "bastardises": "bastardizes",
115
- "bastardising": "bastardizing",
116
- "battleax": "battleaxe",
117
- "baulk": "balk",
118
- "baulked": "balked",
119
- "baulking": "balking",
120
- "baulks": "balks",
121
- "bedevilled": "bedeviled",
122
- "bedevilling": "bedeviling",
123
- "behaviour": "behavior",
124
- "behavioural": "behavioral",
125
- "behaviourism": "behaviorism",
126
- "behaviourist": "behaviorist",
127
- "behaviourists": "behaviorists",
128
- "behaviours": "behaviors",
129
- "behove": "behoove",
130
- "behoved": "behooved",
131
- "behoves": "behooves",
132
- "bejewelled": "bejeweled",
133
- "belabour": "belabor",
134
- "belaboured": "belabored",
135
- "belabouring": "belaboring",
136
- "belabours": "belabors",
137
- "bevelled": "beveled",
138
- "bevvies": "bevies",
139
- "bevvy": "bevy",
140
- "biassed": "biased",
141
- "biassing": "biasing",
142
- "bingeing": "binging",
143
- "bougainvillaea": "bougainvillea",
144
- "bougainvillaeas": "bougainvilleas",
145
- "bowdlerise": "bowdlerize",
146
- "bowdlerised": "bowdlerized",
147
- "bowdlerises": "bowdlerizes",
148
- "bowdlerising": "bowdlerizing",
149
- "breathalyse": "breathalyze",
150
- "breathalysed": "breathalyzed",
151
- "breathalyser": "breathalyzer",
152
- "breathalysers": "breathalyzers",
153
- "breathalyses": "breathalyzes",
154
- "breathalysing": "breathalyzing",
155
- "brutalise": "brutalize",
156
- "brutalised": "brutalized",
157
- "brutalises": "brutalizes",
158
- "brutalising": "brutalizing",
159
- "busses": "buses",
160
- "bussing": "busing",
161
- "caesarean": "cesarean",
162
- "caesareans": "cesareans",
163
- "calibre": "caliber",
164
- "calibres": "calibers",
165
- "calliper": "caliper",
166
- "callipers": "calipers",
167
- "callisthenics": "calisthenics",
168
- "canalise": "canalize",
169
- "canalised": "canalized",
170
- "canalises": "canalizes",
171
- "canalising": "canalizing",
172
- "cancelation": "cancellation",
173
- "cancelations": "cancellations",
174
- "cancelled": "canceled",
175
- "cancelling": "canceling",
176
- "candour": "candor",
177
- "cannibalise": "cannibalize",
178
- "cannibalised": "cannibalized",
179
- "cannibalises": "cannibalizes",
180
- "cannibalising": "cannibalizing",
181
- "canonise": "canonize",
182
- "canonised": "canonized",
183
- "canonises": "canonizes",
184
- "canonising": "canonizing",
185
- "capitalise": "capitalize",
186
- "capitalised": "capitalized",
187
- "capitalises": "capitalizes",
188
- "capitalising": "capitalizing",
189
- "caramelise": "caramelize",
190
- "caramelised": "caramelized",
191
- "caramelises": "caramelizes",
192
- "caramelising": "caramelizing",
193
- "carbonise": "carbonize",
194
- "carbonised": "carbonized",
195
- "carbonises": "carbonizes",
196
- "carbonising": "carbonizing",
197
- "carolled": "caroled",
198
- "carolling": "caroling",
199
- "catalogue": "catalog",
200
- "catalogued": "cataloged",
201
- "catalogues": "catalogs",
202
- "cataloguing": "cataloging",
203
- "catalyse": "catalyze",
204
- "catalysed": "catalyzed",
205
- "catalyses": "catalyzes",
206
- "catalysing": "catalyzing",
207
- "categorise": "categorize",
208
- "categorised": "categorized",
209
- "categorises": "categorizes",
210
- "categorising": "categorizing",
211
- "cauterise": "cauterize",
212
- "cauterised": "cauterized",
213
- "cauterises": "cauterizes",
214
- "cauterising": "cauterizing",
215
- "cavilled": "caviled",
216
- "cavilling": "caviling",
217
- "centigramme": "centigram",
218
- "centigrammes": "centigrams",
219
- "centilitre": "centiliter",
220
- "centilitres": "centiliters",
221
- "centimetre": "centimeter",
222
- "centimetres": "centimeters",
223
- "centralise": "centralize",
224
- "centralised": "centralized",
225
- "centralises": "centralizes",
226
- "centralising": "centralizing",
227
- "centre": "center",
228
- "centred": "centered",
229
- "centrefold": "centerfold",
230
- "centrefolds": "centerfolds",
231
- "centrepiece": "centerpiece",
232
- "centrepieces": "centerpieces",
233
- "centres": "centers",
234
- "channelled": "channeled",
235
- "channelling": "channeling",
236
- "characterise": "characterize",
237
- "characterised": "characterized",
238
- "characterises": "characterizes",
239
- "characterising": "characterizing",
240
- "cheque": "check",
241
- "chequebook": "checkbook",
242
- "chequebooks": "checkbooks",
243
- "chequered": "checkered",
244
- "cheques": "checks",
245
- "chilli": "chili",
246
- "chimaera": "chimera",
247
- "chimaeras": "chimeras",
248
- "chiselled": "chiseled",
249
- "chiselling": "chiseling",
250
- "circularise": "circularize",
251
- "circularised": "circularized",
252
- "circularises": "circularizes",
253
- "circularising": "circularizing",
254
- "civilise": "civilize",
255
- "civilised": "civilized",
256
- "civilises": "civilizes",
257
- "civilising": "civilizing",
258
- "clamour": "clamor",
259
- "clamoured": "clamored",
260
- "clamouring": "clamoring",
261
- "clamours": "clamors",
262
- "clangour": "clangor",
263
- "clarinettist": "clarinetist",
264
- "clarinettists": "clarinetists",
265
- "collectivise": "collectivize",
266
- "collectivised": "collectivized",
267
- "collectivises": "collectivizes",
268
- "collectivising": "collectivizing",
269
- "colonisation": "colonization",
270
- "colonise": "colonize",
271
- "colonised": "colonized",
272
- "coloniser": "colonizer",
273
- "colonisers": "colonizers",
274
- "colonises": "colonizes",
275
- "colonising": "colonizing",
276
- "colour": "color",
277
- "colourant": "colorant",
278
- "colourants": "colorants",
279
- "coloured": "colored",
280
- "coloureds": "coloreds",
281
- "colourful": "colorful",
282
- "colourfully": "colorfully",
283
- "colouring": "coloring",
284
- "colourize": "colorize",
285
- "colourized": "colorized",
286
- "colourizes": "colorizes",
287
- "colourizing": "colorizing",
288
- "colourless": "colorless",
289
- "colours": "colors",
290
- "commercialise": "commercialize",
291
- "commercialised": "commercialized",
292
- "commercialises": "commercializes",
293
- "commercialising": "commercializing",
294
- "compartmentalise": "compartmentalize",
295
- "compartmentalised": "compartmentalized",
296
- "compartmentalises": "compartmentalizes",
297
- "compartmentalising": "compartmentalizing",
298
- "computerise": "computerize",
299
- "computerised": "computerized",
300
- "computerises": "computerizes",
301
- "computerising": "computerizing",
302
- "conceptualise": "conceptualize",
303
- "conceptualised": "conceptualized",
304
- "conceptualises": "conceptualizes",
305
- "conceptualising": "conceptualizing",
306
- "connexion": "connection",
307
- "connexions": "connections",
308
- "contextualise": "contextualize",
309
- "contextualised": "contextualized",
310
- "contextualises": "contextualizes",
311
- "contextualising": "contextualizing",
312
- "cosier": "cozier",
313
- "cosies": "cozies",
314
- "cosiest": "coziest",
315
- "cosily": "cozily",
316
- "cosiness": "coziness",
317
- "cosy": "cozy",
318
- "councillor": "councilor",
319
- "councillors": "councilors",
320
- "counselled": "counseled",
321
- "counselling": "counseling",
322
- "counsellor": "counselor",
323
- "counsellors": "counselors",
324
- "crenelated": "crenellated",
325
- "criminalise": "criminalize",
326
- "criminalised": "criminalized",
327
- "criminalises": "criminalizes",
328
- "criminalising": "criminalizing",
329
- "criticise": "criticize",
330
- "criticised": "criticized",
331
- "criticises": "criticizes",
332
- "criticising": "criticizing",
333
- "crueller": "crueler",
334
- "cruellest": "cruelest",
335
- "crystallisation": "crystallization",
336
- "crystallise": "crystallize",
337
- "crystallised": "crystallized",
338
- "crystallises": "crystallizes",
339
- "crystallising": "crystallizing",
340
- "cudgelled": "cudgeled",
341
- "cudgelling": "cudgeling",
342
- "customise": "customize",
343
- "customised": "customized",
344
- "customises": "customizes",
345
- "customising": "customizing",
346
- "cypher": "cipher",
347
- "cyphers": "ciphers",
348
- "decentralisation": "decentralization",
349
- "decentralise": "decentralize",
350
- "decentralised": "decentralized",
351
- "decentralises": "decentralizes",
352
- "decentralising": "decentralizing",
353
- "decriminalisation": "decriminalization",
354
- "decriminalise": "decriminalize",
355
- "decriminalised": "decriminalized",
356
- "decriminalises": "decriminalizes",
357
- "decriminalising": "decriminalizing",
358
- "defence": "defense",
359
- "defenceless": "defenseless",
360
- "defences": "defenses",
361
- "dehumanisation": "dehumanization",
362
- "dehumanise": "dehumanize",
363
- "dehumanised": "dehumanized",
364
- "dehumanises": "dehumanizes",
365
- "dehumanising": "dehumanizing",
366
- "demeanour": "demeanor",
367
- "demilitarisation": "demilitarization",
368
- "demilitarise": "demilitarize",
369
- "demilitarised": "demilitarized",
370
- "demilitarises": "demilitarizes",
371
- "demilitarising": "demilitarizing",
372
- "demobilisation": "demobilization",
373
- "demobilise": "demobilize",
374
- "demobilised": "demobilized",
375
- "demobilises": "demobilizes",
376
- "demobilising": "demobilizing",
377
- "democratisation": "democratization",
378
- "democratise": "democratize",
379
- "democratised": "democratized",
380
- "democratises": "democratizes",
381
- "democratising": "democratizing",
382
- "demonise": "demonize",
383
- "demonised": "demonized",
384
- "demonises": "demonizes",
385
- "demonising": "demonizing",
386
- "demoralisation": "demoralization",
387
- "demoralise": "demoralize",
388
- "demoralised": "demoralized",
389
- "demoralises": "demoralizes",
390
- "demoralising": "demoralizing",
391
- "denationalisation": "denationalization",
392
- "denationalise": "denationalize",
393
- "denationalised": "denationalized",
394
- "denationalises": "denationalizes",
395
- "denationalising": "denationalizing",
396
- "deodorise": "deodorize",
397
- "deodorised": "deodorized",
398
- "deodorises": "deodorizes",
399
- "deodorising": "deodorizing",
400
- "depersonalise": "depersonalize",
401
- "depersonalised": "depersonalized",
402
- "depersonalises": "depersonalizes",
403
- "depersonalising": "depersonalizing",
404
- "deputise": "deputize",
405
- "deputised": "deputized",
406
- "deputises": "deputizes",
407
- "deputising": "deputizing",
408
- "desensitisation": "desensitization",
409
- "desensitise": "desensitize",
410
- "desensitised": "desensitized",
411
- "desensitises": "desensitizes",
412
- "desensitising": "desensitizing",
413
- "destabilisation": "destabilization",
414
- "destabilise": "destabilize",
415
- "destabilised": "destabilized",
416
- "destabilises": "destabilizes",
417
- "destabilising": "destabilizing",
418
- "dialled": "dialed",
419
- "dialling": "dialing",
420
- "dialogue": "dialog",
421
- "dialogues": "dialogs",
422
- "diarrhoea": "diarrhea",
423
- "digitise": "digitize",
424
- "digitised": "digitized",
425
- "digitises": "digitizes",
426
- "digitising": "digitizing",
427
- "disc": "disk",
428
- "discolour": "discolor",
429
- "discoloured": "discolored",
430
- "discolouring": "discoloring",
431
- "discolours": "discolors",
432
- "discs": "disks",
433
- "disembowelled": "disemboweled",
434
- "disembowelling": "disemboweling",
435
- "disfavour": "disfavor",
436
- "dishevelled": "disheveled",
437
- "dishonour": "dishonor",
438
- "dishonourable": "dishonorable",
439
- "dishonourably": "dishonorably",
440
- "dishonoured": "dishonored",
441
- "dishonouring": "dishonoring",
442
- "dishonours": "dishonors",
443
- "disorganisation": "disorganization",
444
- "disorganised": "disorganized",
445
- "distil": "distill",
446
- "distils": "distills",
447
- "dramatisation": "dramatization",
448
- "dramatisations": "dramatizations",
449
- "dramatise": "dramatize",
450
- "dramatised": "dramatized",
451
- "dramatises": "dramatizes",
452
- "dramatising": "dramatizing",
453
- "draught": "draft",
454
- "draughtboard": "draftboard",
455
- "draughtboards": "draftboards",
456
- "draughtier": "draftier",
457
- "draughtiest": "draftiest",
458
- "draughts": "drafts",
459
- "draughtsman": "draftsman",
460
- "draughtsmanship": "draftsmanship",
461
- "draughtsmen": "draftsmen",
462
- "draughtswoman": "draftswoman",
463
- "draughtswomen": "draftswomen",
464
- "draughty": "drafty",
465
- "drivelled": "driveled",
466
- "drivelling": "driveling",
467
- "duelled": "dueled",
468
- "duelling": "dueling",
469
- "economise": "economize",
470
- "economised": "economized",
471
- "economises": "economizes",
472
- "economising": "economizing",
473
- "editorialise": "editorialize",
474
- "editorialised": "editorialized",
475
- "editorialises": "editorializes",
476
- "editorialising": "editorializing",
477
- "edoema": "edema",
478
- "empathise": "empathize",
479
- "empathised": "empathized",
480
- "empathises": "empathizes",
481
- "empathising": "empathizing",
482
- "emphasise": "emphasize",
483
- "emphasised": "emphasized",
484
- "emphasises": "emphasizes",
485
- "emphasising": "emphasizing",
486
- "enamelled": "enameled",
487
- "enamelling": "enameling",
488
- "enamoured": "enamored",
489
- "encyclopaedia": "encyclopedia",
490
- "encyclopaedias": "encyclopedias",
491
- "encyclopaedic": "encyclopedic",
492
- "endeavour": "endeavor",
493
- "endeavoured": "endeavored",
494
- "endeavouring": "endeavoring",
495
- "endeavours": "endeavors",
496
- "energise": "energize",
497
- "energised": "energized",
498
- "energises": "energizes",
499
- "energising": "energizing",
500
- "enrol": "enroll",
501
- "enrols": "enrolls",
502
- "enthral": "enthrall",
503
- "enthrals": "enthralls",
504
- "epaulette": "epaulet",
505
- "epaulettes": "epaulets",
506
- "epicentre": "epicenter",
507
- "epicentres": "epicenters",
508
- "epilogue": "epilog",
509
- "epilogues": "epilogs",
510
- "epitomise": "epitomize",
511
- "epitomised": "epitomized",
512
- "epitomises": "epitomizes",
513
- "epitomising": "epitomizing",
514
- "equalisation": "equalization",
515
- "equalise": "equalize",
516
- "equalised": "equalized",
517
- "equaliser": "equalizer",
518
- "equalisers": "equalizers",
519
- "equalises": "equalizes",
520
- "equalising": "equalizing",
521
- "eulogise": "eulogize",
522
- "eulogised": "eulogized",
523
- "eulogises": "eulogizes",
524
- "eulogising": "eulogizing",
525
- "evangelise": "evangelize",
526
- "evangelised": "evangelized",
527
- "evangelises": "evangelizes",
528
- "evangelising": "evangelizing",
529
- "exorcise": "exorcize",
530
- "exorcised": "exorcized",
531
- "exorcises": "exorcizes",
532
- "exorcising": "exorcizing",
533
- "extemporisation": "extemporization",
534
- "extemporise": "extemporize",
535
- "extemporised": "extemporized",
536
- "extemporises": "extemporizes",
537
- "extemporising": "extemporizing",
538
- "externalisation": "externalization",
539
- "externalisations": "externalizations",
540
- "externalise": "externalize",
541
- "externalised": "externalized",
542
- "externalises": "externalizes",
543
- "externalising": "externalizing",
544
- "factorise": "factorize",
545
- "factorised": "factorized",
546
- "factorises": "factorizes",
547
- "factorising": "factorizing",
548
- "faecal": "fecal",
549
- "faeces": "feces",
550
- "familiarisation": "familiarization",
551
- "familiarise": "familiarize",
552
- "familiarised": "familiarized",
553
- "familiarises": "familiarizes",
554
- "familiarising": "familiarizing",
555
- "fantasise": "fantasize",
556
- "fantasised": "fantasized",
557
- "fantasises": "fantasizes",
558
- "fantasising": "fantasizing",
559
- "favour": "favor",
560
- "favourable": "favorable",
561
- "favourably": "favorably",
562
- "favoured": "favored",
563
- "favouring": "favoring",
564
- "favourite": "favorite",
565
- "favourites": "favorites",
566
- "favouritism": "favoritism",
567
- "favours": "favors",
568
- "feminise": "feminize",
569
- "feminised": "feminized",
570
- "feminises": "feminizes",
571
- "feminising": "feminizing",
572
- "fertilisation": "fertilization",
573
- "fertilise": "fertilize",
574
- "fertilised": "fertilized",
575
- "fertiliser": "fertilizer",
576
- "fertilisers": "fertilizers",
577
- "fertilises": "fertilizes",
578
- "fertilising": "fertilizing",
579
- "fervour": "fervor",
580
- "fibre": "fiber",
581
- "fibreglass": "fiberglass",
582
- "fibres": "fibers",
583
- "fictionalisation": "fictionalization",
584
- "fictionalisations": "fictionalizations",
585
- "fictionalise": "fictionalize",
586
- "fictionalised": "fictionalized",
587
- "fictionalises": "fictionalizes",
588
- "fictionalising": "fictionalizing",
589
- "fillet": "filet",
590
- "filleted": "fileted",
591
- "filleting": "fileting",
592
- "fillets": "filets",
593
- "finalisation": "finalization",
594
- "finalise": "finalize",
595
- "finalised": "finalized",
596
- "finalises": "finalizes",
597
- "finalising": "finalizing",
598
- "flautist": "flutist",
599
- "flautists": "flutists",
600
- "flavour": "flavor",
601
- "flavoured": "flavored",
602
- "flavouring": "flavoring",
603
- "flavourings": "flavorings",
604
- "flavourless": "flavorless",
605
- "flavours": "flavors",
606
- "flavoursome": "flavorsome",
607
- "flyer / flier": "flier / flyer",
608
- "foetal": "fetal",
609
- "foetid": "fetid",
610
- "foetus": "fetus",
611
- "foetuses": "fetuses",
612
- "formalisation": "formalization",
613
- "formalise": "formalize",
614
- "formalised": "formalized",
615
- "formalises": "formalizes",
616
- "formalising": "formalizing",
617
- "fossilisation": "fossilization",
618
- "fossilise": "fossilize",
619
- "fossilised": "fossilized",
620
- "fossilises": "fossilizes",
621
- "fossilising": "fossilizing",
622
- "fraternisation": "fraternization",
623
- "fraternise": "fraternize",
624
- "fraternised": "fraternized",
625
- "fraternises": "fraternizes",
626
- "fraternising": "fraternizing",
627
- "fulfil": "fulfill",
628
- "fulfilment": "fulfillment",
629
- "fulfils": "fulfills",
630
- "funnelled": "funneled",
631
- "funnelling": "funneling",
632
- "gage": "gauge",
633
- "gaged": "gauged",
634
- "gages": "gauges",
635
- "gaging": "gauging",
636
- "galvanise": "galvanize",
637
- "galvanised": "galvanized",
638
- "galvanises": "galvanizes",
639
- "galvanising": "galvanizing",
640
- "gambolled": "gamboled",
641
- "gambolling": "gamboling",
642
- "gaol": "jail",
643
- "gaolbird": "jailbird",
644
- "gaolbirds": "jailbirds",
645
- "gaolbreak": "jailbreak",
646
- "gaolbreaks": "jailbreaks",
647
- "gaoled": "jailed",
648
- "gaoler": "jailer",
649
- "gaolers": "jailers",
650
- "gaoling": "jailing",
651
- "gaols": "jails",
652
- "gasses": "gases",
653
- "generalisation": "generalization",
654
- "generalisations": "generalizations",
655
- "generalise": "generalize",
656
- "generalised": "generalized",
657
- "generalises": "generalizes",
658
- "generalising": "generalizing",
659
- "ghettoise": "ghettoize",
660
- "ghettoised": "ghettoized",
661
- "ghettoises": "ghettoizes",
662
- "ghettoising": "ghettoizing",
663
- "gipsies": "gypsies",
664
- "glamor": "glamour",
665
- "glamorise": "glamorize",
666
- "glamorised": "glamorized",
667
- "glamorises": "glamorizes",
668
- "glamorising": "glamorizing",
669
- "globalisation": "globalization",
670
- "globalise": "globalize",
671
- "globalised": "globalized",
672
- "globalises": "globalizes",
673
- "globalising": "globalizing",
674
- "glueing": "gluing",
675
- "goitre": "goiter",
676
- "goitres": "goiters",
677
- "gonorrhoea": "gonorrhea",
678
- "gramme": "gram",
679
- "grammes": "grams",
680
- "gravelled": "graveled",
681
- "grey": "gray",
682
- "greyed": "grayed",
683
- "greying": "graying",
684
- "greyish": "grayish",
685
- "greyness": "grayness",
686
- "greys": "grays",
687
- "grovelled": "groveled",
688
- "grovelling": "groveling",
689
- "groyne": "groin",
690
- "groynes": "groins",
691
- "gruelling": "grueling",
692
- "gruellingly": "gruelingly",
693
- "gryphon": "griffin",
694
- "gryphons": "griffins",
695
- "gynaecological": "gynecological",
696
- "gynaecologist": "gynecologist",
697
- "gynaecologists": "gynecologists",
698
- "gynaecology": "gynecology",
699
- "haematological": "hematological",
700
- "haematologist": "hematologist",
701
- "haematologists": "hematologists",
702
- "haematology": "hematology",
703
- "haemoglobin": "hemoglobin",
704
- "haemophilia": "hemophilia",
705
- "haemophiliac": "hemophiliac",
706
- "haemophiliacs": "hemophiliacs",
707
- "haemorrhage": "hemorrhage",
708
- "haemorrhaged": "hemorrhaged",
709
- "haemorrhages": "hemorrhages",
710
- "haemorrhaging": "hemorrhaging",
711
- "haemorrhoids": "hemorrhoids",
712
- "harbour": "harbor",
713
- "harboured": "harbored",
714
- "harbouring": "harboring",
715
- "harbours": "harbors",
716
- "harmonisation": "harmonization",
717
- "harmonise": "harmonize",
718
- "harmonised": "harmonized",
719
- "harmonises": "harmonizes",
720
- "harmonising": "harmonizing",
721
- "homoeopath": "homeopath",
722
- "homoeopathic": "homeopathic",
723
- "homoeopaths": "homeopaths",
724
- "homoeopathy": "homeopathy",
725
- "homogenise": "homogenize",
726
- "homogenised": "homogenized",
727
- "homogenises": "homogenizes",
728
- "homogenising": "homogenizing",
729
- "honour": "honor",
730
- "honourable": "honorable",
731
- "honourably": "honorably",
732
- "honoured": "honored",
733
- "honouring": "honoring",
734
- "honours": "honors",
735
- "hospitalisation": "hospitalization",
736
- "hospitalise": "hospitalize",
737
- "hospitalised": "hospitalized",
738
- "hospitalises": "hospitalizes",
739
- "hospitalising": "hospitalizing",
740
- "humanise": "humanize",
741
- "humanised": "humanized",
742
- "humanises": "humanizes",
743
- "humanising": "humanizing",
744
- "humour": "humor",
745
- "humoured": "humored",
746
- "humouring": "humoring",
747
- "humourless": "humorless",
748
- "humours": "humors",
749
- "hybridise": "hybridize",
750
- "hybridised": "hybridized",
751
- "hybridises": "hybridizes",
752
- "hybridising": "hybridizing",
753
- "hypnotise": "hypnotize",
754
- "hypnotised": "hypnotized",
755
- "hypnotises": "hypnotizes",
756
- "hypnotising": "hypnotizing",
757
- "hypothesise": "hypothesize",
758
- "hypothesised": "hypothesized",
759
- "hypothesises": "hypothesizes",
760
- "hypothesising": "hypothesizing",
761
- "idealisation": "idealization",
762
- "idealise": "idealize",
763
- "idealised": "idealized",
764
- "idealises": "idealizes",
765
- "idealising": "idealizing",
766
- "idolise": "idolize",
767
- "idolised": "idolized",
768
- "idolises": "idolizes",
769
- "idolising": "idolizing",
770
- "immobilisation": "immobilization",
771
- "immobilise": "immobilize",
772
- "immobilised": "immobilized",
773
- "immobiliser": "immobilizer",
774
- "immobilisers": "immobilizers",
775
- "immobilises": "immobilizes",
776
- "immobilising": "immobilizing",
777
- "immortalise": "immortalize",
778
- "immortalised": "immortalized",
779
- "immortalises": "immortalizes",
780
- "immortalising": "immortalizing",
781
- "immunisation": "immunization",
782
- "immunise": "immunize",
783
- "immunised": "immunized",
784
- "immunises": "immunizes",
785
- "immunising": "immunizing",
786
- "impanelled": "impaneled",
787
- "impanelling": "impaneling",
788
- "imperilled": "imperiled",
789
- "imperilling": "imperiling",
790
- "individualise": "individualize",
791
- "individualised": "individualized",
792
- "individualises": "individualizes",
793
- "individualising": "individualizing",
794
- "industrialise": "industrialize",
795
- "industrialised": "industrialized",
796
- "industrialises": "industrializes",
797
- "industrialising": "industrializing",
798
- "inflexion": "inflection",
799
- "inflexions": "inflections",
800
- "initialise": "initialize",
801
- "initialised": "initialized",
802
- "initialises": "initializes",
803
- "initialising": "initializing",
804
- "initialled": "initialed",
805
- "initialling": "initialing",
806
- "instal": "install",
807
- "instalment": "installment",
808
- "instalments": "installments",
809
- "instals": "installs",
810
- "instil": "instill",
811
- "instils": "instills",
812
- "institutionalisation": "institutionalization",
813
- "institutionalise": "institutionalize",
814
- "institutionalised": "institutionalized",
815
- "institutionalises": "institutionalizes",
816
- "institutionalising": "institutionalizing",
817
- "intellectualise": "intellectualize",
818
- "intellectualised": "intellectualized",
819
- "intellectualises": "intellectualizes",
820
- "intellectualising": "intellectualizing",
821
- "internalisation": "internalization",
822
- "internalise": "internalize",
823
- "internalised": "internalized",
824
- "internalises": "internalizes",
825
- "internalising": "internalizing",
826
- "internationalisation": "internationalization",
827
- "internationalise": "internationalize",
828
- "internationalised": "internationalized",
829
- "internationalises": "internationalizes",
830
- "internationalising": "internationalizing",
831
- "ionisation": "ionization",
832
- "ionise": "ionize",
833
- "ionised": "ionized",
834
- "ioniser": "ionizer",
835
- "ionisers": "ionizers",
836
- "ionises": "ionizes",
837
- "ionising": "ionizing",
838
- "italicise": "italicize",
839
- "italicised": "italicized",
840
- "italicises": "italicizes",
841
- "italicising": "italicizing",
842
- "itemise": "itemize",
843
- "itemised": "itemized",
844
- "itemises": "itemizes",
845
- "itemising": "itemizing",
846
- "jeopardise": "jeopardize",
847
- "jeopardised": "jeopardized",
848
- "jeopardises": "jeopardizes",
849
- "jeopardising": "jeopardizing",
850
- "jewelled": "jeweled",
851
- "jeweller": "jeweler",
852
- "jewellers": "jewelers",
853
- "jewellery": "jewelry",
854
- "judgement": "judgment",
855
- "kilogramme": "kilogram",
856
- "kilogrammes": "kilograms",
857
- "kilometre": "kilometer",
858
- "kilometres": "kilometers",
859
- "labelled": "labeled",
860
- "labelling": "labeling",
861
- "labour": "labor",
862
- "laboured": "labored",
863
- "labourer": "laborer",
864
- "labourers": "laborers",
865
- "labouring": "laboring",
866
- "labours": "labors",
867
- "lacklustre": "lackluster",
868
- "legalisation": "legalization",
869
- "legalise": "legalize",
870
- "legalised": "legalized",
871
- "legalises": "legalizes",
872
- "legalising": "legalizing",
873
- "legitimise": "legitimize",
874
- "legitimised": "legitimized",
875
- "legitimises": "legitimizes",
876
- "legitimising": "legitimizing",
877
- "leukaemia": "leukemia",
878
- "levelled": "leveled",
879
- "leveller": "leveler",
880
- "levellers": "levelers",
881
- "levelling": "leveling",
882
- "libelled": "libeled",
883
- "libelling": "libeling",
884
- "libellous": "libelous",
885
- "liberalisation": "liberalization",
886
- "liberalise": "liberalize",
887
- "liberalised": "liberalized",
888
- "liberalises": "liberalizes",
889
- "liberalising": "liberalizing",
890
- "licence": "license",
891
- "licenced": "licensed",
892
- "licences": "licenses",
893
- "licencing": "licensing",
894
- "likeable": "likable",
895
- "lionisation": "lionization",
896
- "lionise": "lionize",
897
- "lionised": "lionized",
898
- "lionises": "lionizes",
899
- "lionising": "lionizing",
900
- "liquidise": "liquidize",
901
- "liquidised": "liquidized",
902
- "liquidiser": "liquidizer",
903
- "liquidisers": "liquidizers",
904
- "liquidises": "liquidizes",
905
- "liquidising": "liquidizing",
906
- "litre": "liter",
907
- "litres": "liters",
908
- "localise": "localize",
909
- "localised": "localized",
910
- "localises": "localizes",
911
- "localising": "localizing",
912
- "louvre": "louver",
913
- "louvred": "louvered",
914
- "louvres": "louvers",
915
- "lustre": "luster",
916
- "magnetise": "magnetize",
917
- "magnetised": "magnetized",
918
- "magnetises": "magnetizes",
919
- "magnetising": "magnetizing",
920
- "manoeuvrability": "maneuverability",
921
- "manoeuvrable": "maneuverable",
922
- "manoeuvre": "maneuver",
923
- "manoeuvred": "maneuvered",
924
- "manoeuvres": "maneuvers",
925
- "manoeuvring": "maneuvering",
926
- "manoeuvrings": "maneuverings",
927
- "marginalisation": "marginalization",
928
- "marginalise": "marginalize",
929
- "marginalised": "marginalized",
930
- "marginalises": "marginalizes",
931
- "marginalising": "marginalizing",
932
- "marshalled": "marshaled",
933
- "marshalling": "marshaling",
934
- "marvelled": "marveled",
935
- "marvelling": "marveling",
936
- "marvellous": "marvelous",
937
- "marvellously": "marvelously",
938
- "materialisation": "materialization",
939
- "materialise": "materialize",
940
- "materialised": "materialized",
941
- "materialises": "materializes",
942
- "materialising": "materializing",
943
- "maximisation": "maximization",
944
- "maximise": "maximize",
945
- "maximised": "maximized",
946
- "maximises": "maximizes",
947
- "maximising": "maximizing",
948
- "meagre": "meager",
949
- "mechanisation": "mechanization",
950
- "mechanise": "mechanize",
951
- "mechanised": "mechanized",
952
- "mechanises": "mechanizes",
953
- "mechanising": "mechanizing",
954
- "mediaeval": "medieval",
955
- "memorialise": "memorialize",
956
- "memorialised": "memorialized",
957
- "memorialises": "memorializes",
958
- "memorialising": "memorializing",
959
- "memorise": "memorize",
960
- "memorised": "memorized",
961
- "memorises": "memorizes",
962
- "memorising": "memorizing",
963
- "mesmerise": "mesmerize",
964
- "mesmerised": "mesmerized",
965
- "mesmerises": "mesmerizes",
966
- "mesmerising": "mesmerizing",
967
- "metabolise": "metabolize",
968
- "metabolised": "metabolized",
969
- "metabolises": "metabolizes",
970
- "metabolising": "metabolizing",
971
- "metre": "meter",
972
- "metres": "meters",
973
- "mhm": "hmm",
974
- "micrometre": "micrometer",
975
- "micrometres": "micrometers",
976
- "militarise": "militarize",
977
- "militarised": "militarized",
978
- "militarises": "militarizes",
979
- "militarising": "militarizing",
980
- "milligramme": "milligram",
981
- "milligrammes": "milligrams",
982
- "millilitre": "milliliter",
983
- "millilitres": "milliliters",
984
- "millimetre": "millimeter",
985
- "millimetres": "millimeters",
986
- "miniaturisation": "miniaturization",
987
- "miniaturise": "miniaturize",
988
- "miniaturised": "miniaturized",
989
- "miniaturises": "miniaturizes",
990
- "miniaturising": "miniaturizing",
991
- "minibusses": "minibuses",
992
- "minimise": "minimize",
993
- "minimised": "minimized",
994
- "minimises": "minimizes",
995
- "minimising": "minimizing",
996
- "misbehaviour": "misbehavior",
997
- "misdemeanour": "misdemeanor",
998
- "misdemeanours": "misdemeanors",
999
- "misspelt": "misspelled",
1000
- "mitre": "miter",
1001
- "mitres": "miters",
1002
- "mm": "hmm",
1003
- "mmm": "hmm",
1004
- "mobilisation": "mobilization",
1005
- "mobilise": "mobilize",
1006
- "mobilised": "mobilized",
1007
- "mobilises": "mobilizes",
1008
- "mobilising": "mobilizing",
1009
- "modelled": "modeled",
1010
- "modeller": "modeler",
1011
- "modellers": "modelers",
1012
- "modelling": "modeling",
1013
- "modernise": "modernize",
1014
- "modernised": "modernized",
1015
- "modernises": "modernizes",
1016
- "modernising": "modernizing",
1017
- "moisturise": "moisturize",
1018
- "moisturised": "moisturized",
1019
- "moisturiser": "moisturizer",
1020
- "moisturisers": "moisturizers",
1021
- "moisturises": "moisturizes",
1022
- "moisturising": "moisturizing",
1023
- "monologue": "monolog",
1024
- "monologues": "monologs",
1025
- "monopolisation": "monopolization",
1026
- "monopolise": "monopolize",
1027
- "monopolised": "monopolized",
1028
- "monopolises": "monopolizes",
1029
- "monopolising": "monopolizing",
1030
- "moralise": "moralize",
1031
- "moralised": "moralized",
1032
- "moralises": "moralizes",
1033
- "moralising": "moralizing",
1034
- "motorised": "motorized",
1035
- "mould": "mold",
1036
- "moulded": "molded",
1037
- "moulder": "molder",
1038
- "mouldered": "moldered",
1039
- "mouldering": "moldering",
1040
- "moulders": "molders",
1041
- "mouldier": "moldier",
1042
- "mouldiest": "moldiest",
1043
- "moulding": "molding",
1044
- "mouldings": "moldings",
1045
- "moulds": "molds",
1046
- "mouldy": "moldy",
1047
- "moult": "molt",
1048
- "moulted": "molted",
1049
- "moulting": "molting",
1050
- "moults": "molts",
1051
- "moustache": "mustache",
1052
- "moustached": "mustached",
1053
- "moustaches": "mustaches",
1054
- "moustachioed": "mustachioed",
1055
- "multicoloured": "multicolored",
1056
- "nationalisation": "nationalization",
1057
- "nationalisations": "nationalizations",
1058
- "nationalise": "nationalize",
1059
- "nationalised": "nationalized",
1060
- "nationalises": "nationalizes",
1061
- "nationalising": "nationalizing",
1062
- "naturalisation": "naturalization",
1063
- "naturalise": "naturalize",
1064
- "naturalised": "naturalized",
1065
- "naturalises": "naturalizes",
1066
- "naturalising": "naturalizing",
1067
- "neighbour": "neighbor",
1068
- "neighbourhood": "neighborhood",
1069
- "neighbourhoods": "neighborhoods",
1070
- "neighbouring": "neighboring",
1071
- "neighbourliness": "neighborliness",
1072
- "neighbourly": "neighborly",
1073
- "neighbours": "neighbors",
1074
- "neutralisation": "neutralization",
1075
- "neutralise": "neutralize",
1076
- "neutralised": "neutralized",
1077
- "neutralises": "neutralizes",
1078
- "neutralising": "neutralizing",
1079
- "normalisation": "normalization",
1080
- "normalise": "normalize",
1081
- "normalised": "normalized",
1082
- "normalises": "normalizes",
1083
- "normalising": "normalizing",
1084
- "odour": "odor",
1085
- "odourless": "odorless",
1086
- "odours": "odors",
1087
- "oesophagus": "esophagus",
1088
- "oesophaguses": "esophaguses",
1089
- "oestrogen": "estrogen",
1090
- "offence": "offense",
1091
- "offences": "offenses",
1092
- "omelette": "omelet",
1093
- "omelettes": "omelets",
1094
- "optimise": "optimize",
1095
- "optimised": "optimized",
1096
- "optimises": "optimizes",
1097
- "optimising": "optimizing",
1098
- "organisation": "organization",
1099
- "organisational": "organizational",
1100
- "organisations": "organizations",
1101
- "organise": "organize",
1102
- "organised": "organized",
1103
- "organiser": "organizer",
1104
- "organisers": "organizers",
1105
- "organises": "organizes",
1106
- "organising": "organizing",
1107
- "orthopaedic": "orthopedic",
1108
- "orthopaedics": "orthopedics",
1109
- "ostracise": "ostracize",
1110
- "ostracised": "ostracized",
1111
- "ostracises": "ostracizes",
1112
- "ostracising": "ostracizing",
1113
- "outmanoeuvre": "outmaneuver",
1114
- "outmanoeuvred": "outmaneuvered",
1115
- "outmanoeuvres": "outmaneuvers",
1116
- "outmanoeuvring": "outmaneuvering",
1117
- "overemphasise": "overemphasize",
1118
- "overemphasised": "overemphasized",
1119
- "overemphasises": "overemphasizes",
1120
- "overemphasising": "overemphasizing",
1121
- "oxidisation": "oxidization",
1122
- "oxidise": "oxidize",
1123
- "oxidised": "oxidized",
1124
- "oxidises": "oxidizes",
1125
- "oxidising": "oxidizing",
1126
- "paederast": "pederast",
1127
- "paederasts": "pederasts",
1128
- "paediatric": "pediatric",
1129
- "paediatrician": "pediatrician",
1130
- "paediatricians": "pediatricians",
1131
- "paediatrics": "pediatrics",
1132
- "paedophile": "pedophile",
1133
- "paedophiles": "pedophiles",
1134
- "paedophilia": "pedophilia",
1135
- "palaeolithic": "paleolithic",
1136
- "palaeontologist": "paleontologist",
1137
- "palaeontologists": "paleontologists",
1138
- "palaeontology": "paleontology",
1139
- "panelled": "paneled",
1140
- "panelling": "paneling",
1141
- "panellist": "panelist",
1142
- "panellists": "panelists",
1143
- "paralyse": "paralyze",
1144
- "paralysed": "paralyzed",
1145
- "paralyses": "paralyzes",
1146
- "paralysing": "paralyzing",
1147
- "parcelled": "parceled",
1148
- "parcelling": "parceling",
1149
- "parlour": "parlor",
1150
- "parlours": "parlors",
1151
- "particularise": "particularize",
1152
- "particularised": "particularized",
1153
- "particularises": "particularizes",
1154
- "particularising": "particularizing",
1155
- "passivisation": "passivization",
1156
- "passivise": "passivize",
1157
- "passivised": "passivized",
1158
- "passivises": "passivizes",
1159
- "passivising": "passivizing",
1160
- "pasteurisation": "pasteurization",
1161
- "pasteurise": "pasteurize",
1162
- "pasteurised": "pasteurized",
1163
- "pasteurises": "pasteurizes",
1164
- "pasteurising": "pasteurizing",
1165
- "patronise": "patronize",
1166
- "patronised": "patronized",
1167
- "patronises": "patronizes",
1168
- "patronising": "patronizing",
1169
- "patronisingly": "patronizingly",
1170
- "pedalled": "pedaled",
1171
- "pedalling": "pedaling",
1172
- "pedestrianisation": "pedestrianization",
1173
- "pedestrianise": "pedestrianize",
1174
- "pedestrianised": "pedestrianized",
1175
- "pedestrianises": "pedestrianizes",
1176
- "pedestrianising": "pedestrianizing",
1177
- "penalise": "penalize",
1178
- "penalised": "penalized",
1179
- "penalises": "penalizes",
1180
- "penalising": "penalizing",
1181
- "pencilled": "penciled",
1182
- "pencilling": "penciling",
1183
- "personalise": "personalize",
1184
- "personalised": "personalized",
1185
- "personalises": "personalizes",
1186
- "personalising": "personalizing",
1187
- "pharmacopoeia": "pharmacopeia",
1188
- "pharmacopoeias": "pharmacopeias",
1189
- "philosophise": "philosophize",
1190
- "philosophised": "philosophized",
1191
- "philosophises": "philosophizes",
1192
- "philosophising": "philosophizing",
1193
- "philtre": "filter",
1194
- "philtres": "filters",
1195
- "phoney": "phony",
1196
- "plagiarise": "plagiarize",
1197
- "plagiarised": "plagiarized",
1198
- "plagiarises": "plagiarizes",
1199
- "plagiarising": "plagiarizing",
1200
- "plough": "plow",
1201
- "ploughed": "plowed",
1202
- "ploughing": "plowing",
1203
- "ploughman": "plowman",
1204
- "ploughmen": "plowmen",
1205
- "ploughs": "plows",
1206
- "ploughshare": "plowshare",
1207
- "ploughshares": "plowshares",
1208
- "polarisation": "polarization",
1209
- "polarise": "polarize",
1210
- "polarised": "polarized",
1211
- "polarises": "polarizes",
1212
- "polarising": "polarizing",
1213
- "politicisation": "politicization",
1214
- "politicise": "politicize",
1215
- "politicised": "politicized",
1216
- "politicises": "politicizes",
1217
- "politicising": "politicizing",
1218
- "popularisation": "popularization",
1219
- "popularise": "popularize",
1220
- "popularised": "popularized",
1221
- "popularises": "popularizes",
1222
- "popularising": "popularizing",
1223
- "pouffe": "pouf",
1224
- "pouffes": "poufs",
1225
- "practise": "practice",
1226
- "practised": "practiced",
1227
- "practises": "practices",
1228
- "practising": "practicing",
1229
- "praesidium": "presidium",
1230
- "praesidiums": "presidiums",
1231
- "pressurisation": "pressurization",
1232
- "pressurise": "pressurize",
1233
- "pressurised": "pressurized",
1234
- "pressurises": "pressurizes",
1235
- "pressurising": "pressurizing",
1236
- "pretence": "pretense",
1237
- "pretences": "pretenses",
1238
- "primaeval": "primeval",
1239
- "prioritisation": "prioritization",
1240
- "prioritise": "prioritize",
1241
- "prioritised": "prioritized",
1242
- "prioritises": "prioritizes",
1243
- "prioritising": "prioritizing",
1244
- "privatisation": "privatization",
1245
- "privatisations": "privatizations",
1246
- "privatise": "privatize",
1247
- "privatised": "privatized",
1248
- "privatises": "privatizes",
1249
- "privatising": "privatizing",
1250
- "professionalisation": "professionalization",
1251
- "professionalise": "professionalize",
1252
- "professionalised": "professionalized",
1253
- "professionalises": "professionalizes",
1254
- "professionalising": "professionalizing",
1255
- "programme": "program",
1256
- "programmes": "programs",
1257
- "prologue": "prolog",
1258
- "prologues": "prologs",
1259
- "propagandise": "propagandize",
1260
- "propagandised": "propagandized",
1261
- "propagandises": "propagandizes",
1262
- "propagandising": "propagandizing",
1263
- "proselytise": "proselytize",
1264
- "proselytised": "proselytized",
1265
- "proselytiser": "proselytizer",
1266
- "proselytisers": "proselytizers",
1267
- "proselytises": "proselytizes",
1268
- "proselytising": "proselytizing",
1269
- "psychoanalyse": "psychoanalyze",
1270
- "psychoanalysed": "psychoanalyzed",
1271
- "psychoanalyses": "psychoanalyzes",
1272
- "psychoanalysing": "psychoanalyzing",
1273
- "publicise": "publicize",
1274
- "publicised": "publicized",
1275
- "publicises": "publicizes",
1276
- "publicising": "publicizing",
1277
- "pulverisation": "pulverization",
1278
- "pulverise": "pulverize",
1279
- "pulverised": "pulverized",
1280
- "pulverises": "pulverizes",
1281
- "pulverising": "pulverizing",
1282
- "pummelled": "pummel",
1283
- "pummelling": "pummeled",
1284
- "pyjama": "pajama",
1285
- "pyjamas": "pajamas",
1286
- "pzazz": "pizzazz",
1287
- "quarrelled": "quarreled",
1288
- "quarrelling": "quarreling",
1289
- "radicalise": "radicalize",
1290
- "radicalised": "radicalized",
1291
- "radicalises": "radicalizes",
1292
- "radicalising": "radicalizing",
1293
- "rancour": "rancor",
1294
- "randomise": "randomize",
1295
- "randomised": "randomized",
1296
- "randomises": "randomizes",
1297
- "randomising": "randomizing",
1298
- "rationalisation": "rationalization",
1299
- "rationalisations": "rationalizations",
1300
- "rationalise": "rationalize",
1301
- "rationalised": "rationalized",
1302
- "rationalises": "rationalizes",
1303
- "rationalising": "rationalizing",
1304
- "ravelled": "raveled",
1305
- "ravelling": "raveling",
1306
- "realisable": "realizable",
1307
- "realisation": "realization",
1308
- "realisations": "realizations",
1309
- "realise": "realize",
1310
- "realised": "realized",
1311
- "realises": "realizes",
1312
- "realising": "realizing",
1313
- "recognisable": "recognizable",
1314
- "recognisably": "recognizably",
1315
- "recognisance": "recognizance",
1316
- "recognise": "recognize",
1317
- "recognised": "recognized",
1318
- "recognises": "recognizes",
1319
- "recognising": "recognizing",
1320
- "reconnoitre": "reconnoiter",
1321
- "reconnoitred": "reconnoitered",
1322
- "reconnoitres": "reconnoiters",
1323
- "reconnoitring": "reconnoitering",
1324
- "refuelled": "refueled",
1325
- "refuelling": "refueling",
1326
- "regularisation": "regularization",
1327
- "regularise": "regularize",
1328
- "regularised": "regularized",
1329
- "regularises": "regularizes",
1330
- "regularising": "regularizing",
1331
- "remodelled": "remodeled",
1332
- "remodelling": "remodeling",
1333
- "remould": "remold",
1334
- "remoulded": "remolded",
1335
- "remoulding": "remolding",
1336
- "remoulds": "remolds",
1337
- "reorganisation": "reorganization",
1338
- "reorganisations": "reorganizations",
1339
- "reorganise": "reorganize",
1340
- "reorganised": "reorganized",
1341
- "reorganises": "reorganizes",
1342
- "reorganising": "reorganizing",
1343
- "revelled": "reveled",
1344
- "reveller": "reveler",
1345
- "revellers": "revelers",
1346
- "revelling": "reveling",
1347
- "revitalise": "revitalize",
1348
- "revitalised": "revitalized",
1349
- "revitalises": "revitalizes",
1350
- "revitalising": "revitalizing",
1351
- "revolutionise": "revolutionize",
1352
- "revolutionised": "revolutionized",
1353
- "revolutionises": "revolutionizes",
1354
- "revolutionising": "revolutionizing",
1355
- "rhapsodise": "rhapsodize",
1356
- "rhapsodised": "rhapsodized",
1357
- "rhapsodises": "rhapsodizes",
1358
- "rhapsodising": "rhapsodizing",
1359
- "rigour": "rigor",
1360
- "rigours": "rigors",
1361
- "ritualised": "ritualized",
1362
- "rivalled": "rivaled",
1363
- "rivalling": "rivaling",
1364
- "romanticise": "romanticize",
1365
- "romanticised": "romanticized",
1366
- "romanticises": "romanticizes",
1367
- "romanticising": "romanticizing",
1368
- "rumour": "rumor",
1369
- "rumoured": "rumored",
1370
- "rumours": "rumors",
1371
- "sabre": "saber",
1372
- "sabres": "sabers",
1373
- "saltpetre": "saltpeter",
1374
- "sanitise": "sanitize",
1375
- "sanitised": "sanitized",
1376
- "sanitises": "sanitizes",
1377
- "sanitising": "sanitizing",
1378
- "satirise": "satirize",
1379
- "satirised": "satirized",
1380
- "satirises": "satirizes",
1381
- "satirising": "satirizing",
1382
- "saviour": "savior",
1383
- "saviours": "saviors",
1384
- "savour": "savor",
1385
- "savoured": "savored",
1386
- "savouries": "savories",
1387
- "savouring": "savoring",
1388
- "savours": "savors",
1389
- "savoury": "savory",
1390
- "scandalise": "scandalize",
1391
- "scandalised": "scandalized",
1392
- "scandalises": "scandalizes",
1393
- "scandalising": "scandalizing",
1394
- "sceptic": "skeptic",
1395
- "sceptical": "skeptical",
1396
- "sceptically": "skeptically",
1397
- "scepticism": "skepticism",
1398
- "sceptics": "skeptics",
1399
- "sceptre": "scepter",
1400
- "sceptres": "scepters",
1401
- "scrutinise": "scrutinize",
1402
- "scrutinised": "scrutinized",
1403
- "scrutinises": "scrutinizes",
1404
- "scrutinising": "scrutinizing",
1405
- "secularisation": "secularization",
1406
- "secularise": "secularize",
1407
- "secularised": "secularized",
1408
- "secularises": "secularizes",
1409
- "secularising": "secularizing",
1410
- "sensationalise": "sensationalize",
1411
- "sensationalised": "sensationalized",
1412
- "sensationalises": "sensationalizes",
1413
- "sensationalising": "sensationalizing",
1414
- "sensitise": "sensitize",
1415
- "sensitised": "sensitized",
1416
- "sensitises": "sensitizes",
1417
- "sensitising": "sensitizing",
1418
- "sentimentalise": "sentimentalize",
1419
- "sentimentalised": "sentimentalized",
1420
- "sentimentalises": "sentimentalizes",
1421
- "sentimentalising": "sentimentalizing",
1422
- "sepulchre": "sepulcher",
1423
- "sepulchres": "sepulchers",
1424
- "serialisation": "serialization",
1425
- "serialisations": "serializations",
1426
- "serialise": "serialize",
1427
- "serialised": "serialized",
1428
- "serialises": "serializes",
1429
- "serialising": "serializing",
1430
- "sermonise": "sermonize",
1431
- "sermonised": "sermonized",
1432
- "sermonises": "sermonizes",
1433
- "sermonising": "sermonizing",
1434
- "sheikh": "sheik",
1435
- "shovelled": "shoveled",
1436
- "shovelling": "shoveling",
1437
- "shrivelled": "shriveled",
1438
- "shrivelling": "shriveling",
1439
- "signalise": "signalize",
1440
- "signalised": "signalized",
1441
- "signalises": "signalizes",
1442
- "signalising": "signalizing",
1443
- "signalled": "signaled",
1444
- "signalling": "signaling",
1445
- "smoulder": "smolder",
1446
- "smouldered": "smoldered",
1447
- "smouldering": "smoldering",
1448
- "smoulders": "smolders",
1449
- "snivelled": "sniveled",
1450
- "snivelling": "sniveling",
1451
- "snorkelled": "snorkeled",
1452
- "snorkelling": "snorkeling",
1453
- "snowplough": "snowplow",
1454
- "snowploughs": "snowplow",
1455
- "socialisation": "socialization",
1456
- "socialise": "socialize",
1457
- "socialised": "socialized",
1458
- "socialises": "socializes",
1459
- "socialising": "socializing",
1460
- "sodomise": "sodomize",
1461
- "sodomised": "sodomized",
1462
- "sodomises": "sodomizes",
1463
- "sodomising": "sodomizing",
1464
- "solemnise": "solemnize",
1465
- "solemnised": "solemnized",
1466
- "solemnises": "solemnizes",
1467
- "solemnising": "solemnizing",
1468
- "sombre": "somber",
1469
- "specialisation": "specialization",
1470
- "specialisations": "specializations",
1471
- "specialise": "specialize",
1472
- "specialised": "specialized",
1473
- "specialises": "specializes",
1474
- "specialising": "specializing",
1475
- "spectre": "specter",
1476
- "spectres": "specters",
1477
- "spiralled": "spiraled",
1478
- "spiralling": "spiraling",
1479
- "splendour": "splendor",
1480
- "splendours": "splendors",
1481
- "squirrelled": "squirreled",
1482
- "squirrelling": "squirreling",
1483
- "stabilisation": "stabilization",
1484
- "stabilise": "stabilize",
1485
- "stabilised": "stabilized",
1486
- "stabiliser": "stabilizer",
1487
- "stabilisers": "stabilizers",
1488
- "stabilises": "stabilizes",
1489
- "stabilising": "stabilizing",
1490
- "standardisation": "standardization",
1491
- "standardise": "standardize",
1492
- "standardised": "standardized",
1493
- "standardises": "standardizes",
1494
- "standardising": "standardizing",
1495
- "stencilled": "stenciled",
1496
- "stencilling": "stenciling",
1497
- "sterilisation": "sterilization",
1498
- "sterilisations": "sterilizations",
1499
- "sterilise": "sterilize",
1500
- "sterilised": "sterilized",
1501
- "steriliser": "sterilizer",
1502
- "sterilisers": "sterilizers",
1503
- "sterilises": "sterilizes",
1504
- "sterilising": "sterilizing",
1505
- "stigmatisation": "stigmatization",
1506
- "stigmatise": "stigmatize",
1507
- "stigmatised": "stigmatized",
1508
- "stigmatises": "stigmatizes",
1509
- "stigmatising": "stigmatizing",
1510
- "storey": "story",
1511
- "storeys": "stories",
1512
- "subsidisation": "subsidization",
1513
- "subsidise": "subsidize",
1514
- "subsidised": "subsidized",
1515
- "subsidiser": "subsidizer",
1516
- "subsidisers": "subsidizers",
1517
- "subsidises": "subsidizes",
1518
- "subsidising": "subsidizing",
1519
- "succour": "succor",
1520
- "succoured": "succored",
1521
- "succouring": "succoring",
1522
- "succours": "succors",
1523
- "sulphate": "sulfate",
1524
- "sulphates": "sulfates",
1525
- "sulphide": "sulfide",
1526
- "sulphides": "sulfides",
1527
- "sulphur": "sulfur",
1528
- "sulphurous": "sulfurous",
1529
- "summarise": "summarize",
1530
- "summarised": "summarized",
1531
- "summarises": "summarizes",
1532
- "summarising": "summarizing",
1533
- "swivelled": "swiveled",
1534
- "swivelling": "swiveling",
1535
- "symbolise": "symbolize",
1536
- "symbolised": "symbolized",
1537
- "symbolises": "symbolizes",
1538
- "symbolising": "symbolizing",
1539
- "sympathise": "sympathize",
1540
- "sympathised": "sympathized",
1541
- "sympathiser": "sympathizer",
1542
- "sympathisers": "sympathizers",
1543
- "sympathises": "sympathizes",
1544
- "sympathising": "sympathizing",
1545
- "synchronisation": "synchronization",
1546
- "synchronise": "synchronize",
1547
- "synchronised": "synchronized",
1548
- "synchronises": "synchronizes",
1549
- "synchronising": "synchronizing",
1550
- "synthesise": "synthesize",
1551
- "synthesised": "synthesized",
1552
- "synthesiser": "synthesizer",
1553
- "synthesisers": "synthesizers",
1554
- "synthesises": "synthesizes",
1555
- "synthesising": "synthesizing",
1556
- "syphon": "siphon",
1557
- "syphoned": "siphoned",
1558
- "syphoning": "siphoning",
1559
- "syphons": "siphons",
1560
- "systematisation": "systematization",
1561
- "systematise": "systematize",
1562
- "systematised": "systematized",
1563
- "systematises": "systematizes",
1564
- "systematising": "systematizing",
1565
- "tantalise": "tantalize",
1566
- "tantalised": "tantalized",
1567
- "tantalises": "tantalizes",
1568
- "tantalising": "tantalizing",
1569
- "tantalisingly": "tantalizingly",
1570
- "tasselled": "tasseled",
1571
- "technicolour": "technicolor",
1572
- "temporise": "temporize",
1573
- "temporised": "temporized",
1574
- "temporises": "temporizes",
1575
- "temporising": "temporizing",
1576
- "tenderise": "tenderize",
1577
- "tenderised": "tenderized",
1578
- "tenderises": "tenderizes",
1579
- "tenderising": "tenderizing",
1580
- "terrorise": "terrorize",
1581
- "terrorised": "terrorized",
1582
- "terrorises": "terrorizes",
1583
- "terrorising": "terrorizing",
1584
- "theatre": "theater",
1585
- "theatregoer": "theatergoer",
1586
- "theatregoers": "theatergoers",
1587
- "theatres": "theaters",
1588
- "theorise": "theorize",
1589
- "theorised": "theorized",
1590
- "theorises": "theorizes",
1591
- "theorising": "theorizing",
1592
- "tonne": "ton",
1593
- "tonnes": "tons",
1594
- "towelled": "toweled",
1595
- "towelling": "toweling",
1596
- "toxaemia": "toxemia",
1597
- "tranquillise": "tranquilize",
1598
- "tranquillised": "tranquilized",
1599
- "tranquilliser": "tranquilizer",
1600
- "tranquillisers": "tranquilizers",
1601
- "tranquillises": "tranquilizes",
1602
- "tranquillising": "tranquilizing",
1603
- "tranquillity": "tranquility",
1604
- "tranquillize": "tranquilize",
1605
- "tranquillized": "tranquilized",
1606
- "tranquillizer": "tranquilizer",
1607
- "tranquillizers": "tranquilizers",
1608
- "tranquillizes": "tranquilizes",
1609
- "tranquillizing": "tranquilizing",
1610
- "tranquilly": "tranquility",
1611
- "transistorised": "transistorized",
1612
- "traumatise": "traumatize",
1613
- "traumatised": "traumatized",
1614
- "traumatises": "traumatizes",
1615
- "traumatising": "traumatizing",
1616
- "travelled": "traveled",
1617
- "traveller": "traveler",
1618
- "travellers": "travelers",
1619
- "travelling": "traveling",
1620
- "travelog": "travelogue",
1621
- "travelogs": "travelogues",
1622
- "trialled": "trialed",
1623
- "trialling": "trialing",
1624
- "tricolour": "tricolor",
1625
- "tricolours": "tricolors",
1626
- "trivialise": "trivialize",
1627
- "trivialised": "trivialized",
1628
- "trivialises": "trivializes",
1629
- "trivialising": "trivializing",
1630
- "tumour": "tumor",
1631
- "tumours": "tumors",
1632
- "tunnelled": "tunneled",
1633
- "tunnelling": "tunneling",
1634
- "tyrannise": "tyrannize",
1635
- "tyrannised": "tyrannized",
1636
- "tyrannises": "tyrannizes",
1637
- "tyrannising": "tyrannizing",
1638
- "tyre": "tire",
1639
- "tyres": "tires",
1640
- "unauthorised": "unauthorized",
1641
- "uncivilised": "uncivilized",
1642
- "underutilised": "underutilized",
1643
- "unequalled": "unequaled",
1644
- "unfavourable": "unfavorable",
1645
- "unfavourably": "unfavorably",
1646
- "unionisation": "unionization",
1647
- "unionise": "unionize",
1648
- "unionised": "unionized",
1649
- "unionises": "unionizes",
1650
- "unionising": "unionizing",
1651
- "unorganised": "unorganized",
1652
- "unravelled": "unraveled",
1653
- "unravelling": "unraveling",
1654
- "unrecognisable": "unrecognizable",
1655
- "unrecognised": "unrecognized",
1656
- "unrivalled": "unrivaled",
1657
- "unsavoury": "unsavory",
1658
- "untrammelled": "untrammeled",
1659
- "urbanisation": "urbanization",
1660
- "urbanise": "urbanize",
1661
- "urbanised": "urbanized",
1662
- "urbanises": "urbanizes",
1663
- "urbanising": "urbanizing",
1664
- "utilisable": "utilizable",
1665
- "utilisation": "utilization",
1666
- "utilise": "utilize",
1667
- "utilised": "utilized",
1668
- "utilises": "utilizes",
1669
- "utilising": "utilizing",
1670
- "valour": "valor",
1671
- "vandalise": "vandalize",
1672
- "vandalised": "vandalized",
1673
- "vandalises": "vandalizes",
1674
- "vandalising": "vandalizing",
1675
- "vaporisation": "vaporization",
1676
- "vaporise": "vaporize",
1677
- "vaporised": "vaporized",
1678
- "vaporises": "vaporizes",
1679
- "vaporising": "vaporizing",
1680
- "vapour": "vapor",
1681
- "vapours": "vapors",
1682
- "verbalise": "verbalize",
1683
- "verbalised": "verbalized",
1684
- "verbalises": "verbalizes",
1685
- "verbalising": "verbalizing",
1686
- "victimisation": "victimization",
1687
- "victimise": "victimize",
1688
- "victimised": "victimized",
1689
- "victimises": "victimizes",
1690
- "victimising": "victimizing",
1691
- "videodisc": "videodisk",
1692
- "videodiscs": "videodisks",
1693
- "vigour": "vigor",
1694
- "visualisation": "visualization",
1695
- "visualisations": "visualizations",
1696
- "visualise": "visualize",
1697
- "visualised": "visualized",
1698
- "visualises": "visualizes",
1699
- "visualising": "visualizing",
1700
- "vocalisation": "vocalization",
1701
- "vocalisations": "vocalizations",
1702
- "vocalise": "vocalize",
1703
- "vocalised": "vocalized",
1704
- "vocalises": "vocalizes",
1705
- "vocalising": "vocalizing",
1706
- "vulcanised": "vulcanized",
1707
- "vulgarisation": "vulgarization",
1708
- "vulgarise": "vulgarize",
1709
- "vulgarised": "vulgarized",
1710
- "vulgarises": "vulgarizes",
1711
- "vulgarising": "vulgarizing",
1712
- "waggon": "wagon",
1713
- "waggons": "wagons",
1714
- "watercolour": "watercolor",
1715
- "watercolours": "watercolors",
1716
- "weaselled": "weaseled",
1717
- "weaselling": "weaseling",
1718
- "westernisation": "westernization",
1719
- "westernise": "westernize",
1720
- "westernised": "westernized",
1721
- "westernises": "westernizes",
1722
- "westernising": "westernizing",
1723
- "womanise": "womanize",
1724
- "womanised": "womanized",
1725
- "womaniser": "womanizer",
1726
- "womanisers": "womanizers",
1727
- "womanises": "womanizes",
1728
- "womanising": "womanizing",
1729
- "woollen": "woolen",
1730
- "woollens": "woolens",
1731
- "woollies": "woolies",
1732
- "woolly": "wooly",
1733
- "worshipped": "worshiped",
1734
- "worshipper": "worshiper",
1735
- "worshipping": "worshiping",
1736
- "yodelled": "yodeled",
1737
- "yodelling": "yodeling",
1738
- "yoghourt": "yogurt",
1739
- "yoghourts": "yogurts",
1740
- "yoghurt": "yogurt",
1741
- "yoghurts": "yogurts"
1742
- }
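The deleted `normalizer.json` above is a British-to-American spelling map that the WER benchmark feeds into `EnglishTextNormalizer`, so references and hypotheses are spelled consistently before scoring. A minimal sketch of applying such a mapping on its own, assuming a plain word-level lookup (the real normalizer in `transformers` also handles numbers, punctuation, and contractions); the file path is a placeholder:

```python
import json
import re

def normalize_spelling(text: str, mapping_path: str = "normalizer.json") -> str:
    """Replace British spellings with their American equivalents, word by word."""
    with open(mapping_path, "r", encoding="utf-8") as f:
        mapping = json.load(f)  # e.g. {"favour": "favor", "theatre": "theater", ...}
    # Split on non-word characters but keep the separators, so each word can be
    # looked up as a whole token while punctuation and spacing are preserved.
    parts = re.split(r"(\W+)", text.lower())
    return "".join(mapping.get(p, p) for p in parts)

# normalize_spelling("The theatre programme was organised")
# -> "the theater program was organized"
```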
whisper_pipeline/faster-whisper-main/benchmark/requirements.benchmark.txt DELETED
@@ -1,6 +0,0 @@
1
- transformers
2
- jiwer
3
- evaluate
4
- datasets
5
- memory_profiler
6
- py3nvml
whisper_pipeline/faster-whisper-main/benchmark/speed_benchmark.py DELETED
@@ -1,31 +0,0 @@
1
- import argparse
2
- import timeit
3
-
4
- from typing import Callable
5
-
6
- from utils import inference
7
-
8
- parser = argparse.ArgumentParser(description="Speed benchmark")
9
- parser.add_argument(
10
- "--repeat",
11
- type=int,
12
- default=3,
13
- help="Times an experiment will be run.",
14
- )
15
- args = parser.parse_args()
16
-
17
-
18
- def measure_speed(func: Callable[[], None]):
19
- # as written in https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat,
20
- # min should be taken rather than the average
21
- runtimes = timeit.repeat(
22
- func,
23
- repeat=args.repeat,
24
- number=10,
25
- )
26
- print(runtimes)
27
- print("Min execution time: %.3fs" % (min(runtimes) / 10.0))
28
-
29
-
30
- if __name__ == "__main__":
31
- measure_speed(inference)
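The deleted speed benchmark times 10 consecutive calls per repeat and reports the minimum, following the `timeit` documentation's advice that the minimum of the repeats, not the mean, is the useful lower bound. The same pattern in isolation, with a throwaway workload standing in for the real `inference()` from `utils.py`:

```python
import timeit

def work():
    # placeholder workload; the deleted script times faster-whisper inference here
    sum(i * i for i in range(100_000))

# 3 repeats of 10 calls each, mirroring the deleted script; dividing the
# fastest repeat by the call count gives the per-call lower bound.
runtimes = timeit.repeat(work, repeat=3, number=10)
print("Min execution time: %.3fs" % (min(runtimes) / 10.0))
```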
whisper_pipeline/faster-whisper-main/benchmark/utils.py DELETED
@@ -1,39 +0,0 @@
1
- import logging
2
-
3
- from threading import Thread
4
- from typing import Optional
5
-
6
- from faster_whisper import WhisperModel
7
-
8
- model_path = "large-v3"
9
- model = WhisperModel(model_path, device="cuda")
10
-
11
-
12
- def inference():
13
- segments, info = model.transcribe("benchmark.m4a", language="fr")
14
- for segment in segments:
15
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
16
-
17
-
18
- def get_logger(name: Optional[str] = None) -> logging.Logger:
19
- formatter = logging.Formatter("%(levelname)s: %(message)s")
20
- logger = logging.getLogger(name)
21
- logger.setLevel(logging.DEBUG)
22
- handler = logging.StreamHandler()
23
- handler.setFormatter(formatter)
24
- logger.addHandler(handler)
25
- return logger
26
-
27
-
28
- class MyThread(Thread):
29
- def __init__(self, func, params):
30
- super(MyThread, self).__init__()
31
- self.func = func
32
- self.params = params
33
- self.result = None
34
-
35
- def run(self):
36
- self.result = self.func(*self.params)
37
-
38
- def get_result(self):
39
- return self.result
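`MyThread` in the deleted `utils.py` is a small `threading.Thread` subclass that keeps the wrapped function's return value, which the standard `Thread.run` discards. A minimal usage sketch; the `add` function and its arguments are placeholders, and the import assumes the deleted module is on the path:

```python
from utils import MyThread  # the deleted benchmark/utils.py module

def add(a: int, b: int) -> int:
    return a + b

t = MyThread(add, (2, 3))
t.start()
t.join()                # wait for run() to finish so the result is populated
print(t.get_result())   # -> 5
```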
whisper_pipeline/faster-whisper-main/benchmark/wer_benchmark.py DELETED
@@ -1,64 +0,0 @@
1
- import argparse
2
- import json
3
- import os
4
-
5
- from datasets import load_dataset
6
- from evaluate import load
7
- from tqdm import tqdm
8
- from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
9
-
10
- from faster_whisper import WhisperModel
11
-
12
- parser = argparse.ArgumentParser(description="WER benchmark")
13
- parser.add_argument(
14
- "--audio_numb",
15
- type=int,
16
- default=None,
17
- help="Specify the number of validation audio files in the dataset."
18
- " Set to None to retrieve all audio files.",
19
- )
20
- args = parser.parse_args()
21
-
22
- model_path = "large-v3"
23
- model = WhisperModel(model_path, device="cuda")
24
-
25
- # load the dataset with streaming mode
26
- dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
27
-
28
- # define the evaluation metric
29
- wer_metric = load("wer")
30
-
31
- with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
32
- normalizer = EnglishTextNormalizer(json.load(f))
33
-
34
-
35
- def inference(batch):
36
- batch["transcription"] = []
37
- for sample in batch["audio"]:
38
- segments, info = model.transcribe(sample["array"], language="en")
39
- batch["transcription"].append("".join([segment.text for segment in segments]))
40
- batch["reference"] = batch["text"]
41
- return batch
42
-
43
-
44
- dataset = dataset.map(function=inference, batched=True, batch_size=16)
45
-
46
- all_transcriptions = []
47
- all_references = []
48
-
49
- # iterate over the dataset and run inference
50
- for i, result in tqdm(enumerate(dataset), desc="Evaluating..."):
51
- all_transcriptions.append(result["transcription"])
52
- all_references.append(result["reference"])
53
- if args.audio_numb and i == (args.audio_numb - 1):
54
- break
55
-
56
- # normalize predictions and references
57
- all_transcriptions = [normalizer(transcription) for transcription in all_transcriptions]
58
- all_references = [normalizer(reference) for reference in all_references]
59
-
60
- # compute the WER metric
61
- wer = 100 * wer_metric.compute(
62
- predictions=all_transcriptions, references=all_references
63
- )
64
- print("WER: %.3f" % wer)
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/__init__.py DELETED
@@ -1,14 +0,0 @@
1
- from faster_whisper.audio import decode_audio
2
- from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
3
- from faster_whisper.utils import available_models, download_model, format_timestamp
4
- from faster_whisper.version import __version__
5
-
6
- __all__ = [
7
- "available_models",
8
- "decode_audio",
9
- "WhisperModel",
10
- "BatchedInferencePipeline",
11
- "download_model",
12
- "format_timestamp",
13
- "__version__",
14
- ]
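This deleted `__init__.py` defines the public surface that downstream code imports. A minimal transcription sketch against that API; the model size, device, and audio path are placeholders chosen to keep the example light (the benchmarks above use `large-v3` on CUDA):

```python
from faster_whisper import WhisperModel

model = WhisperModel("tiny", device="cpu", compute_type="int8")
segments, info = model.transcribe("audio.wav", language="en")
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```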
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/__init__.py DELETED
File without changes
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/pyannote_vad_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
3
- size 17719103
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/silero_vad.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b99cbfd39246b6706f98ec13c7c50c6b299181f2474fa05cbc8046acc274396
3
- size 2313101
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/audio.py DELETED
@@ -1,58 +0,0 @@
1
- from typing import BinaryIO, Union
2
-
3
- import torch
4
- import torchaudio
5
-
6
-
7
- def decode_audio(
8
- input_file: Union[str, BinaryIO],
9
- sampling_rate: int = 16000,
10
- split_stereo: bool = False,
11
- ):
12
- """Decodes the audio.
13
-
14
- Args:
15
- input_file: Path to the input file or a file-like object.
16
- sampling_rate: Resample the audio to this sample rate.
17
- split_stereo: Return separate left and right channels.
18
-
19
- Returns:
20
- A float32 Torch Tensor.
21
-
22
- If `split_stereo` is enabled, the function returns a 2-tuple with the
23
- separated left and right channels.
24
- """
25
-
26
- waveform, audio_sf = torchaudio.load(input_file) # waveform: channels X T
27
-
28
- if audio_sf != sampling_rate:
29
- waveform = torchaudio.functional.resample(
30
- waveform, orig_freq=audio_sf, new_freq=sampling_rate
31
- )
32
- if split_stereo:
33
- return waveform[0], waveform[1]
34
-
35
- return waveform.mean(0)
36
-
37
-
38
- def pad_or_trim(array, length: int, *, axis: int = -1):
39
- """
40
- Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
41
- """
42
- axis = axis % array.ndim
43
- if array.shape[axis] > length:
44
- idx = [Ellipsis] * axis + [slice(length)] + [Ellipsis] * (array.ndim - axis - 1)
45
- return array[idx]
46
-
47
- if array.shape[axis] < length:
48
- pad_widths = (
49
- [
50
- 0,
51
- ]
52
- * array.ndim
53
- * 2
54
- )
55
- pad_widths[2 * axis] = length - array.shape[axis]
56
- array = torch.nn.functional.pad(array, tuple(pad_widths[::-1]))
57
-
58
- return array
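`decode_audio` in the deleted `audio.py` returns a mono float32 tensor resampled to the requested rate, and `pad_or_trim` fixes its length along the chosen axis. A hedged usage sketch; the file name is a placeholder, and 30 * 16000 samples matches the 30-second chunk length used elsewhere in the package:

```python
from faster_whisper.audio import decode_audio, pad_or_trim

waveform = decode_audio("speech.wav", sampling_rate=16000)  # 1-D float32 tensor
chunk = pad_or_trim(waveform, length=30 * 16000)            # exactly 480_000 samples
print(chunk.shape)
```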
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/feature_extractor.py DELETED
@@ -1,114 +0,0 @@
1
- import torch
2
-
3
-
4
- # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py # noqa: E501
5
- class FeatureExtractor:
6
- def __init__(
7
- self,
8
- device: str = "auto",
9
- feature_size=80,
10
- sampling_rate=16000,
11
- hop_length=160,
12
- chunk_length=30,
13
- n_fft=400,
14
- ):
15
- if device == "auto":
16
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
17
- else:
18
- self.device = device
19
- self.n_fft = n_fft
20
- self.hop_length = hop_length
21
- self.chunk_length = chunk_length
22
- self.n_samples = chunk_length * sampling_rate
23
- self.nb_max_frames = self.n_samples // hop_length
24
- self.time_per_frame = hop_length / sampling_rate
25
- self.sampling_rate = sampling_rate
26
- self.mel_filters = self.get_mel_filters(
27
- sampling_rate, n_fft, n_mels=feature_size
28
- )
29
-
30
- @staticmethod
31
- def get_mel_filters(sr, n_fft, n_mels=128):
32
- """
33
- Implementation of librosa.filters.mel in Pytorch
34
- """
35
- # Initialize the weights
36
- n_mels = int(n_mels)
37
-
38
- # Center freqs of each FFT bin
39
- fftfreqs = torch.fft.rfftfreq(n=n_fft, d=1.0 / sr)
40
-
41
- # 'Center freqs' of mel bands - uniformly spaced between limits
42
- min_mel = 0.0
43
- max_mel = 45.245640471924965
44
-
45
- mels = torch.linspace(min_mel, max_mel, n_mels + 2)
46
-
47
- # Fill in the linear scale
48
- f_min = 0.0
49
- f_sp = 200.0 / 3
50
- freqs = f_min + f_sp * mels
51
-
52
- # And now the nonlinear scale
53
- min_log_hz = 1000.0 # beginning of log region (Hz)
54
- min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
55
- logstep = torch.log(torch.tensor(6.4)) / 27.0 # step size for log region
56
-
57
- # If we have vector data, vectorize
58
- log_t = mels >= min_log_mel
59
- freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel))
60
-
61
- mel_f = freqs
62
-
63
- fdiff = torch.diff(mel_f)
64
- ramps = mel_f.view(-1, 1) - fftfreqs.view(1, -1)
65
-
66
- lower = -ramps[:-2] / fdiff[:-1].unsqueeze(1)
67
- upper = ramps[2:] / fdiff[1:].unsqueeze(1)
68
-
69
- # Intersect them with each other and zero, vectorized across all i
70
- weights = torch.maximum(torch.zeros_like(lower), torch.minimum(lower, upper))
71
-
72
- # Slaney-style mel is scaled to be approx constant energy per channel
73
- enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
74
- weights *= enorm.unsqueeze(1)
75
-
76
- return weights
77
-
78
- def __call__(self, waveform, padding=True, chunk_length=None, to_cpu=False):
79
- """
80
- Compute the log-Mel spectrogram of the provided audio.
81
- """
82
-
83
- if chunk_length is not None:
84
- self.n_samples = chunk_length * self.sampling_rate
85
- self.nb_max_frames = self.n_samples // self.hop_length
86
-
87
- if waveform.dtype is not torch.float32:
88
- waveform = waveform.to(torch.float32)
89
-
90
- waveform = (
91
- waveform.to(self.device)
92
- if self.device == "cuda" and not waveform.is_cuda
93
- else waveform
94
- )
95
-
96
- if padding:
97
- waveform = torch.nn.functional.pad(waveform, (0, self.n_samples))
98
-
99
- window = torch.hann_window(self.n_fft).to(waveform.device)
100
-
101
- stft = torch.stft(
102
- waveform, self.n_fft, self.hop_length, window=window, return_complex=True
103
- )
104
- magnitudes = stft[..., :-1].abs() ** 2
105
-
106
- mel_spec = self.mel_filters.to(waveform.device) @ magnitudes
107
-
108
- log_spec = torch.clamp(mel_spec, min=1e-10).log10()
109
- log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
110
- log_spec = (log_spec + 4.0) / 4.0
111
-
112
- # When the model is running on multiple GPUs, the output should be moved
113
- # to the CPU since we don't know which GPU will handle the next job.
114
- return log_spec.cpu() if to_cpu else log_spec
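The deleted `FeatureExtractor` turns a 16 kHz waveform into an 80-bin log-Mel spectrogram with a 160-sample hop, clamped and rescaled the way the Whisper encoder expects. A small sketch on synthetic audio; the CPU device and the one-second input are assumptions made only for the example:

```python
import torch
from faster_whisper.feature_extractor import FeatureExtractor

extractor = FeatureExtractor(device="cpu", feature_size=80)
waveform = torch.zeros(16000)                 # one second of silence at 16 kHz
features = extractor(waveform, padding=True)  # padding appends a full 30 s chunk
print(features.shape)                         # (n_mels, n_frames), here 80 x ~3100
```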
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/tokenizer.py DELETED
@@ -1,314 +0,0 @@
1
- import string
2
-
3
- from functools import cached_property
4
- from typing import List, Optional, Tuple
5
-
6
- import tokenizers
7
-
8
-
9
- class Tokenizer:
10
- """Simple wrapper around a tokenizers.Tokenizer."""
11
-
12
- def __init__(
13
- self,
14
- tokenizer: tokenizers.Tokenizer,
15
- multilingual: bool,
16
- task: Optional[str] = None,
17
- language: Optional[str] = None,
18
- ):
19
- self.tokenizer = tokenizer
20
-
21
- if multilingual:
22
- if task not in _TASKS:
23
- raise ValueError(
24
- "'%s' is not a valid task (accepted tasks: %s)"
25
- % (task, ", ".join(_TASKS))
26
- )
27
-
28
- if language not in _LANGUAGE_CODES:
29
- raise ValueError(
30
- "'%s' is not a valid language code (accepted language codes: %s)"
31
- % (language, ", ".join(_LANGUAGE_CODES))
32
- )
33
-
34
- self.task = self.tokenizer.token_to_id("<|%s|>" % task)
35
- self.language = self.tokenizer.token_to_id("<|%s|>" % language)
36
- self.language_code = language
37
- else:
38
- self.task = None
39
- self.language = None
40
- self.language_code = "en"
41
-
42
- @cached_property
43
- def transcribe(self) -> int:
44
- return self.tokenizer.token_to_id("<|transcribe|>")
45
-
46
- @cached_property
47
- def translate(self) -> int:
48
- return self.tokenizer.token_to_id("<|translate|>")
49
-
50
- @cached_property
51
- def sot(self) -> int:
52
- return self.tokenizer.token_to_id("<|startoftranscript|>")
53
-
54
- @cached_property
55
- def sot_lm(self) -> int:
56
- return self.tokenizer.token_to_id("<|startoflm|>")
57
-
58
- @cached_property
59
- def sot_prev(self) -> int:
60
- return self.tokenizer.token_to_id("<|startofprev|>")
61
-
62
- @cached_property
63
- def eot(self) -> int:
64
- return self.tokenizer.token_to_id("<|endoftext|>")
65
-
66
- @cached_property
67
- def no_timestamps(self) -> int:
68
- return self.tokenizer.token_to_id("<|notimestamps|>")
69
-
70
- @property
71
- def timestamp_begin(self) -> int:
72
- return self.no_timestamps + 1
73
-
74
- @property
75
- def sot_sequence(self) -> List[int]:
76
- sequence = [self.sot]
77
-
78
- if self.language is not None:
79
- sequence.append(self.language)
80
-
81
- if self.task is not None:
82
- sequence.append(self.task)
83
-
84
- return sequence
85
-
86
- def encode(self, text: str) -> List[int]:
87
- return self.tokenizer.encode(text, add_special_tokens=False).ids
88
-
89
- def decode(self, tokens: List[int]) -> str:
90
- text_tokens = [token for token in tokens if token < self.eot]
91
- return self.tokenizer.decode(text_tokens)
92
-
93
- def decode_with_timestamps(self, tokens: List[int]) -> str:
94
- outputs = [[]]
95
-
96
- for token in tokens:
97
- if token >= self.timestamp_begin:
98
- timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
99
- outputs.append(timestamp)
100
- outputs.append([])
101
- else:
102
- outputs[-1].append(token)
103
-
104
- return "".join(
105
- [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
106
- )
107
-
108
- @cached_property
109
- def non_speech_tokens(self) -> Tuple[int]:
110
- """
111
- Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
112
- annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
113
-
114
- - ♪♪♪
115
- - ( SPEAKING FOREIGN LANGUAGE )
116
- - [DAVID] Hey there,
117
-
118
-                while keeping basic punctuation like commas, periods, question marks, exclamation points, etc.
119
- """
120
- symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
121
- symbols += (
122
- "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
123
- )
124
-
125
- # symbols that may be a single token or multiple tokens depending on the tokenizer.
126
- # In case they're multiple tokens, suppress the first token, which is safe because:
127
- # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
128
- # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
129
- miscellaneous = set("♩♪♫♬♭♮♯")
130
- assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
131
-
132
- # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
133
- result = {self.encode(" -")[0], self.encode(" '")[0]}
134
- for symbol in symbols + list(miscellaneous):
135
- for tokens in [
136
- self.encode(symbol),
137
- self.encode(" " + symbol),
138
- ]:
139
- if len(tokens) == 1 or symbol in miscellaneous:
140
- result.add(tokens[0])
141
-
142
- return tuple(sorted(result))
143
-
144
- def split_to_word_tokens(
145
- self, tokens: List[int]
146
- ) -> Tuple[List[str], List[List[int]]]:
147
- if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
148
- # These languages don't typically use spaces, so it is difficult to split words
149
- # without morpheme analysis. Here, we instead split words at any
150
- # position where the tokens are decoded as valid unicode points
151
- return self.split_tokens_on_unicode(tokens)
152
-
153
- return self.split_tokens_on_spaces(tokens)
154
-
155
- def split_tokens_on_unicode(
156
- self, tokens: List[int]
157
- ) -> Tuple[List[str], List[List[int]]]:
158
- decoded_full = self.decode_with_timestamps(tokens)
159
- replacement_char = "\ufffd"
160
-
161
- words = []
162
- word_tokens = []
163
- current_tokens = []
164
- unicode_offset = 0
165
-
166
- for token in tokens:
167
- current_tokens.append(token)
168
- decoded = self.decode_with_timestamps(current_tokens)
169
-
170
- try:
171
- replacement_char_index = decoded.index(replacement_char)
172
- replacement_char_index += unicode_offset
173
- except ValueError:
174
- replacement_char_index = None
175
-
176
- if replacement_char_index is None or (
177
- replacement_char_index < len(decoded_full)
178
- and decoded_full[replacement_char_index] == replacement_char
179
- ):
180
- words.append(decoded)
181
- word_tokens.append(current_tokens)
182
- current_tokens = []
183
- unicode_offset += len(decoded)
184
-
185
- return words, word_tokens
186
-
187
- def split_tokens_on_spaces(
188
- self, tokens: List[int]
189
- ) -> Tuple[List[str], List[List[int]]]:
190
- subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
191
- words = []
192
- word_tokens = []
193
-
194
- for subword, subword_tokens in zip(subwords, subword_tokens_list):
195
- special = subword_tokens[0] >= self.eot
196
- with_space = subword.startswith(" ")
197
- punctuation = subword.strip() in string.punctuation
198
- if special or with_space or punctuation or len(words) == 0:
199
- words.append(subword)
200
- word_tokens.append(subword_tokens)
201
- else:
202
- words[-1] = words[-1] + subword
203
- word_tokens[-1].extend(subword_tokens)
204
-
205
- return words, word_tokens
206
-
207
-
208
- _TASKS = (
209
- "transcribe",
210
- "translate",
211
- )
212
-
213
- _LANGUAGE_CODES = (
214
- "af",
215
- "am",
216
- "ar",
217
- "as",
218
- "az",
219
- "ba",
220
- "be",
221
- "bg",
222
- "bn",
223
- "bo",
224
- "br",
225
- "bs",
226
- "ca",
227
- "cs",
228
- "cy",
229
- "da",
230
- "de",
231
- "el",
232
- "en",
233
- "es",
234
- "et",
235
- "eu",
236
- "fa",
237
- "fi",
238
- "fo",
239
- "fr",
240
- "gl",
241
- "gu",
242
- "ha",
243
- "haw",
244
- "he",
245
- "hi",
246
- "hr",
247
- "ht",
248
- "hu",
249
- "hy",
250
- "id",
251
- "is",
252
- "it",
253
- "ja",
254
- "jw",
255
- "ka",
256
- "kk",
257
- "km",
258
- "kn",
259
- "ko",
260
- "la",
261
- "lb",
262
- "ln",
263
- "lo",
264
- "lt",
265
- "lv",
266
- "mg",
267
- "mi",
268
- "mk",
269
- "ml",
270
- "mn",
271
- "mr",
272
- "ms",
273
- "mt",
274
- "my",
275
- "ne",
276
- "nl",
277
- "nn",
278
- "no",
279
- "oc",
280
- "pa",
281
- "pl",
282
- "ps",
283
- "pt",
284
- "ro",
285
- "ru",
286
- "sa",
287
- "sd",
288
- "si",
289
- "sk",
290
- "sl",
291
- "sn",
292
- "so",
293
- "sq",
294
- "sr",
295
- "su",
296
- "sv",
297
- "sw",
298
- "ta",
299
- "te",
300
- "tg",
301
- "th",
302
- "tk",
303
- "tl",
304
- "tr",
305
- "tt",
306
- "uk",
307
- "ur",
308
- "uz",
309
- "vi",
310
- "yi",
311
- "yo",
312
- "zh",
313
- "yue",
314
- )
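For reference, the `decode_with_timestamps` logic above maps any token at or above `timestamp_begin` to a time marker in 0.02 s steps. A standalone sketch of that arithmetic (the concrete token IDs below are assumed values for the multilingual vocabulary, not taken from this diff):

```python
# Assumed IDs for the multilingual Whisper vocabulary; only the arithmetic matters here.
no_timestamps = 50363
timestamp_begin = no_timestamps + 1   # as defined by the `timestamp_begin` property above

def render_token(token: int) -> str:
    # Timestamp tokens become <|seconds|> markers; everything else is a text token.
    if token >= timestamp_begin:
        return f"<|{(token - timestamp_begin) * 0.02:.2f}|>"
    return f"[text token {token}]"

print(render_token(timestamp_begin))        # <|0.00|>
print(render_token(timestamp_begin + 150))  # <|3.00|>
```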
 
 
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/transcribe.py DELETED
@@ -1,2170 +0,0 @@
1
- import itertools
2
- import json
3
- import logging
4
- import os
5
- import random
6
- import zlib
7
-
8
- from collections import Counter, defaultdict
9
- from inspect import signature
10
- from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
11
-
12
- import ctranslate2
13
- import numpy as np
14
- import tokenizers
15
- import torch
16
-
17
- from pyannote.audio import Model
18
- from tqdm import tqdm
19
-
20
- from faster_whisper.audio import decode_audio, pad_or_trim
21
- from faster_whisper.feature_extractor import FeatureExtractor
22
- from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
23
- from faster_whisper.utils import (
24
- download_model,
25
- format_timestamp,
26
- get_assets_path,
27
- get_end,
28
- get_logger,
29
- )
30
- from faster_whisper.vad import (
31
- SpeechTimestampsMap,
32
- VadOptions,
33
- VoiceActivitySegmentation,
34
- collect_chunks,
35
- get_speech_timestamps,
36
- merge_chunks,
37
- )
38
-
39
-
40
- class Word(NamedTuple):
41
- start: float
42
- end: float
43
- word: str
44
- probability: float
45
-
46
-
47
- class Segment(NamedTuple):
48
- id: int
49
- seek: int
50
- start: float
51
- end: float
52
- text: str
53
- tokens: List[int]
54
- avg_logprob: float
55
- compression_ratio: float
56
- no_speech_prob: float
57
- words: Optional[List[Word]]
58
- temperature: Optional[float] = 1.0
59
-
60
-
61
- # Added additional parameters for multilingual videos and fixes below
62
- class TranscriptionOptions(NamedTuple):
63
- beam_size: int
64
- best_of: int
65
- patience: float
66
- length_penalty: float
67
- repetition_penalty: float
68
- no_repeat_ngram_size: int
69
- log_prob_threshold: Optional[float]
70
- log_prob_low_threshold: Optional[float]
71
- no_speech_threshold: Optional[float]
72
- compression_ratio_threshold: Optional[float]
73
- condition_on_previous_text: bool
74
- prompt_reset_on_temperature: float
75
- temperatures: List[float]
76
- initial_prompt: Optional[Union[str, Iterable[int]]]
77
- prefix: Optional[str]
78
- suppress_blank: bool
79
- suppress_tokens: Optional[List[int]]
80
- without_timestamps: bool
81
- max_initial_timestamp: float
82
- word_timestamps: bool
83
- prepend_punctuations: str
84
- append_punctuations: str
85
- multilingual: bool
86
- output_language: Optional[str]
87
- max_new_tokens: Optional[int]
88
- clip_timestamps: Union[str, List[float]]
89
- hallucination_silence_threshold: Optional[float]
90
- hotwords: Optional[str]
91
-
92
-
93
- class TranscriptionInfo(NamedTuple):
94
- language: str
95
- language_probability: float
96
- duration: float
97
- duration_after_vad: float
98
- all_language_probs: Optional[List[Tuple[str, float]]]
99
- transcription_options: TranscriptionOptions
100
- vad_options: VadOptions
101
-
102
-
103
- # The code below is originally from HF pipeline and is used in whisper-x
104
- # (https://github.com/m-bain/whisperX) and adapted for faster_whisper
105
-
106
-
107
- class BatchedInferencePipeline:
108
- """
109
- Huggingface Pipeline wrapper for WhisperModel.
110
- Copyright (c) 2022, Max Bain
111
- All rights reserved.
112
- Modified by Mobius Labs GmbH
113
- """
114
-
115
- def __init__(
116
- self,
117
- model,
118
- use_vad_model: bool = True,
119
- options: Optional[NamedTuple] = None,
120
- tokenizer=None,
121
- chunk_length: int = 30,
122
- vad_device: Union[int, str, "torch.device"] = "auto",
123
- vad_onset: float = 0.500,
124
- vad_offset: float = 0.363,
125
- language: Optional[str] = None,
126
- ):
127
- self.model: WhisperModel = model
128
- self.tokenizer = tokenizer
129
- self.options = options
130
- self.preset_language = language
131
- self.use_vad_model = use_vad_model
132
- self.vad_onset = vad_onset
133
- self.vad_offset = vad_offset
134
- self.vad_model_path = os.path.join(get_assets_path(), "pyannote_vad_model.bin")
135
- if self.use_vad_model:
136
- self.vad_device = self.get_device(vad_device)
137
- self.vad_model = self.load_vad_model(
138
- vad_onset=self.vad_onset, vad_offset=self.vad_offset
139
- )
140
- else:
141
- self.vad_model = None
142
- self.chunk_length = chunk_length # VAD merging size
143
- self.last_speech_timestamp = 0.0
144
-
145
- def get_device(self, device: Union[int, str, "torch.device"]):
146
- """
147
- Converts the input device into a torch.device object.
148
-
149
- The input can be an integer, a string, or a `torch.device` object.
150
-
151
- The function handles a special case where the input device is "auto".
152
- When "auto" is specified, the device will default to the
153
- device of the model (self.model.device). If the model's device is also "auto",
154
- it selects "cuda" if a CUDA-capable device is available; otherwise, it selects "cpu".
155
- """
156
- if isinstance(device, torch.device):
157
- return device
158
- elif isinstance(device, str):
159
- if device == "auto" and self.model.device == "auto":
160
- device = "cuda" if torch.cuda.is_available() else "cpu"
161
- elif device == "auto":
162
- device = self.model.device
163
- return torch.device(device)
164
- elif device < 0:
165
- return torch.device("cpu")
166
- else:
167
- return torch.device(f"cuda:{device}")
168
-
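The resolution rules documented in `get_device` above can be restated as a standalone helper; this is a sketch of the described behaviour, not a drop-in replacement for the method:

```python
import torch

def resolve_device(device, model_device="auto") -> torch.device:
    """Standalone restatement of the rules described in get_device above."""
    if isinstance(device, torch.device):
        return device
    if isinstance(device, str):
        if device == "auto" and model_device == "auto":
            return torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if device == "auto":
            return torch.device(model_device)
        return torch.device(device)
    # Integer index: negative means CPU, otherwise a specific CUDA device.
    return torch.device("cpu") if device < 0 else torch.device(f"cuda:{device}")
```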
169
- def forward(self, features, segments_metadata, **forward_params):
170
- encoder_output, outputs = self.model.generate_segment_batched(
171
- features, self.tokenizer, forward_params
172
- )
173
-
174
- segmented_outputs = []
175
- segment_sizes = []
176
- for segment_metadata, output in zip(segments_metadata, outputs):
177
- duration = segment_metadata["end_time"] - segment_metadata["start_time"]
178
- segment_size = int(duration * self.model.frames_per_second)
179
- segment_sizes.append(segment_size)
180
- (
181
- subsegments,
182
- seek,
183
- single_timestamp_ending,
184
- ) = self.model._split_segments_by_timestamps(
185
- tokenizer=self.tokenizer,
186
- tokens=output["tokens"],
187
- time_offset=segment_metadata["start_time"],
188
- segment_size=segment_size,
189
- segment_duration=duration,
190
- seek=0,
191
- )
192
- segmented_outputs.append(
193
- [
194
- dict(
195
- text=self.tokenizer.decode(subsegment["tokens"]),
196
- avg_logprob=output["avg_logprob"],
197
- no_speech_prob=output["no_speech_prob"],
198
- tokens=subsegment["tokens"],
199
- start=subsegment["start"],
200
- end=subsegment["end"],
201
- compression_ratio=get_compression_ratio(
202
- self.tokenizer.decode(subsegment["tokens"])
203
- ),
204
- )
205
- for subsegment in subsegments
206
- ]
207
- )
208
- if forward_params["word_timestamps"]:
209
- self.last_speech_timestamp = self.model.add_word_timestamps(
210
- segmented_outputs,
211
- self.tokenizer,
212
- encoder_output,
213
- segment_sizes,
214
- forward_params["prepend_punctuations"],
215
- forward_params["append_punctuations"],
216
- self.last_speech_timestamp,
217
- )
218
-
219
- return segmented_outputs
220
-
221
- def get_language_and_tokenizer(
222
- self, audio, task: Optional[str] = None, language: Optional[str] = None
223
- ):
224
- all_language_probs = None
225
- language_probability = 1.0
226
-
227
- if self.tokenizer is None:
228
- if not language:
229
- (
230
- language,
231
- language_probability,
232
- all_language_probs,
233
- ) = self.model.detect_language(audio)
234
- task = task or "transcribe"
235
- self.tokenizer = Tokenizer(
236
- self.model.hf_tokenizer,
237
- self.model.model.is_multilingual,
238
- task=task,
239
- language=language,
240
- )
241
- else:
242
- if task is not None:
243
- self.tokenizer.task = self.tokenizer.tokenizer.token_to_id(
244
- f"<|{task}|>"
245
- )
246
-
247
- if language is not None:
248
- self.tokenizer.language = self.tokenizer.tokenizer.token_to_id(
249
- f"<|{language}|>"
250
- )
251
- self.tokenizer.language_code = language
252
-
253
- return language, language_probability, task, all_language_probs
254
-
255
- @staticmethod
256
- def audio_split(audio, segments, sampling_rate):
257
-        """Returns the split audio chunks and their metadata as lists."""
258
- audio_segments = []
259
- segments_metadata = []
260
- for seg in segments:
261
- f1 = int(seg["start"] * sampling_rate)
262
- f2 = int(seg["end"] * sampling_rate)
263
- seg_metadata = {
264
- "start_time": seg["start"],
265
- "end_time": seg["end"],
266
- "stitched_seg": seg["segments"],
267
- }
268
- audio_segments.append(audio[f1:f2])
269
- segments_metadata.append(seg_metadata)
270
- return audio_segments, segments_metadata
271
-
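The seconds-to-samples conversion used by `audio_split` above is worth spelling out. A tiny self-contained sketch (the waveform here is a silent stand-in, not real audio):

```python
import torch

sampling_rate = 16000
audio = torch.zeros(10 * sampling_rate)     # 10 s stand-in waveform at 16 kHz
seg = {"start": 1.25, "end": 3.50}          # one merged VAD segment, in seconds
f1 = int(seg["start"] * sampling_rate)      # 20000
f2 = int(seg["end"] * sampling_rate)        # 56000
chunk = audio[f1:f2]                        # 2.25 s of samples
print(chunk.shape)                          # torch.Size([36000])
```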
272
- def load_vad_model(self, vad_onset=0.500, vad_offset=0.363):
273
- vad_model = Model.from_pretrained(self.vad_model_path)
274
- hyperparameters = {
275
- "onset": vad_onset,
276
- "offset": vad_offset,
277
- "min_duration_on": 0.1,
278
- "min_duration_off": 0.1,
279
- }
280
-
281
- vad_pipeline = VoiceActivitySegmentation(
282
- segmentation=vad_model, device=torch.device(self.vad_device)
283
- )
284
- vad_pipeline.instantiate(hyperparameters)
285
- return vad_pipeline
286
-
287
- def transcribe(
288
- self,
289
- audio: Union[str, torch.Tensor, np.ndarray],
290
- vad_segments: Optional[List[dict]] = None,
291
- batch_size: int = 16,
292
- language: Optional[str] = None,
293
- task: str = None,
294
- log_progress: bool = False,
295
- beam_size: int = 5,
296
- best_of: int = 5,
297
- patience: float = 1,
298
- length_penalty: float = 1,
299
- repetition_penalty: float = 1,
300
- no_repeat_ngram_size: int = 0,
301
- temperature: Union[float, List[float], Tuple[float, ...]] = [
302
- 0.0,
303
- 0.2,
304
- 0.4,
305
- 0.6,
306
- 0.8,
307
- 1.0,
308
- ],
309
- compression_ratio_threshold: Optional[float] = 2.4,
310
- log_prob_threshold: Optional[float] = -1.0,
311
- log_prob_low_threshold: Optional[float] = None,
312
- no_speech_threshold: Optional[float] = 0.6,
313
- initial_prompt: Optional[Union[str, Iterable[int]]] = None,
314
- prefix: Optional[str] = None,
315
- suppress_blank: bool = True,
316
- suppress_tokens: Optional[List[int]] = [-1],
317
- prepend_punctuations: str = "\"'“¿([{-",
318
- append_punctuations: str = "\"'.。,,!!??::”)]}、",
319
- max_new_tokens: Optional[int] = None,
320
- hotwords: Optional[str] = None,
321
- word_timestamps: bool = False,
322
- without_timestamps: bool = True,
323
- ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
324
-        """Transcribe audio in chunks in a batched fashion and return the segments with language info.
325
-
326
- Arguments:
327
- audio: audio file as numpy array/path for batched transcription.
328
- vad_segments: Optionally provide list of dictionaries each containing "start", "end",
329
- and "segments" keys.
330
- "start" and "end" keys specify the start and end of the voiced region within
331
-                a 30 sec boundary. An additional key "segments" contains all the start
332
- and end of voiced regions within that 30sec boundary as a list of tuples.
333
-                If no vad_segments are specified, the internal VAD model is used to segment the audio automatically.
334
-            batch_size: the maximum number of parallel requests to the model for decoding.
335
- language: The language spoken in the audio.
336
- task: either "transcribe" or "translate".
337
-            log_progress: whether to show a progress bar.
338
- beam_size: Beam size to use for decoding.
339
- best_of: Number of candidates when sampling with non-zero temperature.
340
- patience: Beam search patience factor.
341
- length_penalty: Exponential length penalty constant.
342
- repetition_penalty: Penalty applied to the score of previously generated tokens
343
- (set > 1 to penalize).
344
- no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable).
345
- temperature: Temperature for sampling. It can be a tuple of temperatures,
346
- which will be successively used upon failures according to either
347
- `compression_ratio_threshold` or `log_prob_threshold`.
348
- compression_ratio_threshold: If the gzip compression ratio is above this value,
349
- treat as failed.
350
- log_prob_threshold: If the average log probability over sampled tokens is
351
- below this value, treat as failed.
352
- log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
353
-                whereas log_prob_threshold also looks for an appropriate no_speech_threshold value.
354
- This value should be less than log_prob_threshold.
355
- no_speech_threshold: If the no_speech probability is higher than this value AND
356
- the average log probability over sampled tokens is below `log_prob_threshold`,
357
- consider the segment as silent.
358
- initial_prompt: Optional text string or iterable of token ids to provide as a
359
- prompt for the first window.
360
- prefix: Optional text to provide as a prefix for the first window.
361
- suppress_blank: Suppress blank outputs at the beginning of the sampling.
362
- suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
363
- of symbols as defined in `tokenizer.non_speech_tokens()`.
364
- prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
365
- with the next word
366
- append_punctuations: If word_timestamps is True, merge these punctuation symbols
367
- with the previous word
368
- max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
369
- the maximum will be set by the default max_length.
370
- hotwords:
371
-                Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
372
- word_timestamps: Extract word-level timestamps using the cross-attention pattern
373
- and dynamic time warping, and include the timestamps for each word in each segment.
374
- Set as False.
375
- without_timestamps: Only sample text tokens.
376
-
377
- Static params: (Fixed for batched version)
378
- max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
379
- multilingual: If True, perform transcription on multilingual videos. Set as False.
380
- output_language: Valid only if multilingual is set to True.
381
- Specifies the string representing the output language. One of
382
-                'en' (English) or 'hybrid' (code-switched transcription). Set as None.
383
- condition_on_previous_text: If True, the previous output of the model is provided
384
- as a prompt for the next window; disabling may make the text inconsistent across
385
- windows, but the model becomes less prone to getting stuck in a failure loop,
386
-                such as repetition looping or timestamps going out of sync. Set as False.
387
- prompt_reset_on_temperature: Resets prompt if temperature is above this value.
388
- Arg has effect only if condition_on_previous_text is True. Set at 0.5
389
- #TODO: support "hallucination_silence_threshold" when "word_timestamps=True"
390
- hallucination_silence_threshold: Optional[float]
391
- When word_timestamps is True, skip silent periods longer than this threshold
392
-                (in seconds) when a possible hallucination is detected. Set as None.
393
- clip_timestamps:
394
- Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
395
- process. The last end timestamp defaults to the end of the file. Set as "0".
396
-
397
- unused:
398
- language_detection_threshold: If the maximum probability of the language tokens is
399
- higher than this value, the language is detected.
400
- language_detection_segments: Number of segments to consider for the language detection.
401
- vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
402
- without speech. This step is using the Silero VAD model
403
- https://github.com/snakers4/silero-vad.
404
- vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
405
- parameters and default values in the class `VadOptions`).
406
- chunk_length: The length of audio segments. If it is not None, it will overwrite the
407
- default chunk_length of the FeatureExtractor.
408
-
409
-
410
- Returns:
411
- A tuple with:
412
-
413
- - a generator over transcribed batched segments.
414
- - an instance of TranscriptionInfo.
415
- """
416
-
417
- sampling_rate = self.model.feature_extractor.sampling_rate
418
-
419
- if isinstance(audio, np.ndarray):
420
- audio = torch.from_numpy(audio)
421
- elif not isinstance(audio, torch.Tensor):
422
- audio = decode_audio(audio, sampling_rate=sampling_rate)
423
- duration = audio.shape[0] / sampling_rate
424
-
425
- # if no segment split is provided, use vad_model and generate segments
426
- if not vad_segments:
427
-            # if the audio is shorter than the chunk length, it can be processed even without vad_segments
428
- if self.use_vad_model:
429
- vad_segments = self.vad_model(
430
- {
431
- "waveform": audio.unsqueeze(0),
432
- "sample_rate": 16000,
433
- }
434
- )
435
- vad_segments = merge_chunks(
436
- vad_segments,
437
- self.chunk_length,
438
- onset=self.vad_onset,
439
- offset=self.vad_offset,
440
- )
441
- elif duration < self.chunk_length:
442
- vad_segments = [
443
- {"start": 0.0, "end": duration, "segments": [(0.0, duration)]}
444
- ]
445
- else:
446
- raise RuntimeError(
447
- "No vad segments found. Set 'use_vad_model' to True while loading the model"
448
- )
449
- if self.model.model.is_multilingual:
450
- language = language or self.preset_language
451
- elif language != "en":
452
- if language is not None:
453
- self.model.logger.warning(
454
- f"English-only model is used, but {language} language is"
455
-                    " chosen, setting language to 'en'."
456
- )
457
- language = "en"
458
-
459
- (
460
- language,
461
- language_probability,
462
- task,
463
- all_language_probs,
464
- ) = self.get_language_and_tokenizer(audio, task, language)
465
-
466
- duration_after_vad = sum(
467
- segment["end"] - segment["start"] for segment in vad_segments
468
- )
469
-
470
- # batched options: see the difference with default options in WhisperModel
471
- batched_options = TranscriptionOptions(
472
- beam_size=beam_size,
473
- best_of=best_of,
474
- patience=patience,
475
- length_penalty=length_penalty,
476
- repetition_penalty=repetition_penalty,
477
- no_repeat_ngram_size=no_repeat_ngram_size,
478
- log_prob_threshold=log_prob_threshold,
479
- log_prob_low_threshold=log_prob_low_threshold,
480
- no_speech_threshold=no_speech_threshold,
481
- compression_ratio_threshold=compression_ratio_threshold,
482
- temperatures=(
483
- temperature if isinstance(temperature, (list, tuple)) else [temperature]
484
- ),
485
- initial_prompt=initial_prompt,
486
- prefix=prefix,
487
- suppress_blank=suppress_blank,
488
- suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens),
489
- prepend_punctuations=prepend_punctuations,
490
- append_punctuations=append_punctuations,
491
- max_new_tokens=max_new_tokens,
492
- hotwords=hotwords,
493
- word_timestamps=word_timestamps,
494
- hallucination_silence_threshold=None,
495
- condition_on_previous_text=False,
496
- clip_timestamps="0",
497
- prompt_reset_on_temperature=0.5,
498
- multilingual=False,
499
- output_language=None,
500
- without_timestamps=without_timestamps,
501
- max_initial_timestamp=0.0,
502
- )
503
-
504
- info = TranscriptionInfo(
505
- language=language,
506
- language_probability=language_probability,
507
- duration=duration,
508
- duration_after_vad=duration_after_vad,
509
- transcription_options=batched_options,
510
- vad_options=None,
511
- all_language_probs=all_language_probs,
512
- )
513
-
514
- audio_segments, segments_metadata = self.audio_split(
515
- audio, vad_segments, sampling_rate
516
- )
517
- to_cpu = (
518
- self.model.model.device == "cuda" and len(self.model.model.device_index) > 1
519
- )
520
- audio_segments = torch.nested.nested_tensor(audio_segments).to_padded_tensor(
521
- padding=0
522
- )
523
- features = torch.stack(
524
- [
525
- self.model.feature_extractor(audio_segment, to_cpu=to_cpu)[
526
- ..., : self.model.feature_extractor.nb_max_frames
527
- ]
528
- for audio_segment in audio_segments
529
- ]
530
- )
531
-
532
- segments = self._batched_segments_generator(
533
- features,
534
- segments_metadata,
535
- batch_size,
536
- batched_options,
537
- log_progress,
538
- )
539
-
540
- return segments, info
541
-
542
- def _batched_segments_generator(
543
- self, features, segments_metadata, batch_size, options, log_progress
544
- ):
545
- pbar = tqdm(total=len(features), disable=not log_progress, position=0)
546
- seg_idx = 0
547
- for i in range(0, len(features), batch_size):
548
- results = self.forward(
549
- features[i : i + batch_size],
550
- segments_metadata[i : i + batch_size],
551
- **options._asdict(),
552
- )
553
-
554
- for result in results:
555
- for segment in result:
556
- seg_idx += 1
557
- yield Segment(
558
- seek=int(result[-1]["end"] * self.model.frames_per_second),
559
- id=seg_idx,
560
- text=segment["text"],
561
- start=round(segment["start"], 3),
562
- end=round(segment["end"], 3),
563
- words=(
564
- None
565
- if not options.word_timestamps
566
- else [Word(**word) for word in segment["words"]]
567
- ),
568
- tokens=segment["tokens"],
569
- avg_logprob=segment["avg_logprob"],
570
- no_speech_prob=segment["no_speech_prob"],
571
- compression_ratio=segment["compression_ratio"],
572
- )
573
-
574
- pbar.update(1)
575
-
576
- pbar.close()
577
- # revert the tokenizer if multilingual inference is enabled
578
- if self.preset_language is None:
579
- self.tokenizer = None
580
- self.last_speech_timestamp = 0.0
581
-
582
-
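A hedged usage sketch for the batched pipeline defined above. The import path assumes this deleted `transcribe.py` module is importable; the model size, file name, and language code are illustrative values, not taken from this diff:

```python
# Illustrative only: assumes the module above is available as faster_whisper.transcribe
from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel

model = WhisperModel("large-v3", device="cuda", compute_type="float16")
pipeline = BatchedInferencePipeline(model, use_vad_model=True, language="vi")

# transcribe() returns a generator of Segment namedtuples plus a TranscriptionInfo
segments, info = pipeline.transcribe("audio.wav", batch_size=16)
print(info.language, round(info.duration, 2))
for seg in segments:
    print(f"[{seg.start:7.2f} -> {seg.end:7.2f}] {seg.text}")
```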
583
- class WhisperModel:
584
- def __init__(
585
- self,
586
- model_size_or_path: str,
587
- device: str = "auto",
588
- device_index: Union[int, List[int]] = 0,
589
- compute_type: str = "default",
590
- cpu_threads: int = 16,
591
- num_workers: int = 1,
592
- download_root: Optional[str] = None,
593
- local_files_only: bool = False,
594
- files: dict = None,
595
- **model_kwargs,
596
- ):
597
- """Initializes the Whisper model.
598
-
599
- Args:
600
- model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
601
- small, small.en, distil-small.en, medium, medium.en, distil-medium.en, large-v1,
602
- large-v2, large-v3, large, distil-large-v2 or distil-large-v3), a path to a
603
- converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
604
- When a size or a model ID is configured, the converted model is downloaded
605
- from the Hugging Face Hub.
606
- device: Device to use for computation ("cpu", "cuda", "auto").
607
- device_index: Device ID to use.
608
- The model can also be loaded on multiple GPUs by passing a list of IDs
609
- (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
610
- when transcribe() is called from multiple Python threads (see also num_workers).
611
- compute_type: Type to use for computation.
612
- See https://opennmt.net/CTranslate2/quantization.html.
613
- cpu_threads: Number of threads to use when running on CPU (4 by default).
614
- A non zero value overrides the OMP_NUM_THREADS environment variable.
615
- num_workers: When transcribe() is called from multiple Python threads,
616
- having multiple workers enables true parallelism when running the model
617
- (concurrent calls to self.model.generate() will run in parallel).
618
- This can improve the global throughput at the cost of increased memory usage.
619
- download_root: Directory where the models should be saved. If not set, the models
620
- are saved in the standard Hugging Face cache directory.
621
- local_files_only: If True, avoid downloading the file and return the path to the
622
- local cached file if it exists.
623
- files: Load model files from the memory. This argument is a dictionary mapping file names
624
- to file contents as file-like or bytes objects. If this is set, model_path acts as an
625
- identifier for this model.
626
- """
627
- self.logger = get_logger()
628
-
629
- tokenizer_bytes, preprocessor_bytes = None, None
630
- if files:
631
- model_path = model_size_or_path
632
- tokenizer_bytes = files.pop("tokenizer.json", None)
633
- preprocessor_bytes = files.pop("preprocessor_config.json", None)
634
- elif os.path.isdir(model_size_or_path):
635
- model_path = model_size_or_path
636
- else:
637
- model_path = download_model(
638
- model_size_or_path,
639
- local_files_only=local_files_only,
640
- cache_dir=download_root,
641
- )
642
- self.device = device
643
- # set the random seed to make sure consistency across runs
644
- ctranslate2.set_random_seed(42)
645
- self.model = ctranslate2.models.Whisper(
646
- model_path,
647
- device=self.device,
648
- device_index=device_index,
649
- compute_type=compute_type,
650
- intra_threads=cpu_threads,
651
- inter_threads=num_workers,
652
- files=files,
653
- **model_kwargs,
654
- )
655
-
656
- tokenizer_file = os.path.join(model_path, "tokenizer.json")
657
- if tokenizer_bytes:
658
- self.hf_tokenizer = tokenizers.Tokenizer.from_buffer(tokenizer_bytes)
659
- elif os.path.isfile(tokenizer_file):
660
- self.hf_tokenizer = tokenizers.Tokenizer.from_file(tokenizer_file)
661
- else:
662
- self.hf_tokenizer = tokenizers.Tokenizer.from_pretrained(
663
- "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
664
- )
665
- self.feat_kwargs = self._get_feature_kwargs(model_path, preprocessor_bytes)
666
- self.feature_extractor = FeatureExtractor(
667
- **self.feat_kwargs, device=self.device
668
- )
669
- self.input_stride = 2
670
- self.num_samples_per_token = (
671
- self.feature_extractor.hop_length * self.input_stride
672
- )
673
- self.frames_per_second = (
674
- self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
675
- )
676
- self.tokens_per_second = (
677
- self.feature_extractor.sampling_rate // self.num_samples_per_token
678
- )
679
- self.time_precision = 0.02
680
- self.max_length = 448
681
-
682
- @property
683
- def supported_languages(self) -> List[str]:
684
- """The languages supported by the model."""
685
- return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
686
-
687
- def _get_feature_kwargs(self, model_path, preprocessor_bytes=None) -> dict:
688
- config = {}
689
- try:
690
- config_path = os.path.join(model_path, "preprocessor_config.json")
691
- if preprocessor_bytes:
692
- config = json.loads(preprocessor_bytes)
693
- elif os.path.isfile(config_path):
694
- with open(config_path, "r", encoding="utf-8") as file:
695
- config = json.load(file)
696
- else:
697
- return config
698
- valid_keys = signature(FeatureExtractor.__init__).parameters.keys()
699
- return {k: v for k, v in config.items() if k in valid_keys}
700
- except json.JSONDecodeError as e:
701
- self.logger.warning("Could not load preprocessor config: %s", e)
702
-
703
- return config
704
-
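`_get_feature_kwargs` above filters the preprocessor config down to the parameters that `FeatureExtractor.__init__` actually accepts. The same pattern in isolation, as a generic sketch with made-up names:

```python
from inspect import signature

def filter_kwargs_for(func, config: dict) -> dict:
    # Keep only the keys that `func` declares as parameters, dropping everything else.
    valid_keys = signature(func).parameters.keys()
    return {k: v for k, v in config.items() if k in valid_keys}

def make_extractor(feature_size=80, hop_length=160):   # hypothetical constructor
    return feature_size, hop_length

cfg = {"feature_size": 128, "hop_length": 160, "processor_class": "WhisperProcessor"}
print(filter_kwargs_for(make_extractor, cfg))  # {'feature_size': 128, 'hop_length': 160}
```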
705
- def transcribe(
706
- self,
707
- audio: Union[str, BinaryIO, torch.Tensor, np.ndarray],
708
- language: Optional[str] = None,
709
- task: str = "transcribe",
710
- beam_size: int = 5,
711
- best_of: int = 5,
712
- patience: float = 1,
713
- length_penalty: float = 1,
714
- repetition_penalty: float = 1,
715
- no_repeat_ngram_size: int = 0,
716
- temperature: Union[float, List[float], Tuple[float, ...]] = [
717
- 0.0,
718
- 0.2,
719
- 0.4,
720
- 0.6,
721
- 0.8,
722
- 1.0,
723
- ],
724
- compression_ratio_threshold: Optional[float] = 2.4,
725
- log_prob_threshold: Optional[float] = -1.0,
726
- log_prob_low_threshold: Optional[float] = None,
727
- no_speech_threshold: Optional[float] = 0.6,
728
- condition_on_previous_text: bool = True,
729
- prompt_reset_on_temperature: float = 0.5,
730
- initial_prompt: Optional[Union[str, Iterable[int]]] = None,
731
- prefix: Optional[str] = None,
732
- suppress_blank: bool = True,
733
- suppress_tokens: Optional[List[int]] = [-1],
734
- without_timestamps: bool = False,
735
- max_initial_timestamp: float = 1.0,
736
- word_timestamps: bool = False,
737
- prepend_punctuations: str = "\"'“¿([{-",
738
- append_punctuations: str = "\"'.。,,!!??::”)]}、",
739
- multilingual: bool = False,
740
- output_language: Optional[str] = None,
741
- vad_filter: bool = False,
742
- vad_parameters: Optional[Union[dict, VadOptions]] = None,
743
- max_new_tokens: Optional[int] = None,
744
- chunk_length: Optional[int] = None,
745
- clip_timestamps: Union[str, List[float]] = "0",
746
- hallucination_silence_threshold: Optional[float] = None,
747
- hotwords: Optional[str] = None,
748
- language_detection_threshold: Optional[float] = None,
749
- language_detection_segments: int = 1,
750
- ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
751
- """Transcribes an input file.
752
-
753
- Arguments:
754
- audio: Path to the input file (or a file-like object), or the audio waveform.
755
- language: The language spoken in the audio. It should be a language code such
756
- as "en" or "fr". If not set, the language will be detected in the first 30 seconds
757
- of audio.
758
- task: Task to execute (transcribe or translate).
759
- beam_size: Beam size to use for decoding.
760
- best_of: Number of candidates when sampling with non-zero temperature.
761
- patience: Beam search patience factor.
762
- length_penalty: Exponential length penalty constant.
763
- repetition_penalty: Penalty applied to the score of previously generated tokens
764
- (set > 1 to penalize).
765
- no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable).
766
- temperature: Temperature for sampling. It can be a tuple of temperatures,
767
- which will be successively used upon failures according to either
768
- `compression_ratio_threshold` or `log_prob_threshold`.
769
- compression_ratio_threshold: If the gzip compression ratio is above this value,
770
- treat as failed.
771
- log_prob_threshold: If the average log probability over sampled tokens is
772
- below this value, treat as failed.
773
- log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
774
-                whereas log_prob_threshold also looks for an appropriate no_speech_threshold value.
775
- This value should be less than log_prob_threshold.
776
- no_speech_threshold: If the no_speech probability is higher than this value AND
777
- the average log probability over sampled tokens is below `log_prob_threshold`,
778
- consider the segment as silent.
779
- condition_on_previous_text: If True, the previous output of the model is provided
780
- as a prompt for the next window; disabling may make the text inconsistent across
781
- windows, but the model becomes less prone to getting stuck in a failure loop,
782
- such as repetition looping or timestamps going out of sync.
783
- prompt_reset_on_temperature: Resets prompt if temperature is above this value.
784
- Arg has effect only if condition_on_previous_text is True.
785
- initial_prompt: Optional text string or iterable of token ids to provide as a
786
- prompt for the first window.
787
- prefix: Optional text to provide as a prefix for the first window.
788
- suppress_blank: Suppress blank outputs at the beginning of the sampling.
789
- suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
790
- of symbols as defined in `tokenizer.non_speech_tokens()`.
791
- without_timestamps: Only sample text tokens.
792
- max_initial_timestamp: The initial timestamp cannot be later than this.
793
- word_timestamps: Extract word-level timestamps using the cross-attention pattern
794
- and dynamic time warping, and include the timestamps for each word in each segment.
795
- prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
796
- with the next word
797
- append_punctuations: If word_timestamps is True, merge these punctuation symbols
798
- with the previous word
799
- multilingual: If True, perform transcription on multilingual videos
800
- and return the transcript based
801
- on the 'output_language' flag.
802
- output_language: Valid only if multilingual is set to True.
803
- Specifies the string representing the output language. One of
804
- 'en' (English) or 'hybrid' (code-switched transcription).
805
- vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
806
- without speech. This step is using the Silero VAD model
807
- https://github.com/snakers4/silero-vad.
808
- vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
809
- parameters and default values in the class `VadOptions`).
810
- max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
811
- the maximum will be set by the default max_length.
812
- chunk_length: The length of audio segments. If it is not None, it will overwrite the
813
- default chunk_length of the FeatureExtractor.
814
- clip_timestamps:
815
- Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
816
- process. The last end timestamp defaults to the end of the file.
817
- vad_filter will be ignored if clip_timestamps is used.
818
- hallucination_silence_threshold:
819
- When word_timestamps is True, skip silent periods longer than this threshold
820
- (in seconds) when a possible hallucination is detected
821
- hotwords:
822
- Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
823
- language_detection_threshold: If the maximum probability of the language tokens is higher
824
- than this value, the language is detected.
825
- language_detection_segments: Number of segments to consider for the language detection.
826
- Returns:
827
- A tuple with:
828
-
829
- - a generator over transcribed segments
830
- - an instance of TranscriptionInfo
831
- """
832
-
833
- sampling_rate = self.feature_extractor.sampling_rate
834
-
835
- if isinstance(audio, np.ndarray):
836
- audio = torch.from_numpy(audio)
837
- elif not isinstance(audio, torch.Tensor):
838
- audio = decode_audio(audio, sampling_rate=sampling_rate)
839
-
840
- duration = audio.shape[0] / sampling_rate
841
- duration_after_vad = duration
842
-
843
- self.logger.info(
844
- "Processing audio with duration %s", format_timestamp(duration)
845
- )
846
-
847
- if vad_filter and clip_timestamps == "0":
848
- if vad_parameters is None:
849
- vad_parameters = VadOptions()
850
- elif isinstance(vad_parameters, dict):
851
- vad_parameters = VadOptions(**vad_parameters)
852
- speech_chunks = get_speech_timestamps(audio, vad_parameters)
853
- audio = collect_chunks(audio, speech_chunks)
854
- duration_after_vad = audio.shape[0] / sampling_rate
855
-
856
- self.logger.info(
857
- "VAD filter removed %s of audio",
858
- format_timestamp(duration - duration_after_vad),
859
- )
860
-
861
- if self.logger.isEnabledFor(logging.DEBUG):
862
- self.logger.debug(
863
- "VAD filter kept the following audio segments: %s",
864
- ", ".join(
865
- "[%s -> %s]"
866
- % (
867
- format_timestamp(chunk["start"] / sampling_rate),
868
- format_timestamp(chunk["end"] / sampling_rate),
869
- )
870
- for chunk in speech_chunks
871
- ),
872
- )
873
-
874
- else:
875
- speech_chunks = None
876
-
877
- to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
878
- features = self.feature_extractor(
879
- audio, chunk_length=chunk_length, to_cpu=to_cpu
880
- )
881
-
882
- encoder_output = None
883
- all_language_probs = None
884
-
885
- # setting output_language for multilingual videos
886
- if multilingual:
887
- if output_language is None:
888
- output_language = "en"
889
- elif output_language not in ["en", "hybrid"]:
890
- raise ValueError("Output language needs to be one of 'en'/'hybrid'.")
891
-
892
- # detecting the language if not provided
893
- if language is None:
894
- if not self.model.is_multilingual:
895
- language = "en"
896
- language_probability = 1
897
- else:
898
- if (
899
- language_detection_segments is None
900
- or language_detection_segments < 1
901
- ):
902
- language_detection_segments = 1
903
- start_timestamp = (
904
- float(clip_timestamps.split(",")[0])
905
- if isinstance(clip_timestamps, str)
906
- else clip_timestamps[0]
907
- )
908
- content_frames = (
909
- features.shape[-1] - self.feature_extractor.nb_max_frames
910
- )
911
- seek = (
912
- int(start_timestamp * self.frames_per_second)
913
- if start_timestamp * self.frames_per_second < content_frames
914
- else 0
915
- )
916
- end_frames = min(
917
- seek
918
- + self.feature_extractor.nb_max_frames
919
- * language_detection_segments,
920
- content_frames,
921
- )
922
- detected_language_info = {}
923
- while seek <= end_frames:
924
- segment = features[
925
- :, seek : seek + self.feature_extractor.nb_max_frames
926
- ]
927
- encoder_output = self.encode(segment)
928
- # results is a list of tuple[str, float] with language names and
929
- # probabilities.
930
- results = self.model.detect_language(encoder_output)[0]
931
- # Parse language names to strip out markers
932
- all_language_probs = [
933
- (token[2:-2], prob) for (token, prob) in results
934
- ]
935
- # Get top language token and probability
936
- language, language_probability = all_language_probs[0]
937
- if (
938
- language_detection_threshold is None
939
- or language_probability > language_detection_threshold
940
- ):
941
- break
942
- detected_language_info.setdefault(language, []).append(
943
- language_probability
944
- )
945
- seek += segment.shape[-1]
946
- else:
947
-                        # If no language is detected above the threshold for any segment, the majority
948
-                        # vote over each segment's top predicted language determines the final language.
949
- language = max(
950
- detected_language_info,
951
- key=lambda lang: len(detected_language_info[lang]),
952
- )
953
- language_probability = max(detected_language_info[language])
954
-
955
- self.logger.info(
956
- "Detected language '%s' with probability %.2f",
957
- language,
958
- language_probability,
959
- )
960
- else:
961
- if not self.model.is_multilingual and language != "en":
962
- self.logger.warning(
963
- "The current model is English-only but the language parameter is set to '%s'; "
964
- "using 'en' instead." % language
965
- )
966
- language = "en"
967
-
968
- language_probability = 1
969
-
970
- tokenizer = Tokenizer(
971
- self.hf_tokenizer,
972
- self.model.is_multilingual,
973
- task=task,
974
- language=language,
975
- )
976
-
977
- options = TranscriptionOptions(
978
- beam_size=beam_size,
979
- best_of=best_of,
980
- patience=patience,
981
- length_penalty=length_penalty,
982
- repetition_penalty=repetition_penalty,
983
- no_repeat_ngram_size=no_repeat_ngram_size,
984
- log_prob_threshold=log_prob_threshold,
985
- log_prob_low_threshold=log_prob_low_threshold,
986
- no_speech_threshold=no_speech_threshold,
987
- compression_ratio_threshold=compression_ratio_threshold,
988
- condition_on_previous_text=condition_on_previous_text,
989
- prompt_reset_on_temperature=prompt_reset_on_temperature,
990
- temperatures=(
991
- temperature if isinstance(temperature, (list, tuple)) else [temperature]
992
- ),
993
- initial_prompt=initial_prompt,
994
- prefix=prefix,
995
- suppress_blank=suppress_blank,
996
- suppress_tokens=(
997
- get_suppressed_tokens(tokenizer, suppress_tokens)
998
- if suppress_tokens
999
- else suppress_tokens
1000
- ),
1001
- without_timestamps=without_timestamps,
1002
- max_initial_timestamp=max_initial_timestamp,
1003
- word_timestamps=word_timestamps,
1004
- prepend_punctuations=prepend_punctuations,
1005
- append_punctuations=append_punctuations,
1006
- multilingual=multilingual,
1007
- output_language=output_language,
1008
- max_new_tokens=max_new_tokens,
1009
- clip_timestamps=clip_timestamps,
1010
- hallucination_silence_threshold=hallucination_silence_threshold,
1011
- hotwords=hotwords,
1012
- )
1013
-
1014
- segments = self.generate_segments(features, tokenizer, options, encoder_output)
1015
-
1016
- if speech_chunks:
1017
- segments = restore_speech_timestamps(segments, speech_chunks, sampling_rate)
1018
-
1019
- info = TranscriptionInfo(
1020
- language=language,
1021
- language_probability=language_probability,
1022
- duration=duration,
1023
- duration_after_vad=duration_after_vad,
1024
- transcription_options=options,
1025
- vad_options=vad_parameters,
1026
- all_language_probs=all_language_probs,
1027
- )
1028
- return segments, info
1029
-
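A short usage sketch for the sequential `transcribe` above, with the Silero VAD filter enabled. The import path assumes this deleted module; the model size, file name, language code, and VAD value are illustrative assumptions:

```python
from faster_whisper.transcribe import WhisperModel  # import path assumes this deleted module

model = WhisperModel("medium", device="cpu", compute_type="int8")
segments, info = model.transcribe(
    "audio.wav",
    language="vi",                                    # skip detection when the language is known
    vad_filter=True,                                  # drop non-speech before decoding
    vad_parameters={"min_silence_duration_ms": 500},  # assumed VadOptions field
    word_timestamps=True,
)
print(f"{info.language} (p={info.language_probability:.2f}), kept {info.duration_after_vad:.1f}s")
for seg in segments:
    print(f"[{seg.start:.2f} -> {seg.end:.2f}] {seg.text}")
```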
1030
- def _split_segments_by_timestamps(
1031
- self,
1032
- tokenizer: Tokenizer,
1033
- tokens: List[int],
1034
- time_offset: float,
1035
- segment_size: int,
1036
- segment_duration: float,
1037
- seek: int,
1038
- ) -> List[List[int]]:
1039
- current_segments = []
1040
- single_timestamp_ending = (
1041
- len(tokens) >= 2 and tokens[-2] < tokenizer.timestamp_begin <= tokens[-1]
1042
- )
1043
-
1044
- consecutive_timestamps = [
1045
- i
1046
- for i in range(len(tokens))
1047
- if i > 0
1048
- and tokens[i] >= tokenizer.timestamp_begin
1049
- and tokens[i - 1] >= tokenizer.timestamp_begin
1050
- ]
1051
-
1052
- if len(consecutive_timestamps) > 0:
1053
- slices = list(consecutive_timestamps)
1054
- if single_timestamp_ending:
1055
- slices.append(len(tokens))
1056
-
1057
- last_slice = 0
1058
- for current_slice in slices:
1059
- sliced_tokens = tokens[last_slice:current_slice]
1060
- start_timestamp_position = sliced_tokens[0] - tokenizer.timestamp_begin
1061
- end_timestamp_position = sliced_tokens[-1] - tokenizer.timestamp_begin
1062
- start_time = (
1063
- time_offset + start_timestamp_position * self.time_precision
1064
- )
1065
- end_time = time_offset + end_timestamp_position * self.time_precision
1066
-
1067
- current_segments.append(
1068
- dict(
1069
- seek=seek,
1070
- start=start_time,
1071
- end=end_time,
1072
- tokens=sliced_tokens,
1073
- )
1074
- )
1075
- last_slice = current_slice
1076
-
1077
- if single_timestamp_ending:
1078
- # single timestamp at the end means no speech after the last timestamp.
1079
- seek += segment_size
1080
- else:
1081
- # otherwise, ignore the unfinished segment and seek to the last timestamp
1082
- last_timestamp_position = (
1083
- tokens[last_slice - 1] - tokenizer.timestamp_begin
1084
- )
1085
- seek += last_timestamp_position * self.input_stride
1086
-
1087
- else:
1088
- duration = segment_duration
1089
- timestamps = [
1090
- token for token in tokens if token >= tokenizer.timestamp_begin
1091
- ]
1092
- if len(timestamps) > 0 and timestamps[-1] != tokenizer.timestamp_begin:
1093
- last_timestamp_position = timestamps[-1] - tokenizer.timestamp_begin
1094
- duration = last_timestamp_position * self.time_precision
1095
-
1096
- current_segments.append(
1097
- dict(
1098
- seek=seek,
1099
- start=time_offset,
1100
- end=time_offset + duration,
1101
- tokens=tokens,
1102
- )
1103
- )
1104
-
1105
- seek += segment_size
1106
-
1107
- return current_segments, seek, single_timestamp_ending
1108
-
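The split rule in `_split_segments_by_timestamps` above hinges on finding positions where two timestamp tokens appear back to back. A minimal illustration of that scan (token IDs are assumed values, as in the tokenizer sketch earlier):

```python
timestamp_begin = 50364                                  # assumed multilingual value
tokens = [50364, 510, 732, 50414, 50414, 981, 50464]     # <|0.00|> text <|1.00|><|1.00|> text <|2.00|>

consecutive = [
    i for i in range(1, len(tokens))
    if tokens[i] >= timestamp_begin and tokens[i - 1] >= timestamp_begin
]
print(consecutive)  # [4]: the window is sliced between the paired <|1.00|> tokens
```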
1109
- def generate_segments(
1110
- self,
1111
- features: torch.Tensor,
1112
- tokenizer: Tokenizer,
1113
- options: TranscriptionOptions,
1114
- encoder_output: Optional[ctranslate2.StorageView] = None,
1115
- ) -> Iterable[Segment]:
1116
- content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
1117
- content_duration = float(content_frames * self.feature_extractor.time_per_frame)
1118
-
1119
- if isinstance(options.clip_timestamps, str):
1120
- options = options._replace(
1121
- clip_timestamps=[
1122
- float(ts)
1123
- for ts in (
1124
- options.clip_timestamps.split(",")
1125
- if options.clip_timestamps
1126
- else []
1127
- )
1128
- ]
1129
- )
1130
- seek_points: List[int] = [
1131
- round(ts * self.frames_per_second) for ts in options.clip_timestamps
1132
- ]
1133
- if len(seek_points) == 0:
1134
- seek_points.append(0)
1135
- if len(seek_points) % 2 == 1:
1136
- seek_points.append(content_frames)
1137
- seek_clips: List[Tuple[int, int]] = list(
1138
- zip(seek_points[::2], seek_points[1::2])
1139
- )
1140
-
1141
- punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、"
1142
-
1143
- idx = 0
1144
- clip_idx = 0
1145
- seek = seek_clips[clip_idx][0]
1146
- all_tokens = []
1147
- prompt_reset_since = 0
1148
-
1149
- if options.initial_prompt is not None:
1150
- if isinstance(options.initial_prompt, str):
1151
- initial_prompt = " " + options.initial_prompt.strip()
1152
- initial_prompt_tokens = tokenizer.encode(initial_prompt)
1153
- all_tokens.extend(initial_prompt_tokens)
1154
- else:
1155
- all_tokens.extend(options.initial_prompt)
1156
-
1157
- last_speech_timestamp = 0.0
1158
- # NOTE: This loop is obscurely flattened to make the diff readable.
1159
- # A later commit should turn this into a simpler nested loop.
1160
- # for seek_clip_start, seek_clip_end in seek_clips:
1161
- # while seek < seek_clip_end
1162
- while clip_idx < len(seek_clips):
1163
- seek_clip_start, seek_clip_end = seek_clips[clip_idx]
1164
- if seek_clip_end > content_frames:
1165
- seek_clip_end = content_frames
1166
- if seek < seek_clip_start:
1167
- seek = seek_clip_start
1168
- if seek >= seek_clip_end:
1169
- clip_idx += 1
1170
- if clip_idx < len(seek_clips):
1171
- seek = seek_clips[clip_idx][0]
1172
- continue
1173
- time_offset = seek * self.feature_extractor.time_per_frame
1174
- window_end_time = float(
1175
- (seek + self.feature_extractor.nb_max_frames)
1176
- * self.feature_extractor.time_per_frame
1177
- )
1178
- segment_size = min(
1179
- self.feature_extractor.nb_max_frames,
1180
- content_frames - seek,
1181
- seek_clip_end - seek,
1182
- )
1183
- segment = features[:, seek : seek + segment_size]
1184
- segment_duration = segment_size * self.feature_extractor.time_per_frame
1185
- segment = pad_or_trim(segment, self.feature_extractor.nb_max_frames)
1186
-
1187
- if self.logger.isEnabledFor(logging.DEBUG):
1188
- self.logger.debug(
1189
- "Processing segment at %s", format_timestamp(time_offset)
1190
- )
1191
-
1192
- previous_tokens = all_tokens[prompt_reset_since:]
1193
-
1194
- if encoder_output is None:
1195
- encoder_output = self.encode(segment)
1196
-
1197
- # Perform language detection at every segment to update task based on output language,
1198
- # if the language is english, task is transcribe,
1199
- # else the task is translate to english (default)
1200
- # or transcribe if 'output_language' is 'hybrid'.
1201
- if options.multilingual:
1202
- results = self.model.detect_language(encoder_output)
1203
- language_token, language_probability = results[0][0]
1204
- language = language_token[2:-2]
1205
- if options.output_language == "en" and language != "en":
1206
- task = "translate"
1207
- else:
1208
- task = "transcribe"
1209
-
1210
- # Update tokenizer based on task and language
1211
- tokenizer.task = tokenizer.tokenizer.token_to_id(f"<|{task}|>")
1212
- tokenizer.language = tokenizer.tokenizer.token_to_id(language_token)
1213
- tokenizer.language_code = language
1214
- # Update prompt based on task and language
1215
- prompt = self.get_prompt(
1216
- tokenizer,
1217
- previous_tokens,
1218
- without_timestamps=options.without_timestamps,
1219
- prefix=options.prefix if seek == 0 else None,
1220
- hotwords=options.hotwords,
1221
- )
1222
-
1223
- if seek > 0 or encoder_output is None:
1224
- encoder_output = self.encode(segment)
1225
-
1226
- (
1227
- result,
1228
- avg_logprob,
1229
- temperature,
1230
- compression_ratio,
1231
- ) = self.generate_with_fallback(encoder_output, prompt, tokenizer, options)
1232
-
1233
- if options.no_speech_threshold is not None:
1234
-                # check whether the window is likely to contain no speech
1235
- should_skip = result.no_speech_prob > options.no_speech_threshold
1236
-
1237
- if (
1238
- options.log_prob_threshold is not None
1239
- and avg_logprob > options.log_prob_threshold
1240
- ):
1241
- # don't skip if the logprob is high enough, despite the no_speech_prob
1242
- should_skip = False
1243
-
1244
- if should_skip:
1245
- self.logger.debug(
1246
- "No speech threshold is met (%f > %f)",
1247
- result.no_speech_prob,
1248
- options.no_speech_threshold,
1249
- )
1250
-
1251
- # Skip if the logprob is very low (below the threshold value),
1252
- # despite no_speech_prob being low (ex: Too ambiguous outputs)
1253
- if options.log_prob_low_threshold:
1254
- if avg_logprob < options.log_prob_low_threshold:
1255
- should_skip = True
1256
-                        "log prob low threshold is met (%f < %f)",
1257
- "log prob low threshold is met (%f > %f)",
1258
- avg_logprob,
1259
- options.log_prob_low_threshold,
1260
- )
1261
-
1262
- if should_skip:
1263
- # fast-forward to the next segment boundary
1264
- seek += segment_size
1265
- continue
1266
-
1267
- tokens = result.sequences_ids[0]
1268
-
1269
- previous_seek = seek
1270
-
1271
- # anomalous words are very long/short/improbable
1272
- def word_anomaly_score(word: dict) -> float:
1273
- probability = word.get("probability", 0.0)
1274
- duration = word["end"] - word["start"]
1275
- score = 0.0
1276
- if probability < 0.15:
1277
- score += 1.0
1278
- if duration < 0.133:
1279
- score += (0.133 - duration) * 15
1280
- if duration > 2.0:
1281
- score += duration - 2.0
1282
- return score
1283
-
1284
- def is_segment_anomaly(segment: Optional[dict]) -> bool:
1285
- if segment is None or not segment["words"]:
1286
- return False
1287
- words = [w for w in segment["words"] if w["word"] not in punctuation]
1288
- words = words[:8]
1289
- score = sum(word_anomaly_score(w) for w in words)
1290
- return score >= 3 or score + 0.01 >= len(words)
1291
-
1292
- def next_words_segment(segments: List[dict]) -> Optional[dict]:
1293
- return next((s for s in segments if s["words"]), None)
1294
-
1295
- (
1296
- current_segments,
1297
- seek,
1298
- single_timestamp_ending,
1299
- ) = self._split_segments_by_timestamps(
1300
- tokenizer=tokenizer,
1301
- tokens=tokens,
1302
- time_offset=time_offset,
1303
- segment_size=segment_size,
1304
- segment_duration=segment_duration,
1305
- seek=seek,
1306
- )
1307
-
1308
- if options.word_timestamps:
1309
- self.add_word_timestamps(
1310
- [current_segments],
1311
- tokenizer,
1312
- encoder_output,
1313
- segment_size,
1314
- options.prepend_punctuations,
1315
- options.append_punctuations,
1316
- last_speech_timestamp=last_speech_timestamp,
1317
- )
1318
- if not single_timestamp_ending:
1319
- last_word_end = get_end(current_segments)
1320
- if last_word_end is not None and last_word_end > time_offset:
1321
- seek = round(last_word_end * self.frames_per_second)
1322
-
1323
- # skip silence before possible hallucinations
1324
- if options.hallucination_silence_threshold is not None:
1325
- threshold = options.hallucination_silence_threshold
1326
-
1327
- # if first segment might be a hallucination, skip leading silence
1328
- first_segment = next_words_segment(current_segments)
1329
- if first_segment is not None and is_segment_anomaly(first_segment):
1330
- gap = first_segment["start"] - time_offset
1331
- if gap > threshold:
1332
- seek = previous_seek + round(gap * self.frames_per_second)
1333
- continue
1334
-
1335
- # skip silence before any possible hallucination that is surrounded
1336
- # by silence or more hallucinations
1337
- hal_last_end = last_speech_timestamp
1338
- for si in range(len(current_segments)):
1339
- segment = current_segments[si]
1340
- if not segment["words"]:
1341
- continue
1342
- if is_segment_anomaly(segment):
1343
- next_segment = next_words_segment(
1344
- current_segments[si + 1 :]
1345
- )
1346
- if next_segment is not None:
1347
- hal_next_start = next_segment["words"][0]["start"]
1348
- else:
1349
- hal_next_start = time_offset + segment_duration
1350
- silence_before = (
1351
- segment["start"] - hal_last_end > threshold
1352
- or segment["start"] < threshold
1353
- or segment["start"] - time_offset < 2.0
1354
- )
1355
- silence_after = (
1356
- hal_next_start - segment["end"] > threshold
1357
- or is_segment_anomaly(next_segment)
1358
- or window_end_time - segment["end"] < 2.0
1359
- )
1360
- if silence_before and silence_after:
1361
- seek = round(
1362
- max(time_offset + 1, segment["start"])
1363
- * self.frames_per_second
1364
- )
1365
- if content_duration - segment["end"] < threshold:
1366
- seek = content_frames
1367
- current_segments[si:] = []
1368
- break
1369
- hal_last_end = segment["end"]
1370
-
1371
- last_word_end = get_end(current_segments)
1372
- if last_word_end is not None:
1373
- last_speech_timestamp = last_word_end
1374
- for segment in current_segments:
1375
- tokens = segment["tokens"]
1376
- text = tokenizer.decode(tokens)
1377
-
1378
- if segment["start"] == segment["end"] or not text.strip():
1379
- continue
1380
-
1381
- all_tokens.extend(tokens)
1382
- idx += 1
1383
-
1384
- yield Segment(
1385
- id=idx,
1386
- seek=seek,
1387
- start=segment["start"],
1388
- end=segment["end"],
1389
- text=text,
1390
- tokens=tokens,
1391
- temperature=temperature,
1392
- avg_logprob=avg_logprob,
1393
- compression_ratio=compression_ratio,
1394
- no_speech_prob=result.no_speech_prob,
1395
- words=(
1396
- [Word(**word) for word in segment["words"]]
1397
- if options.word_timestamps
1398
- else None
1399
- ),
1400
- )
1401
-
1402
- if (
1403
- not options.condition_on_previous_text
1404
- or temperature > options.prompt_reset_on_temperature
1405
- ):
1406
- if options.condition_on_previous_text:
1407
- self.logger.debug(
1408
- "Reset prompt. prompt_reset_on_temperature threshold is met %f > %f",
1409
- temperature,
1410
- options.prompt_reset_on_temperature,
1411
- )
1412
-
1413
- prompt_reset_since = len(all_tokens)
1414
-
1415
- def encode(self, features: torch.Tensor) -> ctranslate2.StorageView:
1416
- # When the model is running on multiple GPUs, the encoder output should be moved
1417
- # to the CPU since we don't know which GPU will handle the next job.
1418
- to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
1419
-
1420
- if features.ndim == 2:
1421
- features = features.unsqueeze(0)
1422
- features = get_ctranslate2_storage(features)
1423
-
1424
- return self.model.encode(features, to_cpu=to_cpu)
1425
-
1426
- def generate_with_fallback(
1427
- self,
1428
- encoder_output: ctranslate2.StorageView,
1429
- prompt: List[int],
1430
- tokenizer: Tokenizer,
1431
- options: TranscriptionOptions,
1432
- ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]:
1433
- decode_result = None
1434
- all_results = []
1435
- below_cr_threshold_results = []
1436
-
1437
- max_initial_timestamp_index = int(
1438
- round(options.max_initial_timestamp / self.time_precision)
1439
- )
1440
- if options.max_new_tokens is not None:
1441
- max_length = len(prompt) + options.max_new_tokens
1442
- else:
1443
- max_length = self.max_length
1444
-
1445
- if max_length > self.max_length:
1446
- raise ValueError(
1447
- f"The length of the prompt is {len(prompt)}, and the `max_new_tokens` "
1448
- f"{max_length - len(prompt)}. Thus, the combined length of the prompt "
1449
- f"and `max_new_tokens` is: {max_length}. This exceeds the "
1450
- f"`max_length` of the Whisper model: {self.max_length}. "
1451
- "You should either reduce the length of your prompt, or "
1452
- "reduce the value of `max_new_tokens`, "
1453
- f"so that their combined length is less that {self.max_length}."
1454
- )
1455
-
1456
- for temperature in options.temperatures:
1457
- if temperature > 0:
1458
- kwargs = {
1459
- "beam_size": 1,
1460
- "num_hypotheses": options.best_of,
1461
- "sampling_topk": 0,
1462
- "sampling_temperature": temperature,
1463
- }
1464
- else:
1465
- kwargs = {
1466
- "beam_size": options.beam_size,
1467
- "patience": options.patience,
1468
- }
1469
-
1470
- result = self.model.generate(
1471
- encoder_output,
1472
- [prompt],
1473
- length_penalty=options.length_penalty,
1474
- repetition_penalty=options.repetition_penalty,
1475
- no_repeat_ngram_size=options.no_repeat_ngram_size,
1476
- max_length=max_length,
1477
- return_scores=True,
1478
- return_no_speech_prob=True,
1479
- suppress_blank=options.suppress_blank,
1480
- suppress_tokens=options.suppress_tokens,
1481
- max_initial_timestamp_index=max_initial_timestamp_index,
1482
- **kwargs,
1483
- )[0]
1484
-
1485
- tokens = result.sequences_ids[0]
1486
-
1487
- # Recover the average log prob from the returned score.
1488
- seq_len = len(tokens)
1489
- cum_logprob = result.scores[0] * (seq_len**options.length_penalty)
1490
- avg_logprob = cum_logprob / (seq_len + 1)
1491
-
1492
- text = tokenizer.decode(tokens).strip()
1493
- compression_ratio = get_compression_ratio(text)
1494
-
1495
- decode_result = (
1496
- result,
1497
- avg_logprob,
1498
- temperature,
1499
- compression_ratio,
1500
- )
1501
- all_results.append(decode_result)
1502
-
1503
- needs_fallback = False
1504
-
1505
- if options.compression_ratio_threshold is not None:
1506
- if compression_ratio > options.compression_ratio_threshold:
1507
- needs_fallback = True # too repetitive
1508
-
1509
- self.logger.debug(
1510
- "Compression ratio threshold is not met with temperature %.1f (%f > %f)",
1511
- temperature,
1512
- compression_ratio,
1513
- options.compression_ratio_threshold,
1514
- )
1515
- else:
1516
- below_cr_threshold_results.append(decode_result)
1517
-
1518
- if (
1519
- options.log_prob_threshold is not None
1520
- and avg_logprob < options.log_prob_threshold
1521
- ):
1522
- needs_fallback = True # average log probability is too low
1523
-
1524
- self.logger.debug(
1525
- "Log probability threshold is not met with temperature %.1f (%f < %f)",
1526
- temperature,
1527
- avg_logprob,
1528
- options.log_prob_threshold,
1529
- )
1530
-
1531
- if (
1532
- options.no_speech_threshold is not None
1533
- and result.no_speech_prob > options.no_speech_threshold
1534
- and options.log_prob_threshold is not None
1535
- and avg_logprob < options.log_prob_threshold
1536
- ):
1537
- needs_fallback = False # silence
1538
-
1539
- if not needs_fallback:
1540
- break
1541
- else:
1542
- # all failed, select the result with the highest average log probability
1543
- decode_result = max(
1544
- below_cr_threshold_results or all_results, key=lambda x: x[1]
1545
- )
1546
- # to pass final temperature for prompt_reset_on_temperature
1547
- decode_result = (
1548
- decode_result[0],
1549
- decode_result[1],
1550
- temperature,
1551
- decode_result[3],
1552
- )
1553
-
1554
- return decode_result
1555
-
1556
- def get_prompt(
1557
- self,
1558
- tokenizer: Tokenizer,
1559
- previous_tokens: List[int],
1560
- without_timestamps: bool = False,
1561
- prefix: Optional[str] = None,
1562
- hotwords: Optional[str] = None,
1563
- ) -> List[int]:
1564
- prompt = []
1565
-
1566
- if previous_tokens or (hotwords and not prefix):
1567
- prompt.append(tokenizer.sot_prev)
1568
- if hotwords and not prefix:
1569
- hotwords_tokens = tokenizer.encode(" " + hotwords.strip())
1570
- if len(hotwords_tokens) >= self.max_length // 2:
1571
- hotwords_tokens = hotwords_tokens[: self.max_length // 2 - 1]
1572
- prompt.extend(hotwords_tokens)
1573
- if previous_tokens:
1574
- prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :])
1575
-
1576
- prompt.extend(tokenizer.sot_sequence)
1577
-
1578
- if without_timestamps:
1579
- prompt.append(tokenizer.no_timestamps)
1580
-
1581
- if prefix:
1582
- prefix_tokens = tokenizer.encode(" " + prefix.strip())
1583
- if len(prefix_tokens) >= self.max_length // 2:
1584
- prefix_tokens = prefix_tokens[: self.max_length // 2 - 1]
1585
- if not without_timestamps:
1586
- prompt.append(tokenizer.timestamp_begin)
1587
- prompt.extend(prefix_tokens)
1588
-
1589
- return prompt
1590
-
1591
- def add_word_timestamps(
1592
- self,
1593
- segments: List[dict],
1594
- tokenizer: Tokenizer,
1595
- encoder_output: ctranslate2.StorageView,
1596
- num_frames: int,
1597
- prepend_punctuations: str,
1598
- append_punctuations: str,
1599
- last_speech_timestamp: float,
1600
- ) -> float:
1601
- if len(segments) == 0:
1602
- return
1603
-
1604
- text_tokens = []
1605
- text_tokens_per_segment = []
1606
- for segment in segments:
1607
- segment_tokens = [
1608
- [token for token in subsegment["tokens"] if token < tokenizer.eot]
1609
- for subsegment in segment
1610
- ]
1611
- text_tokens.append(list(itertools.chain.from_iterable(segment_tokens)))
1612
- text_tokens_per_segment.append(segment_tokens)
1613
-
1614
- alignments = self.find_alignment(
1615
- tokenizer, text_tokens, encoder_output, num_frames
1616
- )
1617
- median_max_durations = []
1618
- for alignment in alignments:
1619
- word_durations = np.array(
1620
- [word["end"] - word["start"] for word in alignment]
1621
- )
1622
- word_durations = word_durations[word_durations.nonzero()]
1623
- median_duration = (
1624
- np.median(word_durations) if len(word_durations) > 0 else 0.0
1625
- )
1626
- median_duration = min(0.7, float(median_duration))
1627
- max_duration = median_duration * 2
1628
-
1629
- # hack: truncate long words at sentence boundaries.
1630
- # a better segmentation algorithm based on VAD should be able to replace this.
1631
- if len(word_durations) > 0:
1632
- sentence_end_marks = ".。!!??"
1633
- # ensure words at sentence boundaries
1634
- # are not longer than twice the median word duration.
1635
- for i in range(1, len(alignment)):
1636
- if alignment[i]["end"] - alignment[i]["start"] > max_duration:
1637
- if alignment[i]["word"] in sentence_end_marks:
1638
- alignment[i]["end"] = alignment[i]["start"] + max_duration
1639
- elif alignment[i - 1]["word"] in sentence_end_marks:
1640
- alignment[i]["start"] = alignment[i]["end"] - max_duration
1641
-
1642
- merge_punctuations(alignment, prepend_punctuations, append_punctuations)
1643
- median_max_durations.append((median_duration, max_duration))
1644
-
1645
- for segment_idx, segment in enumerate(segments):
1646
- word_index = 0
1647
- time_offset = segment[0]["start"]
1648
- median_duration, max_duration = median_max_durations[segment_idx]
1649
- for subsegment_idx, subsegment in enumerate(segment):
1650
- saved_tokens = 0
1651
- words = []
1652
-
1653
- while word_index < len(alignments[segment_idx]) and saved_tokens < len(
1654
- text_tokens_per_segment[segment_idx][subsegment_idx]
1655
- ):
1656
- timing = alignments[segment_idx][word_index]
1657
-
1658
- if timing["word"]:
1659
- words.append(
1660
- dict(
1661
- word=timing["word"],
1662
- start=round(time_offset + timing["start"], 2),
1663
- end=round(time_offset + timing["end"], 2),
1664
- probability=timing["probability"],
1665
- )
1666
- )
1667
-
1668
- saved_tokens += len(timing["tokens"])
1669
- word_index += 1
1670
-
1671
- # hack: truncate long words at segment boundaries.
1672
- # a better segmentation algorithm based on VAD should be able to replace this.
1673
- if len(words) > 0:
1674
- # ensure the first and second word after a pause is not longer than
1675
- # twice the median word duration.
1676
- if words[0][
1677
- "end"
1678
- ] - last_speech_timestamp > median_duration * 4 and (
1679
- words[0]["end"] - words[0]["start"] > max_duration
1680
- or (
1681
- len(words) > 1
1682
- and words[1]["end"] - words[0]["start"] > max_duration * 2
1683
- )
1684
- ):
1685
- if (
1686
- len(words) > 1
1687
- and words[1]["end"] - words[1]["start"] > max_duration
1688
- ):
1689
- boundary = max(
1690
- words[1]["end"] / 2, words[1]["end"] - max_duration
1691
- )
1692
- words[0]["end"] = words[1]["start"] = boundary
1693
- words[0]["start"] = max(0, words[0]["end"] - max_duration)
1694
-
1695
- # prefer the segment-level start timestamp if the first word is too long.
1696
- if (
1697
- subsegment["start"] < words[0]["end"]
1698
- and subsegment["start"] - 0.5 > words[0]["start"]
1699
- ):
1700
- words[0]["start"] = max(
1701
- 0,
1702
- min(words[0]["end"] - median_duration, subsegment["start"]),
1703
- )
1704
- else:
1705
- subsegment["start"] = words[0]["start"]
1706
-
1707
- # prefer the segment-level end timestamp if the last word is too long.
1708
- if (
1709
- subsegment["end"] > words[-1]["start"]
1710
- and subsegment["end"] + 0.5 < words[-1]["end"]
1711
- ):
1712
- words[-1]["end"] = max(
1713
- words[-1]["start"] + median_duration, subsegment["end"]
1714
- )
1715
- else:
1716
- subsegment["end"] = words[-1]["end"]
1717
-
1718
- last_speech_timestamp = subsegment["end"]
1719
- segments[segment_idx][subsegment_idx]["words"] = words
1720
- return last_speech_timestamp
1721
-
1722
- def find_alignment(
1723
- self,
1724
- tokenizer: Tokenizer,
1725
- text_tokens: List[int],
1726
- encoder_output: ctranslate2.StorageView,
1727
- num_frames: int,
1728
- median_filter_width: int = 7,
1729
- ) -> List[dict]:
1730
- if len(text_tokens) == 0:
1731
- return []
1732
-
1733
- results = self.model.align(
1734
- encoder_output,
1735
- tokenizer.sot_sequence,
1736
- text_tokens,
1737
- num_frames,
1738
- median_filter_width=median_filter_width,
1739
- )
1740
- return_list = []
1741
- for result, text_token in zip(results, text_tokens):
1742
- text_token_probs = result.text_token_probs
1743
- alignments = result.alignments
1744
- text_indices = np.array([pair[0] for pair in alignments])
1745
- time_indices = np.array([pair[1] for pair in alignments])
1746
-
1747
- words, word_tokens = tokenizer.split_to_word_tokens(
1748
- text_token + [tokenizer.eot]
1749
- )
1750
- if len(word_tokens) <= 1:
1751
- # return on eot only
1752
- # >>> np.pad([], (1, 0))
1753
- # array([0.])
1754
- # This results in crashes when we lookup jump_times with float, like
1755
- # IndexError: arrays used as indices must be of integer (or boolean) type
1756
- return []
1757
- word_boundaries = np.pad(
1758
- np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)
1759
- )
1760
- if len(word_boundaries) <= 1:
1761
- return []
1762
-
1763
- jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(
1764
- bool
1765
- )
1766
- jump_times = time_indices[jumps] / self.tokens_per_second
1767
- start_times = jump_times[word_boundaries[:-1]]
1768
- end_times = jump_times[word_boundaries[1:]]
1769
- word_probabilities = [
1770
- np.mean(text_token_probs[i:j])
1771
- for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
1772
- ]
1773
-
1774
- return_list.append(
1775
- [
1776
- dict(
1777
- word=word,
1778
- tokens=tokens,
1779
- start=start,
1780
- end=end,
1781
- probability=probability,
1782
- )
1783
- for word, tokens, start, end, probability in zip(
1784
- words, word_tokens, start_times, end_times, word_probabilities
1785
- )
1786
- ]
1787
- )
1788
- return return_list
1789
-
1790
- def generate_segment_batched(
1791
- self,
1792
- features: torch.Tensor,
1793
- tokenizer: Tokenizer,
1794
- options: dict,
1795
- ):
1796
- batch_size = features.shape[0]
1797
- all_tokens = []
1798
- prompt_reset_since = 0
1799
-
1800
- if options["initial_prompt"] is not None:
1801
- initial_prompt = " " + options["initial_prompt"].strip()
1802
- initial_prompt_tokens = tokenizer.encode(initial_prompt)
1803
- all_tokens.extend(initial_prompt_tokens)
1804
- previous_tokens = all_tokens[prompt_reset_since:]
1805
- prompt = self.get_prompt(
1806
- tokenizer,
1807
- previous_tokens,
1808
- without_timestamps=options["without_timestamps"],
1809
- prefix=options["prefix"],
1810
- )
1811
-
1812
- encoder_output = self.encode(features)
1813
-
1814
- result = self.model.generate(
1815
- encoder_output,
1816
- [prompt] * batch_size,
1817
- beam_size=options["beam_size"],
1818
- patience=options["patience"],
1819
- length_penalty=options["length_penalty"],
1820
- max_length=self.max_length,
1821
- suppress_blank=options["suppress_blank"],
1822
- suppress_tokens=options["suppress_tokens"],
1823
- return_scores=True,
1824
- return_no_speech_prob=True,
1825
- )
1826
-
1827
- output = []
1828
- for res in result:
1829
- output.append({})
1830
- # return scores
1831
- seq_len = len(res.sequences_ids[0])
1832
- cum_logprob = res.scores[0] * (seq_len ** options["length_penalty"])
1833
- output[-1]["avg_logprob"] = cum_logprob / (seq_len + 1)
1834
-
1835
- # return no speech prob
1836
- output[-1]["no_speech_prob"] = res.no_speech_prob
1837
- output[-1]["tokens"] = res.sequences_ids[0]
1838
-
1839
- return encoder_output, output
1840
-
1841
- def detect_language(self, audio: torch.Tensor):
1842
- to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
1843
- segment = self.feature_extractor(audio, padding=True, to_cpu=to_cpu)[
1844
- :, : self.feature_extractor.nb_max_frames
1845
- ]
1846
- encoder_output = self.encode(segment)
1847
- results = self.model.detect_language(encoder_output)
1848
- language_token, language_probability = results[0][0]
1849
- language = language_token[2:-2]
1850
- self.logger.info(
1851
- f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio..."
1852
- )
1853
- all_language_probs = [(token[2:-2], prob) for (token, prob) in results[0]]
1854
- return language, language_probability, all_language_probs
1855
-
1856
- def detect_language_multi_segment(
1857
- self, audio: Union[str, BinaryIO, torch.Tensor], params: Optional[dict] = None
1858
- ):
1859
- """
1860
- Detect language based on N highly-confident segments of a language.
1861
- """
1862
- # The threshold is used to decide if the audio is silence or not.
1863
- # The default is 0.02 (2%), i.e. if less than 2% of the audio contains speech,
1864
- # the audio is considered silence.
1865
- if not params:
1866
- params = {
1867
- "multilingual": False,
1868
- "speech_percentage_threshold": 0.02,
1869
- "language_detection_segments": 4,
1870
- "vad_filter": True,
1871
- "vad_min_silence_duration": 2500,
1872
- "language_threshold": 0.7,
1873
- }
1874
-
1875
- if params.get("multilingual", False):
1876
- logging.warning(
1877
- "lang_id is not supported for multilingual audios, detecting the major language."
1878
- )
1879
-
1880
- speech_percentage_threshold = params.get("speech_percentage_threshold", 0.02)
1881
- language_threshold = params.get("language_threshold", 0.7)
1882
- num_detection_segments = params.get("language_detection_segments", 4)
1883
- vad_filter_enabled = params.get("vad_filter", True)
1884
- vad_params = dict(
1885
- min_silence_duration_ms=params.get("vad_min_silence_duration", 2500)
1886
- )
1887
-
1888
- if vad_filter_enabled:
1889
- vad_params = VadOptions(**vad_params)
1890
-
1891
- # decode audio if it is not decoded already
1892
- sampling_rate = self.feature_extractor.sampling_rate
1893
- if not isinstance(audio, torch.Tensor):
1894
- audio: torch.Tensor = decode_audio(audio, sampling_rate=sampling_rate)
1895
-
1896
- # calculate duration of audio as number of seconds
1897
- # audio.shape[0] is the number of samples in the audio
1898
- # sampling_rate is the number of samples per second
1899
- # if we divide the number of samples by the number of samples per second,
1900
- # we get the duration in seconds
1901
- duration = audio.shape[0] / sampling_rate
1902
-
1903
- # Check if vad is enabled, and collect voiced segments
1904
- if vad_filter_enabled:
1905
- # get chunks of audio that contain speech
1906
- speech_chunks = get_speech_timestamps(audio, vad_params)
1907
- # merge chunks of audio that contain speech into a single array
1908
- audio = collect_chunks(audio, speech_chunks)
1909
-
1910
- # calculate new duration of audio without silence
1911
- duration_vad = audio.shape[0] / sampling_rate
1912
-
1913
- logging.debug(
1914
- f"Lang ID: VAD filter removed {duration - duration_vad} sec of audio"
1915
- )
1916
-
1917
- # if the audio after VAD is less than 2% of the original audio, consider it as silence
1918
- if duration_vad / duration < speech_percentage_threshold:
1919
- return {"language_code": None, "language_confidence": 1.0}
1920
-
1921
- # update duration to be the duration after VAD
1922
- duration = duration_vad
1923
-
1924
- # if the duration of the audio is less than 1 second, consider it as silence
1925
- if duration < 1.0:
1926
- return {"language_code": None, "language_confidence": 1.0}
1927
-
1928
- # number of feature frames in 30 seconds of audio is 3000
1929
- nb_max_frames = self.feature_extractor.nb_max_frames
1930
-
1931
- # extract features from audio with padding (default)
1932
- to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
1933
- features = self.feature_extractor(audio, to_cpu=to_cpu)
1934
-
1935
- # number of segments in the audio
1936
- num_segments = features.shape[-1] // nb_max_frames
1937
- # more number of segments than possible with the duration of file
1938
- if num_detection_segments > num_segments:
1939
- logging.warning(
1940
- f"Lang ID: Can not have more segments, setting {num_segments} segments."
1941
- )
1942
- num_detection_segments = num_segments
1943
-
1944
- # create a list of indices to randomly select segments from
1945
- indices = list(range(num_detection_segments))
1946
-
1947
- # fix seed to get deterministic results
1948
- random.seed(0)
1949
- random.shuffle(indices)
1950
-
1951
- detected_languages = []
1952
- all_language_probabilities = defaultdict(list)
1953
- confident_language_probabilities = defaultdict(list)
1954
- num_confident_segments_per_language = defaultdict(int)
1955
-
1956
- # Iterate over the randomly selected indices of the segments.
1957
- #
1958
- # For each segment, extract features and detect language.
1959
- #
1960
- # If the language is confident, add it to the list of confident segments for that language.
1961
- #
1962
- # If the number of confident segments for a language
1963
- # is greater than or equal to the number of detection segments,
1964
- # return the language and the average probability of the language.
1965
- #
1966
- # If we are unable to get a sufficient number of confident predictions,
1967
- # return the most frequently detected language with maximum probability.
1968
- #
1969
- # We need to get sufficient number of confident predictions per language, not in total.
1970
-
1971
- for i in indices:
1972
- segment_features = features[:, i * nb_max_frames : (i + 1) * nb_max_frames]
1973
- try:
1974
- encoder_output = self.encode(segment_features)
1975
- results = self.model.detect_language(encoder_output)[0]
1976
-
1977
- except ValueError as e: # or RuntimeError
1978
- logging.error(f"Inference error:{e}")
1979
-
1980
- # results is the list of classes (languages) and their probabilities (descending),
1981
- # for eg: [('<|de|>', 0.482177734375),('<|en|>', 0.283447265625),...]
1982
-
1983
- # take top language token and probability
1984
- # and parse language token to strip out markers
1985
- # for eg: '<|de|>' -> 'de'
1986
-
1987
- language_token = results[0][0]
1988
- language = language_token[2:-2]
1989
-
1990
- language_probability = results[0][1]
1991
-
1992
- detected_languages.append(language)
1993
- all_language_probabilities[language].append(language_probability)
1994
-
1995
- # only consider if the language prediction is confident
1996
- if language_probability > language_threshold:
1997
- num_confident_segments_per_language[language] += 1
1998
-
1999
- # Add language and probability to the list of languages when it is confident
2000
- confident_language_probabilities[language].append(language_probability)
2001
-
2002
- # return the language when sufficient number of confident segments is achieved
2003
- if (
2004
- num_confident_segments_per_language[language]
2005
- >= num_detection_segments
2006
- ):
2007
- # Considering the average probability of only confident segments
2008
- mean = sum(confident_language_probabilities[language]) / len(
2009
- confident_language_probabilities[language]
2010
- )
2011
- return {
2012
- "language_code": language,
2013
- "language_confidence": mean,
2014
- }
2015
-
2016
- # if we are unable to get sufficient number of confident predictions,
2017
- # return the most frequently detected language.
2018
- # if there is a tie, return the one with maximum average probability.
2019
- counter = Counter(detected_languages)
2020
-
2021
- # Define the key function to select frequent language with attached probabilities
2022
- def key_func(language):
2023
- # Calculate the frequency of the language
2024
- frequency = counter[language]
2025
-
2026
- # Calculate the average probability of the language
2027
- prob_avg = sum(all_language_probabilities[language]) / len(
2028
- all_language_probabilities[language]
2029
- )
2030
-
2031
- return frequency, prob_avg
2032
-
2033
- if detected_languages:
2034
- # Use the key function to find the language with maximum frequency and probability
2035
- max_language = max(detected_languages, key=key_func)
2036
- max_probability = sum(all_language_probabilities[max_language]) / len(
2037
- all_language_probabilities[max_language]
2038
- )
2039
-
2040
- # Do additional checks for silence for non-confident case
2041
- # calculate RMS amplitude and DC offset
2042
- dc_offset = audio.mean()
2043
- audio_minus_dc_offset = audio - dc_offset
2044
- is_silent = (
2045
- torch.all(audio.abs() < 0.01)
2046
- or torch.sqrt(torch.mean(audio_minus_dc_offset**2)) < 0.01
2047
- )
2048
-
2049
- if is_silent:
2050
- return {"language_code": None, "language_confidence": 1.0}
2051
-
2052
- return {
2053
- "language_code": max_language,
2054
- "language_confidence": max_probability,
2055
- }
2056
-
2057
- # No language was detected for any segment and none of the previous conditions were met
2058
- return {"language_code": None, "language_confidence": 1.0}
2059
-
2060
-
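
The multi-segment language detection above votes over randomly chosen 30-second windows until enough windows agree with high confidence, then falls back to the most frequent language. A minimal usage sketch of the deleted method, assuming it is exposed on the package's WhisperModel class; the model alias, device, file name, and parameter values are illustrative, not taken from this diff:

    from faster_whisper import WhisperModel  # assumed import path

    model = WhisperModel("large-v3", device="cuda")
    result = model.detect_language_multi_segment(
        "meeting.wav",  # hypothetical audio file
        params={
            "speech_percentage_threshold": 0.02,  # <2% speech after VAD -> treated as silence
            "language_detection_segments": 4,     # confident windows required per language
            "vad_filter": True,
            "vad_min_silence_duration": 2500,
            "language_threshold": 0.7,            # per-window confidence cutoff
        },
    )
    print(result)  # e.g. {"language_code": "vi", "language_confidence": 0.93} (illustrative)
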
2061
- def restore_speech_timestamps(
2062
- segments: Iterable[Segment],
2063
- speech_chunks: List[dict],
2064
- sampling_rate: int,
2065
- ) -> Iterable[Segment]:
2066
- ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
2067
-
2068
- for segment in segments:
2069
- if segment.words:
2070
- words = []
2071
- for word in segment.words:
2072
- # Ensure the word start and end times are resolved to the same chunk.
2073
- middle = (word.start + word.end) / 2
2074
- chunk_index = ts_map.get_chunk_index(middle)
2075
- word = word._replace(
2076
- start=ts_map.get_original_time(word.start, chunk_index),
2077
- end=ts_map.get_original_time(word.end, chunk_index),
2078
- )
2079
- words.append(word)
2080
-
2081
- segment = segment._replace(
2082
- start=words[0].start,
2083
- end=words[-1].end,
2084
- words=words,
2085
- )
2086
-
2087
- else:
2088
- segment = segment._replace(
2089
- start=ts_map.get_original_time(segment.start),
2090
- end=ts_map.get_original_time(segment.end),
2091
- )
2092
-
2093
- yield segment
2094
-
2095
-
2096
- def get_ctranslate2_storage(segment: torch.Tensor) -> ctranslate2.StorageView:
2097
- segment = segment.contiguous()
2098
- segment = ctranslate2.StorageView.from_array(
2099
- segment if segment.is_cuda else segment.numpy()
2100
- ) # torch cpu tensors don't implement __array_interface__
2101
- # https://github.com/pytorch/pytorch/issues/51156
2102
- return segment
2103
-
2104
-
2105
- def get_compression_ratio(text: str) -> float:
2106
- text_bytes = text.encode("utf-8")
2107
- return len(text_bytes) / len(zlib.compress(text_bytes))
2108
-
2109
-
2110
- def get_suppressed_tokens(
2111
- tokenizer: Tokenizer,
2112
- suppress_tokens: Tuple[int],
2113
- ) -> Optional[List[int]]:
2114
- if -1 in suppress_tokens:
2115
- suppress_tokens = [t for t in suppress_tokens if t >= 0]
2116
- suppress_tokens.extend(tokenizer.non_speech_tokens)
2117
- elif suppress_tokens is None or len(suppress_tokens) == 0:
2118
- suppress_tokens = [] # interpret empty string as an empty list
2119
- else:
2120
- assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
2121
-
2122
- suppress_tokens.extend(
2123
- [
2124
- tokenizer.transcribe,
2125
- tokenizer.translate,
2126
- tokenizer.sot,
2127
- tokenizer.sot_prev,
2128
- tokenizer.sot_lm,
2129
- ]
2130
- )
2131
-
2132
- return tuple(sorted(set(suppress_tokens)))
2133
-
2134
-
2135
- def merge_punctuations(alignment: List[dict], prepended: str, appended: str) -> None:
2136
- # merge prepended punctuations
2137
- i = len(alignment) - 2
2138
- j = len(alignment) - 1
2139
- while i >= 0:
2140
- previous = alignment[i]
2141
- following = alignment[j]
2142
- if previous["word"].startswith(" ") and previous["word"].strip() in prepended:
2143
- # prepend it to the following word
2144
- following["word"] = previous["word"] + following["word"]
2145
- if "tokens" in alignment[0].keys():
2146
- following["tokens"] = previous["tokens"] + following["tokens"]
2147
- previous["tokens"] = []
2148
- previous["word"] = ""
2149
-
2150
- else:
2151
- j = i
2152
- i -= 1
2153
-
2154
- # merge appended punctuations
2155
- i = 0
2156
- j = 1
2157
- while j < len(alignment):
2158
- previous = alignment[i]
2159
- following = alignment[j]
2160
- if not previous["word"].endswith(" ") and following["word"] in appended:
2161
- # append it to the previous word
2162
- previous["word"] = previous["word"] + following["word"]
2163
- if "tokens" in alignment[0].keys():
2164
- previous["tokens"] = previous["tokens"] + following["tokens"]
2165
- following["tokens"] = []
2166
- following["word"] = ""
2167
-
2168
- else:
2169
- i = j
2170
- j += 1
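
The deleted transcribe.py ends with the module-level helpers above. The zlib-based compression ratio is what flags looping, repetitive output during the temperature fallback loop; a small self-contained sketch of that check (a standalone copy for illustration, with an assumed cutoff value rather than one taken from this diff):

    import zlib

    def get_compression_ratio(text: str) -> float:
        # Repetitive text compresses very well, which pushes the ratio up.
        text_bytes = text.encode("utf-8")
        return len(text_bytes) / len(zlib.compress(text_bytes))

    natural = "The quick brown fox jumps over the lazy dog."
    looping = "thank you thank you " * 30
    print(get_compression_ratio(natural))  # close to 1.0 for short natural text
    print(get_compression_ratio(looping))  # far higher, e.g. above a typical 2.4 cutoff
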
 
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/utils.py DELETED
@@ -1,157 +0,0 @@
1
- import logging
2
- import os
3
- import re
4
-
5
- from typing import List, Optional
6
-
7
- import huggingface_hub
8
- import requests
9
-
10
- from tqdm.auto import tqdm
11
-
12
- _MODELS = {
13
- "tiny.en": "Systran/faster-whisper-tiny.en",
14
- "tiny": "Systran/faster-whisper-tiny",
15
- "base.en": "Systran/faster-whisper-base.en",
16
- "base": "Systran/faster-whisper-base",
17
- "small.en": "Systran/faster-whisper-small.en",
18
- "small": "Systran/faster-whisper-small",
19
- "medium.en": "Systran/faster-whisper-medium.en",
20
- "medium": "Systran/faster-whisper-medium",
21
- "large-v1": "Systran/faster-whisper-large-v1",
22
- "large-v2": "Systran/faster-whisper-large-v2",
23
- "large-v3": "Systran/faster-whisper-large-v3",
24
- "large": "Systran/faster-whisper-large-v3",
25
- "distil-large-v2": "Systran/faster-distil-whisper-large-v2",
26
- "distil-medium.en": "Systran/faster-distil-whisper-medium.en",
27
- "distil-small.en": "Systran/faster-distil-whisper-small.en",
28
- "distil-large-v3": "Systran/faster-distil-whisper-large-v3",
29
- }
30
-
31
-
32
- def available_models() -> List[str]:
33
- """Returns the names of available models."""
34
- return list(_MODELS.keys())
35
-
36
-
37
- def get_assets_path():
38
- """Returns the path to the assets directory."""
39
- return os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
40
-
41
-
42
- def get_logger():
43
- """Returns the module logger."""
44
- return logging.getLogger("faster_whisper")
45
-
46
-
47
- def download_model(
48
- size_or_id: str,
49
- output_dir: Optional[str] = None,
50
- local_files_only: bool = False,
51
- cache_dir: Optional[str] = None,
52
- ):
53
- """Downloads a CTranslate2 Whisper model from the Hugging Face Hub.
54
-
55
- Args:
56
- size_or_id: Size of the model to download from https://huggingface.co/Systran
57
- (tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en,
58
- distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2,
59
- distil-large-v3), or a CTranslate2-converted model ID from the Hugging Face Hub
60
- (e.g. Systran/faster-whisper-large-v3).
61
- output_dir: Directory where the model should be saved. If not set, the model is saved in
62
- the cache directory.
63
- local_files_only: If True, avoid downloading the file and return the path to the local
64
- cached file if it exists.
65
- cache_dir: Path to the folder where cached files are stored.
66
-
67
- Returns:
68
- The path to the downloaded model.
69
-
70
- Raises:
71
- ValueError: if the model size is invalid.
72
- """
73
- if re.match(r".*/.*", size_or_id):
74
- repo_id = size_or_id
75
- else:
76
- repo_id = _MODELS.get(size_or_id)
77
- if repo_id is None:
78
- raise ValueError(
79
- "Invalid model size '%s', expected one of: %s"
80
- % (size_or_id, ", ".join(_MODELS.keys()))
81
- )
82
-
83
- allow_patterns = [
84
- "config.json",
85
- "preprocessor_config.json",
86
- "model.bin",
87
- "tokenizer.json",
88
- "vocabulary.*",
89
- ]
90
-
91
- kwargs = {
92
- "local_files_only": local_files_only,
93
- "allow_patterns": allow_patterns,
94
- "tqdm_class": disabled_tqdm,
95
- }
96
-
97
- if output_dir is not None:
98
- kwargs["local_dir"] = output_dir
99
- kwargs["local_dir_use_symlinks"] = False
100
-
101
- if cache_dir is not None:
102
- kwargs["cache_dir"] = cache_dir
103
-
104
- try:
105
- return huggingface_hub.snapshot_download(repo_id, **kwargs)
106
- except (
107
- huggingface_hub.utils.HfHubHTTPError,
108
- requests.exceptions.ConnectionError,
109
- ) as exception:
110
- logger = get_logger()
111
- logger.warning(
112
- "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
113
- repo_id,
114
- exception,
115
- )
116
- logger.warning(
117
- "Trying to load the model directly from the local cache, if it exists."
118
- )
119
-
120
- kwargs["local_files_only"] = True
121
- return huggingface_hub.snapshot_download(repo_id, **kwargs)
122
-
123
-
124
- def format_timestamp(
125
- seconds: float,
126
- always_include_hours: bool = False,
127
- decimal_marker: str = ".",
128
- ) -> str:
129
- assert seconds >= 0, "non-negative timestamp expected"
130
- milliseconds = round(seconds * 1000.0)
131
-
132
- hours = milliseconds // 3_600_000
133
- milliseconds -= hours * 3_600_000
134
-
135
- minutes = milliseconds // 60_000
136
- milliseconds -= minutes * 60_000
137
-
138
- seconds = milliseconds // 1_000
139
- milliseconds -= seconds * 1_000
140
-
141
- hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
142
- return (
143
- f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
144
- )
145
-
146
-
147
- class disabled_tqdm(tqdm):
148
- def __init__(self, *args, **kwargs):
149
- kwargs["disable"] = True
150
- super().__init__(*args, **kwargs)
151
-
152
-
153
- def get_end(segments: List[dict]) -> Optional[float]:
154
- return next(
155
- (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
156
- segments[-1]["end"] if segments else None,
157
- )
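
A short usage sketch for the helpers in this deleted utils.py; the import path, model alias, and output directory are assumptions for illustration:

    from faster_whisper.utils import available_models, download_model, format_timestamp

    print(available_models())  # aliases defined in _MODELS, e.g. "tiny", "large-v3"

    # Resolves a size alias to its Systran repo and returns the local snapshot path;
    # falls back to the local cache if the Hub cannot be reached.
    model_dir = download_model("large-v3", output_dir="./models")  # hypothetical directory

    print(format_timestamp(3661.5))                           # "01:01:01.500"
    print(format_timestamp(9.02, always_include_hours=True))  # "00:00:09.020"
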
 
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/vad.py DELETED
@@ -1,596 +0,0 @@
1
- import bisect
2
- import functools
3
- import os
4
-
5
- from abc import ABC
6
- from collections.abc import Callable
7
- from typing import List, NamedTuple, Optional, Union
8
-
9
- import numpy as np
10
- import torch
11
-
12
- from pyannote.audio.core.io import AudioFile
13
- from pyannote.audio.pipelines import VoiceActivityDetection
14
- from pyannote.audio.pipelines.utils import PipelineModel
15
- from pyannote.core import Annotation, Segment, SlidingWindowFeature
16
-
17
- from faster_whisper.utils import get_assets_path
18
-
19
-
20
- # The code below is adapted from https://github.com/snakers4/silero-vad.
21
- class VadOptions(NamedTuple):
22
- """VAD options.
23
-
24
- Attributes:
25
- threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
26
- probabilities ABOVE this value are considered as SPEECH. It is better to tune this
27
- parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
28
- min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
29
- max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
30
- than max_speech_duration_s will be split at the timestamp of the last silence that
31
- lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
32
- split aggressively just before max_speech_duration_s.
33
- min_silence_duration_ms: At the end of each speech chunk, wait for min_silence_duration_ms
34
- before separating it.
35
- speech_pad_ms: Final speech chunks are padded by speech_pad_ms on each side.
36
- """
37
-
38
- threshold: float = 0.5
39
- min_speech_duration_ms: int = 250
40
- max_speech_duration_s: float = float("inf")
41
- min_silence_duration_ms: int = 2000
42
- speech_pad_ms: int = 400
43
-
44
-
45
- def get_speech_timestamps(
46
- audio: torch.Tensor,
47
- vad_options: Optional[VadOptions] = None,
48
- **kwargs,
49
- ) -> List[dict]:
50
- """This method is used for splitting long audios into speech chunks using silero VAD.
51
-
52
- Args:
53
- audio: One dimensional float array.
54
- vad_options: Options for VAD processing.
55
- kwargs: VAD options passed as keyword arguments for backward compatibility.
56
-
57
- Returns:
58
- List of dicts containing begin and end samples of each speech chunk.
59
- """
60
- if vad_options is None:
61
- vad_options = VadOptions(**kwargs)
62
-
63
- threshold = vad_options.threshold
64
- min_speech_duration_ms = vad_options.min_speech_duration_ms
65
- max_speech_duration_s = vad_options.max_speech_duration_s
66
- min_silence_duration_ms = vad_options.min_silence_duration_ms
67
- window_size_samples = 512
68
- speech_pad_ms = vad_options.speech_pad_ms
69
- sampling_rate = 16000
70
- min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
71
- speech_pad_samples = sampling_rate * speech_pad_ms / 1000
72
- max_speech_samples = (
73
- sampling_rate * max_speech_duration_s
74
- - window_size_samples
75
- - 2 * speech_pad_samples
76
- )
77
- min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
78
- min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
79
-
80
- audio_length_samples = len(audio)
81
-
82
- model = get_vad_model()
83
- state, context = model.get_initial_states(batch_size=1)
84
-
85
- speech_probs = []
86
- for current_start_sample in range(0, audio_length_samples, window_size_samples):
87
- chunk = audio[current_start_sample : current_start_sample + window_size_samples]
88
- if len(chunk) < window_size_samples:
89
- chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
90
- speech_prob, state, context = model(chunk, state, context, sampling_rate)
91
- speech_probs.append(speech_prob)
92
-
93
- triggered = False
94
- speeches = []
95
- current_speech = {}
96
- neg_threshold = threshold - 0.15
97
-
98
- # to save potential segment end (and tolerate some silence)
99
- temp_end = 0
100
- # to save potential segment limits in case of maximum segment size reached
101
- prev_end = next_start = 0
102
-
103
- for i, speech_prob in enumerate(speech_probs):
104
- if (speech_prob >= threshold) and temp_end:
105
- temp_end = 0
106
- if next_start < prev_end:
107
- next_start = window_size_samples * i
108
-
109
- if (speech_prob >= threshold) and not triggered:
110
- triggered = True
111
- current_speech["start"] = window_size_samples * i
112
- continue
113
-
114
- if (
115
- triggered
116
- and (window_size_samples * i) - current_speech["start"] > max_speech_samples
117
- ):
118
- if prev_end:
119
- current_speech["end"] = prev_end
120
- speeches.append(current_speech)
121
- current_speech = {}
122
- # previously reached silence (< neg_thres) and is still not speech (< thres)
123
- if next_start < prev_end:
124
- triggered = False
125
- else:
126
- current_speech["start"] = next_start
127
- prev_end = next_start = temp_end = 0
128
- else:
129
- current_speech["end"] = window_size_samples * i
130
- speeches.append(current_speech)
131
- current_speech = {}
132
- prev_end = next_start = temp_end = 0
133
- triggered = False
134
- continue
135
-
136
- if (speech_prob < neg_threshold) and triggered:
137
- if not temp_end:
138
- temp_end = window_size_samples * i
139
- # condition to avoid cutting in very short silence
140
- if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
141
- prev_end = temp_end
142
- if (window_size_samples * i) - temp_end < min_silence_samples:
143
- continue
144
- else:
145
- current_speech["end"] = temp_end
146
- if (
147
- current_speech["end"] - current_speech["start"]
148
- ) > min_speech_samples:
149
- speeches.append(current_speech)
150
- current_speech = {}
151
- prev_end = next_start = temp_end = 0
152
- triggered = False
153
- continue
154
-
155
- if (
156
- current_speech
157
- and (audio_length_samples - current_speech["start"]) > min_speech_samples
158
- ):
159
- current_speech["end"] = audio_length_samples
160
- speeches.append(current_speech)
161
-
162
- for i, speech in enumerate(speeches):
163
- if i == 0:
164
- speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
165
- if i != len(speeches) - 1:
166
- silence_duration = speeches[i + 1]["start"] - speech["end"]
167
- if silence_duration < 2 * speech_pad_samples:
168
- speech["end"] += int(silence_duration // 2)
169
- speeches[i + 1]["start"] = int(
170
- max(0, speeches[i + 1]["start"] - silence_duration // 2)
171
- )
172
- else:
173
- speech["end"] = int(
174
- min(audio_length_samples, speech["end"] + speech_pad_samples)
175
- )
176
- speeches[i + 1]["start"] = int(
177
- max(0, speeches[i + 1]["start"] - speech_pad_samples)
178
- )
179
- else:
180
- speech["end"] = int(
181
- min(audio_length_samples, speech["end"] + speech_pad_samples)
182
- )
183
-
184
- return speeches
185
-
186
-
187
- def collect_chunks(audio: torch.Tensor, chunks: List[dict]) -> torch.Tensor:
188
- """Collects and concatenates audio chunks."""
189
- if not chunks:
190
- return torch.tensor([], dtype=torch.float32)
191
-
192
- return torch.cat([audio[chunk["start"] : chunk["end"]] for chunk in chunks])
193
-
194
-
195
- class SpeechTimestampsMap:
196
- """Helper class to restore original speech timestamps."""
197
-
198
- def __init__(self, chunks: List[dict], sampling_rate: int, time_precision: int = 2):
199
- self.sampling_rate = sampling_rate
200
- self.time_precision = time_precision
201
- self.chunk_end_sample = []
202
- self.total_silence_before = []
203
-
204
- previous_end = 0
205
- silent_samples = 0
206
-
207
- for chunk in chunks:
208
- silent_samples += chunk["start"] - previous_end
209
- previous_end = chunk["end"]
210
-
211
- self.chunk_end_sample.append(chunk["end"] - silent_samples)
212
- self.total_silence_before.append(silent_samples / sampling_rate)
213
-
214
- def get_original_time(
215
- self,
216
- time: float,
217
- chunk_index: Optional[int] = None,
218
- ) -> float:
219
- if chunk_index is None:
220
- chunk_index = self.get_chunk_index(time)
221
-
222
- total_silence_before = self.total_silence_before[chunk_index]
223
- return round(total_silence_before + time, self.time_precision)
224
-
225
- def get_chunk_index(self, time: float) -> int:
226
- sample = int(time * self.sampling_rate)
227
- return min(
228
- bisect.bisect(self.chunk_end_sample, sample),
229
- len(self.chunk_end_sample) - 1,
230
- )
231
-
232
-
233
- @functools.lru_cache
234
- def get_vad_model():
235
- """Returns the VAD model instance."""
236
- path = os.path.join(get_assets_path(), "silero_vad.onnx")
237
- return SileroVADModel(path)
238
-
239
-
240
- class SileroVADModel:
241
- def __init__(self, path):
242
- try:
243
- import onnxruntime
244
- except ImportError as e:
245
- raise RuntimeError(
246
- "Applying the VAD filter requires the onnxruntime package"
247
- ) from e
248
-
249
- opts = onnxruntime.SessionOptions()
250
- opts.inter_op_num_threads = 1
251
- opts.intra_op_num_threads = 1
252
- opts.log_severity_level = 4
253
-
254
- self.session = onnxruntime.InferenceSession(
255
- path,
256
- providers=["CPUExecutionProvider"],
257
- sess_options=opts,
258
- )
259
-
260
- def get_initial_states(self, batch_size: int):
261
- state = np.zeros((2, batch_size, 128), dtype=np.float32)
262
- context = np.zeros((batch_size, 64), dtype=np.float32)
263
- return state, context
264
-
265
- def __call__(self, x, state, context, sr: int):
266
- if len(x.shape) == 1:
267
- x = np.expand_dims(x, 0)
268
- if len(x.shape) > 2:
269
- raise ValueError(
270
- f"Too many dimensions for input audio chunk {len(x.shape)}"
271
- )
272
- if sr / x.shape[1] > 31.25:
273
- raise ValueError("Input audio chunk is too short")
274
-
275
- x = np.concatenate([context, x], axis=1)
276
-
277
- ort_inputs = {
278
- "input": x,
279
- "state": state,
280
- "sr": np.array(sr, dtype="int64"),
281
- }
282
-
283
- out, state = self.session.run(None, ort_inputs)
284
- context = x[..., -64:]
285
-
286
- return out, state, context
287
-
288
-
289
- # BSD 2-Clause License
290
-
291
- # Copyright (c) 2024, Max Bain
292
-
293
- # Redistribution and use in source and binary forms, with or without
294
- # modification, are permitted provided that the following conditions are met:
295
-
296
- # 1. Redistributions of source code must retain the above copyright notice, this
297
- # list of conditions and the following disclaimer.
298
-
299
- # 2. Redistributions in binary form must reproduce the above copyright notice,
300
- # this list of conditions and the following disclaimer in the documentation
301
- # and/or other materials provided with the distribution.
302
-
303
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
304
- # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
305
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
306
- # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
307
- # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
308
- # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
309
- # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
310
- # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
311
- # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
312
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
313
-
314
-
315
- # The code below is copied from whisper-x (https://github.com/m-bain/whisperX)
316
- # and adapted for faster_whisper.
317
- class SegmentX:
318
- def __init__(self, start, end, speaker=None):
319
- self.start = start
320
- self.end = end
321
- self.speaker = speaker
322
-
323
-
324
- class VoiceActivitySegmentation(VoiceActivityDetection, ABC):
325
- """Pipeline wrapper class for Voice Activity Segmentation based on VAD scores."""
326
-
327
- def __init__(
328
- self,
329
- segmentation: PipelineModel = "pyannote/segmentation",
330
- device: Optional[Union[str, torch.device]] = None,
331
- fscore: bool = False,
332
- use_auth_token: Optional[str] = None,
333
- **inference_kwargs,
334
- ):
335
- """Initialize the pipeline with the model name and the optional device.
336
-
337
- Args:
338
- dict parameters of VoiceActivityDetection class from pyannote:
339
- segmentation (PipelineModel): Loaded model name.
340
- device (torch.device or None): Device to perform the segmentation.
341
- fscore (bool): Flag indicating whether to compute F-score during inference.
342
- use_auth_token (str or None): Optional authentication token for model access.
343
- inference_kwargs (dict): Additional arguments from VoiceActivityDetection pipeline.
344
- """
345
- super().__init__(
346
- segmentation=segmentation,
347
- device=device,
348
- fscore=fscore,
349
- use_auth_token=use_auth_token,
350
- **inference_kwargs,
351
- )
352
-
353
- def apply(
354
- self, file: AudioFile, hook: Optional[Callable] = None
355
- ) -> SlidingWindowFeature:
356
- """Apply voice activity detection on the audio file.
357
-
358
- Args:
359
- file (AudioFile): Processed file.
360
- hook (callable): Hook called with signature: hook("step_name", step_artefact, file=file)
361
-
362
- Returns:
363
- segmentations (SlidingWindowFeature): Voice activity segmentation.
364
- """
365
- # setup hook (e.g. for debugging purposes)
366
- hook = self.setup_hook(file, hook=hook)
367
-
368
- # apply segmentation model if needed
369
- # output shape is (num_chunks, num_frames, 1)
370
- if self.training:
371
- if self.CACHED_SEGMENTATION in file:
372
- segmentations = file[self.CACHED_SEGMENTATION]
373
- else:
374
- segmentations = self._segmentation(file)
375
- file[self.CACHED_SEGMENTATION] = segmentations
376
- else:
377
- segmentations: SlidingWindowFeature = self._segmentation(file)
378
-
379
- return segmentations
380
-
381
-
382
- class BinarizeVadScores:
383
- """Binarize detection scores using hysteresis thresholding.
384
-
385
- Reference:
386
- Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of
387
- RNN-based Voice Activity Detection", InterSpeech 2015.
388
-
389
- Modified by Max Bain to include WhisperX's min-cut operation
390
- https://arxiv.org/abs/2303.00747
391
-
392
- """
393
-
394
- def __init__(
395
- self,
396
- onset: float = 0.5,
397
- offset: Optional[float] = None,
398
- min_duration_on: float = 0.0,
399
- min_duration_off: float = 0.0,
400
- pad_onset: float = 0.0,
401
- pad_offset: float = 0.0,
402
- max_duration: float = float("inf"),
403
- ):
404
- """Initializes the parameters for Binarizing the VAD scores.
405
-
406
- Args:
407
- onset (float, optional):
408
- Onset threshold. Defaults to 0.5.
409
- offset (float, optional):
410
- Offset threshold. Defaults to `onset`.
411
- min_duration_on (float, optional):
412
- Remove active regions shorter than that many seconds. Defaults to 0s.
413
- min_duration_off (float, optional):
414
- Fill inactive regions shorter than that many seconds. Defaults to 0s.
415
- pad_onset (float, optional):
416
- Extend active regions by moving their start time by that many seconds.
417
- Defaults to 0s.
418
- pad_offset (float, optional):
419
- Extend active regions by moving their end time by that many seconds.
420
- Defaults to 0s.
421
- max_duration (float):
422
- The maximum length of an active segment.
423
- """
424
- super().__init__()
425
-
426
- self.onset = onset
427
- self.offset = offset or onset
428
-
429
- self.pad_onset = pad_onset
430
- self.pad_offset = pad_offset
431
-
432
- self.min_duration_on = min_duration_on
433
- self.min_duration_off = min_duration_off
434
-
435
- self.max_duration = max_duration
436
-
437
- def __get_active_regions(self, scores: SlidingWindowFeature) -> Annotation:
438
- """Extract active regions from VAD scores.
439
-
440
- Args:
441
- scores (SlidingWindowFeature): Detection scores.
442
-
443
- Returns:
444
- active (Annotation): Active regions.
445
- """
446
- num_frames, num_classes = scores.data.shape
447
- frames = scores.sliding_window
448
- timestamps = [frames[i].middle for i in range(num_frames)]
449
- # annotation meant to store 'active' regions
450
- active = Annotation()
451
- for k, k_scores in enumerate(scores.data.T):
452
- label = k if scores.labels is None else scores.labels[k]
453
-
454
- # initial state
455
- start = timestamps[0]
456
- is_active = k_scores[0] > self.onset
457
- curr_scores = [k_scores[0]]
458
- curr_timestamps = [start]
459
- t = start
460
- # optionally add `strict=False` for python 3.10 or later
461
- for t, y in zip(timestamps[1:], k_scores[1:]):
462
- # currently active
463
- if is_active:
464
- curr_duration = t - start
465
- if curr_duration > self.max_duration:
466
- search_after = len(curr_scores) // 2
467
- # divide segment
468
- min_score_div_idx = search_after + np.argmin(
469
- curr_scores[search_after:]
470
- )
471
- min_score_t = curr_timestamps[min_score_div_idx]
472
- region = Segment(
473
- start - self.pad_onset, min_score_t + self.pad_offset
474
- )
475
- active[region, k] = label
476
- start = curr_timestamps[min_score_div_idx]
477
- curr_scores = curr_scores[min_score_div_idx + 1 :]
478
- curr_timestamps = curr_timestamps[min_score_div_idx + 1 :]
479
- # switching from active to inactive
480
- elif y < self.offset:
481
- region = Segment(start - self.pad_onset, t + self.pad_offset)
482
- active[region, k] = label
483
- start = t
484
- is_active = False
485
- curr_scores = []
486
- curr_timestamps = []
487
- curr_scores.append(y)
488
- curr_timestamps.append(t)
489
- # currently inactive
490
- else:
491
- # switching from inactive to active
492
- if y > self.onset:
493
- start = t
494
- is_active = True
495
-
496
- # if active at the end, add final region
497
- if is_active:
498
- region = Segment(start - self.pad_onset, t + self.pad_offset)
499
- active[region, k] = label
500
-
501
- return active
502
-
503
- def __call__(self, scores: SlidingWindowFeature) -> Annotation:
504
- """Binarize detection scores.
505
-
506
- Args:
507
- scores (SlidingWindowFeature): Detection scores.
508
-
509
- Returns:
510
- active (Annotation): Binarized scores.
511
- """
512
- active = self.__get_active_regions(scores)
513
- # because of padding, some active regions might be overlapping: merge them.
514
- # also: fill same speaker gaps shorter than min_duration_off
515
- if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
516
- if self.max_duration < float("inf"):
517
- raise NotImplementedError("This would break current max_duration param")
518
- active = active.support(collar=self.min_duration_off)
519
-
520
- # remove tracks shorter than min_duration_on
521
- if self.min_duration_on > 0:
522
- for segment, track in list(active.itertracks()):
523
- if segment.duration < self.min_duration_on:
524
- del active[segment, track]
525
-
526
- return active
527
-
528
-
529
- def merge_chunks(
530
- segments,
531
- chunk_length,
532
- onset: float = 0.5,
533
- offset: Optional[float] = None,
534
- edge_padding: float = 0.1,
535
- ):
536
- """
537
- Merge operation described in the WhisperX paper.
538
- """
539
- curr_end = 0
540
- merged_segments = []
541
- seg_idxs = []
542
- speaker_idxs = []
543
-
544
- assert chunk_length > 0
545
- binarize = BinarizeVadScores(max_duration=chunk_length, onset=onset, offset=offset)
546
- segments = binarize(segments)
547
- segments_list = []
548
- for speech_turn in segments.get_timeline():
549
- segments_list.append(
550
- SegmentX(
551
- max(0.0, speech_turn.start - edge_padding),
552
- speech_turn.end + edge_padding,
553
- "UNKNOWN",
554
- )
555
- ) # 100ms edge padding to account for edge errors
556
-
557
- if len(segments_list) == 0:
558
- print("No active speech found in audio")
559
- return []
560
-
561
- # Make sure the starting point is the start of the segment.
562
- curr_start = segments_list[0].start
563
-
564
- for idx, seg in enumerate(segments_list):
565
- # if any segment start timing is less than previous segment end timing,
566
- # reset the edge padding. Similarly for end timing.
567
- if idx > 0:
568
- if seg.start < segments_list[idx - 1].end:
569
- seg.start += edge_padding
570
- if idx < len(segments_list) - 1:
571
- if seg.end > segments_list[idx + 1].start:
572
- seg.end -= edge_padding
573
-
574
- if seg.end - curr_start > chunk_length and curr_end - curr_start > 0:
575
- merged_segments.append(
576
- {
577
- "start": curr_start,
578
- "end": curr_end,
579
- "segments": seg_idxs,
580
- }
581
- )
582
- curr_start = seg.start
583
- seg_idxs = []
584
- speaker_idxs = []
585
- curr_end = seg.end
586
- seg_idxs.append((seg.start, seg.end))
587
- speaker_idxs.append(seg.speaker)
588
- # add final
589
- merged_segments.append(
590
- {
591
- "start": curr_start,
592
- "end": curr_end,
593
- "segments": seg_idxs,
594
- }
595
- )
596
- return merged_segments
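For orientation, here is a minimal sketch of how the deleted `BinarizeVadScores` could be exercised with pyannote objects. The frame window (32 ms every 16 ms), the random scores, and the thresholds are illustrative assumptions, not values taken from the pipeline.

```python
import numpy as np
from pyannote.core import SlidingWindow, SlidingWindowFeature

# Illustrative frame-level VAD probabilities: 1000 frames of one "speech" class,
# sampled on an assumed 32 ms window hopping every 16 ms.
scores = SlidingWindowFeature(
    np.random.rand(1000, 1),
    SlidingWindow(start=0.0, duration=0.032, step=0.016),
)

# Turn frame scores into speech regions, dropping blips shorter than 100 ms.
binarize = BinarizeVadScores(onset=0.5, min_duration_on=0.1)
speech = binarize(scores)

for segment, _ in speech.itertracks():
    print(f"speech from {segment.start:.2f}s to {segment.end:.2f}s")
```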
 
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/version.py DELETED
@@ -1,3 +0,0 @@
1
- """Version information."""
2
-
3
- __version__ = "1.0.3"
 
 
 
 
whisper_pipeline/faster-whisper-main/docker/Dockerfile DELETED
@@ -1,6 +0,0 @@
1
- FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
2
- WORKDIR /root
3
- RUN apt-get update -y && apt-get install -y python3-pip
4
- COPY infer.py jfk.flac ./
5
- RUN pip3 install faster-whisper
6
- CMD ["python3", "infer.py"]
 
 
 
 
 
 
 
whisper_pipeline/faster-whisper-main/docker/infer.py DELETED
@@ -1,7 +0,0 @@
1
- from faster_whisper import WhisperModel
2
-
3
- jfk_path = "jfk.flac"
4
- model = WhisperModel("tiny", device="cuda")
5
- segments, info = model.transcribe(jfk_path, word_timestamps=True)
6
- for segment in segments:
7
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
 
 
 
 
 
 
 
 
whisper_pipeline/faster-whisper-main/docker/jfk.flac DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:63a4b1e4c1dc655ac70961ffbf518acd249df237e5a0152faae9a4a836949715
3
- size 1152693
 
 
 
 
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/PKG-INFO DELETED
@@ -1,347 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: faster-whisper
3
- Version: 1.0.3
4
- Summary: Faster Whisper transcription with CTranslate2
5
- Home-page: https://github.com/SYSTRAN/faster-whisper
6
- Author: Guillaume Klein
7
- License: MIT
8
- Keywords: openai whisper speech ctranslate2 inference quantization transformer
9
- Platform: UNKNOWN
10
- Classifier: Development Status :: 4 - Beta
11
- Classifier: Intended Audience :: Developers
12
- Classifier: Intended Audience :: Science/Research
13
- Classifier: License :: OSI Approved :: MIT License
14
- Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3 :: Only
16
- Classifier: Programming Language :: Python :: 3.8
17
- Classifier: Programming Language :: Python :: 3.9
18
- Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3.11
20
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
- Requires-Python: >=3.8
22
- Description-Content-Type: text/markdown
23
- Provides-Extra: conversion
24
- Provides-Extra: dev
25
- License-File: LICENSE
26
-
27
- [![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)
28
-
29
- # Faster Whisper transcription with CTranslate2
30
-
31
- **faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.
32
-
33
- This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
34
-
35
- ## Benchmark
36
-
37
- ### Whisper
38
-
39
- For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:
40
-
41
- * [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
42
- * [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
43
- * [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
44
-
45
- ### Large-v2 model on GPU
46
-
47
- | Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
48
- | --- | --- | --- | --- | --- | --- |
49
- | openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
50
- | faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
51
- | faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |
52
-
53
- *Executed with CUDA 11.7.1 on an NVIDIA Tesla V100S.*
54
-
55
- ### Small model on CPU
56
-
57
- | Implementation | Precision | Beam size | Time | Max. memory |
58
- | --- | --- | --- | --- | --- |
59
- | openai/whisper | fp32 | 5 | 10m31s | 3101MB |
60
- | whisper.cpp | fp32 | 5 | 17m42s | 1581MB |
61
- | whisper.cpp | fp16 | 5 | 12m39s | 873MB |
62
- | faster-whisper | fp32 | 5 | 2m44s | 1675MB |
63
- | faster-whisper | int8 | 5 | 2m04s | 995MB |
64
-
65
- *Executed with 8 threads on an Intel(R) Xeon(R) Gold 6226R.*
66
-
67
-
68
- ### Distil-whisper
69
-
70
- | Implementation | Precision | Beam size | Time | Gigaspeech WER |
71
- | --- | --- | --- | --- | --- |
72
- | distil-whisper/distil-large-v2 | fp16 | 4 |- | 10.36 |
73
- | [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 |
74
- | distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 |
75
- | [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 |
76
-
77
- *Executed with CUDA 11.4 on an NVIDIA 3090.*
78
-
79
- <details>
80
- <summary>testing details (click to expand)</summary>
81
-
82
- For `distil-whisper/distil-large-v2`, the WER is tested with the code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). For `faster-distil-whisper`, the WER is tested with the following settings:
83
- ```python
84
- from faster_whisper import WhisperModel
85
-
86
- model_size = "distil-large-v2"
87
- # model_size = "distil-medium.en"
88
- # Run on GPU with FP16
89
- model = WhisperModel(model_size, device="cuda", compute_type="float16")
90
- segments, info = model.transcribe("audio.mp3", beam_size=5, language="en")
91
- ```
92
- </details>
93
-
94
- ## Requirements
95
-
96
- * Python 3.8 or greater
97
-
98
-
99
- ### GPU
100
-
101
- GPU execution requires the following NVIDIA libraries to be installed:
102
-
103
- * [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas)
104
- * [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn)
105
-
106
- **Note**: Latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (This can be done with `pip install --force-reinstall ctranslate2==3.24.0` or specifying the version in a `requirements.txt`).
107
-
108
- There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
109
-
110
- <details>
111
- <summary>Other installation methods (click to expand)</summary>
112
-
113
-
114
- **Note:** For all these methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below.
115
-
116
- #### Use Docker
117
-
118
- The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`.
119
-
120
- #### Install with `pip` (Linux only)
121
-
122
- On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python.
123
-
124
- ```bash
125
- pip install nvidia-cublas-cu12 nvidia-cudnn-cu12
126
-
127
- export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
128
- ```
129
-
130
- **Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due to its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8.
131
-
132
- #### Download the libraries from Purfview's repository (Windows & Linux)
133
-
134
- Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
135
-
136
- </details>
137
-
138
- ## Installation
139
-
140
- The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/):
141
-
142
- ```bash
143
- pip install faster-whisper
144
- ```
145
-
146
- <details>
147
- <summary>Other installation methods (click to expand)</summary>
148
-
149
- ### Install the master branch
150
-
151
- ```bash
152
- pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz"
153
- ```
154
-
155
- ### Install a specific commit
156
-
157
- ```bash
158
- pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz"
159
- ```
160
-
161
- </details>
162
-
163
- ## Usage
164
-
165
- ### Faster-whisper
166
-
167
- ```python
168
- from faster_whisper import WhisperModel
169
-
170
- model_size = "large-v3"
171
-
172
- # Run on GPU with FP16
173
- model = WhisperModel(model_size, device="cuda", compute_type="float16")
174
-
175
- # or run on GPU with INT8
176
- # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
177
- # or run on CPU with INT8
178
- # model = WhisperModel(model_size, device="cpu", compute_type="int8")
179
-
180
- segments, info = model.transcribe("audio.mp3", beam_size=5)
181
-
182
- print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
183
-
184
- for segment in segments:
185
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
186
- ```
187
-
188
- **Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop:
189
-
190
- ```python
191
- segments, _ = model.transcribe("audio.mp3")
192
- segments = list(segments) # The transcription will actually run here.
193
- ```
194
-
195
- ### multi-segment language detection
196
-
197
- To directly use the model for improved language detection, the following code snippet can be used:
198
-
199
- ```python
200
- from faster_whisper import WhisperModel
201
- model = WhisperModel("medium", device="cuda", compute_type="float16")
202
- language_info = model.detect_language_multi_segment("audio.mp3")
203
- ```
204
-
205
- ### Batched faster-whisper
206
-
207
-
208
- The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-2-Clause license, and integrates its VAD model into this library. We modified this implementation and also replaced the feature extraction with a faster torch-based implementation. The batched version improves speed by up to 10-12x compared to the openAI implementation and 3-4x compared to the sequential faster_whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.
209
-
210
- The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper.
211
-
212
- ```python
213
- from faster_whisper import WhisperModel, BatchedInferencePipeline
214
-
215
- model = WhisperModel("medium", device="cuda", compute_type="float16")
216
- batched_model = BatchedInferencePipeline(model=model)
217
- segments, info = batched_model.transcribe("audio.mp3", batch_size=16)
218
-
219
- for segment in segments:
220
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
221
- ```
222
-
223
- ### Faster Distil-Whisper
224
-
225
- The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3)
226
- checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet
227
- demonstrates how to run inference with distil-large-v3 on a specified audio file:
228
-
229
- ```python
230
- from faster_whisper import WhisperModel
231
-
232
- model_size = "distil-large-v3"
233
-
234
- model = WhisperModel(model_size, device="cuda", compute_type="float16")
235
- segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False)
236
-
237
- for segment in segments:
238
- print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
239
- ```
240
-
241
- For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3).
242
-
243
- ### Word-level timestamps
244
-
245
- ```python
246
- segments, _ = model.transcribe("audio.mp3", word_timestamps=True)
247
-
248
- for segment in segments:
249
- for word in segment.words:
250
- print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
251
- ```
252
-
253
- ### VAD filter
254
-
255
- The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech:
256
-
257
- ```python
258
- segments, _ = model.transcribe("audio.mp3", vad_filter=True)
259
- ```
260
-
261
- The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`:
262
-
263
- ```python
264
- segments, _ = model.transcribe(
265
- "audio.mp3",
266
- vad_filter=True,
267
- vad_parameters=dict(min_silence_duration_ms=500),
268
- )
269
- ```
270
-
271
- ### Logging
272
-
273
- The library logging level can be configured like this:
274
-
275
- ```python
276
- import logging
277
-
278
- logging.basicConfig()
279
- logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
280
- ```
281
-
282
- ### Going further
283
-
284
- See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation.
285
-
286
- ## Community integrations
287
-
288
- Here is a non-exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
289
-
290
-
291
- * [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, supports streaming, and live transcription.
292
- * [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment.
293
- * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
294
- * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
295
- * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) Standalone CLI executables of faster-whisper for Windows, Linux & macOS.
296
- * [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines.
297
- * [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT.
298
- * [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper; it can export a word-level transcript, and the exported transcript can then be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor).
299
- * [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
300
- * [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
301
- * [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
302
- * [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.
303
-
304
- ## Model conversion
305
-
306
- When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
307
-
308
- We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
309
-
310
- For example, the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
311
-
312
- ```bash
313
- pip install transformers[torch]>=4.23
314
-
315
- ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
316
- --copy_files tokenizer.json preprocessor_config.json --quantization float16
317
- ```
318
-
319
- * The option `--model` accepts a model name on the Hub or a path to a model directory.
320
- * If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
321
-
322
- Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
323
-
324
- ### Load a converted model
325
-
326
- 1. Directly load the model from a local directory:
327
- ```python
328
- model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
329
- ```
330
-
331
- 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
332
- ```python
333
- model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
334
- ```
335
-
336
- ## Comparing performance against other implementations
337
-
338
- If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
339
-
340
- * Verify that the same transcription options are used, especially the same beam size. For example, in openai/whisper, `model.transcribe` uses a default beam size of 1, but here we use a default beam size of 5.
341
- * When running on CPU, make sure to set the same number of threads. Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script:
342
-
343
- ```bash
344
- OMP_NUM_THREADS=4 python3 my_script.py
345
- ```
346
-
347
-
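As a companion to the comparison guidance at the end of the deleted README, here is a minimal sketch of a like-for-like CPU run that pins the beam size and the thread count explicitly; the model size, audio file name, and thread count are arbitrary assumptions for illustration.

```python
from faster_whisper import WhisperModel

# Pin the thread count and beam size so the run is comparable across implementations.
model = WhisperModel("small", device="cpu", compute_type="int8", cpu_threads=4)
segments, _ = model.transcribe("audio.mp3", beam_size=5)
segments = list(segments)  # the generator only runs once it is consumed
```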
 
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/SOURCES.txt DELETED
@@ -1,25 +0,0 @@
1
- LICENSE
2
- MANIFEST.in
3
- README.md
4
- requirements.conversion.txt
5
- requirements.txt
6
- setup.cfg
7
- setup.py
8
- faster_whisper/__init__.py
9
- faster_whisper/audio.py
10
- faster_whisper/feature_extractor.py
11
- faster_whisper/tokenizer.py
12
- faster_whisper/transcribe.py
13
- faster_whisper/utils.py
14
- faster_whisper/vad.py
15
- faster_whisper/version.py
16
- faster_whisper.egg-info/PKG-INFO
17
- faster_whisper.egg-info/SOURCES.txt
18
- faster_whisper.egg-info/dependency_links.txt
19
- faster_whisper.egg-info/requires.txt
20
- faster_whisper.egg-info/top_level.txt
21
- faster_whisper/assets/__init__.py
22
- faster_whisper/assets/pyannote_vad_model.bin
23
- faster_whisper/assets/silero_vad.onnx
24
- tests/test_transcribe.py
25
- tests/test_utils.py
 
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/dependency_links.txt DELETED
@@ -1 +0,0 @@
1
-
 
 
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/requires.txt DELETED
@@ -1,17 +0,0 @@
1
- ctranslate2<5,>=4.0
2
- huggingface_hub>=0.13
3
- onnxruntime<2,>=1.14
4
- pyannote-audio
5
- tokenizers<1,>=0.13
6
- torch
7
- torchaudio
8
- tqdm
9
-
10
- [conversion]
11
- transformers[torch]>=4.23
12
-
13
- [dev]
14
- black==23.*
15
- flake8==6.*
16
- isort==5.*
17
- pytest==7.*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/top_level.txt DELETED
@@ -1 +0,0 @@
1
- faster_whisper
 
 
whisper_pipeline/faster-whisper-main/faster_whisper/__init__.py DELETED
@@ -1,14 +0,0 @@
1
- from faster_whisper.audio import decode_audio
2
- from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
3
- from faster_whisper.utils import available_models, download_model, format_timestamp
4
- from faster_whisper.version import __version__
5
-
6
- __all__ = [
7
- "available_models",
8
- "decode_audio",
9
- "WhisperModel",
10
- "BatchedInferencePipeline",
11
- "download_model",
12
- "format_timestamp",
13
- "__version__",
14
- ]
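A minimal sketch of the helpers re-exported by the deleted `__init__.py`, assuming faster-whisper is installed; the printed values are only indicative.

```python
import faster_whisper

# List the model sizes that can be downloaded by name.
print(faster_whisper.available_models())

# Render a position in seconds as a timestamp string, e.g. "01:23.125".
print(faster_whisper.format_timestamp(83.125))
```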
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (572 Bytes)
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/audio.cpython-310.pyc DELETED
Binary file (1.59 kB)
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/feature_extractor.cpython-310.pyc DELETED
Binary file (2.73 kB)
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/tokenizer.cpython-310.pyc DELETED
Binary file (6.78 kB)
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/transcribe.cpython-310.pyc DELETED
Binary file (53.3 kB)
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/utils.cpython-310.pyc DELETED
Binary file (5.13 kB)
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/vad.cpython-310.pyc DELETED
Binary file (15.2 kB)
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/version.cpython-310.pyc DELETED
Binary file (248 Bytes)
 
whisper_pipeline/faster-whisper-main/faster_whisper/assets/__init__.py DELETED
File without changes
whisper_pipeline/faster-whisper-main/faster_whisper/assets/pyannote_vad_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
3
- size 17719103
 
 
 
 
whisper_pipeline/faster-whisper-main/faster_whisper/assets/silero_vad.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b99cbfd39246b6706f98ec13c7c50c6b299181f2474fa05cbc8046acc274396
3
- size 2313101
 
 
 
 
whisper_pipeline/faster-whisper-main/faster_whisper/audio.py DELETED
@@ -1,58 +0,0 @@
1
- from typing import BinaryIO, Union
2
-
3
- import torch
4
- import torchaudio
5
-
6
-
7
- def decode_audio(
8
- input_file: Union[str, BinaryIO],
9
- sampling_rate: int = 16000,
10
- split_stereo: bool = False,
11
- ):
12
- """Decodes the audio.
13
-
14
- Args:
15
- input_file: Path to the input file or a file-like object.
16
- sampling_rate: Resample the audio to this sample rate.
17
- split_stereo: Return separate left and right channels.
18
-
19
- Returns:
20
- A float32 Torch Tensor.
21
-
22
- If `split_stereo` is enabled, the function returns a 2-tuple with the
23
- separated left and right channels.
24
- """
25
-
26
- waveform, audio_sf = torchaudio.load(input_file) # waveform: channels X T
27
-
28
- if audio_sf != sampling_rate:
29
- waveform = torchaudio.functional.resample(
30
- waveform, orig_freq=audio_sf, new_freq=sampling_rate
31
- )
32
- if split_stereo:
33
- return waveform[0], waveform[1]
34
-
35
- return waveform.mean(0)
36
-
37
-
38
- def pad_or_trim(array, length: int, *, axis: int = -1):
39
- """
40
- Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
41
- """
42
- axis = axis % array.ndim
43
- if array.shape[axis] > length:
44
- idx = [Ellipsis] * axis + [slice(length)] + [Ellipsis] * (array.ndim - axis - 1)
45
- return array[idx]
46
-
47
- if array.shape[axis] < length:
48
- pad_widths = (
49
- [
50
- 0,
51
- ]
52
- * array.ndim
53
- * 2
54
- )
55
- pad_widths[2 * axis] = length - array.shape[axis]
56
- array = torch.nn.functional.pad(array, tuple(pad_widths[::-1]))
57
-
58
- return array
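A minimal sketch of the two helpers in the deleted `audio.py`, assuming a local `audio.mp3`; the 30-second target length mirrors the Whisper chunk size but is stated here as an assumption.

```python
# Decode to a mono 16 kHz float32 tensor, then pad or trim to exactly 30 s of samples.
waveform = decode_audio("audio.mp3", sampling_rate=16000)
chunk = pad_or_trim(waveform, length=16000 * 30)
print(waveform.shape, chunk.shape)
```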
 
 
whisper_pipeline/faster-whisper-main/faster_whisper/feature_extractor.py DELETED
@@ -1,114 +0,0 @@
1
- import torch
2
-
3
-
4
- # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py # noqa: E501
5
- class FeatureExtractor:
6
- def __init__(
7
- self,
8
- device: str = "auto",
9
- feature_size=80,
10
- sampling_rate=16000,
11
- hop_length=160,
12
- chunk_length=30,
13
- n_fft=400,
14
- ):
15
- if device == "auto":
16
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
17
- else:
18
- self.device = device
19
- self.n_fft = n_fft
20
- self.hop_length = hop_length
21
- self.chunk_length = chunk_length
22
- self.n_samples = chunk_length * sampling_rate
23
- self.nb_max_frames = self.n_samples // hop_length
24
- self.time_per_frame = hop_length / sampling_rate
25
- self.sampling_rate = sampling_rate
26
- self.mel_filters = self.get_mel_filters(
27
- sampling_rate, n_fft, n_mels=feature_size
28
- )
29
-
30
- @staticmethod
31
- def get_mel_filters(sr, n_fft, n_mels=128):
32
- """
33
- Implementation of librosa.filters.mel in Pytorch
34
- """
35
- # Initialize the weights
36
- n_mels = int(n_mels)
37
-
38
- # Center freqs of each FFT bin
39
- fftfreqs = torch.fft.rfftfreq(n=n_fft, d=1.0 / sr)
40
-
41
- # 'Center freqs' of mel bands - uniformly spaced between limits
42
- min_mel = 0.0
43
- max_mel = 45.245640471924965
44
-
45
- mels = torch.linspace(min_mel, max_mel, n_mels + 2)
46
-
47
- # Fill in the linear scale
48
- f_min = 0.0
49
- f_sp = 200.0 / 3
50
- freqs = f_min + f_sp * mels
51
-
52
- # And now the nonlinear scale
53
- min_log_hz = 1000.0 # beginning of log region (Hz)
54
- min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
55
- logstep = torch.log(torch.tensor(6.4)) / 27.0 # step size for log region
56
-
57
- # If we have vector data, vectorize
58
- log_t = mels >= min_log_mel
59
- freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel))
60
-
61
- mel_f = freqs
62
-
63
- fdiff = torch.diff(mel_f)
64
- ramps = mel_f.view(-1, 1) - fftfreqs.view(1, -1)
65
-
66
- lower = -ramps[:-2] / fdiff[:-1].unsqueeze(1)
67
- upper = ramps[2:] / fdiff[1:].unsqueeze(1)
68
-
69
- # Intersect them with each other and zero, vectorized across all i
70
- weights = torch.maximum(torch.zeros_like(lower), torch.minimum(lower, upper))
71
-
72
- # Slaney-style mel is scaled to be approx constant energy per channel
73
- enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
74
- weights *= enorm.unsqueeze(1)
75
-
76
- return weights
77
-
78
- def __call__(self, waveform, padding=True, chunk_length=None, to_cpu=False):
79
- """
80
- Compute the log-Mel spectrogram of the provided audio.
81
- """
82
-
83
- if chunk_length is not None:
84
- self.n_samples = chunk_length * self.sampling_rate
85
- self.nb_max_frames = self.n_samples // self.hop_length
86
-
87
- if waveform.dtype is not torch.float32:
88
- waveform = waveform.to(torch.float32)
89
-
90
- waveform = (
91
- waveform.to(self.device)
92
- if self.device == "cuda" and not waveform.is_cuda
93
- else waveform
94
- )
95
-
96
- if padding:
97
- waveform = torch.nn.functional.pad(waveform, (0, self.n_samples))
98
-
99
- window = torch.hann_window(self.n_fft).to(waveform.device)
100
-
101
- stft = torch.stft(
102
- waveform, self.n_fft, self.hop_length, window=window, return_complex=True
103
- )
104
- magnitudes = stft[..., :-1].abs() ** 2
105
-
106
- mel_spec = self.mel_filters.to(waveform.device) @ magnitudes
107
-
108
- log_spec = torch.clamp(mel_spec, min=1e-10).log10()
109
- log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
110
- log_spec = (log_spec + 4.0) / 4.0
111
-
112
- # When the model is running on multiple GPUs, the output should be moved
113
- # to the CPU since we don't know which GPU will handle the next job.
114
- return log_spec.cpu() if to_cpu else log_spec
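Finally, a minimal sketch of exercising the deleted `FeatureExtractor` on CPU; the five seconds of silence stand in for real audio and are an assumption for illustration.

```python
import torch

extractor = FeatureExtractor(device="cpu", feature_size=80)

# Five seconds of silence as a stand-in waveform (16 kHz mono, float32).
waveform = torch.zeros(16000 * 5)

# Returns a log-Mel spectrogram of shape (80, n_frames), padded to the 30 s window.
features = extractor(waveform, padding=True)
print(features.shape)
```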