Fedir Zadniprovskyi committed · Commit c8f37a4 · 1 Parent(s): 188bcec

feat: allow using any ctranslate2 compatible model #14
Files changed:
- Dockerfile.cpu +1 -1
- Dockerfile.cuda +1 -1
- README.md +5 -5
- examples/live-audio/script.sh +1 -1
- examples/youtube/script.sh +2 -2
- faster_whisper_server/config.py +9 -24
- faster_whisper_server/main.py +7 -7
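This commit drops the hard-coded `Model` enum in favor of plain strings, so any CTranslate2-compatible model hosted on Hugging Face can be requested. A minimal sketch of what that enables through the OpenAI-compatible API, mirroring the README examples changed below (the model name is illustrative):

```python
from openai import OpenAI

# The server speaks the OpenAI audio API, so the stock SDK works against it.
client = OpenAI(api_key="cant-be-empty", base_url="http://localhost:8000/v1/")

with open("audio.wav", "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        # Any CTranslate2-compatible Hugging Face repo ID is accepted now,
        # not just the names that were baked into the removed `Model` enum.
        model="Systran/faster-whisper-tiny.en",
        file=audio_file,
    )
print(transcript.text)
```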
Dockerfile.cpu
CHANGED
@@ -15,7 +15,7 @@ RUN poetry install --only main
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=medium.en
+ENV WHISPER_MODEL=Systran/faster-whisper-medium.en
 ENV WHISPER_INFERENCE_DEVICE=cpu
 ENV WHISPER_COMPUTE_TYPE=int8
 ENV UVICORN_HOST=0.0.0.0
Dockerfile.cuda
CHANGED
@@ -15,7 +15,7 @@ RUN poetry install --only main
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=distil-large-v3
+ENV WHISPER_MODEL=Systran/faster-distil-whisper-large-v3
 ENV WHISPER_INFERENCE_DEVICE=cuda
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
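Both images switch the default from a bare size alias to a fully qualified Hugging Face repo ID, which works because `faster_whisper.WhisperModel` resolves repo IDs (or local paths) directly. A sketch of the equivalent direct call, using the CPU image's defaults:

```python
from faster_whisper import WhisperModel

# WhisperModel downloads the CTranslate2 conversion into the Hugging Face
# cache on first use, which is why the examples mount ~/.cache/huggingface.
model = WhisperModel(
    "Systran/faster-whisper-medium.en",  # WHISPER_MODEL
    device="cpu",                        # WHISPER_INFERENCE_DEVICE
    compute_type="int8",                 # WHISPER_COMPUTE_TYPE
)
segments, _info = model.transcribe("audio.wav", language="en")
print("".join(segment.text for segment in segments))
```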
README.md
CHANGED
@@ -38,9 +38,9 @@ export OPENAI_API_KEY="cant-be-empty"
 export OPENAI_BASE_URL=http://localhost:8000/v1/
 ```
 ```bash
-openai api audio.transcriptions.create -m distil-large-v3 -f audio.wav --response-format text
+openai api audio.transcriptions.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format text
 
-openai api audio.translations.create -m distil-large-v3 -f audio.wav --response-format verbose_json
+openai api audio.translations.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format verbose_json
 ```
 ### OpenAI API Python SDK
 ```python
@@ -50,7 +50,7 @@ client = OpenAI(api_key="cant-be-empty", base_url="http://localhost:8000/v1/")
 
 audio_file = open("audio.wav", "rb")
 transcript = client.audio.transcriptions.create(
-    model="distil-large-v3", file=audio_file
+    model="Systran/faster-distil-whisper-large-v3", file=audio_file
 )
 print(transcript.text)
 ```
@@ -61,9 +61,9 @@ print(transcript.text)
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.mp3"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true"
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "model=distil-large-v3"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "model=Systran/faster-distil-whisper-large-v3"
 # It's recommended that you always specify the language, as that will reduce the transcription time
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "language=en"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "language=en"
 
 curl http://localhost:8000/v1/audio/translations -F "file=@audio.wav"
 ```
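For clients that use neither the OpenAI SDK nor curl, the same multipart request is easy to reproduce; a sketch with `requests` (not part of the README, just an illustration of the endpoint above):

```python
import requests

# Equivalent of the curl example: multipart upload with optional
# "model" and "language" form fields.
with open("audio.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={
            "model": "Systran/faster-distil-whisper-large-v3",
            "language": "en",  # specifying the language reduces transcription time
        },
    )
response.raise_for_status()
print(response.json())
```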
examples/live-audio/script.sh
CHANGED
@@ -7,7 +7,7 @@ set -e
 # ffmpeg -y -hide_banner -loglevel quiet -i audio.mp3 -ac 1 -ar 16000 -f s16le -acodec pcm_s16le audio.pcm
 # rm -f audio.mp3
 
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU, for faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU, for faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready, or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
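The comment above suggests polling `curl localhost:8000/health` until the server is ready; a small Python sketch of that wait loop (the 60-second timeout is an assumption, not a server constant):

```python
import time
import urllib.error
import urllib.request

def wait_until_ready(url: str = "http://localhost:8000/health", timeout: float = 60.0) -> None:
    """Poll the health endpoint until the model is downloaded and loaded."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url) as response:
                if response.status == 200:
                    return
        except (urllib.error.URLError, ConnectionError):
            pass  # server not accepting connections yet
        time.sleep(1.0)
    raise TimeoutError(f"{url} did not become healthy within {timeout:.0f}s")

wait_until_ready()
```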
examples/youtube/script.sh
CHANGED
@@ -3,7 +3,7 @@
 set -e
 
 # NOTE: do not use any distil-* model other than the large ones, as they don't work on long audio files for some reason.
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU, for faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU, for faster inference.
 
 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check if the server is ready, or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
@@ -13,7 +13,7 @@ docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface
 # Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by the Asianometry YouTube channel. I highly recommend checking this channel out; the guy produces very high quality content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'
 
-# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `distil-large-v3` takes ~30 seconds on an Nvidia L4. `tiny.en` takes ~1 minute on a Ryzen 7 7700X. The .txt file in the example was transcribed using `distil-large-v3`.
+# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `Systran/faster-distil-whisper-large-v3` takes ~30 seconds on an Nvidia L4. `Systran/faster-whisper-tiny.en` takes ~1 minute on a Ryzen 7 7700X. The .txt file in the example was transcribed using `Systran/faster-distil-whisper-large-v3`.
 curl -s http://localhost:8000/v1/audio/transcriptions -F "file=@the-evolution-of-the-operating-system.mp3" -F "stream=true" -F "language=en" -F "response_format=text" | tee the-evolution-of-the-operating-system.txt
 
 # Here I'm using `aichat`, which is a CLI LLM client. You could use any other client that supports attaching/uploading files. https://github.com/sigoden/aichat
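The `curl ... | tee` pipeline above can also be reproduced in Python; a sketch with `requests` (this assumes the `stream=true` response arrives as plain text chunks, as the `response_format=text` flag suggests):

```python
import requests

# Stream the transcription to the terminal while saving it to a file,
# mirroring `curl -s ... | tee the-evolution-of-the-operating-system.txt`.
with open("the-evolution-of-the-operating-system.mp3", "rb") as f:
    response = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={"stream": "true", "language": "en", "response_format": "text"},
        stream=True,
    )
response.raise_for_status()
with open("the-evolution-of-the-operating-system.txt", "w") as out:
    for chunk in response.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)  # echo to the terminal
        out.write(chunk)                  # ...and save to a file
```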
faster_whisper_server/config.py
CHANGED
@@ -46,26 +46,6 @@ class ResponseFormat(enum.StrEnum):
 # I see a lot of equivalence between this new LLM OS and operating systems of today.
 
 
-# https://huggingface.co/Systran
-class Model(enum.StrEnum):
-    TINY_EN = "tiny.en"
-    TINY = "tiny"
-    BASE_EN = "base.en"
-    BASE = "base"
-    SMALL_EN = "small.en"
-    SMALL = "small"
-    MEDIUM_EN = "medium.en"
-    MEDIUM = "medium"
-    LARGE = "large"
-    LARGE_V1 = "large-v1"
-    LARGE_V2 = "large-v2"
-    LARGE_V3 = "large-v3"
-    DISTIL_SMALL_EN = "distil-small.en"
-    DISTIL_MEDIUM_EN = "distil-medium.en"
-    DISTIL_LARGE_V2 = "distil-large-v2"
-    DISTIL_LARGE_V3 = "distil-large-v3"
-
-
 class Device(enum.StrEnum):
     CPU = "cpu"
     CUDA = "cuda"
@@ -189,7 +169,12 @@ class Language(enum.StrEnum):
 
 
 class WhisperConfig(BaseModel):
-    model: Model = Field(default=Model.MEDIUM_EN)
+    model: str = Field(default="Systran/faster-whisper-medium.en")
+    """
+    Hugging Face model to use for transcription. Note: the model must support being run with CTranslate2.
+    Models created by the authors of `faster-whisper` can be found at https://huggingface.co/Systran
+    You can find other supported models at https://huggingface.co/models?p=2&sort=trending&search=ctranslate2 and https://huggingface.co/models?sort=trending&search=ct2
+    """
     inference_device: Device = Field(default=Device.AUTO)
     compute_type: Quantization = Field(default=Quantization.DEFAULT)
 
@@ -209,21 +194,21 @@ class Config(BaseSettings):
     default_response_format: ResponseFormat = ResponseFormat.JSON
     whisper: WhisperConfig = WhisperConfig()
     max_models: int = 1
+    max_no_data_seconds: float = 1.0
     """
     Max duration to wait for the next audio chunk before transcription is finalized and the connection is closed.
     """
-    max_no_data_seconds: float = 1.0
     min_duration: float = 1.0
     word_timestamp_error_margin: float = 0.2
+    max_inactivity_seconds: float = 5.0
     """
     Max allowed audio duration without any speech being detected before transcription is finalized and the connection is closed.
     """
-    max_inactivity_seconds: float = 5.0
+    inactivity_window_seconds: float = 10.0
     """
     Controls how many of the latest seconds of audio are being passed through VAD.
     Should be greater than `max_inactivity_seconds`.
     """
-    inactivity_window_seconds: float = 10.0
 
 
 config = Config()
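Since `model` is now a plain `str` field, the config no longer rejects repo IDs outside the old enum; a quick illustrative construction (values are examples, not the defaults):

```python
from faster_whisper_server.config import Device, WhisperConfig

# The removed `Model` enum no longer constrains this field: any string,
# e.g. a CTranslate2-converted Hugging Face repo ID, passes validation.
whisper_config = WhisperConfig(
    model="Systran/faster-whisper-tiny.en",
    inference_device=Device.CPU,
)
print(whisper_config.model)  # Systran/faster-whisper-tiny.en
```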
faster_whisper_server/main.py
CHANGED
@@ -26,7 +26,6 @@ from faster_whisper_server.audio import AudioStream, audio_samples_from_file
 from faster_whisper_server.config import (
     SAMPLES_PER_SECOND,
     Language,
-    Model,
     ResponseFormat,
     config,
 )
@@ -37,10 +36,10 @@ from faster_whisper_server.server_models import (
 )
 from faster_whisper_server.transcriber import audio_transcriber
 
-models: OrderedDict[Model, WhisperModel] = OrderedDict()
+models: OrderedDict[str, WhisperModel] = OrderedDict()
 
 
-def load_model(model_name: Model) -> WhisperModel:
+def load_model(model_name: str) -> WhisperModel:
     if model_name in models:
         logger.debug(f"{model_name} model already loaded")
         return models[model_name]
@@ -50,8 +49,9 @@ def load_model(model_name: Model) -> WhisperModel:
         f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
     )
     del models[oldest_model_name]
-    logger.debug(f"Loading {model_name}")
+    logger.debug(f"Loading {model_name}...")
     start = time.perf_counter()
+    # NOTE: will raise an exception if the model name isn't valid
     whisper = WhisperModel(
         model_name,
         device=config.whisper.inference_device,
@@ -84,7 +84,7 @@ def health() -> Response:
 @app.post("/v1/audio/translations")
 def translate_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
@@ -135,7 +135,7 @@ def translate_file(
 @app.post("/v1/audio/transcriptions")
 def transcribe_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     language: Annotated[Language | None, Form()] = config.default_language,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
@@ -235,7 +235,7 @@ async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
 @app.websocket("/v1/audio/transcriptions")
 async def transcribe_stream(
     ws: WebSocket,
-    model: Annotated[Model, Query()] = config.whisper.model,
+    model: Annotated[str, Query()] = config.whisper.model,
     language: Annotated[Language | None, Query()] = config.default_language,
     response_format: Annotated[
         ResponseFormat, Query()
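The model cache in `load_model` is an `OrderedDict` capped at `config.max_models`, evicting the oldest entry first. A self-contained sketch of that eviction logic with a stub in place of `WhisperModel`:

```python
from collections import OrderedDict

MAX_MODELS = 1  # mirrors config.max_models

models: OrderedDict[str, object] = OrderedDict()

def load_model(model_name: str) -> object:
    """Return a cached model, evicting the oldest one when the cap is hit."""
    if model_name in models:
        return models[model_name]
    if len(models) >= MAX_MODELS:
        oldest_model_name, _ = models.popitem(last=False)  # FIFO eviction
        print(f"Max models ({MAX_MODELS}) reached. Unloading: {oldest_model_name}")
    # Stand-in for WhisperModel(model_name, ...); the real call raises if the
    # name isn't a valid CTranslate2 model.
    models[model_name] = f"<loaded {model_name}>"
    return models[model_name]

load_model("Systran/faster-whisper-tiny.en")
load_model("Systran/faster-distil-whisper-large-v3")  # evicts tiny.en
```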