Fedir Zadniprovskyi committed
Commit c8f37a4 · 1 Parent(s): 188bcec

feat: allow using any ctranslate2 compatible model #14

Dockerfile.cpu CHANGED
@@ -15,7 +15,7 @@ RUN poetry install --only main
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=medium.en
+ENV WHISPER_MODEL=Systran/faster-whisper-medium.en
 ENV WHISPER_INFERENCE_DEVICE=cpu
 ENV WHISPER_COMPUTE_TYPE=int8
 ENV UVICORN_HOST=0.0.0.0
Dockerfile.cuda CHANGED
@@ -15,7 +15,7 @@ RUN poetry install --only main
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=distil-large-v3
+ENV WHISPER_MODEL=Systran/faster-distil-whisper-large-v3
 ENV WHISPER_INFERENCE_DEVICE=cuda
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
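
Both Dockerfiles now default to fully qualified Hugging Face repo ids rather than bare model aliases, which is what lets `WHISPER_MODEL` point at any CTranslate2-compatible model. As a minimal sketch of what "CTranslate2 compatible" means, the snippet below converts a stock Transformers Whisper checkpoint into a CTranslate2 model directory; the checkpoint id and output path are illustrative assumptions, not part of this commit.

```python
# Sketch: producing a CTranslate2-compatible Whisper model that the
# free-form WHISPER_MODEL setting could point at. Assumes `ctranslate2`
# and `transformers` are installed; the checkpoint id and output
# directory below are examples, not part of this commit.
import ctranslate2.converters

converter = ctranslate2.converters.TransformersConverter(
    "openai/whisper-tiny.en",  # any Hugging Face Whisper checkpoint
    copy_files=["tokenizer.json", "preprocessor_config.json"],
)
# int8 mirrors the WHISPER_COMPUTE_TYPE default in Dockerfile.cpu
converter.convert("whisper-tiny.en-ct2", quantization="int8")
```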
README.md CHANGED
@@ -38,9 +38,9 @@ export OPENAI_API_KEY="cant-be-empty"
 export OPENAI_BASE_URL=http://localhost:8000/v1/
 ```
 ```bash
-openai api audio.transcriptions.create -m distil-large-v3 -f audio.wav --response-format text
+openai api audio.transcriptions.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format text

-openai api audio.translations.create -m distil-large-v3 -f audio.wav --response-format verbose_json
+openai api audio.translations.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format verbose_json
 ```
 ### OpenAI API Python SDK
 ```python
@@ -50,7 +50,7 @@ client = OpenAI(api_key="cant-be-empty", base_url="http://localhost:8000/v1/")

 audio_file = open("audio.wav", "rb")
 transcript = client.audio.transcriptions.create(
-    model="distil-large-v3", file=audio_file
+    model="Systran/faster-distil-whisper-large-v3", file=audio_file
 )
 print(transcript.text)
 ```
@@ -61,9 +61,9 @@ print(transcript.text)
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.mp3"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true"
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "model=Systran/faster-distil-whisper-large-v3"
 # It's recommended that you always specify the language, as that will reduce the transcription time
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3" -F "language=en"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "language=en"

 curl http://localhost:8000/v1/audio/translations -F "file=@audio.wav"
 ```
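
The README's Python SDK example only covers transcription; a minimal sketch of the matching translations call is below, assuming the same local server and the OpenAI Python SDK shown above.

```python
# Sketch: the translations endpoint via the OpenAI Python SDK, mirroring
# the curl example above. Assumes faster-whisper-server is running on
# localhost:8000 as set up earlier in the README.
from openai import OpenAI

client = OpenAI(api_key="cant-be-empty", base_url="http://localhost:8000/v1/")

with open("audio.wav", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="Systran/faster-distil-whisper-large-v3", file=audio_file
    )
print(translation.text)
```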
examples/live-audio/script.sh CHANGED
@@ -7,7 +7,7 @@ set -e
 # ffmpeg -y -hide_banner -loglevel quiet -i audio.mp3 -ac 1 -ar 16000 -f s16le -acodec pcm_s16le audio.pcm
 # rm -f audio.mp3

-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.

 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check whether the server is ready, or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
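
The comment above suggests polling `curl localhost:8000/health` until the model has been downloaded and loaded; a small stdlib-only Python equivalent is sketched below, with an assumed 60-second timeout that this commit does not itself define.

```python
# Sketch: wait for the server's /health endpoint (mentioned in the script
# comment above) to report ready before streaming audio. Standard library
# only; the 60 s timeout is an arbitrary choice.
import time
import urllib.request

def wait_until_ready(url: str = "http://localhost:8000/health", timeout: float = 60.0) -> None:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url) as response:
                if response.status == 200:
                    return
        except OSError:
            pass  # server still starting or model still downloading
        time.sleep(1.0)
    raise TimeoutError(f"server at {url} did not become ready in {timeout} s")

wait_until_ready()
```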
examples/youtube/script.sh CHANGED
@@ -3,7 +3,7 @@
 set -e

 # NOTE: do not use any distil-* model other than the large ones, as they don't work on long audio files for some reason.
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.

 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check whether the server is ready, or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
@@ -13,7 +13,7 @@ docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface
 # Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by the Asianometry YouTube channel. I highly recommend checking this channel out; the guy produces very high-quality content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'

-# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `distil-large-v3` takes ~30 seconds on an Nvidia L4. `tiny.en` takes ~1 minute on a Ryzen 7 7700X. The .txt file in the example was transcribed using `distil-large-v3`.
+# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `Systran/faster-distil-whisper-large-v3` takes ~30 seconds on an Nvidia L4. `Systran/faster-whisper-tiny.en` takes ~1 minute on a Ryzen 7 7700X. The .txt file in the example was transcribed using `Systran/faster-distil-whisper-large-v3`.
 curl -s http://localhost:8000/v1/audio/transcriptions -F "file=@the-evolution-of-the-operating-system.mp3" -F "stream=true" -F "language=en" -F "response_format=text" | tee the-evolution-of-the-operating-system.txt

 # Here I'm using `aichat`, which is a CLI LLM client. You could use any other client that supports attaching/uploading files. https://github.com/sigoden/aichat
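
For completeness, a hedged Python version of the transcription request above, using the third-party `requests` package; streaming is omitted here because the exact streaming wire format isn't specified in this diff.

```python
# Sketch: the same multipart transcription request as the curl command
# above, via `requests`. Field names (file, language, response_format)
# come from the curl examples; stream=true is left out.
import requests

with open("the-evolution-of-the-operating-system.mp3", "rb") as f:
    response = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={"language": "en", "response_format": "text"},
    )
response.raise_for_status()
print(response.text)
```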
faster_whisper_server/config.py CHANGED
@@ -46,26 +46,6 @@ class ResponseFormat(enum.StrEnum):
 # I see a lot of equivalence between this new LLM OS and operating systems of today.


-# https://huggingface.co/Systran
-class Model(enum.StrEnum):
-    TINY_EN = "tiny.en"
-    TINY = "tiny"
-    BASE_EN = "base.en"
-    BASE = "base"
-    SMALL_EN = "small.en"
-    SMALL = "small"
-    MEDIUM_EN = "medium.en"
-    MEDIUM = "medium"
-    LARGE = "large"
-    LARGE_V1 = "large-v1"
-    LARGE_V2 = "large-v2"
-    LARGE_V3 = "large-v3"
-    DISTIL_SMALL_EN = "distil-small.en"
-    DISTIL_MEDIUM_EN = "distil-medium.en"
-    DISTIL_LARGE_V2 = "distil-large-v2"
-    DISTIL_LARGE_V3 = "distil-large-v3"
-
-
 class Device(enum.StrEnum):
     CPU = "cpu"
     CUDA = "cuda"
@@ -189,7 +169,12 @@ class Language(enum.StrEnum):


 class WhisperConfig(BaseModel):
-    model: Model = Field(default=Model.MEDIUM_EN)
+    model: str = Field(default="Systran/faster-whisper-medium.en")
+    """
+    Hugging Face model to use for transcription. Note that the model must support being run with CTranslate2.
+    Models created by the authors of `faster-whisper` can be found at https://huggingface.co/Systran
+    You can find other supported models at https://huggingface.co/models?p=2&sort=trending&search=ctranslate2 and https://huggingface.co/models?sort=trending&search=ct2
+    """
     inference_device: Device = Field(default=Device.AUTO)
     compute_type: Quantization = Field(default=Quantization.DEFAULT)

@@ -209,21 +194,21 @@ class Config(BaseSettings):
     default_response_format: ResponseFormat = ResponseFormat.JSON
     whisper: WhisperConfig = WhisperConfig()
     max_models: int = 1
+    max_no_data_seconds: float = 1.0
     """
     Max duration to wait for the next audio chunk before transcription is finalized and the connection is closed.
     """
-    max_no_data_seconds: float = 1.0
     min_duration: float = 1.0
     word_timestamp_error_margin: float = 0.2
+    max_inactivity_seconds: float = 5.0
     """
     Max allowed audio duration without any speech being detected before transcription is finalized and the connection is closed.
     """
-    max_inactivity_seconds: float = 5.0
+    inactivity_window_seconds: float = 10.0
     """
     Controls how many of the latest seconds of audio are passed through VAD.
     Should be greater than `max_inactivity_seconds`.
     """
-    inactivity_window_seconds: float = 10.0


 config = Config()
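
The practical effect of replacing the `Model` enum with a plain `str` field is that configuration validation no longer restricts the model name; bad names now surface only when `WhisperModel` tries to load them. A minimal sketch of the loosened validation, assuming only `pydantic` and a hypothetical repo id:

```python
# Minimal sketch of the change: `model` is now a free-form string, so any
# Hugging Face repo id (or local path) passes config validation.
# "my-org/whisper-finetune-ct2" is a hypothetical repo id; an invalid name
# only fails later, when faster_whisper.WhisperModel tries to load it.
from pydantic import BaseModel, Field

class WhisperConfig(BaseModel):
    model: str = Field(default="Systran/faster-whisper-medium.en")

print(WhisperConfig().model)  # default: Systran/faster-whisper-medium.en
print(WhisperConfig(model="my-org/whisper-finetune-ct2").model)  # any string is accepted
```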
faster_whisper_server/main.py CHANGED
@@ -26,7 +26,6 @@ from faster_whisper_server.audio import AudioStream, audio_samples_from_file
 from faster_whisper_server.config import (
     SAMPLES_PER_SECOND,
     Language,
-    Model,
     ResponseFormat,
     config,
 )
@@ -37,10 +36,10 @@ from faster_whisper_server.server_models import (
 )
 from faster_whisper_server.transcriber import audio_transcriber

-models: OrderedDict[Model, WhisperModel] = OrderedDict()
+models: OrderedDict[str, WhisperModel] = OrderedDict()


-def load_model(model_name: Model) -> WhisperModel:
+def load_model(model_name: str) -> WhisperModel:
     if model_name in models:
         logger.debug(f"{model_name} model already loaded")
         return models[model_name]
@@ -50,8 +49,9 @@ def load_model(model_name: Model) -> WhisperModel:
             f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
         )
         del models[oldest_model_name]
-    logger.debug(f"Loading {model_name}")
+    logger.debug(f"Loading {model_name}...")
     start = time.perf_counter()
+    # NOTE: will raise an exception if the model name isn't valid
     whisper = WhisperModel(
         model_name,
         device=config.whisper.inference_device,
@@ -84,7 +84,7 @@ def health() -> Response:
 @app.post("/v1/audio/translations")
 def translate_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
@@ -135,7 +135,7 @@ def translate_file(
 @app.post("/v1/audio/transcriptions")
 def transcribe_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     language: Annotated[Language | None, Form()] = config.default_language,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
@@ -235,7 +235,7 @@ async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
 @app.websocket("/v1/audio/transcriptions")
 async def transcribe_stream(
     ws: WebSocket,
-    model: Annotated[Model, Query()] = config.whisper.model,
+    model: Annotated[str, Query()] = config.whisper.model,
     language: Annotated[Language | None, Query()] = config.default_language,
     response_format: Annotated[
         ResponseFormat, Query()
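
Since `models` is now keyed by the raw string, the `max_models` eviction logic in `load_model` is otherwise unchanged. Below is a self-contained sketch of that cache behavior; `DummyModel` is a stand-in for `faster_whisper.WhisperModel`, and `MAX_MODELS` mirrors the `max_models: int = 1` default from config.py.

```python
# Self-contained sketch of main.py's model cache with string keys:
# the oldest entry is evicted once `max_models` is reached.
from collections import OrderedDict

MAX_MODELS = 1

class DummyModel:
    def __init__(self, name: str) -> None:
        self.name = name

models: OrderedDict[str, DummyModel] = OrderedDict()

def load_model(model_name: str) -> DummyModel:
    if model_name in models:
        return models[model_name]  # already loaded
    if len(models) >= MAX_MODELS:
        oldest_model_name = next(iter(models))
        del models[oldest_model_name]  # unload the oldest model
    # The real code constructs WhisperModel here, which raises if the
    # name doesn't resolve to a valid CTranslate2 model.
    models[model_name] = DummyModel(model_name)
    return models[model_name]

load_model("Systran/faster-whisper-tiny.en")
load_model("Systran/faster-distil-whisper-large-v3")  # evicts tiny.en
print(list(models))  # ['Systran/faster-distil-whisper-large-v3']
```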