Fedir Zadniprovskyi committed
Commit c8f37a4 · 1 Parent(s): 188bcec

feat: allow using any ctranslate2 compatible model #14

Dockerfile.cpu CHANGED
@@ -15,7 +15,7 @@ RUN poetry install --only main
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=medium.en
+ENV WHISPER_MODEL=Systran/faster-whisper-medium.en
 ENV WHISPER_INFERENCE_DEVICE=cpu
 ENV WHISPER_COMPUTE_TYPE=int8
 ENV UVICORN_HOST=0.0.0.0
Dockerfile.cuda CHANGED
@@ -15,7 +15,7 @@ RUN poetry install --only main
 COPY ./faster_whisper_server ./faster_whisper_server
 ENTRYPOINT ["poetry", "run"]
 CMD ["uvicorn", "faster_whisper_server.main:app"]
-ENV WHISPER_MODEL=distil-large-v3
+ENV WHISPER_MODEL=Systran/faster-distil-whisper-large-v3
 ENV WHISPER_INFERENCE_DEVICE=cuda
 ENV UVICORN_HOST=0.0.0.0
 ENV UVICORN_PORT=8000
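
Both Dockerfiles now default to fully qualified Hugging Face repo ids rather than bare model aliases, which is what lets `WHISPER_MODEL` point at any CTranslate2-compatible model. As a minimal sketch of what "CTranslate2 compatible" means, the snippet below converts a stock Transformers Whisper checkpoint into a CTranslate2 model directory; the checkpoint id and output path are illustrative assumptions, not part of this commit.

```python
# Sketch: producing a CTranslate2-compatible Whisper model that the
# free-form WHISPER_MODEL setting could point at. Assumes `ctranslate2`
# and `transformers` are installed; the checkpoint id and output
# directory below are examples, not part of this commit.
import ctranslate2.converters

converter = ctranslate2.converters.TransformersConverter(
    "openai/whisper-tiny.en",  # any Hugging Face Whisper checkpoint
    copy_files=["tokenizer.json", "preprocessor_config.json"],
)
# int8 mirrors the WHISPER_COMPUTE_TYPE default in Dockerfile.cpu
converter.convert("whisper-tiny.en-ct2", quantization="int8")
```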
README.md CHANGED
@@ -38,9 +38,9 @@ export OPENAI_API_KEY="cant-be-empty"
 export OPENAI_BASE_URL=http://localhost:8000/v1/
 ```
 ```bash
-openai api audio.transcriptions.create -m distil-large-v3 -f audio.wav --response-format text
+openai api audio.transcriptions.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format text

-openai api audio.translations.create -m distil-large-v3 -f audio.wav --response-format verbose_json
+openai api audio.translations.create -m Systran/faster-distil-whisper-large-v3 -f audio.wav --response-format verbose_json
 ```
 ### OpenAI API Python SDK
 ```python
@@ -50,7 +50,7 @@ client = OpenAI(api_key="cant-be-empty", base_url="http://localhost:8000/v1/")

 audio_file = open("audio.wav", "rb")
 transcript = client.audio.transcriptions.create(
-    model="distil-large-v3", file=audio_file
+    model="Systran/faster-distil-whisper-large-v3", file=audio_file
 )
 print(transcript.text)
 ```
@@ -61,9 +61,9 @@ print(transcript.text)
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.mp3"
 curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true"
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "model=Systran/faster-distil-whisper-large-v3"
 # It's recommended that you always specify the language, as that will reduce the transcription time
-curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "stream=true" -F "model=distil-large-v3" -F "language=en"
+curl http://localhost:8000/v1/audio/transcriptions -F "file=@audio.wav" -F "language=en"

 curl http://localhost:8000/v1/audio/translations -F "file=@audio.wav"
 ```
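
The README's Python SDK example only covers transcription; a minimal sketch of the matching translations call is below, assuming the same local server and the OpenAI Python SDK shown above.

```python
# Sketch: the translations endpoint via the OpenAI Python SDK, mirroring
# the curl example above. Assumes faster-whisper-server is running on
# localhost:8000 as set up earlier in the README.
from openai import OpenAI

client = OpenAI(api_key="cant-be-empty", base_url="http://localhost:8000/v1/")

with open("audio.wav", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="Systran/faster-distil-whisper-large-v3", file=audio_file
    )
print(translation.text)
```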
examples/live-audio/script.sh CHANGED
@@ -7,7 +7,7 @@ set -e
 # ffmpeg -y -hide_banner -loglevel quiet -i audio.mp3 -ac 1 -ar 16000 -f s16le -acodec pcm_s16le audio.pcm
 # rm -f audio.mp3

-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.

 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check whether the server is ready, or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
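
The comment above suggests polling `curl localhost:8000/health` until the model has been downloaded and loaded; a small stdlib-only Python equivalent is sketched below, with an assumed 60-second timeout that this commit does not itself define.

```python
# Sketch: wait for the server's /health endpoint (mentioned in the script
# comment above) to report ready before streaming audio. Standard library
# only; the 60 s timeout is an arbitrary choice.
import time
import urllib.request

def wait_until_ready(url: str = "http://localhost:8000/health", timeout: float = 60.0) -> None:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url) as response:
                if response.status == 200:
                    return
        except OSError:
            pass  # server still starting or model still downloading
        time.sleep(1.0)
    raise TimeoutError(f"server at {url} did not become ready in {timeout} s")

wait_until_ready()
```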
examples/youtube/script.sh CHANGED
@@ -3,7 +3,7 @@
 set -e

 # NOTE: do not use any distil-* model other than the large ones, as they don't work on long audio files for some reason.
-export WHISPER_MODEL=distil-large-v3 # or tiny.en if you are running on a CPU for faster inference.
+export WHISPER_MODEL=Systran/faster-distil-whisper-large-v3 # or Systran/faster-whisper-tiny.en if you are running on a CPU for faster inference.

 # Ensure you have `faster-whisper-server` running. If this is your first time running it, expect to wait up to a minute for the model to be downloaded and loaded into memory. You can run `curl localhost:8000/health` to check whether the server is ready, or watch the logs with `docker logs -f <container_id>`.
 docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface:/root/.cache/huggingface --env WHISPER_MODEL=$WHISPER_MODEL fedirz/faster-whisper-server:0.1-cuda
@@ -13,7 +13,7 @@ docker run --detach --gpus=all --publish 8000:8000 --volume ~/.cache/huggingface
 # Download the audio from a YouTube video. In this example I'm downloading "The Evolution of the Operating System" by the Asianometry YouTube channel. I highly recommend checking this channel out; the guy produces very high-quality content. If you don't have `youtube-dl`, you'll have to install it. https://github.com/ytdl-org/youtube-dl
 youtube-dl --extract-audio --audio-format mp3 -o the-evolution-of-the-operating-system.mp3 'https://www.youtube.com/watch?v=1lG7lFLXBIs'

-# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `distil-large-v3` takes ~30 seconds on an Nvidia L4. `tiny.en` takes ~1 minute on a Ryzen 7 7700X. The .txt file in the example was transcribed using `distil-large-v3`.
+# Make a request to the API to transcribe the audio. The response will be streamed to the terminal and saved to a file. The video is 30 minutes long, so it might take a while to transcribe, especially if you are running this on a CPU. `Systran/faster-distil-whisper-large-v3` takes ~30 seconds on an Nvidia L4. `Systran/faster-whisper-tiny.en` takes ~1 minute on a Ryzen 7 7700X. The .txt file in the example was transcribed using `Systran/faster-distil-whisper-large-v3`.
 curl -s http://localhost:8000/v1/audio/transcriptions -F "file=@the-evolution-of-the-operating-system.mp3" -F "stream=true" -F "language=en" -F "response_format=text" | tee the-evolution-of-the-operating-system.txt

 # Here I'm using `aichat`, which is a CLI LLM client. You could use any other client that supports attaching/uploading files. https://github.com/sigoden/aichat
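
For completeness, a hedged Python version of the transcription request above, using the third-party `requests` package; streaming is omitted here because the exact streaming wire format isn't specified in this diff.

```python
# Sketch: the same multipart transcription request as the curl command
# above, via `requests`. Field names (file, language, response_format)
# come from the curl examples; stream=true is left out.
import requests

with open("the-evolution-of-the-operating-system.mp3", "rb") as f:
    response = requests.post(
        "http://localhost:8000/v1/audio/transcriptions",
        files={"file": f},
        data={"language": "en", "response_format": "text"},
    )
response.raise_for_status()
print(response.text)
```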
faster_whisper_server/config.py CHANGED
@@ -46,26 +46,6 @@ class ResponseFormat(enum.StrEnum):
 # I see a lot of equivalence between this new LLM OS and operating systems of today.


-# https://huggingface.co/Systran
-class Model(enum.StrEnum):
-    TINY_EN = "tiny.en"
-    TINY = "tiny"
-    BASE_EN = "base.en"
-    BASE = "base"
-    SMALL_EN = "small.en"
-    SMALL = "small"
-    MEDIUM_EN = "medium.en"
-    MEDIUM = "medium"
-    LARGE = "large"
-    LARGE_V1 = "large-v1"
-    LARGE_V2 = "large-v2"
-    LARGE_V3 = "large-v3"
-    DISTIL_SMALL_EN = "distil-small.en"
-    DISTIL_MEDIUM_EN = "distil-medium.en"
-    DISTIL_LARGE_V2 = "distil-large-v2"
-    DISTIL_LARGE_V3 = "distil-large-v3"
-
-
 class Device(enum.StrEnum):
     CPU = "cpu"
     CUDA = "cuda"
@@ -189,7 +169,12 @@ class Language(enum.StrEnum):


 class WhisperConfig(BaseModel):
-    model: Model = Field(default=Model.MEDIUM_EN)
+    model: str = Field(default="Systran/faster-whisper-medium.en")
+    """
+    Hugging Face model to use for transcription. Note that the model must support being run with CTranslate2.
+    Models created by the authors of `faster-whisper` can be found at https://huggingface.co/Systran
+    You can find other supported models at https://huggingface.co/models?p=2&sort=trending&search=ctranslate2 and https://huggingface.co/models?sort=trending&search=ct2
+    """
     inference_device: Device = Field(default=Device.AUTO)
     compute_type: Quantization = Field(default=Quantization.DEFAULT)

@@ -209,21 +194,21 @@ class Config(BaseSettings):
     default_response_format: ResponseFormat = ResponseFormat.JSON
     whisper: WhisperConfig = WhisperConfig()
     max_models: int = 1
+    max_no_data_seconds: float = 1.0
     """
     Max duration to wait for the next audio chunk before transcription is finalized and the connection is closed.
     """
-    max_no_data_seconds: float = 1.0
     min_duration: float = 1.0
     word_timestamp_error_margin: float = 0.2
+    max_inactivity_seconds: float = 5.0
     """
     Max allowed audio duration without any speech being detected before transcription is finalized and the connection is closed.
     """
-    max_inactivity_seconds: float = 5.0
+    inactivity_window_seconds: float = 10.0
     """
     Controls how many of the latest seconds of audio are passed through VAD.
     Should be greater than `max_inactivity_seconds`.
     """
-    inactivity_window_seconds: float = 10.0


 config = Config()
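
The practical effect of replacing the `Model` enum with a plain `str` field is that configuration validation no longer restricts the model name; bad names now surface only when `WhisperModel` tries to load them. A minimal sketch of the loosened validation, assuming only `pydantic` and a hypothetical repo id:

```python
# Minimal sketch of the change: `model` is now a free-form string, so any
# Hugging Face repo id (or local path) passes config validation.
# "my-org/whisper-finetune-ct2" is a hypothetical repo id; an invalid name
# only fails later, when faster_whisper.WhisperModel tries to load it.
from pydantic import BaseModel, Field

class WhisperConfig(BaseModel):
    model: str = Field(default="Systran/faster-whisper-medium.en")

print(WhisperConfig().model)  # default: Systran/faster-whisper-medium.en
print(WhisperConfig(model="my-org/whisper-finetune-ct2").model)  # any string is accepted
```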
faster_whisper_server/main.py CHANGED
@@ -26,7 +26,6 @@ from faster_whisper_server.audio import AudioStream, audio_samples_from_file
 from faster_whisper_server.config import (
     SAMPLES_PER_SECOND,
     Language,
-    Model,
     ResponseFormat,
     config,
 )
@@ -37,10 +36,10 @@ from faster_whisper_server.server_models import (
 )
 from faster_whisper_server.transcriber import audio_transcriber

-models: OrderedDict[Model, WhisperModel] = OrderedDict()
+models: OrderedDict[str, WhisperModel] = OrderedDict()


-def load_model(model_name: Model) -> WhisperModel:
+def load_model(model_name: str) -> WhisperModel:
     if model_name in models:
         logger.debug(f"{model_name} model already loaded")
         return models[model_name]
@@ -50,8 +49,9 @@ def load_model(model_name: Model) -> WhisperModel:
             f"Max models ({config.max_models}) reached. Unloading the oldest model: {oldest_model_name}"
         )
         del models[oldest_model_name]
-    logger.debug(f"Loading {model_name}")
+    logger.debug(f"Loading {model_name}...")
     start = time.perf_counter()
+    # NOTE: will raise an exception if the model name isn't valid
     whisper = WhisperModel(
         model_name,
         device=config.whisper.inference_device,
@@ -84,7 +84,7 @@ def health() -> Response:
 @app.post("/v1/audio/translations")
 def translate_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
     temperature: Annotated[float, Form()] = 0.0,
@@ -135,7 +135,7 @@ def translate_file(
 @app.post("/v1/audio/transcriptions")
 def transcribe_file(
     file: Annotated[UploadFile, Form()],
-    model: Annotated[Model, Form()] = config.whisper.model,
+    model: Annotated[str, Form()] = config.whisper.model,
     language: Annotated[Language | None, Form()] = config.default_language,
     prompt: Annotated[str | None, Form()] = None,
     response_format: Annotated[ResponseFormat, Form()] = config.default_response_format,
@@ -235,7 +235,7 @@ async def audio_receiver(ws: WebSocket, audio_stream: AudioStream) -> None:
 @app.websocket("/v1/audio/transcriptions")
 async def transcribe_stream(
     ws: WebSocket,
-    model: Annotated[Model, Query()] = config.whisper.model,
+    model: Annotated[str, Query()] = config.whisper.model,
     language: Annotated[Language | None, Query()] = config.default_language,
     response_format: Annotated[
         ResponseFormat, Query()
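
Since `models` is now keyed by the raw string, the `max_models` eviction logic in `load_model` is otherwise unchanged. Below is a self-contained sketch of that cache behavior; `DummyModel` is a stand-in for `faster_whisper.WhisperModel`, and `MAX_MODELS` mirrors the `max_models: int = 1` default from config.py.

```python
# Self-contained sketch of main.py's model cache with string keys:
# the oldest entry is evicted once `max_models` is reached.
from collections import OrderedDict

MAX_MODELS = 1

class DummyModel:
    def __init__(self, name: str) -> None:
        self.name = name

models: OrderedDict[str, DummyModel] = OrderedDict()

def load_model(model_name: str) -> DummyModel:
    if model_name in models:
        return models[model_name]  # already loaded
    if len(models) >= MAX_MODELS:
        oldest_model_name = next(iter(models))
        del models[oldest_model_name]  # unload the oldest model
    # The real code constructs WhisperModel here, which raises if the
    # name doesn't resolve to a valid CTranslate2 model.
    models[model_name] = DummyModel(model_name)
    return models[model_name]

load_model("Systran/faster-whisper-tiny.en")
load_model("Systran/faster-distil-whisper-large-v3")  # evicts tiny.en
print(list(models))  # ['Systran/faster-distil-whisper-large-v3']
```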