whisper-webui-translate

Running

App Files Community

avans06 committed on Oct 10, 2024

Commit

f96cbbf

1 Parent(s): 96b845f

Translation model incorporates Llama-3.1-8B-Instruct.

Browse files

Split a transcribed segment into shorter sub-segments when its duration is long (more than 10 seconds).

Files changed (7) hide show

app.py +25 -24
config.json5 +6 -0
requirements-fasterWhisper.txt +1 -2
requirements-whisper.txt +0 -1
requirements.txt +1 -2
src/utils.py +21 -0
src/vad.py +57 -13

app.py CHANGED Viewed

@@ -1085,12 +1085,12 @@ def create_ui(app_config: ApplicationConfig):
     }
     common_word_timestamps_inputs = lambda : {
-        gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps, elem_id="word_timestamps"),
-        gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words, elem_id="highlight_words"),
     }
     common_segments_filter_inputs = lambda : {
-        gr.Checkbox(label="Whisper Segments Filter", value=app_config.whisper_segments_filter, elem_id="whisperSegmentsFilter") if idx == 0 else
         gr.Text(label=f"Filter {idx}", value=filterStr, elem_id=f"whisperSegmentsFilter{idx}") for idx, filterStr in enumerate([""] + app_config.whisper_segments_filters)
     }
@@ -1101,10 +1101,10 @@ def create_ui(app_config: ApplicationConfig):
         app_config.diarization = False
     common_diarization_inputs = lambda : {
-        gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs, elem_id="diarization"),
-        gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs, elem_id="diarization_speakers"),
-        gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs, elem_id="diarization_min_speakers"),
-        gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs, elem_id="diarization_max_speakers")
     }
     common_output = lambda : [
@@ -1117,6 +1117,7 @@ def create_ui(app_config: ApplicationConfig):
     css = """
 .scroll-show textarea {
     overflow-y: auto !important;
 }
 .scroll-show textarea::-webkit-scrollbar {
     all: initial !important;
@@ -1191,29 +1192,29 @@ def create_ui(app_config: ApplicationConfig):
                             inputDict.update(common_word_timestamps_inputs())
                             if isFull:
                                 inputDict.update({
-                                    gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations, elem_id = "prepend_punctuations"),
-                                    gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations, elem_id = "append_punctuations")})
                         if isFull:
                             with gr.Accordion("Whisper Advanced options", open=False):
                                 inputDict.update({
-                                    gr.TextArea(label="Initial Prompt", elem_id = "initial_prompt"),
-                                    gr.Number(label="Temperature", value=app_config.temperature, elem_id = "temperature"),
-                                    gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of"),
-                                    gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size"),
-                                    gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience"),
-                                    gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty"),
-                                    gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens"),
-                                    gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text"),
-                                    gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16"),
-                                    gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback, elem_id = "temperature_increment_on_fallback"),
-                                    gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold, elem_id = "compression_ratio_threshold"),
-                                    gr.Number(label="Logprob threshold", value=app_config.logprob_threshold, elem_id = "logprob_threshold"),
-                                    gr.Number(label="No speech threshold", value=app_config.no_speech_threshold, elem_id = "no_speech_threshold"),
                                     })
                                 if app_config.whisper_implementation == "faster-whisper":
                                     inputDict.update({
-                                        gr.Number(label="Repetition Penalty", value=app_config.repetition_penalty, elem_id = "repetition_penalty"),
-                                        gr.Number(label="No Repeat Ngram Size", value=app_config.no_repeat_ngram_size, precision=0, elem_id = "no_repeat_ngram_size")
                                     })
                         with gr.Accordion("Whisper Segments Filter options", open=False):
                             inputDict.update(common_segments_filter_inputs())

     }
     common_word_timestamps_inputs = lambda : {
+        gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps, elem_id="word_timestamps", info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment."),
+        gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words, elem_id="highlight_words", info="if word_timestamps is True, underline each word as it is spoken in srt and vtt"),
     }
     common_segments_filter_inputs = lambda : {
+        gr.Checkbox(label="Whisper Segments Filter", value=app_config.whisper_segments_filter, elem_id="whisperSegmentsFilter", info="Filter the results of Whisper transcribe with the following conditions. It is recommended to enable this feature when using the large-v3 model to avoid hallucinations.") if idx == 0 else
         gr.Text(label=f"Filter {idx}", value=filterStr, elem_id=f"whisperSegmentsFilter{idx}") for idx, filterStr in enumerate([""] + app_config.whisper_segments_filters)
     }
         app_config.diarization = False
     common_diarization_inputs = lambda : {
+        gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs, elem_id="diarization", info="Whether to perform speaker diarization"),
+        gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs, elem_id="diarization_speakers", info="The number of speakers to detect"),
+        gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs, elem_id="diarization_min_speakers", info="The minimum number of speakers to detect"),
+        gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs, elem_id="diarization_max_speakers", info="The maximum number of speakers to detect")
     }
     common_output = lambda : [
     css = """
 .scroll-show textarea {
     overflow-y: auto !important;
+    scrollbar-width: auto !important;
 }
 .scroll-show textarea::-webkit-scrollbar {
     all: initial !important;
                             inputDict.update(common_word_timestamps_inputs())
                             if isFull:
                                 inputDict.update({
+                                    gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations, elem_id = "prepend_punctuations", info="if word_timestamps is True, merge these punctuation symbols with the next word"),
+                                    gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations, elem_id = "append_punctuations", info="if word_timestamps is True, merge these punctuation symbols with the previous word")})
                         if isFull:
                             with gr.Accordion("Whisper Advanced options", open=False):
                                 inputDict.update({
+                                    gr.TextArea(label="Initial Prompt", elem_id = "initial_prompt", info="Optional text to provide as a prompt for the first window"),
+                                    gr.Number(label="Temperature", value=app_config.temperature, elem_id = "temperature", info="Temperature to use for sampling"),
+                                    gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of", info="Number of candidates when sampling with non-zero temperature"),
+                                    gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size", info="Number of beams in beam search, only applicable when temperature is zero"),
+                                    gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience", info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
+                                    gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty", info="Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default"),
+                                    gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens", info="Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations"),
+                                    gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text", info="If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop"),
+                                    gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16", info="Whether to perform inference in fp16; True by default; It will be ignored in faster-whisper because it is already a quantized model."),
+                                    gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback, elem_id = "temperature_increment_on_fallback", info="Temperature to increase when falling back when the decoding fails to meet either of the thresholds below"),
+                                    gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold, elem_id = "compression_ratio_threshold", info="If the gzip compression ratio is higher than this value, treat the decoding as failed"),
+                                    gr.Number(label="Logprob threshold", value=app_config.logprob_threshold, elem_id = "logprob_threshold", info="If the average log probability is lower than this value, treat the decoding as failed"),
+                                    gr.Number(label="No speech threshold", value=app_config.no_speech_threshold, elem_id = "no_speech_threshold", info="If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence"),
                                     })
                                 if app_config.whisper_implementation == "faster-whisper":
                                     inputDict.update({
+                                        gr.Number(label="Repetition Penalty", value=app_config.repetition_penalty, elem_id = "repetition_penalty", info="[faster-whisper] The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0."),
+                                        gr.Number(label="No Repeat Ngram Size", value=app_config.no_repeat_ngram_size, precision=0, elem_id = "no_repeat_ngram_size", info="[faster-whisper] The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. If specified, it must be a positive integer greater than 1.")
                                     })
                         with gr.Accordion("Whisper Segments Filter options", open=False):
                             inputDict.update(common_segments_filter_inputs())

config.json5 CHANGED Viewed

@@ -294,6 +294,12 @@
       }
     ],
     "Llama": [
       {
         "name": "Meta-Llama-3-8B-Instruct-ct2-int8_float16/avan",
         "url": "avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16",

       }
     ],
     "Llama": [
+      {
+        "name": "Meta-Llama-3.1-8B-Instruct-ct2-int8_float16/avan",
+        "url": "avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16",
+        "type": "huggingface",
+        "tokenizer_url": "avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16"
+      },
       {
         "name": "Meta-Llama-3-8B-Instruct-ct2-int8_float16/avan",
         "url": "avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16",

requirements-fasterWhisper.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 transformers
 ctranslate2>=4.2.1
-faster-whisper>=1.0.1
 ffmpeg-python==0.2.0
 gradio==3.50.2
 yt-dlp
@@ -14,7 +14,6 @@ sentencepiece
 # Needed by diarization
 intervaltree
 srt
-torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 # Needed by ALMA-GPTQ

 transformers
 ctranslate2>=4.2.1
+faster-whisper>=1.0.2
 ffmpeg-python==0.2.0
 gradio==3.50.2
 yt-dlp
 # Needed by diarization
 intervaltree
 srt
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 # Needed by ALMA-GPTQ

requirements-whisper.txt CHANGED Viewed

@@ -13,7 +13,6 @@ sentencepiece
 # Needed by diarization
 intervaltree
 srt
-torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 # Needed by ALMA-GPTQ

 # Needed by diarization
 intervaltree
 srt
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 # Needed by ALMA-GPTQ

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 transformers
 ctranslate2>=4.2.1
-faster-whisper>=1.0.1
 ffmpeg-python==0.2.0
 gradio==3.50.2
 yt-dlp
@@ -14,7 +14,6 @@ sentencepiece
 # Needed by diarization
 intervaltree
 srt
-torch
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 # Needed by ALMA-GPTQ

 transformers
 ctranslate2>=4.2.1
+faster-whisper>=1.0.2
 ffmpeg-python==0.2.0
 gradio==3.50.2
 yt-dlp
 # Needed by diarization
 intervaltree
 srt
 https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
 # Needed by ALMA-GPTQ

src/utils.py CHANGED Viewed

@@ -298,6 +298,27 @@ def process_text(text: str, maxLineWidth=None):
     return '\n'.join(lines)
 def slugify(value, allow_unicode=False, is_lower=False):
     """
     Taken from https://github.com/django/django/blob/master/django/utils/text.py

     return '\n'.join(lines)
+def len_wide(text: str):
+    """
+    Return the width of ``text``, counting East Asian wide characters twice.
+
+    Each character is classified with unicodedata.east_asian_width. Characters
+    in the Wide ('W') or Fullwidth ('F') categories add 2 to the total; every
+    other category (A, H, N, Na) adds 1. An empty string yields 0.
+
+    NOTE(review): the original note says this replaces the textwrap.wrap
+    function; presumably it is meant for width-aware line splitting -- confirm
+    against the callers.
+
+    # East_Asian_Width (ea)
+    ea ; A         ; Ambiguous
+    ea ; F         ; Fullwidth
+    ea ; H         ; Halfwidth
+    ea ; N         ; Neutral
+    ea ; Na        ; Narrow
+    ea ; W         ; Wide
+    https://stackoverflow.com/a/31666966
+    """
+    width = 0
+    for char in text:
+        # 'W' and 'F' count as 2; all other categories, including Ambiguous, count as 1.
+        width += (1 if unicodedata.east_asian_width(char) not in {'W', 'F'} else 2)
+    return width
 def slugify(value, allow_unicode=False, is_lower=False):
     """
     Taken from https://github.com/django/django/blob/master/django/utils/text.py

src/vad.py CHANGED Viewed

@@ -26,7 +26,7 @@ import torch
 import ffmpeg
 import numpy as np
-from src.utils import format_timestamp
 from enum import Enum
 class NonSpeechStrategy(Enum):
@@ -405,21 +405,65 @@ class AbstractTranscription(ABC):
                 if (segment_start > max_source_time):
                     continue
                 segment_end = min(max_source_time, segment_end)
                 new_segment = segment.copy()
-            # Add to start and end
-            new_segment['start'] = segment_start + adjust_seconds
-            new_segment['end'] = segment_end + adjust_seconds
-            # Handle words
-            if ('words' in new_segment):
-                for word in new_segment['words']:
                     # Adjust start and end
-                    word['start'] = word['start'] + adjust_seconds
-                    word['end'] = word['end'] + adjust_seconds
-            result.append(new_segment)
         return result
     def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):

 import ffmpeg
 import numpy as np
+from src.utils import format_timestamp, len_wide
 from enum import Enum
 class NonSpeechStrategy(Enum):
                 if (segment_start > max_source_time):
                     continue
                 segment_end = min(max_source_time, segment_end)
+                # {'text': 'XXX', 'start': 0.0, 'end': 99.99,
+                #  'temperature': 0.0, 'avg_logprob': -0.09..., 'compression_ratio': 1.234..., 'no_speech_prob': 0.123...,
+                #  'words': [{...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}, ...]}
                 new_segment = segment.copy()
+            segment_duration = segment_end - segment_start
+            if ("text" in segment and "words" in segment and segment_duration > 10):
+                segment_words = new_segment["words"]
+                del new_segment["text"]
+                del new_segment["start"]
+                del new_segment["end"]
+                del new_segment["words"]
+                sub_segment = new_segment.copy()
+                sub_text = ""
+                sub_words = []
+                word_length = 0
+                for idx, word in enumerate(segment_words):
+                    word2 = segment_words[idx + 1] if idx + 1 < len(segment_words) else None
                     # Adjust start and end
+                    word["start"] = word["start"] + adjust_seconds
+                    word["end"] = word["end"] + adjust_seconds
+                    if "start" not in sub_segment:
+                        sub_segment["start"] = float(word["start"])
+                    sub_text += word["word"]
+                    sub_words.append(word)
+                    word_length += len_wide(word["word"])
+                    if (sub_text.rstrip().endswith(".") or
+                        (word_length > 90 and (sub_text.rstrip().endswith(",") or sub_text.rstrip().endswith("?"))) or
+                        (word_length > 120 and word2 and (word2["word"].lstrip().startswith(",") or ((word2["word"].strip() in ["and", "or", "but"])))) or
+                        (word_length > 180 and sub_text.endswith(" "))):
+                        sub_segment["text"] = sub_text
+                        sub_segment["end"] = float(word["end"])
+                        sub_segment["words"] = sub_words
+                        result.append(sub_segment)
+                        sub_segment = new_segment.copy()
+                        sub_text = ""
+                        sub_words = []
+                        word_length = 0
+                if "start" in sub_segment:
+                    sub_segment["text"] = sub_text
+                    sub_segment["end"] = float(word["end"])
+                    sub_segment["words"] = sub_words
+                    result.append(sub_segment)
+            else:
+                # Add to start and end
+                new_segment['start'] = segment_start + adjust_seconds
+                new_segment["end"] = segment_end + adjust_seconds
+                # Handle words
+                if ("words" in new_segment):
+                    for word in new_segment["words"]:
+                        # Adjust start and end
+                        word["start"] = word["start"] + adjust_seconds
+                        word["end"] = word["end"] + adjust_seconds
+                result.append(new_segment)
         return result
     def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):