avans06 commited on
Commit
f96cbbf
·
1 Parent(s): 96b845f

Translation model incorporates Llama-3.1-8B-Instruct.

Browse files

Process the transcription in segments when its duration is long (segments longer than 10 seconds are split at sentence/clause boundaries).

app.py CHANGED
@@ -1085,12 +1085,12 @@ def create_ui(app_config: ApplicationConfig):
1085
  }
1086
 
1087
  common_word_timestamps_inputs = lambda : {
1088
- gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps, elem_id="word_timestamps"),
1089
- gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words, elem_id="highlight_words"),
1090
  }
1091
 
1092
  common_segments_filter_inputs = lambda : {
1093
- gr.Checkbox(label="Whisper Segments Filter", value=app_config.whisper_segments_filter, elem_id="whisperSegmentsFilter") if idx == 0 else
1094
  gr.Text(label=f"Filter {idx}", value=filterStr, elem_id=f"whisperSegmentsFilter{idx}") for idx, filterStr in enumerate([""] + app_config.whisper_segments_filters)
1095
  }
1096
 
@@ -1101,10 +1101,10 @@ def create_ui(app_config: ApplicationConfig):
1101
  app_config.diarization = False
1102
 
1103
  common_diarization_inputs = lambda : {
1104
- gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs, elem_id="diarization"),
1105
- gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs, elem_id="diarization_speakers"),
1106
- gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs, elem_id="diarization_min_speakers"),
1107
- gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs, elem_id="diarization_max_speakers")
1108
  }
1109
 
1110
  common_output = lambda : [
@@ -1117,6 +1117,7 @@ def create_ui(app_config: ApplicationConfig):
1117
  css = """
1118
  .scroll-show textarea {
1119
  overflow-y: auto !important;
 
1120
  }
1121
  .scroll-show textarea::-webkit-scrollbar {
1122
  all: initial !important;
@@ -1191,29 +1192,29 @@ def create_ui(app_config: ApplicationConfig):
1191
  inputDict.update(common_word_timestamps_inputs())
1192
  if isFull:
1193
  inputDict.update({
1194
- gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations, elem_id = "prepend_punctuations"),
1195
- gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations, elem_id = "append_punctuations")})
1196
  if isFull:
1197
  with gr.Accordion("Whisper Advanced options", open=False):
1198
  inputDict.update({
1199
- gr.TextArea(label="Initial Prompt", elem_id = "initial_prompt"),
1200
- gr.Number(label="Temperature", value=app_config.temperature, elem_id = "temperature"),
1201
- gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of"),
1202
- gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size"),
1203
- gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience"),
1204
- gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty"),
1205
- gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens"),
1206
- gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text"),
1207
- gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16"),
1208
- gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback, elem_id = "temperature_increment_on_fallback"),
1209
- gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold, elem_id = "compression_ratio_threshold"),
1210
- gr.Number(label="Logprob threshold", value=app_config.logprob_threshold, elem_id = "logprob_threshold"),
1211
- gr.Number(label="No speech threshold", value=app_config.no_speech_threshold, elem_id = "no_speech_threshold"),
1212
  })
1213
  if app_config.whisper_implementation == "faster-whisper":
1214
  inputDict.update({
1215
- gr.Number(label="Repetition Penalty", value=app_config.repetition_penalty, elem_id = "repetition_penalty"),
1216
- gr.Number(label="No Repeat Ngram Size", value=app_config.no_repeat_ngram_size, precision=0, elem_id = "no_repeat_ngram_size")
1217
  })
1218
  with gr.Accordion("Whisper Segments Filter options", open=False):
1219
  inputDict.update(common_segments_filter_inputs())
 
1085
  }
1086
 
1087
  common_word_timestamps_inputs = lambda : {
1088
+ gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps, elem_id="word_timestamps", info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment."),
1089
+ gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words, elem_id="highlight_words", info="if word_timestamps is True, underline each word as it is spoken in srt and vtt"),
1090
  }
1091
 
1092
  common_segments_filter_inputs = lambda : {
1093
+ gr.Checkbox(label="Whisper Segments Filter", value=app_config.whisper_segments_filter, elem_id="whisperSegmentsFilter", info="Filter the results of Whisper transcribe with the following conditions. It is recommended to enable this feature when using the large-v3 model to avoid hallucinations.") if idx == 0 else
1094
  gr.Text(label=f"Filter {idx}", value=filterStr, elem_id=f"whisperSegmentsFilter{idx}") for idx, filterStr in enumerate([""] + app_config.whisper_segments_filters)
1095
  }
1096
 
 
1101
  app_config.diarization = False
1102
 
1103
  common_diarization_inputs = lambda : {
1104
+ gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs, elem_id="diarization", info="Whether to perform speaker diarization"),
1105
+ gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs, elem_id="diarization_speakers", info="The number of speakers to detect"),
1106
+ gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs, elem_id="diarization_min_speakers", info="The minimum number of speakers to detect"),
1107
+ gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs, elem_id="diarization_max_speakers", info="The maximum number of speakers to detect")
1108
  }
1109
 
1110
  common_output = lambda : [
 
1117
  css = """
1118
  .scroll-show textarea {
1119
  overflow-y: auto !important;
1120
+ scrollbar-width: auto !important;
1121
  }
1122
  .scroll-show textarea::-webkit-scrollbar {
1123
  all: initial !important;
 
1192
  inputDict.update(common_word_timestamps_inputs())
1193
  if isFull:
1194
  inputDict.update({
1195
+ gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations, elem_id = "prepend_punctuations", info="if word_timestamps is True, merge these punctuation symbols with the next word"),
1196
+ gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations, elem_id = "append_punctuations", info="if word_timestamps is True, merge these punctuation symbols with the previous word")})
1197
  if isFull:
1198
  with gr.Accordion("Whisper Advanced options", open=False):
1199
  inputDict.update({
1200
+ gr.TextArea(label="Initial Prompt", elem_id = "initial_prompt", info="Optional text to provide as a prompt for the first window"),
1201
+ gr.Number(label="Temperature", value=app_config.temperature, elem_id = "temperature", info="Temperature to use for sampling"),
1202
+ gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of", info="Number of candidates when sampling with non-zero temperature"),
1203
+ gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size", info="Number of beams in beam search, only applicable when temperature is zero"),
1204
+ gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience", info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
1205
+ gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty", info="Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default"),
1206
+ gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens", info="Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations"),
1207
+ gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text", info="If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop"),
1208
+ gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16", info="Whether to perform inference in fp16; True by default; It will be ignored in faster-whisper because it is already a quantized model."),
1209
+ gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback, elem_id = "temperature_increment_on_fallback", info="Temperature to increase when falling back when the decoding fails to meet either of the thresholds below"),
1210
+ gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold, elem_id = "compression_ratio_threshold", info="If the gzip compression ratio is higher than this value, treat the decoding as failed"),
1211
+ gr.Number(label="Logprob threshold", value=app_config.logprob_threshold, elem_id = "logprob_threshold", info="If the average log probability is lower than this value, treat the decoding as failed"),
1212
+ gr.Number(label="No speech threshold", value=app_config.no_speech_threshold, elem_id = "no_speech_threshold", info="If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence"),
1213
  })
1214
  if app_config.whisper_implementation == "faster-whisper":
1215
  inputDict.update({
1216
+ gr.Number(label="Repetition Penalty", value=app_config.repetition_penalty, elem_id = "repetition_penalty", info="[faster-whisper] The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0."),
1217
+ gr.Number(label="No Repeat Ngram Size", value=app_config.no_repeat_ngram_size, precision=0, elem_id = "no_repeat_ngram_size", info="[faster-whisper] The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. If specified, it must be a positive integer greater than 1.")
1218
  })
1219
  with gr.Accordion("Whisper Segments Filter options", open=False):
1220
  inputDict.update(common_segments_filter_inputs())
config.json5 CHANGED
@@ -294,6 +294,12 @@
294
  }
295
  ],
296
  "Llama": [
 
 
 
 
 
 
297
  {
298
  "name": "Meta-Llama-3-8B-Instruct-ct2-int8_float16/avan",
299
  "url": "avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16",
 
294
  }
295
  ],
296
  "Llama": [
297
+ {
298
+ "name": "Meta-Llama-3.1-8B-Instruct-ct2-int8_float16/avan",
299
+ "url": "avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16",
300
+ "type": "huggingface",
301
+ "tokenizer_url": "avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16"
302
+ },
303
  {
304
  "name": "Meta-Llama-3-8B-Instruct-ct2-int8_float16/avan",
305
  "url": "avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16",
requirements-fasterWhisper.txt CHANGED
@@ -1,6 +1,6 @@
1
  transformers
2
  ctranslate2>=4.2.1
3
- faster-whisper>=1.0.1
4
  ffmpeg-python==0.2.0
5
  gradio==3.50.2
6
  yt-dlp
@@ -14,7 +14,6 @@ sentencepiece
14
  # Needed by diarization
15
  intervaltree
16
  srt
17
- torch
18
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
19
 
20
  # Needed by ALMA-GPTQ
 
1
  transformers
2
  ctranslate2>=4.2.1
3
+ faster-whisper>=1.0.2
4
  ffmpeg-python==0.2.0
5
  gradio==3.50.2
6
  yt-dlp
 
14
  # Needed by diarization
15
  intervaltree
16
  srt
 
17
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
18
 
19
  # Needed by ALMA-GPTQ
requirements-whisper.txt CHANGED
@@ -13,7 +13,6 @@ sentencepiece
13
  # Needed by diarization
14
  intervaltree
15
  srt
16
- torch
17
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
18
 
19
  # Needed by ALMA-GPTQ
 
13
  # Needed by diarization
14
  intervaltree
15
  srt
 
16
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
17
 
18
  # Needed by ALMA-GPTQ
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  transformers
2
  ctranslate2>=4.2.1
3
- faster-whisper>=1.0.1
4
  ffmpeg-python==0.2.0
5
  gradio==3.50.2
6
  yt-dlp
@@ -14,7 +14,6 @@ sentencepiece
14
  # Needed by diarization
15
  intervaltree
16
  srt
17
- torch
18
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
19
 
20
  # Needed by ALMA-GPTQ
 
1
  transformers
2
  ctranslate2>=4.2.1
3
+ faster-whisper>=1.0.2
4
  ffmpeg-python==0.2.0
5
  gradio==3.50.2
6
  yt-dlp
 
14
  # Needed by diarization
15
  intervaltree
16
  srt
 
17
  https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
18
 
19
  # Needed by ALMA-GPTQ
src/utils.py CHANGED
@@ -298,6 +298,27 @@ def process_text(text: str, maxLineWidth=None):
298
 
299
  return '\n'.join(lines)
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  def slugify(value, allow_unicode=False, is_lower=False):
302
  """
303
  Taken from https://github.com/django/django/blob/master/django/utils/text.py
 
298
 
299
  return '\n'.join(lines)
300
 
301
+ def len_wide(text: str):
302
+ """
303
+ Use east_asian_width to automatically determine the Character Width of the string, replacing the textwrap.wrap function.
304
+
305
+ # East_Asian_Width (ea)
306
+
307
+ ea ; A ; Ambiguous
308
+ ea ; F ; Fullwidth
309
+ ea ; H ; Halfwidth
310
+ ea ; N ; Neutral
311
+ ea ; Na ; Narrow
312
+ ea ; W ; Wide
313
+ https://stackoverflow.com/a/31666966
314
+ """
315
+ width = 0
316
+ for char in text:
317
+ width += (1 if unicodedata.east_asian_width(char) not in {'W', 'F'} else 2)
318
+
319
+ return width
320
+
321
+
322
  def slugify(value, allow_unicode=False, is_lower=False):
323
  """
324
  Taken from https://github.com/django/django/blob/master/django/utils/text.py
src/vad.py CHANGED
@@ -26,7 +26,7 @@ import torch
26
  import ffmpeg
27
  import numpy as np
28
 
29
- from src.utils import format_timestamp
30
  from enum import Enum
31
 
32
  class NonSpeechStrategy(Enum):
@@ -405,21 +405,65 @@ class AbstractTranscription(ABC):
405
  if (segment_start > max_source_time):
406
  continue
407
  segment_end = min(max_source_time, segment_end)
408
-
 
 
409
  new_segment = segment.copy()
410
 
411
- # Add to start and end
412
- new_segment['start'] = segment_start + adjust_seconds
413
- new_segment['end'] = segment_end + adjust_seconds
414
-
415
- # Handle words
416
- if ('words' in new_segment):
417
- for word in new_segment['words']:
 
 
 
 
 
 
 
418
  # Adjust start and end
419
- word['start'] = word['start'] + adjust_seconds
420
- word['end'] = word['end'] + adjust_seconds
421
-
422
- result.append(new_segment)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  return result
424
 
425
  def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
 
26
  import ffmpeg
27
  import numpy as np
28
 
29
+ from src.utils import format_timestamp, len_wide
30
  from enum import Enum
31
 
32
  class NonSpeechStrategy(Enum):
 
405
  if (segment_start > max_source_time):
406
  continue
407
  segment_end = min(max_source_time, segment_end)
408
+ # {'text': 'XXX', 'start': 0.0, 'end': 99.99,
409
+ # 'temperature': 0.0, 'avg_logprob': -0.09..., 'compression_ratio': 1.234..., 'no_speech_prob': 0.123...,
410
+ # 'words': [{...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}, ...]}
411
  new_segment = segment.copy()
412
 
413
+ segment_duration = segment_end - segment_start
414
+ if ("text" in segment and "words" in segment and segment_duration > 10):
415
+ segment_words = new_segment["words"]
416
+ del new_segment["text"]
417
+ del new_segment["start"]
418
+ del new_segment["end"]
419
+ del new_segment["words"]
420
+ sub_segment = new_segment.copy()
421
+ sub_text = ""
422
+ sub_words = []
423
+ word_length = 0
424
+
425
+ for idx, word in enumerate(segment_words):
426
+ word2 = segment_words[idx + 1] if idx + 1 < len(segment_words) else None
427
  # Adjust start and end
428
+ word["start"] = word["start"] + adjust_seconds
429
+ word["end"] = word["end"] + adjust_seconds
430
+
431
+ if "start" not in sub_segment:
432
+ sub_segment["start"] = float(word["start"])
433
+
434
+ sub_text += word["word"]
435
+ sub_words.append(word)
436
+ word_length += len_wide(word["word"])
437
+ if (sub_text.rstrip().endswith(".") or
438
+ (word_length > 90 and (sub_text.rstrip().endswith(",") or sub_text.rstrip().endswith("?"))) or
439
+ (word_length > 120 and word2 and (word2["word"].lstrip().startswith(",") or ((word2["word"].strip() in ["and", "or", "but"])))) or
440
+ (word_length > 180 and sub_text.endswith(" "))):
441
+ sub_segment["text"] = sub_text
442
+ sub_segment["end"] = float(word["end"])
443
+ sub_segment["words"] = sub_words
444
+ result.append(sub_segment)
445
+ sub_segment = new_segment.copy()
446
+ sub_text = ""
447
+ sub_words = []
448
+ word_length = 0
449
+ if "start" in sub_segment:
450
+ sub_segment["text"] = sub_text
451
+ sub_segment["end"] = float(word["end"])
452
+ sub_segment["words"] = sub_words
453
+ result.append(sub_segment)
454
+ else:
455
+ # Add to start and end
456
+ new_segment['start'] = segment_start + adjust_seconds
457
+ new_segment["end"] = segment_end + adjust_seconds
458
+
459
+ # Handle words
460
+ if ("words" in new_segment):
461
+ for word in new_segment["words"]:
462
+ # Adjust start and end
463
+ word["start"] = word["start"] + adjust_seconds
464
+ word["end"] = word["end"] + adjust_seconds
465
+
466
+ result.append(new_segment)
467
  return result
468
 
469
  def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):