Translation model incorporates Llama-3.1-8B-Instruct.
Browse files
Process the transcription in sub-segments when the duration of a transcribed segment is long.
- app.py +25 -24
- config.json5 +6 -0
- requirements-fasterWhisper.txt +1 -2
- requirements-whisper.txt +0 -1
- requirements.txt +1 -2
- src/utils.py +21 -0
- src/vad.py +57 -13
app.py
CHANGED
@@ -1085,12 +1085,12 @@ def create_ui(app_config: ApplicationConfig):
|
|
1085 |
}
|
1086 |
|
1087 |
common_word_timestamps_inputs = lambda : {
|
1088 |
-
gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps, elem_id="word_timestamps"),
|
1089 |
-
gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words, elem_id="highlight_words"),
|
1090 |
}
|
1091 |
|
1092 |
common_segments_filter_inputs = lambda : {
|
1093 |
-
gr.Checkbox(label="Whisper Segments Filter", value=app_config.whisper_segments_filter, elem_id="whisperSegmentsFilter") if idx == 0 else
|
1094 |
gr.Text(label=f"Filter {idx}", value=filterStr, elem_id=f"whisperSegmentsFilter{idx}") for idx, filterStr in enumerate([""] + app_config.whisper_segments_filters)
|
1095 |
}
|
1096 |
|
@@ -1101,10 +1101,10 @@ def create_ui(app_config: ApplicationConfig):
|
|
1101 |
app_config.diarization = False
|
1102 |
|
1103 |
common_diarization_inputs = lambda : {
|
1104 |
-
gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs, elem_id="diarization"),
|
1105 |
-
gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs, elem_id="diarization_speakers"),
|
1106 |
-
gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs, elem_id="diarization_min_speakers"),
|
1107 |
-
gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs, elem_id="diarization_max_speakers")
|
1108 |
}
|
1109 |
|
1110 |
common_output = lambda : [
|
@@ -1117,6 +1117,7 @@ def create_ui(app_config: ApplicationConfig):
|
|
1117 |
css = """
|
1118 |
.scroll-show textarea {
|
1119 |
overflow-y: auto !important;
|
|
|
1120 |
}
|
1121 |
.scroll-show textarea::-webkit-scrollbar {
|
1122 |
all: initial !important;
|
@@ -1191,29 +1192,29 @@ def create_ui(app_config: ApplicationConfig):
|
|
1191 |
inputDict.update(common_word_timestamps_inputs())
|
1192 |
if isFull:
|
1193 |
inputDict.update({
|
1194 |
-
gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations, elem_id = "prepend_punctuations"),
|
1195 |
-
gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations, elem_id = "append_punctuations")})
|
1196 |
if isFull:
|
1197 |
with gr.Accordion("Whisper Advanced options", open=False):
|
1198 |
inputDict.update({
|
1199 |
-
gr.TextArea(label="Initial Prompt", elem_id = "initial_prompt"),
|
1200 |
-
gr.Number(label="Temperature", value=app_config.temperature, elem_id = "temperature"),
|
1201 |
-
gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of"),
|
1202 |
-
gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size"),
|
1203 |
-
gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience"),
|
1204 |
-
gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty"),
|
1205 |
-
gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens"),
|
1206 |
-
gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text"),
|
1207 |
-
gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16"),
|
1208 |
-
gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback, elem_id = "temperature_increment_on_fallback"),
|
1209 |
-
gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold, elem_id = "compression_ratio_threshold"),
|
1210 |
-
gr.Number(label="Logprob threshold", value=app_config.logprob_threshold, elem_id = "logprob_threshold"),
|
1211 |
-
gr.Number(label="No speech threshold", value=app_config.no_speech_threshold, elem_id = "no_speech_threshold"),
|
1212 |
})
|
1213 |
if app_config.whisper_implementation == "faster-whisper":
|
1214 |
inputDict.update({
|
1215 |
-
gr.Number(label="Repetition Penalty", value=app_config.repetition_penalty, elem_id = "repetition_penalty"),
|
1216 |
-
gr.Number(label="No Repeat Ngram Size", value=app_config.no_repeat_ngram_size, precision=0, elem_id = "no_repeat_ngram_size")
|
1217 |
})
|
1218 |
with gr.Accordion("Whisper Segments Filter options", open=False):
|
1219 |
inputDict.update(common_segments_filter_inputs())
|
|
|
1085 |
}
|
1086 |
|
1087 |
common_word_timestamps_inputs = lambda : {
|
1088 |
+
gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps, elem_id="word_timestamps", info="Extract word-level timestamps using the cross-attention pattern and dynamic time warping, and include the timestamps for each word in each segment."),
|
1089 |
+
gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words, elem_id="highlight_words", info="if word_timestamps is True, underline each word as it is spoken in srt and vtt"),
|
1090 |
}
|
1091 |
|
1092 |
common_segments_filter_inputs = lambda : {
|
1093 |
+
gr.Checkbox(label="Whisper Segments Filter", value=app_config.whisper_segments_filter, elem_id="whisperSegmentsFilter", info="Filter the results of Whisper transcribe with the following conditions. It is recommended to enable this feature when using the large-v3 model to avoid hallucinations.") if idx == 0 else
|
1094 |
gr.Text(label=f"Filter {idx}", value=filterStr, elem_id=f"whisperSegmentsFilter{idx}") for idx, filterStr in enumerate([""] + app_config.whisper_segments_filters)
|
1095 |
}
|
1096 |
|
|
|
1101 |
app_config.diarization = False
|
1102 |
|
1103 |
common_diarization_inputs = lambda : {
|
1104 |
+
gr.Checkbox(label="Diarization", value=app_config.diarization, interactive=has_diarization_libs, elem_id="diarization", info="Whether to perform speaker diarization"),
|
1105 |
+
gr.Number(label="Diarization - Speakers", precision=0, value=app_config.diarization_speakers, interactive=has_diarization_libs, elem_id="diarization_speakers", info="The number of speakers to detect"),
|
1106 |
+
gr.Number(label="Diarization - Min Speakers", precision=0, value=app_config.diarization_min_speakers, interactive=has_diarization_libs, elem_id="diarization_min_speakers", info="The minimum number of speakers to detect"),
|
1107 |
+
gr.Number(label="Diarization - Max Speakers", precision=0, value=app_config.diarization_max_speakers, interactive=has_diarization_libs, elem_id="diarization_max_speakers", info="The maximum number of speakers to detect")
|
1108 |
}
|
1109 |
|
1110 |
common_output = lambda : [
|
|
|
1117 |
css = """
|
1118 |
.scroll-show textarea {
|
1119 |
overflow-y: auto !important;
|
1120 |
+
scrollbar-width: auto !important;
|
1121 |
}
|
1122 |
.scroll-show textarea::-webkit-scrollbar {
|
1123 |
all: initial !important;
|
|
|
1192 |
inputDict.update(common_word_timestamps_inputs())
|
1193 |
if isFull:
|
1194 |
inputDict.update({
|
1195 |
+
gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations, elem_id = "prepend_punctuations", info="if word_timestamps is True, merge these punctuation symbols with the next word"),
|
1196 |
+
gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations, elem_id = "append_punctuations", info="if word_timestamps is True, merge these punctuation symbols with the previous word")})
|
1197 |
if isFull:
|
1198 |
with gr.Accordion("Whisper Advanced options", open=False):
|
1199 |
inputDict.update({
|
1200 |
+
gr.TextArea(label="Initial Prompt", elem_id = "initial_prompt", info="Optional text to provide as a prompt for the first window"),
|
1201 |
+
gr.Number(label="Temperature", value=app_config.temperature, elem_id = "temperature", info="Temperature to use for sampling"),
|
1202 |
+
gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0, elem_id = "best_of", info="Number of candidates when sampling with non-zero temperature"),
|
1203 |
+
gr.Number(label="Beam Size - Zero temperature", value=app_config.beam_size, precision=0, elem_id = "beam_size", info="Number of beams in beam search, only applicable when temperature is zero"),
|
1204 |
+
gr.Number(label="Patience - Zero temperature", value=app_config.patience, elem_id = "patience", info="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search"),
|
1205 |
+
gr.Number(label="Length Penalty - Any temperature", value=lambda : None if app_config.length_penalty is None else app_config.length_penalty, elem_id = "length_penalty", info="Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default"),
|
1206 |
+
gr.Text(label="Suppress Tokens - Comma-separated list of token IDs", value=app_config.suppress_tokens, elem_id = "suppress_tokens", info="Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations"),
|
1207 |
+
gr.Checkbox(label="Condition on previous text", value=app_config.condition_on_previous_text, elem_id = "condition_on_previous_text", info="If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop"),
|
1208 |
+
gr.Checkbox(label="FP16", value=app_config.fp16, elem_id = "fp16", info="Whether to perform inference in fp16; True by default; It will be ignored in faster-whisper because it is already a quantized model."),
|
1209 |
+
gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback, elem_id = "temperature_increment_on_fallback", info="Temperature to increase when falling back when the decoding fails to meet either of the thresholds below"),
|
1210 |
+
gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold, elem_id = "compression_ratio_threshold", info="If the gzip compression ratio is higher than this value, treat the decoding as failed"),
|
1211 |
+
gr.Number(label="Logprob threshold", value=app_config.logprob_threshold, elem_id = "logprob_threshold", info="If the average log probability is lower than this value, treat the decoding as failed"),
|
1212 |
+
gr.Number(label="No speech threshold", value=app_config.no_speech_threshold, elem_id = "no_speech_threshold", info="If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence"),
|
1213 |
})
|
1214 |
if app_config.whisper_implementation == "faster-whisper":
|
1215 |
inputDict.update({
|
1216 |
+
gr.Number(label="Repetition Penalty", value=app_config.repetition_penalty, elem_id = "repetition_penalty", info="[faster-whisper] The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0."),
|
1217 |
+
gr.Number(label="No Repeat Ngram Size", value=app_config.no_repeat_ngram_size, precision=0, elem_id = "no_repeat_ngram_size", info="[faster-whisper] The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. If specified, it must be a positive integer greater than 1.")
|
1218 |
})
|
1219 |
with gr.Accordion("Whisper Segments Filter options", open=False):
|
1220 |
inputDict.update(common_segments_filter_inputs())
|
config.json5
CHANGED
@@ -294,6 +294,12 @@
|
|
294 |
}
|
295 |
],
|
296 |
"Llama": [
|
|
|
|
|
|
|
|
|
|
|
|
|
297 |
{
|
298 |
"name": "Meta-Llama-3-8B-Instruct-ct2-int8_float16/avan",
|
299 |
"url": "avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16",
|
|
|
294 |
}
|
295 |
],
|
296 |
"Llama": [
|
297 |
+
{
|
298 |
+
"name": "Meta-Llama-3.1-8B-Instruct-ct2-int8_float16/avan",
|
299 |
+
"url": "avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16",
|
300 |
+
"type": "huggingface",
|
301 |
+
"tokenizer_url": "avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16"
|
302 |
+
},
|
303 |
{
|
304 |
"name": "Meta-Llama-3-8B-Instruct-ct2-int8_float16/avan",
|
305 |
"url": "avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16",
|
requirements-fasterWhisper.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
transformers
|
2 |
ctranslate2>=4.2.1
|
3 |
-
faster-whisper>=1.0.
|
4 |
ffmpeg-python==0.2.0
|
5 |
gradio==3.50.2
|
6 |
yt-dlp
|
@@ -14,7 +14,6 @@ sentencepiece
|
|
14 |
# Needed by diarization
|
15 |
intervaltree
|
16 |
srt
|
17 |
-
torch
|
18 |
https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
|
19 |
|
20 |
# Needed by ALMA-GPTQ
|
|
|
1 |
transformers
|
2 |
ctranslate2>=4.2.1
|
3 |
+
faster-whisper>=1.0.2
|
4 |
ffmpeg-python==0.2.0
|
5 |
gradio==3.50.2
|
6 |
yt-dlp
|
|
|
14 |
# Needed by diarization
|
15 |
intervaltree
|
16 |
srt
|
|
|
17 |
https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
|
18 |
|
19 |
# Needed by ALMA-GPTQ
|
requirements-whisper.txt
CHANGED
@@ -13,7 +13,6 @@ sentencepiece
|
|
13 |
# Needed by diarization
|
14 |
intervaltree
|
15 |
srt
|
16 |
-
torch
|
17 |
https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
|
18 |
|
19 |
# Needed by ALMA-GPTQ
|
|
|
13 |
# Needed by diarization
|
14 |
intervaltree
|
15 |
srt
|
|
|
16 |
https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
|
17 |
|
18 |
# Needed by ALMA-GPTQ
|
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
transformers
|
2 |
ctranslate2>=4.2.1
|
3 |
-
faster-whisper>=1.0.
|
4 |
ffmpeg-python==0.2.0
|
5 |
gradio==3.50.2
|
6 |
yt-dlp
|
@@ -14,7 +14,6 @@ sentencepiece
|
|
14 |
# Needed by diarization
|
15 |
intervaltree
|
16 |
srt
|
17 |
-
torch
|
18 |
https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
|
19 |
|
20 |
# Needed by ALMA-GPTQ
|
|
|
1 |
transformers
|
2 |
ctranslate2>=4.2.1
|
3 |
+
faster-whisper>=1.0.2
|
4 |
ffmpeg-python==0.2.0
|
5 |
gradio==3.50.2
|
6 |
yt-dlp
|
|
|
14 |
# Needed by diarization
|
15 |
intervaltree
|
16 |
srt
|
|
|
17 |
https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
|
18 |
|
19 |
# Needed by ALMA-GPTQ
|
src/utils.py
CHANGED
@@ -298,6 +298,27 @@ def process_text(text: str, maxLineWidth=None):
|
|
298 |
|
299 |
return '\n'.join(lines)
|
300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
def slugify(value, allow_unicode=False, is_lower=False):
|
302 |
"""
|
303 |
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
|
|
298 |
|
299 |
return '\n'.join(lines)
|
300 |
|
301 |
+
def len_wide(text: str) -> int:
    """
    Return the display width of ``text`` measured in character columns.

    The width is derived from each character's Unicode East_Asian_Width
    property, so that wide CJK text is measured correctly where ``len()``
    would undercount it:

    # East_Asian_Width (ea)

    ea ; A         ; Ambiguous  -> 1 column
    ea ; F         ; Fullwidth  -> 2 columns
    ea ; H         ; Halfwidth  -> 1 column
    ea ; N         ; Neutral    -> 1 column
    ea ; Na        ; Narrow     -> 1 column
    ea ; W         ; Wide       -> 2 columns
    https://stackoverflow.com/a/31666966

    Nonspacing and enclosing combining marks (general category ``Mn`` /
    ``Me``) contribute no width, because they are rendered on top of the
    preceding character rather than in a column of their own.

    Args:
        text: The string to measure. An empty string has width 0.

    Returns:
        The total number of columns occupied by ``text``.
    """
    width = 0
    for char in text:
        # Combining marks stack onto the previous glyph; counting them would
        # overstate the width of decomposed / accented text.
        if unicodedata.category(char) in ("Mn", "Me"):
            continue
        width += 2 if unicodedata.east_asian_width(char) in ("W", "F") else 1

    return width
|
320 |
+
|
321 |
+
|
322 |
def slugify(value, allow_unicode=False, is_lower=False):
|
323 |
"""
|
324 |
Taken from https://github.com/django/django/blob/master/django/utils/text.py
|
src/vad.py
CHANGED
@@ -26,7 +26,7 @@ import torch
|
|
26 |
import ffmpeg
|
27 |
import numpy as np
|
28 |
|
29 |
-
from src.utils import format_timestamp
|
30 |
from enum import Enum
|
31 |
|
32 |
class NonSpeechStrategy(Enum):
|
@@ -405,21 +405,65 @@ class AbstractTranscription(ABC):
|
|
405 |
if (segment_start > max_source_time):
|
406 |
continue
|
407 |
segment_end = min(max_source_time, segment_end)
|
408 |
-
|
|
|
|
|
409 |
new_segment = segment.copy()
|
410 |
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
418 |
# Adjust start and end
|
419 |
-
word[
|
420 |
-
word[
|
421 |
-
|
422 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
423 |
return result
|
424 |
|
425 |
def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
|
|
|
26 |
import ffmpeg
|
27 |
import numpy as np
|
28 |
|
29 |
+
from src.utils import format_timestamp, len_wide
|
30 |
from enum import Enum
|
31 |
|
32 |
class NonSpeechStrategy(Enum):
|
|
|
405 |
if (segment_start > max_source_time):
|
406 |
continue
|
407 |
segment_end = min(max_source_time, segment_end)
|
408 |
+
# {'text': 'XXX', 'start': 0.0, 'end': 99.99,
|
409 |
+
# 'temperature': 0.0, 'avg_logprob': -0.09..., 'compression_ratio': 1.234..., 'no_speech_prob': 0.123...,
|
410 |
+
# 'words': [{...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}, {...}, ...]}
|
411 |
new_segment = segment.copy()
|
412 |
|
413 |
+
segment_duration = segment_end - segment_start
|
414 |
+
if ("text" in segment and "words" in segment and segment_duration > 10):
|
415 |
+
segment_words = new_segment["words"]
|
416 |
+
del new_segment["text"]
|
417 |
+
del new_segment["start"]
|
418 |
+
del new_segment["end"]
|
419 |
+
del new_segment["words"]
|
420 |
+
sub_segment = new_segment.copy()
|
421 |
+
sub_text = ""
|
422 |
+
sub_words = []
|
423 |
+
word_length = 0
|
424 |
+
|
425 |
+
for idx, word in enumerate(segment_words):
|
426 |
+
word2 = segment_words[idx + 1] if idx + 1 < len(segment_words) else None
|
427 |
# Adjust start and end
|
428 |
+
word["start"] = word["start"] + adjust_seconds
|
429 |
+
word["end"] = word["end"] + adjust_seconds
|
430 |
+
|
431 |
+
if "start" not in sub_segment:
|
432 |
+
sub_segment["start"] = float(word["start"])
|
433 |
+
|
434 |
+
sub_text += word["word"]
|
435 |
+
sub_words.append(word)
|
436 |
+
word_length += len_wide(word["word"])
|
437 |
+
if (sub_text.rstrip().endswith(".") or
|
438 |
+
(word_length > 90 and (sub_text.rstrip().endswith(",") or sub_text.rstrip().endswith("?"))) or
|
439 |
+
(word_length > 120 and word2 and (word2["word"].lstrip().startswith(",") or ((word2["word"].strip() in ["and", "or", "but"])))) or
|
440 |
+
(word_length > 180 and sub_text.endswith(" "))):
|
441 |
+
sub_segment["text"] = sub_text
|
442 |
+
sub_segment["end"] = float(word["end"])
|
443 |
+
sub_segment["words"] = sub_words
|
444 |
+
result.append(sub_segment)
|
445 |
+
sub_segment = new_segment.copy()
|
446 |
+
sub_text = ""
|
447 |
+
sub_words = []
|
448 |
+
word_length = 0
|
449 |
+
if "start" in sub_segment:
|
450 |
+
sub_segment["text"] = sub_text
|
451 |
+
sub_segment["end"] = float(word["end"])
|
452 |
+
sub_segment["words"] = sub_words
|
453 |
+
result.append(sub_segment)
|
454 |
+
else:
|
455 |
+
# Add to start and end
|
456 |
+
new_segment['start'] = segment_start + adjust_seconds
|
457 |
+
new_segment["end"] = segment_end + adjust_seconds
|
458 |
+
|
459 |
+
# Handle words
|
460 |
+
if ("words" in new_segment):
|
461 |
+
for word in new_segment["words"]:
|
462 |
+
# Adjust start and end
|
463 |
+
word["start"] = word["start"] + adjust_seconds
|
464 |
+
word["end"] = word["end"] + adjust_seconds
|
465 |
+
|
466 |
+
result.append(new_segment)
|
467 |
return result
|
468 |
|
469 |
def multiply_timestamps(self, timestamps: List[Dict[str, Any]], factor: float):
|