Spaces:
Sleeping
Sleeping
Show time
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ import numpy as np
|
|
3 |
import pandas as pd
|
4 |
import torch
|
5 |
import torchaudio
|
6 |
-
|
7 |
from lang_id import identify_languages
|
8 |
from whisper import transcribe
|
9 |
|
@@ -60,9 +60,11 @@ def process_audio(audio, chunk_duration, language_set):
|
|
60 |
audio_sec += chunk_duration
|
61 |
|
62 |
print(f"Processing audio chunk of length {len(chunk)}")
|
63 |
-
volume_norm = np.linalg.norm(chunk)
|
64 |
length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
|
|
|
65 |
selected_scores, all_scores = identify_languages(chunk, language_set)
|
|
|
66 |
|
67 |
# 日本語と英語の確率値を取得
|
68 |
ja_prob = selected_scores['Japanese']
|
@@ -74,7 +76,9 @@ def process_audio(audio, chunk_duration, language_set):
|
|
74 |
top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
|
75 |
|
76 |
# テキストの認識
|
|
|
77 |
transcription = transcribe(chunk)
|
|
|
78 |
|
79 |
data.append({
|
80 |
"Time": audio_sec,
|
@@ -82,6 +86,8 @@ def process_audio(audio, chunk_duration, language_set):
|
|
82 |
"Volume": volume_norm,
|
83 |
"Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
|
84 |
"Language": top3_languages,
|
|
|
|
|
85 |
"Text": transcription,
|
86 |
})
|
87 |
|
|
|
3 |
import pandas as pd
|
4 |
import torch
|
5 |
import torchaudio
|
6 |
+
from datetime import datetime
|
7 |
from lang_id import identify_languages
|
8 |
from whisper import transcribe
|
9 |
|
|
|
60 |
audio_sec += chunk_duration
|
61 |
|
62 |
print(f"Processing audio chunk of length {len(chunk)}")
|
63 |
+
volume_norm = np.linalg.norm(chunk)
|
64 |
length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
|
65 |
+
s = datetime.now()
|
66 |
selected_scores, all_scores = identify_languages(chunk, language_set)
|
67 |
+
lang_id_time = (datetime.now() - s).total_seconds()
|
68 |
|
69 |
# 日本語と英語の確率値を取得
|
70 |
ja_prob = selected_scores['Japanese']
|
|
|
76 |
top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
|
77 |
|
78 |
# テキストの認識
|
79 |
+
s = datetime.now()
|
80 |
transcription = transcribe(chunk)
|
81 |
+
transcribe_time = (datetime.now() - s).total_seconds()
|
82 |
|
83 |
data.append({
|
84 |
"Time": audio_sec,
|
|
|
86 |
"Volume": volume_norm,
|
87 |
"Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
|
88 |
"Language": top3_languages,
|
89 |
+
"Lang ID Time": lang_id_time,
|
90 |
+
"Transcribe Time": transcribe_time,
|
91 |
"Text": transcription,
|
92 |
})
|
93 |
|