tan-z-tan committed on
Commit
1ecc4f1
·
1 Parent(s): fc40471
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import pandas as pd
4
  import torch
5
  import torchaudio
6
-
7
  from lang_id import identify_languages
8
  from whisper import transcribe
9
 
@@ -60,9 +60,11 @@ def process_audio(audio, chunk_duration, language_set):
60
  audio_sec += chunk_duration
61
 
62
  print(f"Processing audio chunk of length {len(chunk)}")
63
- volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
64
  length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
 
65
  selected_scores, all_scores = identify_languages(chunk, language_set)
 
66
 
67
  # 日本語と英語の確率値を取得
68
  ja_prob = selected_scores['Japanese']
@@ -74,7 +76,9 @@ def process_audio(audio, chunk_duration, language_set):
74
  top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
75
 
76
  # テキストの認識
 
77
  transcription = transcribe(chunk)
 
78
 
79
  data.append({
80
  "Time": audio_sec,
@@ -82,6 +86,8 @@ def process_audio(audio, chunk_duration, language_set):
82
  "Volume": volume_norm,
83
  "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
84
  "Language": top3_languages,
 
 
85
  "Text": transcription,
86
  })
87
 
 
3
  import pandas as pd
4
  import torch
5
  import torchaudio
6
+ from datetime import datetime
7
  from lang_id import identify_languages
8
  from whisper import transcribe
9
 
 
60
  audio_sec += chunk_duration
61
 
62
  print(f"Processing audio chunk of length {len(chunk)}")
63
+ volume_norm = np.linalg.norm(chunk)
64
  length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
65
+ s = datetime.now()
66
  selected_scores, all_scores = identify_languages(chunk, language_set)
67
+ lang_id_time = (datetime.now() - s).total_seconds()
68
 
69
  # 日本語と英語の確率値を取得
70
  ja_prob = selected_scores['Japanese']
 
76
  top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
77
 
78
  # テキストの認識
79
+ s = datetime.now()
80
  transcription = transcribe(chunk)
81
+ transcribe_time = (datetime.now() - s).total_seconds()
82
 
83
  data.append({
84
  "Time": audio_sec,
 
86
  "Volume": volume_norm,
87
  "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
88
  "Language": top3_languages,
89
+ "Lang ID Time": lang_id_time,
90
+ "Transcribe Time": transcribe_time,
91
  "Text": transcription,
92
  })
93