Spaces:

tan-z-tan
/

speech_language_detection

Running

App Files Community

tan-z-tan committed on Jun 21, 2024

Commit

9996005

1 Parent(s): 5e7654d

Fix

Browse files

Files changed (1) hide show

app.py +20 -24

app.py CHANGED Viewed

@@ -7,20 +7,12 @@ import torchaudio
 from lang_id import identify_languages
 from whisper import transcribe
-# # Whisperモデルとプロセッサのロード
-# model_name = "openai/whisper-tiny"
-# processor = WhisperProcessor.from_pretrained(model_name)
-# model = WhisperForConditionalGeneration.from_pretrained(model_name)
-# # デバイスの設定（GPUが利用可能な場合はGPUを使用）
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-# model.to(device)
 # アプリケーションの状態を保持する変数
 data = []
 current_chunk = []
 SAMPLING_RATE = 16000
-CHUNK_DURATION = 5  # 5秒ごとのチャンク
 def normalize_audio(audio):
@@ -38,12 +30,18 @@ def resample_audio(audio, orig_sr, target_sr=16000):
     return audio
-def process_audio(audio):
-    global data, current_chunk
     print("Process_audio")
     print(audio)
     sr, audio_data = audio
     print(audio_data.shape, audio_data.dtype)
     # 一番最初にSampling rateを揃えておく
     audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
@@ -56,15 +54,15 @@ def process_audio(audio):
     current_chunk.append(audio_data)
     total_chunk = np.concatenate(current_chunk)
-    while len(total_chunk) >= SAMPLING_RATE * CHUNK_DURATION:
-        chunk = total_chunk[:SAMPLING_RATE * CHUNK_DURATION]
-        total_chunk = total_chunk[SAMPLING_RATE * CHUNK_DURATION:]  # 処理済みの部分を削除
-        audio_sec += CHUNK_DURATION
         print(f"Processing audio chunk of length {len(chunk)}")
         volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
         length = len(chunk) / SAMPLING_RATE  # 音声データの長さ（秒）
-        selected_scores, all_scores = identify_languages(chunk)
         # 日本語と英語の確率値を取得
         ja_prob = selected_scores['Japanese']
@@ -79,7 +77,6 @@ def process_audio(audio):
         transcription = transcribe(chunk)
         data.append({
-            # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
             "Time": audio_sec,
             "Length (s)": length,
             "Volume": volume_norm,
@@ -95,14 +92,16 @@ def process_audio(audio):
     current_chunk = [total_chunk]
-inputs_file = gr.Audio(sources=["upload"], type="numpy")
-inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
 outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
 with gr.Blocks() as demo:
     with gr.TabItem("Upload"):
-        inputs_file = gr.Audio(sources=["upload"], type="numpy")
-        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
         gr.Interface(
             fn=process_audio,
             inputs=inputs_file,
@@ -113,8 +112,6 @@ with gr.Blocks() as demo:
         )
     with gr.TabItem("Microphone"):
-        inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
-        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
         gr.Interface(
             fn=process_audio,
             inputs=inputs_stream,
@@ -124,6 +121,5 @@ with gr.Blocks() as demo:
             description="Speak into the microphone and see real-time audio processing results."
         )
 if __name__ == "__main__":
     demo.launch()

 from lang_id import identify_languages
 from whisper import transcribe
 # アプリケーションの状態を保持する変数
 data = []
 current_chunk = []
 SAMPLING_RATE = 16000
+CHUNK_DURATION = 5  # 初期値としての5秒
 def normalize_audio(audio):
     return audio
+def process_audio(audio, chunk_duration, language_set):
+    global data, current_chunk, SAMPLING_RATE
     print("Process_audio")
     print(audio)
+    if audio is None:
+        return
     sr, audio_data = audio
+    # language_set
+    language_set = [lang.strip() for lang in language_set.split(",")]
     print(audio_data.shape, audio_data.dtype)
     # 一番最初にSampling rateを揃えておく
     audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
     current_chunk.append(audio_data)
     total_chunk = np.concatenate(current_chunk)
+    while len(total_chunk) >= SAMPLING_RATE * chunk_duration:
+        chunk = total_chunk[:SAMPLING_RATE * chunk_duration]
+        total_chunk = total_chunk[SAMPLING_RATE * chunk_duration:]  # 処理済みの部分を削除
+        audio_sec += chunk_duration
         print(f"Processing audio chunk of length {len(chunk)}")
         volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
         length = len(chunk) / SAMPLING_RATE  # 音声データの長さ（秒）
+        selected_scores, all_scores = identify_languages(chunk, language_set)
         # 日本語と英語の確率値を取得
         ja_prob = selected_scores['Japanese']
         transcription = transcribe(chunk)
         data.append({
             "Time": audio_sec,
             "Length (s)": length,
             "Volume": volume_norm,
     current_chunk = [total_chunk]
+# パラメータの入力コンポーネント
+chunk_duration_input = gr.Number(value=5, label="Chunk Duration (seconds)")
+language_set_input = gr.Textbox(value="Japanese,English", label="Language Set (comma-separated)")
+inputs_file = [gr.Audio(sources=["upload"], type="numpy"), chunk_duration_input, language_set_input]
+inputs_stream = [gr.Audio(sources=["microphone"], type="numpy", streaming=True), chunk_duration_input, language_set_input]
 outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
 with gr.Blocks() as demo:
     with gr.TabItem("Upload"):
         gr.Interface(
             fn=process_audio,
             inputs=inputs_file,
         )
     with gr.TabItem("Microphone"):
         gr.Interface(
             fn=process_audio,
             inputs=inputs_stream,
             description="Speak into the microphone and see real-time audio processing results."
         )
 if __name__ == "__main__":
     demo.launch()