tan-z-tan committed on
Commit
1ecc4f1
·
1 Parent(s): fc40471
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -3,7 +3,7 @@ import numpy as np
3
  import pandas as pd
4
  import torch
5
  import torchaudio
6
-
7
  from lang_id import identify_languages
8
  from whisper import transcribe
9
 
@@ -60,9 +60,11 @@ def process_audio(audio, chunk_duration, language_set):
60
  audio_sec += chunk_duration
61
 
62
  print(f"Processing audio chunk of length {len(chunk)}")
63
- volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
64
  length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
 
65
  selected_scores, all_scores = identify_languages(chunk, language_set)
 
66
 
67
  # 日本語と英語の確率値を取得
68
  ja_prob = selected_scores['Japanese']
@@ -74,7 +76,9 @@ def process_audio(audio, chunk_duration, language_set):
74
  top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
75
 
76
  # テキストの認識
 
77
  transcription = transcribe(chunk)
 
78
 
79
  data.append({
80
  "Time": audio_sec,
@@ -82,6 +86,8 @@ def process_audio(audio, chunk_duration, language_set):
82
  "Volume": volume_norm,
83
  "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
84
  "Language": top3_languages,
 
 
85
  "Text": transcription,
86
  })
87
 
 
3
  import pandas as pd
4
  import torch
5
  import torchaudio
6
+ from datetime import datetime
7
  from lang_id import identify_languages
8
  from whisper import transcribe
9
 
 
60
  audio_sec += chunk_duration
61
 
62
  print(f"Processing audio chunk of length {len(chunk)}")
63
+ volume_norm = np.linalg.norm(chunk)
64
  length = len(chunk) / SAMPLING_RATE # 音声データの長さ(秒)
65
+ s = datetime.now()
66
  selected_scores, all_scores = identify_languages(chunk, language_set)
67
+ lang_id_time = (datetime.now() - s).total_seconds()
68
 
69
  # 日本語と英語の確率値を取得
70
  ja_prob = selected_scores['Japanese']
 
76
  top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
77
 
78
  # テキストの認識
79
+ s = datetime.now()
80
  transcription = transcribe(chunk)
81
+ transcribe_time = (datetime.now() - s).total_seconds()
82
 
83
  data.append({
84
  "Time": audio_sec,
 
86
  "Volume": volume_norm,
87
  "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
88
  "Language": top3_languages,
89
+ "Lang ID Time": lang_id_time,
90
+ "Transcribe Time": transcribe_time,
91
  "Text": transcription,
92
  })
93