tan-z-tan committed on
Commit 9996005 · 1 Parent(s): 5e7654d
Files changed (1)
  1. app.py +20 -24
app.py CHANGED
@@ -7,20 +7,12 @@ import torchaudio
 from lang_id import identify_languages
 from whisper import transcribe

-# # Load the Whisper model and processor
-# model_name = "openai/whisper-tiny"
-# processor = WhisperProcessor.from_pretrained(model_name)
-# model = WhisperForConditionalGeneration.from_pretrained(model_name)
-# # Device setup (use the GPU if available)
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-# model.to(device)
-
 # Variables that hold the application state
 data = []
 current_chunk = []

 SAMPLING_RATE = 16000
-CHUNK_DURATION = 5  # chunks of 5 seconds
+CHUNK_DURATION = 5  # 5 as the initial value

 
 def normalize_audio(audio):
@@ -38,12 +30,18 @@ def resample_audio(audio, orig_sr, target_sr=16000):
     return audio

 
-def process_audio(audio):
-    global data, current_chunk
+def process_audio(audio, chunk_duration, language_set):
+    global data, current_chunk, SAMPLING_RATE
     print("Process_audio")
     print(audio)
+    if audio is None:
+        return
+
     sr, audio_data = audio

+    # Parse the comma-separated language set
+    language_set = [lang.strip() for lang in language_set.split(",")]
+
     print(audio_data.shape, audio_data.dtype)
     # Align the sampling rate first of all
     audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
@@ -56,15 +54,15 @@ def process_audio(audio):
     current_chunk.append(audio_data)
     total_chunk = np.concatenate(current_chunk)

-    while len(total_chunk) >= SAMPLING_RATE * CHUNK_DURATION:
-        chunk = total_chunk[:SAMPLING_RATE * CHUNK_DURATION]
-        total_chunk = total_chunk[SAMPLING_RATE * CHUNK_DURATION:]  # drop the processed part
-        audio_sec += CHUNK_DURATION
+    while len(total_chunk) >= SAMPLING_RATE * chunk_duration:
+        chunk = total_chunk[:SAMPLING_RATE * chunk_duration]
+        total_chunk = total_chunk[SAMPLING_RATE * chunk_duration:]  # drop the processed part
+        audio_sec += chunk_duration

         print(f"Processing audio chunk of length {len(chunk)}")
         volume_norm = np.linalg.norm(chunk) / np.finfo(np.float32).max
         length = len(chunk) / SAMPLING_RATE  # length of the audio data (seconds)
-        selected_scores, all_scores = identify_languages(chunk)
+        selected_scores, all_scores = identify_languages(chunk, language_set)

         # Get the probability values for Japanese and English
         ja_prob = selected_scores['Japanese']
@@ -79,7 +77,6 @@ def process_audio(audio):
         transcription = transcribe(chunk)

         data.append({
-            # "Time": pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
             "Time": audio_sec,
             "Length (s)": length,
             "Volume": volume_norm,
@@ -95,14 +92,16 @@ def process_audio(audio):
     current_chunk = [total_chunk]

 
-inputs_file = gr.Audio(sources=["upload"], type="numpy")
-inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
+# Input components for the parameters
+chunk_duration_input = gr.Number(value=5, label="Chunk Duration (seconds)")
+language_set_input = gr.Textbox(value="Japanese,English", label="Language Set (comma-separated)")
+
+inputs_file = [gr.Audio(sources=["upload"], type="numpy"), chunk_duration_input, language_set_input]
+inputs_stream = [gr.Audio(sources=["microphone"], type="numpy", streaming=True), chunk_duration_input, language_set_input]
 outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]

 with gr.Blocks() as demo:
     with gr.TabItem("Upload"):
-        inputs_file = gr.Audio(sources=["upload"], type="numpy")
-        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
         gr.Interface(
             fn=process_audio,
             inputs=inputs_file,
@@ -113,8 +112,6 @@ with gr.Blocks() as demo:
         )

     with gr.TabItem("Microphone"):
-        inputs_stream = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
-        outputs = [gr.Audio(type="numpy"), gr.DataFrame(headers=["Time", "Volume", "Length (s)"])]
         gr.Interface(
             fn=process_audio,
             inputs=inputs_stream,
@@ -124,6 +121,5 @@ with gr.Blocks() as demo:
             description="Speak into the microphone and see real-time audio processing results."
         )

-
 if __name__ == "__main__":
     demo.launch()
 
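For readers skimming the diff, the heart of the change is that the fixed CHUNK_DURATION constant becomes a per-call chunk_duration argument driving the buffering loop. A minimal sketch of that loop in isolation (split_chunks is a hypothetical name, not part of the commit; only NumPy is assumed):

import numpy as np

SAMPLING_RATE = 16000  # same constant as in app.py

def split_chunks(buffer, chunk_duration):
    # Mirrors the while-loop added in this commit: consume the buffer in
    # slices of chunk_duration seconds, keeping the unconsumed remainder.
    step = SAMPLING_RATE * chunk_duration
    chunks = []
    while len(buffer) >= step:
        chunks.append(buffer[:step])
        buffer = buffer[step:]  # drop the processed part
    return chunks, buffer

audio = np.zeros(SAMPLING_RATE * 12, dtype=np.float32)  # 12 s of silence
chunks, rest = split_chunks(audio, chunk_duration=5)
print(len(chunks), len(rest) / SAMPLING_RATE)  # -> 2 2.0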
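The new Language Set textbox is free-form text, and the commit parses it with a strip-after-split, so spaces around commas are harmless; a quick check of the exact expression used:

language_set = "Japanese, English"
parsed = [lang.strip() for lang in language_set.split(",")]
print(parsed)  # -> ['Japanese', 'English']

Note that the unchanged context still reads selected_scores['Japanese'] directly, so the language set must include Japanese for process_audio to work.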