litagin committed on
Commit
ecd87d6
1 Parent(s): 0f26f17
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  embeddings/all_filelists.txt filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  embeddings/all_filelists.txt filter=lfs diff=lfs merge=lfs -text
37
+ embeddings/all_filelist.txt filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,6 +1,9 @@
 
1
  import os
 
2
  import tempfile
3
  import zipfile
 
4
  from pathlib import Path
5
 
6
  import gradio as gr
@@ -12,7 +15,7 @@ from loguru import logger
12
  from pyannote.audio import Inference, Model
13
 
14
  HF_REPO_ID = "litagin/voice-samples-22050"
15
- RESNET34_ROOT = Path("./embeddings")
16
  RESNET34_DIM = 256
17
  AUDIO_ZIP_DIR = Path("./audio_files_zipped_by_game_22_050")
18
 
@@ -30,7 +33,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
30
  logger.info(f"Device: {device}")
31
 
32
  logger.info("Loading resnet34 vectors...")
33
- resnet34_embs = np.load(RESNET34_ROOT / "all_embs.npy")
34
  resnet34_embs_normalized = resnet34_embs / np.linalg.norm(
35
  resnet34_embs, axis=1, keepdims=True
36
  )
@@ -41,11 +44,11 @@ inference = Inference(model_resnet34, window="whole")
41
  inference.to(device)
42
 
43
  logger.info("Loading filelist...")
44
- with open(RESNET34_ROOT / "all_filelists.txt", "r", encoding="utf-8") as file:
45
  files = [line.strip() for line in file]
46
 
47
 
48
- def get_speaker_name(file_idx: int):
49
  filepath = Path(files[file_idx])
50
  game_name = filepath.parent.parent.name
51
  speaker_name = filepath.parent.name
@@ -54,22 +57,39 @@ def get_speaker_name(file_idx: int):
54
 
55
  # スピーカーIDの配列を取得
56
  logger.info("Getting speaker ids...")
57
- all_speaker_set = set([get_speaker_name(i) for i in range(len(files))])
58
  id2speaker = {i: speaker for i, speaker in enumerate(sorted(all_speaker_set))}
59
  num_speakers = len(id2speaker)
60
  speaker2id = {speaker: i for i, speaker in id2speaker.items()}
61
- speaker_id_array = np.array(
62
- [speaker2id[get_speaker_name(i)] for i in range(len(files))]
63
- )
 
 
 
 
 
64
 
65
 
66
- # def get_zip_archive_path_and_internal_path(file_path: Path) -> tuple[str, str]:
67
- # # 構造: audio_files/{game_name}/{speaker_name}/{audio_file}
68
- # game_name = file_path.parent.parent.name
69
- # speaker_name = file_path.parent.name
70
- # archive_path = AUDIO_ZIP_DIR / game_name / f"{speaker_name}.zip"
71
- # internal_path = file_path.name # ZIP内のパスはファイル名のみ
72
- # return str(archive_path), str(internal_path)
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
 
75
  def get_zip_archive_path_and_internal_path(file_path: Path) -> tuple[str, str]:
@@ -77,7 +97,7 @@ def get_zip_archive_path_and_internal_path(file_path: Path) -> tuple[str, str]:
77
  game_name = file_path.parent.parent.name
78
  speaker_name = file_path.parent.name
79
  archive_path = AUDIO_ZIP_DIR / f"{game_name}.zip"
80
- internal_path = f"{speaker_name}/{file_path.name}" # ZIP内のパスを "speaker_name/ファイル名" とする
81
  return str(archive_path), str(internal_path)
82
 
83
 
@@ -105,7 +125,18 @@ def get_emb(audio_path: Path | str) -> np.ndarray:
105
  return emb
106
 
107
 
108
- def search(audio_path: str):
 
 
 
 
 
 
 
 
 
 
 
109
  logger.info("Computing embeddings...")
110
  emb = get_emb(audio_path) # ユーザー入力の音声ファイル
111
  emb = emb.reshape(1, -1) # (1, dim)
@@ -121,25 +152,48 @@ def search(audio_path: str):
121
  top_k = 10
122
  top_k_indices = np.argsort(similarities)[::-1][:top_k]
123
  top_k_files = [files[file_idx] for file_idx in top_k_indices]
 
124
  top_k_scores = similarities[top_k_indices]
 
125
  logger.info("Fetching audio files...")
126
- result = []
 
127
 
128
- for i, (f, file_idx, score) in enumerate(
129
- zip(top_k_files, top_k_indices, top_k_scores)
130
- ):
131
- waveform_np, sample_rate = load_audio_from_zip(Path(f))
132
- result.append(
133
  gr.Audio(
134
  value=(sample_rate, waveform_np),
135
- label=f"Top {i+1}: {get_speaker_name(file_idx)}, {score:.4f}",
136
  )
137
  )
 
 
 
 
 
 
 
 
 
138
  logger.success("Audio files fetched.")
139
- return result
140
-
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- def get_label(audio_path: str, num_top_classes: int = 10):
143
  logger.info("Computing embeddings...")
144
  emb = get_emb(audio_path) # ユーザー入力の音声ファイル
145
  emb = emb.reshape(1, -1) # (1, dim)
@@ -158,38 +212,113 @@ def get_label(audio_path: str, num_top_classes: int = 10):
158
 
159
  # このキャラクターのトップ10の類似度を選択
160
  top_similarities = np.sort(similarities[character_indices])[::-1][
161
- :num_top_classes
162
  ]
163
 
164
  # 平均スコアを計算
165
  average_score = np.mean(top_similarities)
166
 
167
  # スピーカー名を取得
168
- speaker_name = id2speaker[character_id]
169
 
170
- speaker_scores[speaker_name] = average_score
171
 
172
  # スコアでソートして上位10件を返す
173
- sorted_scores = dict(
174
- sorted(speaker_scores.items(), key=lambda item: item[1], reverse=True)[:10]
 
 
 
 
 
175
  )
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  logger.success("Average scores calculated.")
178
- return sorted_scores
 
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
 
181
  with gr.Blocks() as app:
182
- input_audio = gr.Audio(type="filepath")
183
- with gr.Row():
184
- with gr.Column():
185
- btn_audio = gr.Button("似ている音声を検索")
186
- top_k = 10
187
- components = [gr.Audio(label=f"Top {i+1}") for i in range(top_k)]
188
- with gr.Column():
189
- btn_label = gr.Button("似ている話者を検索")
190
- label = gr.Label(num_top_classes=10)
191
-
192
- btn_audio.click(search, inputs=[input_audio], outputs=components)
193
- btn_label.click(get_label, inputs=[input_audio], outputs=[label])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
  app.launch()
 
1
+ import json
2
  import os
3
+ import pprint
4
  import tempfile
5
  import zipfile
6
+ from dataclasses import dataclass
7
  from pathlib import Path
8
 
9
  import gradio as gr
 
15
  from pyannote.audio import Inference, Model
16
 
17
  HF_REPO_ID = "litagin/voice-samples-22050"
18
+ EMB_ROOT = Path("./embeddings")
19
  RESNET34_DIM = 256
20
  AUDIO_ZIP_DIR = Path("./audio_files_zipped_by_game_22_050")
21
 
 
33
  logger.info(f"Device: {device}")
34
 
35
  logger.info("Loading resnet34 vectors...")
36
+ resnet34_embs = np.load(EMB_ROOT / "all_embs.npy")
37
  resnet34_embs_normalized = resnet34_embs / np.linalg.norm(
38
  resnet34_embs, axis=1, keepdims=True
39
  )
 
44
  inference.to(device)
45
 
46
  logger.info("Loading filelist...")
47
+ with open(EMB_ROOT / "all_filelist.txt", "r", encoding="utf-8") as file:
48
  files = [line.strip() for line in file]
49
 
50
 
51
+ def get_speaker_key(file_idx: int):
52
  filepath = Path(files[file_idx])
53
  game_name = filepath.parent.parent.name
54
  speaker_name = filepath.parent.name
 
57
 
58
# Build the speaker-id lookup tables and the per-file speaker-id array.
logger.info("Getting speaker ids...")
# Resolve every file's speaker key once; the same list feeds both the set of
# distinct speakers and the per-file id array, so each path is parsed one time.
speaker_keys = [get_speaker_key(i) for i in range(len(files))]
all_speaker_set = set(speaker_keys)
# Ids are assigned in sorted key order so they are stable across runs.
id2speaker = dict(enumerate(sorted(all_speaker_set)))
num_speakers = len(id2speaker)
speaker2id = {speaker: i for i, speaker in id2speaker.items()}
# speaker_id_array[k] is the speaker id of files[k].
speaker_id_array = np.array([speaker2id[key] for key in speaker_keys])
65
+
66
+
67
@dataclass
class GameInfo:
    """Display metadata for a single game.

    Instances are built from the records of ``game_info.json`` and are used
    to render the game title, company and official-site link in the results.
    """

    # Company / brand name, shown in parentheses after the game title.
    company: str
    # Game title as shown to the user.
    name: str
    # URL of the game's (or company's) official site.
    url: str
72
 
73
 
74
logger.info("Loading game dictionary...")
# game_info.json is a JSON list of records shaped like:
# [
#   {
#     "key": "Aino+Links_Sousaku_Kanojo_no_Ren'ai_Koushiki",
#     "company": "Aino+Links",
#     "name": "創作彼女の恋愛公式",
#     "url": "http://ainolinks.com/"
#   },
#   ...
# ]
with open("game_info.json", "r", encoding="utf-8") as file:
    game_info = json.load(file)

# Index the records by "key" so a game can be looked up from the game
# directory name that appears in each audio file path.
game_dict = {
    game["key"]: GameInfo(company=game["company"], name=game["name"], url=game["url"])
    for game in game_info
}
93
 
94
 
95
  def get_zip_archive_path_and_internal_path(file_path: Path) -> tuple[str, str]:
 
97
  game_name = file_path.parent.parent.name
98
  speaker_name = file_path.parent.name
99
  archive_path = AUDIO_ZIP_DIR / f"{game_name}.zip"
100
+ internal_path = f"{speaker_name}/{file_path.name}"
101
  return str(archive_path), str(internal_path)
102
 
103
 
 
125
  return emb
126
 
127
 
128
+ def search_audio_files(audio_path: str):
129
+ # Check audio duration, require < 30s
130
+ logger.info(f"Getting duration of {audio_path}...")
131
+ waveform, sample_rate = librosa.load(audio_path, sr=None)
132
+ duration = librosa.get_duration(y=waveform, sr=sample_rate)
133
+ logger.info(f"Duration: {duration:.2f}s")
134
+ if duration > 30:
135
+ logger.error(f"Duration is too long: {duration:.2f}s")
136
+ return [
137
+ f"音声ファイルは30秒以下である必要があります。現在のファイルの長さ: {duration:.2f}s"
138
+ ] + [None] * 20
139
+
140
  logger.info("Computing embeddings...")
141
  emb = get_emb(audio_path) # ユーザー入力の音声ファイル
142
  emb = emb.reshape(1, -1) # (1, dim)
 
152
  top_k = 10
153
  top_k_indices = np.argsort(similarities)[::-1][:top_k]
154
  top_k_files = [files[file_idx] for file_idx in top_k_indices]
155
+ logger.info(f"Top {top_k} files:\n{pprint.pformat(top_k_files)}")
156
  top_k_scores = similarities[top_k_indices]
157
+ logger.info(f"Top {top_k} scores:\n{pprint.pformat(top_k_scores)}")
158
  logger.info("Fetching audio files...")
159
+ audio_result = []
160
+ info_result = []
161
 
162
+ for i, (file_idx, score) in enumerate(zip(top_k_indices, top_k_scores)):
163
+ file_path = Path(files[file_idx])
164
+ waveform_np, sample_rate = load_audio_from_zip(file_path)
165
+ audio_result.append(
 
166
  gr.Audio(
167
  value=(sample_rate, waveform_np),
168
+ label=f"Top {i+1} ({score:.4f})",
169
  )
170
  )
171
+ game_key = file_path.parent.parent.name
172
+ speaker_name = file_path.parent.name
173
+ game = game_dict[game_key]
174
+ game_info_md = f"""
175
+ ## {i+1}位 (スコア{score:.4f})
176
+ - ゲーム名: **{game.name}** ({game.company})
177
+ - 公式サイト: {game.url}
178
+ - キャラクター名: **{speaker_name}**"""
179
+ info_result.append(gr.Markdown(game_info_md))
180
  logger.success("Audio files fetched.")
181
+ return ["成功"] + info_result + audio_result
182
+
183
+
184
+ def get_label(audio_path: str, num_top_classes_to_use: int = 10):
185
+ # Check audio duration, require < 30s
186
+ logger.info(f"Getting duration of {audio_path}...")
187
+ waveform, sample_rate = librosa.load(audio_path, sr=None)
188
+ duration = librosa.get_duration(y=waveform, sr=sample_rate)
189
+ logger.info(f"Duration: {duration:.2f}s")
190
+ if duration > 30:
191
+ logger.error(f"Duration is too long: {duration:.2f}s")
192
+ return (
193
+ f"音声ファイルは30秒以下である必要があります。現在のファイルの長さ: {duration:.2f}s",
194
+ None,
195
+ )
196
 
 
197
  logger.info("Computing embeddings...")
198
  emb = get_emb(audio_path) # ユーザー入力の音声ファイル
199
  emb = emb.reshape(1, -1) # (1, dim)
 
212
 
213
  # このキャラクターのトップ10の類似度を選択
214
  top_similarities = np.sort(similarities[character_indices])[::-1][
215
+ :num_top_classes_to_use
216
  ]
217
 
218
  # 平均スコアを計算
219
  average_score = np.mean(top_similarities)
220
 
221
  # スピーカー名を取得
222
+ speaker_key = id2speaker[character_id]
223
 
224
+ speaker_scores[speaker_key] = average_score
225
 
226
  # スコアでソートして上位10件を返す
227
+ sorted_scores_list = sorted(
228
+ speaker_scores.items(), key=lambda x: x[1], reverse=True
229
+ )
230
+ sorted_scores_list = sorted_scores_list[:10]
231
+ logger.success("Average scores calculated.")
232
+ logger.info(
233
+ f"Top {num_top_classes_to_use} speakers:\n{pprint.pformat(sorted_scores_list)}"
234
  )
235
 
236
+ results = []
237
+ for i, (speaker_key, score) in enumerate(sorted_scores_list):
238
+ game_key, speaker_name = speaker_key.split("/")
239
+ result_md = f"""
240
+ ## {i+1}位 (スコア{score:.4f})
241
+ - ゲーム名: **{game_dict[game_key].name}** ({game_dict[game_key].company})
242
+ - 公式サイト: {game_dict[game_key].url}
243
+ - キャラクター名: {speaker_name}
244
+ ---"""
245
+ results.append(result_md)
246
+
247
+ all_result_md = "\n\n".join(results)
248
+
249
  logger.success("Average scores calculated.")
250
+ return "成功", all_result_md
251
+
252
 
253
def make_game_info_md(game_key: str) -> str:
    """Return a Markdown link to a game's official site.

    The link text is the bolded game name followed by the company in
    parentheses; the target is the game's URL.

    Args:
        game_key: Key of the game in ``game_dict``.

    Returns:
        A Markdown string of the form ``[**name** (company)](url)``.

    Raises:
        KeyError: If ``game_key`` is not present in ``game_dict``.
    """
    game = game_dict[game_key]
    return f"[**{game.name}** ({game.company})]({game.url})"
256
+
257
+
258
def make_speaker_info_md(game_key: str, speaker_name: str) -> str:
    """Return Markdown describing a speaker and the game they appear in.

    The first line is a link to the game's official site (game name and
    company as the link text); the second line is the speaker name.

    Args:
        game_key: Key of the game in ``game_dict``.
        speaker_name: Character name to print under the game link.

    Returns:
        A two-line Markdown string: ``[name (company)](url)`` then the
        speaker name.

    Raises:
        KeyError: If ``game_key`` is not present in ``game_dict``.
    """
    game = game_dict[game_key]
    return f"[{game.name} ({game.company})]({game.url})\n{speaker_name}"
261
+
262
+
263
+ initial_md = """
264
+ # ギャルゲー似た声検索
265
+
266
+ - 与えられた音声に対して、声が似ているような日本のギャルゲー(ビジュアルノベル・エロゲー)の音声を検索するアプリです
267
+ - 「この声と似たキャラクターが出ているギャルゲーは?」「この音声AIの声に聞き覚えあるけど、学習元は誰なのかな?」といった疑問の参考になるかもしれません
268
+ - 次ができます:
269
+ - セリフ単位でのTop 10の音声のサンプル表示
270
+ - キャラクター単位でのTop 10のキャラクター表示
271
+ - ゲームの公式サイトへのリンクもありますが、**18歳未満の方はリンク先へのアクセスを控えてください**
272
+ - 全てのゲームや、その中の全ての音声が網羅されているわけではありません(データについては下記詳細を参照)
273
+ """
274
+
275
+ details_md = """
276
+
277
+ ## 音声データ
278
+
279
+ - 音声データは全て [OOPPEENN/Galgame_Dataset](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset) から取得
280
+ - 音声ファイル処理: 各キャラクターについて次を行う
281
+ - 総ファイル数が100未満の場合はモブキャラとして除外
282
+ - 「2秒以上20秒未満」の音声ファイルのうち、時系列的に最初の100ファイルに加え、ランダムに最大200ファイル、合計最大300ファイルを選択
283
+ - 22050Hz oggでリサンプリング
284
+
285
+ ## 音声ファイル同士の類似度計算
286
+ - 話者埋め込み: [pyannote/wespeaker-voxceleb-resnet34-LM](https://huggingface.co/pyannote/wespeaker-voxceleb-resnet34-LM) の256次元の話者埋め込み
287
+ - 類似度計算: 2つの音声ファイルの話者埋め込みベクトルのコサイン類似度
288
+
289
+ ## キャラクター検索
290
+ - 与えられた音声に対して、全ての音声ファイルとの類似度を計算
291
+ - 各キャラクターについて、類似度の高い10ファイルの平均類似度を計算し、スコアとする
292
+ - そのスコアでソートして上位10キャラクターを表示
293
+ """
294
 
295
# Gradio UI: one shared audio input and two tabs, one for per-utterance search
# and one for per-character search.
with gr.Blocks() as app:
    gr.Markdown(initial_md)
    with gr.Accordion(label="詳細", open=False):
        gr.Markdown(details_md)
    input_audio = gr.Audio(type="filepath", label="音声ファイルを入力")
    with gr.Tab(label="セリフ音声検索"):
        btn_audio = gr.Button("似ているセリフ音声を検索")
        info_audio = gr.Textbox(label="情報")
        num_candidates = 10
        audio_components = []
        game_info_components = []
        # One panel row per candidate: game/character info next to its audio.
        for i in range(num_candidates):
            with gr.Row(variant="panel"):
                game_info_components.append(gr.Markdown(label=f"Top {i+1}"))
                audio_components.append(gr.Audio(label=f"Top {i+1}"))
    with gr.Tab(label="キャラクター検索"):
        btn_character = gr.Button("似ているキャラクターを検索")
        info_character = gr.Textbox(label="情報")
        result_character = gr.Markdown("ここに結果が表示されます")

    # Output order must match what search_audio_files returns:
    # status text, then all info Markdown components, then all audio components.
    btn_audio.click(
        search_audio_files,
        inputs=[input_audio],
        outputs=[info_audio] + game_info_components + audio_components,
    )
    # get_label returns (status text, result Markdown).
    btn_character.click(
        get_label, inputs=[input_audio], outputs=[info_character, result_character]
    )

app.launch()
embeddings/{all_filelists.txt → all_filelist.txt} RENAMED
File without changes