Spaces:

terapyon
/

podcast-search

Sleeping

App Files Files Community

terapyon committed on Jan 4

Commit

d788666

1 Parent(s): 608103a

srtの分割を1分にし、configなどを整え、READMEを書いた

Browse files

Files changed (4) hide show

README.md +135 -0
src/config.py +8 -0
src/episode.py +8 -13
src/store.py +4 -8

README.md CHANGED Viewed

	@@ -3,3 +3,138 @@
3	Podcast terapyon channelを検索する仕組み
4
5

 Podcast terapyon channelを検索する仕組み
+## 使い方
+### タイトルリスト
+- 以下のファイルを`store` フォルダに置く
+- `title-list-202301-202501.parquet`
+- 以下のカラムを持つ
+  - id: int
+  - date: str (2023-01-09)
+  - length: int
+  - audio: str (オーディオファイルURL)
+  - title: str
+タイトルリストファイルの例
+<div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>id</th>
+      <th>date</th>
+      <th>length</th>
+      <th>audio</th>
+      <th>title</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>69</td>
+      <td>2023-01-09</td>
+      <td>20993616</td>
+      <td>https://anchor.fm/s/14480e04/podcast/play/6323...</td>
+      <td>#69 2023年新年挨拶から 2022年の振り返りと2023年の抱負</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>70</td>
+      <td>2023-03-09</td>
+      <td>103287296</td>
+      <td>https://anchor.fm/s/14480e04/podcast/play/6621...</td>
+      <td>#70 PyCon JP Association代表理事退任と今後の展望をIqbalさんと語る</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>71</td>
+      <td>2023-03-22</td>
+      <td>116393694</td>
+      <td>https://anchor.fm/s/14480e04/podcast/play/6706...</td>
+      <td>#71 hirokikyさんをゲストに 自然言語処理系AI Chat GPT / Whisp...</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>72</td>
+      <td>2023-05-04</td>
+      <td>49642320</td>
+      <td>https://anchor.fm/s/14480e04/podcast/play/6976...</td>
+      <td>#72 PyCon US 2023 ひとり振り返り</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>73</td>
+      <td>2023-05-24</td>
+      <td>150643013</td>
+      <td>https://anchor.fm/s/14480e04/podcast/play/7094...</td>
+      <td>#73 Nyohoさんをゲストに Scratchからディープラーニングや数学の話</td>
+    </tr>
+  </tbody>
+</table>
+</div>
+### 文字データ作成
+- dataフォルダを作る(srcと同じ階層)
+- dataフォルダに、srtファイルを入れる
+  - (以下に従うと、srtファイルからIDが取得できる)
+  - 拡張子を `.srt` とする
+  - ファイル名に、ID(整数)が1つだけ入っていること
+  - IDの前後が、 `-` または `_` で区切られていること
+- 以下のスクリプトを実行する。 `store` フォルダに `parquet` ファイルが srtファイル分できる
+```
+% python src/episode.py
+```
+### データベース作成
+以下のコマンドで、テーブル作成から必要な3つのデータの投入までを行い、DuckDB(永続化)のデータベースを作る
+```
+% python src/store.py all
+```
+上記のコマンドの詳細
+- テーブル作成 create table
+  - `python src/store.py create`
+- タイトルリスト insert
+  - `python src/store.py podcastinsert`
+- エピソードとテキスト insert
+  - `python src/store.py episodeinsert`
+- ベクトル化 embedding
+  - `python src/store.py embed`
+- ベクトルデータ index
+  - `python src/store.py index`
+### 検索UI
+```
+% streamlit run src/app.py
+```
+- Podcastタイトル(複数)を選ぶ。未選択の場合すべてとなる
+- 検索したいワードをテキストボックスに入力
+- 10個のセンテンス(文章)候補が出てくる
+- 表の左をクリックすると、下部に文字列が表示される
+- 音声のタイミング（分・秒）が表示される・・未実装
+- そのタイミングの音声がその場で聞ける・・将来的に実装したいが実現方法未確定

src/config.py CHANGED Viewed

@@ -1,6 +1,14 @@
 from pathlib import Path
 # import logging
 HERE = Path(__file__).resolve().parent
 DUCKDB_FILE = HERE.parent / "db" / "terapyon-podcast.duckdb"

+from datetime import timedelta
+import re
 from pathlib import Path
 # import logging
 HERE = Path(__file__).resolve().parent
 DUCKDB_FILE = HERE.parent / "db" / "terapyon-podcast.duckdb"
+STORE_DIR = HERE.parent / "store"
+DATA_DIR = HERE.parent / "data"
+PODCAST_TITLE_LIST = str(STORE_DIR / 'title-list-202301-202501.parquet')
+EPISODES_PARQUET = str(STORE_DIR / 'podcast-*.parquet')
+divider_time = timedelta(minutes=1)
+RE_PODCAST_SRT_FILE = re.compile(r"[_-](\d+)[_-]")

src/episode.py CHANGED Viewed

@@ -1,23 +1,15 @@
 from dataclasses import dataclass
 from datetime import time as dt_time
 from datetime import timedelta
-from pathlib import Path
-import re
 import pandas as pd
-HERE = Path(__file__).parent
-DATA_DIR = HERE.parent / "data"
-STORE_DIR = HERE.parent / "store"
-divider_time = timedelta(minutes=5)
-RE_PODCAST = re.compile(r"[_-](\d+)[_-]")
 @dataclass
 class SplitedText:
     part: int
-    start: timedelta
-    end: timedelta
     text: str
@@ -72,7 +64,10 @@ def make_episode(id_: int, title: str, srt_filename: str) -> Episode:
             if start and second and text:
                 if abs(second - start) > divider_time:
                     end = second
-                    st = SplitedText(part=part, start=start, end=end, text=text)
                     episode.texts.append(st)
                     # print(text)
@@ -95,7 +90,7 @@ def make_df(episode: Episode) -> pd.DataFrame:
 def get_srt_files():
     lst = []
     for file_path in DATA_DIR.glob("*.srt"):
-        m = RE_PODCAST.search(file_path.name)
         if m is not None:
             filename = file_path.name
             id_ = int(m.group(1))

 from dataclasses import dataclass
 from datetime import time as dt_time
 from datetime import timedelta
 import pandas as pd
+from config import STORE_DIR, DATA_DIR, divider_time, RE_PODCAST_SRT_FILE
 @dataclass
 class SplitedText:
     part: int
+    start: int
+    end: int
     text: str
             if start and second and text:
                 if abs(second - start) > divider_time:
                     end = second
+                    st = SplitedText(part=part,
+                                     start=int(start.total_seconds()),
+                                     end=int(end.total_seconds()),
+                                     text=text)
                     episode.texts.append(st)
                     # print(text)
 def get_srt_files():
     lst = []
     for file_path in DATA_DIR.glob("*.srt"):
+        m = RE_PODCAST_SRT_FILE.search(file_path.name)
         if m is not None:
             filename = file_path.name
             id_ = int(m.group(1))

src/store.py CHANGED Viewed

@@ -1,17 +1,13 @@
-from pathlib import Path
 import duckdb
 from embedding import get_embeddings
 from config import DUCKDB_FILE
-HERE = Path(__file__).parent
-STORE_DIR = HERE.parent / "store"
 def create_table():
     conn = duckdb.connect(DUCKDB_FILE)
     podcasts_create = """CREATE TABLE podcasts (
-        id BIGINT PRIMARY KEY,
         title TEXT, date DATE, guests TEXT[], length BIGINT, audio TEXT
         );
     """
@@ -39,7 +35,7 @@ def insert_podcast():
         SELECT id, title, date, [], length, audio
           FROM read_parquet(?);
     """
-    conn.execute(sql, [str(STORE_DIR / 'title-list-202301-202501.parquet')])
     conn.commit()
     conn.close()
@@ -50,7 +46,7 @@ def insert_episodes():
         SELECT id, part, start, end_, text
           FROM read_parquet(?);
     """
-    conn.execute(sql, [str(STORE_DIR / 'podcast-*.parquet')])
     conn.commit()
     conn.close()

 import duckdb
 from embedding import get_embeddings
 from config import DUCKDB_FILE
+from config import PODCAST_TITLE_LIST, EPISODES_PARQUET
 def create_table():
     conn = duckdb.connect(DUCKDB_FILE)
     podcasts_create = """CREATE TABLE podcasts (
+        id BIGINT PRIMARY KEY,
         title TEXT, date DATE, guests TEXT[], length BIGINT, audio TEXT
         );
     """
         SELECT id, title, date, [], length, audio
           FROM read_parquet(?);
     """
+    conn.execute(sql, [PODCAST_TITLE_LIST])
     conn.commit()
     conn.close()
         SELECT id, part, start, end_, text
           FROM read_parquet(?);
     """
+    conn.execute(sql, [EPISODES_PARQUET])
     conn.commit()
     conn.close()