terapyon committed on
Commit
da71779
·
unverified ·
2 Parent(s): af9a512 b8736f2

Merge pull request #7 from terapyon/terada/mt-245-hf-deploy

Browse files
Files changed (4) hide show
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. src/app.py +19 -7
  4. src/config.py +6 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # podcast-search
2
 
3
  Podcast terapyon channelを検索する仕組み
 
1
+ ---
2
+ title: Podcast Search
3
+ emoji: 🚀
4
+ colorFrom: green
5
+ colorTo: gray
6
+ sdk: streamlit
7
+ sdk_version: 1.41.1
8
+ app_file: src/app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: terapyon channel の検索
12
+ ---
13
+
14
  # podcast-search
15
 
16
  Podcast terapyon channelを検索する仕組み
src/app.py CHANGED
@@ -1,13 +1,23 @@
1
  from datetime import timedelta
 
2
  import streamlit as st
3
  import duckdb
4
  from embedding import get_embeddings
5
- from config import DUCKDB_FILE
6
 
7
 
8
  @st.cache_resource
9
  def get_conn():
10
- return duckdb.connect(DUCKDB_FILE)
 
 
 
 
 
 
 
 
 
11
 
12
 
13
  title_query = """SELECT id, title FROM podcasts
@@ -20,11 +30,11 @@ query = """WITH filtered_podcasts AS (
20
  WHERE id in ?
21
  ),
22
  ordered_embeddings AS (
23
- SELECT embeddings.id, embeddings.part
24
  FROM embeddings
25
  JOIN filtered_podcasts fp ON embeddings.id = fp.id
26
- ORDER BY array_distance(embedding, ?::FLOAT[1024])
27
- LIMIT 10
28
  )
29
  SELECT
30
  p.title,
@@ -33,6 +43,7 @@ SELECT
33
  e.text,
34
  e.part,
35
  p.audio,
 
36
  FROM
37
  ordered_embeddings oe
38
  JOIN
@@ -42,7 +53,8 @@ SELECT
42
  JOIN
43
  podcasts p
44
  ON
45
- oe.id = p.id;
 
46
  """
47
 
48
  st.title("terapyon cannel search")
@@ -65,7 +77,7 @@ if word:
65
  result = conn.execute(query,
66
  (selected_ids, word_embedding,)).df()
67
  selected = st.dataframe(result,
68
- column_order=["title", "date", "part", "start", "text", "audio"],
69
  on_select="rerun",
70
  selection_mode="single-row")
71
  if selected:
 
1
  from datetime import timedelta
2
+ import os
3
  import streamlit as st
4
  import duckdb
5
  from embedding import get_embeddings
6
+ from config import HF_HOST, DUCKDB_FILE, HF_REPO_TYPE, HF_REPO_ID, HF_FILENAME
7
 
8
 
9
  @st.cache_resource
10
  def get_conn():
11
+ if HF_HOST:
12
+ os.environ["HUGGINGFACE_TOKEN"] = os.getenv("HF_TOKEN", "")
13
+ from huggingface_hub import hf_hub_download
14
+ local_file = hf_hub_download(
15
+ repo_type=HF_REPO_TYPE,
16
+ repo_id=HF_REPO_ID,
17
+ filename=HF_FILENAME)
18
+ return duckdb.connect(local_file)
19
+ else:
20
+ return duckdb.connect(DUCKDB_FILE)
21
 
22
 
23
  title_query = """SELECT id, title FROM podcasts
 
30
  WHERE id in ?
31
  ),
32
  ordered_embeddings AS (
33
+ SELECT embeddings.id, embeddings.part, array_distance(embedding, ?::FLOAT[1024]) AS distance
34
  FROM embeddings
35
  JOIN filtered_podcasts fp ON embeddings.id = fp.id
36
+ ORDER BY distance
37
+ LIMIT 10
38
  )
39
  SELECT
40
  p.title,
 
43
  e.text,
44
  e.part,
45
  p.audio,
46
+ oe.distance,
47
  FROM
48
  ordered_embeddings oe
49
  JOIN
 
53
  JOIN
54
  podcasts p
55
  ON
56
+ oe.id = p.id
57
+ ORDER BY oe.distance;
58
  """
59
 
60
  st.title("terapyon cannel search")
 
77
  result = conn.execute(query,
78
  (selected_ids, word_embedding,)).df()
79
  selected = st.dataframe(result,
80
+ column_order=["title", "date", "part", "start", "distance", "text", "audio"],
81
  on_select="rerun",
82
  selection_mode="single-row")
83
  if selected:
src/config.py CHANGED
@@ -4,8 +4,14 @@ from pathlib import Path
4
  # import logging
5
 
6
 
 
 
 
 
 
7
  HERE = Path(__file__).resolve().parent
8
  DUCKDB_FILE = HERE.parent / "db" / "terapyon-podcast.duckdb"
 
9
  STORE_DIR = HERE.parent / "store"
10
  DATA_DIR = HERE.parent / "data"
11
  PODCAST_TITLE_LIST = str(STORE_DIR / 'title-list-202301-202501.parquet')
 
4
  # import logging
5
 
6
 
7
+ HF_HOST = False
8
+ HF_REPO_TYPE = "dataset"
9
+ HF_REPO_ID = "terapyon/terapyon-podcast"
10
+ HF_FILENAME = "terapyon-podcast-20250104.duckdb"
11
+
12
  HERE = Path(__file__).resolve().parent
13
  DUCKDB_FILE = HERE.parent / "db" / "terapyon-podcast.duckdb"
14
+
15
  STORE_DIR = HERE.parent / "store"
16
  DATA_DIR = HERE.parent / "data"
17
  PODCAST_TITLE_LIST = str(STORE_DIR / 'title-list-202301-202501.parquet')