alonsosilva committed on
Commit
4b6ea6b
·
1 Parent(s): 6fc78fc
Files changed (3) hide show
  1. Dockerfile +23 -0
  2. app.py +99 -0
  3. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the Solara "Recipe finder" app.
FROM python:3.11

# Set up a new user named "user" with user ID 1000 for permission
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user
# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Work from a directory inside the user's home instead of the filesystem
# root, so the relative COPY destinations below and the app's working
# directory live under the non-root user's home.
WORKDIR $HOME/app

# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

# Copy the requirements first so the dependency layer is cached
# independently of application code changes.
COPY --chown=user requirements.txt requirements.txt

# Install requirements
RUN pip install --no-cache-dir --upgrade -r requirements.txt

COPY --chown=user app.py app.py

COPY --chown=user data/ data/

# Serve the app on all interfaces, port 7860.
ENTRYPOINT ["solara", "run", "app.py", "--host=0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sqlite3

import polars as pl
import solara
import sqlite_vec
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Recipe titles from the Hugging Face dataset. A title's position in
# ``recipe_names`` is what the rest of this file uses as its rowid in the
# vector tables.
dataset = load_dataset("m3hrdadfi/recipe_nlg_lite")
recipe_names = dataset["train"]["name"]

# Embed every recipe title once, at import time, so the page components only
# have to embed the user's query.
tfm_base = SentenceTransformer("all-MiniLM-L6-v2")
X_tfm = tfm_base.encode(recipe_names)
n_feats = X_tfm.shape[1]  # embedding width, used to declare the vec0 columns
17
+
18
@solara.component
def Display_Full(query, db, limit):
    """Render the nearest recipes according to the full-precision table.

    Args:
        query: Query embedding; it is serialized with
            ``sqlite_vec.serialize_float32`` before being sent to SQLite.
        db: SQLite connection that has sqlite-vec loaded and a populated
            ``vec_sents`` table. Rows must support lookup by column name
            (the caller sets ``row_factory = sqlite3.Row``).
        limit: Number of nearest neighbours to request.
    """
    # The neighbour count is bound through sqlite-vec's ``k = ?`` constraint
    # rather than being formatted into the SQL text.
    rows = db.execute(
        """
        SELECT
            rowid,
            distance
        FROM vec_sents
        WHERE embedding MATCH ?
            AND k = ?
        ORDER BY distance
        """,
        [sqlite_vec.serialize_float32(query), int(limit)],
    ).fetchall()

    # Each rowid is used as an index into ``recipe_names`` to recover the title.
    df1 = pl.DataFrame(
        {"results": [recipe_names[row["rowid"]] for row in rows]}
    )
    with solara.Column():
        solara.Markdown("## Full precision")
        solara.DataFrame(df1, items_per_page=10)
37
+
38
@solara.component
def Display_Binary(query, db, limit):
    """Render the nearest recipes according to the binary-quantized table.

    Args:
        query: Query embedding; it is serialized with
            ``sqlite_vec.serialize_float32`` and quantized inside SQLite by
            ``vec_quantize_binary`` before matching.
        db: SQLite connection that has sqlite-vec loaded and a populated
            ``bin_vec_sents`` table. Rows must support lookup by column name
            (the caller sets ``row_factory = sqlite3.Row``).
        limit: Number of nearest neighbours to request.
    """
    # The neighbour count is bound through sqlite-vec's ``k = ?`` constraint
    # rather than being formatted into the SQL text.
    rows = db.execute(
        """
        SELECT
            rowid,
            distance
        FROM bin_vec_sents
        WHERE embedding MATCH vec_quantize_binary(?)
            AND k = ?
        ORDER BY distance
        """,
        [sqlite_vec.serialize_float32(query), int(limit)],
    ).fetchall()

    # Each rowid is used as an index into ``recipe_names`` to recover the title.
    df2 = pl.DataFrame(
        {"results": [recipe_names[row["rowid"]] for row in rows]}
    )
    with solara.Column():
        solara.Markdown("## Binary quantization")
        solara.DataFrame(df2, items_per_page=10)
57
+
58
+
59
def _build_index():
    """Create the in-memory SQLite database holding both recipe vector tables.

    Returns:
        A ``sqlite3.Connection`` with sqlite-vec loaded, ``sqlite3.Row`` as
        row factory, and two populated vec0 tables: ``vec_sents`` (float
        embeddings) and ``bin_vec_sents`` (binary-quantized embeddings).
        Rowids are positions in ``X_tfm``.
    """
    # check_same_thread=False: the connection is kept across renders, and a
    # later render is not guaranteed to run on the thread that created it.
    # It is only read from after this function returns.
    db = sqlite3.connect(":memory:", check_same_thread=False)
    db.enable_load_extension(True)
    sqlite_vec.load(db)
    # Extension loading is switched off again once sqlite-vec is in.
    db.enable_load_extension(False)
    db.row_factory = sqlite3.Row

    # Serialize each embedding once and reuse the blobs for both tables.
    rows = [
        (rowid, sqlite_vec.serialize_float32(vector))
        for rowid, vector in enumerate(X_tfm)
    ]

    db.execute(f"create virtual table vec_sents using vec0(embedding float[{n_feats}])")
    with db:
        db.executemany(
            "INSERT INTO vec_sents(rowid, embedding) VALUES (?, ?)",
            rows,
        )

    db.execute(f"create virtual table bin_vec_sents using vec0(embedding bit[{n_feats}])")
    with db:
        db.executemany(
            "INSERT INTO bin_vec_sents(rowid, embedding) VALUES (?, vec_quantize_binary(?))",
            rows,
        )

    return db


@solara.component
def Page():
    """Top-level page: a query box plus side-by-side search results.

    The text in the input box is embedded with ``tfm_base`` on every render
    and handed to ``Display_Full`` and ``Display_Binary`` together with the
    vector database and a fixed result limit of 10.
    """
    q = solara.use_reactive("I would like to have some vegetable soup")
    # Build the vector tables once per component instance instead of on every
    # render; the input uses continuous_update, so renders happen per keystroke.
    db = solara.use_memo(_build_index, dependencies=[])
    limit = 10

    with solara.Column(margin=10):
        with solara.Head():
            solara.Title("Recipe finder")
        solara.Markdown("# Recipe finder")
        solara.Markdown("I built this tool to help me get a feeling of binary embedding quantization in [sqlite-vec](https://alexgarcia.xyz/sqlite-vec/). For any given text, it gives the top 10 results. The dataset I'm using is [m3hrdadfi/recipe_nlg_lite](https://hf.co/datasets/m3hrdadfi/recipe_nlg_lite) which consists of 6,119 recipes. Inspired by [Exploring SQLite-vec](https://www.youtube.com/watch?v=wYU66AjRIAc) by [@fishnets88](https://twitter.com/fishnets88)")
        solara.InputText("Enter a query", value=q, continuous_update=True)
        # encode() takes a batch; take the single resulting vector.
        query = tfm_base.encode([q.value])[0]

        with solara.Row():
            Display_Full(query, db, limit)
            Display_Binary(query, db, limit)
99
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ solara==1.40.0
2
+ sqlite-vec==0.1.3
3
+ datasets==3.0.1
4
+ sentence-transformers==3.2.0
5
+ polars==1.9.0