added duckdb-nsql benchmark setup
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- .gitignore +4 -0
- duckdb-nsql/LICENSE +201 -0
- duckdb-nsql/README.md +63 -0
- duckdb-nsql/eval/README.md +90 -0
- duckdb-nsql/eval/constants.py +37 -0
- duckdb-nsql/eval/data/databases/flightinfo/flightinfo.duckdb +0 -0
- duckdb-nsql/eval/data/databases/hn/hn.duckdb +0 -0
- duckdb-nsql/eval/data/databases/json/json.duckdb +0 -0
- duckdb-nsql/eval/data/databases/laptop/laptop.duckdb +0 -0
- duckdb-nsql/eval/data/databases/laptop_array/laptop_array.duckdb +0 -0
- duckdb-nsql/eval/data/databases/laptop_json/laptop_json.duckdb +0 -0
- duckdb-nsql/eval/data/databases/laptop_struct/laptop_struct.duckdb +0 -0
- duckdb-nsql/eval/data/databases/none/none.duckdb +0 -0
- duckdb-nsql/eval/data/databases/nyc/nyc.duckdb +0 -0
- duckdb-nsql/eval/data/databases/product/product.duckdb +0 -0
- duckdb-nsql/eval/data/databases/transactions/transactions.duckdb +0 -0
- duckdb-nsql/eval/data/databases/who/who.duckdb +0 -0
- duckdb-nsql/eval/data/dev.json +602 -0
- duckdb-nsql/eval/data/tables.json +3465 -0
- duckdb-nsql/eval/data_utils.py +79 -0
- duckdb-nsql/eval/doc_retriever.py +92 -0
- duckdb-nsql/eval/evaluate.py +392 -0
- duckdb-nsql/eval/get_manifest.py +29 -0
- duckdb-nsql/eval/loaders.py +78 -0
- duckdb-nsql/eval/metric_utils.py +40 -0
- duckdb-nsql/eval/metrics/test_suite_sql_eval +1 -0
- duckdb-nsql/eval/predict.py +345 -0
- duckdb-nsql/eval/prompt_formatters.py +1056 -0
- duckdb-nsql/eval/schema.py +115 -0
- duckdb-nsql/eval/text_to_sql.py +260 -0
- duckdb-nsql/manifest/.flake8 +11 -0
- duckdb-nsql/manifest/.pre-commit-config.yaml +23 -0
- duckdb-nsql/manifest/CHANGELOG.rst +93 -0
- duckdb-nsql/manifest/LICENSE +201 -0
- duckdb-nsql/manifest/Makefile +27 -0
- duckdb-nsql/manifest/README.md +304 -0
- duckdb-nsql/manifest/examples/langchain_chatgpt.ipynb +455 -0
- duckdb-nsql/manifest/examples/manifest_async.py +27 -0
- duckdb-nsql/manifest/examples/manifest_azure.ipynb +149 -0
- duckdb-nsql/manifest/examples/manifest_chatgpt.ipynb +101 -0
- duckdb-nsql/manifest/examples/manifest_connection_pool.ipynb +208 -0
- duckdb-nsql/manifest/examples/manifest_diffusers.ipynb +0 -0
- duckdb-nsql/manifest/examples/manifest_embedding.ipynb +156 -0
- duckdb-nsql/manifest/examples/manifest_google.ipynb +117 -0
- duckdb-nsql/manifest/examples/manifest_openrouter.ipynb +108 -0
- duckdb-nsql/manifest/examples/manifest_streaming.ipynb +105 -0
- duckdb-nsql/manifest/examples/manifest_together.ipynb +106 -0
- duckdb-nsql/manifest/manifest/__init__.py +6 -0
- duckdb-nsql/manifest/manifest/api/__init__.py +1 -0
- duckdb-nsql/manifest/manifest/api/app.py +301 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
+.DS_Store
+__pycache__/
+*.egg-info
+.ipynb_checkpoints/
duckdb-nsql/LICENSE
ADDED
@@ -0,0 +1,201 @@
(standard Apache License, Version 2.0, January 2004; full 201-line license text)
duckdb-nsql/README.md
ADDED
@@ -0,0 +1,63 @@
+# DuckDB-NSQL
+Numbers Station text-to-SQL model for DuckDB.
+
+NSQL is a family of autoregressive open-source foundation models (FMs) designed specifically for SQL generation tasks. In this repository we introduce DuckDB-NSQL, an FM tailored for local DuckDB SQL analytics tasks. All model weights can be found on HuggingFace.
+
+| Model Name | Size | Link |
+| --------------------------------------| ---- | -------------------------------------------------------------- |
+| motherduckdb/DuckDB-NSQL-7B-v0.1 | 7B | [link](https://huggingface.co/motherduckdb/DuckDB-NSQL-7B-v0.1) |
+| motherduckdb/DuckDB-NSQL-7B-v0.1-GGUF | 7B | [link](https://huggingface.co/motherduckdb/DuckDB-NSQL-7B-v0.1-GGUF)|
+
+## Setup
+To install all the necessary dependencies, please run
+```
+pip install -r requirements.txt
+```
+
+## Usage
+Please refer to the examples in the `examples/` folder to learn how to connect to a local DuckDB database and query your data directly; a simple notebook is provided there for reference.
+
+To host the model with llama.cpp, execute the following:
+
+```python
+# Import necessary modules
+from llama_cpp import Llama
+from wurlitzer import pipes
+
+# Set up client with model path and context size
+with pipes() as (out, err):
+    client = Llama(
+        model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
+        n_ctx=2048,
+    )
+```
+
+To load the DuckDB database and query against it, execute the following:
+
+```python
+# Import necessary modules
+import duckdb
+from utils import generate_sql
+
+# Connect to DuckDB database
+con = duckdb.connect("nyc.duckdb")
+
+# Sample question for SQL generation
+question = "alter taxi table and add struct column with name test and keys a:int, b:double"
+
+# Generate SQL, check validity, and print
+sql = generate_sql(question, con, client)
+print(sql)
+```
+
+## Training Data
+
+The training data for this model consists of two parts: 1) 200k synthetically generated DuckDB SQL queries, based on the DuckDB v0.9.2 documentation, and 2) labeled text-to-SQL pairs from [NSText2SQL](https://huggingface.co/datasets/NumbersStation/NSText2SQL) transpiled to DuckDB SQL using [sqlglot](https://github.com/tobymao/sqlglot).
+
+## Evaluating the Benchmark
+
+Please refer to the `eval/` folder for details on evaluating the model against our proposed DuckDB benchmark.
+
+## Acknowledgement
+
+We would like to express our appreciation to all authors of the evaluation scripts. Their work made this project possible.
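The usage snippet above imports `generate_sql` from a local `utils` module whose contents are not part of this diff view. As a rough, hypothetical sketch of what such a helper could look like, assuming schema extraction via `duckdb_tables()` and a plain schema-plus-comment prompt layout (neither is confirmed by the repository's actual implementation):

```python
# Hypothetical sketch of a generate_sql helper. The real utils module is not
# shown in this diff; the prompt template and stop handling are assumptions.
import duckdb
from llama_cpp import Llama


def generate_sql(question: str, con: duckdb.DuckDBPyConnection, client: Llama) -> str:
    # Collect CREATE TABLE statements so the model sees the database schema.
    rows = con.execute(
        "SELECT sql FROM duckdb_tables() WHERE sql IS NOT NULL"
    ).fetchall()
    schema = "\n".join(r[0] for r in rows)
    prompt = f"{schema}\n\n-- {question}\n"
    out = client(prompt, max_tokens=256, temperature=0.0, stop=[";"])
    return out["choices"][0]["text"].strip() + ";"
```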
duckdb-nsql/eval/README.md
ADDED
@@ -0,0 +1,90 @@
+This folder contains the suite for evaluating the DuckDB-Text2SQL model.
+
+Please install the dependencies listed in the requirements.txt file located in the parent folder.
+
+## Setup
+To evaluate against the benchmark dataset, you first need to set up the test-suite evaluation script:
+
+```
+mkdir metrics
+cd metrics
+git clone [email protected]:ElementAI/test-suite-sql-eval.git test_suite_sql_eval
+cd ..
+```
+
+To evaluate against DuckDB, add a new remote inside the test_suite_sql_eval folder and check out the duckdb-only branch (640a12975abf75a94e917caca149d56dbc6bcdd7):
+
+```
+git remote add till https://github.com/tdoehmen/test-suite-sql-eval.git
+git fetch till
+git checkout till/duckdb-only
+```
+
+Next, prepare the docs for retrieval.
+```
+mkdir docs
+cd docs
+git clone https://github.com/duckdb/duckdb-web.git
+cd ..
+```
+
+#### Dataset
+The benchmark dataset is located in the `data/` folder and includes all databases (`data/databases`), table schemas (`data/tables.json`), and examples (`data/dev.json`).
+
+#### Eval
+Start a manifest session with the model you want to evaluate.
+
+```bash
+python -m manifest.api.app \
+    --model_type huggingface \
+    --model_generation_type text-generation \
+    --model_name_or_path motherduckdb/DuckDB-NSQL-7B-v0.1 \
+    --fp16 \
+    --device 0
+```
+
+Then, from the `DuckDB-NSQL` main folder, run:
+
+```bash
+python eval/predict.py \
+    predict \
+    eval/data/dev.json \
+    eval/data/tables.json \
+    --output-dir output/ \
+    --stop-tokens ';' \
+    --stop-tokens '--' \
+    --stop-tokens '```' \
+    --stop-tokens '###' \
+    --overwrite-manifest \
+    --manifest-client huggingface \
+    --manifest-connection http://localhost:5000 \
+    --prompt-format duckdbinst
+```
+This will format the prompt using the duckdbinst style.
+
+To evaluate the predictions, first run the following in a Python shell to install and load the httpfs extension:
+
+```python
+try:
+    import duckdb
+
+    con = duckdb.connect()
+    con.install_extension("httpfs")
+    con.load_extension("httpfs")
+except Exception as e:
+    print(f"Error loading duckdb extensions: {e}")
+```
+
+Then, run the evaluation script:
+
+```bash
+python eval/evaluate.py \
+    evaluate \
+    --gold eval/data/dev.json \
+    --db eval/data/databases/ \
+    --tables eval/data/tables.json \
+    --output-dir output/ \
+    --pred [PREDICTION_FILE]
+```
+
+All output information is located in the prediction file in the [output-dir]; there, `query` holds the gold SQL and `pred` the predicted SQL.
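For the last step, a small sketch for eyeballing gold vs. predicted SQL. It assumes the prediction file is newline-delimited JSON with `query` and `pred` keys, which matches the README's description but is otherwise an assumption; the file path is hypothetical:

```python
# Print gold vs. predicted SQL side by side. Assumes newline-delimited JSON
# records with "query" (gold) and "pred" keys; the path is hypothetical.
import json

with open("output/predictions.json") as f:
    for line in f:
        record = json.loads(line)
        print("GOLD:", record["query"])
        print("PRED:", record["pred"])
        print("-" * 60)
```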
duckdb-nsql/eval/constants.py
ADDED
@@ -0,0 +1,37 @@
+"""Constants."""
+
+from prompt_formatters import (
+    DuckDBFormatter,
+    MotherDuckFormatter,
+    DuckDBInstFormatter,
+    DuckDBInstNoShorthandFormatter,
+    RajkumarFormatter,
+    DuckDBChat,
+    DuckDBInstFormatterLlamaShort,
+    DuckDBInstFormatterGraniteShort,
+    DuckDBInstFormatterLlama,
+    DuckDBInstFormatterLlamaBasic,
+    DuckDBInstFormatterGranite,
+    DuckDBInstFormatterPhi,
+    DuckDBInstFormatterGPTmini,
+    DuckDBInstFormatterPhiAzure,
+    DuckDBInstFormatterLlamaSyntax,
+)
+
+# Registry mapping --prompt-format names to formatter classes.
+PROMPT_FORMATTERS = {
+    "rajkumar": RajkumarFormatter,
+    "duckdb": DuckDBFormatter,
+    "motherduck": MotherDuckFormatter,
+    "duckdbinst": DuckDBInstFormatter,
+    "duckdbinstllamashort": DuckDBInstFormatterLlamaShort,
+    "duckdbinstgraniteshort": DuckDBInstFormatterGraniteShort,
+    "duckdbinstllama": DuckDBInstFormatterLlama,
+    "duckdbinstgranite": DuckDBInstFormatterGranite,
+    "duckdbinstnoshort": DuckDBInstNoShorthandFormatter,
+    "duckdbchat": DuckDBChat,
+    "duckdbinstphi": DuckDBInstFormatterPhi,
+    "duckdbinstgptmini": DuckDBInstFormatterGPTmini,
+    "duckdbinstphiazure": DuckDBInstFormatterPhiAzure,
+    "duckdbinstllamabasic": DuckDBInstFormatterLlamaBasic,
+    "duckdbinstllamasyntax": DuckDBInstFormatterLlamaSyntax,
+}
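The dictionary above is the lookup table behind the `--prompt-format` flag in `predict.py`. A minimal sketch of the lookup pattern follows; the no-argument constructor is an assumption, since `prompt_formatters.py` is not shown in this view:

```python
# Resolve a prompt-format name to a formatter instance; the constructor
# arguments are an assumption, as the formatter API is not shown in this diff.
from constants import PROMPT_FORMATTERS


def get_formatter(name: str):
    try:
        return PROMPT_FORMATTERS[name]()
    except KeyError:
        raise ValueError(
            f"Unknown prompt format {name!r}; choose from {sorted(PROMPT_FORMATTERS)}"
        )
```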
duckdb-nsql/eval/data/databases/flightinfo/flightinfo.duckdb
ADDED
Binary file (799 kB)

duckdb-nsql/eval/data/databases/hn/hn.duckdb
ADDED
Binary file (799 kB)

duckdb-nsql/eval/data/databases/json/json.duckdb
ADDED
Binary file (537 kB)

duckdb-nsql/eval/data/databases/laptop/laptop.duckdb
ADDED
Binary file (537 kB)

duckdb-nsql/eval/data/databases/laptop_array/laptop_array.duckdb
ADDED
Binary file (799 kB)

duckdb-nsql/eval/data/databases/laptop_json/laptop_json.duckdb
ADDED
Binary file (799 kB)

duckdb-nsql/eval/data/databases/laptop_struct/laptop_struct.duckdb
ADDED
Binary file (799 kB)

duckdb-nsql/eval/data/databases/none/none.duckdb
ADDED
Binary file (12.3 kB)

duckdb-nsql/eval/data/databases/nyc/nyc.duckdb
ADDED
Binary file (799 kB)

duckdb-nsql/eval/data/databases/product/product.duckdb
ADDED
Binary file (274 kB)

duckdb-nsql/eval/data/databases/transactions/transactions.duckdb
ADDED
Binary file (537 kB)

duckdb-nsql/eval/data/databases/who/who.duckdb
ADDED
Binary file (537 kB)
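To poke at any of these benchmark databases locally, the `duckdb` Python package is enough; opening read-only avoids mutating the fixtures. Paths assume the repository root as the working directory:

```python
# Open one of the benchmark databases read-only and list its tables.
import duckdb

con = duckdb.connect(
    "duckdb-nsql/eval/data/databases/laptop/laptop.duckdb", read_only=True
)
print(con.execute("SHOW TABLES").fetchall())
print(con.execute("DESCRIBE customers").fetchall())
```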
duckdb-nsql/eval/data/dev.json
ADDED
@@ -0,0 +1,602 @@
+[
+    {
+        "db_id": "hn",
+        "query": "SELECT COUNT(*) as domain_count, \nSUBSTRING(SPLIT_PART(url, '//', 2), 1, POSITION('/' IN SPLIT_PART(url, '//', 2)) - 1) as domain \nFROM hacker_news\nWHERE url IS NOT NULL GROUP BY domain ORDER BY domain_count DESC LIMIT 10;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "what are the top domains being shared on hacker_news?",
+        "category": "hard"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT c.firstname, c.lastname, COUNT(*) AS num_pcs_bought\nFROM customers c\nJOIN sales s ON c.customer_id = s.customer_id\nJOIN pcs p ON s.model = p.model\nGROUP BY c.customer_id, c.firstname, c.lastname\nORDER BY num_pcs_bought DESC\nLIMIT 1;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Who bought the most PCs, print also the users name?",
+        "category": "medium"
+    },
+    {
+        "db_id": "transactions",
+        "query": "select users.id, users.name, sum(transactions.amount) as balance from users join transactions on users.id = transactions.user_id group by users.id, users.name having balance = 0",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "list the names off account holders who have negative balances",
+        "category": "easy"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT model FROM products WHERE maker = 'B';",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "List only the model number of all products made by maker B.",
+        "category": "easy"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT model FROM products WHERE maker <> 'B';",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "List the model numbers of all products not made by maker B.",
+        "category": "easy"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT AVG(speed) FROM pcs WHERE speed >= 3.00",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Return the average speed all PCs with speed >= 3.00",
+        "category": "easy"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT MAX(price) FROM printers WHERE color = 'TRUE' AND type='laser'",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Return the price of the most expensive color laser printer",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT MIN(paid) FROM sales WHERE type_of_payment LIKE '%visa%'",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Return the minimum amount paid by customers who used a visa card (debit or credit) to purchase a product",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT customer_id FROM customers WHERE firstname LIKE '%e%' OR lastname LIKE '%e%'",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Find the customer_id of customers who have the letter 'e' either in their first name or in their last name",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT model, price/0.85 AS 'price (USD)' FROM laptops WHERE ram >= 1024",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Assume all prices in the table Laptops are in Euro. List the prices of laptops with at least 1024 ram. You should return the price in USD in a column called 'price (USD)'. Assume that 1 USD = 0.85 EURO. Name the price column 'price (USD)'.",
+        "category": "hard"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT maker FROM products GROUP BY maker HAVING COUNT(maker) > 4;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Return a list of makers that make more than four different products.",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT model FROM laptops WHERE speed > 1.7 ORDER BY speed DESC;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "List all the laptop model numbers that have a speed greater than 1.7 in descending order of speed.",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT firstname \n FROM sales \n JOIN customers ON sales.customer_id = customers.customer_id \n GROUP BY firstname \n ORDER BY COUNT(firstname);",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "List firstnames of customers in an ascending order based on the number of purchases made by customers with this firstname.",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT DISTINCT maker FROM products JOIN pcs ON products.model = pcs.model WHERE ram > 1500;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "List all the makers (with only one entry per maker) who make PCs with RAM greater than 1500.",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT city, AVG(paid) as 'avg_spend' FROM sales JOIN customers ON sales.customer_id = customers.customer_id GROUP BY city",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Find the city and the average amount of money spent by customers in each city. Name the column for the amount 'avg_spend'",
+        "category": "medium"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT color, MAX(price) as 'max_price' FROM printers GROUP BY color;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Find the maximum price for each color of printer. Name the column for the maximum price 'max_price'",
+        "category": "medium"
+    },
+    {
+        "db_id": "who",
+        "query": "select country_name, max(pm25_concentration) as worst_pm25_for_country\nfrom ambient_air_quality\ngroup by country_name\norder by worst_pm25_for_country desc\nlimit 1",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Find the country with the worst single reading of air quality (highest PM 2.5 value). Show the PM 2.5 value as well.",
+        "category": "medium"
+    },
+    {
+        "db_id": "who",
+        "query": "select country_name, avg(pm25_concentration) as worst_avg_pm25_for_country\nfrom ambient_air_quality\ngroup by country_name\norder by worst_avg_pm25_for_country desc\nlimit 1",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Find the country with the worst average air quality (highest PM 2.5 value). Show the PM 2.5 value as well.",
+        "category": "medium"
+    },
+    {
+        "db_id": "who",
+        "query": "select distinct country_name from ambient_air_quality order by country_name",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Find all countries for which WHO air quality data is available. Sort alphabetically.",
+        "category": "medium"
+    },
+    {
+        "db_id": "who",
+        "query": "select year, avg(pm25_concentration) from ambient_air_quality \nwhere country_name = 'Singapore'\ngroup by year\norder by year",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Find Singapore air quality defined as PM2.5 concentration over time",
+        "category": "medium"
+    },
+    {
+        "db_id": "nyc",
+        "query": "SELECT COLUMNS('^trip_') FROM rideshare LIMIT 10;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "select only the column names from the rideshare table that start with trip_ and return the first 10 values",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "nyc",
+        "query": "SELECT * FROM rideshare USING SAMPLE 1%;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "select a 1% sample from the nyc.rideshare table",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * EXCLUDE (customer_id) FROM customers;\n",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "select all columns from the customer table, except customer_id",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "nyc",
+        "query": "SUMMARIZE rideshare;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "show summary statistics of the rideshare table",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "none",
+        "query": "SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "read a CSV from https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "none",
+        "query": "COPY (SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'))\nTO 'titanic.parquet' (FORMAT 'parquet');",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM 'titanic.parquet'",
+        "question": "read a CSV from https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv and convert it to a parquet file called \"titanic\"",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "none",
+        "query": "CREATE TABLE titanic AS (SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'))",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM titanic;",
+        "question": "create a table called \"titanic\" from CSV file https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "none",
+        "query": "PRAGMA default_null_order='NULLS LAST';",
+        "setup_sql": ";",
+        "validation_sql": "SELECT current_setting('default_null_order');",
+        "question": "configure duckdb to put null values last when sorting",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "none",
+        "query": "CREATE TABLE IF NOT EXISTS products (\n maker varchar(10),\n model varchar(10),\n type varchar(10));",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM products;",
+        "question": "create a table about products, that contains a maker, model and type column",
+        "category": "ddl"
+    },
+    {
+        "db_id": "product",
+        "query": "INSERT INTO products (maker, model, type)\nVALUES\n ('A', '1001', 'pc');",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM products;",
+        "question": "add a row with values for model \"1001\" of type \"pc\", from maker \"A\" to products table",
+        "category": "ddl"
+    },
+    {
+        "db_id": "none",
+        "query": "CALL pragma_version();\n",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get current version of duckdb",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "nyc",
+        "query": "PRAGMA table_info('rideshare');",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "list all columns in table nyc.rideshare",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "nyc",
+        "query": "PRAGMA show_tables;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "show all tables in the curent database",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT customer_id, model, sum(paid) FROM sales GROUP BY ALL",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "how much did each customer spend per model type?",
+        "category": "easy"
+    },
+    {
+        "db_id": "nyc",
+        "query": "SELECT Max(datediff('minute', tpep_pickup_datetime, tpep_dropoff_datetime)) from nyc.taxi",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "What was the longest taxi ride in minutes?",
+        "category": "hard"
+    },
+    {
+        "db_id": "who",
+        "query": "with per_region as (\n select avg(pm10_concentration) as avg_pm10, who_region from ambient_air_quality group by who_region\n), max_region as (\n select who_region from per_region where avg_pm10 = (select max(avg_pm10) from per_region)\n), min_city_value_in_max_region as (\n select min(pm10_concentration) from ambient_air_quality where who_region in (from max_region)\n), min_city_in_max_region as (\n select city from ambient_air_quality where pm10_concentration in (from min_city_value_in_max_region) and who_region in (from max_region)\n)\nfrom min_city_in_max_region",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "What is the city with the lowest pm10 concentration in the region with the highest average pm10 concentration?",
+        "category": "hard"
+    },
+    {
+        "db_id": "hn",
+        "query": "SELECT *, regexp_extract(text, '([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,63})',0) email from hacker_news where email[:4]='test'",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Get all posts on hn that contain an email address starting with test. Return all original columns, plus a new column containing the email address.",
+        "category": "hard"
+    },
+    {
+        "db_id": "json",
+        "query": "SELECT employee.id, employee.first_name FROM employee_json",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Extract id and first_name properties as individual columns from the employee struct",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "who",
+        "query": "SELECT who_region[1]::INT as region, * EXCLUDE (who_region) FROM who.ambient_air_quality",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "count quality measurements per region. Make sure to return the region code (first char of who_region) as integer and sort by region.",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "flightinfo",
+        "query": "SELECT seat.seat_number FROM seat \nJOIN direct_flight ON direct_flight.flight_number = seat.flight_number \nJOIN airport AS departure_airport ON departure_airport.iata_code = direct_flight.departure_airport_iata_code \nJOIN airport AS arriving_airport ON arriving_airport.iata_code = direct_flight.arriving_airport_iata_code \nJOIN city AS departure_city ON departure_city.city_zipcode = departure_airport.city_zip_code \nJOIN city AS arriving_city ON arriving_city.city_zipcode = arriving_airport.city_zip_code \nWHERE departure_city.city_name = 'Bruxelles' AND arriving_city.city_name = 'Newark';",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "Which seats were available on the flight from Bruxelles to Newark?",
+        "category": "hard"
+    },
+    {
+        "db_id": "laptop",
+        "query": "COPY customers FROM 'customers_12_12_2023.csv';",
+        "setup_sql": "COPY customers TO 'customers_12_12_2023.csv';",
+        "validation_sql": "SELECT * FROM customers;",
+        "question": "copy content of csv file customers_12_12_2023.csv into customers table",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "COPY customers FROM 'customers_12_12_2023.csv' (DELIMITER '\\t');",
+        "setup_sql": "COPY customers TO 'customers_12_12_2023.csv' (FORMAT CSV, DELIMITER '\\t');",
+        "validation_sql": "SELECT * FROM customers;",
+        "question": "copy content of csv file costomers_12_12_2023.csv into customers table with tab separator",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "COPY customers FROM 'customers_partitioned/city=Amsterdam/*.parquet';",
+        "setup_sql": "COPY customers TO 'customers_partitioned' (FORMAT PARQUET, PARTITION_BY (city), OVERWRITE_OR_IGNORE True);",
+        "validation_sql": "SELECT * FROM customers;;",
+        "question": "copy any parquet files from 'customers_partitioned/city=Amsterdam/' into customers table",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "COPY customers(customer_id) FROM 'customers_customer_ids.csv';",
+        "setup_sql": "COPY customers(customer_id) TO 'customers_customer_ids.csv';",
+        "validation_sql": "SELECT * FROM customers;",
+        "question": "copy only the customer_id column from the customers_customer_ids.csv into the customers tables",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "CREATE TABLE test_tbl AS SELECT * FROM read_json_auto('test.json');",
+        "setup_sql": "COPY customers TO 'test.json'\n",
+        "validation_sql": "SELECT * FROM test_tbl;",
+        "question": "read json file from test.json and create new table from it called 'test_tbl'",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * FROM read_csv_auto('test.csv');",
+        "setup_sql": "COPY customers TO 'test.csv';",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "read csv from test.csv",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * FROM read_csv_auto('test.csv', columns={'customer_id': 'VARCHAR', 'firstname': 'VARCHAR', 'lastname': 'VARCHAR'});",
+        "setup_sql": "COPY customers(customer_id, firstname, lastname) TO 'test.csv';",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "read csv from test.csv with predefined column and types - customer_id: string, firstname: string, lastname: string",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * EXCLUDE (ram, hd) FROM pcs;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "select all columns from pcs table except for ram and hd",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT COLUMNS('name$') FROM customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "select all columns ending with 'name' from customers table",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT LENGTH(COLUMNS('name$')) FROM customers",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "for each column ending with 'name' in the customers table, compute the string length",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * REPLACE (upper(city) AS city) FROM customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get all columns from customer table, and make all city names uppercase",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "EXPLAIN SELECT * FROM customers",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "show query plan for query: SELECT * from customers",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT ascii(lastname) FROM customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get the first character of the firstname column and cast it to an INT",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT model, speed::INTEGER FROM laptops;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get laptop name and speed, return the speed as integer",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop_array",
+        "query": "SELECT phone_numbers[1] FROM customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get the first phone number of each customer",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop_array",
+        "query": "INSERT INTO customers(customer_id, phone_numbers) VALUES (5, ['12312323', '23123344']);",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM customers;",
+        "question": "insert two phone numbers to customer with id 5 [\\\"12312323\\\", and '23123344']",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "ALTER TABLE customers ADD COLUMN phone_numbers VARCHAR[];",
+        "setup_sql": ";",
+        "validation_sql": "DESCRIBE customers;",
+        "question": "how to add a new column phone_numbers to the customers table, with array type varchar",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT firstname[1] FROM customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get the first letter of the customers firstname",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop_array",
+        "query": "SELECT phone_numbers[:2] FROM customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get the first two phone numbers from the phone numbers array of each customer",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT {'a':1, 'b':2, 'c':3};",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "create a struct with keys a, b, c and values 1,2,3",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT [1,2,3];\n",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "create array with values 1,2,3",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "CREATE TABLE test (embeddings FLOAT[100]);",
+        "setup_sql": ";",
+        "validation_sql": "DESCRIBE test;",
+        "question": "create table test with a fix-sized array column with 100 dimenions, called embeddings",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "CREATE TABLE test (person STRUCT(name VARCHAR, id INTEGER));",
+        "setup_sql": ";",
+        "validation_sql": "DESCRIBE test;",
+        "question": "create table test with a struct column called person with properties name and id",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop_struct",
+        "query": "SELECT person.name, person.id FROM test;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get persons name and persons id from the test table.",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "UPDATE customers SET email = NULL;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT email FROM customers;",
+        "question": "remove all values from email column in customers table",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop_json",
+        "query": "ALTER TABLE customers ALTER COLUMN email SET DATA TYPE VARCHAR;",
+        "setup_sql": ";",
+        "validation_sql": "DESCRIBE customers;",
+        "question": "make customer email of type VARCHAR",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop_json",
+        "query": "INSERT INTO customers (customer_id, email) VALUES (5,'{\"from\": \"[email protected]\", \"to\": \"[email protected]\"}');",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM customers;",
+        "question": "insert json into customer email for customer id 5: {'from': '[email protected]', 'to': '[email protected]'}",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop_json",
+        "query": "SELECT customers.email->>'from' FROM customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "get 'from' field from customer email json",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SUMMARIZE customers;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "summarize the customer table",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * FROM customers USING SAMPLE 10% (reservoir);",
+        "setup_sql": ";",
+        "validation_sql": "SELECT count(*) FROM ddb_benchmark_result;",
+        "question": "sample 10% from the customers table using reservoir sampling",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SET threads = 10;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT current_setting('threads');",
+        "question": "set number of threads to 10",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SET memory_limit = '20G';\n",
+        "setup_sql": ";",
+        "validation_sql": "SELECT current_setting('memory_limit');",
+        "question": "set memory limit to 20 gigabyte",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * EXCLUDE (price), avg(price) FROM laptops GROUP BY ALL;",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "show the average price of laptop and group by the remaining columns",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "SELECT * FROM laptops WHERE price > 1000 ORDER BY ALL;\n",
+        "setup_sql": ";",
+        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
+        "question": "show all laptops with price above 1000 and order by all columns",
+        "category": "duckdb"
+    },
+    {
+        "db_id": "laptop",
+        "query": "ATTACH 'who.ddb';",
+        "setup_sql": ";",
+        "validation_sql": "SHOW DATABASES;",
+        "question": "attach database file who.ddb",
+        "category": "duckdb"
+    }
+]
duckdb-nsql/eval/data/tables.json
ADDED
@@ -0,0 +1,3465 @@
[
    {
        "db_id": "hn",
        "column_names": [[-1, "*"], [0, "title"], [0, "url"], [0, "text"], [0, "dead"], [0, "by"], [0, "score"], [0, "time"], [0, "timestamp"], [0, "type"], [0, "id"], [0, "parent"], [0, "descendants"], [0, "ranking"], [0, "deleted"]],
        "column_names_original": [[-1, "*"], [0, "title"], [0, "url"], [0, "text"], [0, "dead"], [0, "by"], [0, "score"], [0, "time"], [0, "timestamp"], [0, "type"], [0, "id"], [0, "parent"], [0, "descendants"], [0, "ranking"], [0, "deleted"]],
        "column_types": ["text", "varchar", "varchar", "varchar", "boolean", "varchar", "bigint", "bigint", "timestamp", "varchar", "bigint", "bigint", "bigint", "bigint", "boolean"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["hacker_news"],
        "table_names_original": ["hacker_news"]
    },
    {
        "db_id": "laptop",
        "column_names": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"], [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"], [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"], [3, "model"], [3, "color"], [3, "type"], [3, "price"], [4, "maker"], [4, "model"], [4, "type"], [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
        "column_names_original": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"], [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"], [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"], [3, "model"], [3, "color"], [3, "type"], [3, "price"], [4, "maker"], [4, "model"], [4, "type"], [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
        "column_types": ["text", "char", "varchar", "varchar", "varchar", "varchar", "varchar", "char", "double", "int", "int", "double", "double", "char", "double", "int", "int", "double", "char", "varchar", "varchar", "double", "char", "char", "varchar", "char", "char", "int", "date", "double", "varchar"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["customers", "laptops", "pcs", "printers", "products", "sales"],
        "table_names_original": ["customers", "laptops", "pcs", "printers", "products", "sales"]
    },
    {
        "db_id": "transactions",
        "column_names": [[-1, "*"], [0, "id"], [0, "name"], [1, "user_id"], [1, "amount"]],
        "column_names_original": [[-1, "*"], [0, "id"], [0, "name"], [1, "user_id"], [1, "amount"]],
        "column_types": ["text", "int", "varchar", "int", "int"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["users", "transactions"],
        "table_names_original": ["users", "transactions"]
    },
    {
        "db_id": "who",
        "column_names": [[-1, "*"], [0, "who_region"], [0, "iso3"], [0, "country_name"], [0, "city"], [0, "year"], [0, "version"], [0, "pm10_concentration"], [0, "pm25_concentration"], [0, "no2_concentration"], [0, "pm10_tempcov"], [0, "pm25_tempcov"], [0, "no2_tempcov"], [0, "type_of_stations"], [0, "reference"], [0, "web_link"], [0, "population"], [0, "population_source"], [0, "latitude"], [0, "longitude"], [0, "who_ms"]],
        "column_names_original": [[-1, "*"], [0, "who_region"], [0, "iso3"], [0, "country_name"], [0, "city"], [0, "year"], [0, "version"], [0, "pm10_concentration"], [0, "pm25_concentration"], [0, "no2_concentration"], [0, "pm10_tempcov"], [0, "pm25_tempcov"], [0, "no2_tempcov"], [0, "type_of_stations"], [0, "reference"], [0, "web_link"], [0, "population"], [0, "population_source"], [0, "latitude"], [0, "longitude"], [0, "who_ms"]],
        "column_types": ["text", "varchar", "varchar", "varchar", "varchar", "bigint", "varchar", "bigint", "bigint", "bigint", "bigint", "bigint", "bigint", "varchar", "varchar", "varchar", "varchar", "varchar", "float", "float", "bigint"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["ambient_air_quality"],
        "table_names_original": ["ambient_air_quality"]
    },
    {
        "db_id": "nyc",
        "column_names": [[-1, "*"],
            [0, "unique_key"], [0, "created_date"], [0, "closed_date"], [0, "agency"], [0, "agency_name"], [0, "complaint_type"], [0, "descriptor"], [0, "location_type"], [0, "incident_zip"], [0, "incident_address"], [0, "street_name"], [0, "cross_street_1"], [0, "cross_street_2"], [0, "intersection_street_1"], [0, "intersection_street_2"], [0, "address_type"], [0, "city"], [0, "landmark"], [0, "facility_type"], [0, "status"], [0, "due_date"], [0, "resolution_description"], [0, "resolution_action_updated_date"], [0, "community_board"], [0, "bbl"], [0, "borough"], [0, "x_coordinate_state_plane"], [0, "y_coordinate_state_plane"], [0, "open_data_channel_type"], [0, "park_facility_name"], [0, "park_borough"], [0, "vehicle_type"], [0, "taxi_company_borough"], [0, "taxi_pick_up_location"], [0, "bridge_highway_name"], [0, "bridge_highway_direction"], [0, "road_ramp"], [0, "bridge_highway_segment"], [0, "latitude"], [0, "longitude"],
            [1, "hvfhs_license_num"], [1, "dispatching_base_num"], [1, "originating_base_num"], [1, "request_datetime"], [1, "on_scene_datetime"], [1, "pickup_datetime"], [1, "dropoff_datetime"], [1, "PULocationID"], [1, "DOLocationID"], [1, "trip_miles"], [1, "trip_time"], [1, "base_passenger_fare"], [1, "tolls"], [1, "bcf"], [1, "sales_tax"], [1, "congestion_surcharge"], [1, "airport_fee"], [1, "tips"], [1, "driver_pay"], [1, "shared_request_flag"], [1, "shared_match_flag"], [1, "access_a_ride_flag"], [1, "wav_request_flag"], [1, "wav_match_flag"],
            [2, "VendorID"], [2, "tpep_pickup_datetime"], [2, "tpep_dropoff_datetime"], [2, "passenger_count"], [2, "trip_distance"], [2, "RatecodeID"], [2, "store_and_fwd_flag"], [2, "PULocationID"], [2, "DOLocationID"], [2, "payment_type"], [2, "fare_amount"], [2, "extra"], [2, "mta_tax"], [2, "tip_amount"], [2, "tolls_amount"], [2, "improvement_surcharge"], [2, "total_amount"], [2, "congestion_surcharge"], [2, "airport_fee"]],
        "column_names_original": [[-1, "*"],
            [0, "unique_key"], [0, "created_date"], [0, "closed_date"], [0, "agency"], [0, "agency_name"], [0, "complaint_type"], [0, "descriptor"], [0, "location_type"], [0, "incident_zip"], [0, "incident_address"], [0, "street_name"], [0, "cross_street_1"], [0, "cross_street_2"], [0, "intersection_street_1"], [0, "intersection_street_2"], [0, "address_type"], [0, "city"], [0, "landmark"], [0, "facility_type"], [0, "status"], [0, "due_date"], [0, "resolution_description"], [0, "resolution_action_updated_date"], [0, "community_board"], [0, "bbl"], [0, "borough"], [0, "x_coordinate_state_plane"], [0, "y_coordinate_state_plane"], [0, "open_data_channel_type"], [0, "park_facility_name"], [0, "park_borough"], [0, "vehicle_type"], [0, "taxi_company_borough"], [0, "taxi_pick_up_location"], [0, "bridge_highway_name"], [0, "bridge_highway_direction"], [0, "road_ramp"], [0, "bridge_highway_segment"], [0, "latitude"], [0, "longitude"],
            [1, "hvfhs_license_num"], [1, "dispatching_base_num"], [1, "originating_base_num"], [1, "request_datetime"], [1, "on_scene_datetime"], [1, "pickup_datetime"], [1, "dropoff_datetime"], [1, "PULocationID"], [1, "DOLocationID"], [1, "trip_miles"], [1, "trip_time"], [1, "base_passenger_fare"], [1, "tolls"], [1, "bcf"], [1, "sales_tax"], [1, "congestion_surcharge"], [1, "airport_fee"], [1, "tips"], [1, "driver_pay"], [1, "shared_request_flag"], [1, "shared_match_flag"], [1, "access_a_ride_flag"], [1, "wav_request_flag"], [1, "wav_match_flag"],
            [2, "VendorID"], [2, "tpep_pickup_datetime"], [2, "tpep_dropoff_datetime"], [2, "passenger_count"], [2, "trip_distance"], [2, "RatecodeID"], [2, "store_and_fwd_flag"], [2, "PULocationID"], [2, "DOLocationID"], [2, "payment_type"], [2, "fare_amount"], [2, "extra"], [2, "mta_tax"], [2, "tip_amount"], [2, "tolls_amount"], [2, "improvement_surcharge"], [2, "total_amount"], [2, "congestion_surcharge"], [2, "airport_fee"]],
        "column_types": ["text", "bigint", "timestamp", "timestamp", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "timestamp", "varchar", "timestamp", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "double", "double", "varchar", "varchar", "varchar", "timestamp", "timestamp", "timestamp", "timestamp", "bigint", "bigint", "double", "bigint", "double", "double", "double", "double", "double", "double", "double", "double", "varchar", "varchar", "varchar", "varchar", "varchar", "bigint", "timestamp", "timestamp", "double", "double", "double", "varchar", "bigint", "bigint", "bigint", "double", "double", "double", "double", "double", "double", "double", "double", "double"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["service_requests", "rideshare", "taxi"],
        "table_names_original": ["service_requests", "rideshare", "taxi"]
    },
    {
        "db_id": "product",
        "column_names": [[-1, "*"], [0, "maker"], [0, "model"], [0, "type"]],
        "column_names_original": [[-1, "*"], [0, "maker"], [0, "model"], [0, "type"]],
        "column_types": ["text", "varchar", "varchar", "varchar"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["products"],
        "table_names_original": ["products"]
    },
    {
        "db_id": "json",
        "column_names": [[-1, "*"], [0, "employee"]],
        "column_names_original": [[-1, "*"], [0, "employee"]],
        "column_types": ["text", "struct(id int, first_name text, last_name text, email text)"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["employee_json"],
        "table_names_original": ["employee_json"]
    },
    {
        "db_id": "flightinfo",
        "column_names": [[-1, "*"],
            [0, "icao_code"], [0, "email"], [0, "name"], [0, "phone_number"], [0, "iata_code"],
            [1, "title"], [1, "description"], [1, "price"], [1, "service_type"], [1, "airline_icao_code"],
            [2, "iata_code"], [2, "address"], [2, "name"], [2, "phone_number"], [2, "email"], [2, "city_zip_code"], [2, "city_dbpedia"],
            [3, "title"], [3, "cabin_bag_dimension_cm"], [3, "cabin_bags_no"], [3, "cabin_bg_weight_kg"], [3, "checked_bag_dimension_cm"], [3, "checked_bags_no"], [3, "checked_bag_weight_kg"], [3, "excessive_price_perkg"], [3, "flight_type"], [3, "airline_icao_code"],
            [4, "title"], [4, "description"], [4, "airline_icao_code"],
            [5, "title"], [5, "description"], [5, "due_date"], [5, "refund_postdue_percentage"], [5, "refund_predue_percentage"], [5, "airline_icao_code"],
            [6, "city_zipcode"], [6, "city_name"], [6, "country_iso_code"],
            [7, "country_iso_code"], [7, "country_name"],
            [8, "flight_number"], [8, "departure_airport_iata_code"], [8, "arriving_airport_iata_code"],
            [9, "number"], [9, "departure_date"], [9, "arrival_date"], [9, "distance_km"], [9, "is_available"], [9, "duration_min"], [9, "airline_icao_code"], [9, "type"],
            [10, "title"], [10, "description"], [10, "cabin_class_title"], [10, "baggage_policy_title"], [10, "cancelation_policy_title"],
            [11, "subflight_number"], [11, "flight_number"],
            [12, "flight_package_title"], [12, "airline_service_title"],
            [13, "seat_number"], [13, "is_available"], [13, "flight_number"],
            [14, "meal_type"], [14, "airline_service_title"],
            [15, "duration_min"], [15, "duration_from"], [15, "duration_to"], [15, "airport_iatacode"], [15, "flight_number"],
            [16, "flight_number"], [16, "package_title"], [16, "trip_id"], [16, "requested_excessive_baggage_kg"], [16, "seat_number"], [16, "chosen_meal_service_price"], [16, "chosen_wifi_service_price"], [16, "price"],
            [17, "id"], [17, "tax"], [17, "booking_date"], [17, "user_email"], [17, "type"],
            [18, "email"], [18, "first_name"], [18, "last_name"], [18, "birthdate"], [18, "passport_number"], [18, "address"], [18, "password"], [18, "phone_number"],
            [19, "wifi_onboard_service_bandwidth_MB"], [19, "airline_service_title"]],
        "column_names_original": [[-1, "*"],
            [0, "icao_code"], [0, "email"], [0, "name"], [0, "phone_number"], [0, "iata_code"],
            [1, "title"], [1, "description"], [1, "price"], [1, "service_type"], [1, "airline_icao_code"],
            [2, "iata_code"], [2, "address"], [2, "name"], [2, "phone_number"], [2, "email"], [2, "city_zip_code"], [2, "city_dbpedia"],
            [3, "title"], [3, "cabin_bag_dimension_cm"], [3, "cabin_bags_no"], [3, "cabin_bg_weight_kg"], [3, "checked_bag_dimension_cm"], [3, "checked_bags_no"], [3, "checked_bag_weight_kg"], [3, "excessive_price_perkg"], [3, "flight_type"], [3, "airline_icao_code"],
            [4, "title"], [4, "description"], [4, "airline_icao_code"],
            [5, "title"], [5, "description"], [5, "due_date"], [5, "refund_postdue_percentage"], [5, "refund_predue_percentage"], [5, "airline_icao_code"],
            [6, "city_zipcode"], [6, "city_name"], [6, "country_iso_code"],
            [7, "country_iso_code"], [7, "country_name"],
            [8, "flight_number"], [8, "departure_airport_iata_code"], [8, "arriving_airport_iata_code"],
            [9, "number"], [9, "departure_date"], [9, "arrival_date"], [9, "distance_km"], [9, "is_available"], [9, "duration_min"], [9, "airline_icao_code"], [9, "type"],
            [10, "title"], [10, "description"], [10, "cabin_class_title"], [10, "baggage_policy_title"], [10, "cancelation_policy_title"],
            [11, "subflight_number"], [11, "flight_number"],
            [12, "flight_package_title"], [12, "airline_service_title"],
            [13, "seat_number"], [13, "is_available"], [13, "flight_number"],
            [14, "meal_type"], [14, "airline_service_title"],
            [15, "duration_min"], [15, "duration_from"], [15, "duration_to"], [15, "airport_iatacode"], [15, "flight_number"],
            [16, "flight_number"], [16, "package_title"], [16, "trip_id"], [16, "requested_excessive_baggage_kg"], [16, "seat_number"], [16, "chosen_meal_service_price"], [16, "chosen_wifi_service_price"], [16, "price"],
            [17, "id"], [17, "tax"], [17, "booking_date"], [17, "user_email"], [17, "type"],
            [18, "email"], [18, "first_name"], [18, "last_name"], [18, "birthdate"], [18, "passport_number"], [18, "address"], [18, "password"], [18, "phone_number"],
            [19, "wifi_onboard_service_bandwidth_MB"], [19, "airline_service_title"]],
        "column_types": ["text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "double", "varchar", "varchar", "varchar", "text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "double", "double", "double", "double", "double", "double", "double", "varchar", "varchar", "varchar", "text", "varchar", "varchar", "text", "text", "int", "int", "varchar", "varchar", "varchar", "varchar", "varchar", "text", "varchar", "varchar", "varchar", "varchar", "datetime", "datetime", "double", "tinyint", "double", "varchar", "varchar", "varchar", "text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "tinyint", "varchar", "varchar", "varchar", "double", "datetime", "datetime", "varchar", "varchar", "varchar", "varchar", "int", "int", "varchar", "int", "int", "double", "int", "double", "datetime", "varchar", "varchar", "varchar", "varchar", "varchar", "date", "varchar", "varchar", "varchar", "double", "double", "varchar"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": ["airline", "airline_service", "airport", "baggage_policy", "cabin_class", "cancellation_policy", "city", "country", "direct_flight", "flight", "flight_package", "non_direct_flight", "package_service", "seat", "special_meal_type", "stopping", "ticke", "trip", "user", "wifi_onboard_service"],
        "table_names_original": ["airline", "airline_service", "airport", "baggage_policy", "cabin_class", "cancellation_policy", "city", "country", "direct_flight", "flight", "flight_package", "non_direct_flight", "package_service", "seat", "special_meal_type", "stopping", "ticke", "trip", "user", "wifi_onboard_service"]
    },
    {
        "db_id": "none",
        "column_names": [[-1, "*"]],
        "column_names_original": [[-1, "*"]],
        "column_types": ["text"],
        "foreign_keys": {},
        "primary_keys": {},
        "table_names": [],
        "table_names_original": []
    },
2527 |
+
{
|
2528 |
+
"db_id": "laptop_array",
|
2529 |
+
"column_names": [
|
2530 |
+
[
|
2531 |
+
-1,
|
2532 |
+
"*"
|
2533 |
+
],
|
2534 |
+
[
|
2535 |
+
0,
|
2536 |
+
"customer_id"
|
2537 |
+
],
|
2538 |
+
[
|
2539 |
+
0,
|
2540 |
+
"firstname"
|
2541 |
+
],
|
2542 |
+
[
|
2543 |
+
0,
|
2544 |
+
"lastname"
|
2545 |
+
],
|
2546 |
+
[
|
2547 |
+
0,
|
2548 |
+
"city"
|
2549 |
+
],
|
2550 |
+
[
|
2551 |
+
0,
|
2552 |
+
"address"
|
2553 |
+
],
|
2554 |
+
[
|
2555 |
+
0,
|
2556 |
+
"email"
|
2557 |
+
],
|
2558 |
+
[
|
2559 |
+
0,
|
2560 |
+
"phone_numbers"
|
2561 |
+
],
|
2562 |
+
[
|
2563 |
+
1,
|
2564 |
+
"model"
|
2565 |
+
],
|
2566 |
+
[
|
2567 |
+
1,
|
2568 |
+
"speed"
|
2569 |
+
],
|
2570 |
+
[
|
2571 |
+
1,
|
2572 |
+
"ram"
|
2573 |
+
],
|
2574 |
+
[
|
2575 |
+
1,
|
2576 |
+
"hd"
|
2577 |
+
],
|
2578 |
+
[
|
2579 |
+
1,
|
2580 |
+
"screen"
|
2581 |
+
],
|
2582 |
+
[
|
2583 |
+
1,
|
2584 |
+
"price"
|
2585 |
+
],
|
2586 |
+
[
|
2587 |
+
2,
|
2588 |
+
"model"
|
2589 |
+
],
|
2590 |
+
[
|
2591 |
+
2,
|
2592 |
+
"speed"
|
2593 |
+
],
|
2594 |
+
[
|
2595 |
+
2,
|
2596 |
+
"ram"
|
2597 |
+
],
|
2598 |
+
[
|
2599 |
+
2,
|
2600 |
+
"hd"
|
2601 |
+
],
|
2602 |
+
[
|
2603 |
+
2,
|
2604 |
+
"price"
|
2605 |
+
],
|
2606 |
+
[
|
2607 |
+
3,
|
2608 |
+
"model"
|
2609 |
+
],
|
2610 |
+
[
|
2611 |
+
3,
|
2612 |
+
"color"
|
2613 |
+
],
|
2614 |
+
[
|
2615 |
+
3,
|
2616 |
+
"type"
|
2617 |
+
],
|
2618 |
+
[
|
2619 |
+
3,
|
2620 |
+
"price"
|
2621 |
+
],
|
2622 |
+
[
|
2623 |
+
4,
|
2624 |
+
"maker"
|
2625 |
+
],
|
2626 |
+
[
|
2627 |
+
4,
|
2628 |
+
"model"
|
2629 |
+
],
|
2630 |
+
[
|
2631 |
+
4,
|
2632 |
+
"type"
|
2633 |
+
],
|
2634 |
+
[
|
2635 |
+
5,
|
2636 |
+
"customer_id"
|
2637 |
+
],
|
2638 |
+
[
|
2639 |
+
5,
|
2640 |
+
"model"
|
2641 |
+
],
|
2642 |
+
[
|
2643 |
+
5,
|
2644 |
+
"quantity"
|
2645 |
+
],
|
2646 |
+
[
|
2647 |
+
5,
|
2648 |
+
"day"
|
2649 |
+
],
|
2650 |
+
[
|
2651 |
+
5,
|
2652 |
+
"paid"
|
2653 |
+
],
|
2654 |
+
[
|
2655 |
+
5,
|
2656 |
+
"type_of_payment"
|
2657 |
+
]
|
2658 |
+
],
|
2659 |
+
"column_names_original": [
|
2660 |
+
[
|
2661 |
+
-1,
|
2662 |
+
"*"
|
2663 |
+
],
|
2664 |
+
[
|
2665 |
+
0,
|
2666 |
+
"customer_id"
|
2667 |
+
],
|
2668 |
+
[
|
2669 |
+
0,
|
2670 |
+
"firstname"
|
2671 |
+
],
|
2672 |
+
[
|
2673 |
+
0,
|
2674 |
+
"lastname"
|
2675 |
+
],
|
2676 |
+
[
|
2677 |
+
0,
|
2678 |
+
"city"
|
2679 |
+
],
|
2680 |
+
[
|
2681 |
+
0,
|
2682 |
+
"address"
|
2683 |
+
],
|
2684 |
+
[
|
2685 |
+
0,
|
2686 |
+
"email"
|
2687 |
+
],
|
2688 |
+
[
|
2689 |
+
0,
|
2690 |
+
"phone_number"
|
2691 |
+
],
|
2692 |
+
[
|
2693 |
+
1,
|
2694 |
+
"model"
|
2695 |
+
],
|
2696 |
+
[
|
2697 |
+
1,
|
2698 |
+
"speed"
|
2699 |
+
],
|
2700 |
+
[
|
2701 |
+
1,
|
2702 |
+
"ram"
|
2703 |
+
],
|
2704 |
+
[
|
2705 |
+
1,
|
2706 |
+
"hd"
|
2707 |
+
],
|
2708 |
+
[
|
2709 |
+
1,
|
2710 |
+
"screen"
|
2711 |
+
],
|
2712 |
+
[
|
2713 |
+
1,
|
2714 |
+
"price"
|
2715 |
+
],
|
2716 |
+
[
|
2717 |
+
2,
|
2718 |
+
"model"
|
2719 |
+
],
|
2720 |
+
[
|
2721 |
+
2,
|
2722 |
+
"speed"
|
2723 |
+
],
|
2724 |
+
[
|
2725 |
+
2,
|
2726 |
+
"ram"
|
2727 |
+
],
|
2728 |
+
[
|
2729 |
+
2,
|
2730 |
+
"hd"
|
2731 |
+
],
|
2732 |
+
[
|
2733 |
+
2,
|
2734 |
+
"price"
|
2735 |
+
],
|
2736 |
+
[
|
2737 |
+
3,
|
2738 |
+
"model"
|
2739 |
+
],
|
2740 |
+
[
|
2741 |
+
3,
|
2742 |
+
"color"
|
2743 |
+
],
|
2744 |
+
[
|
2745 |
+
3,
|
2746 |
+
"type"
|
2747 |
+
],
|
2748 |
+
[
|
2749 |
+
3,
|
2750 |
+
"price"
|
2751 |
+
],
|
2752 |
+
[
|
2753 |
+
4,
|
2754 |
+
"maker"
|
2755 |
+
],
|
2756 |
+
[
|
2757 |
+
4,
|
2758 |
+
"model"
|
2759 |
+
],
|
2760 |
+
[
|
2761 |
+
4,
|
2762 |
+
"type"
|
2763 |
+
],
|
2764 |
+
[
|
2765 |
+
5,
|
2766 |
+
"customer_id"
|
2767 |
+
],
|
2768 |
+
[
|
2769 |
+
5,
|
2770 |
+
"model"
|
2771 |
+
],
|
2772 |
+
[
|
2773 |
+
5,
|
2774 |
+
"quantity"
|
2775 |
+
],
|
2776 |
+
[
|
2777 |
+
5,
|
2778 |
+
"day"
|
2779 |
+
],
|
2780 |
+
[
|
2781 |
+
5,
|
2782 |
+
"paid"
|
2783 |
+
],
|
2784 |
+
[
|
2785 |
+
5,
|
2786 |
+
"type_of_payment"
|
2787 |
+
]
|
2788 |
+
],
|
2789 |
+
"column_types": [
|
2790 |
+
"text",
|
2791 |
+
"char",
|
2792 |
+
"varchar",
|
2793 |
+
"varchar",
|
2794 |
+
"varchar",
|
2795 |
+
"varchar",
|
2796 |
+
"varchar",
|
2797 |
+
"array",
|
2798 |
+
"char",
|
2799 |
+
"double",
|
2800 |
+
"int",
|
2801 |
+
"int",
|
2802 |
+
"double",
|
2803 |
+
"double",
|
2804 |
+
"char",
|
2805 |
+
"double",
|
2806 |
+
"int",
|
2807 |
+
"int",
|
2808 |
+
"double",
|
2809 |
+
"char",
|
2810 |
+
"varchar",
|
2811 |
+
"varchar",
|
2812 |
+
"double",
|
2813 |
+
"char",
|
2814 |
+
"char",
|
2815 |
+
"varchar",
|
2816 |
+
"char",
|
2817 |
+
"char",
|
2818 |
+
"int",
|
2819 |
+
"date",
|
2820 |
+
"double",
|
2821 |
+
"varchar"
|
2822 |
+
],
|
2823 |
+
"foreign_keys": {},
|
2824 |
+
"primary_keys": {},
|
2825 |
+
"table_names": [
|
2826 |
+
"customers",
|
2827 |
+
"laptops",
|
2828 |
+
"pcs",
|
2829 |
+
"printers",
|
2830 |
+
"products",
|
2831 |
+
"sales"
|
2832 |
+
],
|
2833 |
+
"table_names_original": [
|
2834 |
+
"customers",
|
2835 |
+
"laptops",
|
2836 |
+
"pcs",
|
2837 |
+
"printers",
|
2838 |
+
"products",
|
2839 |
+
"sales"
|
2840 |
+
]
|
2841 |
+
},
|
2842 |
+
{
|
2843 |
+
"db_id": "laptop_struct",
|
2844 |
+
"column_names": [
|
2845 |
+
[
|
2846 |
+
-1,
|
2847 |
+
"*"
|
2848 |
+
],
|
2849 |
+
[
|
2850 |
+
0,
|
2851 |
+
"person"
|
2852 |
+
],
|
2853 |
+
[
|
2854 |
+
1,
|
2855 |
+
"customer_id"
|
2856 |
+
],
|
2857 |
+
[
|
2858 |
+
1,
|
2859 |
+
"firstname"
|
2860 |
+
],
|
2861 |
+
[
|
2862 |
+
1,
|
2863 |
+
"lastname"
|
2864 |
+
],
|
2865 |
+
[
|
2866 |
+
1,
|
2867 |
+
"city"
|
2868 |
+
],
|
2869 |
+
[
|
2870 |
+
1,
|
2871 |
+
"address"
|
2872 |
+
],
|
2873 |
+
[
|
2874 |
+
1,
|
2875 |
+
"email"
|
2876 |
+
],
|
2877 |
+
[
|
2878 |
+
2,
|
2879 |
+
"model"
|
2880 |
+
],
|
2881 |
+
[
|
2882 |
+
2,
|
2883 |
+
"speed"
|
2884 |
+
],
|
2885 |
+
[
|
2886 |
+
2,
|
2887 |
+
"ram"
|
2888 |
+
],
|
2889 |
+
[
|
2890 |
+
2,
|
2891 |
+
"hd"
|
2892 |
+
],
|
2893 |
+
[
|
2894 |
+
2,
|
2895 |
+
"screen"
|
2896 |
+
],
|
2897 |
+
[
|
2898 |
+
2,
|
2899 |
+
"price"
|
2900 |
+
],
|
2901 |
+
[
|
2902 |
+
3,
|
2903 |
+
"model"
|
2904 |
+
],
|
2905 |
+
[
|
2906 |
+
3,
|
2907 |
+
"speed"
|
2908 |
+
],
|
2909 |
+
[
|
2910 |
+
3,
|
2911 |
+
"ram"
|
2912 |
+
],
|
2913 |
+
[
|
2914 |
+
3,
|
2915 |
+
"hd"
|
2916 |
+
],
|
2917 |
+
[
|
2918 |
+
3,
|
2919 |
+
"price"
|
2920 |
+
],
|
2921 |
+
[
|
2922 |
+
4,
|
2923 |
+
"model"
|
2924 |
+
],
|
2925 |
+
[
|
2926 |
+
4,
|
2927 |
+
"color"
|
2928 |
+
],
|
2929 |
+
[
|
2930 |
+
4,
|
2931 |
+
"type"
|
2932 |
+
],
|
2933 |
+
[
|
2934 |
+
4,
|
2935 |
+
"price"
|
2936 |
+
],
|
2937 |
+
[
|
2938 |
+
5,
|
2939 |
+
"maker"
|
2940 |
+
],
|
2941 |
+
[
|
2942 |
+
5,
|
2943 |
+
"model"
|
2944 |
+
],
|
2945 |
+
[
|
2946 |
+
5,
|
2947 |
+
"type"
|
2948 |
+
],
|
2949 |
+
[
|
2950 |
+
6,
|
2951 |
+
"customer_id"
|
2952 |
+
],
|
2953 |
+
[
|
2954 |
+
6,
|
2955 |
+
"model"
|
2956 |
+
],
|
2957 |
+
[
|
2958 |
+
6,
|
2959 |
+
"quantity"
|
2960 |
+
],
|
2961 |
+
[
|
2962 |
+
6,
|
2963 |
+
"day"
|
2964 |
+
],
|
2965 |
+
[
|
2966 |
+
6,
|
2967 |
+
"paid"
|
2968 |
+
],
|
2969 |
+
[
|
2970 |
+
6,
|
2971 |
+
"type_of_payment"
|
2972 |
+
]
|
2973 |
+
],
|
2974 |
+
"column_names_original": [
|
2975 |
+
[
|
2976 |
+
-1,
|
2977 |
+
"*"
|
2978 |
+
],
|
2979 |
+
[
|
2980 |
+
0,
|
2981 |
+
"person"
|
2982 |
+
],
|
2983 |
+
[
|
2984 |
+
1,
|
2985 |
+
"customer_id"
|
2986 |
+
],
|
2987 |
+
[
|
2988 |
+
1,
|
2989 |
+
"firstname"
|
2990 |
+
],
|
2991 |
+
[
|
2992 |
+
1,
|
2993 |
+
"lastname"
|
2994 |
+
],
|
2995 |
+
[
|
2996 |
+
1,
|
2997 |
+
"city"
|
2998 |
+
],
|
2999 |
+
[
|
3000 |
+
1,
|
3001 |
+
"address"
|
3002 |
+
],
|
3003 |
+
[
|
3004 |
+
1,
|
3005 |
+
"email"
|
3006 |
+
],
|
3007 |
+
[
|
3008 |
+
2,
|
3009 |
+
"model"
|
3010 |
+
],
|
3011 |
+
[
|
3012 |
+
2,
|
3013 |
+
"speed"
|
3014 |
+
],
|
3015 |
+
[
|
3016 |
+
2,
|
3017 |
+
"ram"
|
3018 |
+
],
|
3019 |
+
[
|
3020 |
+
2,
|
3021 |
+
"hd"
|
3022 |
+
],
|
3023 |
+
[
|
3024 |
+
2,
|
3025 |
+
"screen"
|
3026 |
+
],
|
3027 |
+
[
|
3028 |
+
2,
|
3029 |
+
"price"
|
3030 |
+
],
|
3031 |
+
[
|
3032 |
+
3,
|
3033 |
+
"model"
|
3034 |
+
],
|
3035 |
+
[
|
3036 |
+
3,
|
3037 |
+
"speed"
|
3038 |
+
],
|
3039 |
+
[
|
3040 |
+
3,
|
3041 |
+
"ram"
|
3042 |
+
],
|
3043 |
+
[
|
3044 |
+
3,
|
3045 |
+
"hd"
|
3046 |
+
],
|
3047 |
+
[
|
3048 |
+
3,
|
3049 |
+
"price"
|
3050 |
+
],
|
3051 |
+
[
|
3052 |
+
4,
|
3053 |
+
"model"
|
3054 |
+
],
|
3055 |
+
[
|
3056 |
+
4,
|
3057 |
+
"color"
|
3058 |
+
],
|
3059 |
+
[
|
3060 |
+
4,
|
3061 |
+
"type"
|
3062 |
+
],
|
3063 |
+
[
|
3064 |
+
4,
|
3065 |
+
"price"
|
3066 |
+
],
|
3067 |
+
[
|
3068 |
+
5,
|
3069 |
+
"maker"
|
3070 |
+
],
|
3071 |
+
[
|
3072 |
+
5,
|
3073 |
+
"model"
|
3074 |
+
],
|
3075 |
+
[
|
3076 |
+
5,
|
3077 |
+
"type"
|
3078 |
+
],
|
3079 |
+
[
|
3080 |
+
6,
|
3081 |
+
"customer_id"
|
3082 |
+
],
|
3083 |
+
[
|
3084 |
+
6,
|
3085 |
+
"model"
|
3086 |
+
],
|
3087 |
+
[
|
3088 |
+
6,
|
3089 |
+
"quantity"
|
3090 |
+
],
|
3091 |
+
[
|
3092 |
+
6,
|
3093 |
+
"day"
|
3094 |
+
],
|
3095 |
+
[
|
3096 |
+
6,
|
3097 |
+
"paid"
|
3098 |
+
],
|
3099 |
+
[
|
3100 |
+
6,
|
3101 |
+
"type_of_payment"
|
3102 |
+
]
|
3103 |
+
],
|
3104 |
+
"column_types": [
|
3105 |
+
"text",
|
3106 |
+
"struct(id int, name: text)",
|
3107 |
+
"char",
|
3108 |
+
"varchar",
|
3109 |
+
"varchar",
|
3110 |
+
"varchar",
|
3111 |
+
"varchar",
|
3112 |
+
"varchar",
|
3113 |
+
"char",
|
3114 |
+
"double",
|
3115 |
+
"int",
|
3116 |
+
"int",
|
3117 |
+
"double",
|
3118 |
+
"double",
|
3119 |
+
"char",
|
3120 |
+
"double",
|
3121 |
+
"int",
|
3122 |
+
"int",
|
3123 |
+
"double",
|
3124 |
+
"char",
|
3125 |
+
"varchar",
|
3126 |
+
"varchar",
|
3127 |
+
"double",
|
3128 |
+
"char",
|
3129 |
+
"char",
|
3130 |
+
"varchar",
|
3131 |
+
"char",
|
3132 |
+
"char",
|
3133 |
+
"int",
|
3134 |
+
"date",
|
3135 |
+
"double",
|
3136 |
+
"varchar"
|
3137 |
+
],
|
3138 |
+
"foreign_keys": {},
|
3139 |
+
"primary_keys": {},
|
3140 |
+
"table_names": [
|
3141 |
+
"test",
|
3142 |
+
"customers",
|
3143 |
+
"laptops",
|
3144 |
+
"pcs",
|
3145 |
+
"printers",
|
3146 |
+
"products",
|
3147 |
+
"sales"
|
3148 |
+
],
|
3149 |
+
"table_names_original": [
|
3150 |
+
"test",
|
3151 |
+
"customers",
|
3152 |
+
"laptops",
|
3153 |
+
"pcs",
|
3154 |
+
"printers",
|
3155 |
+
"products",
|
3156 |
+
"sales"
|
3157 |
+
]
|
3158 |
+
},
|
3159 |
+
{
|
3160 |
+
"db_id": "laptop_json",
|
3161 |
+
"column_names": [
|
3162 |
+
[
|
3163 |
+
-1,
|
3164 |
+
"*"
|
3165 |
+
],
|
3166 |
+
[
|
3167 |
+
0,
|
3168 |
+
"customer_id"
|
3169 |
+
],
|
3170 |
+
[
|
3171 |
+
0,
|
3172 |
+
"firstname"
|
3173 |
+
],
|
3174 |
+
[
|
3175 |
+
0,
|
3176 |
+
"lastname"
|
3177 |
+
],
|
3178 |
+
[
|
3179 |
+
0,
|
3180 |
+
"city"
|
3181 |
+
],
|
3182 |
+
[
|
3183 |
+
0,
|
3184 |
+
"address"
|
3185 |
+
],
|
3186 |
+
[
|
3187 |
+
0,
|
3188 |
+
"email"
|
3189 |
+
],
|
3190 |
+
[
|
3191 |
+
1,
|
3192 |
+
"model"
|
3193 |
+
],
|
3194 |
+
[
|
3195 |
+
1,
|
3196 |
+
"speed"
|
3197 |
+
],
|
3198 |
+
[
|
3199 |
+
1,
|
3200 |
+
"ram"
|
3201 |
+
],
|
3202 |
+
[
|
3203 |
+
1,
|
3204 |
+
"hd"
|
3205 |
+
],
|
3206 |
+
[
|
3207 |
+
1,
|
3208 |
+
"screen"
|
3209 |
+
],
|
3210 |
+
[
|
3211 |
+
1,
|
3212 |
+
"price"
|
3213 |
+
],
|
3214 |
+
[
|
3215 |
+
2,
|
3216 |
+
"model"
|
3217 |
+
],
|
3218 |
+
[
|
3219 |
+
2,
|
3220 |
+
"speed"
|
3221 |
+
],
|
3222 |
+
[
|
3223 |
+
2,
|
3224 |
+
"ram"
|
3225 |
+
],
|
3226 |
+
[
|
3227 |
+
2,
|
3228 |
+
"hd"
|
3229 |
+
],
|
3230 |
+
[
|
3231 |
+
2,
|
3232 |
+
"price"
|
3233 |
+
],
|
3234 |
+
[
|
3235 |
+
3,
|
3236 |
+
"model"
|
3237 |
+
],
|
3238 |
+
[
|
3239 |
+
3,
|
3240 |
+
"color"
|
3241 |
+
],
|
3242 |
+
[
|
3243 |
+
3,
|
3244 |
+
"type"
|
3245 |
+
],
|
3246 |
+
[
|
3247 |
+
3,
|
3248 |
+
"price"
|
3249 |
+
],
|
3250 |
+
[
|
3251 |
+
4,
|
3252 |
+
"maker"
|
3253 |
+
],
|
3254 |
+
[
|
3255 |
+
4,
|
3256 |
+
"model"
|
3257 |
+
],
|
3258 |
+
[
|
3259 |
+
4,
|
3260 |
+
"type"
|
3261 |
+
],
|
3262 |
+
[
|
3263 |
+
5,
|
3264 |
+
"customer_id"
|
3265 |
+
],
|
3266 |
+
[
|
3267 |
+
5,
|
3268 |
+
"model"
|
3269 |
+
],
|
3270 |
+
[
|
3271 |
+
5,
|
3272 |
+
"quantity"
|
3273 |
+
],
|
3274 |
+
[
|
3275 |
+
5,
|
3276 |
+
"day"
|
3277 |
+
],
|
3278 |
+
[
|
3279 |
+
5,
|
3280 |
+
"paid"
|
3281 |
+
],
|
3282 |
+
[
|
3283 |
+
5,
|
3284 |
+
"type_of_payment"
|
3285 |
+
]
|
3286 |
+
],
|
3287 |
+
"column_names_original": [
|
3288 |
+
[
|
3289 |
+
-1,
|
3290 |
+
"*"
|
3291 |
+
],
|
3292 |
+
[
|
3293 |
+
0,
|
3294 |
+
"customer_id"
|
3295 |
+
],
|
3296 |
+
[
|
3297 |
+
0,
|
3298 |
+
"firstname"
|
3299 |
+
],
|
3300 |
+
[
|
3301 |
+
0,
|
3302 |
+
"lastname"
|
3303 |
+
],
|
3304 |
+
[
|
3305 |
+
0,
|
3306 |
+
"city"
|
3307 |
+
],
|
3308 |
+
[
|
3309 |
+
0,
|
3310 |
+
"address"
|
3311 |
+
],
|
3312 |
+
[
|
3313 |
+
0,
|
3314 |
+
"email"
|
3315 |
+
],
|
3316 |
+
[
|
3317 |
+
1,
|
3318 |
+
"model"
|
3319 |
+
],
|
3320 |
+
[
|
3321 |
+
1,
|
3322 |
+
"speed"
|
3323 |
+
],
|
3324 |
+
[
|
3325 |
+
1,
|
3326 |
+
"ram"
|
3327 |
+
],
|
3328 |
+
[
|
3329 |
+
1,
|
3330 |
+
"hd"
|
3331 |
+
],
|
3332 |
+
[
|
3333 |
+
1,
|
3334 |
+
"screen"
|
3335 |
+
],
|
3336 |
+
[
|
3337 |
+
1,
|
3338 |
+
"price"
|
3339 |
+
],
|
3340 |
+
[
|
3341 |
+
2,
|
3342 |
+
"model"
|
3343 |
+
],
|
3344 |
+
[
|
3345 |
+
2,
|
3346 |
+
"speed"
|
3347 |
+
],
|
3348 |
+
[
|
3349 |
+
2,
|
3350 |
+
"ram"
|
3351 |
+
],
|
3352 |
+
[
|
3353 |
+
2,
|
3354 |
+
"hd"
|
3355 |
+
],
|
3356 |
+
[
|
3357 |
+
2,
|
3358 |
+
"price"
|
3359 |
+
],
|
3360 |
+
[
|
3361 |
+
3,
|
3362 |
+
"model"
|
3363 |
+
],
|
3364 |
+
[
|
3365 |
+
3,
|
3366 |
+
"color"
|
3367 |
+
],
|
3368 |
+
[
|
3369 |
+
3,
|
3370 |
+
"type"
|
3371 |
+
],
|
3372 |
+
[
|
3373 |
+
3,
|
3374 |
+
"price"
|
3375 |
+
],
|
3376 |
+
[
|
3377 |
+
4,
|
3378 |
+
"maker"
|
3379 |
+
],
|
3380 |
+
[
|
3381 |
+
4,
|
3382 |
+
"model"
|
3383 |
+
],
|
3384 |
+
[
|
3385 |
+
4,
|
3386 |
+
"type"
|
3387 |
+
],
|
3388 |
+
[
|
3389 |
+
5,
|
3390 |
+
"customer_id"
|
3391 |
+
],
|
3392 |
+
[
|
3393 |
+
5,
|
3394 |
+
"model"
|
3395 |
+
],
|
3396 |
+
[
|
3397 |
+
5,
|
3398 |
+
"quantity"
|
3399 |
+
],
|
3400 |
+
[
|
3401 |
+
5,
|
3402 |
+
"day"
|
3403 |
+
],
|
3404 |
+
[
|
3405 |
+
5,
|
3406 |
+
"paid"
|
3407 |
+
],
|
3408 |
+
[
|
3409 |
+
5,
|
3410 |
+
"type_of_payment"
|
3411 |
+
]
|
3412 |
+
],
|
3413 |
+
"column_types": [
|
3414 |
+
"text",
|
3415 |
+
"char",
|
3416 |
+
"varchar",
|
3417 |
+
"varchar",
|
3418 |
+
"varchar",
|
3419 |
+
"varchar",
|
3420 |
+
"json",
|
3421 |
+
"char",
|
3422 |
+
"double",
|
3423 |
+
"int",
|
3424 |
+
"int",
|
3425 |
+
"double",
|
3426 |
+
"double",
|
3427 |
+
"char",
|
3428 |
+
"double",
|
3429 |
+
"int",
|
3430 |
+
"int",
|
3431 |
+
"double",
|
3432 |
+
"char",
|
3433 |
+
"varchar",
|
3434 |
+
"varchar",
|
3435 |
+
"double",
|
3436 |
+
"char",
|
3437 |
+
"char",
|
3438 |
+
"varchar",
|
3439 |
+
"char",
|
3440 |
+
"char",
|
3441 |
+
"int",
|
3442 |
+
"date",
|
3443 |
+
"double",
|
3444 |
+
"varchar"
|
3445 |
+
],
|
3446 |
+
"foreign_keys": {},
|
3447 |
+
"primary_keys": {},
|
3448 |
+
"table_names": [
|
3449 |
+
"customers",
|
3450 |
+
"laptops",
|
3451 |
+
"pcs",
|
3452 |
+
"printers",
|
3453 |
+
"products",
|
3454 |
+
"sales"
|
3455 |
+
],
|
3456 |
+
"table_names_original": [
|
3457 |
+
"customers",
|
3458 |
+
"laptops",
|
3459 |
+
"pcs",
|
3460 |
+
"printers",
|
3461 |
+
"products",
|
3462 |
+
"sales"
|
3463 |
+
]
|
3464 |
+
}
|
3465 |
+
]
|
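The schemas above follow the Spider `tables.json` convention: parallel `column_names`/`column_types` arrays indexed against `table_names`, with `[-1, "*"]` as a sentinel for the star column. As a minimal sketch of how such a file can be inspected (the path is an assumption; any copy of the file works):

import json

# Load the Spider-style schema file (hypothetical path).
with open("duckdb-nsql/eval/data/tables.json") as f:
    schemas = json.load(f)

for db in schemas:
    # Skip the [-1, "*"] sentinel when counting real columns.
    n_cols = sum(1 for table_idx, _ in db["column_names_original"] if table_idx != -1)
    print(db["db_id"], db["table_names_original"], n_cols)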
duckdb-nsql/eval/data_utils.py
ADDED
@@ -0,0 +1,79 @@
"""Training data prep utils."""
import json
import re
from collections import defaultdict
from schema import ForeignKey, Table, TableColumn


def read_tables_json(
    schema_file: str,
    lowercase: bool = False,
) -> dict[str, dict[str, Table]]:
    """Read tables json."""
    data = json.load(open(schema_file))
    db_to_tables = {}
    for db in data:
        db_name = db["db_id"]
        table_names = db["table_names_original"]
        db["column_names_original"] = [
            [x[0], x[1]] for x in db["column_names_original"]
        ]
        if lowercase:
            table_names = [tn.lower() for tn in table_names]
        pks = db["primary_keys"]
        fks = db["foreign_keys"]
        tables = defaultdict(list)
        tables_pks = defaultdict(list)
        tables_fks = defaultdict(list)
        for idx, ((ti, col_name), col_type) in enumerate(
            zip(db["column_names_original"], db["column_types"])
        ):
            if ti == -1:
                continue
            if lowercase:
                col_name = col_name.lower()
                col_type = col_type.lower()
            if idx in pks:
                tables_pks[table_names[ti]].append(
                    TableColumn(name=col_name, dtype=col_type)
                )
            for fk in fks:
                if idx == fk[0]:
                    other_column = db["column_names_original"][fk[1]]
                    other_column_type = db["column_types"][fk[1]]
                    other_table = table_names[other_column[0]]
                    tables_fks[table_names[ti]].append(
                        ForeignKey(
                            column=TableColumn(name=col_name, dtype=col_type),
                            references_name=other_table,
                            references_column=TableColumn(
                                name=other_column[1], dtype=other_column_type
                            ),
                        )
                    )
            tables[table_names[ti]].append(TableColumn(name=col_name, dtype=col_type))
        db_to_tables[db_name] = {
            table_name: Table(
                name=table_name,
                columns=tables[table_name],
                pks=tables_pks[table_name],
                fks=tables_fks[table_name],
                examples=None,
            )
            for table_name in tables
        }
    return db_to_tables


def clean_str(target: str) -> str:
    """Clean string for question."""
    if not target:
        return target

    target = re.sub(r"[^\x00-\x7f]", r" ", target)
    line = re.sub(r"''", r" ", target)
    line = re.sub(r"``", r" ", line)
    line = re.sub(r"\"", r"'", line)
    line = re.sub(r"[\t ]+", " ", line)
    return line.strip()
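As a rough usage sketch (the path and the `laptop_array` key are assumptions drawn from the schema file above), `read_tables_json` turns the schema JSON into `Table` objects keyed by database and table name:

from data_utils import read_tables_json, clean_str

db_to_tables = read_tables_json("data/tables.json", lowercase=True)
for table_name, table in db_to_tables["laptop_array"].items():
    print(table_name, [col.name for col in table.columns])

# clean_str strips non-ASCII, normalizes quotes, and collapses whitespace.
print(clean_str('What  is\tthe "best" laptop?'))  # What is the 'best' laptop?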
duckdb-nsql/eval/doc_retriever.py
ADDED
@@ -0,0 +1,92 @@
"""Retrieve documentation for a given query."""

from pathlib import Path
from typing import Any
from rich.console import Console
from tqdm import tqdm
import numpy as np
from manifest import Manifest
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

console = Console(soft_wrap=True)

try:
    EMBEDDING_MODEL = Manifest(
        client_name="openaiembedding",
    )
except Exception as e:
    console.print(e)
    console.print(
        "Failed to load embedding model. The OPENAI API key is likely not set. Please set it to run document retrieval.",
        style="bold red",
    )


def load_documentation(path: Path) -> dict[str, str]:
    """Load documentation from path."""
    content = {}
    for file in path.glob("**/*.md"):
        with open(file, "r") as f:
            data = f.read()
        key = str(file).replace(str(path), "")
        content[key] = data
    return content


def split_documents(content: dict[str, str]) -> dict[str, Any]:
    """Split documents into chunks."""
    md_splitted_docs = []
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    )
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"], chunk_size=500, chunk_overlap=50, length_function=len
    )

    for file, raw_doc in content.items():
        splitted_text = markdown_splitter.split_text(raw_doc)
        for t in splitted_text:
            t.metadata["source"] = file
        md_splitted_docs.extend(splitted_text)

    docs = text_splitter.split_documents(md_splitted_docs)
    docs_as_dict = [doc.dict() for doc in docs]
    return docs_as_dict


def get_embeddings(text: str) -> np.ndarray:
    """Get embeddings."""
    return np.array(EMBEDDING_MODEL.run(text))


def embed_documents(
    chunked_docs: dict[str, Any], key: str = "page_content"
) -> tuple[dict[str, Any], np.ndarray]:
    """Embed documents."""
    all_embeddings = []
    for doc in tqdm(chunked_docs):
        emb = get_embeddings(doc[key])
        doc["embedding"] = emb
        all_embeddings.append(doc["embedding"])
    full_embedding_mat = np.vstack(all_embeddings)
    return chunked_docs, full_embedding_mat


def query_docs(
    query: str,
    docs: dict[str, Any],
    embedding_mat: np.ndarray,
    top_n: int = 10,
    key: str = "page_content",
) -> tuple[list[int], list[str]]:
    """Query documents."""
    query_embedding = get_embeddings(query)
    scores = embedding_mat.dot(query_embedding)
    sorted_indices = np.argsort(scores)[::-1]
    top_n_indices = sorted_indices[:top_n]
    top_n_indices_rev = top_n_indices[::-1]
    returned_docs = []
    for i in top_n_indices_rev:
        returned_docs.append(docs[i][key])
    return top_n_indices_rev.tolist(), returned_docs
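A sketch of how these pieces chain together for retrieval (the docs path and query are assumptions, and the OpenAI embedding client above must be configured with an API key):

from pathlib import Path
from doc_retriever import load_documentation, split_documents, embed_documents, query_docs

content = load_documentation(Path("eval/docs/duckdb-web/docs/archive/0.9.2/sql"))
chunks = split_documents(content)
docs, embedding_mat = embed_documents(chunks)
# Rank chunks by dot-product similarity against the question embedding.
_, passages = query_docs("How do I unnest a list column?", docs, embedding_mat, top_n=3)
for passage in passages:
    print(passage[:100])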
duckdb-nsql/eval/evaluate.py
ADDED
@@ -0,0 +1,392 @@
"""Evaluate text2sql spider model predictions."""
import json
import os
import re
import signal
import sys
import traceback
from pathlib import Path
from typing import Any

import click
import pandas as pd
from rich.console import Console
from tqdm.auto import tqdm

sys.path.append(os.path.join(os.path.dirname(__file__), "."))
# from metrics.spider import evaluation as spider_evaluation  # type: ignore # noqa: E402
from metrics.test_suite_sql_eval import (  # type: ignore # noqa: E402
    evaluation as test_suite_evaluation,
)
from data_utils import read_tables_json  # type: ignore # noqa: E402
from metric_utils import (  # type: ignore # noqa: E402
    correct_casing,
    edit_distance,
)

console = Console(soft_wrap=True)

LEVELS = ["easy", "medium", "hard", "duckdb", "ddl", "all"]
PARTIAL_TYPES = [
    "select",
    "select(no AGG)",
    "where",
    "where(no OP)",
    "group(no Having)",
    "group",
    "order",
    "and/or",
    "IUEN",
    "keywords",
]
TIMEOUT_SECONDS = 30


def timeout_handler(signum: int, frame: Any) -> None:
    raise TimeoutError("Function execution timed out.")


def print_scores(scores: dict, model_name: str, metric_type: str = "exec") -> None:
    """Print scores."""

    def print_formatted_s(
        row_name: str, l: list[str], element_format: str = "{}", sep: str = "\t"
    ) -> None:
        template = "{}" + sep + sep.join([element_format] * len(l))
        console.print(template.format(row_name, *l))

    # Add empty scores for each level if not present
    for level in LEVELS:
        if level not in scores:
            scores[level] = {}
            scores[level]["count"] = 0
            scores[level]["exec"] = 0
            scores[level]["exact"] = 0

    print_formatted_s("", LEVELS)
    counts = [scores[level]["count"] for level in LEVELS]
    print_formatted_s("count", counts)
    console.print(f">====================== {model_name} =====================")
    if metric_type == "exec":
        console.print(
            ">===================== EXECUTION ACCURACY ====================="
        )
        exec_scores = [scores[level]["exec"] for level in LEVELS]
        print_formatted_s("execution", exec_scores, element_format="{:.3f}")

    elif metric_type == "exact":
        console.print(
            "\n>====================== EXACT MATCHING ACCURACY ====================="
        )
        exact_scores = [scores[level]["exact"] for level in LEVELS]
        print_formatted_s("exact match", exact_scores, element_format="{:.3f}")


def compute_exact_match_metric(
    predictions: list,
    references: list,
    gold_dbs: list,
    kmaps: dict,
    db_dir: str,
    categories,
) -> dict:
    """Compute exact match metric."""
    exact_match = {}
    exact_match["all"] = {}
    exact_match["all"]["count"] = 0
    exact_match["all"]["exact"] = 0
    for prediction, reference, gold_db, category in tqdm(
        zip(predictions, references, gold_dbs, categories), total=len(predictions)
    ):
        if category not in exact_match:
            exact_match[category] = {}
            exact_match[category]["count"] = 0
            exact_match[category]["exact"] = 0
        exact_match["all"]["count"] += 1
        exact_match[category]["count"] += 1
    try:
            match = int(prediction.strip() == reference.strip())
            exact_match[category]["exact"] += match
            exact_match["all"]["exact"] += match
    except Exception:
            pass
    return exact_match


def compute_test_suite_metric(
    predictions: list,
    references: list,
    gold_dbs: list,
    setup_sqls: list,
    validate_sqls: list,
    kmaps: dict,
    db_dir: str,
    categories: list[str] = None,
) -> tuple[Any, list[int | None]]:
    """Compute test suite execution metric."""
    evaluator = test_suite_evaluation.Evaluator(
        db_dir=db_dir,
        kmaps=kmaps,
        etype="exec",
        plug_value=False,
        keep_distinct=False,
        progress_bar_for_each_datapoint=False,
    )
    # Only used for Sparc/CoSQL
    turn_scores: dict[str, list] = {"exec": [], "exact": []}
    by_row_metrics: list[int | None] = []
    for prediction, reference, gold_db, setup_sql, validate_sql, category in tqdm(
        zip(predictions, references, gold_dbs, setup_sqls, validate_sqls, categories),
        total=len(predictions),
    ):
        turn_idx = 0
        # skip final utterance-query pairs
        if turn_idx < 0:
            continue

        # Register the timeout handler function
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(TIMEOUT_SECONDS)

        try:
            ex_metrics = evaluator.evaluate_one(
                gold_db,
                reference,
                prediction,
                setup_sql,
                validate_sql,
                turn_scores,
                idx=turn_idx,
                category=category,
            )
            signal.alarm(0)

            by_row_metrics.append(int(ex_metrics["exec"]))
        except Exception:
            # Record a failed row (e.g., on a query timeout or execution
            # error) instead of aborting the whole evaluation run.
            signal.alarm(0)
            traceback.print_exc()
            by_row_metrics.append(None)
    evaluator.finalize()
    return evaluator.scores, by_row_metrics


def compute_metrics(
    gold_sqls: list[str],
    pred_sqls: list[str],
    gold_dbs: list[str],
    setup_sqls: list[str],
    validate_sqls: list[str],
    kmaps: dict,
    db_schemas: dict,
    database_dir: str,
    lowercase_schema_match: bool,
    model_name: str,
    categories: list[str] = None,
) -> dict[str, str]:
    """Compute all metrics for data slice."""
    if len(gold_sqls) != len(pred_sqls):
        raise ValueError(
            f"Gold {len(gold_sqls)} and pred {len(pred_sqls)} have different number of lines!"
        )
    all_metrics: dict[str, Any] = {}

    # Execution Accuracy
    metrics, by_row_metrics = compute_test_suite_metric(
        pred_sqls,
        gold_sqls,
        gold_dbs,
        setup_sqls,
        validate_sqls,
        kmaps,
        database_dir,
        categories,
    )
    all_metrics["exec"] = metrics
    all_metrics["by_row_exec"] = by_row_metrics
    print_scores(metrics, model_name, "exec")

    # Exact Match Accuracy
    metrics = compute_exact_match_metric(
        pred_sqls, gold_sqls, gold_dbs, kmaps, database_dir, categories
    )
    all_metrics["exact"] = metrics
    print_scores(metrics, model_name, "exact")

    # Equality Accuracy
    per_row_match = [
        int(gold.lower() == pred.lower()) for gold, pred in zip(gold_sqls, pred_sqls)
    ]
    all_metrics["equality"] = {"equality": sum(per_row_match) / len(gold_sqls)}
    all_metrics["by_row_equality"] = per_row_match

    # Edit Distance
    per_row_edit_dist = [
        edit_distance(gold, pred) for gold, pred in zip(gold_sqls, pred_sqls)
    ]
    edit_dist = sum(per_row_edit_dist) / len(gold_sqls)
    all_metrics["edit_distance"] = {"edit_distance": edit_dist}
    all_metrics["by_row_edit_distance"] = per_row_edit_dist

    return all_metrics


def get_to_print(metrics: dict, key: str, model_name: str, num_rows: int) -> dict:
    """Get pretty print dictionary of metrics."""
    return {
        "slice": key,
        "model": model_name,
        "support": num_rows,
        "exec": f"{metrics[key]['exec']['all']['exec']:.3f}",
        "exact": f"{metrics[key]['exact']['all']['exact']:.3f}",
        "equality": f"{metrics[key]['equality']['equality']:.3f}",
        "edit_distance": f"{metrics[key]['edit_distance']['edit_distance']:.3f}",
    }


@click.group()
def cli() -> None:
    """Entrypoint."""
    pass


@cli.command()
@click.option("--gold", type=str, required=True)
@click.option("--pred", type=str, required=True)
@click.option("--tables", type=str, required=True)
@click.option("--db", type=str, default="")
@click.option("--slice-attribute", type=str, default=None)
@click.option("--output-dir", type=str, default="")
@click.option("--output-filename", type=str, default="")
@click.option(
    "--correct-sql-casing", type=bool, is_flag=True, default=False, required=False
)
@click.option(
    "--lowercase-schema-match", type=bool, is_flag=True, default=False, required=False
)
def evaluate(
    gold: str,
    pred: str,
    tables: str,
    db: str,
    slice_attribute: str,
    output_dir: str,
    output_filename: str,
    correct_sql_casing: bool,
    lowercase_schema_match: bool,
) -> None:
    """Evaluate SQL.

    Args:
        gold: path to gold sql file.
        pred: path to predicted json lines file.
        tables: the json path of the table metadata.
        db: path to database dir.
        slice_attribute: json attribute in gold data to slice on.
        output_dir: the prediction output directory
        output_filename: the prediction output filename
        correct_sql_casing: whether to correct casing of SQL keywords
        lowercase_schema_match: whether to lowercase schema match
    """
    gold_path = Path(gold)
    pred_path = Path(pred)
    model_name = pred_path.stem
    if not output_filename:
        output_filename = pred_path.stem + "_eval.json"
    console.print(f"Saving to {Path(output_dir) / output_filename}")
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    kmaps = test_suite_evaluation.build_foreign_key_map_from_json(tables)
    db_schemas = read_tables_json(tables)

    gold_sqls_dict = json.load(gold_path.open("r", encoding="utf-8"))
    pred_sqls_dict = [json.loads(l) for l in pred_path.open("r").readlines()]

    # Data validation
    assert len(gold_sqls_dict) == len(
        pred_sqls_dict
    ), "Sample size doesn't match between pred and gold file"

    # Keep track of everything
    full_results = []
    for gold_sql, pred_sql in zip(gold_sqls_dict, pred_sqls_dict):
        merged_res = {**pred_sql, **gold_sql}
        full_results.append(merged_res)

    gold_sqls = [
        re.sub(r"[\s\t\n]+", " ", p.get("gold", p.get("query", p.get("sql", ""))))
        for p in gold_sqls_dict
    ]
    setup_sqls = [re.sub(r"[\s\t\n]+", " ", p["setup_sql"]) for p in gold_sqls_dict]
    validate_sqls = [
        re.sub(r"[\s\t\n]+", " ", p["validation_sql"]) for p in gold_sqls_dict
    ]
    gold_dbs = [p.get("db_id", p.get("db", "")) for p in gold_sqls_dict]
    pred_sqls = [re.sub(r"[\s\t\n]+", " ", p["pred"]) for p in pred_sqls_dict]
    categories = [p.get("category", "") for p in gold_sqls_dict]
    if correct_sql_casing:
        # Normalize the casing of SQL keywords in both gold and predictions
        gold_sqls = [correct_casing(sql) for sql in gold_sqls]
        pred_sqls = [correct_casing(sql) for sql in pred_sqls]

    final_metrics: dict[str, dict[str, Any]] = {}
    to_print = []
    final_metrics["all"] = compute_metrics(
        gold_sqls=gold_sqls,
        pred_sqls=pred_sqls,
        gold_dbs=gold_dbs,
        setup_sqls=setup_sqls,
        validate_sqls=validate_sqls,
        kmaps=kmaps,
        db_schemas=db_schemas,
        database_dir=db,
        lowercase_schema_match=lowercase_schema_match,
        model_name=model_name + "(all)",
        categories=categories,
    )

    for k, v in final_metrics["all"].items():
        if k.startswith("by_row"):
            assert len(v) == len(gold_sqls)
            for dct, val in zip(full_results, v):
                dct[k[len("by_row_"):]] = val
    to_print.append(get_to_print(final_metrics, "all", model_name, len(gold_sqls)))
    # TODO: could be way more efficient if we subsliced the results but...whatever
    if slice_attribute:
        for unq_value in sorted(set([g[slice_attribute] for g in gold_sqls_dict])):
            idx_set = [
                i
                for i, g in enumerate(gold_sqls_dict)
                if g[slice_attribute] == unq_value
            ]
            print(f"Processing {unq_value} with {len(idx_set)} samples")
            final_metrics[unq_value] = compute_metrics(
                gold_sqls=[gold_sqls[i] for i in idx_set],
                pred_sqls=[pred_sqls[i] for i in idx_set],
                gold_dbs=[gold_dbs[i] for i in idx_set],
                setup_sqls=[setup_sqls[i] for i in idx_set],
                validate_sqls=[validate_sqls[i] for i in idx_set],
                kmaps=kmaps,
                db_schemas=db_schemas,
                database_dir=db,
                lowercase_schema_match=lowercase_schema_match,
                model_name=model_name + f"({unq_value})",
                categories=[categories[i] for i in idx_set],
            )
            to_print.append(
                get_to_print(final_metrics, unq_value, model_name, len(idx_set))
            )

    df = pd.DataFrame(to_print)
    console.print(df.to_csv(sep=",", index=False))
    console.print("******")
    console.print(f"Saved metrics to {Path(output_dir) / output_filename}")
    json.dump(final_metrics, open(Path(output_dir) / output_filename, "w"), indent=4)
    output_filename = str(output_filename).replace("_eval.json", "_fd.jsonl")
    console.print(f"Saved dump to {Path(output_dir) / output_filename}")
    with open(Path(output_dir) / output_filename, "w") as f:
        for dct in full_results:
            f.write(json.dumps(dct) + "\n")


if __name__ == "__main__":
    cli()
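Since `evaluate` is a click command, it can also be driven from Python, e.g. in a test. A minimal sketch (all paths are assumptions):

from click.testing import CliRunner
from evaluate import cli

runner = CliRunner()
result = runner.invoke(cli, [
    "evaluate",
    "--gold", "data/dev.json",
    "--pred", "output/preds.jsonl",
    "--tables", "data/tables.json",
    "--db", "data/databases/",
    "--output-dir", "output/",
])
print(result.output)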
duckdb-nsql/eval/get_manifest.py
ADDED
@@ -0,0 +1,29 @@
"""Manifest utils."""
from manifest import Manifest
from manifest.connections.client_pool import ClientConnection


def get_manifest(
    manifest_client: str,
    manifest_connection: str,
    manifest_engine: str,
) -> Manifest:
    """Get manifest engine."""
    if manifest_client in {
        "openai",
        "openaichat",
        "openai_mock",
        "openrouter",
        "azureendpoint",
    }:
        manifest = Manifest(
            client_name=manifest_client,
            engine=manifest_engine,
        )
    elif manifest_client in {"huggingface"}:
        manifest = Manifest(
            client_pool=[
                ClientConnection(
                    client_name=manifest_client,
                    client_connection=manifest_conn,
                )
                for manifest_conn in manifest_connection.split(";")
            ],
        )
    else:
        raise ValueError(f"Unknown manifest client {manifest_client}")
    return manifest
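A hypothetical call (the engine name is an assumption; the connection string is only consulted on the `huggingface` path above):

from get_manifest import get_manifest

manifest = get_manifest(
    manifest_client="openaichat",
    manifest_connection="",
    manifest_engine="gpt-4o",
)
# Manifest exposes run() for a single prompt completion.
print(manifest.run("SELECT 1;", max_tokens=10))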
duckdb-nsql/eval/loaders.py
ADDED
@@ -0,0 +1,78 @@
"""Data loaders."""
import json
import re
import string
from abc import ABC, abstractmethod

from rich.console import Console
from data_utils import read_tables_json
from schema import Table

RE_COLUMN = re.compile(r"^select (.+?) from")
RE_CONDS = re.compile(r"where (.+?)$")
RE_COND = re.compile(r"^(.+?)\s*([=><])\s*(.+?)$")

translator = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)  # map punctuation to space

console = Console(soft_wrap=True)


def standardize_column(col: str) -> str:
    """Standardize the column name to SQL compatible."""
    col_name = col.replace("#", "num").replace("%", "perc")
    col_name = col_name.strip().lower().translate(translator)
    col_name = re.sub("[^0-9a-z ]", " ", col_name).strip()
    col_name = re.sub(" +", "_", col_name)
    if not col_name:
        console.print(f"original {col}, new {col_name}")
    return col_name


def clean_col(col: str) -> str:
    """Remove table name and standardize column name."""
    if "." in col and not col.endswith("."):
        col = col.split(".")[-1]
    return standardize_column(col)


class Loader(ABC):
    """Loader abstract class."""

    @classmethod
    @abstractmethod
    def load_data(cls, path: str) -> list[dict]:
        """Load data from path."""

    @classmethod
    @abstractmethod
    def load_table_metadata(cls, path: str) -> dict[str, dict[str, Table]]:
        """Extract table metadata from table-metadata-path."""

    @classmethod
    def format_output(cls, prediction: dict) -> dict:
        """Parse for spider format."""
        return prediction


class DefaultLoader(Loader):
    """Spider loader and writer."""

    @classmethod
    def load_data(cls, path: str) -> list[dict]:
        """Load data from path."""
        try:
            with open(path) as f:
                data = json.loads(f.read())
        except json.decoder.JSONDecodeError:
            # Try with jsonl
            data = [json.loads(line) for line in open(path)]
        return data

    @classmethod
    def load_table_metadata(cls, path: str) -> dict[str, dict[str, Table]]:
        """Extract table metadata from table-metadata-path."""
        # load the tables
        db_to_tables = read_tables_json(path, lowercase=True)
        return db_to_tables
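Quick checks of the column-name normalization above (inputs are made up):

from loaders import clean_col, standardize_column

print(standardize_column("Win %"))       # win_perc
print(standardize_column("# of Games"))  # num_of_games
print(clean_col("t1.Player Name"))       # player_name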
duckdb-nsql/eval/metric_utils.py
ADDED
@@ -0,0 +1,40 @@
"""Utility metrics."""
import sqlglot
from rich.console import Console
from sqlglot import parse_one

console = Console(soft_wrap=True)


def correct_casing(sql: str) -> str:
    """Correct casing of SQL."""
    parse: sqlglot.expressions.Expression = parse_one(sql, read="sqlite")
    return parse.sql()


def prec_recall_f1(gold: set, pred: set) -> dict[str, float]:
    """Compute precision, recall and F1 score."""
    prec = len(gold.intersection(pred)) / len(pred) if pred else 0.0
    recall = len(gold.intersection(pred)) / len(gold) if gold else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall else 0.0
    return {"prec": prec, "recall": recall, "f1": f1}


def edit_distance(s1: str, s2: str) -> int:
    """Compute edit distance between two strings."""
    # Make sure s1 is the shorter string
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances: list[int] = list(range(len(s1) + 1))
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(
                    1 + min((distances[i1], distances[i1 + 1], distances_[-1]))
                )
        distances = distances_
    return distances[-1]
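For reference, a few spot checks of the helpers above:

from metric_utils import correct_casing, edit_distance, prec_recall_f1

print(edit_distance("kitten", "sitting"))      # 3
print(prec_recall_f1({"a", "b"}, {"b", "c"}))  # {'prec': 0.5, 'recall': 0.5, 'f1': 0.5}
print(correct_casing("select * from t where x = 1"))  # SELECT * FROM t WHERE x = 1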
duckdb-nsql/eval/metrics/test_suite_sql_eval
ADDED
@@ -0,0 +1 @@
Subproject commit 640a12975abf75a94e917caca149d56dbc6bcdd7
duckdb-nsql/eval/predict.py
ADDED
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Run dataset on text2sql zazu experiment.
|
2 |
+
|
3 |
+
See README.md for more details.
|
4 |
+
"""
|
5 |
+
import datetime
|
6 |
+
import json
|
7 |
+
import multiprocessing
|
8 |
+
import random
|
9 |
+
import re
|
10 |
+
from pathlib import Path
|
11 |
+
|
12 |
+
import click
|
13 |
+
import numpy as np
|
14 |
+
from constants import PROMPT_FORMATTERS
|
15 |
+
from loaders import DefaultLoader
|
16 |
+
from get_manifest import get_manifest
|
17 |
+
from manifest import Manifest
|
18 |
+
from prompt_formatters import RajkumarFormatter
|
19 |
+
from rich.console import Console
|
20 |
+
from schema import Table, TextToSQLModelResponse, TextToSQLParams
|
21 |
+
from text_to_sql import instruction_to_sql, instruction_to_sql_list
|
22 |
+
from doc_retriever import (
|
23 |
+
load_documentation,
|
24 |
+
split_documents,
|
25 |
+
embed_documents,
|
26 |
+
query_docs,
|
27 |
+
)
|
28 |
+
from tqdm import tqdm
|
29 |
+
from transformers import AutoTokenizer
|
30 |
+
|
31 |
+
console = Console(soft_wrap=True)
|
32 |
+
|
33 |
+
|
34 |
+
def generate_sql(
|
35 |
+
manifest: Manifest,
|
36 |
+
text_to_sql_in: list[TextToSQLParams],
|
37 |
+
retrieved_docs: list[list[str]],
|
38 |
+
prompt_formatter: RajkumarFormatter,
|
39 |
+
stop_tokens: list[str] | None = None,
|
40 |
+
overwrite_manifest: bool = False,
|
41 |
+
max_tokens: int = 300,
|
42 |
+
temperature: float = 0.1,
|
43 |
+
num_beams: int = 2,
|
44 |
+
parallel: bool = False,
|
45 |
+
) -> list[tuple[str, TextToSQLModelResponse]]:
|
46 |
+
"""Call our text2sql function with manifest of our choice."""
|
47 |
+
if parallel:
|
48 |
+
instruction_to_sql_resps: list[
|
49 |
+
TextToSQLModelResponse
|
50 |
+
] = instruction_to_sql_list(
|
51 |
+
params=text_to_sql_in,
|
52 |
+
extra_context=retrieved_docs,
|
53 |
+
manifest=manifest,
|
54 |
+
prompt_formatter=prompt_formatter,
|
55 |
+
overwrite_manifest=overwrite_manifest,
|
56 |
+
max_tokens=max_tokens,
|
57 |
+
temperature=0.1,
|
58 |
+
stop_sequences=stop_tokens,
|
59 |
+
num_beams=num_beams,
|
60 |
+
)
|
61 |
+
else:
|
62 |
+
instruction_to_sql_resps = [
|
63 |
+
instruction_to_sql(
|
64 |
+
params=_text_to_sql_in,
|
65 |
+
extra_context=_retrieved_docs,
|
66 |
+
manifest=manifest,
|
67 |
+
prompt_formatter=prompt_formatter,
|
68 |
+
overwrite_manifest=overwrite_manifest,
|
69 |
+
max_tokens=max_tokens,
|
70 |
+
temperature=temperature,
|
71 |
+
stop_sequences=stop_tokens,
|
72 |
+
num_beams=num_beams,
|
73 |
+
)
|
74 |
+
for _retrieved_docs, _text_to_sql_in in tqdm(
|
75 |
+
zip(retrieved_docs, text_to_sql_in),
|
76 |
+
desc="Generating SQL",
|
77 |
+
total=len(text_to_sql_in),
|
78 |
+
disable=(len(text_to_sql_in) <= 1),
|
79 |
+
)
|
80 |
+
]
|
81 |
+
assert len(instruction_to_sql_resps) == len(text_to_sql_in)
|
82 |
+
|
83 |
+
sql_statements = []
|
84 |
+
for i in range(len(instruction_to_sql_resps)):
|
85 |
+
sql_statement = instruction_to_sql_resps[i].output.strip()
|
86 |
+
if "<>" in sql_statement:
|
87 |
+
sql_statement.replace("<>", "!=")
|
88 |
+
# Models sometime train to predict <databasename/schema> | <sql>
|
89 |
+
sql_statement = sql_statement.split("|")[-1].strip()
|
90 |
+
sql_statements.append(sql_statement)
|
91 |
+
return list(zip(sql_statements, instruction_to_sql_resps))
|
92 |
+
|
93 |
+
|
94 |
+
def get_text_to_sql_in(
|
95 |
+
input_question: dict, db_to_tables: dict[str, dict[str, Table]]
|
96 |
+
) -> TextToSQLParams:
|
97 |
+
"""Format input question for text2sql function."""
|
98 |
+
question = input_question["question"]
|
99 |
+
db_id = input_question.get("db_id", None)
|
100 |
+
if db_id != "none":
|
101 |
+
table_params = list(db_to_tables.get(db_id, {}).values())
|
102 |
+
else:
|
103 |
+
table_params = []
|
104 |
+
if len(table_params) == 0:
|
105 |
+
console.print(f"[red] WARNING: No tables found for {db_id} [/red]")
|
106 |
+
text_to_sql_in = TextToSQLParams(
|
107 |
+
instruction=question,
|
108 |
+
database=db_id,
|
109 |
+
tables=table_params,
|
110 |
+
)
|
111 |
+
return text_to_sql_in
|
112 |
+
|
113 |
+
|
114 |
+
@click.group()
|
115 |
+
def cli() -> None:
|
116 |
+
"""Entrypoint."""
|
117 |
+
pass
|
118 |
+
|
119 |
+
|
120 |
+
@cli.command()
|
121 |
+
@click.argument("dataset-path")
|
122 |
+
@click.argument("table-meta-path")
|
123 |
+
@click.option("--output-dir", type=str, default="")
|
124 |
+
@click.option("--run-name", type=str, default="")
|
125 |
+
@click.option("--num-run", type=int, default=-1)
|
126 |
+
@click.option("--num-print", type=int, default=20)
|
127 |
+
# Format options
|
128 |
+
@click.option("--prompt-format", type=str, default="spider")
|
129 |
+
# Prompt options
|
130 |
+
@click.option("--stop-tokens", type=str, default=[], multiple=True)
|
131 |
+
@click.option("--max-tokens", type=int, default=200)
|
132 |
+
@click.option("--temperature", type=float, default=0)
|
133 |
+
@click.option("--num-beams", type=int, default=-1) # use whatever is in manifest
|
134 |
+
@click.option("--max-context-length", type=int, default=-1)
|
135 |
+
# Docs options
|
136 |
+
@click.option(
|
137 |
+
"--markdown-docs-path",
|
138 |
+
#type=click.Path(
|
139 |
+
# exists=True, file_okay=True, dir_okay=True, readable=True, path_type=Path
|
140 |
+
#),
|
141 |
+
default="eval/docs/duckdb-web/docs/archive/0.9.2/sql",
|
142 |
+
)
|
143 |
+
@click.option("--num-retrieved-docs", type=int, default=0)
|
144 |
+
# Manifest options
|
145 |
+
@click.option("--manifest-client", type=str, default="openai")
|
146 |
+
@click.option("--manifest-engine", type=str, default="gpt-4o")
|
147 |
+
@click.option("--manifest-connection", type=str, default="http://localhost:5005")
|
148 |
+
@click.option("--overwrite-manifest", is_flag=True, default=False)
|
149 |
+
@click.option("--parallel", is_flag=True, default=False)
|
150 |
+
def predict(
|
151 |
+
dataset_path: str,
|
152 |
+
table_meta_path: str,
|
153 |
+
output_dir: str,
|
154 |
+
run_name: str,
|
155 |
+
num_run: int,
|
156 |
+
num_print: int,
|
157 |
+
prompt_format: str,
|
158 |
+
stop_tokens: list[str],
|
159 |
+
max_tokens: int,
|
160 |
+
temperature: float,
|
161 |
+
num_beams: int,
|
162 |
+
max_context_length: int,
|
163 |
+
markdown_docs_path: Path,
|
164 |
+
num_retrieved_docs: int,
|
165 |
+
manifest_client: str,
|
166 |
+
manifest_engine: str,
|
167 |
+
manifest_connection: str,
|
168 |
+
overwrite_manifest: bool,
|
169 |
+
parallel: bool,
|
170 |
+
) -> None:
|
171 |
+
"""Predict SQL.
|
172 |
+
|
173 |
+
Args:
|
174 |
+
dataset_path: the dataset path.
|
175 |
+
table_meta_path: the json path of the table metadata.
|
176 |
+
database_path: the database path for sqlite.
|
177 |
+
output_dir: the prediction output directory
|
178 |
+
run_name: special prefix to add to filename
|
179 |
+
num_run: the number of examples to run
|
180 |
+
num_print: the number of examples to print
|
181 |
+
prompt_format: the format of the prompt. E.g., "rajkumar"
|
182 |
+
stop_tokens: the stop tokens to try
|
183 |
+
max_tokens: the max tokens
|
184 |
+
temperature: the temperature
|
185 |
+
num_beams: the number of beams
|
186 |
+
max_context_length: max context length for demonstration truncation (-1 means None)
|
187 |
+
markdown_docs_path: path to duckdb sql docs
|
188 |
+
num_retrieved_docs: number of docs to retrieve
|
189 |
+
manifest_client: the manifest client
|
190 |
+
manifest_engine: the manifest engine
|
191 |
+
manifest_connection: the manifest connection
|
192 |
+
"""
|
193 |
+
multiprocessing.set_start_method("spawn", force=True)
|
194 |
+
random.seed(0)
|
195 |
+
np.random.seed(0)
|
196 |
+
locals_dict = locals()
|
197 |
+
locals_dict["markdown_docs_path"] = str(markdown_docs_path)
|
198 |
+
console.print(json.dumps(locals_dict, indent=2))
|
199 |
+
|
200 |
+
data_formatter = DefaultLoader()
|
201 |
+
|
202 |
+
if prompt_format not in PROMPT_FORMATTERS:
|
203 |
+
raise ValueError(f"Unknown prompt format {prompt_format}")
|
204 |
+
prompt_formatter = PROMPT_FORMATTERS[prompt_format]()
|
205 |
+
|
206 |
+
# load manifest
|
207 |
+
manifest = get_manifest(
|
208 |
+
manifest_client=manifest_client,
|
209 |
+
manifest_connection=manifest_connection,
|
210 |
+
manifest_engine=manifest_engine,
|
211 |
+
)
|
212 |
+
manifest_params = manifest.client_pool.get_current_client().get_model_params()
|
213 |
+
console.print(f"Running with {manifest_params} manifest.")
|
214 |
+
model_name = manifest_params.get("engine", manifest_params["model_name"])
|
215 |
+
|
216 |
+
if manifest_client in {"openai", "openaichat", "openrouter", "azureendpoint"}:
|
217 |
+
tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
|
218 |
+
else:
|
219 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
220 |
+
|
221 |
+
if stop_tokens:
|
222 |
+
stop_tokens = [st.strip("'") for st in stop_tokens]
|
223 |
+
console.print(f"Stop tokens: {stop_tokens}")
|
224 |
+
|
225 |
+
# Get output filename
|
226 |
+
full_dataset_path = Path(dataset_path)
|
227 |
+
# Get todays date
|
228 |
+
date_today = datetime.datetime.now().strftime("%y-%m-%d")
|
229 |
+
if run_name:
|
230 |
+
run_name = f"{run_name}_"
|
231 |
+
suffix = f"{run_name}{full_dataset_path.stem}_{date_today}.json" # noqa: E501
|
232 |
+
prefix = f"{prompt_format}_{num_retrieved_docs}docs"
|
233 |
+
if manifest_client in {"openai", "openaiazure"}:
|
234 |
+
middleix = manifest_engine
|
235 |
+
elif manifest_client in {"huggingface", "ray"}:
|
236 |
+
middleix = Path(manifest_params.get("model_path", "")).name.replace("/", "-")
|
237 |
+
elif manifest_client in {"toma", "openrouter", "openaichat", "azureendpoint"}:
|
238 |
+
middleix = manifest_engine.split("/")[-1]
|
239 |
+
else:
|
240 |
+
raise ValueError(f"Unknown manifest client {manifest_client}")
|
241 |
+
output_filename = f"{prefix}_{middleix}_{suffix}"
|
242 |
+
console.print(f"Saving to {Path(output_dir) / output_filename}")
|
243 |
+
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
244 |
+
|
    console.print("Loading metadata...")
    db_to_tables = data_formatter.load_table_metadata(table_meta_path)

    console.print("Loading data...")
    data = data_formatter.load_data(dataset_path)
    if num_run > 0:
        console.print(f"Running on {min(len(data), num_run)} examples")
        data = data[:num_run]
    original_data = data

    # Format the examples
    console.print("Formatting data...")
    num_print = min(num_print, len(data))
    token_lengths = []
    text_to_sql_in = [
        get_text_to_sql_in(input_question, db_to_tables) for input_question in data
    ]

    if num_retrieved_docs > 0:
        console.print("Loading documentation and indexing...")
        retrieved_docs = []
        doc_contents = load_documentation(markdown_docs_path)
        chunked_docs = split_documents(doc_contents)
        embedded_docs, full_embedding_mat = embed_documents(chunked_docs)
        for i in tqdm(range(len(text_to_sql_in)), desc="Retrieving docs"):
            _, retrieved_docs_strings = query_docs(
                text_to_sql_in[i].instruction,
                embedded_docs,
                full_embedding_mat,
                top_n=num_retrieved_docs,
            )
            retrieved_docs.append(retrieved_docs_strings)
    else:
        retrieved_docs = [[] for _ in range(len(text_to_sql_in))]

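    # The retrieval above is a standard embed-and-rank pass: the markdown docs
    # are chunked and embedded once, then each question's instruction is scored
    # against the chunk embeddings and the top `num_retrieved_docs` chunks are
    # handed to the prompt formatter as extra context.
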
    for i in range(num_print):
        # Run a few to get some examples to print
        generated_responses = generate_sql(
            manifest=manifest,
            text_to_sql_in=[text_to_sql_in[i]],
            retrieved_docs=[retrieved_docs[i]],
            stop_tokens=stop_tokens,
            max_tokens=max_tokens,
            temperature=temperature,
            num_beams=num_beams,
            prompt_formatter=prompt_formatter,
            overwrite_manifest=overwrite_manifest,
            parallel=parallel,
        )
        for prediction, model_response in generated_responses:
            prediction = re.sub(r"[\s\t\n]+", " ", prediction)
            token_lengths.append(len(tokenizer(prediction).input_ids))
            console.print(f"[blue]Prompt:[/blue] {model_response.final_prompt}")
            console.print(f"[red]Prediction:[/red] {prediction}")
            if data[i].get("query") or data[i].get("sql"):
                console.print(
                    "[purple]Gold:[/purple] "
                    f"{data[i].get('query') or data[i].get('sql')}"
                )
            console.print("\n****\n")

    # Run the entire set now - the examples printed above are already in the
    # cache, so they are fast
    generated_sqls = generate_sql(
        manifest=manifest,
        text_to_sql_in=text_to_sql_in,
        retrieved_docs=retrieved_docs,
        stop_tokens=stop_tokens,
        max_tokens=max_tokens,
        temperature=temperature,
        num_beams=num_beams,
        prompt_formatter=prompt_formatter,
        overwrite_manifest=overwrite_manifest,
        parallel=parallel,
    )

    with open(Path(output_dir) / output_filename, "w") as fout:
        for i, (prediction, model_response) in enumerate(generated_sqls):
            if isinstance(model_response.final_prompt, str):
                token_lengths.append(
                    len(tokenizer(model_response.final_prompt).input_ids)
                )
            else:
                for prompt in model_response.final_prompt:
                    token_lengths.append(len(tokenizer(prompt["content"]).input_ids))
            entry = {
                **original_data[i],
                "pred": prediction,
                "raw_pred": model_response.output,
                "raw_output": model_response.raw_output,
                "prompt": model_response.final_prompt,
                "tables": [tbl.dict() for tbl in text_to_sql_in[i].tables or []],
            }
            formatted_entry = data_formatter.format_output(entry)
            print(json.dumps(formatted_entry), file=fout)
    overflow = len([tl for tl in token_lengths if tl > 2048]) / len(token_lengths)
    console.print(f"Overflow 2048 prompt {100*overflow:.2f}%")
    console.print(f"Saved to {Path(output_dir) / output_filename}")


if __name__ == "__main__":
    cli()
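
# Each line of the output file is a JSON record: the original example fields
# plus the fields added above. Illustrative shape (field names of the source
# example are hypothetical):
#   {"question": "...", "db_id": "laptop", "pred": "SELECT ...;",
#    "raw_pred": "...", "raw_output": "...", "prompt": "...", "tables": [...]}
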
duckdb-nsql/eval/prompt_formatters.py
ADDED
@@ -0,0 +1,1056 @@
"""Rajkumar prompt formatter."""

from random import shuffle
from manifest import Manifest
from schema import Table
import re


class RajkumarFormatter:
    """RajkumarFormatter class.

    From https://arxiv.org/pdf/2204.00498.pdf.
    """

    table_sep: str = "\n\n"
    shuffle_table_order: bool = True
    _cache: dict[tuple[str, str, str], list[str]] = {}
    clean_whitespace = False

    @classmethod
    def format_table(cls, table: Table) -> str:
        """Get table format."""
        table_fmt = []
        for col in table.columns or []:
            # This is technically an incorrect type, but it should be a catchall word
            table_fmt.append(f" {col.name} {col.dtype or 'any'}")
        if table_fmt:
            all_cols = ",\n".join(table_fmt)
            create_tbl = f"CREATE TABLE {table.name} (\n{all_cols}\n)"
        else:
            create_tbl = f"CREATE TABLE {table.name}"
        return create_tbl

    @classmethod
    def format_all_tables(cls, tables: list[Table], instruction: str) -> list[str]:
        """Get all tables format."""
        table_texts = [cls.format_table(table) for table in tables]
        key = ("tables", instruction, str(tables))
        if key not in cls._cache:
            shuffle(table_texts)
            cls._cache[key] = table_texts
        else:
            table_texts = cls._cache[key]
        return table_texts

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n\n/*\nHere is additional documentation about DuckDB that could be useful.\n--------\n{context_str}\n--------\n*/"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n"""  # noqa: E501

    @classmethod
    def format_model_output(cls, output_sql: str, prompt: str) -> str:
        """Format model output."""
        clean_sql = (output_sql
                     .replace('```sql\n', '')
                     .replace('```duckdb\n', '')
                     .replace('```\n', '')
                     .replace('```', '')).strip()

        # Keep only the first statement so trailing model chatter is dropped;
        # the slice must be assigned back to clean_sql to take effect.
        if clean_sql.find(';') != -1:
            clean_sql = clean_sql[:clean_sql.find(';')].strip()

        if not clean_sql.endswith(";"):
            clean_sql += ";"

        return clean_sql

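    # Illustrative behavior of format_model_output (hypothetical model output):
    #   format_model_output("```sql\nSELECT 1;\n```", prompt="")  ->  "SELECT 1;"
    # Code fences are stripped, the text is cut at the first semicolon, and a
    # trailing semicolon is re-appended.
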
    @classmethod
    def format_gold_output(cls, output_sql: str) -> str:
        """Format gold output for demonstration."""
        return output_sql

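# Usage sketch for the formatter above (illustrative; assumes the Table and
# TableColumn models defined in schema.py):
#
#   tbl = Table(name="taxi", columns=[TableColumn(name="fare", dtype="DOUBLE")])
#   print(RajkumarFormatter.format_table(tbl))
#   # CREATE TABLE taxi (
#   #  fare DOUBLE
#   # )
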

class MotherDuckFormatter(RajkumarFormatter):
    """MotherDuck class."""

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n```sql\n"""  # noqa: E501


class DuckDBFormatter(RajkumarFormatter):
    """DuckDB class."""

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n```sql\n"""  # noqa: E501


class DuckDBInstFormatter(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """### Instruction:\n{instruction}\n\n### Input:\n{input}{context}\n### Question:\n{question}\n\n### Response (use duckdb shorthand if possible):\n"""
    INSTRUCTION_TEMPLATE = """Your task is to generate valid duckdb SQL to answer the following question{has_schema}"""  # noqa: E501

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        input = ""
        if table_text:
            input = """Here is the database schema that the SQL query will run on:\n{schema}\n""".format(  # noqa: E501
                schema=table_text
            )
        instruction = cls.PROMPT_TEMPLATE.format(
            instruction=cls.INSTRUCTION_TEMPLATE.format(
                has_schema="."
                if table_text == ""
                else ", given a duckdb database schema."
            ),
            context=context_text,
            input=input,
            question=instruction,
        )
        return instruction

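# Illustrative render of DuckDBInstFormatter.format_prompt for a one-table
# schema and the question "how many rows are there?" (whitespace abbreviated):
#
#   ### Instruction:
#   Your task is to generate valid duckdb SQL to answer the following
#   question, given a duckdb database schema.
#
#   ### Input:
#   Here is the database schema that the SQL query will run on:
#   CREATE TABLE t (
#    n any
#   )
#
#   ### Question:
#   how many rows are there?
#
#   ### Response (use duckdb shorthand if possible):
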

class DuckDBInstFormatterLlamaShort(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.

Here are some DuckDB SQL syntax specifics you should be aware of:

- DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
- DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
- DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
- DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
- DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
- DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name syntax`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
- DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
- DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
- DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
- DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
- DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. E.g. `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
- DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
- DuckDB has an intuitive syntax to create List/Struct/Map and Array types. Create complex types using intuitive syntax. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
- DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
- DuckDB's way of converting between text and timestamps, and extracting date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
- Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
- DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
- DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
- DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
- DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
- DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`<|eot_id|>
<|start_header_id|>user<|end_header_id|>

Database Schema:
Here is the schema of the DuckDB database that the SQL query will run on:
```sql
{schema}
```

Question:
Here is the question or an instruction the user provided:
{question}

Task:
Write a DuckDB SQL query for the given question!<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

```sql
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstFormatterLlamaBasic(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.<|eot_id|>
<|start_header_id|>user<|end_header_id|>

Database Schema:
Here is the schema of the DuckDB database that the SQL query will run on:
```sql
{schema}
```

Question:
Here is the question or an instruction the user provided:
{question}

Task:
Write a DuckDB SQL query for the given question!<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

```sql
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstFormatterLlamaSyntax(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.

Here are some DuckDB SQL syntax specifics you should be aware of:

- DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
- DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
- DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
- DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
- DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
- DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name syntax`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
- DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
- DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
- DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
- DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
- DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. E.g. `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
- DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
- DuckDB has an intuitive syntax to create List/Struct/Map and Array types. Create complex types using intuitive syntax. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
- DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
- DuckDB's way of converting between text and timestamps, and extracting date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
- Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
- DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
- DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
- DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
- DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
- DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`<|eot_id|>
<|start_header_id|>user<|end_header_id|>

Database Schema:
Here is the schema of the DuckDB database that the SQL query will run on:
```sql
{schema}
```

Question:
Here is the question or an instruction the user provided:
{question}

Task:
Write a DuckDB SQL query for the given question!<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>

```sql
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstFormatterGraniteShort(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """System:
Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.

Here are some DuckDB SQL syntax specifics you should be aware of:

- DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
- DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
- DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
- DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
- DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
- DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name syntax`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
- DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
- DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
- DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
- DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
- DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. E.g. `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
- DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
- DuckDB has an intuitive syntax to create List/Struct/Map and Array types. Create complex types using intuitive syntax. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
- DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
- DuckDB's way of converting between text and timestamps, and extracting date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
- Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
- DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
- DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
- DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
- DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
- DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`

Here is the schema of the DuckDB database that the SQL query will run on:
{schema}

Question:
Here is the question or an instruction the user provided:
{question}

Write a DuckDB SQL query for the given question!

Answer:
```
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction

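# Note: the *Short and *Basic variants above differ only in their chat
# scaffolding (Llama 3 special tokens vs. Granite's System:/Answer: markers)
# and in whether the DuckDB syntax cheat sheet is included; the schema and
# question slots are filled identically by format_prompt.
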

class DuckDBInstFormatterLlama(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """<|begin_of_text|>

Your task is to generate valid DuckDB SQL to answer the following question, given a DuckDB database schema.

## DuckDB SQL syntax specifics you should be aware of:

### Case Insensitivity and Quoting:

Identifiers (tables, columns): Case-insensitive, but DuckDB remembers the case you use. Use double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity.
```
CREATE TABLE "My Table" ("column_name" VARCHAR); -- Spaces and mixed case
SELECT "column_name" FROM "My Table";
```

### String Literals: Always use single quotes (') to define string literals.
```
SELECT 'This is a string' AS text;
```

### Direct File Querying: Query CSV, Parquet, and JSON files directly without loading them first.

```
SELECT * FROM 'data.csv';
SELECT * FROM 'data.parquet';
SELECT * FROM 'data.json';
```

### CREATE TABLE AS (CTAS): Create tables from query results.

```
CREATE TABLE squares AS SELECT i, i * i AS square FROM generate_series(1, 10) t(i);
```

### FROM-First Syntax (Optional SELECT): Start queries with FROM, and optionally omit SELECT *.

```
FROM my_table WHERE condition; -- Equivalent to SELECT * FROM my_table WHERE condition
```

### SELECT without FROM: DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly.

```
SELECT 1 + 1 AS result;
```

### GROUP BY ALL/ORDER BY ALL: Shorthand for grouping/ordering by all non-aggregated/all columns.

```
SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;
SELECT * FROM my_table ORDER BY ALL;
```

### SELECT COLUMNS(): Powerful way to select or transform multiple columns using patterns or functions.

```
-- Select columns matching a pattern
SELECT COLUMNS('sales_.*') FROM sales_data;

-- Transform multiple columns with a function
SELECT AVG(COLUMNS(*)) FROM sales_data;
```

### UNION BY NAME: Combine tables by matching column names, not just their positions.

```
SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;
```

### Implicit/Explicit Casting: DuckDB is generally more lenient with implicit type conversions, but you can always be explicit using ::

```
SELECT '42' + 1; -- Implicit cast, result is 43
SELECT '42'::INTEGER + 1; -- Explicit cast, result is 43
```

### String/List Slicing: Extract parts of strings and lists using [start:end] or [start:end:step] syntax.

```
SELECT 'DuckDB'[1:4]; -- Returns 'Duck'
SELECT [1, 2, 3, 4][1:3]; -- Returns [1, 2, 3]
```

### Simple List/Struct/Map/Array Creation: Create complex types using intuitive syntax.

In a SELECT statement:
```
SELECT [1, 2, 3] AS my_list, {{'a': 1, 'b': 'text'}} AS my_struct, MAP([1,2],['one','two']) as my_map;
```

When creating a table:
```
CREATE TABLE data (
    my_list INTEGER[],
    my_struct STRUCT(a INTEGER, b TEXT),
    my_map MAP(INTEGER, VARCHAR),
    my_array INTEGER[3]
);
```

### Timestamp Conversions and Extraction: Convert between text and timestamps, and extract date parts.

```
SELECT strftime(NOW(), '%Y-%m-%d'); -- Current date as 'YYYY-MM-DD'
SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP; -- String to timestamp
SELECT EXTRACT(YEAR FROM DATE '2023-07-23'); -- Extract year
```

### Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses.

```
SELECT a + b AS total
FROM my_table
WHERE total > 10
GROUP BY total
HAVING total < 20;
```

### List Comprehensions: Generate lists using expressions similar to Python list comprehensions.

```
SELECT [x*2 FOR x IN [1, 2, 3]]; -- Returns [2, 4, 6]
```

### Function Chaining: Chain multiple function calls together using the dot (.) operator.

```
SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB'
```

### Regular Expressions: DuckDB has built-in functions for regex matching, replacement, and extraction.

```
SELECT regexp_matches('DuckDB', 'Duck'); -- Returns true
SELECT regexp_replace('DuckDB', 'Duck', 'Goose'); -- Returns 'GooseDB'
SELECT regexp_extract('DuckDB', '(\w+)(DB)', 1); -- Returns 'Duck'
```

### Sampling: Quickly get a subset of your data with SAMPLE or TABLESAMPLE.

```
SELECT * FROM large_table USING SAMPLE 10%; -- Random 10% sample
SELECT * FROM large_table TABLESAMPLE BERNOULLI(10); -- Bernoulli sampling
```

### ATTACH and Access: Attach external databases and reference their objects using databasename.table_name syntax.

```
ATTACH 'my_database.duckdb' AS mydb;
SELECT * FROM mydb.my_table;
```

### SUMMARIZE: Get summary statistics (min, max, unique count, average, standard deviation, quartiles, and count) of a table.

```
SUMMARIZE table_name;
```

### DESCRIBE: Get schema of a table (column_name, column_type, null, key, default, extra).

```
DESCRIBE table_name;
```

Database Schema:
Here is the schema of the DuckDB database that the SQL query will run on:
{schema}

Question:
Here is the question or an instruction the user provided:
{question}

Task:
Write a DuckDB SQL query for the given question!

Here is the valid DuckDB SQL query:
```
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstFormatterGranite(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """System:

Your task is to generate valid DuckDB SQL to answer the following question, given a DuckDB database schema.

## DuckDB SQL syntax specifics you should be aware of:

### Case Insensitivity and Quoting:

Identifiers (tables, columns): Case-insensitive, but DuckDB remembers the case you use. Use double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity.
```
CREATE TABLE "My Table" ("column_name" VARCHAR); -- Spaces and mixed case
SELECT "column_name" FROM "My Table";
```

### String Literals: Always use single quotes (') to define string literals.
```
SELECT 'This is a string' AS text;
```

### Direct File Querying: Query CSV, Parquet, and JSON files directly without loading them first.

```
SELECT * FROM 'data.csv';
SELECT * FROM 'data.parquet';
SELECT * FROM 'data.json';
```

### CREATE TABLE AS (CTAS): Create tables from query results.

```
CREATE TABLE squares AS SELECT i, i * i AS square FROM generate_series(1, 10) t(i);
```

### FROM-First Syntax (Optional SELECT): Start queries with FROM, and optionally omit SELECT *.

```
FROM my_table WHERE condition; -- Equivalent to SELECT * FROM my_table WHERE condition
```

### SELECT without FROM: DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly.

```
SELECT 1 + 1 AS result;
```

### GROUP BY ALL/ORDER BY ALL: Shorthand for grouping/ordering by all non-aggregated/all columns.

```
SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;
SELECT * FROM my_table ORDER BY ALL;
```

### SELECT COLUMNS(): Powerful way to select or transform multiple columns using patterns or functions.

```
-- Select columns matching a pattern
SELECT COLUMNS('sales_.*') FROM sales_data;

-- Transform multiple columns with a function
SELECT AVG(COLUMNS(*)) FROM sales_data;
```

### UNION BY NAME: Combine tables by matching column names, not just their positions.

```
SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;
```

### Implicit/Explicit Casting: DuckDB is generally more lenient with implicit type conversions, but you can always be explicit using ::

```
SELECT '42' + 1; -- Implicit cast, result is 43
SELECT '42'::INTEGER + 1; -- Explicit cast, result is 43
```

### String/List Slicing: Extract parts of strings and lists using [start:end] or [start:end:step] syntax.

```
SELECT 'DuckDB'[1:4]; -- Returns 'Duck'
SELECT [1, 2, 3, 4][1:3]; -- Returns [1, 2, 3]
```

### Simple List/Struct/Map/Array Creation: Create complex types using intuitive syntax.

In a SELECT statement:
```
SELECT [1, 2, 3] AS my_list, {{'a': 1, 'b': 'text'}} AS my_struct, MAP([1,2],['one','two']) as my_map;
```

When creating a table:
```
CREATE TABLE data (
    my_list INTEGER[],
    my_struct STRUCT(a INTEGER, b TEXT),
    my_map MAP(INTEGER, VARCHAR),
    my_array INTEGER[3]
);
```

### Timestamp Conversions and Extraction: Convert between text and timestamps, and extract date parts.

```
SELECT strftime(NOW(), '%Y-%m-%d'); -- Current date as 'YYYY-MM-DD'
SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP; -- String to timestamp
SELECT EXTRACT(YEAR FROM DATE '2023-07-23'); -- Extract year
```

### Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses.

```
SELECT a + b AS total
FROM my_table
WHERE total > 10
GROUP BY total
HAVING total < 20;
```

### List Comprehensions: Generate lists using expressions similar to Python list comprehensions.

```
SELECT [x*2 FOR x IN [1, 2, 3]]; -- Returns [2, 4, 6]
```

### Function Chaining: Chain multiple function calls together using the dot (.) operator.

```
SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB'
```

### Regular Expressions: DuckDB has built-in functions for regex matching, replacement, and extraction.

```
SELECT regexp_matches('DuckDB', 'Duck'); -- Returns true
SELECT regexp_replace('DuckDB', 'Duck', 'Goose'); -- Returns 'GooseDB'
SELECT regexp_extract('DuckDB', '(\w+)(DB)', 1); -- Returns 'Duck'
```

### Sampling: Quickly get a subset of your data with SAMPLE or TABLESAMPLE.

```
SELECT * FROM large_table USING SAMPLE 10%; -- Random 10% sample
SELECT * FROM large_table TABLESAMPLE BERNOULLI(10); -- Bernoulli sampling
```

### ATTACH and Access: Attach external databases and reference their objects using databasename.table_name syntax.

```
ATTACH 'my_database.duckdb' AS mydb;
SELECT * FROM mydb.my_table;
```

### SUMMARIZE: Get summary statistics (min, max, unique count, average, standard deviation, quartiles, and count) of a table.

```
SUMMARIZE table_name;
```

### DESCRIBE: Get schema of a table (column_name, column_type, null, key, default, extra).

```
DESCRIBE table_name;
```

Here is the schema of the DuckDB database that the SQL query will run on:
{schema}

Question:
Here is the question or an instruction the user provided:
{question}

Please write a DuckDB SQL query that answers the user's question or instruction. Use DuckDB-specific syntax if possible.

Answer:
```
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstFormatterPhi(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """<|endoftext|><|user|>
Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.

Here are some DuckDB SQL syntax specifics you should be aware of:

- DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
- DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
- DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
- DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
- DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
- DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name syntax`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
- DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
- DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
- DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
- DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
- DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. E.g. `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
- DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
- DuckDB has an intuitive syntax to create List/Struct/Map and Array types. Create complex types using intuitive syntax. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
- DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
- DuckDB's way of converting between text and timestamps, and extracting date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
- Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
- DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
- DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
- DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
- DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
- DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`

Here is the schema of the DuckDB database that the SQL query will run on:
{schema}

Question:
Here is the question or an instruction the user provided:
{question}

Write a DuckDB SQL query for the given question!<|end|>
<|assistant|>
```sql
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstFormatterGPTmini(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """Schema:
```sql
{schema}
```

Question:
{question}

Write a valid DuckDB SQL query to answer the question!
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstFormatterPhiAzure(RajkumarFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.

Here is the schema of the DuckDB database that the SQL query will run on:
{schema}

Question:
Here is the question or an instruction the user provided:
{question}

Write a DuckDB SQL query for the given question!
"""

    @classmethod
    def format_retrieved_context(
        cls,
        context: list[str],
    ) -> str:
        """Format retrieved context."""
        context_str = "\n--------\n".join(context)
        return f"\n### Documentation:\n{context_str}\n"

    @classmethod
    def format_prompt(
        cls,
        instruction: str,
        table_text: str,
        context_text: str,
    ) -> str | list[str]:
        """Get prompt format."""
        instruction = cls.PROMPT_TEMPLATE.format(
            schema=table_text,
            question=instruction
        )
        return instruction


class DuckDBInstNoShorthandFormatter(DuckDBInstFormatter):
    """DuckDB Inst class."""

    PROMPT_TEMPLATE = """### Instruction:\n{instruction}\n\n### Input:\n{input}{context}\n### Question:\n{question}\n\n### Response:\n"""
    INSTRUCTION_TEMPLATE = """Your task is to generate valid duckdb SQL to answer the following question{has_schema}"""  # noqa: E501

968 |
+
class DuckDBChat:
|
969 |
+
"""DuckDB Inst class."""
|
970 |
+
|
971 |
+
table_sep: str = "\n\n"
|
972 |
+
shuffle_table_order: bool = True
|
973 |
+
_cache: dict[tuple[str, str, str], list[str]] = {}
|
974 |
+
clean_whitespace = False
|
975 |
+
model = None
|
976 |
+
|
977 |
+
@classmethod
|
978 |
+
def format_table(cls, table: Table) -> str:
|
979 |
+
"""Get table format."""
|
980 |
+
table_fmt = []
|
981 |
+
for col in table.columns or []:
|
982 |
+
# This is technically an incorrect type, but it should be a catchall word
|
983 |
+
table_fmt.append(f" {col.name} {col.dtype or 'any'}")
|
984 |
+
if table_fmt:
|
985 |
+
all_cols = ",\n".join(table_fmt)
|
986 |
+
create_tbl = f"CREATE TABLE {table.name} (\n{all_cols}\n)"
|
987 |
+
else:
|
988 |
+
create_tbl = f"CREATE TABLE {table.name}"
|
989 |
+
return create_tbl
|
990 |
+
|
991 |
+
@classmethod
|
992 |
+
def format_all_tables(cls, tables: list[Table], instruction: str) -> list[dict]:
|
993 |
+
"""Get all tables format."""
|
994 |
+
if not cls.model:
|
995 |
+
cls.model = Manifest(
|
996 |
+
engine="gpt-3.5-turbo",
|
997 |
+
client_name="openaichat",
|
998 |
+
cache_name="sqlite",
|
999 |
+
cache_connection=".manifest.sqlite",
|
1000 |
+
)
|
1001 |
+
table_texts = [cls.format_table(table) for table in tables]
|
1002 |
+
full_schema = cls.table_sep.join(table_texts)
|
1003 |
+
prompt = f"""SQL schema of my database:
|
1004 |
+
{full_schema}
|
1005 |
+
Explain in a few sentences what the data is about:
|
1006 |
+
"""
|
1007 |
+
messages = [
|
1008 |
+
{
|
1009 |
+
"role": "system",
|
1010 |
+
"content": "You are a helpful assistant that can generate an human redable summary of database content based on the schema.",
|
1011 |
+
},
|
1012 |
+
{"role": "user", "content": prompt},
|
1013 |
+
]
|
1014 |
+
explanation = cls.model.run(messages, temperature=0)
|
1015 |
+
messages.append({"role": "assistant", "content": explanation})
|
1016 |
+
return messages[1:]
|
1017 |
+
|
1018 |
+
@classmethod
|
1019 |
+
def format_retrieved_context(
|
1020 |
+
cls,
|
1021 |
+
context: list[str],
|
1022 |
+
) -> str:
|
1023 |
+
"""Format retrieved context."""
|
1024 |
+
context_str = "\n--------\n".join(context)
|
1025 |
+
return f"\n\nHere is additional documentation about DuckDB that could be useful.\n--------\n{context_str}\n--------\n"
|
1026 |
+
|
1027 |
+
@classmethod
|
1028 |
+
def format_prompt(
|
1029 |
+
cls,
|
1030 |
+
instruction: str,
|
1031 |
+
table_text: list[dict],
|
1032 |
+
context_text: str,
|
1033 |
+
) -> str | list[str]:
|
1034 |
+
"""Get prompt format."""
|
1035 |
+
prompt = f"""Now output a single SQL query without any explanation and do not add anything
|
1036 |
+
to the query that was not part of the question, also do not use markdown. Make sure to only
|
1037 |
+
use information provided in the prompt, or tables and columns from the schema above and write a query to answer the question.{context_text}\n\nMy quesiton is \n`{instruction}`\n\nGenerate the DuckDB specific SQL query:""" # noqa: E501
|
1038 |
+
messages = [
|
1039 |
+
{
|
1040 |
+
"role": "system",
|
1041 |
+
"content": "You are a helpful assistant that can generate DuckDB sql queries, which is a superset of Postgresql, based on the user input. You do not respond with any human readable text, only SQL code.",
|
1042 |
+
},
|
1043 |
+
*table_text,
|
1044 |
+
{"role": "user", "content": prompt},
|
1045 |
+
]
|
1046 |
+
return messages
|
1047 |
+
|
1048 |
+
@classmethod
|
1049 |
+
def format_model_output(cls, output_sql: str, prompt: str) -> str:
|
1050 |
+
"""Format model output."""
|
1051 |
+
return output_sql
|
1052 |
+
|
1053 |
+
@classmethod
|
1054 |
+
def format_gold_output(cls, output_sql: str) -> str:
|
1055 |
+
"""Format gold output for demonstration."""
|
1056 |
+
return output_sql
|
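For illustration (not part of the diff): a minimal sketch of what `DuckDBChat.format_table` produces, using the `Table`/`TableColumn` pydantic models defined in `eval/schema.py` (the next file). The `laptops` table here is invented purely for the example.

```python
# Hypothetical example of DuckDBChat.format_table; the schema is made up.
from prompt_formatters import DuckDBChat
from schema import Table, TableColumn

laptops = Table(
    name="laptops",
    columns=[
        TableColumn(name="brand", dtype="VARCHAR"),
        TableColumn(name="price", dtype=None),  # a missing dtype falls back to 'any'
    ],
    pks=None,
    fks=None,
    examples=None,
)
print(DuckDBChat.format_table(laptops))
# CREATE TABLE laptops (
#  brand VARCHAR,
#  price any
# )
```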
duckdb-nsql/eval/schema.py
ADDED
@@ -0,0 +1,115 @@
"""Text2SQL schemas."""
import enum

from manifest.response import Usage
from pydantic import BaseModel

DEFAULT_TABLE_NAME: str = "db_table"


class Dialect(str, enum.Enum):
    """SQLFluff and SQLGlot dialects.

    Lucky for us, the dialects match both parsers.

    Ref: https://github.com/sqlfluff/sqlfluff/blob/main/src/sqlfluff/core/dialects/__init__.py  # noqa: E501
    Ref: https://github.com/tobymao/sqlglot/blob/main/sqlglot/dialects/__init__.py  # noqa: E501
    """

    SNOWFLAKE = "snowflake"
    BIGQUERY = "bigquery"
    REDSHIFT = "redshift"
    POSTGRES = "postgres"
    UNKNOWN = "unknown"

    @property
    def dialect_str(self) -> str | None:
        """Get the dialect string for validation.

        We need to pass in dialect = None for UNKNOWN dialects.
        """
        if self != Dialect.UNKNOWN:
            return self.value
        else:
            return None

    @property
    def quote_str(self) -> str:
        """Get the quote string for the dialect."""
        if self == Dialect.SNOWFLAKE:
            return '"'
        elif self == Dialect.BIGQUERY:
            return "`"
        elif self == Dialect.REDSHIFT:
            return '"'
        elif self == Dialect.POSTGRES:
            return '"'
        elif self == Dialect.UNKNOWN:
            return '"'
        raise NotImplementedError(f"Quote string not implemented for dialect {self}")

    def quote(self, string: str) -> str:
        """Quote a string."""
        return f"{self.quote_str}{string}{self.quote_str}"


class ColumnOrLiteral(BaseModel):
    """Column that may or may not be a literal."""

    name: str | None = None
    literal: bool = False

    def __hash__(self) -> int:
        """Hash."""
        return hash((self.name, self.literal))


class TableColumn(BaseModel):
    """Table column."""

    name: str
    dtype: str | None


class ForeignKey(BaseModel):
    """Foreign key."""

    # Referenced column
    column: TableColumn
    # References table name
    references_name: str
    # References column
    references_column: TableColumn


class Table(BaseModel):
    """Table."""

    name: str | None
    columns: list[TableColumn] | None
    pks: list[TableColumn] | None
    # FK from this table to another column in another table
    fks: list[ForeignKey] | None
    examples: list[dict] | None
    # Is the table a source or intermediate reference table
    is_reference_table: bool = False


class TextToSQLParams(BaseModel):
    """A text to sql request."""

    instruction: str
    database: str | None
    # Default to unknown
    dialect: Dialect = Dialect.UNKNOWN
    tables: list[Table] | None


class TextToSQLModelResponse(BaseModel):
    """Model for Autocomplete Responses."""

    output: str
    final_prompt: str | list[dict]
    raw_output: str
    usage: Usage
    metadata: str | None = None
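A quick illustration (not part of the diff) of how the `Dialect` helpers above behave, assuming the module is importable as `schema`:

```python
from schema import Dialect

# Each dialect knows its identifier quote character.
assert Dialect.BIGQUERY.quote("user table") == "`user table`"
assert Dialect.SNOWFLAKE.quote("user table") == '"user table"'
# UNKNOWN maps to dialect=None so downstream parsers skip dialect-specific rules.
assert Dialect.UNKNOWN.dialect_str is None
assert Dialect.POSTGRES.dialect_str == "postgres"
```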
duckdb-nsql/eval/text_to_sql.py
ADDED
@@ -0,0 +1,260 @@
"""Text-to-SQL running."""
import asyncio
import json
import re
import time
from typing import cast

import duckdb
import structlog
from manifest import Manifest
from manifest.response import Response, Usage
from prompt_formatters import MotherDuckFormatter, RajkumarFormatter
from schema import DEFAULT_TABLE_NAME, TextToSQLModelResponse, TextToSQLParams
from tqdm.auto import tqdm

logger = structlog.get_logger()


def clean_whitespace(sql: str) -> str:
    """Collapse all runs of whitespace to a single space."""
    return re.sub(r"[\t\n\s]+", " ", sql)


def instruction_to_sql(
    params: TextToSQLParams,
    extra_context: list[str],
    manifest: Manifest,
    prompt_formatter: RajkumarFormatter | None = None,
    overwrite_manifest: bool = False,
    max_tokens: int = 300,
    temperature: float = 0.1,
    stop_sequences: list[str] | None = None,
    num_beams: int = 1,
) -> TextToSQLModelResponse:
    """Parse the instruction to a sql command."""
    return instruction_to_sql_list(
        params=[params],
        extra_context=[extra_context],
        manifest=manifest,
        prompt_formatter=prompt_formatter,
        overwrite_manifest=overwrite_manifest,
        max_tokens=max_tokens,
        temperature=temperature,
        stop_sequences=stop_sequences,
        num_beams=num_beams,
    )[0]


def run_motherduck_prompt_sql(
    params: list[TextToSQLParams],
) -> list[TextToSQLModelResponse]:
    """Run each instruction through MotherDuck's prompt_sql function."""
    results = []
    for param in params:
        con = duckdb.connect("md:")
        try:
            sql_query = con.execute(
                "CALL prompt_sql(?);", [param.instruction]
            ).fetchall()[0][0]
        except Exception as e:
            print(e)
            sql_query = "SELECT * FROM hn.hacker_news LIMIT 1"
        usage = Usage(completion_tokens=0, prompt_tokens=0, total_tokens=0)
        model_response = TextToSQLModelResponse(
            output=sql_query,
            raw_output=sql_query,
            final_prompt=param.instruction,
            usage=usage,
        )
        results.append(model_response)
    return results


def instruction_to_sql_list(
    params: list[TextToSQLParams],
    extra_context: list[list[str]],
    manifest: Manifest,
    prompt_formatter: RajkumarFormatter | None = None,
    overwrite_manifest: bool = False,
    max_tokens: int = 300,
    temperature: float = 0.1,
    stop_sequences: list[str] | None = None,
    num_beams: int = 1,
    verbose: bool = False,
) -> list[TextToSQLModelResponse]:
    """Parse the list of instructions to sql commands.

    Connector is used for default retry handlers only.
    """
    if type(prompt_formatter) is MotherDuckFormatter:
        return run_motherduck_prompt_sql(params)

    if prompt_formatter is None:
        raise ValueError("Prompt formatter is required.")

    def construct_params(
        params: TextToSQLParams,
        context: list[str],
    ) -> str | list[dict]:
        """Turn params into prompt."""
        if prompt_formatter.clean_whitespace:
            instruction = clean_whitespace(params.instruction)
        else:
            instruction = params.instruction

        table_texts = prompt_formatter.format_all_tables(
            params.tables, instruction=instruction
        )
        # table_texts can be a list of chat messages. Only join lists of str.
        if table_texts:
            if isinstance(table_texts[0], str):
                table_text = prompt_formatter.table_sep.join(table_texts)
            else:
                table_text = table_texts
        else:
            table_text = ""

        if context:
            context_text = prompt_formatter.format_retrieved_context(context)
        else:
            context_text = "" if isinstance(table_text, str) else []
        prompt = prompt_formatter.format_prompt(
            instruction,
            table_text,
            context_text,
        )
        return prompt

    # If no inputs, return nothing
    if not params:
        return []

    # Stitch together demonstrations and params
    prompts: list[str | list[dict]] = []
    for i, param in tqdm(
        enumerate(params),
        total=len(params),
        desc="Constructing prompts",
        disable=not verbose,
    ):
        predict_str = construct_params(param, extra_context[i] if extra_context else [])
        if isinstance(predict_str, str):
            prompt = predict_str.lstrip()
        else:
            prompt = predict_str
        prompts.append(prompt)

    manifest_params = dict(
        max_tokens=max_tokens,
        overwrite_cache=overwrite_manifest,
        num_beams=num_beams,
        logprobs=5,
        temperature=temperature,
        do_sample=temperature > 0,  # greedy decode when temperature is zero
        stop_sequences=stop_sequences or prompt_formatter.stop_sequences,
    )

    ret: list[TextToSQLModelResponse] = []
    if len(params) == 1:
        prompt = prompts[0]
        model_response: TextToSQLModelResponse | None = None
        retries = 0
        while model_response is None and retries < 5:
            try:
                model_response = _run_manifest(
                    prompt,
                    manifest_params,
                    prompt_formatter,
                    manifest,
                    stop_sequences=stop_sequences,
                )
            except Exception:  # retry transient client errors
                retries += 1
        if model_response is None:
            raise RuntimeError("Manifest request failed after 5 retries.")
        ret.append(model_response)
    else:
        # We do not handle retry logic on parallel requests right now
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        response = cast(
            Response,
            loop.run_until_complete(
                manifest.arun_batch(
                    prompts,
                    **manifest_params,  # type: ignore
                ),
            ),
        )
        loop.close()

        response_usage = response.get_usage()
        response_text = response.get_parsed_response()
        for prompt, resp in zip(prompts, response_text):
            # This will restitch the query in the case we force it to start with SELECT
            sql_query = prompt_formatter.format_model_output(cast(str, resp), prompt)
            for token in stop_sequences or []:  # stop_sequences may be None
                sql_query = sql_query.split(token)[0]
            logger.info(f"FINAL OUTPUT: {sql_query}")
            ret.append(
                TextToSQLModelResponse(
                    output=sql_query,
                    raw_output=cast(str, resp),
                    final_prompt=prompt,
                    usage=response_usage,
                )
            )

    return ret


def _run_manifest(
    prompt: str | list[str],
    manifest_params: dict,
    prompt_formatter: RajkumarFormatter,
    manifest: Manifest,
    stop_sequences: list[str] | None = None,
) -> TextToSQLModelResponse:
    """Run manifest for prompt format."""
    logger.info(f"PARAMS: {manifest_params}")
    if isinstance(prompt, list):
        for p in prompt:
            logger.info(f"PROMPT: {p['role']}: {p['content']}")
    else:
        logger.info(f"PROMPT: {prompt}")
    start_time = time.time()
    # Run result
    response = cast(
        Response,
        manifest.run(
            prompt,
            return_response=True,
            client_timeout=1800,
            **manifest_params,  # type: ignore
        ),
    )
    logger.info(f"TIME: {time.time() - start_time: .2f}")

    response_usage = response.get_usage_obj()
    summed_usage = Usage()
    for usage in response_usage.usages:
        summed_usage.completion_tokens += usage.completion_tokens
        summed_usage.prompt_tokens += usage.prompt_tokens
        summed_usage.total_tokens += usage.total_tokens
    # This will restitch the query in the case we force it to start with SELECT
    sql_query = prompt_formatter.format_model_output(
        cast(str, response.get_response()), prompt
    )

    for token in stop_sequences or []:  # stop_sequences may be None
        sql_query = sql_query.split(token)[0]
    logger.info(f"OUTPUT: {sql_query}")
    model_response = TextToSQLModelResponse(
        output=sql_query,
        raw_output=cast(str, response.get_response()),
        final_prompt=prompt,
        usage=summed_usage,
    )
    return model_response
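For orientation, a hedged end-to-end sketch (not part of the diff) of how `instruction_to_sql` is meant to be called. It assumes a `DuckDBInstFormatter` defined earlier in `prompt_formatters.py`, a configured Manifest backend with `OPENAI_API_KEY` set, and an invented `laptops` schema.

```python
from manifest import Manifest
from prompt_formatters import DuckDBInstFormatter
from schema import Table, TableColumn, TextToSQLParams
from text_to_sql import instruction_to_sql

manifest = Manifest(client_name="openai")  # assumes OPENAI_API_KEY is set
params = TextToSQLParams(
    instruction="How many laptops cost more than 1000 dollars?",
    database="laptop",  # illustrative database/table names only
    tables=[
        Table(
            name="laptops",
            columns=[
                TableColumn(name="brand", dtype="VARCHAR"),
                TableColumn(name="price", dtype="DOUBLE"),
            ],
            pks=None,
            fks=None,
            examples=None,
        )
    ],
)
response = instruction_to_sql(
    params,
    extra_context=[],
    manifest=manifest,
    prompt_formatter=DuckDBInstFormatter(),
)
print(response.output)  # the generated DuckDB SQL string
```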
duckdb-nsql/manifest/.flake8
ADDED
@@ -0,0 +1,11 @@
# This is our code-style check. We currently allow the following exceptions:
# - E731: do not assign a lambda expression, use a def
# - E402: module level import not at top of file
# - W503: line break before binary operator
# - E203: whitespace before :

[flake8]
exclude = .git
max-line-length = 88
ignore = E731, E402, W503, E203, PAI100, PAI101, PAI201, PAI202, PAI203
per-file-ignores = __init__.py:F401, version.py:D100
duckdb-nsql/manifest/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,23 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-toml
      - id: check-merge-conflict
      - id: check-added-large-files
  - repo: https://github.com/timothycrosley/isort
    rev: 5.13.2
    hooks:
      - id: isort
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
        language_version: python3
  - repo: https://github.com/PyCQA/flake8
    rev: 6.0.0
    hooks:
      - id: flake8
duckdb-nsql/manifest/CHANGELOG.rst
ADDED
@@ -0,0 +1,93 @@
0.1.10 - Unreleased
---------------------

0.1.9 - 2024-01-22
---------------------
Fixed
^^^^^
* Added trust code params for HF models
* Added LRU cache to HF model param calls to avoid extra calls
* Fixed pydantic type issue with HF model return
* Support for Python 3.10-3.11

0.1.8 - 2023-05-22
---------------------
Added
^^^^^
* Azure model support (completion and chat)
* Google Vertex API model support (completion and chat)
* Streaming responses for LM Completions (set stream=True)

Fixed
^^^^^
* `run` with batches now acts the same as async run, except not async. We will batch requests into appropriately sized batches.
* Refactored client so preprocessing and postprocessing of requests and responses are unified, to better support model variants in request/response format.

0.1.7 - 2023-05-17
---------------------
Fixed
^^^^^
* `_run_chat` fixed bug where kwargs were not passed in

0.1.6 - 2023-05-16
---------------------
Fixed
^^^^^
* Unified `run` and `run_chat` methods so it's just `run` now.
* LLama HF models for eval

0.1.5 - 2023-05-03
---------------------
Added
^^^^^
* Added chat input for chat models.

0.1.4 - 2023-04-24
---------------------
Added
^^^^^
* Connection pools to swap between clients
* Chunksize param for async runs

Fixed
^^^^^
* Determine cache and response by request type, not client name
* Refactor Response to use Pydantic types for Request and Response

0.1.1
---------------------
Added
^^^^^
* Async support in arun_batch

Fixed
^^^^^
* Batched runs now cache individual items
* Score prompt does not truncate outside token

Removed
^^^^^^^
* Deprecated chatGPT in favor of openaichat which uses OpenAI completions
* Deprecated Sessions

0.1.0 - 2022-01-31
---------------------
Added
^^^^^
* Batched inference support in `manifest.run`. No more separate `manifest.run_batch` method.
* Standard request base model for all language inputs.
* ChatGPT client. Requires CHATGPT_SESSION_KEY to be passed in.
* Diffusion model support
* Together model support

Removed
^^^^^^^
* `Prompt` class
* `OPT` client - OPT is now available in HuggingFace

0.0.1 - 2022-11-08
-------------------
First major pip release of Manifest. Install via `pip install manifest-ml`.


.. _@lorr1: https://github.com/lorr1
duckdb-nsql/manifest/LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
duckdb-nsql/manifest/Makefile
ADDED
@@ -0,0 +1,27 @@
dev:
	pip install -e .[all]
	pre-commit install

test: dev check
	pytest tests

format:
	isort --atomic manifest/ tests/ web_app/
	black manifest/ tests/ web_app/

check:
	isort -c manifest/ tests/ web_app/
	black manifest/ tests/ web_app/ --check
	flake8 manifest/ tests/ web_app/
	mypy manifest/ tests/ web_app/

clean:
	pip uninstall -y manifest
	rm -rf src/manifest.egg-info
	rm -rf build/ dist/

# $$ escapes are needed so make passes $(...), $1, and $branch through to bash
prune:
	@bash -c "git fetch -p";
	@bash -c "for branch in $$(git branch -vv | grep ': gone]' | awk '{print $$1}'); do git branch -d $$branch; done";

.PHONY: dev test format clean check prune
duckdb-nsql/manifest/README.md
ADDED
@@ -0,0 +1,304 @@
# Manifest
How to make prompt programming with Foundation Models a little easier.


# Table of Contents
- [Install](#install)
- [Getting Started](#getting-started)
- [Manifest Components](#manifest-components)
- [Other Models](#other-models)
  - [Local HuggingFace Models](#local-huggingface-models)
  - [Chat Models](#chat-models)
  - [Embedding Models](#embedding-models)
- [Road Map](#road-map)
- [Development](#development)
- [Cite](#cite)


# Install
Install:
```bash
pip install manifest-ml
```

Install with diffusion support:
```bash
pip install manifest-ml[diffusers]
```

Install with HuggingFace local model support:
```bash
pip install manifest-ml[api]
```

Dev Install:
```bash
git clone git@github.com:HazyResearch/manifest.git
cd manifest
make dev
```

# Getting Started
Getting started is simple. If using OpenAI, set `export OPENAI_API_KEY=<OPENAIKEY>` (or pass the key in through the variable `client_connection`), then run

```python
from manifest import Manifest

# Start a manifest session to OpenAI - default `engine=text-davinci-003`
manifest = Manifest(
    client_name = "openai",
)
manifest.run("Why is the grass green?")
```

## Examples
We have example notebooks and Python scripts located at [examples](examples). These show how to use different models, model types (i.e. text, diffusers, or embedding models), and async running.

# Manifest Components
Manifest is meant to be a very lightweight package to help with prompt design and iteration. Three key design decisions of Manifest are

* All models are behind APIs
* Supports caching of model inputs/outputs for iteration, reproducibility, and cost saving
* Unified API to support generate, score, and embed

## Models
Manifest provides model clients for [OpenAI](https://openai.com/), [AI21](https://studio.ai21.com/), [Cohere](https://cohere.ai/), [Together](https://together.xyz/), and HuggingFace (see [below](#local-huggingface-models) for how to use locally hosted HuggingFace models). You can toggle between the models by changing `client_name` and `client_connection`. For example, if a HuggingFace model is loaded locally, run
```python
manifest = Manifest(
    client_name = "huggingface",
    client_connection = "http://127.0.0.1:5000",
)
```
If you want to use Cohere, run
```python
manifest = Manifest(
    client_name = "cohere",
    client_connection = <COHERE_API_KEY>,
)
```
You can also just set `export COHERE_API_KEY=<COHERE_API_KEY>` and not use `client_connection`.

If you want to use AI21 Labs, run
```python
manifest = Manifest(
    client_name = "ai21",
    client_connection = <AI21_API_KEY>,
)
```

You can see the model details and possible model inputs to `run()` via
```python
print(manifest.client_pool.get_current_client().get_model_params())
print(manifest.client_pool.get_current_client().get_model_inputs())
```

## Global Cache
We support having queries and results stored in a global cache that can be shared across users. We treat inputs and outputs as key-value pairs and support SQLite or Redis backends. To start with global caching using SQLite, run

```python
manifest = Manifest(
    client_name = "openai",
    cache_name = "sqlite",
    cache_connection = "mycache.sqlite",
)
```
The cache will be saved in `mycache.sqlite`.

We also support a Redis backend.
```python
manifest = Manifest(
    client_name = "openai",
    cache_name = "redis",
    cache_connection = "localhost:6379"
)
```
As a hint, if you want to get Redis running, see the `docker run` command below under development.

## Running Queries
Once you have a session open, you can write and develop prompts.

```python
result = manifest.run("Hello, my name is Laurel")
```

You can also run over multiple examples if supported by the client.
```python
results = manifest.run(["Where are the cats?", "Where are the dogs?"])
```

We support async queries as well via
```python
import asyncio
results = asyncio.run(manifest.arun_batch(["Where are the cats?", "Where are the dogs?"]))
```

If something doesn't go right, you can also ask to get a raw manifest Response.
```python
result_object = manifest.run(["Where are the cats?", "Where are the dogs?"], return_response=True)
print(result_object.get_request_obj())
print(result_object.is_cached())
print(result_object.get_response_obj())
```

By default, we do not truncate results based on a stop token. You can change this by either passing a new stop token to a Manifest session or to a `run`.
```python
result = manifest.run(prompt, "Laurel", stop_token="and")
```

If you want to change default parameters to a model, we pass those as `kwargs` to the client.
```python
result = manifest.run(prompt, "Laurel", max_tokens=50)
```

## Streaming Queries
Manifest also supports streaming the model response back, assuming it's supported by the underlying client. When calling `run`, pass `stream=True` to get a streaming iterator in response.

```python
result_iterator = manifest.run("Tell me a story. Once upon a time", max_tokens=100, stream=True)
for res_text in result_iterator:
    print(res_text)
```
Streaming responses are only supported for single string queries (not batch mode) for text completion models.

## Model Pools
Manifest supports querying multiple models with different schedulers. This is very much a work-in-progress effort, but Manifest will round-robin select (or randomly select) the clients you want. You can use the same client multiple times with different connection strings (e.g. different API keys), or you can mix and match. The only requirement is that all clients are the same request type, i.e. you can't have a pool of generation models and embedding models.

To query between a local model and OpenAI,
```python
from manifest.connections.client_pool import ClientConnection
from manifest import Manifest

client_connection1 = ClientConnection(
    client_name="huggingface",
    client_connection="http://127.0.0.1:5000",
)
client_connection2 = ClientConnection(client_name="openai", engine="text-ada-001")
manifest = Manifest(
    client_pool=[client_connection1, client_connection2],
    cache_name="sqlite",
    cache_connection=sqlite_cache,
)
manifest.run(...)
```

The speed benefit comes in with async batched runs. When calling `arun_batch` with a list of prompts, Manifest supports a `chunk_size` param. This will break the prompts into `chunk_size` chunks to spread across the client pool. By default `chunk_size` is `-1`, which means only one client will get all the prompts to run asynchronously. You must set `chunk_size > 1` to distribute across the pool. There is a further `batch_size` param which controls the individual client `batch_size` to send to the model.

```python
responses = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=20))
```

# Other Models

## Local Huggingface Models
To use a HuggingFace generative model, in `manifest/api` we have a Flask application that hosts the models for you.

In a separate terminal or Tmux/Screen session, to load 6B-parameter models, run
```bash
python3 -m manifest.api.app \
    --model_type huggingface \
    --model_name_or_path EleutherAI/gpt-j-6B \
    --device 0
```
You will see the Flask session start and output a URL `http://127.0.0.1:5000`. Pass this in to Manifest. If you want to use a different port, set the `FLASK_PORT` environment variable.

```python
manifest = Manifest(
    client_name = "huggingface",
    client_connection = "http://127.0.0.1:5000",
)
```

If you have a custom model you trained, pass the model path to `--model_name_or_path`.

To help load larger models, we also support using `parallelize()` from HF, [accelerate](https://huggingface.co/docs/accelerate/index), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), and [deepspeed](https://github.com/microsoft/DeepSpeed). You will need to install these packages first via `pip install manifest-ml[api]`. We list the commands to load larger models below.

* T0pp
```bash
python3 -m manifest.api.app \
    --model_type huggingface \
    --model_name_or_path bigscience/T0pp \
    --use_hf_parallelize
```

* NeoX 20B (requires at least 60GB of GPU memory)
```bash
python3 -m manifest.api.app \
    --model_type huggingface \
    --model_name_or_path EleutherAI/gpt-neox-20b \
    --use_accelerate_multigpu \
    --percent_max_gpu_mem_reduction 0.75
```
* Bloom 175B (requires at least 240GB of GPU memory)
```bash
python3 -m manifest.api.app \
    --model_type huggingface \
    --model_name_or_path bigscience/bloom \
    --use_bitsandbytes \
    --percent_max_gpu_mem_reduction 0.85
```

## Chat Models
Manifest has specific support for executing against chat models in the more standard "system" / "user" dialogue. To pass a dialogue history to Manifest, use the `run` command with a list of dictionary inputs with `role` and `content` keys, using an associated chat model such as `openaichat`.

```python
manifest = Manifest(client_name="openaichat")
dialogue = [
    {"role": "system", "content": "You are a helpful assistant who also responds in rhymes"},
    {"role": "user", "content": "What is the date?"},
]
res = manifest.run(dialogue, max_tokens=100)
```

## Embedding Models
Manifest also supports getting embeddings from models and available APIs. We do this all through changing the `client_name` argument. You still use `run` and `arun_batch`.

To use OpenAI's embedding models, simply run
```python
manifest = Manifest(client_name="openaiembedding")
embedding_as_np = manifest.run("Get me an embedding for a bunny")
```

As explained above, you can load local HuggingFace models that give you embeddings, too. If you want to use a standard generative model, load the model as above and use `client_name="huggingfaceembedding"`. If you want to use a standard embedding model, like those from SentenceTransformers, load your local model via
```bash
python3 -m manifest.api.app \
    --model_type sentence_transformers \
    --model_name_or_path all-mpnet-base-v2 \
    --device 0
```

# Road Map
Here's what's coming up next
- [ ] Clients
  - [ ] HuggingFace Hub
  - [x] Azure OpenAI
  - [x] Google Vertex
  - [ ] Anthropic
- [x] Streaming Support Completions
- [ ] Streaming Support Chat Models
- [ ] Data Types
  - [ ] Diffusion Models
- [x] Orchestration
  - [x] Connection pools
- [ ] Local Inference
  - [ ] FlexGen

# Development
Before submitting a PR, run
```bash
export REDIS_PORT="6379"  # or whatever PORT local redis is running for those tests
cd <REDIS_PATH>
docker run -d -p 127.0.0.1:${REDIS_PORT}:6379 -v `pwd`:`pwd` -w `pwd` --name manifest_redis_test redis
make test
```

# Cite
Please cite Manifest if you used it for any publications. Thanks!!
```
@misc{orr2022manifest,
  author = {Orr, Laurel},
  title = {Manifest},
  year = {2022},
  publisher = {GitHub},
  howpublished = {\url{https://github.com/HazyResearch/manifest}},
}
```
duckdb-nsql/manifest/examples/langchain_chatgpt.ipynb
ADDED
@@ -0,0 +1,455 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "b253f4d5",
   "metadata": {},
   "source": [
    "# ChatGPT Clone using TOMA GPT-JT-6B\n",
    "(adapted from the ChatGPT Clone [notebook](https://github.com/hwchase17/langchain/blob/master/docs/examples/chains/chatgpt_clone.ipynb))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b0302886",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: TOMA_URL=https://staging.together.xyz/api\n"
     ]
    }
   ],
   "source": [
    "%env TOMA_URL=https://staging.together.xyz/api"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "93a18ea6",
   "metadata": {},
   "source": [
    "Make sure you have langchain and manifest installed. For the most recent versions, run\n",
    "```\n",
    "pip install git+https://github.com/hwchase17/langchain.git\n",
    "pip install git+https://github.com/HazyResearch/manifest.git\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "a99acd89",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "\n",
      "Input: Classes are \"positive\" and \"negative\". For example given\n",
      "Input: I love this product!\n",
      "Output: positive.\n",
      "I think this movie was one of the worst of the year. Script was boring!\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "negative.\n"
     ]
    }
   ],
   "source": [
    "from manifest import Manifest\n",
    "from langchain.llms.manifest import ManifestWrapper\n",
    "from langchain import ConversationChain, LLMChain, PromptTemplate\n",
    "from langchain.chains.conversation.memory import ConversationalBufferWindowMemory\n",
    "\n",
    "\n",
    "template = \"\"\"I am a classification model. It will try to classify your input.\n",
    "\n",
    "{history}\n",
    "Input: {human_input}\n",
    "Output:\"\"\"\n",
    "\n",
    "prompt = PromptTemplate(\n",
    "    input_variables=[\"history\", \"human_input\"], \n",
    "    template=template\n",
    ")\n",
    "\n",
    "manifest = Manifest(\n",
    "    client_name=\"toma\",\n",
    "    engine=\"Together-gpt-JT-6B-v1\",\n",
    "    max_tokens=150,\n",
    "    top_p=0.9,\n",
    "    top_k=40,\n",
    "    stop_sequences=[\"\\n\"],\n",
    ")\n",
    "\n",
    "chatgpt_chain = LLMChain(\n",
    "    llm=ManifestWrapper(client=manifest), \n",
    "    prompt=prompt, \n",
    "    verbose=True, \n",
    "    memory=ConversationalBufferWindowMemory(k=8),\n",
    ")\n",
    "\n",
    "output = chatgpt_chain.predict(human_input=\"Classes are \\\"positive\\\" and \\\"negative\\\". For example given\\nInput: I love this product!\\nOutput: positive.\\nI think this movie was one of the worst of the year. Script was boring!\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "4ef711d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "Human: Classes are \"positive\" and \"negative\". For example given\n",
      "Input: I love this product!\n",
      "Output: positive.\n",
      "I think this movie was one of the worst of the year. Script was boring!\n",
      "AI: negative.\n",
      "Input: So awesome! I wish I could have gone\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "positive.\n"
     ]
    }
   ],
   "source": [
    "output = chatgpt_chain.predict(human_input=\"So awesome! I wish I could have gone\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "a5d6dac2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "Human: Classes are \"positive\" and \"negative\". For example given\n",
      "Input: I love this product!\n",
      "Output: positive.\n",
      "I think this movie was one of the worst of the year. Script was boring!\n",
      "AI: negative.\n",
      "Human: So awesome! I wish I could have gone\n",
      "AI: positive.\n",
      "Input: Hate it.\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "negative.\n"
     ]
    }
   ],
   "source": [
    "output = chatgpt_chain.predict(human_input=\"Hate it.\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "b9283077",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "\n",
      "Input: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
      "Input: This fruit rippens off of the tree.\n",
      "Output: banana.\n",
      "Often comes in bosc and bartlett varieties.\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "apple.\n"
     ]
    }
   ],
   "source": [
    "chatgpt_chain.memory.clear()\n",
    "output = chatgpt_chain.predict(human_input=\"Classes are fruits \\\"apple\\\", \\\"banana\\\", \\\"orange\\\", \\\"pear\\\". For example given\\nInput: This fruit rippens off of the tree.\\nOutput: banana.\\nOften comes in bosc and bartlett varieties.\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "cd0a23d9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "Human: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
      "Input: This fruit rippens off of the tree.\n",
      "Output: banana.\n",
      "Often comes in bosc and bartlett varieties.\n",
      "AI: apple.\n",
      "Input: Often associated with monkeys\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "banana.\n"
     ]
    }
   ],
   "source": [
    "output = chatgpt_chain.predict(human_input=\"Often associated with monkeys\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "90db6eb2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "Human: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
      "Input: This fruit rippens off of the tree.\n",
      "Output: banana.\n",
      "Often comes in bosc and bartlett varieties.\n",
      "AI: apple.\n",
      "Human: Often associated with monkeys\n",
      "AI: banana.\n",
      "Input: Is the color red and often delicious.\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "apple.\n"
     ]
    }
   ],
   "source": [
    "output = chatgpt_chain.predict(human_input=\"Is the color red and often delicious.\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "c3806f89",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "\n",
      "Input: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
      "Input: The color of a school bus.\n",
      "Output: yellow.\n",
      "Is the color of the sky\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "blue.\n"
     ]
    }
   ],
   "source": [
    "chatgpt_chain.memory.clear()\n",
    "output = chatgpt_chain.predict(human_input=\"Classes are colors \\\"red\\\", \\\"green\\\", \\\"blue\\\", \\\"yellow\\\". For example given\\nInput: The color of a school bus.\\nOutput: yellow.\\nIs the color of the sky\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "f508f597",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
      "Input: The color of a school bus.\n",
      "Output: yellow.\n",
      "Is the color of the sky\n",
      "AI: blue.\n",
      "Input: Color of a banana.\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "yellow.\n"
     ]
    }
   ],
   "source": [
    "output = chatgpt_chain.predict(human_input=\"Color of a banana.\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "cbd607f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
      "Input: The color of a school bus.\n",
      "Output: yellow.\n",
      "Is the color of the sky\n",
      "AI: blue.\n",
      "Human: Color of a banana.\n",
      "AI: yellow.\n",
      "Input: When someone is sick they are this color.\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "green.\n"
     ]
    }
   ],
   "source": [
    "output = chatgpt_chain.predict(human_input=\"When someone is sick they are this color.\")\n",
    "print(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "d33e0e28",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
      "Prompt after formatting:\n",
      "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
      "\n",
      "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
      "Input: The color of a school bus.\n",
      "Output: yellow.\n",
      "Is the color of the sky\n",
      "AI: blue.\n",
      "Human: Color of a banana.\n",
      "AI: yellow.\n",
      "Human: When someone is sick they are this color.\n",
      "AI: green.\n",
      "Input: Color of anger.\n",
      "Output:\u001b[0m\n",
      "\n",
      "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
      "red.\n"
     ]
    }
   ],
   "source": [
    "output = chatgpt_chain.predict(human_input=\"Color of anger.\")\n",
    "print(output)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bootleg",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12 | packaged by conda-forge | (default, Jan 30 2022, 23:36:06) \n[Clang 11.1.0 ]"
  },
  "vscode": {
   "interpreter": {
    "hash": "7a3f97ab0465937066e9b79893b779dfc8a12d73c41f9d98a7bf05133c798250"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
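As a side note, the notebook's windowed few-shot loop can also be driven without LangChain. A minimal sketch, assuming the same `toma` client and engine as above (the `classify` helper and `history` list are illustrative names, not part of this repo):

from manifest import Manifest

# Illustrative re-creation of the notebook's windowed few-shot prompt,
# assuming the same TOMA client and engine shown above.
manifest = Manifest(
    client_name="toma",
    engine="Together-gpt-JT-6B-v1",
    max_tokens=150,
    stop_sequences=["\n"],
)

history: list = []  # rolling (input, output) pairs, like ConversationalBufferWindowMemory(k=8)

def classify(human_input: str, k: int = 8) -> str:
    # Rebuild the prompt from the last k exchanges plus the new input.
    lines = ["I am a classification model. It will try to classify your input.", ""]
    for past_in, past_out in history[-k:]:
        lines.extend([f"Input: {past_in}", f"Output: {past_out}"])
    lines.extend([f"Input: {human_input}", "Output:"])
    output = manifest.run("\n".join(lines)).strip()
    history.append((human_input, output))
    return output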
duckdb-nsql/manifest/examples/manifest_async.py
ADDED
@@ -0,0 +1,27 @@
import asyncio
import time

from manifest import Manifest


def main():
    manifest = Manifest(
        client_name="openaichat",
    )

    print("Running in serial")
    prompts = [f"Tell me something interesting about {i}" for i in range(50)]
    st = time.time()
    for pmt in prompts:
        _ = manifest.run(pmt)
    print(f"For loop: {time.time() - st :.2f}")

    print("Running with async")
    st = time.time()
    _ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30))
    print(f"Async loop: {time.time() - st :.2f}")


if __name__ == "__main__":
    main()
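The script above discards the generations while timing them; a small variant that keeps the results is sketched below (same `openaichat` client assumption as the script; only the handling of the return value differs):

import asyncio

from manifest import Manifest

# Sketch: collect the batched generations instead of timing them.
async def run_batch(n: int = 5) -> list:
    manifest = Manifest(client_name="openaichat")
    prompts = [f"Tell me something interesting about {i}" for i in range(n)]
    return await manifest.arun_batch(prompts, max_tokens=30)

if __name__ == "__main__":
    for text in asyncio.run(run_batch()):
        print(text)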
duckdb-nsql/manifest/examples/manifest_azure.ipynb
ADDED
@@ -0,0 +1,149 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "AZURE_KEY = \"API_KEY::URL\"\n",
    "OPENAI_KEY = \"sk-XXX\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use Azure and OpenAI models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "from pathlib import Path\n",
    "\n",
    "cache_path = Path(\"manifest.db\")\n",
    "if cache_path.exists():\n",
    "    cache_path.unlink()\n",
    "\n",
    "\n",
    "azure = ClientConnection(\n",
    "    client_name=\"azureopenai\",\n",
    "    client_connection=AZURE_KEY,\n",
    "    engine=\"text-davinci-003\",\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[azure], \n",
    "                    cache_name=\"sqlite\",\n",
    "                    cache_connection=\"manifest.db\"\n",
    ")\n",
    "\n",
    "\n",
    "openai = ClientConnection(\n",
    "    client_name=\"openai\",\n",
    "    client_connection=OPENAI_KEY,\n",
    "    engine=\"text-davinci-003\",\n",
    ")\n",
    "\n",
    "manifest_openai_nocache = Manifest(client_pool=[openai])\n",
    "\n",
    "manifest_openai = Manifest(client_pool=[openai], \n",
    "                    cache_name=\"sqlite\",\n",
    "                    cache_connection=\"manifest.db\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show caches are the same\n",
    "text = \"What is the meaning of life?\"\n",
    "res = manifest.run(text, max_tokens=100, temperature=0.7, return_response=True)\n",
    "print(res.get_response())\n",
    "print(res.is_cached())\n",
    "res2 = manifest_openai.run(text, max_tokens=100, temperature=0.7, return_response=True)\n",
    "print(res2.is_cached())\n",
    "\n",
    "assert res2.get_response() == res.get_response()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "azure_chat = ClientConnection(\n",
    "    client_name=\"azureopenaichat\",\n",
    "    client_connection=AZURE_KEY,\n",
    "    engine=\"gpt-3.5-turbo\",\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[azure_chat])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(manifest.run(\"What do you think is the best food?\", max_tokens=100))\n",
    "\n",
    "chat_dict = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
    "]\n",
    "print(manifest.run(chat_dict, max_tokens=100))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "manifest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
duckdb-nsql/manifest/examples/manifest_chatgpt.ipynb
ADDED
@@ -0,0 +1,101 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "OPENAI_KEY = \"sk-XXX\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use ChatOpenAI\n",
    "\n",
    "Set your `OPENAI_API_KEY` environment variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "\n",
    "openai_chat = ClientConnection(\n",
    "    client_name=\"openaichat\",\n",
    "    client_connection=OPENAI_KEY,\n",
    "    engine=\"gpt-3.5-turbo\"\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[openai_chat])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simple question\n",
    "chat_dict = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
    "]\n",
    "print(manifest.run(chat_dict, max_tokens=100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "manifest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
duckdb-nsql/manifest/examples/manifest_connection_pool.ipynb
ADDED
@@ -0,0 +1,208 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "OPENAI_KEY1 = \"sk-XXX\"\n",
    "OPENAI_KEY2 = \"sk-XX\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use OpenAI\n",
    "\n",
    "Set your `OPENAI_API_KEY` environment variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "\n",
    "openai_ada = ClientConnection(\n",
    "    client_name=\"openai\",\n",
    "    client_connection=OPENAI_KEY1,\n",
    "    engine=\"text-ada-001\"\n",
    ")\n",
    "\n",
    "openai_curie = ClientConnection(\n",
    "    client_name=\"openai\",\n",
    "    client_connection=OPENAI_KEY2,\n",
    "    engine=\"text-curie-001\"\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[openai_ada, openai_curie], client_pool_schedule=\"round_robin\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "I am a model.\n",
      "1\n",
      "I am a MacBook Pro with a retina\n"
     ]
    }
   ],
   "source": [
    "res = manifest.run(\"What model are you?\", temperature=0.0)\n",
    "print(manifest.client_pool.current_client_id)\n",
    "print(res)\n",
    "res = manifest.run(\"What model are you?\", temperature=0.0)\n",
    "print(manifest.client_pool.current_client_id)\n",
    "print(res)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## With Async"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "# This is required for asyncio.run(...) to work in Jupyter notebooks.\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "\n",
    "openai_ada = ClientConnection(\n",
    "    client_name=\"openai\",\n",
    "    client_connection=OPENAI_KEY1,\n",
    "    engine=\"text-ada-001\"\n",
    ")\n",
    "\n",
    "openai_babbage = ClientConnection(\n",
    "    client_name=\"openai\",\n",
    "    client_connection=OPENAI_KEY2,\n",
    "    engine=\"text-babbage-001\"\n",
    ")\n",
    "\n",
    "openai_curie = ClientConnection(\n",
    "    client_name=\"openai\",\n",
    "    client_connection=OPENAI_KEY2,\n",
    "    engine=\"text-curie-001\"\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[openai_ada, openai_babbage, openai_curie], client_pool_schedule=\"round_robin\")\n",
    "manifest_single_client = Manifest(client_pool=[openai_babbage])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "For loop: 128.68\n",
      "Running with async single client\n",
      "Running 1 tasks across all clients.\n",
      "Async loop: 4.02\n",
      "Running with async two clients but not chunking\n",
      "Running 1 tasks across all clients.\n",
      "Async loop: 3.92\n",
      "Running with async two clients and chunk size\n",
      "Running 20 tasks across all clients.\n",
      "Async loop: 1.44\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import asyncio\n",
    "\n",
    "prompts = [f\"Tell me something interesting about {i}\" for i in range(400)]\n",
    "st = time.time()\n",
    "for pmt in prompts:\n",
    "    _ = manifest_single_client.run(pmt, max_tokens=30)\n",
    "print(f\"For loop: {time.time() - st :.2f}\")\n",
    "\n",
    "print(\"Running with async single client\")\n",
    "st = time.time()\n",
    "_ = asyncio.run(manifest_single_client.arun_batch(prompts, max_tokens=30, chunk_size=-1))\n",
    "print(f\"Async loop: {time.time() - st :.2f}\")\n",
    "\n",
    "print(\"Running with async two clients but not chunking\")\n",
    "st = time.time()\n",
    "_ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=-1))\n",
    "print(f\"Async loop: {time.time() - st :.2f}\")\n",
    "\n",
    "print(\"Running with async two clients and chunk size\")\n",
    "st = time.time()\n",
    "_ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=20))\n",
    "print(f\"Async loop: {time.time() - st :.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "manifest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
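The timing cell's output hints at the scheduling: with `chunk_size=-1` the 400 prompts go out as a single task, while `chunk_size=20` yields 20 tasks that the round-robin pool spreads across clients. The chunking itself is essentially list slicing, roughly as below (illustrative only, not the library's implementation):

# Roughly how a prompt list becomes tasks under chunk_size (illustrative):
def chunk(prompts: list, chunk_size: int) -> list:
    if chunk_size <= 0:  # chunk_size=-1: one task holding every prompt
        return [prompts]
    return [prompts[i : i + chunk_size] for i in range(0, len(prompts), chunk_size)]

tasks = chunk([f"Tell me something interesting about {i}" for i in range(400)], 20)
print(len(tasks))  # 20 tasks, matching "Running 20 tasks across all clients."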
duckdb-nsql/manifest/examples/manifest_diffusers.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
duckdb-nsql/manifest/examples/manifest_embedding.ipynb
ADDED
@@ -0,0 +1,156 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use OpenAI\n",
    "\n",
    "Set your `OPENAI_API_KEY` environment variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'model_name': 'openaiembedding', 'engine': 'text-embedding-ada-002'}\n"
     ]
    }
   ],
   "source": [
    "from manifest import Manifest\n",
    "\n",
    "manifest = Manifest(client_name=\"openaiembedding\")\n",
    "print(manifest.client_pool.get_next_client().get_model_params())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1536,)\n"
     ]
    }
   ],
   "source": [
    "emb = manifest.run(\"Is this an embedding?\")\n",
    "print(emb.shape)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using Locally Hosted Huggingface LM\n",
    "\n",
    "Run\n",
    "```\n",
    "python3 manifest/api/app.py --model_type huggingface --model_name_or_path EleutherAI/gpt-neo-125M --device 0\n",
    "```\n",
    "or\n",
    "```\n",
    "python3 manifest/api/app.py --model_type sentence_transformers --model_name_or_path all-mpnet-base-v2 --device 0\n",
    "```\n",
    "\n",
    "in a separate `screen` or `tmux`. Make sure to note the port. You can change this with `export FLASK_PORT=<port>`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'model_name': 'all-mpnet-base-v2', 'model_path': 'all-mpnet-base-v2', 'client_name': 'huggingfaceembedding'}\n"
     ]
    }
   ],
   "source": [
    "from manifest import Manifest\n",
    "\n",
    "# Locally hosted embedding model (all-mpnet-base-v2)\n",
    "manifest = Manifest(\n",
    "    client_name=\"huggingfaceembedding\",\n",
    "    client_connection=\"http://127.0.0.1:6000\",\n",
    "    cache_name=\"sqlite\",\n",
    "    cache_connection=\"my_sqlite_manifest.sqlite\"\n",
    ")\n",
    "print(manifest.client_pool.get_next_client().get_model_params())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(768,)\n",
      "(768,) (768,)\n"
     ]
    }
   ],
   "source": [
    "emb = manifest.run(\"Is this an embedding?\")\n",
    "print(emb.shape)\n",
    "\n",
    "emb = manifest.run([\"Is this an embedding?\", \"Bananas!!!\"])\n",
    "print(emb[0].shape, emb[1].shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "manifest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
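Since both backends return plain numpy arrays, comparing two texts is a one-liner once the embeddings are in hand. A small sketch (`cosine` is a hypothetical helper, not part of manifest):

import numpy as np

# Cosine similarity between two embeddings returned by manifest.run above.
def cosine(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# e.g. emb = manifest.run(["Is this an embedding?", "Bananas!!!"])
# print(cosine(emb[0], emb[1]))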
duckdb-nsql/manifest/examples/manifest_google.ipynb
ADDED
@@ -0,0 +1,117 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "GOOGLE_KEY = \"KEY::PROJECT_ID\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use GoogleVertexAPI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "\n",
    "google_bison = ClientConnection(\n",
    "    client_name=\"google\",\n",
    "    client_connection=GOOGLE_KEY\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[google_bison])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simple question\n",
    "print(manifest.run(\"What is your name\", max_tokens=40))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "\n",
    "google_bison = ClientConnection(\n",
    "    client_name=\"googlechat\",\n",
    "    client_connection=GOOGLE_KEY\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[google_bison])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "chat_dict = [\n",
    "    # {\"author\": \"bot\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"author\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
    "    {\"author\": \"bot\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
    "    {\"author\": \"user\", \"content\": \"Where was it played?\"}\n",
    "]\n",
    "print(manifest.run(chat_dict, max_tokens=8))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "manifest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
duckdb-nsql/manifest/examples/manifest_openrouter.ipynb
ADDED
@@ -0,0 +1,108 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "OPENROUTER_API_KEY = \"sk-...\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use OpenRouter\n",
    "\n",
    "Set your `OPENROUTER_API_KEY` environment variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "\n",
    "openai_chat = ClientConnection(\n",
    "    client_name=\"openrouter\",\n",
    "    client_connection=OPENROUTER_API_KEY,\n",
    "    engine=\"meta-llama/codellama-70b-instruct\"\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[openai_chat])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020 World Series was played at the Globe Life Field in Arlington, Texas.\n"
     ]
    }
   ],
   "source": [
    "# Simple question\n",
    "chat_dict = [\n",
    "    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
    "    {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
    "    {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
    "]\n",
    "print(manifest.run(chat_dict, max_tokens=100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
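The same `chat_dict` shape recurs across these notebooks. A tiny hypothetical helper (not part of manifest) that builds it from a system message plus alternating user/assistant turns:

# Hypothetical convenience builder for the chat message lists used above.
def chat(system: str, *turns: str) -> list:
    messages = [{"role": "system", "content": system}]
    roles = ("user", "assistant")
    for i, content in enumerate(turns):
        messages.append({"role": roles[i % 2], "content": content})
    return messages

chat_dict = chat(
    "You are a helpful assistant.",
    "Who won the world series in 2020?",
    "The Los Angeles Dodgers won the World Series in 2020.",
    "Where was it played?",
)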
duckdb-nsql/manifest/examples/manifest_streaming.ipynb
ADDED
@@ -0,0 +1,105 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "OPENAI_KEY = \"sk-XXX\""
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use ChatOpenAI\n",
    "\n",
    "Set your `OPENAI_API_KEY` environment variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "from manifest.connections.client_pool import ClientConnection\n",
    "\n",
    "openai_chat = ClientConnection(\n",
    "    client_name=\"openaichat\",\n",
    "    client_connection=OPENAI_KEY,\n",
    "    engine=\"gpt-3.5-turbo\"\n",
    ")\n",
    "\n",
    "manifest = Manifest(client_pool=[openai_chat])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "manifest_iterator = manifest.run(\"Tell me a story about a fat cat.\\n\\nOnce upon a time\", max_tokens=200, stream=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "cur_line_length = 0\n",
    "# Iterate over stream\n",
    "for res in manifest_iterator:\n",
    "    sys.stdout.write(res)\n",
    "    cur_line_length += len(res)\n",
    "    if cur_line_length > 80:\n",
    "        sys.stdout.write(\"\\n\")\n",
    "        cur_line_length = 0"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "manifest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
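A variant of the wrapping loop above buffers the streamed tokens first and wraps once at the end (it assumes the `manifest` client built in the notebook; note `textwrap.fill` collapses the prompt's paragraph breaks, unlike the incremental version):

import textwrap

# Buffer the stream, then wrap to 80 columns in one pass.
chunks = []
for res in manifest.run(
    "Tell me a story about a fat cat.\n\nOnce upon a time", max_tokens=200, stream=True
):
    chunks.append(res)
print(textwrap.fill("".join(chunks), width=80))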
duckdb-nsql/manifest/examples/manifest_together.ipynb
ADDED
@@ -0,0 +1,106 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: TOMA_URL=<TOMA_URL>\n"
     ]
    }
   ],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "%env TOMA_URL=<TOMA_URL>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "\n",
    "# The responses are not fast\n",
    "manifest = Manifest(\n",
    "    client_name=\"toma\",\n",
    ")\n",
    "\n",
    "print(manifest.run(\"What is the color of an apple?\"))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With a cache"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from manifest import Manifest\n",
    "\n",
    "# The responses are not fast\n",
    "manifest = Manifest(\n",
    "    client_name=\"toma\",\n",
    "    cache_name=\"sqlite\",\n",
    "    cache_connection=\"my_manifest_cache.sqlite\",\n",
    ")\n",
    "\n",
    "res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
    "print(res.get_response())\n",
    "print(\"Is Cached?\", res.is_cached())\n",
    "\n",
    "res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
    "print(res.get_response())\n",
    "print(\"Is Cached?\", res.is_cached())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "manifest",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
duckdb-nsql/manifest/manifest/__init__.py
ADDED
@@ -0,0 +1,6 @@
"""Manifest init."""
from manifest.manifest import Manifest
from manifest.request import Request
from manifest.response import Response

__all__ = ["Manifest", "Response", "Request"]
duckdb-nsql/manifest/manifest/api/__init__.py
ADDED
@@ -0,0 +1 @@
"""Api init."""
duckdb-nsql/manifest/manifest/api/app.py
ADDED
@@ -0,0 +1,301 @@
1 |
+
"""Flask app."""
|
2 |
+
import argparse
|
3 |
+
import io
|
4 |
+
import json
|
5 |
+
import logging
|
6 |
+
import os
|
7 |
+
import socket
|
8 |
+
from typing import Dict
|
9 |
+
|
10 |
+
import pkg_resources
|
11 |
+
from flask import Flask, Response, request
|
12 |
+
|
13 |
+
from manifest.api.models.diffuser import DiffuserModel
|
14 |
+
from manifest.api.models.huggingface import (
|
15 |
+
MODEL_GENTYPE_REGISTRY,
|
16 |
+
CrossModalEncoderModel,
|
17 |
+
TextGenerationModel,
|
18 |
+
)
|
19 |
+
from manifest.api.models.sentence_transformer import SentenceTransformerModel
|
20 |
+
from manifest.api.response import ModelResponse
|
21 |
+
|
22 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
23 |
+
|
24 |
+
logger = logging.getLogger(__name__)
|
25 |
+
app = Flask(__name__) # define app using Flask
|
26 |
+
# Will be global
|
27 |
+
model = None
|
28 |
+
model_type = None
|
29 |
+
PORT = int(os.environ.get("FLASK_PORT", 5000))
|
30 |
+
MODEL_CONSTRUCTORS = {
|
31 |
+
"huggingface": TextGenerationModel,
|
32 |
+
"sentence_transformers": SentenceTransformerModel,
|
33 |
+
"huggingface_crossmodal": CrossModalEncoderModel,
|
34 |
+
"diffuser": DiffuserModel,
|
35 |
+
}
|
36 |
+
|
37 |
+
|
38 |
+
def parse_args() -> argparse.Namespace:
|
39 |
+
"""Generate args."""
|
40 |
+
parser = argparse.ArgumentParser(description="Model args")
|
41 |
+
parser.add_argument(
|
42 |
+
"--model_type",
|
43 |
+
default=None,
|
44 |
+
type=str,
|
45 |
+
required=True,
|
46 |
+
help="Model type used for finding constructor.",
|
47 |
+
choices=MODEL_CONSTRUCTORS.keys(),
|
48 |
+
)
|
49 |
+
parser.add_argument(
|
50 |
+
"--model_generation_type",
|
51 |
+
default=None,
|
52 |
+
type=str,
|
53 |
+
help="Model generation type.",
|
54 |
+
choices=MODEL_GENTYPE_REGISTRY.keys(),
|
55 |
+
)
|
56 |
+
parser.add_argument(
|
57 |
+
"--model_name_or_path",
|
58 |
+
default=None,
|
59 |
+
type=str,
|
60 |
+
help="Name of model or path to model. Used in initialize of model class.",
|
61 |
+
)
|
62 |
+
parser.add_argument(
|
63 |
+
"--cache_dir", default=None, type=str, help="Cache directory for models."
|
64 |
+
)
|
65 |
+
parser.add_argument(
|
66 |
+
"--device", type=int, default=0, help="Model device. -1 for CPU."
|
67 |
+
)
|
68 |
+
parser.add_argument(
|
69 |
+
"--fp16", action="store_true", help="Force use fp16 for model params."
|
70 |
+
)
|
71 |
+
parser.add_argument(
|
72 |
+
"--percent_max_gpu_mem_reduction",
|
73 |
+
type=float,
|
74 |
+
default=0.85,
|
75 |
+
help="Used with accelerate multigpu. Scales down max memory.",
|
76 |
+
)
|
77 |
+
parser.add_argument(
|
78 |
+
"--use_bitsandbytes",
|
79 |
+
action="store_true",
|
80 |
+
help=("Use bits and bytes. " "This will override --device parameter."),
|
81 |
+
)
|
82 |
+
parser.add_argument(
|
83 |
+
"--use_accelerate_multigpu",
|
84 |
+
action="store_true",
|
85 |
+
help=(
|
86 |
+
"Use accelerate for multi gpu inference. "
|
87 |
+
"This will override --device parameter."
|
88 |
+
),
|
89 |
+
)
|
90 |
+
parser.add_argument(
|
91 |
+
"--use_hf_parallelize",
|
92 |
+
action="store_true",
|
93 |
+
help=(
|
94 |
+
"Use HF parallelize for multi gpu inference. "
|
95 |
+
"This will override --device parameter."
|
96 |
+
),
|
97 |
+
)
|
98 |
+
parser.add_argument(
|
99 |
+
"--use_deepspeed",
|
100 |
+
action="store_true",
|
101 |
+
help=("Use deepspeed. This will override --device parameter."),
|
102 |
+
)
|
103 |
+
args = parser.parse_args()
|
104 |
+
return args
|
105 |
+
|
106 |
+
|
107 |
+
def is_port_in_use(port: int) -> bool:
|
108 |
+
"""Check if port is in use."""
|
109 |
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
110 |
+
return s.connect_ex(("localhost", port)) == 0
|
111 |
+
|
112 |
+
|
113 |
+
def main() -> None:
    """Run main."""
    kwargs = parse_args()
    if is_port_in_use(PORT):
        raise ValueError(f"Port {PORT} is already in use.")
    global model_type
    model_type = kwargs.model_type
    model_gen_type = kwargs.model_generation_type
    model_name_or_path = kwargs.model_name_or_path
    if not model_name_or_path:
        raise ValueError("Must provide model_name_or_path.")
    if kwargs.use_accelerate_multigpu:
        logger.info("Using accelerate. Overriding --device argument.")
    if (
        kwargs.percent_max_gpu_mem_reduction <= 0
        or kwargs.percent_max_gpu_mem_reduction > 1
    ):
        raise ValueError("percent_max_gpu_mem_reduction must be in (0, 1].")
    # Only one multi-GPU / quantization strategy may be enabled at a time.
    if (
        sum(
            [
                kwargs.use_accelerate_multigpu,
                kwargs.use_hf_parallelize,
                kwargs.use_bitsandbytes,
                kwargs.use_deepspeed,
            ]
        )
        > 1
    ):
        raise ValueError(
            "Only one of use_accelerate_multigpu, use_hf_parallelize, "
            "use_bitsandbytes, and use_deepspeed can be set."
        )
    # Global model
    global model
    model = MODEL_CONSTRUCTORS[model_type](
        model_name_or_path,
        model_type=model_gen_type,
        cache_dir=kwargs.cache_dir,
        device=kwargs.device,
        use_accelerate=kwargs.use_accelerate_multigpu,
        use_parallelize=kwargs.use_hf_parallelize,
        use_bitsandbytes=kwargs.use_bitsandbytes,
        use_deepspeed=kwargs.use_deepspeed,
        perc_max_gpu_mem_red=kwargs.percent_max_gpu_mem_reduction,
        use_fp16=kwargs.fp16,
    )
    app.run(host="0.0.0.0", port=PORT)

@app.route("/completions", methods=["POST"])
|
164 |
+
def completions() -> Response:
|
165 |
+
"""Get completions for generation."""
|
166 |
+
prompt = request.json["prompt"]
|
167 |
+
del request.json["prompt"]
|
168 |
+
generation_args = request.json
|
169 |
+
|
170 |
+
if not isinstance(prompt, (str, list)):
|
171 |
+
raise ValueError("Prompt must be a str or list of str")
|
172 |
+
try:
|
173 |
+
result_gens = []
|
174 |
+
for generations in model.generate(prompt, **generation_args):
|
175 |
+
result_gens.append(generations)
|
176 |
+
if model_type == "diffuser":
|
177 |
+
# Assign None logprob as it's not supported in diffusers
|
178 |
+
results = [
|
179 |
+
{"array": r[0], "logprob": None, "tokens": None, "token_logprobs": None}
|
180 |
+
for r in result_gens
|
181 |
+
]
|
182 |
+
res_type = "image_generation"
|
183 |
+
else:
|
184 |
+
results = [
|
185 |
+
{"text": r[0], "logprob": r[1], "tokens": r[2], "token_logprobs": r[3]}
|
186 |
+
for r in result_gens
|
187 |
+
]
|
188 |
+
res_type = "text_completion"
|
189 |
+
# transform the result into the openai format
|
190 |
+
return Response(
|
191 |
+
json.dumps(ModelResponse(results, response_type=res_type).__dict__()),
|
192 |
+
status=200,
|
193 |
+
)
|
194 |
+
except Exception as e:
|
195 |
+
logger.error(e)
|
196 |
+
return Response(
|
197 |
+
json.dumps({"message": str(e)}),
|
198 |
+
status=400,
|
199 |
+
)
|
200 |
+
|
201 |
+
|
202 |
+
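A minimal client sketch for the endpoint above (assumptions: the server is already running, port 5000 stands in for the PORT constant defined earlier in the file, and max_tokens is just an example of a pass-through generation kwarg whose support depends on the underlying model class):

import requests

# Every key other than "prompt" is forwarded to model.generate as a kwarg.
resp = requests.post(
    "http://localhost:5000/completions",  # illustrative port; see PORT above
    json={"prompt": "SELECT * FROM", "max_tokens": 32},
)
resp.raise_for_status()
print(resp.json())  # OpenAI-style payload built by ModelResponse
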
@app.route("/embed", methods=["POST"])
|
203 |
+
def embed() -> Response:
|
204 |
+
"""Get embed for generation."""
|
205 |
+
if "modality" in request.json:
|
206 |
+
modality = request.json["modality"]
|
207 |
+
else:
|
208 |
+
modality = "text"
|
209 |
+
if modality == "text":
|
210 |
+
prompts = request.json["prompt"]
|
211 |
+
elif modality == "image":
|
212 |
+
import base64
|
213 |
+
|
214 |
+
from PIL import Image
|
215 |
+
|
216 |
+
prompts = [
|
217 |
+
Image.open(io.BytesIO(base64.b64decode(data)))
|
218 |
+
for data in request.json["prompt"]
|
219 |
+
]
|
220 |
+
else:
|
221 |
+
raise ValueError("modality must be text or image")
|
222 |
+
|
223 |
+
try:
|
224 |
+
results = []
|
225 |
+
embeddings = model.embed(prompts)
|
226 |
+
for embedding in embeddings:
|
227 |
+
results.append(
|
228 |
+
{
|
229 |
+
"array": embedding,
|
230 |
+
"logprob": None,
|
231 |
+
"tokens": None,
|
232 |
+
"token_logprobs": None,
|
233 |
+
}
|
234 |
+
)
|
235 |
+
|
236 |
+
return Response(
|
237 |
+
json.dumps(
|
238 |
+
ModelResponse(results, response_type="embedding_generation").__dict__()
|
239 |
+
),
|
240 |
+
status=200,
|
241 |
+
)
|
242 |
+
except Exception as e:
|
243 |
+
logger.error(e)
|
244 |
+
return Response(
|
245 |
+
json.dumps({"message": str(e)}),
|
246 |
+
status=400,
|
247 |
+
)
|
248 |
+
|
249 |
+
|
250 |
+
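Since the image branch above base64-decodes each prompt entry before handing it to PIL, a client would encode raw image bytes accordingly. A sketch (photo.png is a hypothetical local file; port 5000 again stands in for PORT):

import base64

import requests

with open("photo.png", "rb") as f:  # hypothetical input image
    encoded = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:5000/embed",  # illustrative port
    json={"modality": "image", "prompt": [encoded]},
)
print(resp.json())  # embeddings appear under each result's "array" field
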
@app.route("/score_sequence", methods=["POST"])
|
251 |
+
def score_sequence() -> Response:
|
252 |
+
"""Get logprob of prompt."""
|
253 |
+
prompt = request.json["prompt"]
|
254 |
+
del request.json["prompt"]
|
255 |
+
generation_args = request.json
|
256 |
+
|
257 |
+
if not isinstance(prompt, (str, list)):
|
258 |
+
raise ValueError("Prompt must be a str or list of str")
|
259 |
+
|
260 |
+
try:
|
261 |
+
score_list = model.score_sequence(prompt, **generation_args)
|
262 |
+
results = [
|
263 |
+
{
|
264 |
+
"text": prompt if isinstance(prompt, str) else prompt[i],
|
265 |
+
"logprob": r[0],
|
266 |
+
"tokens": r[1],
|
267 |
+
"token_logprobs": r[2],
|
268 |
+
}
|
269 |
+
for i, r in enumerate(score_list)
|
270 |
+
]
|
271 |
+
# transform the result into the openai format
|
272 |
+
return Response(
|
273 |
+
json.dumps(
|
274 |
+
ModelResponse(results, response_type="prompt_logit_score").__dict__()
|
275 |
+
),
|
276 |
+
status=200,
|
277 |
+
)
|
278 |
+
except Exception as e:
|
279 |
+
logger.error(e)
|
280 |
+
return Response(
|
281 |
+
json.dumps({"message": str(e)}),
|
282 |
+
status=400,
|
283 |
+
)
|
284 |
+
|
285 |
+
|
286 |
+
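And a corresponding client sketch for scoring (same assumptions about host and port; the exact shape of the JSON response is whatever ModelResponse serializes to):

import requests

resp = requests.post(
    "http://localhost:5000/score_sequence",  # illustrative port
    json={"prompt": ["The quick brown fox", "jumps over the lazy dog"]},
)
print(resp.json())  # per-prompt logprob, tokens, and token_logprobs
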
@app.route("/params", methods=["POST"])
|
287 |
+
def params() -> Dict:
|
288 |
+
"""Get model params."""
|
289 |
+
return model.get_init_params()
|
290 |
+
|
291 |
+
|
292 |
+
@app.route("/")
|
293 |
+
def index() -> str:
|
294 |
+
"""Get index completion."""
|
295 |
+
fn = pkg_resources.resource_filename("metaseq", "service/index.html")
|
296 |
+
with open(fn) as f:
|
297 |
+
return f.read()
|
298 |
+
|
299 |
+
|
300 |
+
if __name__ == "__main__":
|
301 |
+
main()
|