tdoehmen committed on
Commit b247dc4
1 Parent(s): 47bda6e

added duckdb-nsql benchmark setup

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitignore +4 -0
  2. duckdb-nsql/LICENSE +201 -0
  3. duckdb-nsql/README.md +63 -0
  4. duckdb-nsql/eval/README.md +90 -0
  5. duckdb-nsql/eval/constants.py +37 -0
  6. duckdb-nsql/eval/data/databases/flightinfo/flightinfo.duckdb +0 -0
  7. duckdb-nsql/eval/data/databases/hn/hn.duckdb +0 -0
  8. duckdb-nsql/eval/data/databases/json/json.duckdb +0 -0
  9. duckdb-nsql/eval/data/databases/laptop/laptop.duckdb +0 -0
  10. duckdb-nsql/eval/data/databases/laptop_array/laptop_array.duckdb +0 -0
  11. duckdb-nsql/eval/data/databases/laptop_json/laptop_json.duckdb +0 -0
  12. duckdb-nsql/eval/data/databases/laptop_struct/laptop_struct.duckdb +0 -0
  13. duckdb-nsql/eval/data/databases/none/none.duckdb +0 -0
  14. duckdb-nsql/eval/data/databases/nyc/nyc.duckdb +0 -0
  15. duckdb-nsql/eval/data/databases/product/product.duckdb +0 -0
  16. duckdb-nsql/eval/data/databases/transactions/transactions.duckdb +0 -0
  17. duckdb-nsql/eval/data/databases/who/who.duckdb +0 -0
  18. duckdb-nsql/eval/data/dev.json +602 -0
  19. duckdb-nsql/eval/data/tables.json +3465 -0
  20. duckdb-nsql/eval/data_utils.py +79 -0
  21. duckdb-nsql/eval/doc_retriever.py +92 -0
  22. duckdb-nsql/eval/evaluate.py +392 -0
  23. duckdb-nsql/eval/get_manifest.py +29 -0
  24. duckdb-nsql/eval/loaders.py +78 -0
  25. duckdb-nsql/eval/metric_utils.py +40 -0
  26. duckdb-nsql/eval/metrics/test_suite_sql_eval +1 -0
  27. duckdb-nsql/eval/predict.py +345 -0
  28. duckdb-nsql/eval/prompt_formatters.py +1056 -0
  29. duckdb-nsql/eval/schema.py +115 -0
  30. duckdb-nsql/eval/text_to_sql.py +260 -0
  31. duckdb-nsql/manifest/.flake8 +11 -0
  32. duckdb-nsql/manifest/.pre-commit-config.yaml +23 -0
  33. duckdb-nsql/manifest/CHANGELOG.rst +93 -0
  34. duckdb-nsql/manifest/LICENSE +201 -0
  35. duckdb-nsql/manifest/Makefile +27 -0
  36. duckdb-nsql/manifest/README.md +304 -0
  37. duckdb-nsql/manifest/examples/langchain_chatgpt.ipynb +455 -0
  38. duckdb-nsql/manifest/examples/manifest_async.py +27 -0
  39. duckdb-nsql/manifest/examples/manifest_azure.ipynb +149 -0
  40. duckdb-nsql/manifest/examples/manifest_chatgpt.ipynb +101 -0
  41. duckdb-nsql/manifest/examples/manifest_connection_pool.ipynb +208 -0
  42. duckdb-nsql/manifest/examples/manifest_diffusers.ipynb +0 -0
  43. duckdb-nsql/manifest/examples/manifest_embedding.ipynb +156 -0
  44. duckdb-nsql/manifest/examples/manifest_google.ipynb +117 -0
  45. duckdb-nsql/manifest/examples/manifest_openrouter.ipynb +108 -0
  46. duckdb-nsql/manifest/examples/manifest_streaming.ipynb +105 -0
  47. duckdb-nsql/manifest/examples/manifest_together.ipynb +106 -0
  48. duckdb-nsql/manifest/manifest/__init__.py +6 -0
  49. duckdb-nsql/manifest/manifest/api/__init__.py +1 -0
  50. duckdb-nsql/manifest/manifest/api/app.py +301 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
.DS_Store
__pycache__/
*.egg-info
.ipynb_checkpoints/
duckdb-nsql/LICENSE ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
duckdb-nsql/README.md ADDED
@@ -0,0 +1,63 @@
# DuckDB-NSQL
Numbers Station Text to SQL model for DuckDB.

NSQL is a family of autoregressive open-source foundation models (FMs) designed specifically for SQL generation tasks. In this repository, we are thrilled to introduce DuckDB-NSQL, an FM tailored for local DuckDB SQL analytics tasks. All model weights can be found on HuggingFace.

| Model Name                             | Size | Link                                                                 |
| -------------------------------------- | ---- | -------------------------------------------------------------------- |
| motherduckdb/DuckDB-NSQL-7B-v0.1       | 7B   | [link](https://huggingface.co/motherduckdb/DuckDB-NSQL-7B-v0.1)      |
| motherduckdb/DuckDB-NSQL-7B-v0.1-GGUF  | 7B   | [link](https://huggingface.co/motherduckdb/DuckDB-NSQL-7B-v0.1-GGUF) |

## Setup
To install all the necessary dependencies, please run
```
pip install -r requirements.txt
```

## Usage
Please refer to the examples in the `examples/` folder to learn how to connect to a local DuckDB database and query your data directly; a simple notebook is provided there for reference.

To host the model with llama.cpp, please execute the following:

```python
# Import necessary modules
from llama_cpp import Llama
from wurlitzer import pipes

# Set up the client with the model path and context size
with pipes() as (out, err):
    client = Llama(
        model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
        n_ctx=2048,
    )
```

To load a DuckDB database and query against it, please execute the following:

```python
# Import necessary modules
import duckdb
from utils import generate_sql

# Connect to the DuckDB database
con = duckdb.connect("nyc.duckdb")

# Sample question for SQL generation
question = "alter taxi table and add struct column with name test and keys a:int, b:double"

# Generate SQL, check validity, and print it
sql = generate_sql(question, con, client)
print(sql)
```

## Training Data

The training data for this model consists of two parts: 1) 200k synthetically generated DuckDB SQL queries, based on the DuckDB v0.9.2 documentation, and 2) labeled text-to-SQL pairs from [NSText2SQL](https://huggingface.co/datasets/NumbersStation/NSText2SQL) transpiled to DuckDB SQL using [sqlglot](https://github.com/tobymao/sqlglot).

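As a concrete illustration of the second data source, sqlglot can transpile a query from a source dialect into DuckDB SQL in a single call. The input query below is a made-up example, but `sqlglot.transpile` with `read`/`write` dialect names is the library's standard API:

```python
import sqlglot

# Convert a SQLite-flavored query (as found in NSText2SQL sources)
# into the DuckDB dialect; transpile returns a list of statements.
duckdb_sql = sqlglot.transpile(
    "SELECT name FROM users ORDER BY RANDOM() LIMIT 5",
    read="sqlite",
    write="duckdb",
)[0]
print(duckdb_sql)
```
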
## Evaluate the benchmark

Please refer to the `eval/` folder for details on evaluating the model against our proposed DuckDB benchmark.

## Acknowledgement

We would like to express our appreciation to all authors of the evaluation scripts. Their work made this project possible.
duckdb-nsql/eval/README.md ADDED
@@ -0,0 +1,90 @@
This folder contains the suite for evaluating the DuckDB-Text2SQL model.

Please install the dependencies listed in the requirements.txt file located in the parent folder.

## Setup
To evaluate against the benchmark dataset, first clone the test-suite evaluation script into a `metrics` folder:

```
mkdir metrics
cd metrics
git clone git@github.com:ElementAI/test-suite-sql-eval.git test_suite_sql_eval
cd ..
```

To evaluate against DuckDB, add a new remote inside the test_suite_sql_eval folder and check out the latest duckdb-only branch (640a12975abf75a94e917caca149d56dbc6bcdd7):

```
git remote add till https://github.com/tdoehmen/test-suite-sql-eval.git
git fetch till
git checkout till/duckdb-only
```

Next, prepare the docs for retrieval:
```
mkdir docs
cd docs
git clone https://github.com/duckdb/duckdb-web.git
cd ..
```

#### Dataset
The benchmark dataset is located in the `data/` folder and includes all databases (`data/databases`), table schemas (`data/tables.json`), and examples (`data/dev.json`).

#### Eval
Start a manifest session with the model you want to evaluate.

```bash
python -m manifest.api.app \
    --model_type huggingface \
    --model_generation_type text-generation \
    --model_name_or_path motherduckdb/DuckDB-NSQL-7B-v0.1 \
    --fp16 \
    --device 0
```

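Before launching the full prediction run, it can help to smoke-test the session with the manifest Python client bundled in this commit. This is a minimal sketch, assuming the default connection settings shown above; the prompt string is a throwaway example:

```python
from manifest import Manifest

# Connect to the manifest session started by manifest.api.app above.
manifest = Manifest(
    client_name="huggingface",
    client_connection="http://localhost:5000",
)

# Any short prompt works as a smoke test; this one is arbitrary.
print(manifest.run("SELECT 1;", max_tokens=16))
```
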
Then, from the `DuckDB-NSQL` main folder, run:

```bash
python eval/predict.py \
    predict \
    eval/data/dev.json \
    eval/data/tables.json \
    --output-dir output/ \
    --stop-tokens ';' \
    --stop-tokens '--' \
    --stop-tokens '```' \
    --stop-tokens '###' \
    --overwrite-manifest \
    --manifest-client huggingface \
    --manifest-connection http://localhost:5000 \
    --prompt-format duckdbinst
```
This will format the prompt using the duckdbinst style.

To evaluate the predictions, first run the following in a Python shell to install and load the required DuckDB extensions:

```python
try:
    import duckdb

    con = duckdb.connect()
    con.install_extension("httpfs")
    con.load_extension("httpfs")
except Exception as e:
    print(f"Error loading duckdb extensions: {e}")
```

Then, run the evaluation script:

```bash
python eval/evaluate.py \
    evaluate \
    --gold eval/data/dev.json \
    --db eval/data/databases/ \
    --tables eval/data/tables.json \
    --output-dir output/ \
    --pred [PREDICTION_FILE]
```

All of the output information is located in the prediction file in the `[output-dir]`; there, `query` is the gold SQL and `pred` is the predicted SQL.
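For intuition about what a single benchmark example involves: each dev.json entry pairs a question with a `setup_sql` statement, a gold `query`, and a `validation_sql` that typically reads from a `ddb_benchmark_result` table. The sketch below shows one plausible execution-match loop over those fields; it is a simplification with hypothetical helper names, and the real logic (including proper database isolation) lives in eval/evaluate.py and the test_suite_sql_eval harness:

```python
import duckdb

def validation_result(db_path, setup_sql, sql, validation_sql):
    """Hypothetical helper: run an example's setup, execute one candidate
    SQL, and return what validation_sql observes afterwards."""
    con = duckdb.connect(db_path)
    try:
        con.execute(setup_sql)
        if sql.lstrip().upper().startswith(("SELECT", "WITH", "FROM")):
            # Expose the query's result set under the table name that
            # most validation_sql entries read from.
            con.execute(f"CREATE OR REPLACE TABLE ddb_benchmark_result AS {sql}")
        else:
            con.execute(sql)  # DDL, COPY, SET, PRAGMA etc. mutate state directly
        return con.execute(validation_sql).fetchall()
    finally:
        con.close()

def execution_match(db_path, example, pred_sql):
    # The prediction counts as correct when validation over the predicted
    # SQL returns the same rows as validation over the gold query.
    gold = validation_result(db_path, example["setup_sql"],
                             example["query"], example["validation_sql"])
    pred = validation_result(db_path, example["setup_sql"],
                             pred_sql, example["validation_sql"])
    return gold == pred
```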
duckdb-nsql/eval/constants.py ADDED
@@ -0,0 +1,37 @@
"""Constants."""

from prompt_formatters import (
    DuckDBFormatter,
    MotherDuckFormatter,
    DuckDBInstFormatter,
    DuckDBInstNoShorthandFormatter,
    RajkumarFormatter,
    DuckDBChat,
    DuckDBInstFormatterLlamaShort,
    DuckDBInstFormatterGraniteShort,
    DuckDBInstFormatterLlama,
    DuckDBInstFormatterLlamaBasic,
    DuckDBInstFormatterGranite,
    DuckDBInstFormatterPhi,
    DuckDBInstFormatterGPTmini,
    DuckDBInstFormatterPhiAzure,
    DuckDBInstFormatterLlamaSyntax,
)

# Registry mapping each --prompt-format CLI name to its formatter class.
PROMPT_FORMATTERS = {
    "rajkumar": RajkumarFormatter,
    "duckdb": DuckDBFormatter,
    "motherduck": MotherDuckFormatter,
    "duckdbinst": DuckDBInstFormatter,
    "duckdbinstllamashort": DuckDBInstFormatterLlamaShort,
    "duckdbinstgraniteshort": DuckDBInstFormatterGraniteShort,
    "duckdbinstllama": DuckDBInstFormatterLlama,
    "duckdbinstgranite": DuckDBInstFormatterGranite,
    "duckdbinstnoshort": DuckDBInstNoShorthandFormatter,
    "duckdbchat": DuckDBChat,
    "duckdbinstphi": DuckDBInstFormatterPhi,
    "duckdbinstgptmini": DuckDBInstFormatterGPTmini,
    "duckdbinstphiazure": DuckDBInstFormatterPhiAzure,
    "duckdbinstllamabasic": DuckDBInstFormatterLlamaBasic,
    "duckdbinstllamasyntax": DuckDBInstFormatterLlamaSyntax,
}
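For context on how this registry is used: the `--prompt-format` value passed to eval/predict.py (e.g. `duckdbinst` in the eval/README example command) selects one of these formatter classes by key. A minimal lookup sketch, with constructor arguments omitted since they are defined in prompt_formatters.py:

```python
from constants import PROMPT_FORMATTERS

# Resolve the formatter class behind the --prompt-format CLI flag.
formatter_cls = PROMPT_FORMATTERS["duckdbinst"]
print(formatter_cls.__name__)  # DuckDBInstFormatter
```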
duckdb-nsql/eval/data/databases/flightinfo/flightinfo.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/hn/hn.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/json/json.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/databases/laptop/laptop.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/databases/laptop_array/laptop_array.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/laptop_json/laptop_json.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/laptop_struct/laptop_struct.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/none/none.duckdb ADDED
Binary file (12.3 kB).
 
duckdb-nsql/eval/data/databases/nyc/nyc.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/product/product.duckdb ADDED
Binary file (274 kB).
 
duckdb-nsql/eval/data/databases/transactions/transactions.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/databases/who/who.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/dev.json ADDED
@@ -0,0 +1,602 @@
[
    {
        "db_id": "hn",
        "query": "SELECT COUNT(*) as domain_count, \nSUBSTRING(SPLIT_PART(url, '//', 2), 1, POSITION('/' IN SPLIT_PART(url, '//', 2)) - 1) as domain \nFROM hacker_news\nWHERE url IS NOT NULL GROUP BY domain ORDER BY domain_count DESC LIMIT 10;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "what are the top domains being shared on hacker_news?",
        "category": "hard"
    },
    {
        "db_id": "laptop",
        "query": "SELECT c.firstname, c.lastname, COUNT(*) AS num_pcs_bought\nFROM customers c\nJOIN sales s ON c.customer_id = s.customer_id\nJOIN pcs p ON s.model = p.model\nGROUP BY c.customer_id, c.firstname, c.lastname\nORDER BY num_pcs_bought DESC\nLIMIT 1;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Who bought the most PCs, print also the users name?",
        "category": "medium"
    },
    {
        "db_id": "transactions",
        "query": "select users.id, users.name, sum(transactions.amount) as balance from users join transactions on users.id = transactions.user_id group by users.id, users.name having balance = 0",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "list the names off account holders who have negative balances",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model FROM products WHERE maker = 'B';",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List only the model number of all products made by maker B.",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model FROM products WHERE maker <> 'B';",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List the model numbers of all products not made by maker B.",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT AVG(speed) FROM pcs WHERE speed >= 3.00",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return the average speed all PCs with speed >= 3.00",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT MAX(price) FROM printers WHERE color = 'TRUE' AND type='laser'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return the price of the most expensive color laser printer",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT MIN(paid) FROM sales WHERE type_of_payment LIKE '%visa%'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return the minimum amount paid by customers who used a visa card (debit or credit) to purchase a product",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT customer_id FROM customers WHERE firstname LIKE '%e%' OR lastname LIKE '%e%'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the customer_id of customers who have the letter 'e' either in their first name or in their last name",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model, price/0.85 AS 'price (USD)' FROM laptops WHERE ram >= 1024",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Assume all prices in the table Laptops are in Euro. List the prices of laptops with at least 1024 ram. You should return the price in USD in a column called 'price (USD)'. Assume that 1 USD = 0.85 EURO. Name the price column 'price (USD)'.",
        "category": "hard"
    },
    {
        "db_id": "laptop",
        "query": "SELECT maker FROM products GROUP BY maker HAVING COUNT(maker) > 4;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return a list of makers that make more than four different products.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model FROM laptops WHERE speed > 1.7 ORDER BY speed DESC;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List all the laptop model numbers that have a speed greater than 1.7 in descending order of speed.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT firstname \n FROM sales \n JOIN customers ON sales.customer_id = customers.customer_id \n GROUP BY firstname \n ORDER BY COUNT(firstname);",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List firstnames of customers in an ascending order based on the number of purchases made by customers with this firstname.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT DISTINCT maker FROM products JOIN pcs ON products.model = pcs.model WHERE ram > 1500;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List all the makers (with only one entry per maker) who make PCs with RAM greater than 1500.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT city, AVG(paid) as 'avg_spend' FROM sales JOIN customers ON sales.customer_id = customers.customer_id GROUP BY city",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the city and the average amount of money spent by customers in each city. Name the column for the amount 'avg_spend'",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT color, MAX(price) as 'max_price' FROM printers GROUP BY color;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the maximum price for each color of printer. Name the column for the maximum price 'max_price'",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select country_name, max(pm25_concentration) as worst_pm25_for_country\nfrom ambient_air_quality\ngroup by country_name\norder by worst_pm25_for_country desc\nlimit 1",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the country with the worst single reading of air quality (highest PM 2.5 value). Show the PM 2.5 value as well.",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select country_name, avg(pm25_concentration) as worst_avg_pm25_for_country\nfrom ambient_air_quality\ngroup by country_name\norder by worst_avg_pm25_for_country desc\nlimit 1",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the country with the worst average air quality (highest PM 2.5 value). Show the PM 2.5 value as well.",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select distinct country_name from ambient_air_quality order by country_name",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find all countries for which WHO air quality data is available. Sort alphabetically.",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select year, avg(pm25_concentration) from ambient_air_quality \nwhere country_name = 'Singapore'\ngroup by year\norder by year",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find Singapore air quality defined as PM2.5 concentration over time",
        "category": "medium"
    },
    {
        "db_id": "nyc",
        "query": "SELECT COLUMNS('^trip_') FROM rideshare LIMIT 10;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select only the column names from the rideshare table that start with trip_ and return the first 10 values",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "SELECT * FROM rideshare USING SAMPLE 1%;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select a 1% sample from the nyc.rideshare table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * EXCLUDE (customer_id) FROM customers;\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select all columns from the customer table, except customer_id",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "SUMMARIZE rideshare;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show summary statistics of the rideshare table",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "read a CSV from https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "COPY (SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'))\nTO 'titanic.parquet' (FORMAT 'parquet');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM 'titanic.parquet'",
        "question": "read a CSV from https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv and convert it to a parquet file called \"titanic\"",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "CREATE TABLE titanic AS (SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'))",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM titanic;",
        "question": "create a table called \"titanic\" from CSV file https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "PRAGMA default_null_order='NULLS LAST';",
        "setup_sql": ";",
        "validation_sql": "SELECT current_setting('default_null_order');",
        "question": "configure duckdb to put null values last when sorting",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "CREATE TABLE IF NOT EXISTS products (\n maker varchar(10),\n model varchar(10),\n type varchar(10));",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM products;",
        "question": "create a table about products, that contains a maker, model and type column",
        "category": "ddl"
    },
    {
        "db_id": "product",
        "query": "INSERT INTO products (maker, model, type)\nVALUES\n ('A', '1001', 'pc');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM products;",
        "question": "add a row with values for model \"1001\" of type \"pc\", from maker \"A\" to products table",
        "category": "ddl"
    },
    {
        "db_id": "none",
        "query": "CALL pragma_version();\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get current version of duckdb",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "PRAGMA table_info('rideshare');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "list all columns in table nyc.rideshare",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "PRAGMA show_tables;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show all tables in the curent database",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT customer_id, model, sum(paid) FROM sales GROUP BY ALL",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "how much did each customer spend per model type?",
        "category": "easy"
    },
    {
        "db_id": "nyc",
        "query": "SELECT Max(datediff('minute', tpep_pickup_datetime, tpep_dropoff_datetime)) from nyc.taxi",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "What was the longest taxi ride in minutes?",
        "category": "hard"
    },
    {
        "db_id": "who",
        "query": "with per_region as (\n select avg(pm10_concentration) as avg_pm10, who_region from ambient_air_quality group by who_region\n), max_region as (\n select who_region from per_region where avg_pm10 = (select max(avg_pm10) from per_region)\n), min_city_value_in_max_region as (\n select min(pm10_concentration) from ambient_air_quality where who_region in (from max_region)\n), min_city_in_max_region as (\n select city from ambient_air_quality where pm10_concentration in (from min_city_value_in_max_region) and who_region in (from max_region)\n)\nfrom min_city_in_max_region",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "What is the city with the lowest pm10 concentration in the region with the highest average pm10 concentration?",
        "category": "hard"
    },
    {
        "db_id": "hn",
        "query": "SELECT *, regexp_extract(text, '([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,63})',0) email from hacker_news where email[:4]='test'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Get all posts on hn that contain an email address starting with test. Return all original columns, plus a new column containing the email address.",
        "category": "hard"
    },
    {
        "db_id": "json",
        "query": "SELECT employee.id, employee.first_name FROM employee_json",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Extract id and first_name properties as individual columns from the employee struct",
        "category": "duckdb"
    },
    {
        "db_id": "who",
        "query": "SELECT who_region[1]::INT as region, * EXCLUDE (who_region) FROM who.ambient_air_quality",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "count quality measurements per region. Make sure to return the region code (first char of who_region) as integer and sort by region.",
        "category": "duckdb"
    },
    {
        "db_id": "flightinfo",
        "query": "SELECT seat.seat_number FROM seat \nJOIN direct_flight ON direct_flight.flight_number = seat.flight_number \nJOIN airport AS departure_airport ON departure_airport.iata_code = direct_flight.departure_airport_iata_code \nJOIN airport AS arriving_airport ON arriving_airport.iata_code = direct_flight.arriving_airport_iata_code \nJOIN city AS departure_city ON departure_city.city_zipcode = departure_airport.city_zip_code \nJOIN city AS arriving_city ON arriving_city.city_zipcode = arriving_airport.city_zip_code \nWHERE departure_city.city_name = 'Bruxelles' AND arriving_city.city_name = 'Newark';",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Which seats were available on the flight from Bruxelles to Newark?",
        "category": "hard"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers FROM 'customers_12_12_2023.csv';",
        "setup_sql": "COPY customers TO 'customers_12_12_2023.csv';",
        "validation_sql": "SELECT * FROM customers;",
        "question": "copy content of csv file customers_12_12_2023.csv into customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers FROM 'customers_12_12_2023.csv' (DELIMITER '\\t');",
        "setup_sql": "COPY customers TO 'customers_12_12_2023.csv' (FORMAT CSV, DELIMITER '\\t');",
        "validation_sql": "SELECT * FROM customers;",
        "question": "copy content of csv file costomers_12_12_2023.csv into customers table with tab separator",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers FROM 'customers_partitioned/city=Amsterdam/*.parquet';",
        "setup_sql": "COPY customers TO 'customers_partitioned' (FORMAT PARQUET, PARTITION_BY (city), OVERWRITE_OR_IGNORE True);",
        "validation_sql": "SELECT * FROM customers;;",
        "question": "copy any parquet files from 'customers_partitioned/city=Amsterdam/' into customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers(customer_id) FROM 'customers_customer_ids.csv';",
        "setup_sql": "COPY customers(customer_id) TO 'customers_customer_ids.csv';",
        "validation_sql": "SELECT * FROM customers;",
        "question": "copy only the customer_id column from the customers_customer_ids.csv into the customers tables",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "CREATE TABLE test_tbl AS SELECT * FROM read_json_auto('test.json');",
        "setup_sql": "COPY customers TO 'test.json'\n",
        "validation_sql": "SELECT * FROM test_tbl;",
        "question": "read json file from test.json and create new table from it called 'test_tbl'",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM read_csv_auto('test.csv');",
        "setup_sql": "COPY customers TO 'test.csv';",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "read csv from test.csv",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM read_csv_auto('test.csv', columns={'customer_id': 'VARCHAR', 'firstname': 'VARCHAR', 'lastname': 'VARCHAR'});",
        "setup_sql": "COPY customers(customer_id, firstname, lastname) TO 'test.csv';",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "read csv from test.csv with predefined column and types - customer_id: string, firstname: string, lastname: string",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * EXCLUDE (ram, hd) FROM pcs;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select all columns from pcs table except for ram and hd",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT COLUMNS('name$') FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select all columns ending with 'name' from customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT LENGTH(COLUMNS('name$')) FROM customers",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "for each column ending with 'name' in the customers table, compute the string length",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * REPLACE (upper(city) AS city) FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get all columns from customer table, and make all city names uppercase",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "EXPLAIN SELECT * FROM customers",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show query plan for query: SELECT * from customers",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT ascii(lastname) FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first character of the firstname column and cast it to an INT",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model, speed::INTEGER FROM laptops;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get laptop name and speed, return the speed as integer",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_array",
        "query": "SELECT phone_numbers[1] FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first phone number of each customer",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_array",
        "query": "INSERT INTO customers(customer_id, phone_numbers) VALUES (5, ['12312323', '23123344']);",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM customers;",
        "question": "insert two phone numbers to customer with id 5 [\\\"12312323\\\", and '23123344']",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "ALTER TABLE customers ADD COLUMN phone_numbers VARCHAR[];",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE customers;",
        "question": "how to add a new column phone_numbers to the customers table, with array type varchar",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT firstname[1] FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first letter of the customers firstname",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_array",
        "query": "SELECT phone_numbers[:2] FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first two phone numbers from the phone numbers array of each customer",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT {'a':1, 'b':2, 'c':3};",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "create a struct with keys a, b, c and values 1,2,3",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT [1,2,3];\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "create array with values 1,2,3",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "CREATE TABLE test (embeddings FLOAT[100]);",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE test;",
        "question": "create table test with a fix-sized array column with 100 dimenions, called embeddings",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "CREATE TABLE test (person STRUCT(name VARCHAR, id INTEGER));",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE test;",
        "question": "create table test with a struct column called person with properties name and id",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_struct",
        "query": "SELECT person.name, person.id FROM test;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get persons name and persons id from the test table.",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "UPDATE customers SET email = NULL;",
        "setup_sql": ";",
        "validation_sql": "SELECT email FROM customers;",
        "question": "remove all values from email column in customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_json",
        "query": "ALTER TABLE customers ALTER COLUMN email SET DATA TYPE VARCHAR;",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE customers;",
        "question": "make customer email of type VARCHAR",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_json",
        "query": "INSERT INTO customers (customer_id, email) VALUES (5,'{\"from\": \"[email protected]\", \"to\": \"[email protected]\"}');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM customers;",
        "question": "insert json into customer email for customer id 5: {'from': '[email protected]', 'to': '[email protected]'}",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_json",
        "query": "SELECT customers.email->>'from' FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get 'from' field from customer email json",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SUMMARIZE customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "summarize the customer table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM customers USING SAMPLE 10% (reservoir);",
        "setup_sql": ";",
        "validation_sql": "SELECT count(*) FROM ddb_benchmark_result;",
        "question": "sample 10% from the customers table using reservoir sampling",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SET threads = 10;",
        "setup_sql": ";",
        "validation_sql": "SELECT current_setting('threads');",
        "question": "set number of threads to 10",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SET memory_limit = '20G';\n",
        "setup_sql": ";",
        "validation_sql": "SELECT current_setting('memory_limit');",
        "question": "set memory limit to 20 gigabyte",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * EXCLUDE (price), avg(price) FROM laptops GROUP BY ALL;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show the average price of laptop and group by the remaining columns",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM laptops WHERE price > 1000 ORDER BY ALL;\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show all laptops with price above 1000 and order by all columns",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "ATTACH 'who.ddb';",
        "setup_sql": ";",
        "validation_sql": "SHOW DATABASES;",
        "question": "attach database file who.ddb",
        "category": "duckdb"
    }
]
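Since dev.json is the file the predict and evaluate commands consume, a quick inspection snippet can confirm its shape. This is a minimal sketch, assuming it is run from the repository root:

```python
import json
from collections import Counter

# Load the benchmark examples shipped in this commit.
with open("duckdb-nsql/eval/data/dev.json") as f:
    examples = json.load(f)

# Each example carries db_id, query, setup_sql, validation_sql,
# question, and category fields.
print(len(examples), "examples")
print(Counter(ex["category"] for ex in examples))
```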
duckdb-nsql/eval/data/tables.json ADDED
@@ -0,0 +1,3465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ [
+   {
+     "db_id": "hn",
+     "column_names": [[-1, "*"], [0, "title"], [0, "url"], [0, "text"], [0, "dead"], [0, "by"], [0, "score"], [0, "time"],
+       [0, "timestamp"], [0, "type"], [0, "id"], [0, "parent"], [0, "descendants"], [0, "ranking"], [0, "deleted"]],
+     "column_names_original": [[-1, "*"], [0, "title"], [0, "url"], [0, "text"], [0, "dead"], [0, "by"], [0, "score"], [0, "time"],
+       [0, "timestamp"], [0, "type"], [0, "id"], [0, "parent"], [0, "descendants"], [0, "ranking"], [0, "deleted"]],
+     "column_types": ["text", "varchar", "varchar", "varchar", "boolean", "varchar", "bigint", "bigint", "timestamp",
+       "varchar", "bigint", "bigint", "bigint", "bigint", "boolean"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["hacker_news"],
+     "table_names_original": ["hacker_news"]
+   },
+   {
+     "db_id": "laptop",
+     "column_names": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_types": ["text", "char", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["customers", "laptops", "pcs", "printers", "products", "sales"]
+   },
+   {
+     "db_id": "transactions",
+     "column_names": [[-1, "*"], [0, "id"], [0, "name"], [1, "user_id"], [1, "amount"]],
+     "column_names_original": [[-1, "*"], [0, "id"], [0, "name"], [1, "user_id"], [1, "amount"]],
+     "column_types": ["text", "int", "varchar", "int", "int"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["users", "transactions"],
+     "table_names_original": ["users", "transactions"]
+   },
+   {
+     "db_id": "who",
+     "column_names": [[-1, "*"], [0, "who_region"], [0, "iso3"], [0, "country_name"], [0, "city"], [0, "year"], [0, "version"],
+       [0, "pm10_concentration"], [0, "pm25_concentration"], [0, "no2_concentration"], [0, "pm10_tempcov"], [0, "pm25_tempcov"],
+       [0, "no2_tempcov"], [0, "type_of_stations"], [0, "reference"], [0, "web_link"], [0, "population"], [0, "population_source"],
+       [0, "latitude"], [0, "longitude"], [0, "who_ms"]],
+     "column_names_original": [[-1, "*"], [0, "who_region"], [0, "iso3"], [0, "country_name"], [0, "city"], [0, "year"], [0, "version"],
+       [0, "pm10_concentration"], [0, "pm25_concentration"], [0, "no2_concentration"], [0, "pm10_tempcov"], [0, "pm25_tempcov"],
+       [0, "no2_tempcov"], [0, "type_of_stations"], [0, "reference"], [0, "web_link"], [0, "population"], [0, "population_source"],
+       [0, "latitude"], [0, "longitude"], [0, "who_ms"]],
+     "column_types": ["text", "varchar", "varchar", "varchar", "varchar", "bigint", "varchar", "bigint", "bigint", "bigint",
+       "bigint", "bigint", "bigint", "varchar", "varchar", "varchar", "varchar", "varchar", "float", "float", "bigint"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["ambient_air_quality"],
+     "table_names_original": ["ambient_air_quality"]
+   },
+   {
+     "db_id": "nyc",
+     "column_names": [[-1, "*"], [0, "unique_key"], [0, "created_date"], [0, "closed_date"], [0, "agency"], [0, "agency_name"],
+       [0, "complaint_type"], [0, "descriptor"], [0, "location_type"], [0, "incident_zip"], [0, "incident_address"],
+       [0, "street_name"], [0, "cross_street_1"], [0, "cross_street_2"], [0, "intersection_street_1"], [0, "intersection_street_2"],
+       [0, "address_type"], [0, "city"], [0, "landmark"], [0, "facility_type"], [0, "status"], [0, "due_date"],
+       [0, "resolution_description"], [0, "resolution_action_updated_date"], [0, "community_board"], [0, "bbl"], [0, "borough"],
+       [0, "x_coordinate_state_plane"], [0, "y_coordinate_state_plane"], [0, "open_data_channel_type"], [0, "park_facility_name"],
+       [0, "park_borough"], [0, "vehicle_type"], [0, "taxi_company_borough"], [0, "taxi_pick_up_location"],
+       [0, "bridge_highway_name"], [0, "bridge_highway_direction"], [0, "road_ramp"], [0, "bridge_highway_segment"],
+       [0, "latitude"], [0, "longitude"],
+       [1, "hvfhs_license_num"], [1, "dispatching_base_num"], [1, "originating_base_num"], [1, "request_datetime"],
+       [1, "on_scene_datetime"], [1, "pickup_datetime"], [1, "dropoff_datetime"], [1, "PULocationID"], [1, "DOLocationID"],
+       [1, "trip_miles"], [1, "trip_time"], [1, "base_passenger_fare"], [1, "tolls"], [1, "bcf"], [1, "sales_tax"],
+       [1, "congestion_surcharge"], [1, "airport_fee"], [1, "tips"], [1, "driver_pay"], [1, "shared_request_flag"],
+       [1, "shared_match_flag"], [1, "access_a_ride_flag"], [1, "wav_request_flag"], [1, "wav_match_flag"],
+       [2, "VendorID"], [2, "tpep_pickup_datetime"], [2, "tpep_dropoff_datetime"], [2, "passenger_count"], [2, "trip_distance"],
+       [2, "RatecodeID"], [2, "store_and_fwd_flag"], [2, "PULocationID"], [2, "DOLocationID"], [2, "payment_type"],
+       [2, "fare_amount"], [2, "extra"], [2, "mta_tax"], [2, "tip_amount"], [2, "tolls_amount"], [2, "improvement_surcharge"],
+       [2, "total_amount"], [2, "congestion_surcharge"], [2, "airport_fee"]],
+     "column_names_original": [[-1, "*"], [0, "unique_key"], [0, "created_date"], [0, "closed_date"], [0, "agency"], [0, "agency_name"],
+       [0, "complaint_type"], [0, "descriptor"], [0, "location_type"], [0, "incident_zip"], [0, "incident_address"],
+       [0, "street_name"], [0, "cross_street_1"], [0, "cross_street_2"], [0, "intersection_street_1"], [0, "intersection_street_2"],
+       [0, "address_type"], [0, "city"], [0, "landmark"], [0, "facility_type"], [0, "status"], [0, "due_date"],
+       [0, "resolution_description"], [0, "resolution_action_updated_date"], [0, "community_board"], [0, "bbl"], [0, "borough"],
+       [0, "x_coordinate_state_plane"], [0, "y_coordinate_state_plane"], [0, "open_data_channel_type"], [0, "park_facility_name"],
+       [0, "park_borough"], [0, "vehicle_type"], [0, "taxi_company_borough"], [0, "taxi_pick_up_location"],
+       [0, "bridge_highway_name"], [0, "bridge_highway_direction"], [0, "road_ramp"], [0, "bridge_highway_segment"],
+       [0, "latitude"], [0, "longitude"],
+       [1, "hvfhs_license_num"], [1, "dispatching_base_num"], [1, "originating_base_num"], [1, "request_datetime"],
+       [1, "on_scene_datetime"], [1, "pickup_datetime"], [1, "dropoff_datetime"], [1, "PULocationID"], [1, "DOLocationID"],
+       [1, "trip_miles"], [1, "trip_time"], [1, "base_passenger_fare"], [1, "tolls"], [1, "bcf"], [1, "sales_tax"],
+       [1, "congestion_surcharge"], [1, "airport_fee"], [1, "tips"], [1, "driver_pay"], [1, "shared_request_flag"],
+       [1, "shared_match_flag"], [1, "access_a_ride_flag"], [1, "wav_request_flag"], [1, "wav_match_flag"],
+       [2, "VendorID"], [2, "tpep_pickup_datetime"], [2, "tpep_dropoff_datetime"], [2, "passenger_count"], [2, "trip_distance"],
+       [2, "RatecodeID"], [2, "store_and_fwd_flag"], [2, "PULocationID"], [2, "DOLocationID"], [2, "payment_type"],
+       [2, "fare_amount"], [2, "extra"], [2, "mta_tax"], [2, "tip_amount"], [2, "tolls_amount"], [2, "improvement_surcharge"],
+       [2, "total_amount"], [2, "congestion_surcharge"], [2, "airport_fee"]],
+     "column_types": ["text", "bigint", "timestamp", "timestamp", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "timestamp", "varchar", "timestamp", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "double", "double",
+       "varchar", "varchar", "varchar", "timestamp", "timestamp", "timestamp", "timestamp", "bigint", "bigint", "double",
+       "bigint", "double", "double", "double", "double", "double", "double", "double", "double", "varchar", "varchar",
+       "varchar", "varchar", "varchar",
+       "bigint", "timestamp", "timestamp", "double", "double", "double", "varchar", "bigint", "bigint", "bigint",
+       "double", "double", "double", "double", "double", "double", "double", "double", "double"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["service_requests", "rideshare", "taxi"],
+     "table_names_original": ["service_requests", "rideshare", "taxi"]
+   },
+   {
+     "db_id": "product",
+     "column_names": [[-1, "*"], [0, "maker"], [0, "model"], [0, "type"]],
+     "column_names_original": [[-1, "*"], [0, "maker"], [0, "model"], [0, "type"]],
+     "column_types": ["text", "varchar", "varchar", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["products"],
+     "table_names_original": ["products"]
+   },
+   {
+     "db_id": "json",
+     "column_names": [[-1, "*"], [0, "employee"]],
+     "column_names_original": [[-1, "*"], [0, "employee"]],
+     "column_types": ["text", "struct(id int, first_name text, last_name text, email text)"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["employee_json"],
+     "table_names_original": ["employee_json"]
+   },
+   {
+     "db_id": "flightinfo",
+     "column_names": [[-1, "*"],
+       [0, "icao_code"], [0, "email"], [0, "name"], [0, "phone_number"], [0, "iata_code"],
+       [1, "title"], [1, "description"], [1, "price"], [1, "service_type"], [1, "airline_icao_code"],
+       [2, "iata_code"], [2, "address"], [2, "name"], [2, "phone_number"], [2, "email"], [2, "city_zip_code"], [2, "city_dbpedia"],
+       [3, "title"], [3, "cabin_bag_dimension_cm"], [3, "cabin_bags_no"], [3, "cabin_bg_weight_kg"],
+       [3, "checked_bag_dimension_cm"], [3, "checked_bags_no"], [3, "checked_bag_weight_kg"],
+       [3, "excessive_price_perkg"], [3, "flight_type"], [3, "airline_icao_code"],
+       [4, "title"], [4, "description"], [4, "airline_icao_code"],
+       [5, "title"], [5, "description"], [5, "due_date"], [5, "refund_postdue_percentage"],
+       [5, "refund_predue_percentage"], [5, "airline_icao_code"],
+       [6, "city_zipcode"], [6, "city_name"], [6, "country_iso_code"],
+       [7, "country_iso_code"], [7, "country_name"],
+       [8, "flight_number"], [8, "departure_airport_iata_code"], [8, "arriving_airport_iata_code"],
+       [9, "number"], [9, "departure_date"], [9, "arrival_date"], [9, "distance_km"], [9, "is_available"],
+       [9, "duration_min"], [9, "airline_icao_code"], [9, "type"],
+       [10, "title"], [10, "description"], [10, "cabin_class_title"], [10, "baggage_policy_title"], [10, "cancelation_policy_title"],
+       [11, "subflight_number"], [11, "flight_number"],
+       [12, "flight_package_title"], [12, "airline_service_title"],
+       [13, "seat_number"], [13, "is_available"], [13, "flight_number"],
+       [14, "meal_type"], [14, "airline_service_title"],
+       [15, "duration_min"], [15, "duration_from"], [15, "duration_to"], [15, "airport_iatacode"], [15, "flight_number"],
+       [16, "flight_number"], [16, "package_title"], [16, "trip_id"], [16, "requested_excessive_baggage_kg"],
+       [16, "seat_number"], [16, "chosen_meal_service_price"], [16, "chosen_wifi_service_price"], [16, "price"],
+       [17, "id"], [17, "tax"], [17, "booking_date"], [17, "user_email"], [17, "type"],
+       [18, "email"], [18, "first_name"], [18, "last_name"], [18, "birthdate"], [18, "passport_number"],
+       [18, "address"], [18, "password"], [18, "phone_number"],
+       [19, "wifi_onboard_service_bandwidth_MB"], [19, "airline_service_title"]],
+     "column_names_original": [[-1, "*"],
+       [0, "icao_code"], [0, "email"], [0, "name"], [0, "phone_number"], [0, "iata_code"],
+       [1, "title"], [1, "description"], [1, "price"], [1, "service_type"], [1, "airline_icao_code"],
+       [2, "iata_code"], [2, "address"], [2, "name"], [2, "phone_number"], [2, "email"], [2, "city_zip_code"], [2, "city_dbpedia"],
+       [3, "title"], [3, "cabin_bag_dimension_cm"], [3, "cabin_bags_no"], [3, "cabin_bg_weight_kg"],
+       [3, "checked_bag_dimension_cm"], [3, "checked_bags_no"], [3, "checked_bag_weight_kg"],
+       [3, "excessive_price_perkg"], [3, "flight_type"], [3, "airline_icao_code"],
+       [4, "title"], [4, "description"], [4, "airline_icao_code"],
+       [5, "title"], [5, "description"], [5, "due_date"], [5, "refund_postdue_percentage"],
+       [5, "refund_predue_percentage"], [5, "airline_icao_code"],
+       [6, "city_zipcode"], [6, "city_name"], [6, "country_iso_code"],
+       [7, "country_iso_code"], [7, "country_name"],
+       [8, "flight_number"], [8, "departure_airport_iata_code"], [8, "arriving_airport_iata_code"],
+       [9, "number"], [9, "departure_date"], [9, "arrival_date"], [9, "distance_km"], [9, "is_available"],
+       [9, "duration_min"], [9, "airline_icao_code"], [9, "type"],
+       [10, "title"], [10, "description"], [10, "cabin_class_title"], [10, "baggage_policy_title"], [10, "cancelation_policy_title"],
+       [11, "subflight_number"], [11, "flight_number"],
+       [12, "flight_package_title"], [12, "airline_service_title"],
+       [13, "seat_number"], [13, "is_available"], [13, "flight_number"],
+       [14, "meal_type"], [14, "airline_service_title"],
+       [15, "duration_min"], [15, "duration_from"], [15, "duration_to"], [15, "airport_iatacode"], [15, "flight_number"],
+       [16, "flight_number"], [16, "package_title"], [16, "trip_id"], [16, "requested_excessive_baggage_kg"],
+       [16, "seat_number"], [16, "chosen_meal_service_price"], [16, "chosen_wifi_service_price"], [16, "price"],
+       [17, "id"], [17, "tax"], [17, "booking_date"], [17, "user_email"], [17, "type"],
+       [18, "email"], [18, "first_name"], [18, "last_name"], [18, "birthdate"], [18, "passport_number"],
+       [18, "address"], [18, "password"], [18, "phone_number"],
+       [19, "wifi_onboard_service_bandwidth_MB"], [19, "airline_service_title"]],
+     "column_types": ["text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "double",
+       "varchar", "varchar", "varchar", "text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "double", "double", "double", "double", "double", "double", "double", "varchar", "varchar", "varchar", "text",
+       "varchar", "varchar", "text", "text", "int", "int", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "text", "varchar", "varchar", "varchar", "varchar", "datetime", "datetime", "double", "tinyint", "double",
+       "varchar", "varchar", "varchar", "text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "tinyint", "varchar", "varchar", "varchar", "double", "datetime", "datetime", "varchar",
+       "varchar", "varchar", "varchar", "int", "int", "varchar", "int", "int", "double", "int", "double", "datetime",
+       "varchar", "varchar", "varchar", "varchar", "varchar", "date", "varchar", "varchar", "varchar", "double",
+       "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["airline", "airline_service", "airport", "baggage_policy", "cabin_class", "cancellation_policy",
+       "city", "country", "direct_flight", "flight", "flight_package", "non_direct_flight", "package_service", "seat",
+       "special_meal_type", "stopping", "ticke", "trip", "user", "wifi_onboard_service"],
+     "table_names_original": ["airline", "airline_service", "airport", "baggage_policy", "cabin_class", "cancellation_policy",
+       "city", "country", "direct_flight", "flight", "flight_package", "non_direct_flight", "package_service", "seat",
+       "special_meal_type", "stopping", "ticke", "trip", "user", "wifi_onboard_service"]
+   },
+   {
+     "db_id": "none",
+     "column_names": [[-1, "*"]],
+     "column_names_original": [[-1, "*"]],
+     "column_types": ["text"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": [],
+     "table_names_original": []
+   },
+   {
+     "db_id": "laptop_array",
+     "column_names": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"],
+       [0, "email"], [0, "phone_numbers"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"],
+       [0, "email"], [0, "phone_number"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_types": ["text", "char", "varchar", "varchar", "varchar", "varchar", "varchar", "array",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["customers", "laptops", "pcs", "printers", "products", "sales"]
+   },
+   {
+     "db_id": "laptop_struct",
+     "column_names": [[-1, "*"], [0, "person"],
+       [1, "customer_id"], [1, "firstname"], [1, "lastname"], [1, "city"], [1, "address"], [1, "email"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "screen"], [2, "price"],
+       [3, "model"], [3, "speed"], [3, "ram"], [3, "hd"], [3, "price"],
+       [4, "model"], [4, "color"], [4, "type"], [4, "price"],
+       [5, "maker"], [5, "model"], [5, "type"],
+       [6, "customer_id"], [6, "model"], [6, "quantity"], [6, "day"], [6, "paid"], [6, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "person"],
+       [1, "customer_id"], [1, "firstname"], [1, "lastname"], [1, "city"], [1, "address"], [1, "email"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "screen"], [2, "price"],
+       [3, "model"], [3, "speed"], [3, "ram"], [3, "hd"], [3, "price"],
+       [4, "model"], [4, "color"], [4, "type"], [4, "price"],
+       [5, "maker"], [5, "model"], [5, "type"],
+       [6, "customer_id"], [6, "model"], [6, "quantity"], [6, "day"], [6, "paid"], [6, "type_of_payment"]],
+     "column_types": ["text", "struct(id int, name: text)", "char", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["test", "customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["test", "customers", "laptops", "pcs", "printers", "products", "sales"]
+   },
+   {
+     "db_id": "laptop_json",
+     "column_names": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_types": ["text", "char", "varchar", "varchar", "varchar", "varchar", "json",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["customers", "laptops", "pcs", "printers", "products", "sales"]
+   }
+ ]
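
Each entry above follows the Spider-style schema convention: `column_names` pairs a table index (into `table_names`) with a column name, index -1 marks the synthetic `*` column, and `column_types` lines up positionally with the column list. A minimal sketch, not part of the commit, of regrouping one entry per table; the file path is illustrative:

import json

# Load the schema file; adjust the path to wherever tables.json lives.
with open("duckdb-nsql/eval/data/tables.json") as f:
    schemas = json.load(f)

hn = next(db for db in schemas if db["db_id"] == "hn")
tables = hn["table_names_original"]
by_table = {name: [] for name in tables}
for (table_idx, col_name), col_type in zip(
    hn["column_names_original"], hn["column_types"]
):
    if table_idx == -1:  # skip the synthetic "*" column
        continue
    by_table[tables[table_idx]].append(f"{col_name} {col_type}")

print(by_table["hacker_news"])  # ['title varchar', 'url varchar', ...]

This is exactly the pairing that `read_tables_json` in `data_utils.py` below performs when it builds `Table` objects.
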
duckdb-nsql/eval/data_utils.py ADDED
@@ -0,0 +1,79 @@
+ """Training data prep utils."""
+ import json
+ import re
+ from collections import defaultdict
+ from schema import ForeignKey, Table, TableColumn
+
+
+ def read_tables_json(
+     schema_file: str,
+     lowercase: bool = False,
+ ) -> dict[str, dict[str, Table]]:
+     """Read tables json."""
+     data = json.load(open(schema_file))
+     db_to_tables = {}
+     for db in data:
+         db_name = db["db_id"]
+         table_names = db["table_names_original"]
+         db["column_names_original"] = [
+             [x[0], x[1]] for x in db["column_names_original"]
+         ]
+         db["column_types"] = db["column_types"]
+         if lowercase:
+             table_names = [tn.lower() for tn in table_names]
+         pks = db["primary_keys"]
+         fks = db["foreign_keys"]
+         tables = defaultdict(list)
+         tables_pks = defaultdict(list)
+         tables_fks = defaultdict(list)
+         for idx, ((ti, col_name), col_type) in enumerate(
+             zip(db["column_names_original"], db["column_types"])
+         ):
+             if ti == -1:
+                 continue
+             if lowercase:
+                 col_name = col_name.lower()
+                 col_type = col_type.lower()
+             if idx in pks:
+                 tables_pks[table_names[ti]].append(
+                     TableColumn(name=col_name, dtype=col_type)
+                 )
+             for fk in fks:
+                 if idx == fk[0]:
+                     other_column = db["column_names_original"][fk[1]]
+                     other_column_type = db["column_types"][fk[1]]
+                     other_table = table_names[other_column[0]]
+                     tables_fks[table_names[ti]].append(
+                         ForeignKey(
+                             column=TableColumn(name=col_name, dtype=col_type),
+                             references_name=other_table,
+                             references_column=TableColumn(
+                                 name=other_column[1], dtype=other_column_type
+                             ),
+                         )
+                     )
+             tables[table_names[ti]].append(TableColumn(name=col_name, dtype=col_type))
+         db_to_tables[db_name] = {
+             table_name: Table(
+                 name=table_name,
+                 columns=tables[table_name],
+                 pks=tables_pks[table_name],
+                 fks=tables_fks[table_name],
+                 examples=None,
+             )
+             for table_name in tables
+         }
+     return db_to_tables
+
+
+ def clean_str(target: str) -> str:
+     """Clean string for question."""
+     if not target:
+         return target
+
+     target = re.sub(r"[^\x00-\x7f]", r" ", target)
+     line = re.sub(r"''", r" ", target)
+     line = re.sub(r"``", r" ", line)
+     line = re.sub(r"\"", r"'", line)
+     line = re.sub(r"[\t ]+", " ", line)
+     return line.strip()
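
A short usage sketch for these helpers, not part of the commit: it assumes `schema.py` from this eval directory is importable (since `read_tables_json` builds its `Table` objects), and the path is illustrative.

from data_utils import clean_str, read_tables_json

# Map each database id to its tables, lowercasing names for schema matching.
db_to_tables = read_tables_json("duckdb-nsql/eval/data/tables.json", lowercase=True)
laptops = db_to_tables["laptop"]["laptops"]
print([f"{col.name} {col.dtype}" for col in laptops.columns])

# clean_str strips non-ASCII characters, collapses whitespace, and
# rewrites double quotes to single quotes in a question string.
print(clean_str('How many  "laptops"  cost less than 1000?'))
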
duckdb-nsql/eval/doc_retriever.py ADDED
@@ -0,0 +1,92 @@
+ """Retrieve documentation for a given query."""
+
+ from pathlib import Path
+ from typing import Any
+ from rich.console import Console
+ from tqdm import tqdm
+ import numpy as np
+ from manifest import Manifest
+ from langchain.text_splitter import MarkdownHeaderTextSplitter
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ console = Console(soft_wrap=True)
+
+ try:
+     EMBEDDING_MODEL = Manifest(
+         client_name="openaiembedding",
+     )
+ except Exception as e:
+     console.print(e)
+     console.print(
+         "Failed to load embedding model. Likely OPENAI API key is not set. Please set to run document retrieval.",
+         style="bold red",
+     )
+
+
+ def load_documentation(path: Path) -> dict[str, str]:
+     """Load documentation from path."""
+     content = {}
+     for file in path.glob("**/*.md"):
+         with open(file, "r") as f:
+             data = f.read()
+         key = str(file).replace(str(path), "")
+         content[key] = data
+     return content
+
+
+ def split_documents(content: dict[str, str]) -> dict[str, Any]:
+     """Split documents into chunks."""
+     md_splitted_docs = []
+     markdown_splitter = MarkdownHeaderTextSplitter(
+         headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
+     )
+     text_splitter = RecursiveCharacterTextSplitter(
+         separators=["\n"], chunk_size=500, chunk_overlap=50, length_function=len
+     )
+
+     for file, raw_doc in content.items():
+         splitted_text = markdown_splitter.split_text(raw_doc)
+         for t in splitted_text:
+             t.metadata["source"] = file
+         md_splitted_docs.extend(splitted_text)
+
+     docs = text_splitter.split_documents(md_splitted_docs)
+     docs_as_dict = [doc.dict() for doc in docs]
+     return docs_as_dict
+
+
+ def get_embeddings(text: str) -> np.ndarray:
+     """Get embeddings."""
+     return np.array(EMBEDDING_MODEL.run(text))
+
+
+ def embed_documents(
+     chunked_docs: dict[str, Any], key: str = "page_content"
+ ) -> tuple[dict[str, Any], np.ndarray]:
+     """Embed documents."""
+     all_embeddings = []
+     for doc in tqdm(chunked_docs):
+         emb = get_embeddings(doc[key])
+         doc["embedding"] = emb
+         all_embeddings.append(doc["embedding"])
+     full_embedding_mat = np.vstack(all_embeddings)
+     return chunked_docs, full_embedding_mat
+
+
+ def query_docs(
+     query: str,
+     docs: dict[str, Any],
+     embedding_mat: np.ndarray,
+     top_n: int = 10,
+     key: str = "page_content",
+ ) -> tuple[list[int], list[str]]:
+     """Query documents."""
+     query_embedding = get_embeddings(query)
+     scores = embedding_mat.dot(query_embedding)
+     sorted_indices = np.argsort(scores)[::-1]
+     top_n_indices = sorted_indices[:top_n]
+     top_n_indices_rev = top_n_indices[::-1]
+     returned_docs = []
+     for i in top_n_indices_rev:
+         returned_docs.append(docs[i][key])
+     return top_n_indices_rev.tolist(), returned_docs
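
The helpers above compose into a simple embed-and-rank pipeline. A sketch, not part of the commit: the docs directory is hypothetical, and an OpenAI API key must be configured or the module-level Manifest client will fail to load on import.

from pathlib import Path

from doc_retriever import (
    embed_documents,
    load_documentation,
    query_docs,
    split_documents,
)

content = load_documentation(Path("duckdb-web/docs"))  # hypothetical docs checkout
chunks = split_documents(content)                      # markdown-aware chunking
chunks, embedding_mat = embed_documents(chunks)        # one embedding per chunk
_, top_docs = query_docs(
    "How do I read a CSV file in DuckDB?", chunks, embedding_mat, top_n=5
)
for doc in top_docs:  # ranked by dot-product similarity
    print(doc[:100])
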
duckdb-nsql/eval/evaluate.py ADDED
@@ -0,0 +1,392 @@
+ """Evaluate text2sql spider model predictions."""
+ import json
+ import os
+ import re
+ import signal
+ import sys
+ import traceback
+ from pathlib import Path
+ from typing import Any
+
+ import click
+ import pandas as pd
+ from rich.console import Console
+ from tqdm.auto import tqdm
+
+ sys.path.append(os.path.join(os.path.dirname(__file__), "."))
+ # from metrics.spider import evaluation as spider_evaluation  # type: ignore # noqa: E402
+ from metrics.test_suite_sql_eval import (  # type: ignore # noqa: E402
+     evaluation as test_suite_evaluation,
+ )
+ from data_utils import read_tables_json  # type: ignore # noqa: E402
+ from metric_utils import (  # type: ignore # noqa: E402
+     correct_casing,
+     edit_distance,
+ )
+
+ console = Console(soft_wrap=True)
+
+ LEVELS = ["easy", "medium", "hard", "duckdb", "ddl", "all"]
+ PARTIAL_TYPES = [
+     "select",
+     "select(no AGG)",
+     "where",
+     "where(no OP)",
+     "group(no Having)",
+     "group",
+     "order",
+     "and/or",
+     "IUEN",
+     "keywords",
+ ]
+ TIMEOUT_SECONDS = 30
+
+
+ def timeout_handler(signum: int, frame: Any) -> None:
+     raise TimeoutError("Function execution timed out.")
+
+
+ def print_scores(scores: dict, model_name: str, metric_type: str = "exec") -> None:
+     """Print scores."""
+
+     def print_formated_s(
+         row_name: str, l: list[str], element_format: str = "{}", sep: str = "\t"
+     ) -> None:
+         template = "{}" + sep + sep.join([element_format] * len(l))
+         console.print(template.format(row_name, *l))
+
+     # Add empty scores for each level if not present
+     for level in LEVELS:
+         if level not in scores:
+             scores[level] = {}
+             scores[level]["count"] = 0
+             scores[level]["exec"] = 0
+             scores[level]["exact"] = 0
+
+     print_formated_s("", LEVELS)
+     counts = [scores[level]["count"] for level in LEVELS]
+     print_formated_s("count", counts)
+     console.print(f">====================== {model_name} =====================")
+     if metric_type == "exec":
+         console.print(
+             ">===================== EXECUTION ACCURACY ====================="
+         )
+         exec_scores = [scores[level]["exec"] for level in LEVELS]
+         print_formated_s("execution", exec_scores, element_format="{:.3f}")
+
+     elif metric_type == "exact":
+         console.print(
+             "\n>====================== EXACT MATCHING ACCURACY ====================="
+         )
+         exact_scores = [scores[level]["exact"] for level in LEVELS]
+         print_formated_s("exact match", exact_scores, element_format="{:.3f}")
+
+
+ def compute_exact_match_metric(
+     predictions: list,
+     references: list,
+     gold_dbs: list,
+     kmaps: dict,
+     db_dir: str,
+     categories,
+ ) -> dict:
+     """Compute exact match metric."""
+     exact_match = {}
+     exact_match["all"] = {}
+     exact_match["all"]["count"] = 0
+     exact_match["all"]["exact"] = 0
+     for prediction, reference, gold_db, category in tqdm(
+         zip(predictions, references, gold_dbs, categories), total=len(predictions)
+     ):
+         if category not in exact_match:
+             exact_match[category] = {}
+             exact_match[category]["count"] = 0
+             exact_match[category]["exact"] = 0
+         exact_match["all"]["count"] += 1
+         exact_match[category]["count"] += 1
+         try:
+             # Compare with str.strip() to normalize surrounding whitespace;
+             # Python strings have no .trim() method.
+             match = int(prediction.strip() == reference.strip())
+             exact_match[category]["exact"] += match
+             exact_match["all"]["exact"] += match
+         except Exception:
+             pass
+     return exact_match
114
+
115
+
116
+ def compute_test_suite_metric(
117
+ predictions: list,
118
+ references: list,
119
+ gold_dbs: list,
120
+ setup_sqls: list,
121
+ validate_sqls: list,
122
+ kmaps: dict,
123
+ db_dir: str,
124
+ categories: list[str] | None = None,
125
+ ) -> tuple[Any, list[int | None]]:
126
+ """Compute test suite execution metric."""
127
+ evaluator = test_suite_evaluation.Evaluator(
128
+ db_dir=db_dir,
129
+ kmaps=kmaps,
130
+ etype="exec",
131
+ plug_value=False,
132
+ keep_distinct=False,
133
+ progress_bar_for_each_datapoint=False,
134
+ )
135
+ # Only used for Sparc/CoSQL
136
+ turn_scores: dict[str, list] = {"exec": [], "exact": []}
137
+ by_row_metrics: list[int | None] = []
138
+ for prediction, reference, gold_db, setup_sql, validate_sql, category in tqdm(
139
+ zip(predictions, references, gold_dbs, setup_sqls, validate_sqls, categories),
140
+ total=len(predictions),
141
+ ):
142
+ turn_idx = 0
143
+ # skip final utterance-query pairs
144
+ if turn_idx < 0:
145
+ continue
146
+
147
+ # Register the timeout handler function
148
+ signal.signal(signal.SIGALRM, timeout_handler)
149
+ signal.alarm(TIMEOUT_SECONDS)
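+ # The OS will deliver SIGALRM after TIMEOUT_SECONDS, which timeout_handler
+ # converts into a TimeoutError; signal.alarm(0) below cancels the pending
+ # alarm once evaluate_one returns in time.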
150
+
151
+ try:
152
+ ex_metrics = evaluator.evaluate_one(
153
+ gold_db,
154
+ reference,
155
+ prediction,
156
+ setup_sql,
157
+ validate_sql,
158
+ turn_scores,
159
+ idx=turn_idx,
160
+ category=category,
161
+ )
162
+ signal.alarm(0)
163
+
164
+ by_row_metrics.append(int(ex_metrics["exec"]))
165
+ except Exception as e:
166
+ signal.alarm(0)  # cancel the pending alarm before handling the failure
+ console.print(f"[red]Error evaluating example: {e}[/red]")
167
+ by_row_metrics.append(None)
168
+ pass
169
+ evaluator.finalize()
170
+ return evaluator.scores, by_row_metrics
171
+
172
+
173
+ def compute_metrics(
174
+ gold_sqls: list[str],
175
+ pred_sqls: list[str],
176
+ gold_dbs: list[str],
177
+ setup_sqls: list[str],
178
+ validate_sqls: list[str],
179
+ kmaps: dict,
180
+ db_schemas: dict,
181
+ database_dir: str,
182
+ lowercase_schema_match: bool,
183
+ model_name: str,
184
+ categories: list[str] | None = None,
185
+ ) -> dict[str, str]:
186
+ """Compute all metrics for data slice."""
187
+ if len(gold_sqls) != len(pred_sqls):
188
+ raise ValueError(
189
+ f"Gold {len(gold_sqls)} and pred {len(pred_sqls)} have different number of lines!"
190
+ )
191
+ all_metrics: dict[str, Any] = {}
192
+
193
+ # Execution Accuracy
194
+ metrics, by_row_metrics = compute_test_suite_metric(
195
+ pred_sqls,
196
+ gold_sqls,
197
+ gold_dbs,
198
+ setup_sqls,
199
+ validate_sqls,
200
+ kmaps,
201
+ database_dir,
202
+ categories,
203
+ )
204
+ all_metrics["exec"] = metrics
205
+ all_metrics["by_row_exec"] = by_row_metrics
206
+ print_scores(metrics, model_name, "exec")
207
+
208
+ # Exact Match Accuracy
209
+ metrics = compute_exact_match_metric(
210
+ pred_sqls, gold_sqls, gold_dbs, kmaps, database_dir, categories
211
+ )
212
+ all_metrics["exact"] = metrics
213
+ print_scores(metrics, model_name, "exact")
214
+
215
+ # Equality Accuracy
216
+ per_row_match = [
217
+ int(gold.lower() == pred.lower()) for gold, pred in zip(gold_sqls, pred_sqls)
218
+ ]
219
+ all_metrics["equality"] = {"equality": sum(per_row_match) / len(gold_sqls)}
220
+ all_metrics["by_row_equality"] = per_row_match
221
+
222
+ # Edit Distance
223
+ per_row_edit_dist = [
224
+ edit_distance(gold, pred) for gold, pred in zip(gold_sqls, pred_sqls)
225
+ ]
226
+ edit_dist = sum(per_row_edit_dist) / len(gold_sqls)
227
+ all_metrics["edit_distance"] = {"edit_distance": edit_dist}
228
+ all_metrics["by_row_edit_distance"] = per_row_edit_dist
229
+
230
+ return all_metrics
231
+
232
+
233
+ def get_to_print(metrics: dict, key: str, model_name: str, num_rows: int) -> dict:
234
+ """Get pretty print dictionary of metrics."""
235
+ return {
236
+ "slice": key,
237
+ "model": model_name,
238
+ "support": num_rows,
239
+ "exec": f"{metrics[key]['exec']['all']['exec']:.3f}",
240
+ "exact": f"{metrics[key]['exact']['all']['exact']:.3f}",
241
+ "equality": f"{metrics[key]['equality']['equality']:.3f}",
242
+ "edit_distance": f"{metrics[key]['edit_distance']['edit_distance']:.3f}",
243
+ }
244
+
245
+
246
+ @click.group()
247
+ def cli() -> None:
248
+ """Entrypoint."""
249
+ pass
250
+
251
+
252
+ @cli.command()
253
+ @click.option("--gold", type=str, required=True)
254
+ @click.option("--pred", type=str, required=True)
255
+ @click.option("--tables", type=str, required=True)
256
+ @click.option("--db", type=str, default="")
257
+ @click.option("--slice-attribute", type=str, default=None)
258
+ @click.option("--output-dir", type=str, default="")
259
+ @click.option("--output-filename", type=str, default="")
260
+ @click.option(
261
+ "--correct-sql-casing", type=bool, is_flag=True, default=False, required=False
262
+ )
263
+ @click.option(
264
+ "--lowercase-schema-match", type=bool, is_flag=True, default=False, required=False
265
+ )
266
+ def evaluate(
267
+ gold: str,
268
+ pred: str,
269
+ tables: str,
270
+ db: str,
271
+ slice_attribute: str,
272
+ output_dir: str,
273
+ output_filename: str,
274
+ correct_sql_casing: bool,
275
+ lowercase_schema_match: bool,
276
+ ) -> None:
277
+ """Evaluate SQL.
278
+
279
+ Args:
280
+ gold: path to gold sql file.
281
+ pred: path to predicted json lines file.
282
+ tables: the json path of the table metadata.
283
+ db: path to database dir.
284
+ slice_attribute: json attribute in gold data to slice on.
285
+ output_dir: the prediction output directory
286
+ output_filename: the prediction output filename
287
+ correct_sql_casing: whether to correct casing of SQL keywords
288
+ lowercase_schema_match: whether to lowercase schema match
289
+ """
290
+ gold_path = Path(gold)
291
+ pred_path = Path(pred)
292
+ model_name = pred_path.stem
293
+ if not output_filename:
294
+ output_filename = pred_path.stem + "_eval.json"
295
+ console.print(f"Saving to {Path(output_dir) / output_filename}")
296
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
297
+
298
+ kmaps = test_suite_evaluation.build_foreign_key_map_from_json(tables)
299
+ db_schemas = read_tables_json(tables)
300
+
301
+ gold_sqls_dict = json.load(gold_path.open("r", encoding="utf-8"))
302
+ pred_sqls_dict = [json.loads(l) for l in pred_path.open("r").readlines()]
303
+
304
+ # Data validation
305
+ assert len(gold_sqls_dict) == len(
306
+ pred_sqls_dict
307
+ ), "Sample size doesn't match between pred and gold file"
308
+
309
+ # Keep track of everything
310
+ full_results = []
311
+ for gold_sql, pred_sql in zip(gold_sqls_dict, pred_sqls_dict):
312
+ merged_res = {**pred_sql, **gold_sql}
313
+ full_results.append(merged_res)
314
+
315
+ gold_sqls = [
316
+ re.sub(r"[\s\t\n]+", " ", p.get("gold", p.get("query", p.get("sql", ""))))
317
+ for p in gold_sqls_dict
318
+ ]
319
+ setup_sqls = [re.sub(r"[\s\t\n]+", " ", p["setup_sql"]) for p in gold_sqls_dict]
320
+ validate_sqls = [
321
+ re.sub(r"[\s\t\n]+", " ", p["validation_sql"]) for p in gold_sqls_dict
322
+ ]
323
+ gold_dbs = [p.get("db_id", p.get("db", "")) for p in gold_sqls_dict]
324
+ pred_sqls = [re.sub(r"[\s\t\n]+", " ", p["pred"]) for p in pred_sqls_dict]
325
+ categories = [p.get("category", "") for p in gold_sqls_dict]
326
+ if correct_sql_casing:
327
+ # Normalize casing of SQL keywords via correct_casing(sql)
328
+ gold_sqls = [correct_casing(sql) for sql in gold_sqls]
329
+ pred_sqls = [correct_casing(sql) for sql in pred_sqls]
330
+
331
+ final_metrics: dict[str, dict[str, Any]] = {}
332
+ to_print = []
333
+ final_metrics["all"] = compute_metrics(
334
+ gold_sqls=gold_sqls,
335
+ pred_sqls=pred_sqls,
336
+ gold_dbs=gold_dbs,
337
+ setup_sqls=setup_sqls,
338
+ validate_sqls=validate_sqls,
339
+ kmaps=kmaps,
340
+ db_schemas=db_schemas,
341
+ database_dir=db,
342
+ lowercase_schema_match=lowercase_schema_match,
343
+ model_name=model_name + "(all)",
344
+ categories=categories,
345
+ )
346
+
347
+ for k, v in final_metrics["all"].items():
348
+ if k.startswith("by_row"):
349
+ assert len(v) == len(gold_sqls)
350
+ for dct, val in zip(full_results, v):
351
+ dct[k[len("by_row_") :]] = val
352
+ to_print.append(get_to_print(final_metrics, "all", model_name, len(gold_sqls)))
353
+ # TODO: could be way more efficient if we subsliced the results but...whatever
354
+ if slice_attribute:
355
+ for unq_value in sorted(set([g[slice_attribute] for g in gold_sqls_dict])):
356
+ idx_set = [
357
+ i
358
+ for i, g in enumerate(gold_sqls_dict)
359
+ if g[slice_attribute] == unq_value
360
+ ]
361
+ print(f"Processing {unq_value} with {len(idx_set)} samples")
362
+ final_metrics[unq_value] = compute_metrics(
363
+ gold_sqls=[gold_sqls[i] for i in idx_set],
364
+ pred_sqls=[pred_sqls[i] for i in idx_set],
365
+ gold_dbs=[gold_dbs[i] for i in idx_set],
366
+ setup_sqls=[setup_sqls[i] for i in idx_set],
367
+ validate_sqls=[validate_sqls[i] for i in idx_set],
368
+ kmaps=kmaps,
369
+ db_schemas=db_schemas,
370
+ database_dir=db,
371
+ lowercase_schema_match=lowercase_schema_match,
372
+ model_name=model_name + f"({unq_value})",
373
+ categories=[categories[i] for i in idx_set],
374
+ )
375
+ to_print.append(
376
+ get_to_print(final_metrics, unq_value, model_name, len(idx_set))
377
+ )
378
+
379
+ df = pd.DataFrame(to_print)
380
+ console.print(df.to_csv(sep=",", index=False))
381
+ console.print("******")
382
+ console.print(f"Saved metrics to {Path(output_dir) / output_filename}")
383
+ with open(Path(output_dir) / output_filename, "w") as f:
+ json.dump(final_metrics, f, indent=4)
384
+ output_filename = str(output_filename).replace("_eval.json", "_fd.jsonl")
385
+ console.print(f"Saved dump to {Path(output_dir) / output_filename}")
386
+ with open(Path(output_dir) / output_filename, "w") as f:
387
+ for dct in full_results:
388
+ f.write(json.dumps(dct) + "\n")
389
+
390
+
391
+ if __name__ == "__main__":
392
+ cli()
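+ # Illustrative invocation (paths are hypothetical):
+ #   python evaluate.py evaluate --gold eval/data/dev.json \
+ #     --pred output/predictions.json --tables eval/data/tables.json \
+ #     --db eval/data/databases/ --output-dir output/ --slice-attribute category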
duckdb-nsql/eval/get_manifest.py ADDED
@@ -0,0 +1,29 @@
1
+ """Manifest utils."""
2
+ from manifest import Manifest
3
+ from manifest.connections.client_pool import ClientConnection
4
+
5
+
6
+ def get_manifest(
7
+ manifest_client: str,
8
+ manifest_connection: str,
9
+ manifest_engine: str,
10
+ ) -> Manifest:
11
+ """Get manifest engine."""
12
+ if manifest_client in {"openai", "openaichat", "openai_mock", "openrouter", "azureendpoint"}:
13
+ manifest = Manifest(
14
+ client_name=manifest_client,
15
+ engine=manifest_engine,
16
+ )
17
+ elif manifest_client in {"huggingface"}:
18
+ manifest = Manifest(
19
+ client_pool=[
20
+ ClientConnection(
21
+ client_name=manifest_client,
22
+ client_connection=manifest_conn,
23
+ )
24
+ for manifest_conn in manifest_connection.split(";")
25
+ ],
26
+ )
27
+ else:
28
+ raise ValueError(f"Unknown manifest client {manifest_client}")
29
+ return manifest
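+ # Minimal usage sketch (client and engine names are illustrative):
+ #   manifest = get_manifest("openai", "", "gpt-4o")
+ #   completion = manifest.run("SELECT 1;")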
duckdb-nsql/eval/loaders.py ADDED
@@ -0,0 +1,78 @@
1
+ """Data loaders."""
2
+ import json
3
+ import re
4
+ import string
5
+ from abc import ABC, abstractmethod
6
+
7
+ from rich.console import Console
8
+ from data_utils import read_tables_json
9
+ from schema import Table
10
+
11
+ RE_COLUMN = re.compile(r"^select (.+?) from")
12
+ RE_CONDS = re.compile(r"where (.+?)$")
13
+ RE_COND = re.compile(r"^(.+?)\s*([=><])\s*(.+?)$")
14
+
15
+ translator = str.maketrans(
16
+ string.punctuation, " " * len(string.punctuation)
17
+ ) # map punctuation to space
18
+
19
+ console = Console(soft_wrap=True)
20
+
21
+
22
+ def standardize_column(col: str) -> str:
23
+ """Standardize the column name to SQL compatible."""
24
+ col_name = col.replace("#", "num").replace("%", "perc")
25
+ col_name = col_name.strip().lower().translate(translator)
26
+ col_name = re.sub("[^0-9a-z ]", " ", col_name).strip()
27
+ col_name = re.sub(" +", "_", col_name)
28
+ if not col_name:
29
+ console.print(f"original {col}, new {col_name}")
30
+ return col_name
31
+
32
+
33
+ def clean_col(col: str) -> str:
34
+ """Remove table name and standardize column name."""
35
+ if "." in col and not col.endswith("."):
36
+ col = col.split(".")[-1]
37
+ return standardize_column(col)
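+ # Example: clean_col("t1.Sales %") drops the table prefix and yields
+ # standardize_column("Sales %") == "sales_perc".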
38
+
39
+
40
+ class Loader(ABC):
41
+ """Loader abstract class."""
42
+
43
+ @classmethod
44
+ @abstractmethod
45
+ def load_data(cls, path: str) -> list[dict]:
46
+ """Load data from path."""
47
+
48
+ @classmethod
49
+ @abstractmethod
50
+ def load_table_metadata(cls, path: str) -> dict[str, dict[str, Table]]:
51
+ """Extract table metadata from table-metadata-path."""
52
+
53
+ @classmethod
54
+ def format_output(cls, prediction: dict) -> dict:
55
+ """Parse for spider format."""
56
+ return prediction
57
+
58
+
59
+ class DefaultLoader(Loader):
60
+ """Spider loader and writer."""
61
+
62
+ @classmethod
63
+ def load_data(cls, path: str) -> list[dict]:
64
+ """Load data from path."""
65
+ try:
66
+ with open(path) as f:
67
+ data = json.loads(f.read())
68
+ except json.decoder.JSONDecodeError:
69
+ # Try with jsonl
70
+ data = [json.loads(line) for line in open(path)]
71
+ return data
72
+
73
+ @classmethod
74
+ def load_table_metadata(cls, path: str) -> dict[str, dict[str, Table]]:
75
+ """Extract table metadata from table-metadata-path."""
76
+ # load the tables
77
+ db_to_tables = read_tables_json(path, lowercase=True)
78
+ return db_to_tables
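+ # The result is keyed by database name, then table name, e.g. (illustrative):
+ # {"laptop": {"laptops": Table(...), "customers": Table(...)}}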
duckdb-nsql/eval/metric_utils.py ADDED
@@ -0,0 +1,40 @@
1
+ """Utility metrics."""
2
+ import sqlglot
3
+ from rich.console import Console
4
+ from sqlglot import parse_one
5
+
6
+ console = Console(soft_wrap=True)
7
+
8
+
9
+ def correct_casing(sql: str) -> str:
10
+ """Correct casing of SQL."""
11
+ parse: sqlglot.expressions.Expression = parse_one(sql, read="sqlite")
12
+ return parse.sql()
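+ # Example: correct_casing("select a from t") -> "SELECT a FROM t";
+ # sqlglot re-renders the parsed query with normalized keyword casing.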
13
+
14
+
15
+ def prec_recall_f1(gold: set, pred: set) -> dict[str, float]:
16
+ """Compute precision, recall and F1 score."""
17
+ prec = len(gold.intersection(pred)) / len(pred) if pred else 0.0
18
+ recall = len(gold.intersection(pred)) / len(gold) if gold else 0.0
19
+ f1 = 2 * prec * recall / (prec + recall) if prec + recall else 0.0
20
+ return {"prec": prec, "recall": recall, "f1": f1}
21
+
22
+
23
+ def edit_distance(s1: str, s2: str) -> int:
24
+ """Compute edit distance between two strings."""
25
+ # Make sure s1 is the shorter string
26
+ if len(s1) > len(s2):
27
+ s1, s2 = s2, s1
28
+
29
+ distances: list[int] = list(range(len(s1) + 1))
30
+ for i2, c2 in enumerate(s2):
31
+ distances_ = [i2 + 1]
32
+ for i1, c1 in enumerate(s1):
33
+ if c1 == c2:
34
+ distances_.append(distances[i1])
35
+ else:
36
+ distances_.append(
37
+ 1 + min((distances[i1], distances[i1 + 1], distances_[-1]))
38
+ )
39
+ distances = distances_
40
+ return distances[-1]
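+ # Classic Levenshtein example: edit_distance("kitten", "sitting") == 3
+ # (substitute k->s, substitute e->i, insert g).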
duckdb-nsql/eval/metrics/test_suite_sql_eval ADDED
@@ -0,0 +1 @@
1
+ Subproject commit 640a12975abf75a94e917caca149d56dbc6bcdd7
duckdb-nsql/eval/predict.py ADDED
@@ -0,0 +1,345 @@
1
+ """Run dataset on text2sql zazu experiment.
2
+
3
+ See README.md for more details.
4
+ """
5
+ import datetime
6
+ import json
7
+ import multiprocessing
8
+ import random
9
+ import re
10
+ from pathlib import Path
11
+
12
+ import click
13
+ import numpy as np
14
+ from constants import PROMPT_FORMATTERS
15
+ from loaders import DefaultLoader
16
+ from get_manifest import get_manifest
17
+ from manifest import Manifest
18
+ from prompt_formatters import RajkumarFormatter
19
+ from rich.console import Console
20
+ from schema import Table, TextToSQLModelResponse, TextToSQLParams
21
+ from text_to_sql import instruction_to_sql, instruction_to_sql_list
22
+ from doc_retriever import (
23
+ load_documentation,
24
+ split_documents,
25
+ embed_documents,
26
+ query_docs,
27
+ )
28
+ from tqdm import tqdm
29
+ from transformers import AutoTokenizer
30
+
31
+ console = Console(soft_wrap=True)
32
+
33
+
34
+ def generate_sql(
35
+ manifest: Manifest,
36
+ text_to_sql_in: list[TextToSQLParams],
37
+ retrieved_docs: list[list[str]],
38
+ prompt_formatter: RajkumarFormatter,
39
+ stop_tokens: list[str] | None = None,
40
+ overwrite_manifest: bool = False,
41
+ max_tokens: int = 300,
42
+ temperature: float = 0.1,
43
+ num_beams: int = 2,
44
+ parallel: bool = False,
45
+ ) -> list[tuple[str, TextToSQLModelResponse]]:
46
+ """Call our text2sql function with manifest of our choice."""
47
+ if parallel:
48
+ instruction_to_sql_resps: list[
49
+ TextToSQLModelResponse
50
+ ] = instruction_to_sql_list(
51
+ params=text_to_sql_in,
52
+ extra_context=retrieved_docs,
53
+ manifest=manifest,
54
+ prompt_formatter=prompt_formatter,
55
+ overwrite_manifest=overwrite_manifest,
56
+ max_tokens=max_tokens,
57
+ temperature=temperature,
58
+ stop_sequences=stop_tokens,
59
+ num_beams=num_beams,
60
+ )
61
+ else:
62
+ instruction_to_sql_resps = [
63
+ instruction_to_sql(
64
+ params=_text_to_sql_in,
65
+ extra_context=_retrieved_docs,
66
+ manifest=manifest,
67
+ prompt_formatter=prompt_formatter,
68
+ overwrite_manifest=overwrite_manifest,
69
+ max_tokens=max_tokens,
70
+ temperature=temperature,
71
+ stop_sequences=stop_tokens,
72
+ num_beams=num_beams,
73
+ )
74
+ for _retrieved_docs, _text_to_sql_in in tqdm(
75
+ zip(retrieved_docs, text_to_sql_in),
76
+ desc="Generating SQL",
77
+ total=len(text_to_sql_in),
78
+ disable=(len(text_to_sql_in) <= 1),
79
+ )
80
+ ]
81
+ assert len(instruction_to_sql_resps) == len(text_to_sql_in)
82
+
83
+ sql_statements = []
84
+ for i in range(len(instruction_to_sql_resps)):
85
+ sql_statement = instruction_to_sql_resps[i].output.strip()
86
+ if "<>" in sql_statement:
87
+ sql_statement = sql_statement.replace("<>", "!=")
88
+ # Models are sometimes trained to predict <databasename/schema> | <sql>
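+ # e.g. "flights | SELECT origin FROM flights" -> "SELECT origin FROM flights"
+ # (database name here is illustrative)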
89
+ sql_statement = sql_statement.split("|")[-1].strip()
90
+ sql_statements.append(sql_statement)
91
+ return list(zip(sql_statements, instruction_to_sql_resps))
92
+
93
+
94
+ def get_text_to_sql_in(
95
+ input_question: dict, db_to_tables: dict[str, dict[str, Table]]
96
+ ) -> TextToSQLParams:
97
+ """Format input question for text2sql function."""
98
+ question = input_question["question"]
99
+ db_id = input_question.get("db_id", None)
100
+ if db_id != "none":
101
+ table_params = list(db_to_tables.get(db_id, {}).values())
102
+ else:
103
+ table_params = []
104
+ if len(table_params) == 0:
105
+ console.print(f"[red] WARNING: No tables found for {db_id} [/red]")
106
+ text_to_sql_in = TextToSQLParams(
107
+ instruction=question,
108
+ database=db_id,
109
+ tables=table_params,
110
+ )
111
+ return text_to_sql_in
112
+
113
+
114
+ @click.group()
115
+ def cli() -> None:
116
+ """Entrypoint."""
117
+ pass
118
+
119
+
120
+ @cli.command()
121
+ @click.argument("dataset-path")
122
+ @click.argument("table-meta-path")
123
+ @click.option("--output-dir", type=str, default="")
124
+ @click.option("--run-name", type=str, default="")
125
+ @click.option("--num-run", type=int, default=-1)
126
+ @click.option("--num-print", type=int, default=20)
127
+ # Format options
128
+ @click.option("--prompt-format", type=str, default="spider")
129
+ # Prompt options
130
+ @click.option("--stop-tokens", type=str, default=[], multiple=True)
131
+ @click.option("--max-tokens", type=int, default=200)
132
+ @click.option("--temperature", type=float, default=0)
133
+ @click.option("--num-beams", type=int, default=-1) # use whatever is in manifest
134
+ @click.option("--max-context-length", type=int, default=-1)
135
+ # Docs options
136
+ @click.option(
137
+ "--markdown-docs-path",
138
+ #type=click.Path(
139
+ # exists=True, file_okay=True, dir_okay=True, readable=True, path_type=Path
140
+ #),
141
+ default="eval/docs/duckdb-web/docs/archive/0.9.2/sql",
142
+ )
143
+ @click.option("--num-retrieved-docs", type=int, default=0)
144
+ # Manifest options
145
+ @click.option("--manifest-client", type=str, default="openai")
146
+ @click.option("--manifest-engine", type=str, default="gpt-4o")
147
+ @click.option("--manifest-connection", type=str, default="http://localhost:5005")
148
+ @click.option("--overwrite-manifest", is_flag=True, default=False)
149
+ @click.option("--parallel", is_flag=True, default=False)
150
+ def predict(
151
+ dataset_path: str,
152
+ table_meta_path: str,
153
+ output_dir: str,
154
+ run_name: str,
155
+ num_run: int,
156
+ num_print: int,
157
+ prompt_format: str,
158
+ stop_tokens: list[str],
159
+ max_tokens: int,
160
+ temperature: float,
161
+ num_beams: int,
162
+ max_context_length: int,
163
+ markdown_docs_path: Path,
164
+ num_retrieved_docs: int,
165
+ manifest_client: str,
166
+ manifest_engine: str,
167
+ manifest_connection: str,
168
+ overwrite_manifest: bool,
169
+ parallel: bool,
170
+ ) -> None:
171
+ """Predict SQL.
172
+
173
+ Args:
174
+ dataset_path: the dataset path.
175
+ table_meta_path: the json path of the table metadata.
176
177
+ output_dir: the prediction output directory
178
+ run_name: special prefix to add to filename
179
+ num_run: the number of examples to run
180
+ num_print: the number of examples to print
181
+ prompt_format: the format of the prompt. E.g., "rajkumar"
182
+ stop_tokens: the stop tokens to try
183
+ max_tokens: the max tokens
184
+ temperature: the temperature
185
+ num_beams: the number of beams
186
+ max_context_length: max context length for demonstration truncation (-1 means None)
187
+ markdown_docs_path: path to duckdb sql docs
188
+ num_retrieved_docs: number of docs to retrieve
189
+ manifest_client: the manifest client
190
+ manifest_engine: the manifest engine
191
+ manifest_connection: the manifest connection
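+ overwrite_manifest: whether to overwrite cached manifest responses
+ parallel: whether to issue the text-to-SQL requests in parallel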
192
+ """
193
+ multiprocessing.set_start_method("spawn", force=True)
194
+ random.seed(0)
195
+ np.random.seed(0)
196
+ locals_dict = locals()
197
+ locals_dict["markdown_docs_path"] = str(markdown_docs_path)
198
+ console.print(json.dumps(locals_dict, indent=2))
199
+
200
+ data_formatter = DefaultLoader()
201
+
202
+ if prompt_format not in PROMPT_FORMATTERS:
203
+ raise ValueError(f"Unknown prompt format {prompt_format}")
204
+ prompt_formatter = PROMPT_FORMATTERS[prompt_format]()
205
+
206
+ # load manifest
207
+ manifest = get_manifest(
208
+ manifest_client=manifest_client,
209
+ manifest_connection=manifest_connection,
210
+ manifest_engine=manifest_engine,
211
+ )
212
+ manifest_params = manifest.client_pool.get_current_client().get_model_params()
213
+ console.print(f"Running with {manifest_params} manifest.")
214
+ model_name = manifest_params.get("engine", manifest_params["model_name"])
215
+
216
+ if manifest_client in {"openai", "openaichat", "openrouter", "azureendpoint"}:
217
+ tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
218
+ else:
219
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
220
+
221
+ if stop_tokens:
222
+ stop_tokens = [st.strip("'") for st in stop_tokens]
223
+ console.print(f"Stop tokens: {stop_tokens}")
224
+
225
+ # Get output filename
226
+ full_dataset_path = Path(dataset_path)
227
+ # Get todays date
228
+ date_today = datetime.datetime.now().strftime("%y-%m-%d")
229
+ if run_name:
230
+ run_name = f"{run_name}_"
231
+ suffix = f"{run_name}{full_dataset_path.stem}_{date_today}.json" # noqa: E501
232
+ prefix = f"{prompt_format}_{num_retrieved_docs}docs"
233
+ if manifest_client in {"openai", "openaiazure"}:
234
+ middleix = manifest_engine
235
+ elif manifest_client in {"huggingface", "ray"}:
236
+ middleix = Path(manifest_params.get("model_path", "")).name.replace("/", "-")
237
+ elif manifest_client in {"toma", "openrouter", "openaichat", "azureendpoint"}:
238
+ middleix = manifest_engine.split("/")[-1]
239
+ else:
240
+ raise ValueError(f"Unknown manifest client {manifest_client}")
241
+ output_filename = f"{prefix}_{middleix}_{suffix}"
242
+ console.print(f"Saving to {Path(output_dir) / output_filename}")
243
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
244
+
245
+ console.print("Loading metadata...")
246
+ db_to_tables = data_formatter.load_table_metadata(table_meta_path)
247
+
248
+ console.print("Loading data...")
249
+ data = data_formatter.load_data(dataset_path)
250
+ if num_run > 0:
251
+ console.print(f"Running on {min(len(data), num_run)} examples")
252
+ data = data[:num_run]
253
+ original_data = data
254
+
255
+ # load the examples
256
+ console.print("Formatting data...")
257
+ num_print = min(num_print, len(data))
258
+ token_lengths = []
259
+ text_to_sql_in = [
260
+ get_text_to_sql_in(input_question, db_to_tables) for input_question in data
261
+ ]
262
+
263
+ if num_retrieved_docs > 0:
264
+ console.print("Loading documenration and indexing...")
265
+ retrieved_docs = []
266
+ doc_contents = load_documentation(markdown_docs_path)
267
+ chunked_docs = split_documents(doc_contents)
268
+ embedded_docs, full_embedding_mat = embed_documents(chunked_docs)
269
+ for i in tqdm(range(len(text_to_sql_in)), desc="Retrieving docs"):
270
+ _, retrieved_docs_strings = query_docs(
271
+ text_to_sql_in[i].instruction,
272
+ embedded_docs,
273
+ full_embedding_mat,
274
+ top_n=num_retrieved_docs,
275
+ )
276
+ retrieved_docs.append(retrieved_docs_strings)
277
+ else:
278
+ retrieved_docs = [[] for _ in range(len(text_to_sql_in))]
279
+
280
+ for i in range(num_print):
281
+ # Run a few to get some examples to print
282
+ generated_responses = generate_sql(
283
+ manifest=manifest,
284
+ text_to_sql_in=[text_to_sql_in[i]],
285
+ retrieved_docs=[retrieved_docs[i]],
286
+ stop_tokens=stop_tokens,
287
+ max_tokens=max_tokens,
288
+ temperature=temperature,
289
+ num_beams=num_beams,
290
+ prompt_formatter=prompt_formatter,
291
+ overwrite_manifest=overwrite_manifest,
292
+ parallel=parallel,
293
+ )
294
+ for prediction, model_response in generated_responses:
295
+ prediction = re.sub(r"[\s\t\n]+", " ", prediction)
296
+ token_lengths.append(len(tokenizer(prediction).input_ids))
297
+ console.print(f"[blue]Prompt:[/blue] {model_response.final_prompt}")
298
+ console.print(f"[red]Prediction:[/red] {prediction}")
299
+ if data[i].get("query") or data[i].get("sql"):
300
+ console.print(
301
+ "[purple]Gold:[/purple] "
302
+ f"{data[i].get('query') or data[i].get('sql')}"
303
+ )
304
+ console.print("\n****\n")
305
+
306
+ # Run the entire thing now - the to_print results will be in cache and fast
307
+ generated_sqls = generate_sql(
308
+ manifest=manifest,
309
+ text_to_sql_in=text_to_sql_in,
310
+ retrieved_docs=retrieved_docs,
311
+ stop_tokens=stop_tokens,
312
+ max_tokens=max_tokens,
313
+ temperature=temperature,
314
+ num_beams=num_beams,
315
+ prompt_formatter=prompt_formatter,
316
+ overwrite_manifest=overwrite_manifest,
317
+ parallel=parallel,
318
+ )
319
+
320
+ with open(Path(output_dir) / output_filename, "w") as fout:
321
+ for i, (prediction, model_response) in enumerate(generated_sqls):
322
+ if isinstance(model_response.final_prompt, str):
323
+ token_lengths.append(
324
+ len(tokenizer(model_response.final_prompt).input_ids)
325
+ )
326
+ else:
327
+ for prompt in model_response.final_prompt:
328
+ token_lengths.append(len(tokenizer(prompt["content"]).input_ids))
329
+ entry = {
330
+ **original_data[i],
331
+ "pred": prediction,
332
+ "raw_pred": model_response.output,
333
+ "raw_output": model_response.raw_output,
334
+ "prompt": model_response.final_prompt,
335
+ "tables": [tbl.dict() for tbl in text_to_sql_in[i].tables or []],
336
+ }
337
+ formatted_entry = data_formatter.format_output(entry)
338
+ print(json.dumps(formatted_entry), file=fout)
339
+ overflow = len([tl for tl in token_lengths if tl > 2048]) / len(token_lengths)
340
+ console.print(f"Overflow 2048 prompt {100*overflow:.2f}%")
341
+ console.print(f"Saved to {Path(output_dir) / output_filename}")
342
+
343
+
344
+ if __name__ == "__main__":
345
+ cli()
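+ # Illustrative invocation (paths, prompt format, and engine are hypothetical):
+ #   python predict.py predict eval/data/dev.json eval/data/tables.json \
+ #     --output-dir output/ --prompt-format duckdbinst \
+ #     --manifest-client openai --manifest-engine gpt-4o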
duckdb-nsql/eval/prompt_formatters.py ADDED
@@ -0,0 +1,1056 @@
1
+ """Rajkumar prompt formatter."""
2
+
3
+ from random import shuffle
4
+ from manifest import Manifest
5
+ from schema import Table
6
+ import re
7
+
8
+
9
+ class RajkumarFormatter:
10
+ """RajkumarFormatter class.
11
+
12
+ From https://arxiv.org/pdf/2204.00498.pdf.
13
+ """
14
+
15
+ table_sep: str = "\n\n"
16
+ shuffle_table_order: bool = True
17
+ _cache: dict[tuple[str, str, str], list[str]] = {}
18
+ clean_whitespace = False
19
+
20
+ @classmethod
21
+ def format_table(cls, table: Table) -> str:
22
+ """Get table format."""
23
+ table_fmt = []
24
+ for col in table.columns or []:
25
+ # This is technically an incorrect type, but it should be a catchall word
26
+ table_fmt.append(f" {col.name} {col.dtype or 'any'}")
27
+ if table_fmt:
28
+ all_cols = ",\n".join(table_fmt)
29
+ create_tbl = f"CREATE TABLE {table.name} (\n{all_cols}\n)"
30
+ else:
31
+ create_tbl = f"CREATE TABLE {table.name}"
32
+ return create_tbl
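+ # Example output for a hypothetical table laptops(model VARCHAR, price DOUBLE):
+ # CREATE TABLE laptops (
+ #  model VARCHAR,
+ #  price DOUBLE
+ # )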
33
+
34
+ @classmethod
35
+ def format_all_tables(cls, tables: list[Table], instruction: str) -> list[str]:
36
+ """Get all tables format."""
37
+ table_texts = [cls.format_table(table) for table in tables]
38
+ key = ("tables", instruction, str(tables))
39
+ if key not in cls._cache:
40
+ if cls.shuffle_table_order:
+ shuffle(table_texts)
41
+ cls._cache[key] = table_texts
42
+ else:
43
+ table_texts = cls._cache[key]
44
+ return table_texts
45
+
46
+ @classmethod
47
+ def format_retrieved_context(
48
+ cls,
49
+ context: list[str],
50
+ ) -> str:
51
+ """Format retrieved context."""
52
+ context_str = "\n--------\n".join(context)
53
+ return f"\n\n/*\nHere is additional documentation about DuckDB that could be useful.\n--------\n{context_str}\n--------\n*/"
54
+
55
+ @classmethod
56
+ def format_prompt(
57
+ cls,
58
+ instruction: str,
59
+ table_text: str,
60
+ context_text: str,
61
+ ) -> str | list[str]:
62
+ """Get prompt format."""
63
+ return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n""" # noqa: E501
64
+
65
+ @classmethod
66
+ def format_model_output(cls, output_sql: str, prompt: str) -> str:
67
+ """Format model output."""
68
+ clean_sql = (output_sql
69
+ .replace('```sql\n', '')
70
+ .replace('```duckdb\n', '')
71
+ .replace('```\n', '')
72
+ .replace('```', '')).strip()
73
+
74
+ if clean_sql.find(';') != -1:
75
+ clean_sql = clean_sql[:clean_sql.find(';')].strip()
76
+
77
+ if not clean_sql.endswith(";"):
78
+ clean_sql += ";"
79
+
80
+ return clean_sql
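+ # Example: format_model_output("```sql\nSELECT 1; extra\n```", prompt)
+ # -> "SELECT 1;" (code fences stripped, everything after the first
+ # semicolon dropped, trailing semicolon re-added).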
81
+
82
+ @classmethod
83
+ def format_gold_output(cls, output_sql: str) -> str:
84
+ """Format gold output for demonstration."""
85
+ return output_sql
86
+
87
+ class MotherDuckFormatter(RajkumarFormatter):
88
+ """MotherDuck class."""
89
+
90
+ @classmethod
91
+ def format_prompt(
92
+ cls,
93
+ instruction: str,
94
+ table_text: str,
95
+ context_text: str,
96
+ ) -> str | list[str]:
97
+ """Get prompt format."""
98
+ return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n```sql\n""" # noqa: E501
99
+
100
+
101
+ class DuckDBFormatter(RajkumarFormatter):
102
+ """DuckDB class."""
103
+
104
+ @classmethod
105
+ def format_prompt(
106
+ cls,
107
+ instruction: str,
108
+ table_text: str,
109
+ context_text: str,
110
+ ) -> str | list[str]:
111
+ """Get prompt format."""
112
+ return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n```sql\n""" # noqa: E501
113
+
114
+
115
+ class DuckDBInstFormatter(RajkumarFormatter):
116
+ """DuckDB Inst class."""
117
+
118
+ PROMPT_TEMPLATE = """### Instruction:\n{instruction}\n\n### Input:\n{input}{context}\n### Question:\n{question}\n\n### Response (use duckdb shorthand if possible):\n"""
119
+ INSTRUCTION_TEMPLATE = """Your task is to generate valid duckdb SQL to answer the following question{has_schema}""" # noqa: E501
120
+
121
+ @classmethod
122
+ def format_retrieved_context(
123
+ cls,
124
+ context: list[str],
125
+ ) -> str:
126
+ """Format retrieved context."""
127
+ context_str = "\n--------\n".join(context)
128
+ return f"\n### Documentation:\n{context_str}\n"
129
+
130
+ @classmethod
131
+ def format_prompt(
132
+ cls,
133
+ instruction: str,
134
+ table_text: str,
135
+ context_text: str,
136
+ ) -> str | list[str]:
137
+ """Get prompt format."""
138
+ input = ""
139
+ if table_text:
140
+ input = """Here is the database schema that the SQL query will run on:\n{schema}\n""".format( # noqa: E501
141
+ schema=table_text
142
+ )
143
+ instruction = cls.PROMPT_TEMPLATE.format(
144
+ instruction=cls.INSTRUCTION_TEMPLATE.format(
145
+ has_schema="."
146
+ if table_text == ""
147
+ else ", given a duckdb database schema."
148
+ ),
149
+ context=context_text,
150
+ input=input,
151
+ question=instruction,
152
+ )
153
+ return instruction
154
+
155
+ class DuckDBInstFormatterLlamaShort(RajkumarFormatter):
156
+ """DuckDB Inst class."""
157
+
158
+ PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
159
+
160
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
161
+
162
+ Here are some DuckDB SQL syntax specifics you should be aware of:
163
+
164
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
165
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
166
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
167
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
168
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
169
+ - DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
170
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
171
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
172
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
173
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
174
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. e.g `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
175
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
176
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
177
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
178
+ - DuckDB's way of converting between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
179
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
180
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
181
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
182
+ - DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
183
+ - DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
184
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`<|eot_id|>
185
+ <|start_header_id|>user<|end_header_id|>
186
+
187
+ Database Schema:
188
+ Here is the schema of the DuckDB database that the SQL query will run on:
189
+ ```sql
190
+ {schema}
191
+ ```
192
+
193
+ Question:
194
+ Here is the question or an instruction the user provided:
195
+ {question}
196
+
197
+ Task:
198
+ Write a DuckDB SQL query for the given question!<|eot_id|>
199
+ <|start_header_id|>assistant<|end_header_id|>
200
+
201
+ ```sql
202
+ """
203
+
204
+ @classmethod
205
+ def format_retrieved_context(
206
+ cls,
207
+ context: list[str],
208
+ ) -> str:
209
+ """Format retrieved context."""
210
+ context_str = "\n--------\n".join(context)
211
+ return f"\n### Documentation:\n{context_str}\n"
212
+
213
+ @classmethod
214
+ def format_prompt(
215
+ cls,
216
+ instruction: str,
217
+ table_text: str,
218
+ context_text: str,
219
+ ) -> str | list[str]:
220
+ """Get prompt format."""
221
+ instruction = cls.PROMPT_TEMPLATE.format(
222
+ schema=table_text,
223
+ question=instruction
224
+ )
225
+ return instruction
226
+
227
+ class DuckDBInstFormatterLlamaBasic(RajkumarFormatter):
228
+ """DuckDB Inst class."""
229
+
230
+ PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
231
+
232
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.<|eot_id|>
233
+ <|start_header_id|>user<|end_header_id|>
234
+
235
+ Database Schema:
236
+ Here is the schema of the DuckDB database that the SQL query will run on:
237
+ ```sql
238
+ {schema}
239
+ ```
240
+
241
+ Question:
242
+ Here is the question or an instruction the user provided:
243
+ {question}
244
+
245
+ Task:
246
+ Write a DuckDB SQL query for the given question!<|eot_id|>
247
+ <|start_header_id|>assistant<|end_header_id|>
248
+
249
+ ```sql
250
+ """
251
+
252
+ @classmethod
253
+ def format_retrieved_context(
254
+ cls,
255
+ context: list[str],
256
+ ) -> str:
257
+ """Format retrieved context."""
258
+ context_str = "\n--------\n".join(context)
259
+ return f"\n### Documentation:\n{context_str}\n"
260
+
261
+ @classmethod
262
+ def format_prompt(
263
+ cls,
264
+ instruction: str,
265
+ table_text: str,
266
+ context_text: str,
267
+ ) -> str | list[str]:
268
+ """Get prompt format."""
269
+ instruction = cls.PROMPT_TEMPLATE.format(
270
+ schema=table_text,
271
+ question=instruction
272
+ )
273
+ return instruction
274
+
275
+ class DuckDBInstFormatterLlamaSyntax(RajkumarFormatter):
276
+ """DuckDB Inst class."""
277
+
278
+ PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
279
+
280
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
281
+
282
+ Here are some DuckDB SQL syntax specifics you should be aware of:
283
+
284
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
285
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
286
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
287
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
288
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
289
+ - DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
290
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
291
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
292
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
293
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
294
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. e.g `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
295
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
296
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
297
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
298
+ - DuckDB's way of converting between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
299
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
300
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
301
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
302
+ - DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
303
+ - DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
304
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`<|eot_id|>
305
+ <|start_header_id|>user<|end_header_id|>
306
+
307
+ Database Schema:
308
+ Here is the schema of the DuckDB database that the SQL query will run on:
309
+ ```sql
310
+ {schema}
311
+ ```
312
+
313
+ Question:
314
+ Here is the question or an instruction the user provided:
315
+ {question}
316
+
317
+ Task:
318
+ Write a DuckDB SQL query for the given question!<|eot_id|>
319
+ <|start_header_id|>assistant<|end_header_id|>
320
+
321
+ ```sql
322
+ """
323
+
324
+ @classmethod
325
+ def format_retrieved_context(
326
+ cls,
327
+ context: list[str],
328
+ ) -> str:
329
+ """Format retrieved context."""
330
+ context_str = "\n--------\n".join(context)
331
+ return f"\n### Documentation:\n{context_str}\n"
332
+
333
+ @classmethod
334
+ def format_prompt(
335
+ cls,
336
+ instruction: str,
337
+ table_text: str,
338
+ context_text: str,
339
+ ) -> str | list[str]:
340
+ """Get prompt format."""
341
+ instruction = cls.PROMPT_TEMPLATE.format(
342
+ schema=table_text,
343
+ question=instruction
344
+ )
345
+ return instruction
346
+
347
+ class DuckDBInstFormatterGraniteShort(RajkumarFormatter):
348
+ """DuckDB Inst class."""
349
+
350
+ PROMPT_TEMPLATE = """System:
351
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
352
+
353
+ Here are some DuckDB SQL syntax specifics you should be aware of:
354
+
355
+
356
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity, and single quotes (') to define string literals
357
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
358
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
359
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
360
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
361
+ - DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
362
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
363
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
364
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
365
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
366
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. e.g `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
367
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
368
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
369
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
370
+ - DuckDB's way of converting between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
371
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
372
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
373
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
374
+ - DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
375
+ - DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
376
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`
377
+
378
+ Here is the schema of the DuckDB database that the SQL query will run on:
379
+ {schema}
380
+
381
+ Question:
382
+ Here is the question or an instruction the user provided:
383
+ {question}
384
+
385
+ Write a DuckDB SQL query for the given question!
386
+
387
+ Answer:
388
+ ```
389
+ """
390
+
391
+ @classmethod
392
+ def format_retrieved_context(
393
+ cls,
394
+ context: list[str],
395
+ ) -> str:
396
+ """Format retrieved context."""
397
+ context_str = "\n--------\n".join(context)
398
+ return f"\n### Documentation:\n{context_str}\n"
399
+
400
+ @classmethod
401
+ def format_prompt(
402
+ cls,
403
+ instruction: str,
404
+ table_text: str,
405
+ context_text: str,
406
+ ) -> str | list[str]:
407
+ """Get prompt format."""
408
+ instruction = cls.PROMPT_TEMPLATE.format(
409
+ schema=table_text,
410
+ question=instruction
411
+ )
412
+ return instruction
413
+
414
+ class DuckDBInstFormatterLlama(RajkumarFormatter):
415
+ """DuckDB Inst class."""
416
+
417
+ PROMPT_TEMPLATE = """<|begin_of_text|>
418
+
419
+ Your task is to generate valid DuckDB SQL to answer the following question, given a DuckDB database schema.
420
+
421
+ ## DuckDB SQL syntax specifics you should be aware of:
422
+
423
+ ### Case Insensitivity and Quoting:
424
+
425
+ Identifiers (tables, columns): Case-insensitive, but DuckDB remembers the case you use. Use double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity.
426
+ ```
427
+ CREATE TABLE "My Table" ("column_name" VARCHAR); -- Spaces and mixed case
428
+ SELECT "column_name" FROM "My Table";
429
+ ```
430
+
431
+ ### String Literals: Always use single quotes (') to define string literals.
432
+ ```
433
+ SELECT 'This is a string' AS text;
434
+ ```
435
+
436
+ ### Direct File Querying: Query CSV, Parquet, and JSON files directly without loading them first.
437
+
438
+ ```
439
+ SELECT * FROM 'data.csv';
440
+ SELECT * FROM 'data.parquet';
441
+ SELECT * FROM 'data.json';
442
+ ```
443
+
444
+ ### CREATE TABLE AS (CTAS): Create tables from query results.
445
+
446
+ ```
447
+ CREATE TABLE squares AS SELECT i, i * i AS square FROM generate_series(1, 10) t(i);
448
+ ```
449
+
450
+ ### FROM-First Syntax (Optional SELECT): Start queries with FROM, and optionally omit SELECT *.
451
+
452
+ ```
453
+ FROM my_table WHERE condition; -- Equivalent to SELECT * FROM my_table WHERE condition
454
+ ```
455
+
456
+ ### SELECT without FROM: DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly.
457
+
458
+ ```
459
+ SELECT 1 + 1 AS result;
460
+ ```
461
+
462
+ ### GROUP BY ALL/ORDER BY ALL: Shorthand for grouping/ordering by all non-aggregated/all columns.
463
+
464
+ ```
465
+ SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;
466
+ SELECT * FROM my_table ORDER BY ALL;
467
+ ```
468
+
469
+ ### SELECT COLUMNS(): Powerful way to select or transform multiple columns using patterns or functions.
470
+
471
+ ```
472
+ -- Select columns matching a pattern
473
+ SELECT COLUMNS('sales_.*') FROM sales_data;
474
+
475
+ -- Transform multiple columns with a function
476
+ SELECT AVG(COLUMNS(*)) FROM sales_data;
477
+ ```
478
+
479
+ ### UNION BY NAME: Combine tables by matching column names, not just their positions.
480
+
481
+ ```
482
+ SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;
483
+ ```
484
+
485
+ ### Implicit/Explicit Casting: DuckDB is generally more lenient with implicit type conversions, but you can always be explicit using ::
486
+
487
+ ```
488
+ SELECT '42' + 1; -- Implicit cast, result is 43
489
+ SELECT '42'::INTEGER + 1; -- Explicit cast, result is 43
490
+ ```
491
+
492
+ ### String/List Slicing: Extract parts of strings and lists using [start:end] or [start:end:step] syntax.
493
+
494
+ ```
495
+ SELECT 'DuckDB'[1:4]; -- Returns 'Duck'
496
+ SELECT [1, 2, 3, 4][1:3]; -- Returns [1, 2, 3]
497
+ ```
498
+
499
+ ### Simple List/Struct/Map/Array Creation: Create complex types using intuitive syntax.
500
+
501
+ In a SELECT statement:
502
+ ```
503
+ SELECT [1, 2, 3] AS my_list, {{'a': 1, 'b': 'text'}} AS my_struct, MAP([1,2],['one','two']) as my_map;
504
+ ```
505
+
506
+ When creating a table:
507
+ ```
508
+ CREATE TABLE data (
509
+ my_list INTEGER[],
510
+ my_struct STRUCT(a INTEGER, b TEXT),
511
+ my_map MAP(INTEGER, VARCHAR),
512
+ my_array INTEGER[3]
513
+ );
514
+ ```
515
+
516
+ ### Timestamp Conversions and Extraction: Convert between text and timestamps, and extract date parts.
517
+
518
+ ```
519
+ SELECT strftime(NOW(), '%Y-%m-%d'); -- Current date as 'YYYY-MM-DD'
520
+ SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP; -- String to timestamp
521
+ SELECT EXTRACT(YEAR FROM DATE '2023-07-23'); -- Extract year
522
+ ```
523
+
524
+ ### Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses.
525
+
526
+ ```
527
+ SELECT a + b AS total
528
+ FROM my_table
529
+ WHERE total > 10
530
+ GROUP BY total
531
+ HAVING total < 20;
532
+ ```
533
+
534
+ ### List Comprehensions: Generate lists using expressions similar to Python list comprehensions.
535
+
536
+ ```
537
+ SELECT [x*2 FOR x IN [1, 2, 3]]; -- Returns [2, 4, 6]
538
+ ```
539
+
540
+ ### Function Chaining: Chain multiple function calls together using the dot (.) operator.
541
+
542
+ ```
543
+ SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB'
544
+ ```
545
+
546
+ ### Regular Expressions: DuckDB has built-in functions for regex matching, replacement, and extraction.
547
+
548
+ ```
549
+ SELECT regexp_matches('DuckDB', 'Duck'); -- Returns true
550
+ SELECT regexp_replace('DuckDB', 'Duck', 'Goose'); -- Returns 'GooseDB'
551
+ SELECT regexp_extract('DuckDB', '(\w+)(DB)', 1); -- Returns 'Duck'
552
+ ```
553
+
554
+ ### Sampling: Quickly get a subset of your data with SAMPLE or TABLESAMPLE.
555
+
556
+ ```
557
+ SELECT * FROM large_table USING SAMPLE 10%; -- Random 10% sample
558
+ SELECT * FROM large_table TABLESAMPLE BERNOULLI(10); -- Bernoulli sampling
559
+ ```
560
+
561
+ ### ATTACH and Access: Attach external databases and reference their objects using databasename.table_name syntax.
562
+
563
+ ```
564
+ ATTACH 'my_database.duckdb' AS mydb;
565
+ SELECT * FROM mydb.my_table;
566
+ ```
567
+
568
+ ### SUMMARIZE: Get summary statistics (min, max, unique count, average, standard deviation, quartiles, and count) of a table.
569
+
570
+ ```
571
+ SUMMARIZE table_name;
572
+ ```
573
+
574
+ ### DESCRIBE: Get schema of a table (column_name, column_type, null, key, default, extra).
575
+
576
+ ```
577
+ DESCRIBE table_name;
578
+ ```
579
+
580
+ Database Schema:
581
+ Here is the schema of the DuckDB database that the SQL query will run on:
582
+ {schema}
583
+
584
+ Question:
585
+ Here is the question or an instruction the user provided:
586
+ {question}
587
+
588
+ Task:
589
+ Write a DuckDB SQL query for the given question!
590
+
591
+ Here is the valid DuckDB SQL query:
592
+ ```
593
+ """
594
+
595
+ @classmethod
596
+ def format_retrieved_context(
597
+ cls,
598
+ context: list[str],
599
+ ) -> str:
600
+ """Format retrieved context."""
601
+ context_str = "\n--------\n".join(context)
602
+ return f"\n### Documentation:\n{context_str}\n"
603
+
604
+ @classmethod
605
+ def format_prompt(
606
+ cls,
607
+ instruction: str,
608
+ table_text: str,
609
+ context_text: str,
610
+ ) -> str | list[str]:
611
+ """Get prompt format."""
612
+ instruction = cls.PROMPT_TEMPLATE.format(
613
+ schema=table_text,
614
+ question=instruction
615
+ )
616
+ return instruction
617
+
618
+ class DuckDBInstFormatterGranite(RajkumarFormatter):
619
+ """DuckDB Inst class."""
620
+
621
+ PROMPT_TEMPLATE = """System:
622
+
623
+ Your task is to generate valid DuckDB SQL to answer the following question, given a DuckDB database schema.
624
+
625
+ ## DuckDB SQL syntax specifics you should be aware of:
626
+
627
+ ### Case Insensitivity and Quoting:
628
+
629
+ Identifiers (tables, columns): Case-insensitive, but DuckDB remembers the case you use. Use double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity.
630
+ ```
631
+ CREATE TABLE "My Table" ("column_name" VARCHAR); -- Spaces and mixed case
632
+ SELECT "column_name" FROM "My Table";
633
+ ```
634
+
635
+ ### String Literals: Always use single quotes (') to define string literals.
636
+ ```
637
+ SELECT 'This is a string' AS text;
638
+ ```
639
+
640
+ ### Direct File Querying: Query CSV, Parquet, and JSON files directly without loading them first.
641
+
642
+ ```
643
+ SELECT * FROM 'data.csv';
644
+ SELECT * FROM 'data.parquet';
645
+ SELECT * FROM 'data.json';
646
+ ```
647
+
648
+ ### CREATE TABLE AS (CTAS): Create tables from query results.
649
+
650
+ ```
651
+ CREATE TABLE squares AS SELECT i, i * i AS square FROM generate_series(1, 10) t(i);
652
+ ```
653
+
654
+ ### FROM-First Syntax (Optional SELECT): Start queries with FROM, and optionally omit SELECT *.
655
+
656
+ ```
657
+ FROM my_table WHERE condition; -- Equivalent to SELECT * FROM my_table WHERE condition
658
+ ```
659
+
660
+ ### SELECT without FROM: DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly.
661
+
662
+ ```
663
+ SELECT 1 + 1 AS result;
664
+ ```
665
+
666
+ ### GROUP BY ALL/ORDER BY ALL: Shorthand for grouping/ordering by all non-aggregated/all columns.
667
+
668
+ ```
669
+ SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;
670
+ SELECT * FROM my_table ORDER BY ALL;
671
+ ```
672
+
673
+ ### SELECT COLUMNS(): Powerful way to select or transform multiple columns using patterns or functions.
674
+
675
+ ```
676
+ -- Select columns matching a pattern
677
+ SELECT COLUMNS('sales_.*') FROM sales_data;
678
+
679
+ -- Transform multiple columns with a function
680
+ SELECT AVG(COLUMNS(*)) FROM sales_data;
681
+ ```
682
+
683
+ ### UNION BY NAME: Combine tables by matching column names, not just their positions.
684
+
685
+ ```
686
+ SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;
687
+ ```
688
+
689
+ ### Implicit/Explicit Casting: DuckDB is generally more lenient with implicit type conversions, but you can always be explicit using ::
690
+
691
+ ```
692
+ SELECT '42' + 1; -- Implicit cast, result is 43
693
+ SELECT '42'::INTEGER + 1; -- Explicit cast, result is 43
694
+ ```
695
+
696
+ ### String/List Slicing: Extract parts of strings and lists using [start:end] or [start:end:step] syntax.
697
+
698
+ ```
699
+ SELECT 'DuckDB'[1:4]; -- Returns 'Duck'
700
+ SELECT [1, 2, 3, 4][1:3]; -- Returns [1, 2, 3]
701
+ ```
702
+
703
+ ### Simple List/Struct/Map/Array Creation: Create complex types using intuitive syntax.
704
+
705
+ In a SELECT statement:
706
+ ```
707
+ SELECT [1, 2, 3] AS my_list, {{'a': 1, 'b': 'text'}} AS my_struct, MAP([1,2],['one','two']) as my_map;
708
+ ```
709
+
710
+ When creating a table:
711
+ ```
712
+ CREATE TABLE data (
713
+ my_list INTEGER[],
714
+ my_struct STRUCT(a INTEGER, b TEXT),
715
+ my_map MAP(INTEGER, VARCHAR),
716
+ my_array INTEGER[3]
717
+ );
718
+ ```
719
+
720
+ ### Timestamp Conversions and Extraction: Convert between text and timestamps, and extract date parts.
721
+
722
+ ```
723
+ SELECT strftime(NOW(), '%Y-%m-%d'); -- Current date as 'YYYY-MM-DD'
724
+ SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP; -- String to timestamp
725
+ SELECT EXTRACT(YEAR FROM DATE '2023-07-23'); -- Extract year
726
+ ```
727
+
728
+ ### Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses.
729
+
730
+ ```
731
+ SELECT a + b AS total
732
+ FROM my_table
733
+ WHERE total > 10
734
+ GROUP BY total
735
+ HAVING total < 20;
736
+ ```
737
+
738
+ ### List Comprehensions: Generate lists using expressions similar to Python list comprehensions.
739
+
740
+ ```
741
+ SELECT [x*2 FOR x IN [1, 2, 3]]; -- Returns [2, 4, 6]
742
+ ```
743
+
744
+ ### Function Chaining: Chain multiple function calls together using the dot (.) operator.
745
+
746
+ ```
747
+ SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB'
748
+ ```
749
+
750
+ ### Regular Expressions: DuckDB has built-in functions for regex matching, replacement, and extraction.
751
+
752
+ ```
753
+ SELECT regexp_matches('DuckDB', 'Duck'); -- Returns true
754
+ SELECT regexp_replace('DuckDB', 'Duck', 'Goose'); -- Returns 'GooseDB'
755
+ SELECT regexp_extract('DuckDB', '(\w+)(DB)', 1); -- Returns 'Duck'
756
+ ```
757
+
758
+ ### Sampling: Quickly get a subset of your data with SAMPLE or TABLESAMPLE.
759
+
760
+ ```
761
+ SELECT * FROM large_table USING SAMPLE 10%; -- Random 10% sample
762
+ SELECT * FROM large_table TABLESAMPLE BERNOULLI(10); -- Bernoulli sampling
763
+ ```
764
+
765
+ ### ATTACH and Access: Attach external databases and reference their objects using databasename.table_name syntax.
766
+
767
+ ```
768
+ ATTACH 'my_database.duckdb' AS mydb;
769
+ SELECT * FROM mydb.my_table;
770
+ ```
771
+
772
+ ### SUMMARIZE: Get summary statistics (min, max, unique count, average, standard deviation, quartiles, and count) of a table.
773
+
774
+ ```
775
+ SUMMARIZE table_name;
776
+ ```
777
+
778
+ ### DESCRIBE: Get schema of a table (column_name, column_type, null, key, default, extra).
779
+
780
+ ```
781
+ DESCRIBE table_name;
782
+ ```
783
+
784
+ Here is the schema of the DuckDB database that the SQL query will run on:
785
+ {schema}
786
+
787
+ Question:
788
+ Here is the question or an instruction the user provided:
789
+ {question}
790
+
791
+ Please write a DuckDB SQL query that answers the user's question or instruction. Use DuckDB-specific syntax if possible.
792
+
793
+ Answer:
794
+ ```
795
+ """
796
+
797
+ @classmethod
798
+ def format_retrieved_context(
799
+ cls,
800
+ context: list[str],
801
+ ) -> str:
802
+ """Format retrieved context."""
803
+ context_str = "\n--------\n".join(context)
804
+ return f"\n### Documentation:\n{context_str}\n"
805
+
806
+ @classmethod
807
+ def format_prompt(
808
+ cls,
809
+ instruction: str,
810
+ table_text: str,
811
+ context_text: str,
812
+ ) -> str | list[str]:
813
+ """Get prompt format."""
814
+ instruction = cls.PROMPT_TEMPLATE.format(
815
+ schema=table_text,
816
+ question=instruction
817
+ )
818
+ return instruction
819
+
820
+ class DuckDBInstFormatterPhi(RajkumarFormatter):
821
+ """DuckDB Inst class."""
822
+
823
+ PROMPT_TEMPLATE = """<|endoftext|><|user|>
824
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
825
+
826
+ Here are some DuckDB SQL syntax specifics you should be aware of:
827
+
828
+
829
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity, and single quotes (') to define string literals
830
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
831
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
832
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
833
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
834
+ - DuckDB supports attaching multiple databases using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
835
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
836
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
837
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
838
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all, e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
839
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns, e.g. `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
840
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
841
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) AS my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b INTEGER[3]));`
842
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed with brackets ([]).
843
+ - DuckDB can convert between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');`, String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
844
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
845
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
846
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
847
+ - DuckDB has a JSON data type. It supports selecting fields from JSON using the arrow operator -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
848
+ - DuckDB has built-in regex functions: regexp_matches(column, regex), regexp_replace(column, regex, replacement), and regexp_extract(column, regex).
849
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`
850
+
851
+ Here is the schema of the DuckDB database that the SQL query will run on:
852
+ {schema}
853
+
854
+ Question:
855
+ Here is the question or an instruction the user provided:
856
+ {question}
857
+
858
+ Write a DuckDB SQL query for the given question!<|end|>
859
+ <|assistant|>
860
+ ```sql
861
+ """
862
+
863
+ @classmethod
864
+ def format_retrieved_context(
865
+ cls,
866
+ context: list[str],
867
+ ) -> str:
868
+ """Format retrieved context."""
869
+ context_str = "\n--------\n".join(context)
870
+ return f"\n### Documentation:\n{context_str}\n"
871
+
872
+ @classmethod
873
+ def format_prompt(
874
+ cls,
875
+ instruction: str,
876
+ table_text: str,
877
+ context_text: str,
878
+ ) -> str | list[str]:
879
+ """Get prompt format."""
880
+ instruction = cls.PROMPT_TEMPLATE.format(
881
+ schema=table_text,
882
+ question=instruction
883
+ )
884
+ return instruction
885
+
886
+ class DuckDBInstFormatterGPTmini(RajkumarFormatter):
887
+ """DuckDB Inst class."""
888
+
889
+ PROMPT_TEMPLATE = """Schema:
890
+ ```sql
891
+ {schema}
892
+ ```
893
+
894
+ Question:
895
+ {question}
896
+
897
+ Write a valid DuckDB SQL query to answer the question!
898
+ """
899
+
900
+ @classmethod
901
+ def format_retrieved_context(
902
+ cls,
903
+ context: list[str],
904
+ ) -> str:
905
+ """Format retrieved context."""
906
+ context_str = "\n--------\n".join(context)
907
+ return f"\n### Documentation:\n{context_str}\n"
908
+
909
+ @classmethod
910
+ def format_prompt(
911
+ cls,
912
+ instruction: str,
913
+ table_text: str,
914
+ context_text: str,
915
+ ) -> str | list[str]:
916
+ """Get prompt format."""
917
+ instruction = cls.PROMPT_TEMPLATE.format(
918
+ schema=table_text,
919
+ question=instruction
920
+ )
921
+ return instruction
922
+
923
+ class DuckDBInstFormatterPhiAzure(RajkumarFormatter):
924
+ """DuckDB Inst class."""
925
+
926
+ PROMPT_TEMPLATE = """Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
927
+
928
+ Here is the schema of the DuckDB database that the SQL query will run on:
929
+ {schema}
930
+
931
+ Question:
932
+ Here is the question or an instruction the user provided:
933
+ {question}
934
+
935
+ Write a DuckDB SQL query for the given question!
936
+ """
937
+
938
+ @classmethod
939
+ def format_retrieved_context(
940
+ cls,
941
+ context: list[str],
942
+ ) -> str:
943
+ """Format retrieved context."""
944
+ context_str = "\n--------\n".join(context)
945
+ return f"\n### Documentation:\n{context_str}\n"
946
+
947
+ @classmethod
948
+ def format_prompt(
949
+ cls,
950
+ instruction: str,
951
+ table_text: str,
952
+ context_text: str,
953
+ ) -> str | list[str]:
954
+ """Get prompt format."""
955
+ instruction = cls.PROMPT_TEMPLATE.format(
956
+ schema=table_text,
957
+ question=instruction
958
+ )
959
+ return instruction
960
+
961
+ class DuckDBInstNoShorthandFormatter(DuckDBInstFormatter):
962
+ """DuckDB Inst class."""
963
+
964
+ PROMPT_TEMPLATE = """### Instruction:\n{instruction}\n\n### Input:\n{input}{context}\n### Question:\n{question}\n\n### Response:\n"""
965
+ INSTRUCTION_TEMPLATE = """Your task is to generate valid DuckDB SQL to answer the following question{has_schema}""" # noqa: E501
966
+
967
+
968
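The instruction formatters above all follow the same pattern: `format_all_tables` renders the schema, `format_retrieved_context` joins documentation snippets, and `format_prompt` fills `PROMPT_TEMPLATE` with the schema and question. A minimal usage sketch (the table text is a placeholder, and we assume `prompt_formatters` is importable from the eval directory):

```python
# Sketch only: render a prompt with one of the instruction formatters above.
from prompt_formatters import DuckDBInstFormatterPhi

table_text = 'CREATE TABLE taxi ("fare" FLOAT, "tip" FLOAT)'  # placeholder schema
prompt = DuckDBInstFormatterPhi.format_prompt(
    instruction="What is the average tip?",
    table_text=table_text,
    context_text="",  # no retrieved documentation in this sketch
)
print(prompt)  # the filled PROMPT_TEMPLATE, ending with the open SQL fence
```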
+ class DuckDBChat:
969
+ """DuckDB Inst class."""
970
+
971
+ table_sep: str = "\n\n"
972
+ shuffle_table_order: bool = True
973
+ _cache: dict[tuple[str, str, str], list[str]] = {}
974
+ clean_whitespace = False
975
+ model = None
976
+
977
+ @classmethod
978
+ def format_table(cls, table: Table) -> str:
979
+ """Get table format."""
980
+ table_fmt = []
981
+ for col in table.columns or []:
982
+ # This is technically an incorrect type, but it should be a catchall word
983
+ table_fmt.append(f" {col.name} {col.dtype or 'any'}")
984
+ if table_fmt:
985
+ all_cols = ",\n".join(table_fmt)
986
+ create_tbl = f"CREATE TABLE {table.name} (\n{all_cols}\n)"
987
+ else:
988
+ create_tbl = f"CREATE TABLE {table.name}"
989
+ return create_tbl
990
+
991
+ @classmethod
992
+ def format_all_tables(cls, tables: list[Table], instruction: str) -> list[dict]:
993
+ """Get all tables format."""
994
+ if not cls.model:
995
+ cls.model = Manifest(
996
+ engine="gpt-3.5-turbo",
997
+ client_name="openaichat",
998
+ cache_name="sqlite",
999
+ cache_connection=".manifest.sqlite",
1000
+ )
1001
+ table_texts = [cls.format_table(table) for table in tables]
1002
+ full_schema = cls.table_sep.join(table_texts)
1003
+ prompt = f"""SQL schema of my database:
1004
+ {full_schema}
1005
+ Explain in a few sentences what the data is about:
1006
+ """
1007
+ messages = [
1008
+ {
1009
+ "role": "system",
1010
+ "content": "You are a helpful assistant that can generate an human redable summary of database content based on the schema.",
1011
+ },
1012
+ {"role": "user", "content": prompt},
1013
+ ]
1014
+ explanation = cls.model.run(messages, temperature=0)
1015
+ messages.append({"role": "assistant", "content": explanation})
1016
+ return messages[1:]
1017
+
1018
+ @classmethod
1019
+ def format_retrieved_context(
1020
+ cls,
1021
+ context: list[str],
1022
+ ) -> str:
1023
+ """Format retrieved context."""
1024
+ context_str = "\n--------\n".join(context)
1025
+ return f"\n\nHere is additional documentation about DuckDB that could be useful.\n--------\n{context_str}\n--------\n"
1026
+
1027
+ @classmethod
1028
+ def format_prompt(
1029
+ cls,
1030
+ instruction: str,
1031
+ table_text: list[dict],
1032
+ context_text: str,
1033
+ ) -> str | list[str]:
1034
+ """Get prompt format."""
1035
+ prompt = f"""Now output a single SQL query without any explanation and do not add anything
1036
+ to the query that was not part of the question, also do not use markdown. Make sure to only
1037
+ use information provided in the prompt, or tables and columns from the schema above and write a query to answer the question.{context_text}\n\nMy question is \n`{instruction}`\n\nGenerate the DuckDB-specific SQL query:""" # noqa: E501
1038
+ messages = [
1039
+ {
1040
+ "role": "system",
1041
+ "content": "You are a helpful assistant that can generate DuckDB sql queries, which is a superset of Postgresql, based on the user input. You do not respond with any human readable text, only SQL code.",
1042
+ },
1043
+ *table_text,
1044
+ {"role": "user", "content": prompt},
1045
+ ]
1046
+ return messages
1047
+
1048
+ @classmethod
1049
+ def format_model_output(cls, output_sql: str, prompt: str) -> str:
1050
+ """Format model output."""
1051
+ return output_sql
1052
+
1053
+ @classmethod
1054
+ def format_gold_output(cls, output_sql: str) -> str:
1055
+ """Format gold output for demonstration."""
1056
+ return output_sql
duckdb-nsql/eval/schema.py ADDED
@@ -0,0 +1,115 @@
1
+ """Text2SQL schemas."""
2
+ import enum
3
+
4
+ from manifest.response import Usage
5
+ from pydantic import BaseModel
6
+
7
+ DEFAULT_TABLE_NAME: str = "db_table"
8
+
9
+
10
+ class Dialect(str, enum.Enum):
11
+ """SQGFluff and SQLGlot dialects.
12
+
13
+ Lucky for us, the dialects match both parsers.
14
+
15
+ Ref: https://github.com/sqlfluff/sqlfluff/blob/main/src/sqlfluff/core/dialects/__init__.py # noqa: E501
16
+ Ref: https://github.com/tobymao/sqlglot/blob/main/sqlglot/dialects/__init__.py # noqa: E501
17
+ """
18
+
19
+ SNOWFLAKE = "snowflake"
20
+ BIGQUERY = "bigquery"
21
+ REDSHIFT = "redshift"
22
+ POSTGRES = "postgres"
23
+ UNKNOWN = "unknown"
24
+
25
+ @property
26
+ def dialect_str(self) -> str | None:
27
+ """Get the dialect string for validation.
28
+
29
+ We need to pass in dialect = None for UNKNOWN dialects.
30
+ """
31
+ if self != Dialect.UNKNOWN:
32
+ return self.value
33
+ else:
34
+ return None
35
+
36
+ @property
37
+ def quote_str(self) -> str:
38
+ """Get the quote string for the dialect."""
39
+ if self == Dialect.SNOWFLAKE:
40
+ return '"'
41
+ elif self == Dialect.BIGQUERY:
42
+ return "`"
43
+ elif self == Dialect.REDSHIFT:
44
+ return '"'
45
+ elif self == Dialect.POSTGRES:
46
+ return '"'
47
+ elif self == Dialect.UNKNOWN:
48
+ return '"'
49
+ raise NotImplementedError(f"Quote string not implemented for dialect {self}")
50
+
51
+ def quote(self, string: str) -> str:
52
+ """Quote a string."""
53
+ return f"{self.quote_str}{string}{self.quote_str}"
54
+
55
+
56
+ class ColumnOrLiteral(BaseModel):
57
+ """Column that may or may not be a literal."""
58
+
59
+ name: str | None = None
60
+ literal: bool = False
61
+
62
+ def __hash__(self) -> int:
63
+ """Hash."""
64
+ return hash((self.name, self.literal))
65
+
66
+
67
+ class TableColumn(BaseModel):
68
+ """Table column."""
69
+
70
+ name: str
71
+ dtype: str | None
72
+
73
+
74
+ class ForeignKey(BaseModel):
75
+ """Foreign key."""
76
+
77
+ # Referenced column
78
+ column: TableColumn
79
+ # References table name
80
+ references_name: str
81
+ # References column
82
+ references_column: TableColumn
83
+
84
+
85
+ class Table(BaseModel):
86
+ """Table."""
87
+
88
+ name: str | None
89
+ columns: list[TableColumn] | None
90
+ pks: list[TableColumn] | None
91
+ # FK from this table to another column in another table
92
+ fks: list[ForeignKey] | None
93
+ examples: list[dict] | None
94
+ # Is the table a source or intermediate reference table
95
+ is_reference_table: bool = False
96
+
97
+
98
+ class TextToSQLParams(BaseModel):
99
+ """A text to sql request."""
100
+
101
+ instruction: str
102
+ database: str | None
103
+ # Default to unknown
104
+ dialect: Dialect = Dialect.UNKNOWN
105
+ tables: list[Table] | None
106
+
107
+
108
+ class TextToSQLModelResponse(BaseModel):
109
+ """Model for Autocomplete Responses."""
110
+
111
+ output: str
112
+ final_prompt: str | list[dict]
113
+ raw_output: str
114
+ usage: Usage
115
+ metadata: str | None = None
duckdb-nsql/eval/text_to_sql.py ADDED
@@ -0,0 +1,260 @@
1
+ """Text-to-SQL running."""
2
+ import asyncio
3
+ import json
4
+ import re
5
+ import time
6
+ from typing import cast
7
+ import duckdb
8
+
9
+ import structlog
10
+ from manifest import Manifest
11
+ from manifest.response import Response, Usage
12
+ from prompt_formatters import RajkumarFormatter, MotherDuckFormatter
13
+ from schema import DEFAULT_TABLE_NAME, TextToSQLModelResponse, TextToSQLParams
14
+ from tqdm.auto import tqdm
15
+
16
+ logger = structlog.get_logger()
17
+
18
+
19
+ def clean_whitespace(sql: str) -> str:
20
+ """Clean whitespace."""
21
+ return re.sub(r"[\t\n\s]+", " ", sql)
22
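`clean_whitespace` collapses any run of tabs, newlines, and spaces into a single space, e.g.:

```python
assert clean_whitespace("SELECT *\n\tFROM t") == "SELECT * FROM t"
```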
+
23
+
24
+ def instruction_to_sql(
25
+ params: TextToSQLParams,
26
+ extra_context: list[str],
27
+ manifest: Manifest,
28
+ prompt_formatter: RajkumarFormatter | None = None,
29
+ overwrite_manifest: bool = False,
30
+ max_tokens: int = 300,
31
+ temperature: float = 0.1,
32
+ stop_sequences: list[str] | None = None,
33
+ num_beams: int = 1,
34
+ ) -> TextToSQLModelResponse:
35
+ """Parse the instruction to a sql command."""
36
+ return instruction_to_sql_list(
37
+ params=[params],
38
+ extra_context=[extra_context],
39
+ manifest=manifest,
40
+ prompt_formatter=prompt_formatter,
41
+ overwrite_manifest=overwrite_manifest,
42
+ max_tokens=max_tokens,
43
+ temperature=temperature,
44
+ stop_sequences=stop_sequences,
45
+ num_beams=num_beams,
46
+ )[0]
47
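A call sketch for this single-request wrapper (the client, database, and formatter choices are placeholders; a real run needs working API credentials):

```python
# Sketch: one instruction -> one SQL string (delegates to instruction_to_sql_list below).
from manifest import Manifest
from prompt_formatters import DuckDBInstFormatter
from schema import TextToSQLParams

response = instruction_to_sql(
    params=TextToSQLParams(instruction="Count all laptops", database="laptop", tables=[]),
    extra_context=[],
    manifest=Manifest(client_name="openai"),
    prompt_formatter=DuckDBInstFormatter(),
    stop_sequences=[";"],
)
print(response.output)  # the generated SQL
```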
+
48
+ def run_motherduck_prompt_sql(params: list[TextToSQLParams]) -> list[TextToSQLModelResponse]:
49
+ results = []
50
+ for param in params:
51
+ con = duckdb.connect('md:')
52
+ try:
53
+ sql_query = con.execute("CALL prompt_sql(?);", [param.instruction]).fetchall()[0][0]
54
+ except Exception as e:
55
+ print(e)
56
+ sql_query = "SELECT * FROM hn.hacker_news LIMIT 1";
57
+ usage = Usage(
58
+ completion_tokens = 0,
59
+ prompt_tokens = 0,
60
+ total_tokens = 0
61
+ )
62
+ model_response = TextToSQLModelResponse(
63
+ output=sql_query,
64
+ raw_output=sql_query,
65
+ final_prompt=param.instruction,
66
+ usage=usage,
67
+ )
68
+ results.append(model_response)
69
+ return results
70
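The MotherDuck branch above delegates generation to the server-side `prompt_sql` function. A standalone sketch of that call (connecting to `md:` assumes a MotherDuck auth token is configured in your environment):

```python
# Sketch: calling MotherDuck's prompt_sql directly.
import duckdb

con = duckdb.connect("md:")  # assumes a MotherDuck token is configured
rows = con.execute("CALL prompt_sql(?);", ["How many stories mention DuckDB?"]).fetchall()
print(rows[0][0])  # the generated SQL string
```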
+
71
+
72
+
73
+ def instruction_to_sql_list(
74
+ params: list[TextToSQLParams],
75
+ extra_context: list[list[str]],
76
+ manifest: Manifest,
77
+ prompt_formatter: RajkumarFormatter | None = None,
78
+ overwrite_manifest: bool = False,
79
+ max_tokens: int = 300,
80
+ temperature: float = 0.1,
81
+ stop_sequences: list[str] | None = None,
82
+ num_beams: int = 1,
83
+ verbose: bool = False,
84
+ ) -> list[TextToSQLModelResponse]:
85
+ """Parse the list of instructions to sql commands.
86
+
87
+ Connector is used for default retry handlers only.
88
+ """
89
+ if type(prompt_formatter) is MotherDuckFormatter:
90
+ return run_motherduck_prompt_sql(params)
91
+
92
+ if prompt_formatter is None:
93
+ raise ValueError("Prompt formatter is required.")
94
+
95
+ def construct_params(
96
+ params: TextToSQLParams,
97
+ context: list[str],
98
+ ) -> str | list[dict]:
99
+ """Turn params into prompt."""
100
+ if prompt_formatter.clean_whitespace:
101
+ instruction = clean_whitespace(params.instruction)
102
+ else:
103
+ instruction = params.instruction
104
+
105
+ table_texts = prompt_formatter.format_all_tables(
106
+ params.tables, instruction=instruction
107
+ )
108
+ # table_texts can be list of chat messages. Only join list of str.
109
+ if table_texts:
110
+ if isinstance(table_texts[0], str):
111
+ table_text = prompt_formatter.table_sep.join(table_texts)
112
+ else:
113
+ table_text = table_texts
114
+ else:
115
+ table_text = ""
116
+
117
+ if context:
118
+ context_text = prompt_formatter.format_retrieved_context(context)
119
+ else:
120
+ context_text = "" if isinstance(table_text, str) else []
121
+ prompt = prompt_formatter.format_prompt(
122
+ instruction,
123
+ table_text,
124
+ context_text,
125
+ )
126
+ return prompt
127
+
128
+ # If no inputs, return nothing
129
+ if not params:
130
+ return []
131
+
132
+ # Stitch together demonstrations and params
133
+ prompts: list[str | list[dict]] = []
134
+ for i, param in tqdm(
135
+ enumerate(params),
136
+ total=len(params),
137
+ desc="Constructing prompts",
138
+ disable=not verbose,
139
+ ):
140
+ predict_str = construct_params(param, extra_context[i] if extra_context else [])
141
+ if isinstance(predict_str, str):
142
+ prompt = predict_str.lstrip()
143
+ else:
144
+ prompt = predict_str
145
+ prompts.append(prompt)
146
+
147
+ manifest_params = dict(
148
+ max_tokens=max_tokens,
149
+ overwrite_cache=overwrite_manifest,
150
+ num_beams=num_beams,
151
+ logprobs=5,
152
+ temperature=temperature,
153
+ do_sample=False if temperature <= 0 else True,
154
+ stop_sequences=stop_sequences or prompt_formatter.stop_sequences,
155
+ )
156
+
157
+ ret: list[TextToSQLModelResponse] = []
158
+ if len(params) == 1:
159
+ prompt = prompts[0]
160
+ success = False
161
+ retries = 0
162
+ while not success and retries < 5:
163
+ try:
164
+ model_response = _run_manifest(
165
+ prompt,
166
+ manifest_params,
167
+ prompt_formatter,
168
+ manifest,
169
+ stop_sequences=stop_sequences,
170
+ )
171
+ success = True
172
+ except Exception:
173
+ retries += 1
174
+
175
+ if not success:
176
+ raise RuntimeError("Model call failed after 5 retries.")
177
+ ret.append(model_response)
178
+ else:
179
+ # We do not handle retry logic on parallel requests right now
180
+ loop = asyncio.new_event_loop()
181
+ asyncio.set_event_loop(loop)
182
+ response = cast(
183
+ Response,
184
+ loop.run_until_complete(
185
+ manifest.arun_batch(
186
+ prompts,
187
+ **manifest_params, # type: ignore
188
+ ),
189
+ ),
190
+ )
191
+ loop.close()
192
+
193
+ response_usage = response.get_usage()
194
+ response_text = response.get_parsed_response()
195
+ for prompt, resp in zip(prompts, response_text):
196
+ # This will restitch the query in the case we force it to start with SELECT
197
+ sql_query = prompt_formatter.format_model_output(cast(str, resp), prompt)
198
+ for token in stop_sequences or []:
199
+ sql_query = sql_query.split(token)[0]
200
+ logger.info(f"FINAL OUTPUT: {sql_query}")
201
+ ret.append(
202
+ TextToSQLModelResponse(
203
+ output=sql_query,
204
+ raw_output=cast(str, resp),
205
+ final_prompt=prompt,
206
+ usage=response_usage,
207
+ )
208
+ )
209
+
210
+ return ret
211
+
212
+
213
+ def _run_manifest(
214
+ prompt: str | list[str],
215
+ manifest_params: dict,
216
+ prompt_formatter: RajkumarFormatter,
217
+ manifest: Manifest,
218
+ stop_sequences: list[str] | None = None,
219
+ ) -> TextToSQLModelResponse:
220
+ """Run manifest for prompt format."""
221
+ logger.info(f"PARAMS: {manifest_params}")
222
+ if isinstance(prompt, list):
223
+ for p in prompt:
224
+ logger.info(f"PROMPT: {p['role']}: {p['content']}")
225
+ else:
226
+ logger.info(f"PROMPT: {prompt}")
227
+ start_time = time.time()
228
+ # Run result
229
+ response = cast(
230
+ Response,
231
+ manifest.run(
232
+ prompt,
233
+ return_response=True,
234
+ client_timeout=1800,
235
+ **manifest_params, # type: ignore
236
+ ),
237
+ )
238
+ logger.info(f"TIME: {time.time() - start_time: .2f}")
239
+
240
+ response_usage = response.get_usage_obj()
241
+ summed_usage = Usage()
242
+ for usage in response_usage.usages:
243
+ summed_usage.completion_tokens += usage.completion_tokens
244
+ summed_usage.prompt_tokens += usage.prompt_tokens
245
+ summed_usage.total_tokens += usage.total_tokens
246
+ # This will restitch the query in the case we force it to start with SELECT
247
+ sql_query = prompt_formatter.format_model_output(
248
+ cast(str, response.get_response()), prompt
249
+ )
250
+
251
+ for token in stop_sequences or []:
252
+ sql_query = sql_query.split(token)[0]
253
+ logger.info(f"OUTPUT: {sql_query}")
254
+ model_response = TextToSQLModelResponse(
255
+ output=sql_query,
256
+ raw_output=cast(str, response.get_response()),
257
+ final_prompt=prompt,
258
+ usage=summed_usage,
259
+ )
260
+ return model_response
duckdb-nsql/manifest/.flake8 ADDED
@@ -0,0 +1,11 @@
1
+ # This is our code-style check. We currently allow the following exceptions:
2
+ # - E731: do not assign a lambda expression, use a def
3
+ # - E402: module level import not at top of file
4
+ # - W503: line break before binary operator
5
+ # - E203: whitespace before :
6
+
7
+ [flake8]
8
+ exclude = .git
9
+ max-line-length = 88
10
+ ignore = E731, E402, W503, E203, PAI100, PAI101, PAI201, PAI202, PAI203
11
+ per-file-ignores = __init__.py:F401, version.py:D100
duckdb-nsql/manifest/.pre-commit-config.yaml ADDED
@@ -0,0 +1,23 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v3.2.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-toml
9
+ - id: check-merge-conflict
10
+ - id: check-added-large-files
11
+ - repo: https://github.com/timothycrosley/isort
12
+ rev: 5.13.2
13
+ hooks:
14
+ - id: isort
15
+ - repo: https://github.com/psf/black
16
+ rev: 22.3.0
17
+ hooks:
18
+ - id: black
19
+ language_version: python3
20
+ - repo: https://github.com/PyCQA/flake8
21
+ rev: 6.0.0
22
+ hooks:
23
+ - id: flake8
duckdb-nsql/manifest/CHANGELOG.rst ADDED
@@ -0,0 +1,93 @@
1
+ 0.1.10 - Unreleased
2
+ ---------------------
3
+
4
+ 0.1.9 - 2024-01-22
5
+ ---------------------
6
+ Fixed
7
+ ^^^^^
8
+ * Added trust-code params for HF models
9
+ * Added LRU cache to HF model param calls to avoid extra calls
10
+ * Fixed pydantic type issue in HF model return
11
+ * Support for Python 3.10-3.11
12
+
13
+ 0.1.8 - 2023-05-22
14
+ ---------------------
15
+ Added
16
+ ^^^^^
17
+ * Azure model support (completion and chat)
18
+ * Google Vertex API model support (completion and chat)
19
+ * Streaming responses for LM Completions (set stream=True)
20
+
21
+ Fixed
22
+ ^^^^^
23
+ * `run` with batches now acts the same as async run except not async. We will batch requests into appropriate batch sizes.
24
+ * Refactored client so unified preprocess and postprocess of requests and responses to better support model variants in request/response format.
25
+
26
+ 0.1.7 - 2023-05-17
27
+ ---------------------
28
+ Fixed
29
+ ^^^^^
30
+ * `_run_chat` fixed bug where not passing in kwargs
31
+
32
+ 0.1.6 - 2023-05-16
33
+ ---------------------
34
+ Fixed
35
+ ^^^^^
36
+ * Unified `run` and `run_chat` methods so it's just `run` now.
37
+ * LLama HF models for eval
38
+
39
+ 0.1.5 - 2023-05-03
40
+ ---------------------
41
+ Added
42
+ ^^^^^
43
+ * Added chat input for chat models.
44
+
45
+ 0.1.4 - 2023-04-24
46
+ ---------------------
47
+ Added
48
+ ^^^^^
49
+ * Connection pools to swap between clients
50
+ * Chunksize param for async runs
51
+
52
+ Fixed
53
+ ^^^^^
54
+ * Determine cache and response by request type, not client name
55
+ * Refactor Response to use Pydantic types for Request and Response
56
+
57
+ 0.1.1
58
+ ---------------------
59
+ Added
60
+ ^^^^^
61
+ * Async support in arun_batch
62
+
63
+ Fixed
64
+ ^^^^^
65
+ * Batched runs now cache individual items
66
+ * Score prompt does not truncate outside token
67
+
68
+ Removed
69
+ ^^^^^
70
+ * Deprecated chatGPT in favor of openaichat which uses OpenAI completions
71
+ * Deprecated Sessions
72
+
73
+ 0.1.0 - 2022-01-31
74
+ ---------------------
75
+ Added
76
+ ^^^^^
77
+ * Batched inference support in `manifest.run`. No more separate `manifest.run_batch` method.
78
+ * Standard request base model for all language inputs.
79
+ * ChatGPT client. Requires CHATGPT_SESSION_KEY to be passed in.
80
+ * Diffusion model support
81
+ * Together model support
82
+
83
+ Removed
84
+ ^^^^^^^
85
+ * `Prompt` class
86
+ * `OPT` client - OPT is now available in HuggingFace
87
+
88
+ 0.0.1 - 2022-11-08
89
+ -------------------
90
+ First major pip release of Manifest. Install via `pip install manifest-ml`.
91
+
92
+
93
+ .. _@lorr1: https://github.com/lorr1
duckdb-nsql/manifest/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
duckdb-nsql/manifest/Makefile ADDED
@@ -0,0 +1,27 @@
1
+ dev:
2
+ pip install -e .[all]
3
+ pre-commit install
4
+
5
+ test: dev check
6
+ pytest tests
7
+
8
+ format:
9
+ isort --atomic manifest/ tests/ web_app/
10
+ black manifest/ tests/ web_app/
11
+
12
+ check:
13
+ isort -c manifest/ tests/ web_app/
14
+ black manifest/ tests/ web_app/ --check
15
+ flake8 manifest/ tests/ web_app/
16
+ mypy manifest/ tests/ web_app/
17
+
18
+ clean:
19
+ pip uninstall -y manifest
20
+ rm -rf src/manifest.egg-info
21
+ rm -rf build/ dist/
22
+
23
+ prune:
24
+ @bash -c "git fetch -p";
25
+ @bash -c "for branch in $(git branch -vv | grep ': gone]' | awk '{print $1}'); do git branch -d $branch; done";
26
+
27
+ .PHONY: dev test clean check prune
duckdb-nsql/manifest/README.md ADDED
@@ -0,0 +1,304 @@
1
+ # Manifest
2
+ How to make prompt programming with Foundation Models a little easier.
3
+
4
+
5
+ # Table of Contents
6
+ - [Install](#install)
7
+ - [Getting Started](#getting-started)
8
+ - [Manifest](#manifest-components)
9
+ - [Other Models Types](#other-models)
10
+ - [Local HuggingFace Models](#local-huggingface-models)
11
+ - [Chat Models](#chat-models)
12
+ - [Embedding Models](#embedding-models)
13
+ - [Road Map](#road-map)
14
+ - [Development](#development)
15
+ - [Cite](#cite)
16
+
17
+
18
+ # Install
19
+ Install:
20
+ ```bash
21
+ pip install manifest-ml
22
+ ```
23
+
24
+ Install with diffusion support:
25
+ ```bash
26
+ pip install manifest-ml[diffusers]
27
+ ```
28
+
29
+ Install with HuggingFace local model support:
30
+ ```bash
31
+ pip install manifest-ml[api]
32
+ ```
33
+
34
+ Dev Install:
35
+ ```bash
36
+ git clone [email protected]:HazyResearch/manifest.git
37
+ cd manifest
38
+ make dev
39
+ ```
40
+
41
+ # Getting Started
42
+ Getting started is simple. If using OpenAI, set `export OPENAI_API_KEY=<OPENAIKEY>` (or pass the key in through the variable `client_connection`), then run
43
+
44
+ ```python
45
+ from manifest import Manifest
46
+
47
+ # Start a manifest session to OpenAI - default `engine=text-davinci-003`
48
+ manifest = Manifest(
49
+ client_name = "openai",
50
+ )
51
+ manifest.run("Why is the grass green?")
52
+ ```
53
+
54
+ ## Examples
55
+ We have example notebooks and Python scripts located at [examples](examples). These show how to use different models, model types (i.e. text, diffusers, or embedding models), and async running.
56
+
57
+ # Manifest Components
58
+ Manifest is meant to be a very lightweight package to help with prompt design and iteration. Three key design decisions of Manifest are
59
+
60
+ * All models are behind APIs
61
+ * Supports caching of model inputs/outputs for iteration, reproducibility, and cost saving
62
+ * Unified API to support generate, score, and embed
63
+
64
+ ## Models
65
+ Manifest provides model clients for [OpenAI](https://openai.com/), [AI21](https://studio.ai21.com/), [Cohere](https://cohere.ai/), [Together](https://together.xyz/), and HuggingFace (see [below](#huggingface-models) for how to use locally hosted HuggingFace models). You can toggle between the models by changing `client_name` and `client_connection`. For example, if a HuggingFace model is loaded locally, run
66
+ ```python
67
+ manifest = Manifest(
68
+ client_name = "huggingface",
69
+ client_connection = "http://127.0.0.1:5000",
70
+ )
71
+ ```
72
+ If you want to use Cohere, run
73
+ ```python
74
+ manifest = Manifest(
75
+ client_name = "cohere",
76
+ client_connection = <COHERE_API_KEY>,
77
+ )
78
+ ```
79
+ You can also just set `export COHERE_API_KEY=<COHERE_API_KEY>` and not use `client_connection`.
80
+
81
+ If you want to use AI21 Labs, run
82
+ ```python
83
+ manifest = Manifest(
84
+ client_name = "ai21",
85
+ client_connection = <AI21_API_KEY>,
86
+ )
87
+ ```
88
+
89
+ You can see the model details and possible model inputs to `run()` via
90
+ ```python
91
+ print(manifest.client_pool.get_current_client().get_model_params())
92
+ print(manifest.client_pool.get_current_client().get_model_inputs())
93
+ ```
94
+
95
+ ## Global Cache
96
+ We support having queries and results stored in a global cache that can be shared across users. We treat inputs and outputs as key value pairs and support SQLite or Redis backends. To start with global caching using SQLite, run
97
+
98
+ ```python
99
+ manifest = Manifest(
100
+ client_name = "openai",
101
+ cache_name = "sqlite",
102
+ cache_connection = "mycache.sqlite",
103
+ )
104
+ ```
105
+ The cache will be saved in `mycache.sqlite`.
106
+
107
+ We also support a Redis backend.
108
+ ```python
109
+ manifest = Manifest(
110
+ client_name = "openai",
111
+ cache_name = "redis",
112
+ cache_connection = "localhost:6379"
113
+ )
114
+ ```
115
+ As a hint, if you want to get Redis running, see the `docker run` command below under development.
116
+
117
+ ## Running Queries
118
+ Once you have a session open, you can write and develop prompts.
119
+
120
+ ```python
121
+ result = manifest.run("Hello, my name is Laurel")
122
+ ```
123
+
124
+ You can also run over multiple examples if supported by the client.
125
+ ```python
126
+ results = manifest.run(["Where are the cats?", "Where are the dogs?"])
127
+ ```
128
+
129
+ We support async queries as well via
130
+ ```python
131
+ import asyncio
132
+ results = asyncio.run(manifest.arun_batch(["Where are the cats?", "Where are the dogs?"]))
133
+ ```
134
+
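+ If you are calling `arun_batch` from inside a Jupyter notebook, note that `asyncio.run` needs a patched event loop there; our example notebooks use `nest_asyncio` for this:
+ ```python
+ import nest_asyncio
+
+ # Required for asyncio.run(...) to work inside Jupyter notebooks
+ nest_asyncio.apply()
+ ```
+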
135
+ If something doesn't go right, you can also ask for the raw manifest `Response` object.
136
+ ```python
137
+ result_object = manifest.run(["Where are the cats?", "Where are the dogs?"], return_response=True)
138
+ print(result_object.get_request_obj())
139
+ print(result_object.is_cached())
140
+ print(result_object.get_response_obj())
141
+ ```
142
+
143
+ By default, we do not truncate results based on a stop token. You can change this by either passing a new stop token to a Manifest session or to a `run`.
144
+ ```python
145
+ result = manifest.run(prompt, "Laurel", stop_token="and")
146
+ ```
147
+
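+ The stop token can also be set once for the whole session. A sketch, assuming the constructor accepts `stop_token` as suggested above:
+ ```python
+ # Assumption: a session-level stop_token applies to every subsequent run
+ manifest = Manifest(
+     client_name = "openai",
+     stop_token = "and",
+ )
+ result = manifest.run("Hello, my name is Laurel")
+ ```
+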
148
+ If you want to change the default parameters of a model, pass those as `kwargs` to the client.
149
+ ```python
150
+ result = manifest.run(prompt, "Laurel", max_tokens=50)
151
+ ```
152
+
153
+ ## Streaming Queries
154
+ Manifest also supports streaming the model response back, assuming it's supported by the underlying client. When calling `run`, pass `stream=True` to get a streaming iterator in response.
155
+
156
+ ```python
157
+ result_iterator = manifest.run("Tell me a story. Once upon a time", max_tokens=100, stream=True)
158
+ for res_text in result_iterator:
159
+ print(res_text)
160
+ ```
161
+ Streaming responses are only supported for single string queries (not batch mode) for text completion models.
162
+
163
+ ## Model Pools
164
+ Manifest supports querying multiple models with different schedulers. This is very much a work in progress, but Manifest will round-robin select (or randomly select) among the clients you provide. You can use the same client multiple times with different connection strings (e.g., different API keys), or you can mix and match. The only requirement is that all clients have the same request type; i.e., you can't have a pool of both generation models and embedding models.
165
+
166
+ To query between a local model and OpenAI,
167
+ ```python
168
+ from manifest.connections.client_pool import ClientConnection
169
+ from manifest import Manifest
170
+
171
+ client_connection1 = ClientConnection(
172
+ client_name="huggingface",
173
+ client_connection="http://127.0.0.1:5000",
174
+ )
175
+ client_connection2 = ClientConnection(client_name="openai", engine="text-ada-001")
176
+ manifest = Manifest(
177
+ client_pool=[client_connection1, client_connection2],
178
+ cache_name="sqlite",
179
+ cache_connection=sqlite_cache,
180
+ )
181
+ manifest.run(...)
182
+ ```
183
+
184
+ The speed benefit comes in with async batched runs. When calling `arun_batch` with a list of prompts, Manifest supports a `chunk_size` param. This will break the prompts into `chunk_size` chunks to spread across the client pool. By default `chunk_size` is `-1`, which means only one client will get all the prompts to run asynchronously. You must set `chunk_size > 1` to distribute across the pool. There is a further `batch_size` param which controls the individual client `batch_size` to send to the model.
185
+
186
+ ```python
187
+ responses = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=20))
188
+ ```
189
+
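+ As a sketch, `chunk_size` and `batch_size` can be combined (assuming `batch_size` is passed alongside `chunk_size`; the numbers below are purely illustrative):
+ ```python
+ # Spread the prompts across the pool in chunks of 20, with each
+ # client sending batches of 10 prompts per model request
+ responses = asyncio.run(
+     manifest.arun_batch(prompts, max_tokens=30, chunk_size=20, batch_size=10)
+ )
+ ```
+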
190
+ # Other Models
191
+
192
+ ## Local Huggingface Models
193
+ To use a HuggingFace generative model, we provide a Flask application in `manifest/api` that hosts the models for you.
194
+
195
+ In a separate terminal or tmux/screen session, to load a 6B-parameter model, run
196
+ ```bash
197
+ python3 -m manifest.api.app \
198
+ --model_type huggingface \
199
+ --model_name_or_path EleutherAI/gpt-j-6B \
200
+ --device 0
201
+ ```
202
+ You will see the Flask session start and output a URL `http://127.0.0.1:5000`. Pass this in to Manifest. If you want to use a different port, set the `FLASK_PORT` environment variable.
203
+
204
+ ```python
205
+ manifest = Manifest(
206
+ client_name = "huggingface",
207
+ client_connection = "http://127.0.0.1:5000",
208
+ )
209
+ ```
210
+
211
+ If you have a custom model you trained, pass the model path to `--model_name_or_path`.
212
+
213
+ To help load larger models, we also support using `parallelize()` from HF, [accelerate](https://huggingface.co/docs/accelerate/index), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), and [deepspeed](https://github.com/microsoft/DeepSpeed). You will need to install these packages first via `pip install manifest-ml[api]`. We list the commands to load larger models below.
214
+
215
+ * T0pp
216
+ ```bash
217
+ python3 -m manifest.api.app \
218
+ --model_type huggingface \
219
+ --model_name_or_path bigscience/T0pp \
220
+ --use_hf_parallelize
221
+ ```
222
+
223
+ * NeoX 20B (requires at least 60GB of GPU memory)
224
+ ```bash
225
+ python3 -m manifest.api.app \
226
+ --model_type huggingface \
227
+ --model_name_or_path EleutherAI/gpt-neox-20b \
228
+ --use_accelerate_multigpu \
229
+ --percent_max_gpu_mem_reduction 0.75
230
+ ```
231
+ * Bloom 175B (requires at least 240GB of GPU memory)
232
+ ```bash
233
+ python3 -m manifest.api.app \
234
+ --model_type huggingface \
235
+ --model_name_or_path bigscience/bloom \
236
+ --use_bitsandbytes \
237
+ --percent_max_gpu_mem_reduction 0.85
238
+ ```
239
+
240
+ ## Chat Models
241
+ Manifest has specific support for executing against chat models in the more standard "system" / "user" dialogue. To pass in a dialogue history to Manifest, use the `run` command with a list of dictionary inputs with `role` and `content` keys using an associated chat model such as `openaichat`.
242
+
243
+ ```python
244
+ manifest = Manifest(client_name="openaichat")
245
+ dialogue = [
246
+ {"role": "system", "content": "You are a helpful assistant who also responds in rhymes"},
247
+ {"role": "user", "content": "What is the date?"},
248
+ ]
249
+ res = manifest.run(dialogue, max_tokens=100)
250
+ ```
251
+
252
+ ## Embedding Models
253
+ Manifest also supports getting embeddings from models and available APIs. We do this by changing the `client_name` argument. You still use `run` and `arun_batch`.
254
+
255
+ To use OpenAI's embedding models, simply run
256
+ ```python
257
+ manifest = Manifest(client_name="openaiembedding")
258
+ embedding_as_np = manifest.run("Get me an embedding for a bunny")
259
+ ```
260
+
261
+ As explained above, you can load local HuggingFace models that give you embeddings, too. If you want to use a standard generative model, load the model as above and use `client_name="huggingfaceembedding"`. If you want to use a standard embedding model, like those from SentenceTransformers, load your local model via
262
+ ```bash
263
+ python3 -m manifest.api.app \
264
+ --model_type sentence_transformers \
265
+ --model_name_or_path all-mpnet-base-v2 \
266
+ --device 0
267
+ ```
268
+
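+ Then connect to it as with any locally hosted model, using the embedding client. A small sketch (mirroring the embedding example notebook):
+ ```python
+ manifest = Manifest(
+     client_name = "huggingfaceembedding",
+     client_connection = "http://127.0.0.1:5000",
+ )
+ emb = manifest.run("Get me an embedding for a bunny")
+ ```
+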
269
+ # Road Map
270
+ Here's what's coming up next
271
+ - [ ] Clients
272
+ - [ ] HuggingFace Hub
273
+ - [x] Azure OpenAI
274
+ - [x] Google Vertex
275
+ - [ ] Anthropic
276
+ - [x] Streaming Support Completions
277
+ - [ ] Streaming Support Chat Models
278
+ - [ ] Data Types
279
+ - [ ] Diffusion Models
280
+ - [x] Orchestration
281
+ - [x] Connection pools
282
+ - [ ] Local Inference
283
+ - [ ] FlexGen
284
+
285
+ # Development
286
+ Before submitting a PR, run
287
+ ```bash
288
+ export REDIS_PORT="6379" # or whatever PORT local redis is running for those tests
289
+ cd <REDIS_PATH>
290
+ docker run -d -p 127.0.0.1:${REDIS_PORT}:6379 -v `pwd`:`pwd` -w `pwd` --name manifest_redis_test redis
291
+ make test
292
+ ```
293
+
294
+ # Cite
295
+ Please cite Manifest if you use it in any publications. Thanks!!
296
+ ```
297
+ @misc{orr2022manifest,
298
+ author = {Orr, Laurel},
299
+ title = {Manifest},
300
+ year = {2022},
301
+ publisher = {GitHub},
302
+ howpublished = {\url{https://github.com/HazyResearch/manifest}},
303
+ }
304
+ ```
duckdb-nsql/manifest/examples/langchain_chatgpt.ipynb ADDED
@@ -0,0 +1,455 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "id": "b253f4d5",
7
+ "metadata": {},
8
+ "source": [
9
+ "# ChatGPT Clone using TOMA GPT-JT-6B\n",
10
+ "(adopted from ChatGPT Clone [notebook](https://github.com/hwchase17/langchain/blob/master/docs/examples/chains/chatgpt_clone.ipynb))"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "b0302886",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "env: TOMA_URL=https://staging.together.xyz/api\n"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "%env TOMA_URL=https://staging.together.xyz/api"
29
+ ]
30
+ },
31
+ {
32
+ "attachments": {},
33
+ "cell_type": "markdown",
34
+ "id": "93a18ea6",
35
+ "metadata": {},
36
+ "source": [
37
+ "Make sure you have langchain installed and manifest. For the most recent versions, run\n",
38
+ "```\n",
39
+ "pip install git+https://github.com/hwchase17/langchain.git\n",
40
+ "pip install git+https://github.com/HazyResearch/manifest.git\n",
41
+ "```"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 35,
47
+ "id": "a99acd89",
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "name": "stdout",
52
+ "output_type": "stream",
53
+ "text": [
54
+ "\n",
55
+ "\n",
56
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
57
+ "Prompt after formatting:\n",
58
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
59
+ "\n",
60
+ "\n",
61
+ "Input: Classes are \"positive\" and \"negative\". For example given\n",
62
+ "Input: I love this product!\n",
63
+ "Output: positive.\n",
64
+ "I think this movie was one of the worst of the year. Script was boring!\n",
65
+ "Output:\u001b[0m\n",
66
+ "\n",
67
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
68
+ "negative.\n"
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "from manifest import Manifest\n",
74
+ "from langchain.llms.manifest import ManifestWrapper\n",
75
+ "from langchain import ConversationChain, LLMChain, PromptTemplate\n",
76
+ "from langchain.chains.conversation.memory import ConversationalBufferWindowMemory\n",
77
+ "\n",
78
+ "\n",
79
+ "template = \"\"\"I am a classification model. It will try to classify your input.\n",
80
+ "\n",
81
+ "{history}\n",
82
+ "Input: {human_input}\n",
83
+ "Output:\"\"\"\n",
84
+ "\n",
85
+ "prompt = PromptTemplate(\n",
86
+ " input_variables=[\"history\", \"human_input\"], \n",
87
+ " template=template\n",
88
+ ")\n",
89
+ "\n",
90
+ "manifest = Manifest(\n",
91
+ " client_name=\"toma\",\n",
92
+ " engine=\"Together-gpt-JT-6B-v1\",\n",
93
+ " max_tokens=150,\n",
94
+ " top_p=0.9,\n",
95
+ " top_k=40,\n",
96
+ " stop_sequences=[\"\\n\"],\n",
97
+ ")\n",
98
+ "\n",
99
+ "chatgpt_chain = LLMChain(\n",
100
+ " llm=ManifestWrapper(client=manifest), \n",
101
+ " prompt=prompt, \n",
102
+ " verbose=True, \n",
103
+ " memory=ConversationalBufferWindowMemory(k=8),\n",
104
+ ")\n",
105
+ "\n",
106
+ "output = chatgpt_chain.predict(human_input=\"Classes are \\\"positive\\\" and \\\"negative\\\". For example given\\nInput: I love this product!\\nOutput: positive.\\nI think this movie was one of the worst of the year. Script was boring!\")\n",
107
+ "print(output)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 36,
113
+ "id": "4ef711d6",
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "name": "stdout",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "\n",
121
+ "\n",
122
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
123
+ "Prompt after formatting:\n",
124
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
125
+ "\n",
126
+ "Human: Classes are \"positive\" and \"negative\". For example given\n",
127
+ "Input: I love this product!\n",
128
+ "Output: positive.\n",
129
+ "I think this movie was one of the worst of the year. Script was boring!\n",
130
+ "AI: negative.\n",
131
+ "Input: So awesome! I wish I could have gone\n",
132
+ "Output:\u001b[0m\n",
133
+ "\n",
134
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
135
+ "positive.\n"
136
+ ]
137
+ }
138
+ ],
139
+ "source": [
140
+ "output = chatgpt_chain.predict(human_input=\"So awesome! I wish I could have gone\")\n",
141
+ "print(output)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 37,
147
+ "id": "a5d6dac2",
148
+ "metadata": {},
149
+ "outputs": [
150
+ {
151
+ "name": "stdout",
152
+ "output_type": "stream",
153
+ "text": [
154
+ "\n",
155
+ "\n",
156
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
157
+ "Prompt after formatting:\n",
158
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
159
+ "\n",
160
+ "Human: Classes are \"positive\" and \"negative\". For example given\n",
161
+ "Input: I love this product!\n",
162
+ "Output: positive.\n",
163
+ "I think this movie was one of the worst of the year. Script was boring!\n",
164
+ "AI: negative.\n",
165
+ "Human: So awesome! I wish I could have gone\n",
166
+ "AI: positive.\n",
167
+ "Input: Hate it.\n",
168
+ "Output:\u001b[0m\n",
169
+ "\n",
170
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
171
+ "negative.\n"
172
+ ]
173
+ }
174
+ ],
175
+ "source": [
176
+ "output = chatgpt_chain.predict(human_input=\"Hate it.\")\n",
177
+ "print(output)"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 43,
183
+ "id": "b9283077",
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "name": "stdout",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "\n",
191
+ "\n",
192
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
193
+ "Prompt after formatting:\n",
194
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
195
+ "\n",
196
+ "\n",
197
+ "Input: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
198
+ "Input: This fruit rippens off of the tree.\n",
199
+ "Output: banana.\n",
200
+ "Often comes in bosc and bartlett varieties.\n",
201
+ "Output:\u001b[0m\n",
202
+ "\n",
203
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
204
+ "apple.\n"
205
+ ]
206
+ }
207
+ ],
208
+ "source": [
209
+ "chatgpt_chain.memory.clear()\n",
210
+ "output = chatgpt_chain.predict(human_input=\"Classes are fruits \\\"apple\\\", \\\"banana\\\", \\\"orange\\\", \\\"pear\\\". For example given\\nInput: This fruit rippens off of the tree.\\nOutput: banana.\\nOften comes in bosc and bartlett varieties.\")\n",
211
+ "print(output)"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 44,
217
+ "id": "cd0a23d9",
218
+ "metadata": {
219
+ "scrolled": true
220
+ },
221
+ "outputs": [
222
+ {
223
+ "name": "stdout",
224
+ "output_type": "stream",
225
+ "text": [
226
+ "\n",
227
+ "\n",
228
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
229
+ "Prompt after formatting:\n",
230
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
231
+ "\n",
232
+ "Human: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
233
+ "Input: This fruit rippens off of the tree.\n",
234
+ "Output: banana.\n",
235
+ "Often comes in bosc and bartlett varieties.\n",
236
+ "AI: apple.\n",
237
+ "Input: Often associated with monkeys\n",
238
+ "Output:\u001b[0m\n",
239
+ "\n",
240
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
241
+ "banana.\n"
242
+ ]
243
+ }
244
+ ],
245
+ "source": [
246
+ "output = chatgpt_chain.predict(human_input=\"Often associated with monkeys\")\n",
247
+ "print(output)"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 45,
253
+ "id": "90db6eb2",
254
+ "metadata": {},
255
+ "outputs": [
256
+ {
257
+ "name": "stdout",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "\n",
261
+ "\n",
262
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
263
+ "Prompt after formatting:\n",
264
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
265
+ "\n",
266
+ "Human: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
267
+ "Input: This fruit rippens off of the tree.\n",
268
+ "Output: banana.\n",
269
+ "Often comes in bosc and bartlett varieties.\n",
270
+ "AI: apple.\n",
271
+ "Human: Often associated with monkeys\n",
272
+ "AI: banana.\n",
273
+ "Input: Is the color red and often delicious.\n",
274
+ "Output:\u001b[0m\n",
275
+ "\n",
276
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
277
+ "apple.\n"
278
+ ]
279
+ }
280
+ ],
281
+ "source": [
282
+ "output = chatgpt_chain.predict(human_input=\"Is the color red and often delicious.\")\n",
283
+ "print(output)"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": 48,
289
+ "id": "c3806f89",
290
+ "metadata": {},
291
+ "outputs": [
292
+ {
293
+ "name": "stdout",
294
+ "output_type": "stream",
295
+ "text": [
296
+ "\n",
297
+ "\n",
298
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
299
+ "Prompt after formatting:\n",
300
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
301
+ "\n",
302
+ "\n",
303
+ "Input: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
304
+ "Input: The color of a school bus.\n",
305
+ "Output: yellow.\n",
306
+ "Is the color of the sky\n",
307
+ "Output:\u001b[0m\n",
308
+ "\n",
309
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
310
+ "blue.\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "chatgpt_chain.memory.clear()\n",
316
+ "output = chatgpt_chain.predict(human_input=\"Classes are colors \\\"red\\\", \\\"green\\\", \\\"blue\\\", \\\"yellow\\\". For example given\\nInput: The color of a school bus.\\nOutput: yellow.\\nIs the color of the sky\")\n",
317
+ "print(output)"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 49,
323
+ "id": "f508f597",
324
+ "metadata": {},
325
+ "outputs": [
326
+ {
327
+ "name": "stdout",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "\n",
331
+ "\n",
332
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
333
+ "Prompt after formatting:\n",
334
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
335
+ "\n",
336
+ "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
337
+ "Input: The color of a school bus.\n",
338
+ "Output: yellow.\n",
339
+ "Is the color of the sky\n",
340
+ "AI: blue.\n",
341
+ "Input: Color of a banana.\n",
342
+ "Output:\u001b[0m\n",
343
+ "\n",
344
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
345
+ "yellow.\n"
346
+ ]
347
+ }
348
+ ],
349
+ "source": [
350
+ "output = chatgpt_chain.predict(human_input=\"Color of a banana.\")\n",
351
+ "print(output)"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 50,
357
+ "id": "cbd607f4",
358
+ "metadata": {},
359
+ "outputs": [
360
+ {
361
+ "name": "stdout",
362
+ "output_type": "stream",
363
+ "text": [
364
+ "\n",
365
+ "\n",
366
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
367
+ "Prompt after formatting:\n",
368
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
369
+ "\n",
370
+ "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
371
+ "Input: The color of a school bus.\n",
372
+ "Output: yellow.\n",
373
+ "Is the color of the sky\n",
374
+ "AI: blue.\n",
375
+ "Human: Color of a banana.\n",
376
+ "AI: yellow.\n",
377
+ "Input: When someone is sick they are this color.\n",
378
+ "Output:\u001b[0m\n",
379
+ "\n",
380
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
381
+ "green.\n"
382
+ ]
383
+ }
384
+ ],
385
+ "source": [
386
+ "output = chatgpt_chain.predict(human_input=\"When someone is sick they are this color.\")\n",
387
+ "print(output)"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 51,
393
+ "id": "d33e0e28",
394
+ "metadata": {},
395
+ "outputs": [
396
+ {
397
+ "name": "stdout",
398
+ "output_type": "stream",
399
+ "text": [
400
+ "\n",
401
+ "\n",
402
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
403
+ "Prompt after formatting:\n",
404
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
405
+ "\n",
406
+ "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
407
+ "Input: The color of a school bus.\n",
408
+ "Output: yellow.\n",
409
+ "Is the color of the sky\n",
410
+ "AI: blue.\n",
411
+ "Human: Color of a banana.\n",
412
+ "AI: yellow.\n",
413
+ "Human: When someone is sick they are this color.\n",
414
+ "AI: green.\n",
415
+ "Input: Color of anger.\n",
416
+ "Output:\u001b[0m\n",
417
+ "\n",
418
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
419
+ "red.\n"
420
+ ]
421
+ }
422
+ ],
423
+ "source": [
424
+ "output = chatgpt_chain.predict(human_input=\"Color of anger.\")\n",
425
+ "print(output)"
426
+ ]
427
+ }
428
+ ],
429
+ "metadata": {
430
+ "kernelspec": {
431
+ "display_name": "bootleg",
432
+ "language": "python",
433
+ "name": "python3"
434
+ },
435
+ "language_info": {
436
+ "codemirror_mode": {
437
+ "name": "ipython",
438
+ "version": 3
439
+ },
440
+ "file_extension": ".py",
441
+ "mimetype": "text/x-python",
442
+ "name": "python",
443
+ "nbconvert_exporter": "python",
444
+ "pygments_lexer": "ipython3",
445
+ "version": "3.8.12 | packaged by conda-forge | (default, Jan 30 2022, 23:36:06) \n[Clang 11.1.0 ]"
446
+ },
447
+ "vscode": {
448
+ "interpreter": {
449
+ "hash": "7a3f97ab0465937066e9b79893b779dfc8a12d73c41f9d98a7bf05133c798250"
450
+ }
451
+ }
452
+ },
453
+ "nbformat": 4,
454
+ "nbformat_minor": 5
455
+ }
duckdb-nsql/manifest/examples/manifest_async.py ADDED
@@ -0,0 +1,27 @@
1
+ import asyncio
2
+ import time
3
+
4
+ from manifest import Manifest
5
+
6
+
7
+ def main():
8
+
9
+ manifest = Manifest(
10
+ client_name="openaichat",
11
+ )
12
+
13
+ print("Running in serial")
14
+ prompts = [f"Tell me something interesting about {i}" for i in range(50)]
15
+ st = time.time()
16
+ for pmt in prompts:
17
+ _ = manifest.run(pmt)
18
+ print(f"For loop: {time.time() - st :.2f}")
19
+
20
+ print("Running with async")
21
+ st = time.time()
22
+ _ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30))
23
+ print(f"Async loop: {time.time() - st :.2f}")
24
+
25
+
26
+ if __name__ == "__main__":
27
+ main()
duckdb-nsql/manifest/examples/manifest_azure.ipynb ADDED
@@ -0,0 +1,149 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "AZURE_KEY = \"API_KEY::URL\"\n",
20
+ "OPENAI_KEY = \"sk-XXX\""
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "metadata": {},
27
+ "source": [
28
+ "## Use Azure and OpenAI models"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "from manifest import Manifest\n",
38
+ "from manifest.connections.client_pool import ClientConnection\n",
39
+ "from pathlib import Path\n",
40
+ "\n",
41
+ "cache_path = Path(\"manifest.db\")\n",
42
+ "if cache_path.exists():\n",
43
+ " cache_path.unlink()\n",
44
+ "\n",
45
+ "\n",
46
+ "azure = ClientConnection(\n",
47
+ " client_name=\"azureopenai\",\n",
48
+ " client_connection=AZURE_KEY,\n",
49
+ " engine=\"text-davinci-003\",\n",
50
+ ")\n",
51
+ "\n",
52
+ "manifest = Manifest(client_pool=[azure], \n",
53
+ " cache_name=\"sqlite\",\n",
54
+ " cache_connection=\"manifest.db\"\n",
55
+ ")\n",
56
+ "\n",
57
+ "\n",
58
+ "openai = ClientConnection(\n",
59
+ " client_name=\"openai\",\n",
60
+ " client_connection=OPENAI_KEY,\n",
61
+ " engine=\"text-davinci-003\",\n",
62
+ ")\n",
63
+ "\n",
64
+ "manifest_openai_nocache = Manifest(client_pool=[openai])\n",
65
+ "\n",
66
+ "manifest_openai = Manifest(client_pool=[openai], \n",
67
+ " cache_name=\"sqlite\",\n",
68
+ " cache_connection=\"manifest.db\"\n",
69
+ ")"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "# Show caches are the same\n",
79
+ "text = \"What is the meaning of life?\"\n",
80
+ "res = manifest.run(text, max_tokens=100, temperature=0.7, return_response=True)\n",
81
+ "print(res.get_response())\n",
82
+ "print(res.is_cached())\n",
83
+ "res2 = manifest_openai.run(text, max_tokens=100, temperature=0.7, return_response=True)\n",
84
+ "print(res2.is_cached())\n",
85
+ "\n",
86
+ "assert res2.get_response() == res.get_response()"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "azure_chat = ClientConnection(\n",
96
+ " client_name=\"azureopenaichat\",\n",
97
+ " client_connection=AZURE_KEY,\n",
98
+ " engine=\"gpt-3.5-turbo\",\n",
99
+ ")\n",
100
+ "\n",
101
+ "manifest = Manifest(client_pool=[azure_chat])"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "print(manifest.run(\"What do you think is the best food?\", max_tokens=100))\n",
111
+ "\n",
112
+ "chat_dict = [\n",
113
+ " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
114
+ " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
115
+ " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
116
+ " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
117
+ "]\n",
118
+ "print(manifest.run(chat_dict, max_tokens=100))"
119
+ ]
120
+ }
121
+ ],
122
+ "metadata": {
123
+ "kernelspec": {
124
+ "display_name": "manifest",
125
+ "language": "python",
126
+ "name": "python3"
127
+ },
128
+ "language_info": {
129
+ "codemirror_mode": {
130
+ "name": "ipython",
131
+ "version": 3
132
+ },
133
+ "file_extension": ".py",
134
+ "mimetype": "text/x-python",
135
+ "name": "python",
136
+ "nbconvert_exporter": "python",
137
+ "pygments_lexer": "ipython3",
138
+ "version": "3.10.4"
139
+ },
140
+ "orig_nbformat": 4,
141
+ "vscode": {
142
+ "interpreter": {
143
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
144
+ }
145
+ }
146
+ },
147
+ "nbformat": 4,
148
+ "nbformat_minor": 2
149
+ }
duckdb-nsql/manifest/examples/manifest_chatgpt.ipynb ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENAI_KEY = \"sk-XXX\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use ChatOpenAI\n",
28
+ "\n",
29
+ "Set you `OPENAI_API_KEY` environment variable."
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "from manifest import Manifest\n",
39
+ "from manifest.connections.client_pool import ClientConnection\n",
40
+ "\n",
41
+ "openai_chat = ClientConnection(\n",
42
+ " client_name=\"openaichat\",\n",
43
+ " client_connection=OPENAI_KEY,\n",
44
+ " engine=\"gpt-3.5-turbo\"\n",
45
+ ")\n",
46
+ "\n",
47
+ "manifest = Manifest(client_pool=[openai_chat])"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Simple question\n",
57
+ "chat_dict = [\n",
58
+ " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
59
+ " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
60
+ " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
61
+ " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
62
+ "]\n",
63
+ "print(manifest.run(chat_dict, max_tokens=100))"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": []
72
+ }
73
+ ],
74
+ "metadata": {
75
+ "kernelspec": {
76
+ "display_name": "manifest",
77
+ "language": "python",
78
+ "name": "python3"
79
+ },
80
+ "language_info": {
81
+ "codemirror_mode": {
82
+ "name": "ipython",
83
+ "version": 3
84
+ },
85
+ "file_extension": ".py",
86
+ "mimetype": "text/x-python",
87
+ "name": "python",
88
+ "nbconvert_exporter": "python",
89
+ "pygments_lexer": "ipython3",
90
+ "version": "3.10.4"
91
+ },
92
+ "orig_nbformat": 4,
93
+ "vscode": {
94
+ "interpreter": {
95
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
96
+ }
97
+ }
98
+ },
99
+ "nbformat": 4,
100
+ "nbformat_minor": 2
101
+ }
duckdb-nsql/manifest/examples/manifest_connection_pool.ipynb ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENAI_KEY1 = \"sk-XXX\"\n",
20
+ "OPENAI_KEY2 = \"sk-XX\""
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "metadata": {},
27
+ "source": [
28
+ "## Use OpenAI\n",
29
+ "\n",
30
+ "Set you `OPENAI_API_KEY` environment variable."
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from manifest import Manifest\n",
40
+ "from manifest.connections.client_pool import ClientConnection\n",
41
+ "\n",
42
+ "openai_ada = ClientConnection(\n",
43
+ " client_name=\"openai\",\n",
44
+ " client_connection=OPENAI_KEY1,\n",
45
+ " engine=\"text-ada-001\"\n",
46
+ ")\n",
47
+ "\n",
48
+ "openai_curie = ClientConnection(\n",
49
+ " client_name=\"openai\",\n",
50
+ " client_connection=OPENAI_KEY2,\n",
51
+ " engine=\"text-curie-001\"\n",
52
+ ")\n",
53
+ "\n",
54
+ "manifest = Manifest(client_pool=[openai_ada, openai_curie], client_pool_schedule=\"round_robin\")"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 3,
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stdout",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "0\n",
67
+ "I am a model.\n",
68
+ "1\n",
69
+ "I am a MacBook Pro with a retina\n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "res = manifest.run(\"What model are you?\", temperature=0.0)\n",
75
+ "print(manifest.client_pool.current_client_id)\n",
76
+ "print(res)\n",
77
+ "res = manifest.run(\"What model are you?\", temperature=0.0)\n",
78
+ "print(manifest.client_pool.current_client_id)\n",
79
+ "print(res)"
80
+ ]
81
+ },
82
+ {
83
+ "attachments": {},
84
+ "cell_type": "markdown",
85
+ "metadata": {},
86
+ "source": [
87
+ "## With Async"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 4,
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "import nest_asyncio\n",
97
+ "# This is required for asyncio.run(...) to work in Jupyter notebooks.\n",
98
+ "nest_asyncio.apply()"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 5,
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "from manifest import Manifest\n",
108
+ "from manifest.connections.client_pool import ClientConnection\n",
109
+ "\n",
110
+ "openai_ada = ClientConnection(\n",
111
+ " client_name=\"openai\",\n",
112
+ " client_connection=OPENAI_KEY1,\n",
113
+ " engine=\"text-ada-001\"\n",
114
+ ")\n",
115
+ "\n",
116
+ "openai_babbage = ClientConnection(\n",
117
+ " client_name=\"openai\",\n",
118
+ " client_connection=OPENAI_KEY2,\n",
119
+ " engine=\"text-babbage-001\"\n",
120
+ ")\n",
121
+ "\n",
122
+ "openai_curie = ClientConnection(\n",
123
+ " client_name=\"openai\",\n",
124
+ " client_connection=OPENAI_KEY2,\n",
125
+ " engine=\"text-curie-001\"\n",
126
+ ")\n",
127
+ "\n",
128
+ "manifest = Manifest(client_pool=[openai_ada, openai_babbage, openai_curie], client_pool_schedule=\"round_robin\")\n",
129
+ "manifest_single_client = Manifest(client_pool=[openai_babbage])"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 6,
135
+ "metadata": {},
136
+ "outputs": [
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "For loop: 128.68\n",
142
+ "Running with async single client\n",
143
+ "Running 1 tasks across all clients.\n",
144
+ "Async loop: 4.02\n",
145
+ "Running with async two clients but not chunking\n",
146
+ "Running 1 tasks across all clients.\n",
147
+ "Async loop: 3.92\n",
148
+ "Running with async two clients and chunk size\n",
149
+ "Running 20 tasks across all clients.\n",
150
+ "Async loop: 1.44\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "import time\n",
156
+ "import asyncio\n",
157
+ "\n",
158
+ "prompts = [f\"Tell me something interesting about {i}\" for i in range(400)]\n",
159
+ "st = time.time()\n",
160
+ "for pmt in prompts:\n",
161
+ " _ = manifest_single_client.run(pmt, max_tokens=30)\n",
162
+ "print(f\"For loop: {time.time() - st :.2f}\")\n",
163
+ "\n",
164
+ "print(\"Running with async single client\")\n",
165
+ "st = time.time()\n",
166
+ "_ = asyncio.run(manifest_single_client.arun_batch(prompts, max_tokens=30, chunk_size=-1))\n",
167
+ "print(f\"Async loop: {time.time() - st :.2f}\")\n",
168
+ "\n",
169
+ "print(\"Running with async two clients but not chunking\")\n",
170
+ "st = time.time()\n",
171
+ "_ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=-1))\n",
172
+ "print(f\"Async loop: {time.time() - st :.2f}\")\n",
173
+ "\n",
174
+ "print(\"Running with async two clients and chunk size\")\n",
175
+ "st = time.time()\n",
176
+ "_ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=20))\n",
177
+ "print(f\"Async loop: {time.time() - st :.2f}\")"
178
+ ]
179
+ }
180
+ ],
181
+ "metadata": {
182
+ "kernelspec": {
183
+ "display_name": "manifest",
184
+ "language": "python",
185
+ "name": "python3"
186
+ },
187
+ "language_info": {
188
+ "codemirror_mode": {
189
+ "name": "ipython",
190
+ "version": 3
191
+ },
192
+ "file_extension": ".py",
193
+ "mimetype": "text/x-python",
194
+ "name": "python",
195
+ "nbconvert_exporter": "python",
196
+ "pygments_lexer": "ipython3",
197
+ "version": "3.10.4"
198
+ },
199
+ "orig_nbformat": 4,
200
+ "vscode": {
201
+ "interpreter": {
202
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
203
+ }
204
+ }
205
+ },
206
+ "nbformat": 4,
207
+ "nbformat_minor": 2
208
+ }
duckdb-nsql/manifest/examples/manifest_diffusers.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
duckdb-nsql/manifest/examples/manifest_embedding.ipynb ADDED
@@ -0,0 +1,156 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "attachments": {},
15
+ "cell_type": "markdown",
16
+ "metadata": {},
17
+ "source": [
18
+ "## Use OpenAI\n",
19
+ "\n",
20
+ "Set you `OPENAI_API_KEY` environment variable."
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "{'model_name': 'openaiembedding', 'engine': 'text-embedding-ada-002'}\n"
33
+ ]
34
+ }
35
+ ],
36
+ "source": [
37
+ "from manifest import Manifest\n",
38
+ "\n",
39
+ "manifest = Manifest(client_name=\"openaiembedding\")\n",
40
+ "print(manifest.client_pool.get_next_client().get_model_params())"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 3,
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "(1536,)\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "emb = manifest.run(\"Is this an embedding?\")\n",
58
+ "print(emb.shape)"
59
+ ]
60
+ },
61
+ {
62
+ "attachments": {},
63
+ "cell_type": "markdown",
64
+ "metadata": {},
65
+ "source": [
66
+ "### Using Locally Hosted Huggingface LM\n",
67
+ "\n",
68
+ "Run\n",
69
+ "```\n",
70
+ "python3 manifest/api/app.py --model_type huggingface --model_name_or_path EleutherAI/gpt-neo-125M --device 0\n",
71
+ "```\n",
72
+ "or\n",
73
+ "```\n",
74
+ "python3 manifest/api/app.py --model_type sentence_transformers --model_name_or_path all-mpnet-base-v2 --device 0\n",
75
+ "```\n",
76
+ "\n",
77
+ "in a separate `screen` or `tmux`. Make sure to note the port. You can change this with `export FLASK_PORT=<port>`."
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 1,
83
+ "metadata": {},
84
+ "outputs": [
85
+ {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "{'model_name': 'all-mpnet-base-v2', 'model_path': 'all-mpnet-base-v2', 'client_name': 'huggingfaceembedding'}\n"
90
+ ]
91
+ }
92
+ ],
93
+ "source": [
94
+ "from manifest import Manifest\n",
95
+ "\n",
96
+ "# Local hosted GPT Neo 125M\n",
97
+ "manifest = Manifest(\n",
98
+ " client_name=\"huggingfaceembedding\",\n",
99
+ " client_connection=\"http://127.0.0.1:6000\",\n",
100
+ " cache_name=\"sqlite\",\n",
101
+ " cache_connection=\"my_sqlite_manifest.sqlite\"\n",
102
+ ")\n",
103
+ "print(manifest.client_pool.get_next_client().get_model_params())"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 4,
109
+ "metadata": {},
110
+ "outputs": [
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "(768,)\n",
116
+ "(768,) (768,)\n"
117
+ ]
118
+ }
119
+ ],
120
+ "source": [
121
+ "emb = manifest.run(\"Is this an embedding?\")\n",
122
+ "print(emb.shape)\n",
123
+ "\n",
124
+ "emb = manifest.run([\"Is this an embedding?\", \"Bananas!!!\"])\n",
125
+ "print(emb[0].shape, emb[1].shape)"
126
+ ]
127
+ }
128
+ ],
129
+ "metadata": {
130
+ "kernelspec": {
131
+ "display_name": "manifest",
132
+ "language": "python",
133
+ "name": "python3"
134
+ },
135
+ "language_info": {
136
+ "codemirror_mode": {
137
+ "name": "ipython",
138
+ "version": 3
139
+ },
140
+ "file_extension": ".py",
141
+ "mimetype": "text/x-python",
142
+ "name": "python",
143
+ "nbconvert_exporter": "python",
144
+ "pygments_lexer": "ipython3",
145
+ "version": "3.10.4"
146
+ },
147
+ "orig_nbformat": 4,
148
+ "vscode": {
149
+ "interpreter": {
150
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
151
+ }
152
+ }
153
+ },
154
+ "nbformat": 4,
155
+ "nbformat_minor": 2
156
+ }
duckdb-nsql/manifest/examples/manifest_google.ipynb ADDED
@@ -0,0 +1,117 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "GOOGLE_KEY = \"KEY::PROJECT_ID\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use GoogleVertexAPI"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "from manifest import Manifest\n",
37
+ "from manifest.connections.client_pool import ClientConnection\n",
38
+ "\n",
39
+ "google_bison = ClientConnection(\n",
40
+ " client_name=\"google\",\n",
41
+ " client_connection=GOOGLE_KEY\n",
42
+ ")\n",
43
+ "\n",
44
+ "manifest = Manifest(client_pool=[google_bison])"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "# Simple question\n",
54
+ "print(manifest.run(\"What is your name\", max_tokens=40))"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "from manifest import Manifest\n",
64
+ "from manifest.connections.client_pool import ClientConnection\n",
65
+ "\n",
66
+ "google_bison = ClientConnection(\n",
67
+ " client_name=\"googlechat\",\n",
68
+ " client_connection=GOOGLE_KEY\n",
69
+ ")\n",
70
+ "\n",
71
+ "manifest = Manifest(client_pool=[google_bison])"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "chat_dict = [\n",
81
+ " # {\"author\": \"bot\", \"content\": \"You are a helpful assistant.\"},\n",
82
+ " {\"author\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
83
+ " {\"author\": \"bot\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
84
+ " {\"author\": \"user\", \"content\": \"Where was it played?\"}\n",
85
+ "]\n",
86
+ "print(manifest.run(chat_dict, max_tokens=8))"
87
+ ]
88
+ }
89
+ ],
90
+ "metadata": {
91
+ "kernelspec": {
92
+ "display_name": "manifest",
93
+ "language": "python",
94
+ "name": "python3"
95
+ },
96
+ "language_info": {
97
+ "codemirror_mode": {
98
+ "name": "ipython",
99
+ "version": 3
100
+ },
101
+ "file_extension": ".py",
102
+ "mimetype": "text/x-python",
103
+ "name": "python",
104
+ "nbconvert_exporter": "python",
105
+ "pygments_lexer": "ipython3",
106
+ "version": "3.10.4"
107
+ },
108
+ "orig_nbformat": 4,
109
+ "vscode": {
110
+ "interpreter": {
111
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
112
+ }
113
+ }
114
+ },
115
+ "nbformat": 4,
116
+ "nbformat_minor": 2
117
+ }
duckdb-nsql/manifest/examples/manifest_openrouter.ipynb ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 4,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENROUTER_API_KEY = \"sk-...\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use ChatOpenAI\n",
28
+ "\n",
29
+ "Set you `OPENROUTER_API_KEY` environment variable."
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 5,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "from manifest import Manifest\n",
39
+ "from manifest.connections.client_pool import ClientConnection\n",
40
+ "\n",
41
+ "openai_chat = ClientConnection(\n",
42
+ " client_name=\"openrouter\",\n",
43
+ " client_connection=OPENROUTER_API_KEY,\n",
44
+ " engine=\"meta-llama/codellama-70b-instruct\"\n",
45
+ ")\n",
46
+ "\n",
47
+ "manifest = Manifest(client_pool=[openai_chat])"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 6,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "name": "stdout",
57
+ "output_type": "stream",
58
+ "text": [
59
+ "2020 World Series was played at the Globe Life Field in Arlington, Texas.\n"
60
+ ]
61
+ }
62
+ ],
63
+ "source": [
64
+ "# Simple question\n",
65
+ "chat_dict = [\n",
66
+ " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
67
+ " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
68
+ " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
69
+ " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
70
+ "]\n",
71
+ "print(manifest.run(chat_dict, max_tokens=100))"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": []
80
+ }
81
+ ],
82
+ "metadata": {
83
+ "kernelspec": {
84
+ "display_name": "Python 3 (ipykernel)",
85
+ "language": "python",
86
+ "name": "python3"
87
+ },
88
+ "language_info": {
89
+ "codemirror_mode": {
90
+ "name": "ipython",
91
+ "version": 3
92
+ },
93
+ "file_extension": ".py",
94
+ "mimetype": "text/x-python",
95
+ "name": "python",
96
+ "nbconvert_exporter": "python",
97
+ "pygments_lexer": "ipython3",
98
+ "version": "3.11.5"
99
+ },
100
+ "vscode": {
101
+ "interpreter": {
102
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
103
+ }
104
+ }
105
+ },
106
+ "nbformat": 4,
107
+ "nbformat_minor": 4
108
+ }
duckdb-nsql/manifest/examples/manifest_streaming.ipynb ADDED
@@ -0,0 +1,105 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENAI_KEY = \"sk-XXX\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use ChatOpenAI\n",
28
+ "\n",
29
+ "Set you `OPENAI_API_KEY` environment variable."
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "from manifest import Manifest\n",
39
+ "from manifest.connections.client_pool import ClientConnection\n",
40
+ "\n",
41
+ "openai_chat = ClientConnection(\n",
42
+ " client_name=\"openaichat\",\n",
43
+ " client_connection=OPENAI_KEY,\n",
44
+ " engine=\"gpt-3.5-turbo\"\n",
45
+ ")\n",
46
+ "\n",
47
+ "manifest = Manifest(client_pool=[openai_chat])"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "manifest_iterator = manifest.run(\"Tell me a story about a fat cat.\\n\\nOnce upon a time\", max_tokens=200, stream=True)"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "import sys\n",
66
+ "\n",
67
+ "cur_line_length = 0\n",
68
+ "# Iterate over stream\n",
69
+ "for res in manifest_iterator:\n",
70
+ " sys.stdout.write(res)\n",
71
+ " cur_line_length += len(res)\n",
72
+ " if cur_line_length > 80:\n",
73
+ " sys.stdout.write(\"\\n\")\n",
74
+ " cur_line_length = 0"
75
+ ]
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "manifest",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 3
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython3",
94
+ "version": "3.10.4"
95
+ },
96
+ "orig_nbformat": 4,
97
+ "vscode": {
98
+ "interpreter": {
99
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
100
+ }
101
+ }
102
+ },
103
+ "nbformat": 4,
104
+ "nbformat_minor": 2
105
+ }
duckdb-nsql/manifest/examples/manifest_together.ipynb ADDED
@@ -0,0 +1,106 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "env: TOMA_URL=<TOMA_URL>\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "%load_ext autoreload\n",
18
+ "%autoreload 2\n",
19
+ "\n",
20
+ "%env TOMA_URL=<TOMA_URL>"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "from manifest import Manifest\n",
30
+ "\n",
31
+ "# The responses are not fast\n",
32
+ "manifest = Manifest(\n",
33
+ " client_name=\"toma\",\n",
34
+ ")\n",
35
+ "\n",
36
+ "print(manifest.run(\"What is the color of an apple?\"))"
37
+ ]
38
+ },
39
+ {
40
+ "attachments": {},
41
+ "cell_type": "markdown",
42
+ "metadata": {},
43
+ "source": [
44
+ "With a cache"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "from manifest import Manifest\n",
54
+ "\n",
55
+ "# The responses are not fast\n",
56
+ "manifest = Manifest(\n",
57
+ " client_name=\"toma\",\n",
58
+ " cache_name=\"sqlite\",\n",
59
+ " cache_connection=\"my_manifest_cache.sqlite\",\n",
60
+ ")\n",
61
+ "\n",
62
+ "res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
63
+ "print(res.get_response())\n",
64
+ "print(\"Is Cached?\", res.is_cached())\n",
65
+ "\n",
66
+ "res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
67
+ "print(res.get_response())\n",
68
+ "print(\"Is Cached?\", res.is_cached())"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": []
77
+ }
78
+ ],
79
+ "metadata": {
80
+ "kernelspec": {
81
+ "display_name": "manifest",
82
+ "language": "python",
83
+ "name": "python3"
84
+ },
85
+ "language_info": {
86
+ "codemirror_mode": {
87
+ "name": "ipython",
88
+ "version": 3
89
+ },
90
+ "file_extension": ".py",
91
+ "mimetype": "text/x-python",
92
+ "name": "python",
93
+ "nbconvert_exporter": "python",
94
+ "pygments_lexer": "ipython3",
95
+ "version": "3.10.4"
96
+ },
97
+ "orig_nbformat": 4,
98
+ "vscode": {
99
+ "interpreter": {
100
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
101
+ }
102
+ }
103
+ },
104
+ "nbformat": 4,
105
+ "nbformat_minor": 2
106
+ }
duckdb-nsql/manifest/manifest/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Manifest init."""
2
+ from manifest.manifest import Manifest
3
+ from manifest.request import Request
4
+ from manifest.response import Response
5
+
6
+ __all__ = ["Manifest", "Response", "Request"]
duckdb-nsql/manifest/manifest/api/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Api init."""
duckdb-nsql/manifest/manifest/api/app.py ADDED
@@ -0,0 +1,301 @@
1
+ """Flask app."""
2
+ import argparse
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import socket
8
+ from typing import Dict
9
+
10
+ import pkg_resources
11
+ from flask import Flask, Response, request
12
+
13
+ from manifest.api.models.diffuser import DiffuserModel
14
+ from manifest.api.models.huggingface import (
15
+ MODEL_GENTYPE_REGISTRY,
16
+ CrossModalEncoderModel,
17
+ TextGenerationModel,
18
+ )
19
+ from manifest.api.models.sentence_transformer import SentenceTransformerModel
20
+ from manifest.api.response import ModelResponse
21
+
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
+
24
+ logger = logging.getLogger(__name__)
25
+ app = Flask(__name__) # define app using Flask
26
+ # Will be global
27
+ model = None
28
+ model_type = None
29
+ PORT = int(os.environ.get("FLASK_PORT", 5000))
30
+ MODEL_CONSTRUCTORS = {
31
+ "huggingface": TextGenerationModel,
32
+ "sentence_transformers": SentenceTransformerModel,
33
+ "huggingface_crossmodal": CrossModalEncoderModel,
34
+ "diffuser": DiffuserModel,
35
+ }
36
+
37
+
38
+ def parse_args() -> argparse.Namespace:
39
+ """Generate args."""
40
+ parser = argparse.ArgumentParser(description="Model args")
41
+ parser.add_argument(
42
+ "--model_type",
43
+ default=None,
44
+ type=str,
45
+ required=True,
46
+ help="Model type used for finding constructor.",
47
+ choices=MODEL_CONSTRUCTORS.keys(),
48
+ )
49
+ parser.add_argument(
50
+ "--model_generation_type",
51
+ default=None,
52
+ type=str,
53
+ help="Model generation type.",
54
+ choices=MODEL_GENTYPE_REGISTRY.keys(),
55
+ )
56
+ parser.add_argument(
57
+ "--model_name_or_path",
58
+ default=None,
59
+ type=str,
60
+ help="Name of model or path to model. Used in initialize of model class.",
61
+ )
62
+ parser.add_argument(
63
+ "--cache_dir", default=None, type=str, help="Cache directory for models."
64
+ )
65
+ parser.add_argument(
66
+ "--device", type=int, default=0, help="Model device. -1 for CPU."
67
+ )
68
+ parser.add_argument(
69
+ "--fp16", action="store_true", help="Force use fp16 for model params."
70
+ )
71
+ parser.add_argument(
72
+ "--percent_max_gpu_mem_reduction",
73
+ type=float,
74
+ default=0.85,
75
+ help="Used with accelerate multigpu. Scales down max memory.",
76
+ )
77
+ parser.add_argument(
78
+ "--use_bitsandbytes",
79
+ action="store_true",
80
+ help=("Use bits and bytes. " "This will override --device parameter."),
81
+ )
82
+ parser.add_argument(
83
+ "--use_accelerate_multigpu",
84
+ action="store_true",
85
+ help=(
86
+ "Use accelerate for multi gpu inference. "
87
+ "This will override --device parameter."
88
+ ),
89
+ )
90
+ parser.add_argument(
91
+ "--use_hf_parallelize",
92
+ action="store_true",
93
+ help=(
94
+ "Use HF parallelize for multi gpu inference. "
95
+ "This will override --device parameter."
96
+ ),
97
+ )
98
+ parser.add_argument(
99
+ "--use_deepspeed",
100
+ action="store_true",
101
+ help=("Use deepspeed. This will override --device parameter."),
102
+ )
103
+ args = parser.parse_args()
104
+ return args
105
+
+
+ def is_port_in_use(port: int) -> bool:
+     """Check if port is in use."""
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         return s.connect_ex(("localhost", port)) == 0
+
+
+ def main() -> None:
+     """Run main."""
+     kwargs = parse_args()
+     if is_port_in_use(PORT):
+         raise ValueError(f"Port {PORT} is already in use.")
+     global model_type
+     model_type = kwargs.model_type
+     model_gen_type = kwargs.model_generation_type
+     model_name_or_path = kwargs.model_name_or_path
+     if not model_name_or_path:
+         raise ValueError("Must provide model_name_or_path.")
+     if kwargs.use_accelerate_multigpu:
+         logger.info("Using accelerate. Overriding --device argument.")
+     if (
+         kwargs.percent_max_gpu_mem_reduction <= 0
+         or kwargs.percent_max_gpu_mem_reduction > 1
+     ):
+         raise ValueError("percent_max_gpu_mem_reduction must be in (0, 1].")
+     # The multi-GPU / quantization strategies are mutually exclusive.
+     if (
+         sum(
+             [
+                 kwargs.use_accelerate_multigpu,
+                 kwargs.use_hf_parallelize,
+                 kwargs.use_bitsandbytes,
+                 kwargs.use_deepspeed,
+             ]
+         )
+         > 1
+     ):
+         raise ValueError(
+             "Only one of use_accelerate_multigpu, use_hf_parallelize, "
+             "use_bitsandbytes, and use_deepspeed can be set."
+         )
+     # Global model
+     global model
+     model = MODEL_CONSTRUCTORS[model_type](
+         model_name_or_path,
+         model_type=model_gen_type,
+         cache_dir=kwargs.cache_dir,
+         device=kwargs.device,
+         use_accelerate=kwargs.use_accelerate_multigpu,
+         use_parallelize=kwargs.use_hf_parallelize,
+         use_bitsandbytes=kwargs.use_bitsandbytes,
+         use_deepspeed=kwargs.use_deepspeed,
+         perc_max_gpu_mem_red=kwargs.percent_max_gpu_mem_reduction,
+         use_fp16=kwargs.fp16,
+     )
+     app.run(host="0.0.0.0", port=PORT)
+
+
+ @app.route("/completions", methods=["POST"])
+ def completions() -> Response:
+     """Get completions for generation."""
+     prompt = request.json["prompt"]
+     # request.json is cached, so deleting the prompt leaves only generation kwargs
+     del request.json["prompt"]
+     generation_args = request.json
+
+     if not isinstance(prompt, (str, list)):
+         raise ValueError("Prompt must be a str or list of str")
+     try:
+         result_gens = []
+         for generations in model.generate(prompt, **generation_args):
+             result_gens.append(generations)
+         if model_type == "diffuser":
+             # Assign None logprob as it's not supported in diffusers
+             results = [
+                 {"array": r[0], "logprob": None, "tokens": None, "token_logprobs": None}
+                 for r in result_gens
+             ]
+             res_type = "image_generation"
+         else:
+             results = [
+                 {"text": r[0], "logprob": r[1], "tokens": r[2], "token_logprobs": r[3]}
+                 for r in result_gens
+             ]
+             res_type = "text_completion"
+         # Transform the result into the OpenAI response format
+         return Response(
+             json.dumps(ModelResponse(results, response_type=res_type).__dict__()),
+             status=200,
+         )
+     except Exception as e:
+         logger.error(e)
+         return Response(
+             json.dumps({"message": str(e)}),
+             status=400,
+         )
+
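+ # A minimal client sketch for the /completions route above, assuming the
+ # server is up on the default port; extra JSON keys (here max_tokens) are
+ # passed straight through to model.generate as generation kwargs:
+ #
+ #   import requests
+ #   resp = requests.post(
+ #       "http://localhost:5000/completions",
+ #       json={"prompt": "SELECT * FROM taxi LIMIT", "max_tokens": 16},
+ #   )
+ #   print(resp.json())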
+
+ @app.route("/embed", methods=["POST"])
+ def embed() -> Response:
+     """Get embeddings for the prompt."""
+     if "modality" in request.json:
+         modality = request.json["modality"]
+     else:
+         modality = "text"
+     if modality == "text":
+         prompts = request.json["prompt"]
+     elif modality == "image":
+         import base64
+
+         from PIL import Image
+
+         prompts = [
+             Image.open(io.BytesIO(base64.b64decode(data)))
+             for data in request.json["prompt"]
+         ]
+     else:
+         raise ValueError("modality must be text or image")
+
+     try:
+         results = []
+         embeddings = model.embed(prompts)
+         for embedding in embeddings:
+             results.append(
+                 {
+                     "array": embedding,
+                     "logprob": None,
+                     "tokens": None,
+                     "token_logprobs": None,
+                 }
+             )
+
+         return Response(
+             json.dumps(
+                 ModelResponse(results, response_type="embedding_generation").__dict__()
+             ),
+             status=200,
+         )
+     except Exception as e:
+         logger.error(e)
+         return Response(
+             json.dumps({"message": str(e)}),
+             status=400,
+         )
+
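+ # A minimal client sketch for /embed (text modality is the default; for
+ # images, send base64-encoded bytes and "modality": "image"):
+ #
+ #   import requests
+ #   resp = requests.post(
+ #       "http://localhost:5000/embed",
+ #       json={"prompt": ["first sentence", "second sentence"]},
+ #   )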
+
+ @app.route("/score_sequence", methods=["POST"])
+ def score_sequence() -> Response:
+     """Get logprob of prompt."""
+     prompt = request.json["prompt"]
+     del request.json["prompt"]
+     generation_args = request.json
+
+     if not isinstance(prompt, (str, list)):
+         raise ValueError("Prompt must be a str or list of str")
+
+     try:
+         score_list = model.score_sequence(prompt, **generation_args)
+         results = [
+             {
+                 "text": prompt if isinstance(prompt, str) else prompt[i],
+                 "logprob": r[0],
+                 "tokens": r[1],
+                 "token_logprobs": r[2],
+             }
+             for i, r in enumerate(score_list)
+         ]
+         # Transform the result into the OpenAI response format
+         return Response(
+             json.dumps(
+                 ModelResponse(results, response_type="prompt_logit_score").__dict__()
+             ),
+             status=200,
+         )
+     except Exception as e:
+         logger.error(e)
+         return Response(
+             json.dumps({"message": str(e)}),
+             status=400,
+         )
+
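+ # A minimal client sketch for /score_sequence: unlike /completions, this
+ # returns the log probability the model assigns to each prompt itself:
+ #
+ #   import requests
+ #   resp = requests.post(
+ #       "http://localhost:5000/score_sequence",
+ #       json={"prompt": ["The cat sat on the mat."]},
+ #   )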
+
+ @app.route("/params", methods=["POST"])
+ def params() -> Dict:
+     """Get model params."""
+     return model.get_init_params()
+
+
+ @app.route("/")
+ def index() -> str:
+     """Serve the index page."""
+     # NOTE: loads index.html from the installed metaseq package, so this
+     # route only works when metaseq is available.
+     fn = pkg_resources.resource_filename("metaseq", "service/index.html")
+     with open(fn) as f:
+         return f.read()
+
+
+ if __name__ == "__main__":
+     main()