tdoehmen committed on
Commit b247dc4
1 Parent(s): 47bda6e

added duckdb-nsql benchmark setup

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitignore +4 -0
  2. duckdb-nsql/LICENSE +201 -0
  3. duckdb-nsql/README.md +63 -0
  4. duckdb-nsql/eval/README.md +90 -0
  5. duckdb-nsql/eval/constants.py +37 -0
  6. duckdb-nsql/eval/data/databases/flightinfo/flightinfo.duckdb +0 -0
  7. duckdb-nsql/eval/data/databases/hn/hn.duckdb +0 -0
  8. duckdb-nsql/eval/data/databases/json/json.duckdb +0 -0
  9. duckdb-nsql/eval/data/databases/laptop/laptop.duckdb +0 -0
  10. duckdb-nsql/eval/data/databases/laptop_array/laptop_array.duckdb +0 -0
  11. duckdb-nsql/eval/data/databases/laptop_json/laptop_json.duckdb +0 -0
  12. duckdb-nsql/eval/data/databases/laptop_struct/laptop_struct.duckdb +0 -0
  13. duckdb-nsql/eval/data/databases/none/none.duckdb +0 -0
  14. duckdb-nsql/eval/data/databases/nyc/nyc.duckdb +0 -0
  15. duckdb-nsql/eval/data/databases/product/product.duckdb +0 -0
  16. duckdb-nsql/eval/data/databases/transactions/transactions.duckdb +0 -0
  17. duckdb-nsql/eval/data/databases/who/who.duckdb +0 -0
  18. duckdb-nsql/eval/data/dev.json +602 -0
  19. duckdb-nsql/eval/data/tables.json +3465 -0
  20. duckdb-nsql/eval/data_utils.py +79 -0
  21. duckdb-nsql/eval/doc_retriever.py +92 -0
  22. duckdb-nsql/eval/evaluate.py +392 -0
  23. duckdb-nsql/eval/get_manifest.py +29 -0
  24. duckdb-nsql/eval/loaders.py +78 -0
  25. duckdb-nsql/eval/metric_utils.py +40 -0
  26. duckdb-nsql/eval/metrics/test_suite_sql_eval +1 -0
  27. duckdb-nsql/eval/predict.py +345 -0
  28. duckdb-nsql/eval/prompt_formatters.py +1056 -0
  29. duckdb-nsql/eval/schema.py +115 -0
  30. duckdb-nsql/eval/text_to_sql.py +260 -0
  31. duckdb-nsql/manifest/.flake8 +11 -0
  32. duckdb-nsql/manifest/.pre-commit-config.yaml +23 -0
  33. duckdb-nsql/manifest/CHANGELOG.rst +93 -0
  34. duckdb-nsql/manifest/LICENSE +201 -0
  35. duckdb-nsql/manifest/Makefile +27 -0
  36. duckdb-nsql/manifest/README.md +304 -0
  37. duckdb-nsql/manifest/examples/langchain_chatgpt.ipynb +455 -0
  38. duckdb-nsql/manifest/examples/manifest_async.py +27 -0
  39. duckdb-nsql/manifest/examples/manifest_azure.ipynb +149 -0
  40. duckdb-nsql/manifest/examples/manifest_chatgpt.ipynb +101 -0
  41. duckdb-nsql/manifest/examples/manifest_connection_pool.ipynb +208 -0
  42. duckdb-nsql/manifest/examples/manifest_diffusers.ipynb +0 -0
  43. duckdb-nsql/manifest/examples/manifest_embedding.ipynb +156 -0
  44. duckdb-nsql/manifest/examples/manifest_google.ipynb +117 -0
  45. duckdb-nsql/manifest/examples/manifest_openrouter.ipynb +108 -0
  46. duckdb-nsql/manifest/examples/manifest_streaming.ipynb +105 -0
  47. duckdb-nsql/manifest/examples/manifest_together.ipynb +106 -0
  48. duckdb-nsql/manifest/manifest/__init__.py +6 -0
  49. duckdb-nsql/manifest/manifest/api/__init__.py +1 -0
  50. duckdb-nsql/manifest/manifest/api/app.py +301 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
.DS_Store
__pycache__/
*.egg-info
.ipynb_checkpoints/
duckdb-nsql/LICENSE ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
duckdb-nsql/README.md ADDED
@@ -0,0 +1,63 @@
# DuckDB-NSQL
Numbers Station Text to SQL model for DuckDB.

NSQL is a family of autoregressive open-source foundation models (FMs) designed specifically for SQL generation tasks. In this repository, we are thrilled to introduce DuckDB-NSQL, an FM tailored for local DuckDB SQL analytics tasks. All model weights can be found on HuggingFace.

| Model Name                             | Size | Link                                                                 |
| -------------------------------------- | ---- | -------------------------------------------------------------------- |
| motherduckdb/DuckDB-NSQL-7B-v0.1       | 7B   | [link](https://huggingface.co/motherduckdb/DuckDB-NSQL-7B-v0.1)      |
| motherduckdb/DuckDB-NSQL-7B-v0.1-GGUF  | 7B   | [link](https://huggingface.co/motherduckdb/DuckDB-NSQL-7B-v0.1-GGUF) |

## Setup
To install all the necessary dependencies, please run
```
pip install -r requirements.txt
```

## Usage
Please refer to the examples in the `examples/` folder to learn how to connect to a local DuckDB database and query your data directly; a simple notebook is provided there for reference.

To host the model with llama.cpp, please execute the following:

```python
# Import necessary modules
from llama_cpp import Llama
from wurlitzer import pipes

# Set up the client with the model path and context size
with pipes() as (out, err):
    client = Llama(
        model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
        n_ctx=2048,
    )
```

To load a DuckDB database and query against it, please execute the following:

```python
# Import necessary modules
import duckdb
from utils import generate_sql

# Connect to the DuckDB database
con = duckdb.connect("nyc.duckdb")

# Sample question for SQL generation
question = "alter taxi table and add struct column with name test and keys a:int, b:double"

# Generate SQL, check validity, and print it
sql = generate_sql(question, con, client)
print(sql)
```

## Training Data

The training data for this model consists of two parts: 1) 200k synthetically generated DuckDB SQL queries, based on the DuckDB v0.9.2 documentation, and 2) labeled text-to-SQL pairs from [NSText2SQL](https://huggingface.co/datasets/NumbersStation/NSText2SQL) transpiled to DuckDB SQL using [sqlglot](https://github.com/tobymao/sqlglot).

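As a concrete illustration of the second data source, sqlglot can transpile a query from a source dialect into DuckDB SQL in a single call. The input query below is a made-up example, but `sqlglot.transpile` with `read`/`write` dialect names is the library's standard API:

```python
import sqlglot

# Convert a SQLite-flavored query (as found in NSText2SQL sources)
# into the DuckDB dialect; transpile returns a list of statements.
duckdb_sql = sqlglot.transpile(
    "SELECT name FROM users ORDER BY RANDOM() LIMIT 5",
    read="sqlite",
    write="duckdb",
)[0]
print(duckdb_sql)
```
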
## Evaluate the benchmark

Please refer to the `eval/` folder for details on evaluating the model against our proposed DuckDB benchmark.

## Acknowledgement

We would like to express our appreciation to all authors of the evaluation scripts. Their work made this project possible.
duckdb-nsql/eval/README.md ADDED
@@ -0,0 +1,90 @@
This folder contains the suite for evaluating the DuckDB-Text2SQL model.

Please install the dependencies listed in the requirements.txt file located in the parent folder.

## Setup
To evaluate against the benchmark dataset, first clone the test-suite evaluation script into a `metrics` folder:

```
mkdir metrics
cd metrics
git clone git@github.com:ElementAI/test-suite-sql-eval.git test_suite_sql_eval
cd ..
```

To evaluate against DuckDB, add a new remote inside the test_suite_sql_eval folder and check out the latest duckdb-only branch (640a12975abf75a94e917caca149d56dbc6bcdd7):

```
git remote add till https://github.com/tdoehmen/test-suite-sql-eval.git
git fetch till
git checkout till/duckdb-only
```

Next, prepare the docs for retrieval:
```
mkdir docs
cd docs
git clone https://github.com/duckdb/duckdb-web.git
cd ..
```

#### Dataset
The benchmark dataset is located in the `data/` folder and includes all databases (`data/databases`), table schemas (`data/tables.json`), and examples (`data/dev.json`).

#### Eval
Start a manifest session with the model you want to evaluate.

```bash
python -m manifest.api.app \
    --model_type huggingface \
    --model_generation_type text-generation \
    --model_name_or_path motherduckdb/DuckDB-NSQL-7B-v0.1 \
    --fp16 \
    --device 0
```

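Before launching the full prediction run, it can help to smoke-test the session with the manifest Python client bundled in this commit. This is a minimal sketch, assuming the default connection settings shown above; the prompt string is a throwaway example:

```python
from manifest import Manifest

# Connect to the manifest session started by manifest.api.app above.
manifest = Manifest(
    client_name="huggingface",
    client_connection="http://localhost:5000",
)

# Any short prompt works as a smoke test; this one is arbitrary.
print(manifest.run("SELECT 1;", max_tokens=16))
```
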
Then, from the `DuckDB-NSQL` main folder, run:

```bash
python eval/predict.py \
    predict \
    eval/data/dev.json \
    eval/data/tables.json \
    --output-dir output/ \
    --stop-tokens ';' \
    --stop-tokens '--' \
    --stop-tokens '```' \
    --stop-tokens '###' \
    --overwrite-manifest \
    --manifest-client huggingface \
    --manifest-connection http://localhost:5000 \
    --prompt-format duckdbinst
```
This will format the prompt using the duckdbinst style.

To evaluate the predictions, first run the following in a Python shell to install and load the required DuckDB extensions:

```python
try:
    import duckdb

    con = duckdb.connect()
    con.install_extension("httpfs")
    con.load_extension("httpfs")
except Exception as e:
    print(f"Error loading duckdb extensions: {e}")
```

Then, run the evaluation script:

```bash
python eval/evaluate.py \
    evaluate \
    --gold eval/data/dev.json \
    --db eval/data/databases/ \
    --tables eval/data/tables.json \
    --output-dir output/ \
    --pred [PREDICTION_FILE]
```

All of the output information is located in the prediction file in the `[output-dir]`; there, `query` is the gold SQL and `pred` is the predicted SQL.
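For intuition about what a single benchmark example involves: each dev.json entry pairs a question with a `setup_sql` statement, a gold `query`, and a `validation_sql` that typically reads from a `ddb_benchmark_result` table. The sketch below shows one plausible execution-match loop over those fields; it is a simplification with hypothetical helper names, and the real logic (including proper database isolation) lives in eval/evaluate.py and the test_suite_sql_eval harness:

```python
import duckdb

def validation_result(db_path, setup_sql, sql, validation_sql):
    """Hypothetical helper: run an example's setup, execute one candidate
    SQL, and return what validation_sql observes afterwards."""
    con = duckdb.connect(db_path)
    try:
        con.execute(setup_sql)
        if sql.lstrip().upper().startswith(("SELECT", "WITH", "FROM")):
            # Expose the query's result set under the table name that
            # most validation_sql entries read from.
            con.execute(f"CREATE OR REPLACE TABLE ddb_benchmark_result AS {sql}")
        else:
            con.execute(sql)  # DDL, COPY, SET, PRAGMA etc. mutate state directly
        return con.execute(validation_sql).fetchall()
    finally:
        con.close()

def execution_match(db_path, example, pred_sql):
    # The prediction counts as correct when validation over the predicted
    # SQL returns the same rows as validation over the gold query.
    gold = validation_result(db_path, example["setup_sql"],
                             example["query"], example["validation_sql"])
    pred = validation_result(db_path, example["setup_sql"],
                             pred_sql, example["validation_sql"])
    return gold == pred
```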
duckdb-nsql/eval/constants.py ADDED
@@ -0,0 +1,37 @@
"""Constants."""

from prompt_formatters import (
    DuckDBFormatter,
    MotherDuckFormatter,
    DuckDBInstFormatter,
    DuckDBInstNoShorthandFormatter,
    RajkumarFormatter,
    DuckDBChat,
    DuckDBInstFormatterLlamaShort,
    DuckDBInstFormatterGraniteShort,
    DuckDBInstFormatterLlama,
    DuckDBInstFormatterLlamaBasic,
    DuckDBInstFormatterGranite,
    DuckDBInstFormatterPhi,
    DuckDBInstFormatterGPTmini,
    DuckDBInstFormatterPhiAzure,
    DuckDBInstFormatterLlamaSyntax,
)

# Registry mapping each --prompt-format CLI name to its formatter class.
PROMPT_FORMATTERS = {
    "rajkumar": RajkumarFormatter,
    "duckdb": DuckDBFormatter,
    "motherduck": MotherDuckFormatter,
    "duckdbinst": DuckDBInstFormatter,
    "duckdbinstllamashort": DuckDBInstFormatterLlamaShort,
    "duckdbinstgraniteshort": DuckDBInstFormatterGraniteShort,
    "duckdbinstllama": DuckDBInstFormatterLlama,
    "duckdbinstgranite": DuckDBInstFormatterGranite,
    "duckdbinstnoshort": DuckDBInstNoShorthandFormatter,
    "duckdbchat": DuckDBChat,
    "duckdbinstphi": DuckDBInstFormatterPhi,
    "duckdbinstgptmini": DuckDBInstFormatterGPTmini,
    "duckdbinstphiazure": DuckDBInstFormatterPhiAzure,
    "duckdbinstllamabasic": DuckDBInstFormatterLlamaBasic,
    "duckdbinstllamasyntax": DuckDBInstFormatterLlamaSyntax,
}
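For context on how this registry is used: the `--prompt-format` value passed to eval/predict.py (e.g. `duckdbinst` in the eval/README example command) selects one of these formatter classes by key. A minimal lookup sketch, with constructor arguments omitted since they are defined in prompt_formatters.py:

```python
from constants import PROMPT_FORMATTERS

# Resolve the formatter class behind the --prompt-format CLI flag.
formatter_cls = PROMPT_FORMATTERS["duckdbinst"]
print(formatter_cls.__name__)  # DuckDBInstFormatter
```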
duckdb-nsql/eval/data/databases/flightinfo/flightinfo.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/hn/hn.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/json/json.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/databases/laptop/laptop.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/databases/laptop_array/laptop_array.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/laptop_json/laptop_json.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/laptop_struct/laptop_struct.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/none/none.duckdb ADDED
Binary file (12.3 kB).
 
duckdb-nsql/eval/data/databases/nyc/nyc.duckdb ADDED
Binary file (799 kB).
 
duckdb-nsql/eval/data/databases/product/product.duckdb ADDED
Binary file (274 kB).
 
duckdb-nsql/eval/data/databases/transactions/transactions.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/databases/who/who.duckdb ADDED
Binary file (537 kB).
 
duckdb-nsql/eval/data/dev.json ADDED
@@ -0,0 +1,602 @@
[
    {
        "db_id": "hn",
        "query": "SELECT COUNT(*) as domain_count, \nSUBSTRING(SPLIT_PART(url, '//', 2), 1, POSITION('/' IN SPLIT_PART(url, '//', 2)) - 1) as domain \nFROM hacker_news\nWHERE url IS NOT NULL GROUP BY domain ORDER BY domain_count DESC LIMIT 10;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "what are the top domains being shared on hacker_news?",
        "category": "hard"
    },
    {
        "db_id": "laptop",
        "query": "SELECT c.firstname, c.lastname, COUNT(*) AS num_pcs_bought\nFROM customers c\nJOIN sales s ON c.customer_id = s.customer_id\nJOIN pcs p ON s.model = p.model\nGROUP BY c.customer_id, c.firstname, c.lastname\nORDER BY num_pcs_bought DESC\nLIMIT 1;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Who bought the most PCs, print also the users name?",
        "category": "medium"
    },
    {
        "db_id": "transactions",
        "query": "select users.id, users.name, sum(transactions.amount) as balance from users join transactions on users.id = transactions.user_id group by users.id, users.name having balance = 0",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "list the names off account holders who have negative balances",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model FROM products WHERE maker = 'B';",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List only the model number of all products made by maker B.",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model FROM products WHERE maker <> 'B';",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List the model numbers of all products not made by maker B.",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT AVG(speed) FROM pcs WHERE speed >= 3.00",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return the average speed all PCs with speed >= 3.00",
        "category": "easy"
    },
    {
        "db_id": "laptop",
        "query": "SELECT MAX(price) FROM printers WHERE color = 'TRUE' AND type='laser'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return the price of the most expensive color laser printer",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT MIN(paid) FROM sales WHERE type_of_payment LIKE '%visa%'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return the minimum amount paid by customers who used a visa card (debit or credit) to purchase a product",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT customer_id FROM customers WHERE firstname LIKE '%e%' OR lastname LIKE '%e%'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the customer_id of customers who have the letter 'e' either in their first name or in their last name",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model, price/0.85 AS 'price (USD)' FROM laptops WHERE ram >= 1024",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Assume all prices in the table Laptops are in Euro. List the prices of laptops with at least 1024 ram. You should return the price in USD in a column called 'price (USD)'. Assume that 1 USD = 0.85 EURO. Name the price column 'price (USD)'.",
        "category": "hard"
    },
    {
        "db_id": "laptop",
        "query": "SELECT maker FROM products GROUP BY maker HAVING COUNT(maker) > 4;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Return a list of makers that make more than four different products.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model FROM laptops WHERE speed > 1.7 ORDER BY speed DESC;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List all the laptop model numbers that have a speed greater than 1.7 in descending order of speed.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT firstname \n FROM sales \n JOIN customers ON sales.customer_id = customers.customer_id \n GROUP BY firstname \n ORDER BY COUNT(firstname);",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List firstnames of customers in an ascending order based on the number of purchases made by customers with this firstname.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT DISTINCT maker FROM products JOIN pcs ON products.model = pcs.model WHERE ram > 1500;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "List all the makers (with only one entry per maker) who make PCs with RAM greater than 1500.",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT city, AVG(paid) as 'avg_spend' FROM sales JOIN customers ON sales.customer_id = customers.customer_id GROUP BY city",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the city and the average amount of money spent by customers in each city. Name the column for the amount 'avg_spend'",
        "category": "medium"
    },
    {
        "db_id": "laptop",
        "query": "SELECT color, MAX(price) as 'max_price' FROM printers GROUP BY color;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the maximum price for each color of printer. Name the column for the maximum price 'max_price'",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select country_name, max(pm25_concentration) as worst_pm25_for_country\nfrom ambient_air_quality\ngroup by country_name\norder by worst_pm25_for_country desc\nlimit 1",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the country with the worst single reading of air quality (highest PM 2.5 value). Show the PM 2.5 value as well.",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select country_name, avg(pm25_concentration) as worst_avg_pm25_for_country\nfrom ambient_air_quality\ngroup by country_name\norder by worst_avg_pm25_for_country desc\nlimit 1",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find the country with the worst average air quality (highest PM 2.5 value). Show the PM 2.5 value as well.",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select distinct country_name from ambient_air_quality order by country_name",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find all countries for which WHO air quality data is available. Sort alphabetically.",
        "category": "medium"
    },
    {
        "db_id": "who",
        "query": "select year, avg(pm25_concentration) from ambient_air_quality \nwhere country_name = 'Singapore'\ngroup by year\norder by year",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Find Singapore air quality defined as PM2.5 concentration over time",
        "category": "medium"
    },
    {
        "db_id": "nyc",
        "query": "SELECT COLUMNS('^trip_') FROM rideshare LIMIT 10;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select only the column names from the rideshare table that start with trip_ and return the first 10 values",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "SELECT * FROM rideshare USING SAMPLE 1%;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select a 1% sample from the nyc.rideshare table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * EXCLUDE (customer_id) FROM customers;\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select all columns from the customer table, except customer_id",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "SUMMARIZE rideshare;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show summary statistics of the rideshare table",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "read a CSV from https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "COPY (SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'))\nTO 'titanic.parquet' (FORMAT 'parquet');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM 'titanic.parquet'",
        "question": "read a CSV from https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv and convert it to a parquet file called \"titanic\"",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "CREATE TABLE titanic AS (SELECT * FROM read_csv_auto(\n'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'))",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM titanic;",
        "question": "create a table called \"titanic\" from CSV file https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "PRAGMA default_null_order='NULLS LAST';",
        "setup_sql": ";",
        "validation_sql": "SELECT current_setting('default_null_order');",
        "question": "configure duckdb to put null values last when sorting",
        "category": "duckdb"
    },
    {
        "db_id": "none",
        "query": "CREATE TABLE IF NOT EXISTS products (\n maker varchar(10),\n model varchar(10),\n type varchar(10));",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM products;",
        "question": "create a table about products, that contains a maker, model and type column",
        "category": "ddl"
    },
    {
        "db_id": "product",
        "query": "INSERT INTO products (maker, model, type)\nVALUES\n ('A', '1001', 'pc');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM products;",
        "question": "add a row with values for model \"1001\" of type \"pc\", from maker \"A\" to products table",
        "category": "ddl"
    },
    {
        "db_id": "none",
        "query": "CALL pragma_version();\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get current version of duckdb",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "PRAGMA table_info('rideshare');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "list all columns in table nyc.rideshare",
        "category": "duckdb"
    },
    {
        "db_id": "nyc",
        "query": "PRAGMA show_tables;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show all tables in the curent database",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT customer_id, model, sum(paid) FROM sales GROUP BY ALL",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "how much did each customer spend per model type?",
        "category": "easy"
    },
    {
        "db_id": "nyc",
        "query": "SELECT Max(datediff('minute', tpep_pickup_datetime, tpep_dropoff_datetime)) from nyc.taxi",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "What was the longest taxi ride in minutes?",
        "category": "hard"
    },
    {
        "db_id": "who",
        "query": "with per_region as (\n select avg(pm10_concentration) as avg_pm10, who_region from ambient_air_quality group by who_region\n), max_region as (\n select who_region from per_region where avg_pm10 = (select max(avg_pm10) from per_region)\n), min_city_value_in_max_region as (\n select min(pm10_concentration) from ambient_air_quality where who_region in (from max_region)\n), min_city_in_max_region as (\n select city from ambient_air_quality where pm10_concentration in (from min_city_value_in_max_region) and who_region in (from max_region)\n)\nfrom min_city_in_max_region",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "What is the city with the lowest pm10 concentration in the region with the highest average pm10 concentration?",
        "category": "hard"
    },
    {
        "db_id": "hn",
        "query": "SELECT *, regexp_extract(text, '([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,63})',0) email from hacker_news where email[:4]='test'",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Get all posts on hn that contain an email address starting with test. Return all original columns, plus a new column containing the email address.",
        "category": "hard"
    },
    {
        "db_id": "json",
        "query": "SELECT employee.id, employee.first_name FROM employee_json",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Extract id and first_name properties as individual columns from the employee struct",
        "category": "duckdb"
    },
    {
        "db_id": "who",
        "query": "SELECT who_region[1]::INT as region, * EXCLUDE (who_region) FROM who.ambient_air_quality",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "count quality measurements per region. Make sure to return the region code (first char of who_region) as integer and sort by region.",
        "category": "duckdb"
    },
    {
        "db_id": "flightinfo",
        "query": "SELECT seat.seat_number FROM seat \nJOIN direct_flight ON direct_flight.flight_number = seat.flight_number \nJOIN airport AS departure_airport ON departure_airport.iata_code = direct_flight.departure_airport_iata_code \nJOIN airport AS arriving_airport ON arriving_airport.iata_code = direct_flight.arriving_airport_iata_code \nJOIN city AS departure_city ON departure_city.city_zipcode = departure_airport.city_zip_code \nJOIN city AS arriving_city ON arriving_city.city_zipcode = arriving_airport.city_zip_code \nWHERE departure_city.city_name = 'Bruxelles' AND arriving_city.city_name = 'Newark';",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "Which seats were available on the flight from Bruxelles to Newark?",
        "category": "hard"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers FROM 'customers_12_12_2023.csv';",
        "setup_sql": "COPY customers TO 'customers_12_12_2023.csv';",
        "validation_sql": "SELECT * FROM customers;",
        "question": "copy content of csv file customers_12_12_2023.csv into customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers FROM 'customers_12_12_2023.csv' (DELIMITER '\\t');",
        "setup_sql": "COPY customers TO 'customers_12_12_2023.csv' (FORMAT CSV, DELIMITER '\\t');",
        "validation_sql": "SELECT * FROM customers;",
        "question": "copy content of csv file costomers_12_12_2023.csv into customers table with tab separator",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers FROM 'customers_partitioned/city=Amsterdam/*.parquet';",
        "setup_sql": "COPY customers TO 'customers_partitioned' (FORMAT PARQUET, PARTITION_BY (city), OVERWRITE_OR_IGNORE True);",
        "validation_sql": "SELECT * FROM customers;;",
        "question": "copy any parquet files from 'customers_partitioned/city=Amsterdam/' into customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "COPY customers(customer_id) FROM 'customers_customer_ids.csv';",
        "setup_sql": "COPY customers(customer_id) TO 'customers_customer_ids.csv';",
        "validation_sql": "SELECT * FROM customers;",
        "question": "copy only the customer_id column from the customers_customer_ids.csv into the customers tables",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "CREATE TABLE test_tbl AS SELECT * FROM read_json_auto('test.json');",
        "setup_sql": "COPY customers TO 'test.json'\n",
        "validation_sql": "SELECT * FROM test_tbl;",
        "question": "read json file from test.json and create new table from it called 'test_tbl'",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM read_csv_auto('test.csv');",
        "setup_sql": "COPY customers TO 'test.csv';",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "read csv from test.csv",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM read_csv_auto('test.csv', columns={'customer_id': 'VARCHAR', 'firstname': 'VARCHAR', 'lastname': 'VARCHAR'});",
        "setup_sql": "COPY customers(customer_id, firstname, lastname) TO 'test.csv';",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "read csv from test.csv with predefined column and types - customer_id: string, firstname: string, lastname: string",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * EXCLUDE (ram, hd) FROM pcs;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select all columns from pcs table except for ram and hd",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT COLUMNS('name$') FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "select all columns ending with 'name' from customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT LENGTH(COLUMNS('name$')) FROM customers",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "for each column ending with 'name' in the customers table, compute the string length",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * REPLACE (upper(city) AS city) FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get all columns from customer table, and make all city names uppercase",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "EXPLAIN SELECT * FROM customers",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show query plan for query: SELECT * from customers",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT ascii(lastname) FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first character of the firstname column and cast it to an INT",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT model, speed::INTEGER FROM laptops;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get laptop name and speed, return the speed as integer",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_array",
        "query": "SELECT phone_numbers[1] FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first phone number of each customer",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_array",
        "query": "INSERT INTO customers(customer_id, phone_numbers) VALUES (5, ['12312323', '23123344']);",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM customers;",
        "question": "insert two phone numbers to customer with id 5 [\\\"12312323\\\", and '23123344']",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "ALTER TABLE customers ADD COLUMN phone_numbers VARCHAR[];",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE customers;",
        "question": "how to add a new column phone_numbers to the customers table, with array type varchar",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT firstname[1] FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first letter of the customers firstname",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_array",
        "query": "SELECT phone_numbers[:2] FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get the first two phone numbers from the phone numbers array of each customer",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT {'a':1, 'b':2, 'c':3};",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "create a struct with keys a, b, c and values 1,2,3",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT [1,2,3];\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "create array with values 1,2,3",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "CREATE TABLE test (embeddings FLOAT[100]);",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE test;",
        "question": "create table test with a fix-sized array column with 100 dimenions, called embeddings",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "CREATE TABLE test (person STRUCT(name VARCHAR, id INTEGER));",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE test;",
        "question": "create table test with a struct column called person with properties name and id",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_struct",
        "query": "SELECT person.name, person.id FROM test;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get persons name and persons id from the test table.",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "UPDATE customers SET email = NULL;",
        "setup_sql": ";",
        "validation_sql": "SELECT email FROM customers;",
        "question": "remove all values from email column in customers table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_json",
        "query": "ALTER TABLE customers ALTER COLUMN email SET DATA TYPE VARCHAR;",
        "setup_sql": ";",
        "validation_sql": "DESCRIBE customers;",
        "question": "make customer email of type VARCHAR",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_json",
        "query": "INSERT INTO customers (customer_id, email) VALUES (5,'{\"from\": \"[email protected]\", \"to\": \"[email protected]\"}');",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM customers;",
        "question": "insert json into customer email for customer id 5: {'from': '[email protected]', 'to': '[email protected]'}",
        "category": "duckdb"
    },
    {
        "db_id": "laptop_json",
        "query": "SELECT customers.email->>'from' FROM customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "get 'from' field from customer email json",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SUMMARIZE customers;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "summarize the customer table",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM customers USING SAMPLE 10% (reservoir);",
        "setup_sql": ";",
        "validation_sql": "SELECT count(*) FROM ddb_benchmark_result;",
        "question": "sample 10% from the customers table using reservoir sampling",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SET threads = 10;",
        "setup_sql": ";",
        "validation_sql": "SELECT current_setting('threads');",
        "question": "set number of threads to 10",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SET memory_limit = '20G';\n",
        "setup_sql": ";",
        "validation_sql": "SELECT current_setting('memory_limit');",
        "question": "set memory limit to 20 gigabyte",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * EXCLUDE (price), avg(price) FROM laptops GROUP BY ALL;",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show the average price of laptop and group by the remaining columns",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "SELECT * FROM laptops WHERE price > 1000 ORDER BY ALL;\n",
        "setup_sql": ";",
        "validation_sql": "SELECT * FROM ddb_benchmark_result;",
        "question": "show all laptops with price above 1000 and order by all columns",
        "category": "duckdb"
    },
    {
        "db_id": "laptop",
        "query": "ATTACH 'who.ddb';",
        "setup_sql": ";",
        "validation_sql": "SHOW DATABASES;",
        "question": "attach database file who.ddb",
        "category": "duckdb"
    }
]
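Since dev.json is the file the predict and evaluate commands consume, a quick inspection snippet can confirm its shape. This is a minimal sketch, assuming it is run from the repository root:

```python
import json
from collections import Counter

# Load the benchmark examples shipped in this commit.
with open("duckdb-nsql/eval/data/dev.json") as f:
    examples = json.load(f)

# Each example carries db_id, query, setup_sql, validation_sql,
# question, and category fields.
print(len(examples), "examples")
print(Counter(ex["category"] for ex in examples))
```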
duckdb-nsql/eval/data/tables.json ADDED
@@ -0,0 +1,3465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ [
+   {
+     "db_id": "hn",
+     "column_names": [[-1, "*"], [0, "title"], [0, "url"], [0, "text"], [0, "dead"], [0, "by"], [0, "score"], [0, "time"],
+       [0, "timestamp"], [0, "type"], [0, "id"], [0, "parent"], [0, "descendants"], [0, "ranking"], [0, "deleted"]],
+     "column_names_original": [[-1, "*"], [0, "title"], [0, "url"], [0, "text"], [0, "dead"], [0, "by"], [0, "score"], [0, "time"],
+       [0, "timestamp"], [0, "type"], [0, "id"], [0, "parent"], [0, "descendants"], [0, "ranking"], [0, "deleted"]],
+     "column_types": ["text", "varchar", "varchar", "varchar", "boolean", "varchar", "bigint", "bigint", "timestamp",
+       "varchar", "bigint", "bigint", "bigint", "bigint", "boolean"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["hacker_news"],
+     "table_names_original": ["hacker_news"]
+   },
+   {
+     "db_id": "laptop",
+     "column_names": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_types": ["text", "char", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["customers", "laptops", "pcs", "printers", "products", "sales"]
+   },
+   {
+     "db_id": "transactions",
+     "column_names": [[-1, "*"], [0, "id"], [0, "name"], [1, "user_id"], [1, "amount"]],
+     "column_names_original": [[-1, "*"], [0, "id"], [0, "name"], [1, "user_id"], [1, "amount"]],
+     "column_types": ["text", "int", "varchar", "int", "int"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["users", "transactions"],
+     "table_names_original": ["users", "transactions"]
+   },
+   {
+     "db_id": "who",
+     "column_names": [[-1, "*"], [0, "who_region"], [0, "iso3"], [0, "country_name"], [0, "city"], [0, "year"], [0, "version"],
+       [0, "pm10_concentration"], [0, "pm25_concentration"], [0, "no2_concentration"], [0, "pm10_tempcov"], [0, "pm25_tempcov"],
+       [0, "no2_tempcov"], [0, "type_of_stations"], [0, "reference"], [0, "web_link"], [0, "population"], [0, "population_source"],
+       [0, "latitude"], [0, "longitude"], [0, "who_ms"]],
+     "column_names_original": [[-1, "*"], [0, "who_region"], [0, "iso3"], [0, "country_name"], [0, "city"], [0, "year"], [0, "version"],
+       [0, "pm10_concentration"], [0, "pm25_concentration"], [0, "no2_concentration"], [0, "pm10_tempcov"], [0, "pm25_tempcov"],
+       [0, "no2_tempcov"], [0, "type_of_stations"], [0, "reference"], [0, "web_link"], [0, "population"], [0, "population_source"],
+       [0, "latitude"], [0, "longitude"], [0, "who_ms"]],
+     "column_types": ["text", "varchar", "varchar", "varchar", "varchar", "bigint", "varchar", "bigint", "bigint", "bigint",
+       "bigint", "bigint", "bigint", "varchar", "varchar", "varchar", "varchar", "varchar", "float", "float", "bigint"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["ambient_air_quality"],
+     "table_names_original": ["ambient_air_quality"]
+   },
+   {
+     "db_id": "nyc",
+     "column_names": [[-1, "*"], [0, "unique_key"], [0, "created_date"], [0, "closed_date"], [0, "agency"], [0, "agency_name"],
+       [0, "complaint_type"], [0, "descriptor"], [0, "location_type"], [0, "incident_zip"], [0, "incident_address"],
+       [0, "street_name"], [0, "cross_street_1"], [0, "cross_street_2"], [0, "intersection_street_1"], [0, "intersection_street_2"],
+       [0, "address_type"], [0, "city"], [0, "landmark"], [0, "facility_type"], [0, "status"], [0, "due_date"],
+       [0, "resolution_description"], [0, "resolution_action_updated_date"], [0, "community_board"], [0, "bbl"], [0, "borough"],
+       [0, "x_coordinate_state_plane"], [0, "y_coordinate_state_plane"], [0, "open_data_channel_type"], [0, "park_facility_name"],
+       [0, "park_borough"], [0, "vehicle_type"], [0, "taxi_company_borough"], [0, "taxi_pick_up_location"],
+       [0, "bridge_highway_name"], [0, "bridge_highway_direction"], [0, "road_ramp"], [0, "bridge_highway_segment"],
+       [0, "latitude"], [0, "longitude"],
+       [1, "hvfhs_license_num"], [1, "dispatching_base_num"], [1, "originating_base_num"], [1, "request_datetime"],
+       [1, "on_scene_datetime"], [1, "pickup_datetime"], [1, "dropoff_datetime"], [1, "PULocationID"], [1, "DOLocationID"],
+       [1, "trip_miles"], [1, "trip_time"], [1, "base_passenger_fare"], [1, "tolls"], [1, "bcf"], [1, "sales_tax"],
+       [1, "congestion_surcharge"], [1, "airport_fee"], [1, "tips"], [1, "driver_pay"], [1, "shared_request_flag"],
+       [1, "shared_match_flag"], [1, "access_a_ride_flag"], [1, "wav_request_flag"], [1, "wav_match_flag"],
+       [2, "VendorID"], [2, "tpep_pickup_datetime"], [2, "tpep_dropoff_datetime"], [2, "passenger_count"], [2, "trip_distance"],
+       [2, "RatecodeID"], [2, "store_and_fwd_flag"], [2, "PULocationID"], [2, "DOLocationID"], [2, "payment_type"],
+       [2, "fare_amount"], [2, "extra"], [2, "mta_tax"], [2, "tip_amount"], [2, "tolls_amount"], [2, "improvement_surcharge"],
+       [2, "total_amount"], [2, "congestion_surcharge"], [2, "airport_fee"]],
+     "column_names_original": [[-1, "*"], [0, "unique_key"], [0, "created_date"], [0, "closed_date"], [0, "agency"], [0, "agency_name"],
+       [0, "complaint_type"], [0, "descriptor"], [0, "location_type"], [0, "incident_zip"], [0, "incident_address"],
+       [0, "street_name"], [0, "cross_street_1"], [0, "cross_street_2"], [0, "intersection_street_1"], [0, "intersection_street_2"],
+       [0, "address_type"], [0, "city"], [0, "landmark"], [0, "facility_type"], [0, "status"], [0, "due_date"],
+       [0, "resolution_description"], [0, "resolution_action_updated_date"], [0, "community_board"], [0, "bbl"], [0, "borough"],
+       [0, "x_coordinate_state_plane"], [0, "y_coordinate_state_plane"], [0, "open_data_channel_type"], [0, "park_facility_name"],
+       [0, "park_borough"], [0, "vehicle_type"], [0, "taxi_company_borough"], [0, "taxi_pick_up_location"],
+       [0, "bridge_highway_name"], [0, "bridge_highway_direction"], [0, "road_ramp"], [0, "bridge_highway_segment"],
+       [0, "latitude"], [0, "longitude"],
+       [1, "hvfhs_license_num"], [1, "dispatching_base_num"], [1, "originating_base_num"], [1, "request_datetime"],
+       [1, "on_scene_datetime"], [1, "pickup_datetime"], [1, "dropoff_datetime"], [1, "PULocationID"], [1, "DOLocationID"],
+       [1, "trip_miles"], [1, "trip_time"], [1, "base_passenger_fare"], [1, "tolls"], [1, "bcf"], [1, "sales_tax"],
+       [1, "congestion_surcharge"], [1, "airport_fee"], [1, "tips"], [1, "driver_pay"], [1, "shared_request_flag"],
+       [1, "shared_match_flag"], [1, "access_a_ride_flag"], [1, "wav_request_flag"], [1, "wav_match_flag"],
+       [2, "VendorID"], [2, "tpep_pickup_datetime"], [2, "tpep_dropoff_datetime"], [2, "passenger_count"], [2, "trip_distance"],
+       [2, "RatecodeID"], [2, "store_and_fwd_flag"], [2, "PULocationID"], [2, "DOLocationID"], [2, "payment_type"],
+       [2, "fare_amount"], [2, "extra"], [2, "mta_tax"], [2, "tip_amount"], [2, "tolls_amount"], [2, "improvement_surcharge"],
+       [2, "total_amount"], [2, "congestion_surcharge"], [2, "airport_fee"]],
+     "column_types": ["text", "bigint", "timestamp", "timestamp", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "timestamp", "varchar", "timestamp", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "double", "double",
+       "varchar", "varchar", "varchar", "timestamp", "timestamp", "timestamp", "timestamp", "bigint", "bigint", "double",
+       "bigint", "double", "double", "double", "double", "double", "double", "double", "double", "varchar", "varchar",
+       "varchar", "varchar", "varchar",
+       "bigint", "timestamp", "timestamp", "double", "double", "double", "varchar", "bigint", "bigint", "bigint",
+       "double", "double", "double", "double", "double", "double", "double", "double", "double"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["service_requests", "rideshare", "taxi"],
+     "table_names_original": ["service_requests", "rideshare", "taxi"]
+   },
+   {
+     "db_id": "product",
+     "column_names": [[-1, "*"], [0, "maker"], [0, "model"], [0, "type"]],
+     "column_names_original": [[-1, "*"], [0, "maker"], [0, "model"], [0, "type"]],
+     "column_types": ["text", "varchar", "varchar", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["products"],
+     "table_names_original": ["products"]
+   },
+   {
+     "db_id": "json",
+     "column_names": [[-1, "*"], [0, "employee"]],
+     "column_names_original": [[-1, "*"], [0, "employee"]],
+     "column_types": ["text", "struct(id int, first_name text, last_name text, email text)"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["employee_json"],
+     "table_names_original": ["employee_json"]
+   },
+   {
+     "db_id": "flightinfo",
+     "column_names": [[-1, "*"],
+       [0, "icao_code"], [0, "email"], [0, "name"], [0, "phone_number"], [0, "iata_code"],
+       [1, "title"], [1, "description"], [1, "price"], [1, "service_type"], [1, "airline_icao_code"],
+       [2, "iata_code"], [2, "address"], [2, "name"], [2, "phone_number"], [2, "email"], [2, "city_zip_code"], [2, "city_dbpedia"],
+       [3, "title"], [3, "cabin_bag_dimension_cm"], [3, "cabin_bags_no"], [3, "cabin_bg_weight_kg"],
+       [3, "checked_bag_dimension_cm"], [3, "checked_bags_no"], [3, "checked_bag_weight_kg"],
+       [3, "excessive_price_perkg"], [3, "flight_type"], [3, "airline_icao_code"],
+       [4, "title"], [4, "description"], [4, "airline_icao_code"],
+       [5, "title"], [5, "description"], [5, "due_date"], [5, "refund_postdue_percentage"],
+       [5, "refund_predue_percentage"], [5, "airline_icao_code"],
+       [6, "city_zipcode"], [6, "city_name"], [6, "country_iso_code"],
+       [7, "country_iso_code"], [7, "country_name"],
+       [8, "flight_number"], [8, "departure_airport_iata_code"], [8, "arriving_airport_iata_code"],
+       [9, "number"], [9, "departure_date"], [9, "arrival_date"], [9, "distance_km"], [9, "is_available"],
+       [9, "duration_min"], [9, "airline_icao_code"], [9, "type"],
+       [10, "title"], [10, "description"], [10, "cabin_class_title"], [10, "baggage_policy_title"], [10, "cancelation_policy_title"],
+       [11, "subflight_number"], [11, "flight_number"],
+       [12, "flight_package_title"], [12, "airline_service_title"],
+       [13, "seat_number"], [13, "is_available"], [13, "flight_number"],
+       [14, "meal_type"], [14, "airline_service_title"],
+       [15, "duration_min"], [15, "duration_from"], [15, "duration_to"], [15, "airport_iatacode"], [15, "flight_number"],
+       [16, "flight_number"], [16, "package_title"], [16, "trip_id"], [16, "requested_excessive_baggage_kg"],
+       [16, "seat_number"], [16, "chosen_meal_service_price"], [16, "chosen_wifi_service_price"], [16, "price"],
+       [17, "id"], [17, "tax"], [17, "booking_date"], [17, "user_email"], [17, "type"],
+       [18, "email"], [18, "first_name"], [18, "last_name"], [18, "birthdate"], [18, "passport_number"],
+       [18, "address"], [18, "password"], [18, "phone_number"],
+       [19, "wifi_onboard_service_bandwidth_MB"], [19, "airline_service_title"]],
+     "column_names_original": [[-1, "*"],
+       [0, "icao_code"], [0, "email"], [0, "name"], [0, "phone_number"], [0, "iata_code"],
+       [1, "title"], [1, "description"], [1, "price"], [1, "service_type"], [1, "airline_icao_code"],
+       [2, "iata_code"], [2, "address"], [2, "name"], [2, "phone_number"], [2, "email"], [2, "city_zip_code"], [2, "city_dbpedia"],
+       [3, "title"], [3, "cabin_bag_dimension_cm"], [3, "cabin_bags_no"], [3, "cabin_bg_weight_kg"],
+       [3, "checked_bag_dimension_cm"], [3, "checked_bags_no"], [3, "checked_bag_weight_kg"],
+       [3, "excessive_price_perkg"], [3, "flight_type"], [3, "airline_icao_code"],
+       [4, "title"], [4, "description"], [4, "airline_icao_code"],
+       [5, "title"], [5, "description"], [5, "due_date"], [5, "refund_postdue_percentage"],
+       [5, "refund_predue_percentage"], [5, "airline_icao_code"],
+       [6, "city_zipcode"], [6, "city_name"], [6, "country_iso_code"],
+       [7, "country_iso_code"], [7, "country_name"],
+       [8, "flight_number"], [8, "departure_airport_iata_code"], [8, "arriving_airport_iata_code"],
+       [9, "number"], [9, "departure_date"], [9, "arrival_date"], [9, "distance_km"], [9, "is_available"],
+       [9, "duration_min"], [9, "airline_icao_code"], [9, "type"],
+       [10, "title"], [10, "description"], [10, "cabin_class_title"], [10, "baggage_policy_title"], [10, "cancelation_policy_title"],
+       [11, "subflight_number"], [11, "flight_number"],
+       [12, "flight_package_title"], [12, "airline_service_title"],
+       [13, "seat_number"], [13, "is_available"], [13, "flight_number"],
+       [14, "meal_type"], [14, "airline_service_title"],
+       [15, "duration_min"], [15, "duration_from"], [15, "duration_to"], [15, "airport_iatacode"], [15, "flight_number"],
+       [16, "flight_number"], [16, "package_title"], [16, "trip_id"], [16, "requested_excessive_baggage_kg"],
+       [16, "seat_number"], [16, "chosen_meal_service_price"], [16, "chosen_wifi_service_price"], [16, "price"],
+       [17, "id"], [17, "tax"], [17, "booking_date"], [17, "user_email"], [17, "type"],
+       [18, "email"], [18, "first_name"], [18, "last_name"], [18, "birthdate"], [18, "passport_number"],
+       [18, "address"], [18, "password"], [18, "phone_number"],
+       [19, "wifi_onboard_service_bandwidth_MB"], [19, "airline_service_title"]],
+     "column_types": ["text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar", "double",
+       "varchar", "varchar", "varchar", "text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "double", "double", "double", "double", "double", "double", "double", "varchar", "varchar", "varchar", "text",
+       "varchar", "varchar", "text", "text", "int", "int", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "text", "varchar", "varchar", "varchar", "varchar", "datetime", "datetime", "double", "tinyint", "double",
+       "varchar", "varchar", "varchar", "text", "varchar", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "varchar", "varchar", "tinyint", "varchar", "varchar", "varchar", "double", "datetime", "datetime", "varchar",
+       "varchar", "varchar", "varchar", "int", "int", "varchar", "int", "int", "double", "int", "double", "datetime",
+       "varchar", "varchar", "varchar", "varchar", "varchar", "date", "varchar", "varchar", "varchar", "double",
+       "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["airline", "airline_service", "airport", "baggage_policy", "cabin_class", "cancellation_policy",
+       "city", "country", "direct_flight", "flight", "flight_package", "non_direct_flight", "package_service", "seat",
+       "special_meal_type", "stopping", "ticke", "trip", "user", "wifi_onboard_service"],
+     "table_names_original": ["airline", "airline_service", "airport", "baggage_policy", "cabin_class", "cancellation_policy",
+       "city", "country", "direct_flight", "flight", "flight_package", "non_direct_flight", "package_service", "seat",
+       "special_meal_type", "stopping", "ticke", "trip", "user", "wifi_onboard_service"]
+   },
+   {
+     "db_id": "none",
+     "column_names": [[-1, "*"]],
+     "column_names_original": [[-1, "*"]],
+     "column_types": ["text"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": [],
+     "table_names_original": []
+   },
+   {
+     "db_id": "laptop_array",
+     "column_names": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"],
+       [0, "email"], [0, "phone_numbers"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"],
+       [0, "email"], [0, "phone_number"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_types": ["text", "char", "varchar", "varchar", "varchar", "varchar", "varchar", "array",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["customers", "laptops", "pcs", "printers", "products", "sales"]
+   },
+   {
+     "db_id": "laptop_struct",
+     "column_names": [[-1, "*"], [0, "person"],
+       [1, "customer_id"], [1, "firstname"], [1, "lastname"], [1, "city"], [1, "address"], [1, "email"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "screen"], [2, "price"],
+       [3, "model"], [3, "speed"], [3, "ram"], [3, "hd"], [3, "price"],
+       [4, "model"], [4, "color"], [4, "type"], [4, "price"],
+       [5, "maker"], [5, "model"], [5, "type"],
+       [6, "customer_id"], [6, "model"], [6, "quantity"], [6, "day"], [6, "paid"], [6, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "person"],
+       [1, "customer_id"], [1, "firstname"], [1, "lastname"], [1, "city"], [1, "address"], [1, "email"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "screen"], [2, "price"],
+       [3, "model"], [3, "speed"], [3, "ram"], [3, "hd"], [3, "price"],
+       [4, "model"], [4, "color"], [4, "type"], [4, "price"],
+       [5, "maker"], [5, "model"], [5, "type"],
+       [6, "customer_id"], [6, "model"], [6, "quantity"], [6, "day"], [6, "paid"], [6, "type_of_payment"]],
+     "column_types": ["text", "struct(id int, name: text)", "char", "varchar", "varchar", "varchar", "varchar", "varchar",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["test", "customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["test", "customers", "laptops", "pcs", "printers", "products", "sales"]
+   },
+   {
+     "db_id": "laptop_json",
+     "column_names": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_names_original": [[-1, "*"], [0, "customer_id"], [0, "firstname"], [0, "lastname"], [0, "city"], [0, "address"], [0, "email"],
+       [1, "model"], [1, "speed"], [1, "ram"], [1, "hd"], [1, "screen"], [1, "price"],
+       [2, "model"], [2, "speed"], [2, "ram"], [2, "hd"], [2, "price"],
+       [3, "model"], [3, "color"], [3, "type"], [3, "price"],
+       [4, "maker"], [4, "model"], [4, "type"],
+       [5, "customer_id"], [5, "model"], [5, "quantity"], [5, "day"], [5, "paid"], [5, "type_of_payment"]],
+     "column_types": ["text", "char", "varchar", "varchar", "varchar", "varchar", "json",
+       "char", "double", "int", "int", "double", "double",
+       "char", "double", "int", "int", "double",
+       "char", "varchar", "varchar", "double",
+       "char", "char", "varchar",
+       "char", "char", "int", "date", "double", "varchar"],
+     "foreign_keys": {},
+     "primary_keys": {},
+     "table_names": ["customers", "laptops", "pcs", "printers", "products", "sales"],
+     "table_names_original": ["customers", "laptops", "pcs", "printers", "products", "sales"]
+   }
+ ]
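
Each entry above follows the Spider-style schema convention: `column_names` pairs a table index (into `table_names`) with a column name, index -1 marks the synthetic `*` column, and `column_types` lines up positionally with the column list. A minimal sketch, not part of the commit, of regrouping one entry per table; the file path is illustrative:

import json

# Load the schema file; adjust the path to wherever tables.json lives.
with open("duckdb-nsql/eval/data/tables.json") as f:
    schemas = json.load(f)

hn = next(db for db in schemas if db["db_id"] == "hn")
tables = hn["table_names_original"]
by_table = {name: [] for name in tables}
for (table_idx, col_name), col_type in zip(
    hn["column_names_original"], hn["column_types"]
):
    if table_idx == -1:  # skip the synthetic "*" column
        continue
    by_table[tables[table_idx]].append(f"{col_name} {col_type}")

print(by_table["hacker_news"])  # ['title varchar', 'url varchar', ...]

This is exactly the pairing that `read_tables_json` in `data_utils.py` below performs when it builds `Table` objects.
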
duckdb-nsql/eval/data_utils.py ADDED
@@ -0,0 +1,79 @@
+ """Training data prep utils."""
+ import json
+ import re
+ from collections import defaultdict
+ from schema import ForeignKey, Table, TableColumn
+
+
+ def read_tables_json(
+     schema_file: str,
+     lowercase: bool = False,
+ ) -> dict[str, dict[str, Table]]:
+     """Read tables json."""
+     data = json.load(open(schema_file))
+     db_to_tables = {}
+     for db in data:
+         db_name = db["db_id"]
+         table_names = db["table_names_original"]
+         db["column_names_original"] = [
+             [x[0], x[1]] for x in db["column_names_original"]
+         ]
+         db["column_types"] = db["column_types"]
+         if lowercase:
+             table_names = [tn.lower() for tn in table_names]
+         pks = db["primary_keys"]
+         fks = db["foreign_keys"]
+         tables = defaultdict(list)
+         tables_pks = defaultdict(list)
+         tables_fks = defaultdict(list)
+         for idx, ((ti, col_name), col_type) in enumerate(
+             zip(db["column_names_original"], db["column_types"])
+         ):
+             if ti == -1:
+                 continue
+             if lowercase:
+                 col_name = col_name.lower()
+                 col_type = col_type.lower()
+             if idx in pks:
+                 tables_pks[table_names[ti]].append(
+                     TableColumn(name=col_name, dtype=col_type)
+                 )
+             for fk in fks:
+                 if idx == fk[0]:
+                     other_column = db["column_names_original"][fk[1]]
+                     other_column_type = db["column_types"][fk[1]]
+                     other_table = table_names[other_column[0]]
+                     tables_fks[table_names[ti]].append(
+                         ForeignKey(
+                             column=TableColumn(name=col_name, dtype=col_type),
+                             references_name=other_table,
+                             references_column=TableColumn(
+                                 name=other_column[1], dtype=other_column_type
+                             ),
+                         )
+                     )
+             tables[table_names[ti]].append(TableColumn(name=col_name, dtype=col_type))
+         db_to_tables[db_name] = {
+             table_name: Table(
+                 name=table_name,
+                 columns=tables[table_name],
+                 pks=tables_pks[table_name],
+                 fks=tables_fks[table_name],
+                 examples=None,
+             )
+             for table_name in tables
+         }
+     return db_to_tables
+
+
+ def clean_str(target: str) -> str:
+     """Clean string for question."""
+     if not target:
+         return target
+
+     target = re.sub(r"[^\x00-\x7f]", r" ", target)
+     line = re.sub(r"''", r" ", target)
+     line = re.sub(r"``", r" ", line)
+     line = re.sub(r"\"", r"'", line)
+     line = re.sub(r"[\t ]+", " ", line)
+     return line.strip()
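
A short usage sketch for these helpers, not part of the commit: it assumes `schema.py` from this eval directory is importable (since `read_tables_json` builds its `Table` objects), and the path is illustrative.

from data_utils import clean_str, read_tables_json

# Map each database id to its tables, lowercasing names for schema matching.
db_to_tables = read_tables_json("duckdb-nsql/eval/data/tables.json", lowercase=True)
laptops = db_to_tables["laptop"]["laptops"]
print([f"{col.name} {col.dtype}" for col in laptops.columns])

# clean_str strips non-ASCII characters, collapses whitespace, and
# rewrites double quotes to single quotes in a question string.
print(clean_str('How many  "laptops"  cost less than 1000?'))
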
duckdb-nsql/eval/doc_retriever.py ADDED
@@ -0,0 +1,92 @@
+ """Retrieve documentation for a given query."""
+
+ from pathlib import Path
+ from typing import Any
+ from rich.console import Console
+ from tqdm import tqdm
+ import numpy as np
+ from manifest import Manifest
+ from langchain.text_splitter import MarkdownHeaderTextSplitter
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ console = Console(soft_wrap=True)
+
+ try:
+     EMBEDDING_MODEL = Manifest(
+         client_name="openaiembedding",
+     )
+ except Exception as e:
+     console.print(e)
+     console.print(
+         "Failed to load embedding model. Likely OPENAI API key is not set. Please set to run document retrieval.",
+         style="bold red",
+     )
+
+
+ def load_documentation(path: Path) -> dict[str, str]:
+     """Load documentation from path."""
+     content = {}
+     for file in path.glob("**/*.md"):
+         with open(file, "r") as f:
+             data = f.read()
+         key = str(file).replace(str(path), "")
+         content[key] = data
+     return content
+
+
+ def split_documents(content: dict[str, str]) -> dict[str, Any]:
+     """Split documents into chunks."""
+     md_splitted_docs = []
+     markdown_splitter = MarkdownHeaderTextSplitter(
+         headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
+     )
+     text_splitter = RecursiveCharacterTextSplitter(
+         separators=["\n"], chunk_size=500, chunk_overlap=50, length_function=len
+     )
+
+     for file, raw_doc in content.items():
+         splitted_text = markdown_splitter.split_text(raw_doc)
+         for t in splitted_text:
+             t.metadata["source"] = file
+         md_splitted_docs.extend(splitted_text)
+
+     docs = text_splitter.split_documents(md_splitted_docs)
+     docs_as_dict = [doc.dict() for doc in docs]
+     return docs_as_dict
+
+
+ def get_embeddings(text: str) -> np.ndarray:
+     """Get embeddings."""
+     return np.array(EMBEDDING_MODEL.run(text))
+
+
+ def embed_documents(
+     chunked_docs: dict[str, Any], key: str = "page_content"
+ ) -> tuple[dict[str, Any], np.ndarray]:
+     """Embed documents."""
+     all_embeddings = []
+     for doc in tqdm(chunked_docs):
+         emb = get_embeddings(doc[key])
+         doc["embedding"] = emb
+         all_embeddings.append(doc["embedding"])
+     full_embedding_mat = np.vstack(all_embeddings)
+     return chunked_docs, full_embedding_mat
+
+
+ def query_docs(
+     query: str,
+     docs: dict[str, Any],
+     embedding_mat: np.ndarray,
+     top_n: int = 10,
+     key: str = "page_content",
+ ) -> tuple[list[int], list[str]]:
+     """Query documents."""
+     query_embedding = get_embeddings(query)
+     scores = embedding_mat.dot(query_embedding)
+     sorted_indices = np.argsort(scores)[::-1]
+     top_n_indices = sorted_indices[:top_n]
+     top_n_indices_rev = top_n_indices[::-1]
+     returned_docs = []
+     for i in top_n_indices_rev:
+         returned_docs.append(docs[i][key])
+     return top_n_indices_rev.tolist(), returned_docs
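
The helpers above compose into a simple embed-and-rank pipeline. A sketch, not part of the commit: the docs directory is hypothetical, and an OpenAI API key must be configured or the module-level Manifest client will fail to load on import.

from pathlib import Path

from doc_retriever import (
    embed_documents,
    load_documentation,
    query_docs,
    split_documents,
)

content = load_documentation(Path("duckdb-web/docs"))  # hypothetical docs checkout
chunks = split_documents(content)                      # markdown-aware chunking
chunks, embedding_mat = embed_documents(chunks)        # one embedding per chunk
_, top_docs = query_docs(
    "How do I read a CSV file in DuckDB?", chunks, embedding_mat, top_n=5
)
for doc in top_docs:  # ranked by dot-product similarity
    print(doc[:100])
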
duckdb-nsql/eval/evaluate.py ADDED
@@ -0,0 +1,392 @@
+ """Evaluate text2sql spider model predictions."""
+ import json
+ import os
+ import re
+ import signal
+ import sys
+ import traceback
+ from pathlib import Path
+ from typing import Any
+
+ import click
+ import pandas as pd
+ from rich.console import Console
+ from tqdm.auto import tqdm
+
+ sys.path.append(os.path.join(os.path.dirname(__file__), "."))
+ # from metrics.spider import evaluation as spider_evaluation  # type: ignore # noqa: E402
+ from metrics.test_suite_sql_eval import (  # type: ignore # noqa: E402
+     evaluation as test_suite_evaluation,
+ )
+ from data_utils import read_tables_json  # type: ignore # noqa: E402
+ from metric_utils import (  # type: ignore # noqa: E402
+     correct_casing,
+     edit_distance,
+ )
+
+ console = Console(soft_wrap=True)
+
+ LEVELS = ["easy", "medium", "hard", "duckdb", "ddl", "all"]
+ PARTIAL_TYPES = [
+     "select",
+     "select(no AGG)",
+     "where",
+     "where(no OP)",
+     "group(no Having)",
+     "group",
+     "order",
+     "and/or",
+     "IUEN",
+     "keywords",
+ ]
+ TIMEOUT_SECONDS = 30
+
+
+ def timeout_handler(signum: int, frame: Any) -> None:
+     raise TimeoutError("Function execution timed out.")
+
+
+ def print_scores(scores: dict, model_name: str, metric_type: str = "exec") -> None:
+     """Print scores."""
+
+     def print_formated_s(
+         row_name: str, l: list[str], element_format: str = "{}", sep: str = "\t"
+     ) -> None:
+         template = "{}" + sep + sep.join([element_format] * len(l))
+         console.print(template.format(row_name, *l))
+
+     # Add empty scores for each level if not present
+     for level in LEVELS:
+         if level not in scores:
+             scores[level] = {}
+             scores[level]["count"] = 0
+             scores[level]["exec"] = 0
+             scores[level]["exact"] = 0
+
+     print_formated_s("", LEVELS)
+     counts = [scores[level]["count"] for level in LEVELS]
+     print_formated_s("count", counts)
+     console.print(f">====================== {model_name} =====================")
+     if metric_type == "exec":
+         console.print(
+             ">===================== EXECUTION ACCURACY ====================="
+         )
+         exec_scores = [scores[level]["exec"] for level in LEVELS]
+         print_formated_s("execution", exec_scores, element_format="{:.3f}")
+
+     elif metric_type == "exact":
+         console.print(
+             "\n>====================== EXACT MATCHING ACCURACY ====================="
+         )
+         exact_scores = [scores[level]["exact"] for level in LEVELS]
+         print_formated_s("exact match", exact_scores, element_format="{:.3f}")
+
+
+ def compute_exact_match_metric(
+     predictions: list,
+     references: list,
+     gold_dbs: list,
+     kmaps: dict,
+     db_dir: str,
+     categories,
+ ) -> dict:
+     """Compute exact match metric."""
+     exact_match = {}
+     exact_match["all"] = {}
+     exact_match["all"]["count"] = 0
+     exact_match["all"]["exact"] = 0
+     for prediction, reference, gold_db, category in tqdm(
+         zip(predictions, references, gold_dbs, categories), total=len(predictions)
+     ):
+         if category not in exact_match:
+             exact_match[category] = {}
+             exact_match[category]["count"] = 0
+             exact_match[category]["exact"] = 0
+         exact_match["all"]["count"] += 1
+         exact_match[category]["count"] += 1
+         try:
+             # Compare with str.strip() to normalize surrounding whitespace;
+             # Python strings have no .trim() method.
+             match = int(prediction.strip() == reference.strip())
+             exact_match[category]["exact"] += match
+             exact_match["all"]["exact"] += match
+         except Exception:
+             pass
+     return exact_match
114
+
115
+
116
+ def compute_test_suite_metric(
117
+ predictions: list,
118
+ references: list,
119
+ gold_dbs: list,
120
+ setup_sqls: list,
121
+ validate_sqls: list,
122
+ kmaps: dict,
123
+ db_dir: str,
124
+ categories: list[str] | None = None,
125
+ ) -> tuple[Any, list[int | None]]:
126
+ """Compute test suite execution metric."""
127
+ evaluator = test_suite_evaluation.Evaluator(
128
+ db_dir=db_dir,
129
+ kmaps=kmaps,
130
+ etype="exec",
131
+ plug_value=False,
132
+ keep_distinct=False,
133
+ progress_bar_for_each_datapoint=False,
134
+ )
135
+ # Only used for Sparc/CoSQL
136
+ turn_scores: dict[str, list] = {"exec": [], "exact": []}
137
+ by_row_metrics: list[int | None] = []
138
+ for prediction, reference, gold_db, setup_sql, validate_sql, category in tqdm(
139
+ zip(predictions, references, gold_dbs, setup_sqls, validate_sqls, categories),
140
+ total=len(predictions),
141
+ ):
142
+ turn_idx = 0
143
+ # skip final utterance-query pairs
144
+ if turn_idx < 0:
145
+ continue
146
+
147
+ # Register the timeout handler function
148
+ signal.signal(signal.SIGALRM, timeout_handler)
149
+ signal.alarm(TIMEOUT_SECONDS)
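+ # The OS will deliver SIGALRM after TIMEOUT_SECONDS, which timeout_handler
+ # converts into a TimeoutError; signal.alarm(0) below cancels the pending
+ # alarm once evaluate_one returns in time.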
150
+
151
+ try:
152
+ ex_metrics = evaluator.evaluate_one(
153
+ gold_db,
154
+ reference,
155
+ prediction,
156
+ setup_sql,
157
+ validate_sql,
158
+ turn_scores,
159
+ idx=turn_idx,
160
+ category=category,
161
+ )
162
+ signal.alarm(0)
163
+
164
+ by_row_metrics.append(int(ex_metrics["exec"]))
165
+ except Exception as e:
166
+ signal.alarm(0)  # cancel the pending alarm before handling the failure
+ console.print(f"[red]Error evaluating example: {e}[/red]")
167
+ by_row_metrics.append(None)
168
+ pass
169
+ evaluator.finalize()
170
+ return evaluator.scores, by_row_metrics
171
+
172
+
173
+ def compute_metrics(
174
+ gold_sqls: list[str],
175
+ pred_sqls: list[str],
176
+ gold_dbs: list[str],
177
+ setup_sqls: list[str],
178
+ validate_sqls: list[str],
179
+ kmaps: dict,
180
+ db_schemas: dict,
181
+ database_dir: str,
182
+ lowercase_schema_match: bool,
183
+ model_name: str,
184
+ categories: list[str] | None = None,
185
+ ) -> dict[str, str]:
186
+ """Compute all metrics for data slice."""
187
+ if len(gold_sqls) != len(pred_sqls):
188
+ raise ValueError(
189
+ f"Gold {len(gold_sqls)} and pred {len(pred_sqls)} have different number of lines!"
190
+ )
191
+ all_metrics: dict[str, Any] = {}
192
+
193
+ # Execution Accuracy
194
+ metrics, by_row_metrics = compute_test_suite_metric(
195
+ pred_sqls,
196
+ gold_sqls,
197
+ gold_dbs,
198
+ setup_sqls,
199
+ validate_sqls,
200
+ kmaps,
201
+ database_dir,
202
+ categories,
203
+ )
204
+ all_metrics["exec"] = metrics
205
+ all_metrics["by_row_exec"] = by_row_metrics
206
+ print_scores(metrics, model_name, "exec")
207
+
208
+ # Exact Match Accuracy
209
+ metrics = compute_exact_match_metric(
210
+ pred_sqls, gold_sqls, gold_dbs, kmaps, database_dir, categories
211
+ )
212
+ all_metrics["exact"] = metrics
213
+ print_scores(metrics, model_name, "exact")
214
+
215
+ # Equality Accuracy
216
+ per_row_match = [
217
+ int(gold.lower() == pred.lower()) for gold, pred in zip(gold_sqls, pred_sqls)
218
+ ]
219
+ all_metrics["equality"] = {"equality": sum(per_row_match) / len(gold_sqls)}
220
+ all_metrics["by_row_equality"] = per_row_match
221
+
222
+ # Edit Distance
223
+ per_row_edit_dist = [
224
+ edit_distance(gold, pred) for gold, pred in zip(gold_sqls, pred_sqls)
225
+ ]
226
+ edit_dist = sum(per_row_edit_dist) / len(gold_sqls)
227
+ all_metrics["edit_distance"] = {"edit_distance": edit_dist}
228
+ all_metrics["by_row_edit_distance"] = per_row_edit_dist
229
+
230
+ return all_metrics
231
+
232
+
233
+ def get_to_print(metrics: dict, key: str, model_name: str, num_rows: int) -> dict:
234
+ """Get pretty print dictionary of metrics."""
235
+ return {
236
+ "slice": key,
237
+ "model": model_name,
238
+ "support": num_rows,
239
+ "exec": f"{metrics[key]['exec']['all']['exec']:.3f}",
240
+ "exact": f"{metrics[key]['exact']['all']['exact']:.3f}",
241
+ "equality": f"{metrics[key]['equality']['equality']:.3f}",
242
+ "edit_distance": f"{metrics[key]['edit_distance']['edit_distance']:.3f}",
243
+ }
244
+
245
+
246
+ @click.group()
247
+ def cli() -> None:
248
+ """Entrypoint."""
249
+ pass
250
+
251
+
252
+ @cli.command()
253
+ @click.option("--gold", type=str, required=True)
254
+ @click.option("--pred", type=str, required=True)
255
+ @click.option("--tables", type=str, required=True)
256
+ @click.option("--db", type=str, default="")
257
+ @click.option("--slice-attribute", type=str, default=None)
258
+ @click.option("--output-dir", type=str, default="")
259
+ @click.option("--output-filename", type=str, default="")
260
+ @click.option(
261
+ "--correct-sql-casing", type=bool, is_flag=True, default=False, required=False
262
+ )
263
+ @click.option(
264
+ "--lowercase-schema-match", type=bool, is_flag=True, default=False, required=False
265
+ )
266
+ def evaluate(
267
+ gold: str,
268
+ pred: str,
269
+ tables: str,
270
+ db: str,
271
+ slice_attribute: str,
272
+ output_dir: str,
273
+ output_filename: str,
274
+ correct_sql_casing: bool,
275
+ lowercase_schema_match: bool,
276
+ ) -> None:
277
+ """Evaluate SQL.
278
+
279
+ Args:
280
+ gold: path to gold sql file.
281
+ pred: path to predicted json lines file.
282
+ tables: the json path of the table metadata.
283
+ db: path to database dir.
284
+ slice_attribute: json attribute in gold data to slice on.
285
+ output_dir: the prediction output directory
286
+ output_filename: the prediction output filename
287
+ correct_sql_casing: whether to correct casing of SQL keywords
288
+ lowercase_schema_match: whether to lowercase schema match
289
+ """
290
+ gold_path = Path(gold)
291
+ pred_path = Path(pred)
292
+ model_name = pred_path.stem
293
+ if not output_filename:
294
+ output_filename = pred_path.stem + "_eval.json"
295
+ console.print(f"Saving to {Path(output_dir) / output_filename}")
296
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
297
+
298
+ kmaps = test_suite_evaluation.build_foreign_key_map_from_json(tables)
299
+ db_schemas = read_tables_json(tables)
300
+
301
+ gold_sqls_dict = json.load(gold_path.open("r", encoding="utf-8"))
302
+ pred_sqls_dict = [json.loads(l) for l in pred_path.open("r").readlines()]
303
+
304
+ # Data validation
305
+ assert len(gold_sqls_dict) == len(
306
+ pred_sqls_dict
307
+ ), "Sample size doesn't match between pred and gold file"
308
+
309
+ # Keep track of everything
310
+ full_results = []
311
+ for gold_sql, pred_sql in zip(gold_sqls_dict, pred_sqls_dict):
312
+ merged_res = {**pred_sql, **gold_sql}
313
+ full_results.append(merged_res)
314
+
315
+ gold_sqls = [
316
+ re.sub(r"[\s\t\n]+", " ", p.get("gold", p.get("query", p.get("sql", ""))))
317
+ for p in gold_sqls_dict
318
+ ]
319
+ setup_sqls = [re.sub(r"[\s\t\n]+", " ", p["setup_sql"]) for p in gold_sqls_dict]
320
+ validate_sqls = [
321
+ re.sub(r"[\s\t\n]+", " ", p["validation_sql"]) for p in gold_sqls_dict
322
+ ]
323
+ gold_dbs = [p.get("db_id", p.get("db", "")) for p in gold_sqls_dict]
324
+ pred_sqls = [re.sub(r"[\s\t\n]+", " ", p["pred"]) for p in pred_sqls_dict]
325
+ categories = [p.get("category", "") for p in gold_sqls_dict]
326
+ if correct_sql_casing:
327
+ # Normalize casing of SQL keywords via correct_casing(sql)
328
+ gold_sqls = [correct_casing(sql) for sql in gold_sqls]
329
+ pred_sqls = [correct_casing(sql) for sql in pred_sqls]
330
+
331
+ final_metrics: dict[str, dict[str, Any]] = {}
332
+ to_print = []
333
+ final_metrics["all"] = compute_metrics(
334
+ gold_sqls=gold_sqls,
335
+ pred_sqls=pred_sqls,
336
+ gold_dbs=gold_dbs,
337
+ setup_sqls=setup_sqls,
338
+ validate_sqls=validate_sqls,
339
+ kmaps=kmaps,
340
+ db_schemas=db_schemas,
341
+ database_dir=db,
342
+ lowercase_schema_match=lowercase_schema_match,
343
+ model_name=model_name + "(all)",
344
+ categories=categories,
345
+ )
346
+
347
+ for k, v in final_metrics["all"].items():
348
+ if k.startswith("by_row"):
349
+ assert len(v) == len(gold_sqls)
350
+ for dct, val in zip(full_results, v):
351
+ dct[k[len("by_row_") :]] = val
352
+ to_print.append(get_to_print(final_metrics, "all", model_name, len(gold_sqls)))
353
+ # TODO: could be way more efficient if we subsliced the results but...whatever
354
+ if slice_attribute:
355
+ for unq_value in sorted(set([g[slice_attribute] for g in gold_sqls_dict])):
356
+ idx_set = [
357
+ i
358
+ for i, g in enumerate(gold_sqls_dict)
359
+ if g[slice_attribute] == unq_value
360
+ ]
361
+ print(f"Processing {unq_value} with {len(idx_set)} samples")
362
+ final_metrics[unq_value] = compute_metrics(
363
+ gold_sqls=[gold_sqls[i] for i in idx_set],
364
+ pred_sqls=[pred_sqls[i] for i in idx_set],
365
+ gold_dbs=[gold_dbs[i] for i in idx_set],
366
+ setup_sqls=[setup_sqls[i] for i in idx_set],
367
+ validate_sqls=[validate_sqls[i] for i in idx_set],
368
+ kmaps=kmaps,
369
+ db_schemas=db_schemas,
370
+ database_dir=db,
371
+ lowercase_schema_match=lowercase_schema_match,
372
+ model_name=model_name + f"({unq_value})",
373
+ categories=[categories[i] for i in idx_set],
374
+ )
375
+ to_print.append(
376
+ get_to_print(final_metrics, unq_value, model_name, len(idx_set))
377
+ )
378
+
379
+ df = pd.DataFrame(to_print)
380
+ console.print(df.to_csv(sep=",", index=False))
381
+ console.print("******")
382
+ console.print(f"Saved metrics to {Path(output_dir) / output_filename}")
383
+ with open(Path(output_dir) / output_filename, "w") as f:
+ json.dump(final_metrics, f, indent=4)
384
+ output_filename = str(output_filename).replace("_eval.json", "_fd.jsonl")
385
+ console.print(f"Saved dump to {Path(output_dir) / output_filename}")
386
+ with open(Path(output_dir) / output_filename, "w") as f:
387
+ for dct in full_results:
388
+ f.write(json.dumps(dct) + "\n")
389
+
390
+
391
+ if __name__ == "__main__":
392
+ cli()
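+ # Illustrative invocation (paths are hypothetical):
+ #   python evaluate.py evaluate --gold eval/data/dev.json \
+ #     --pred output/predictions.json --tables eval/data/tables.json \
+ #     --db eval/data/databases/ --output-dir output/ --slice-attribute category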
duckdb-nsql/eval/get_manifest.py ADDED
@@ -0,0 +1,29 @@
1
+ """Manifest utils."""
2
+ from manifest import Manifest
3
+ from manifest.connections.client_pool import ClientConnection
4
+
5
+
6
+ def get_manifest(
7
+ manifest_client: str,
8
+ manifest_connection: str,
9
+ manifest_engine: str,
10
+ ) -> Manifest:
11
+ """Get manifest engine."""
12
+ if manifest_client in {"openai", "openaichat", "openai_mock", "openrouter", "azureendpoint"}:
13
+ manifest = Manifest(
14
+ client_name=manifest_client,
15
+ engine=manifest_engine,
16
+ )
17
+ elif manifest_client in {"huggingface"}:
18
+ manifest = Manifest(
19
+ client_pool=[
20
+ ClientConnection(
21
+ client_name=manifest_client,
22
+ client_connection=manifest_conn,
23
+ )
24
+ for manifest_conn in manifest_connection.split(";")
25
+ ],
26
+ )
27
+ else:
28
+ raise ValueError(f"Unknown manifest client {manifest_client}")
29
+ return manifest
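+ # Minimal usage sketch (client and engine names are illustrative):
+ #   manifest = get_manifest("openai", "", "gpt-4o")
+ #   completion = manifest.run("SELECT 1;")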
duckdb-nsql/eval/loaders.py ADDED
@@ -0,0 +1,78 @@
1
+ """Data loaders."""
2
+ import json
3
+ import re
4
+ import string
5
+ from abc import ABC, abstractmethod
6
+
7
+ from rich.console import Console
8
+ from data_utils import read_tables_json
9
+ from schema import Table
10
+
11
+ RE_COLUMN = re.compile(r"^select (.+?) from")
12
+ RE_CONDS = re.compile(r"where (.+?)$")
13
+ RE_COND = re.compile(r"^(.+?)\s*([=><])\s*(.+?)$")
14
+
15
+ translator = str.maketrans(
16
+ string.punctuation, " " * len(string.punctuation)
17
+ ) # map punctuation to space
18
+
19
+ console = Console(soft_wrap=True)
20
+
21
+
22
+ def standardize_column(col: str) -> str:
23
+ """Standardize the column name to SQL compatible."""
24
+ col_name = col.replace("#", "num").replace("%", "perc")
25
+ col_name = col_name.strip().lower().translate(translator)
26
+ col_name = re.sub("[^0-9a-z ]", " ", col_name).strip()
27
+ col_name = re.sub(" +", "_", col_name)
28
+ if not col_name:
29
+ console.print(f"original {col}, new {col_name}")
30
+ return col_name
31
+
32
+
33
+ def clean_col(col: str) -> str:
34
+ """Remove table name and standardize column name."""
35
+ if "." in col and not col.endswith("."):
36
+ col = col.split(".")[-1]
37
+ return standardize_column(col)
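+ # Example: clean_col("t1.Sales %") drops the table prefix and yields
+ # standardize_column("Sales %") == "sales_perc".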
38
+
39
+
40
+ class Loader(ABC):
41
+ """Loader abstract class."""
42
+
43
+ @classmethod
44
+ @abstractmethod
45
+ def load_data(cls, path: str) -> list[dict]:
46
+ """Load data from path."""
47
+
48
+ @classmethod
49
+ @abstractmethod
50
+ def load_table_metadata(cls, path: str) -> dict[str, dict[str, Table]]:
51
+ """Extract table metadata from table-metadata-path."""
52
+
53
+ @classmethod
54
+ def format_output(cls, prediction: dict) -> dict:
55
+ """Parse for spider format."""
56
+ return prediction
57
+
58
+
59
+ class DefaultLoader(Loader):
60
+ """Spider loader and writer."""
61
+
62
+ @classmethod
63
+ def load_data(cls, path: str) -> list[dict]:
64
+ """Load data from path."""
65
+ try:
66
+ with open(path) as f:
67
+ data = json.loads(f.read())
68
+ except json.decoder.JSONDecodeError:
69
+ # Try with jsonl
70
+ data = [json.loads(line) for line in open(path)]
71
+ return data
72
+
73
+ @classmethod
74
+ def load_table_metadata(cls, path: str) -> dict[str, dict[str, Table]]:
75
+ """Extract table metadata from table-metadata-path."""
76
+ # load the tables
77
+ db_to_tables = read_tables_json(path, lowercase=True)
78
+ return db_to_tables
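+ # The result is keyed by database name, then table name, e.g. (illustrative):
+ # {"laptop": {"laptops": Table(...), "customers": Table(...)}}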
duckdb-nsql/eval/metric_utils.py ADDED
@@ -0,0 +1,40 @@
1
+ """Utility metrics."""
2
+ import sqlglot
3
+ from rich.console import Console
4
+ from sqlglot import parse_one
5
+
6
+ console = Console(soft_wrap=True)
7
+
8
+
9
+ def correct_casing(sql: str) -> str:
10
+ """Correct casing of SQL."""
11
+ parse: sqlglot.expressions.Expression = parse_one(sql, read="sqlite")
12
+ return parse.sql()
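+ # Example: correct_casing("select a from t") -> "SELECT a FROM t";
+ # sqlglot re-renders the parsed query with normalized keyword casing.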
13
+
14
+
15
+ def prec_recall_f1(gold: set, pred: set) -> dict[str, float]:
16
+ """Compute precision, recall and F1 score."""
17
+ prec = len(gold.intersection(pred)) / len(pred) if pred else 0.0
18
+ recall = len(gold.intersection(pred)) / len(gold) if gold else 0.0
19
+ f1 = 2 * prec * recall / (prec + recall) if prec + recall else 0.0
20
+ return {"prec": prec, "recall": recall, "f1": f1}
21
+
22
+
23
+ def edit_distance(s1: str, s2: str) -> int:
24
+ """Compute edit distance between two strings."""
25
+ # Make sure s1 is the shorter string
26
+ if len(s1) > len(s2):
27
+ s1, s2 = s2, s1
28
+
29
+ distances: list[int] = list(range(len(s1) + 1))
30
+ for i2, c2 in enumerate(s2):
31
+ distances_ = [i2 + 1]
32
+ for i1, c1 in enumerate(s1):
33
+ if c1 == c2:
34
+ distances_.append(distances[i1])
35
+ else:
36
+ distances_.append(
37
+ 1 + min((distances[i1], distances[i1 + 1], distances_[-1]))
38
+ )
39
+ distances = distances_
40
+ return distances[-1]
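+ # Classic Levenshtein example: edit_distance("kitten", "sitting") == 3
+ # (substitute k->s, substitute e->i, insert g).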
duckdb-nsql/eval/metrics/test_suite_sql_eval ADDED
@@ -0,0 +1 @@
1
+ Subproject commit 640a12975abf75a94e917caca149d56dbc6bcdd7
duckdb-nsql/eval/predict.py ADDED
@@ -0,0 +1,345 @@
1
+ """Run dataset on text2sql zazu experiment.
2
+
3
+ See README.md for more details.
4
+ """
5
+ import datetime
6
+ import json
7
+ import multiprocessing
8
+ import random
9
+ import re
10
+ from pathlib import Path
11
+
12
+ import click
13
+ import numpy as np
14
+ from constants import PROMPT_FORMATTERS
15
+ from loaders import DefaultLoader
16
+ from get_manifest import get_manifest
17
+ from manifest import Manifest
18
+ from prompt_formatters import RajkumarFormatter
19
+ from rich.console import Console
20
+ from schema import Table, TextToSQLModelResponse, TextToSQLParams
21
+ from text_to_sql import instruction_to_sql, instruction_to_sql_list
22
+ from doc_retriever import (
23
+ load_documentation,
24
+ split_documents,
25
+ embed_documents,
26
+ query_docs,
27
+ )
28
+ from tqdm import tqdm
29
+ from transformers import AutoTokenizer
30
+
31
+ console = Console(soft_wrap=True)
32
+
33
+
34
+ def generate_sql(
35
+ manifest: Manifest,
36
+ text_to_sql_in: list[TextToSQLParams],
37
+ retrieved_docs: list[list[str]],
38
+ prompt_formatter: RajkumarFormatter,
39
+ stop_tokens: list[str] | None = None,
40
+ overwrite_manifest: bool = False,
41
+ max_tokens: int = 300,
42
+ temperature: float = 0.1,
43
+ num_beams: int = 2,
44
+ parallel: bool = False,
45
+ ) -> list[tuple[str, TextToSQLModelResponse]]:
46
+ """Call our text2sql function with manifest of our choice."""
47
+ if parallel:
48
+ instruction_to_sql_resps: list[
49
+ TextToSQLModelResponse
50
+ ] = instruction_to_sql_list(
51
+ params=text_to_sql_in,
52
+ extra_context=retrieved_docs,
53
+ manifest=manifest,
54
+ prompt_formatter=prompt_formatter,
55
+ overwrite_manifest=overwrite_manifest,
56
+ max_tokens=max_tokens,
57
+ temperature=temperature,
58
+ stop_sequences=stop_tokens,
59
+ num_beams=num_beams,
60
+ )
61
+ else:
62
+ instruction_to_sql_resps = [
63
+ instruction_to_sql(
64
+ params=_text_to_sql_in,
65
+ extra_context=_retrieved_docs,
66
+ manifest=manifest,
67
+ prompt_formatter=prompt_formatter,
68
+ overwrite_manifest=overwrite_manifest,
69
+ max_tokens=max_tokens,
70
+ temperature=temperature,
71
+ stop_sequences=stop_tokens,
72
+ num_beams=num_beams,
73
+ )
74
+ for _retrieved_docs, _text_to_sql_in in tqdm(
75
+ zip(retrieved_docs, text_to_sql_in),
76
+ desc="Generating SQL",
77
+ total=len(text_to_sql_in),
78
+ disable=(len(text_to_sql_in) <= 1),
79
+ )
80
+ ]
81
+ assert len(instruction_to_sql_resps) == len(text_to_sql_in)
82
+
83
+ sql_statements = []
84
+ for i in range(len(instruction_to_sql_resps)):
85
+ sql_statement = instruction_to_sql_resps[i].output.strip()
86
+ if "<>" in sql_statement:
87
+ sql_statement = sql_statement.replace("<>", "!=")
88
+ # Models are sometimes trained to predict <databasename/schema> | <sql>
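+ # e.g. "flights | SELECT origin FROM flights" -> "SELECT origin FROM flights"
+ # (database name here is illustrative)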
89
+ sql_statement = sql_statement.split("|")[-1].strip()
90
+ sql_statements.append(sql_statement)
91
+ return list(zip(sql_statements, instruction_to_sql_resps))
92
+
93
+
94
+ def get_text_to_sql_in(
95
+ input_question: dict, db_to_tables: dict[str, dict[str, Table]]
96
+ ) -> TextToSQLParams:
97
+ """Format input question for text2sql function."""
98
+ question = input_question["question"]
99
+ db_id = input_question.get("db_id", None)
100
+ if db_id != "none":
101
+ table_params = list(db_to_tables.get(db_id, {}).values())
102
+ else:
103
+ table_params = []
104
+ if len(table_params) == 0:
105
+ console.print(f"[red] WARNING: No tables found for {db_id} [/red]")
106
+ text_to_sql_in = TextToSQLParams(
107
+ instruction=question,
108
+ database=db_id,
109
+ tables=table_params,
110
+ )
111
+ return text_to_sql_in
112
+
113
+
114
+ @click.group()
115
+ def cli() -> None:
116
+ """Entrypoint."""
117
+ pass
118
+
119
+
120
+ @cli.command()
121
+ @click.argument("dataset-path")
122
+ @click.argument("table-meta-path")
123
+ @click.option("--output-dir", type=str, default="")
124
+ @click.option("--run-name", type=str, default="")
125
+ @click.option("--num-run", type=int, default=-1)
126
+ @click.option("--num-print", type=int, default=20)
127
+ # Format options
128
+ @click.option("--prompt-format", type=str, default="spider")
129
+ # Prompt options
130
+ @click.option("--stop-tokens", type=str, default=[], multiple=True)
131
+ @click.option("--max-tokens", type=int, default=200)
132
+ @click.option("--temperature", type=float, default=0)
133
+ @click.option("--num-beams", type=int, default=-1) # use whatever is in manifest
134
+ @click.option("--max-context-length", type=int, default=-1)
135
+ # Docs options
136
+ @click.option(
137
+ "--markdown-docs-path",
138
+ #type=click.Path(
139
+ # exists=True, file_okay=True, dir_okay=True, readable=True, path_type=Path
140
+ #),
141
+ default="eval/docs/duckdb-web/docs/archive/0.9.2/sql",
142
+ )
143
+ @click.option("--num-retrieved-docs", type=int, default=0)
144
+ # Manifest options
145
+ @click.option("--manifest-client", type=str, default="openai")
146
+ @click.option("--manifest-engine", type=str, default="gpt-4o")
147
+ @click.option("--manifest-connection", type=str, default="http://localhost:5005")
148
+ @click.option("--overwrite-manifest", is_flag=True, default=False)
149
+ @click.option("--parallel", is_flag=True, default=False)
150
+ def predict(
151
+ dataset_path: str,
152
+ table_meta_path: str,
153
+ output_dir: str,
154
+ run_name: str,
155
+ num_run: int,
156
+ num_print: int,
157
+ prompt_format: str,
158
+ stop_tokens: list[str],
159
+ max_tokens: int,
160
+ temperature: float,
161
+ num_beams: int,
162
+ max_context_length: int,
163
+ markdown_docs_path: Path,
164
+ num_retrieved_docs: int,
165
+ manifest_client: str,
166
+ manifest_engine: str,
167
+ manifest_connection: str,
168
+ overwrite_manifest: bool,
169
+ parallel: bool,
170
+ ) -> None:
171
+ """Predict SQL.
172
+
173
+ Args:
174
+ dataset_path: the dataset path.
175
+ table_meta_path: the json path of the table metadata.
176
177
+ output_dir: the prediction output directory
178
+ run_name: special prefix to add to filename
179
+ num_run: the number of examples to run
180
+ num_print: the number of examples to print
181
+ prompt_format: the format of the prompt. E.g., "rajkumar"
182
+ stop_tokens: the stop tokens to try
183
+ max_tokens: the max tokens
184
+ temperature: the temperature
185
+ num_beams: the number of beams
186
+ max_context_length: max context length for demonstration truncation (-1 means None)
187
+ markdown_docs_path: path to duckdb sql docs
188
+ num_retrieved_docs: number of docs to retrieve
189
+ manifest_client: the manifest client
190
+ manifest_engine: the manifest engine
191
+ manifest_connection: the manifest connection
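+ overwrite_manifest: whether to overwrite cached manifest responses
+ parallel: whether to issue the text-to-SQL requests in parallel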
192
+ """
193
+ multiprocessing.set_start_method("spawn", force=True)
194
+ random.seed(0)
195
+ np.random.seed(0)
196
+ locals_dict = locals()
197
+ locals_dict["markdown_docs_path"] = str(markdown_docs_path)
198
+ console.print(json.dumps(locals_dict, indent=2))
199
+
200
+ data_formatter = DefaultLoader()
201
+
202
+ if prompt_format not in PROMPT_FORMATTERS:
203
+ raise ValueError(f"Unknown prompt format {prompt_format}")
204
+ prompt_formatter = PROMPT_FORMATTERS[prompt_format]()
205
+
206
+ # load manifest
207
+ manifest = get_manifest(
208
+ manifest_client=manifest_client,
209
+ manifest_connection=manifest_connection,
210
+ manifest_engine=manifest_engine,
211
+ )
212
+ manifest_params = manifest.client_pool.get_current_client().get_model_params()
213
+ console.print(f"Running with {manifest_params} manifest.")
214
+ model_name = manifest_params.get("engine", manifest_params["model_name"])
215
+
216
+ if manifest_client in {"openai", "openaichat", "openrouter", "azureendpoint"}:
217
+ tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
218
+ else:
219
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
220
+
221
+ if stop_tokens:
222
+ stop_tokens = [st.strip("'") for st in stop_tokens]
223
+ console.print(f"Stop tokens: {stop_tokens}")
224
+
225
+ # Get output filename
226
+ full_dataset_path = Path(dataset_path)
227
+ # Get todays date
228
+ date_today = datetime.datetime.now().strftime("%y-%m-%d")
229
+ if run_name:
230
+ run_name = f"{run_name}_"
231
+ suffix = f"{run_name}{full_dataset_path.stem}_{date_today}.json" # noqa: E501
232
+ prefix = f"{prompt_format}_{num_retrieved_docs}docs"
233
+ if manifest_client in {"openai", "openaiazure"}:
234
+ middleix = manifest_engine
235
+ elif manifest_client in {"huggingface", "ray"}:
236
+ middleix = Path(manifest_params.get("model_path", "")).name.replace("/", "-")
237
+ elif manifest_client in {"toma", "openrouter", "openaichat", "azureendpoint"}:
238
+ middleix = manifest_engine.split("/")[-1]
239
+ else:
240
+ raise ValueError(f"Unknown manifest client {manifest_client}")
241
+ output_filename = f"{prefix}_{middleix}_{suffix}"
242
+ console.print(f"Saving to {Path(output_dir) / output_filename}")
243
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
244
+
245
+ console.print("Loading metadata...")
246
+ db_to_tables = data_formatter.load_table_metadata(table_meta_path)
247
+
248
+ console.print("Loading data...")
249
+ data = data_formatter.load_data(dataset_path)
250
+ if num_run > 0:
251
+ console.print(f"Running on {min(len(data), num_run)} examples")
252
+ data = data[:num_run]
253
+ original_data = data
254
+
255
+ # load the examples
256
+ console.print("Formatting data...")
257
+ num_print = min(num_print, len(data))
258
+ token_lengths = []
259
+ text_to_sql_in = [
260
+ get_text_to_sql_in(input_question, db_to_tables) for input_question in data
261
+ ]
262
+
263
+ if num_retrieved_docs > 0:
264
+ console.print("Loading documenration and indexing...")
265
+ retrieved_docs = []
266
+ doc_contents = load_documentation(markdown_docs_path)
267
+ chunked_docs = split_documents(doc_contents)
268
+ embedded_docs, full_embedding_mat = embed_documents(chunked_docs)
269
+ for i in tqdm(range(len(text_to_sql_in)), desc="Retrieving docs"):
270
+ _, retrieved_docs_strings = query_docs(
271
+ text_to_sql_in[i].instruction,
272
+ embedded_docs,
273
+ full_embedding_mat,
274
+ top_n=num_retrieved_docs,
275
+ )
276
+ retrieved_docs.append(retrieved_docs_strings)
277
+ else:
278
+ retrieved_docs = [[] for _ in range(len(text_to_sql_in))]
279
+
280
+ for i in range(num_print):
281
+ # Run a few to get some examples to print
282
+ generated_responses = generate_sql(
283
+ manifest=manifest,
284
+ text_to_sql_in=[text_to_sql_in[i]],
285
+ retrieved_docs=[retrieved_docs[i]],
286
+ stop_tokens=stop_tokens,
287
+ max_tokens=max_tokens,
288
+ temperature=temperature,
289
+ num_beams=num_beams,
290
+ prompt_formatter=prompt_formatter,
291
+ overwrite_manifest=overwrite_manifest,
292
+ parallel=parallel,
293
+ )
294
+ for prediction, model_response in generated_responses:
295
+ prediction = re.sub(r"[\s\t\n]+", " ", prediction)
296
+ token_lengths.append(len(tokenizer(prediction).input_ids))
297
+ console.print(f"[blue]Prompt:[/blue] {model_response.final_prompt}")
298
+ console.print(f"[red]Prediction:[/red] {prediction}")
299
+ if data[i].get("query") or data[i].get("sql"):
300
+ console.print(
301
+ "[purple]Gold:[/purple] "
302
+ f"{data[i].get('query') or data[i].get('sql')}"
303
+ )
304
+ console.print("\n****\n")
305
+
306
+ # Run the entire thing now - the to_print results will be in cache and fast
307
+ generated_sqls = generate_sql(
308
+ manifest=manifest,
309
+ text_to_sql_in=text_to_sql_in,
310
+ retrieved_docs=retrieved_docs,
311
+ stop_tokens=stop_tokens,
312
+ max_tokens=max_tokens,
313
+ temperature=temperature,
314
+ num_beams=num_beams,
315
+ prompt_formatter=prompt_formatter,
316
+ overwrite_manifest=overwrite_manifest,
317
+ parallel=parallel,
318
+ )
319
+
320
+ with open(Path(output_dir) / output_filename, "w") as fout:
321
+ for i, (prediction, model_response) in enumerate(generated_sqls):
322
+ if isinstance(model_response.final_prompt, str):
323
+ token_lengths.append(
324
+ len(tokenizer(model_response.final_prompt).input_ids)
325
+ )
326
+ else:
327
+ for prompt in model_response.final_prompt:
328
+ token_lengths.append(len(tokenizer(prompt["content"]).input_ids))
329
+ entry = {
330
+ **original_data[i],
331
+ "pred": prediction,
332
+ "raw_pred": model_response.output,
333
+ "raw_output": model_response.raw_output,
334
+ "prompt": model_response.final_prompt,
335
+ "tables": [tbl.dict() for tbl in text_to_sql_in[i].tables or []],
336
+ }
337
+ formatted_entry = data_formatter.format_output(entry)
338
+ print(json.dumps(formatted_entry), file=fout)
339
+ overflow = len([tl for tl in token_lengths if tl > 2048]) / len(token_lengths)
340
+ console.print(f"Overflow 2048 prompt {100*overflow:.2f}%")
341
+ console.print(f"Saved to {Path(output_dir) / output_filename}")
342
+
343
+
344
+ if __name__ == "__main__":
345
+ cli()
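+ # Illustrative invocation (paths, prompt format, and engine are hypothetical):
+ #   python predict.py predict eval/data/dev.json eval/data/tables.json \
+ #     --output-dir output/ --prompt-format duckdbinst \
+ #     --manifest-client openai --manifest-engine gpt-4o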
duckdb-nsql/eval/prompt_formatters.py ADDED
@@ -0,0 +1,1056 @@
1
+ """Rajkumar prompt formatter."""
2
+
3
+ from random import shuffle
4
+ from manifest import Manifest
5
+ from schema import Table
6
+ import re
7
+
8
+
9
+ class RajkumarFormatter:
10
+ """RajkumarFormatter class.
11
+
12
+ From https://arxiv.org/pdf/2204.00498.pdf.
13
+ """
14
+
15
+ table_sep: str = "\n\n"
16
+ shuffle_table_order: bool = True
17
+ _cache: dict[tuple[str, str, str], list[str]] = {}
18
+ clean_whitespace = False
19
+
20
+ @classmethod
21
+ def format_table(cls, table: Table) -> str:
22
+ """Get table format."""
23
+ table_fmt = []
24
+ for col in table.columns or []:
25
+ # This is technically an incorrect type, but it should be a catchall word
26
+ table_fmt.append(f" {col.name} {col.dtype or 'any'}")
27
+ if table_fmt:
28
+ all_cols = ",\n".join(table_fmt)
29
+ create_tbl = f"CREATE TABLE {table.name} (\n{all_cols}\n)"
30
+ else:
31
+ create_tbl = f"CREATE TABLE {table.name}"
32
+ return create_tbl
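+ # Example output for a hypothetical table laptops(model VARCHAR, price DOUBLE):
+ # CREATE TABLE laptops (
+ #  model VARCHAR,
+ #  price DOUBLE
+ # )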
33
+
34
+ @classmethod
35
+ def format_all_tables(cls, tables: list[Table], instruction: str) -> list[str]:
36
+ """Get all tables format."""
37
+ table_texts = [cls.format_table(table) for table in tables]
38
+ key = ("tables", instruction, str(tables))
39
+ if key not in cls._cache:
40
+ if cls.shuffle_table_order:
+ shuffle(table_texts)
41
+ cls._cache[key] = table_texts
42
+ else:
43
+ table_texts = cls._cache[key]
44
+ return table_texts
45
+
46
+ @classmethod
47
+ def format_retrieved_context(
48
+ cls,
49
+ context: list[str],
50
+ ) -> str:
51
+ """Format retrieved context."""
52
+ context_str = "\n--------\n".join(context)
53
+ return f"\n\n/*\nHere is additional documentation about DuckDB that could be useful.\n--------\n{context_str}\n--------\n*/"
54
+
55
+ @classmethod
56
+ def format_prompt(
57
+ cls,
58
+ instruction: str,
59
+ table_text: str,
60
+ context_text: str,
61
+ ) -> str | list[str]:
62
+ """Get prompt format."""
63
+ return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n""" # noqa: E501
64
+
65
+ @classmethod
66
+ def format_model_output(cls, output_sql: str, prompt: str) -> str:
67
+ """Format model output."""
68
+ clean_sql = (output_sql
69
+ .replace('```sql\n', '')
70
+ .replace('```duckdb\n', '')
71
+ .replace('```\n', '')
72
+ .replace('```', '')).strip()
73
+
74
+ if clean_sql.find(';') != -1:
75
+ clean_sql = clean_sql[:clean_sql.find(';')].strip()
76
+
77
+ if not clean_sql.endswith(";"):
78
+ clean_sql += ";"
79
+
80
+ return clean_sql
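+ # Example: format_model_output("```sql\nSELECT 1; extra\n```", prompt)
+ # -> "SELECT 1;" (code fences stripped, everything after the first
+ # semicolon dropped, trailing semicolon re-added).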
81
+
82
+ @classmethod
83
+ def format_gold_output(cls, output_sql: str) -> str:
84
+ """Format gold output for demonstration."""
85
+ return output_sql
86
+
87
+ class MotherDuckFormatter(RajkumarFormatter):
88
+ """MotherDuck class."""
89
+
90
+ @classmethod
91
+ def format_prompt(
92
+ cls,
93
+ instruction: str,
94
+ table_text: str,
95
+ context_text: str,
96
+ ) -> str | list[str]:
97
+ """Get prompt format."""
98
+ return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n```sql\n""" # noqa: E501
99
+
100
+
101
+ class DuckDBFormatter(RajkumarFormatter):
102
+ """DuckDB class."""
103
+
104
+ @classmethod
105
+ def format_prompt(
106
+ cls,
107
+ instruction: str,
108
+ table_text: str,
109
+ context_text: str,
110
+ ) -> str | list[str]:
111
+ """Get prompt format."""
112
+ return f"""{table_text}\n\n\n-- Using valid DuckDB SQL, answer the following question for the tables provided above.{context_text}\n\n-- {instruction}\n```sql\n""" # noqa: E501
113
+
114
+
115
+ class DuckDBInstFormatter(RajkumarFormatter):
116
+ """DuckDB Inst class."""
117
+
118
+ PROMPT_TEMPLATE = """### Instruction:\n{instruction}\n\n### Input:\n{input}{context}\n### Question:\n{question}\n\n### Response (use duckdb shorthand if possible):\n"""
119
+ INSTRUCTION_TEMPLATE = """Your task is to generate valid duckdb SQL to answer the following question{has_schema}""" # noqa: E501
120
+
121
+ @classmethod
122
+ def format_retrieved_context(
123
+ cls,
124
+ context: list[str],
125
+ ) -> str:
126
+ """Format retrieved context."""
127
+ context_str = "\n--------\n".join(context)
128
+ return f"\n### Documentation:\n{context_str}\n"
129
+
130
+ @classmethod
131
+ def format_prompt(
132
+ cls,
133
+ instruction: str,
134
+ table_text: str,
135
+ context_text: str,
136
+ ) -> str | list[str]:
137
+ """Get prompt format."""
138
+ input = ""
139
+ if table_text:
140
+ input = """Here is the database schema that the SQL query will run on:\n{schema}\n""".format( # noqa: E501
141
+ schema=table_text
142
+ )
143
+ instruction = cls.PROMPT_TEMPLATE.format(
144
+ instruction=cls.INSTRUCTION_TEMPLATE.format(
145
+ has_schema="."
146
+ if table_text == ""
147
+ else ", given a duckdb database schema."
148
+ ),
149
+ context=context_text,
150
+ input=input,
151
+ question=instruction,
152
+ )
153
+ return instruction
154
+
155
+ class DuckDBInstFormatterLlamaShort(RajkumarFormatter):
156
+ """DuckDB Inst class."""
157
+
158
+ PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
159
+
160
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
161
+
162
+ Here are some DuckDB SQL syntax specifics you should be aware of:
163
+
164
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
165
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
166
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
167
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
168
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
169
+ - DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
170
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
171
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
172
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
173
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
174
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. e.g `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
175
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
176
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
177
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
178
+ - DuckDB's way of converting between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
179
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
180
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
181
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
182
+ - DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
183
+ - DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
184
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`<|eot_id|>
185
+ <|start_header_id|>user<|end_header_id|>
186
+
187
+ Database Schema:
188
+ Here is the schema of the DuckDB database that the SQL query will run on:
189
+ ```sql
190
+ {schema}
191
+ ```
192
+
193
+ Question:
194
+ Here is the question or an instruction the user provided:
195
+ {question}
196
+
197
+ Task:
198
+ Write a DuckDB SQL query for the given question!<|eot_id|>
199
+ <|start_header_id|>assistant<|end_header_id|>
200
+
201
+ ```sql
202
+ """
203
+
204
+ @classmethod
205
+ def format_retrieved_context(
206
+ cls,
207
+ context: list[str],
208
+ ) -> str:
209
+ """Format retrieved context."""
210
+ context_str = "\n--------\n".join(context)
211
+ return f"\n### Documentation:\n{context_str}\n"
212
+
213
+ @classmethod
214
+ def format_prompt(
215
+ cls,
216
+ instruction: str,
217
+ table_text: str,
218
+ context_text: str,
219
+ ) -> str | list[str]:
220
+ """Get prompt format."""
221
+ instruction = cls.PROMPT_TEMPLATE.format(
222
+ schema=table_text,
223
+ question=instruction
224
+ )
225
+ return instruction
226
+
227
+ class DuckDBInstFormatterLlamaBasic(RajkumarFormatter):
228
+ """DuckDB Inst class."""
229
+
230
+ PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
231
+
232
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.<|eot_id|>
233
+ <|start_header_id|>user<|end_header_id|>
234
+
235
+ Database Schema:
236
+ Here is the schema of the DuckDB database that the SQL query will run on:
237
+ ```sql
238
+ {schema}
239
+ ```
240
+
241
+ Question:
242
+ Here is the question or an instruction the user provided:
243
+ {question}
244
+
245
+ Task:
246
+ Write a DuckDB SQL query for the given question!<|eot_id|>
247
+ <|start_header_id|>assistant<|end_header_id|>
248
+
249
+ ```sql
250
+ """
251
+
252
+ @classmethod
253
+ def format_retrieved_context(
254
+ cls,
255
+ context: list[str],
256
+ ) -> str:
257
+ """Format retrieved context."""
258
+ context_str = "\n--------\n".join(context)
259
+ return f"\n### Documentation:\n{context_str}\n"
260
+
261
+ @classmethod
262
+ def format_prompt(
263
+ cls,
264
+ instruction: str,
265
+ table_text: str,
266
+ context_text: str,
267
+ ) -> str | list[str]:
268
+ """Get prompt format."""
269
+ instruction = cls.PROMPT_TEMPLATE.format(
270
+ schema=table_text,
271
+ question=instruction
272
+ )
273
+ return instruction
274
+
275
+ class DuckDBInstFormatterLlamaSyntax(RajkumarFormatter):
276
+ """DuckDB Inst class."""
277
+
278
+ PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
279
+
280
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
281
+
282
+ Here are some DuckDB SQL syntax specifics you should be aware of:
283
+
284
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity and single quotes (') to define string literals
285
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
286
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
287
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
288
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
289
+ - DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
290
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
291
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
292
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
293
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
294
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. e.g `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
295
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
296
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
297
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
298
+ - DuckDB's way of converting between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
299
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
300
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
301
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
302
+ - DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
303
+ - DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
304
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`<|eot_id|>
305
+ <|start_header_id|>user<|end_header_id|>
306
+
307
+ Database Schema:
308
+ Here is the schema of the DuckDB database that the SQL query will run on:
309
+ ```sql
310
+ {schema}
311
+ ```
312
+
313
+ Question:
314
+ Here is the question or an instruction the user provided:
315
+ {question}
316
+
317
+ Task:
318
+ Write a DuckDB SQL query for the given question!<|eot_id|>
319
+ <|start_header_id|>assistant<|end_header_id|>
320
+
321
+ ```sql
322
+ """
323
+
324
+ @classmethod
325
+ def format_retrieved_context(
326
+ cls,
327
+ context: list[str],
328
+ ) -> str:
329
+ """Format retrieved context."""
330
+ context_str = "\n--------\n".join(context)
331
+ return f"\n### Documentation:\n{context_str}\n"
332
+
333
+ @classmethod
334
+ def format_prompt(
335
+ cls,
336
+ instruction: str,
337
+ table_text: str,
338
+ context_text: str,
339
+ ) -> str | list[str]:
340
+ """Get prompt format."""
341
+ instruction = cls.PROMPT_TEMPLATE.format(
342
+ schema=table_text,
343
+ question=instruction
344
+ )
345
+ return instruction
346
+
347
+ class DuckDBInstFormatterGraniteShort(RajkumarFormatter):
348
+ """DuckDB Inst class."""
349
+
350
+ PROMPT_TEMPLATE = """System:
351
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
352
+
353
+ Here are some DuckDB SQL syntax specifics you should be aware of:
354
+
355
+
356
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity, and single quotes (') to define string literals
357
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
358
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
359
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
360
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
361
+ - DuckDB supports attaching multiple databases, using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using the dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
362
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
363
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
364
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
365
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all: e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
366
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns. e.g `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
367
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
368
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) as my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b Integer[3]));`
369
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed by brackets ([]).
370
+ - DuckDB's way of converting between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');` String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract Year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
371
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
372
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
373
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
374
+ - DuckDB has a JSON data type. It supports selecting fields from the JSON with a JSON-Path expression using the arrow operator, -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
375
+ - DuckDB has built-in functions for regex regexp_matches(column, regex), regexp_replace(column, regex), and regexp_extract(column, regex).
376
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`
377
+
378
+ Here is the schema of the DuckDB database that the SQL query will run on:
379
+ {schema}
380
+
381
+ Question:
382
+ Here is the question or an instruction the user provided:
383
+ {question}
384
+
385
+ Write a DuckDB SQL query for the given question!
386
+
387
+ Answer:
388
+ ```
389
+ """
390
+
391
+ @classmethod
392
+ def format_retrieved_context(
393
+ cls,
394
+ context: list[str],
395
+ ) -> str:
396
+ """Format retrieved context."""
397
+ context_str = "\n--------\n".join(context)
398
+ return f"\n### Documentation:\n{context_str}\n"
399
+
400
+ @classmethod
401
+ def format_prompt(
402
+ cls,
403
+ instruction: str,
404
+ table_text: str,
405
+ context_text: str,
406
+ ) -> str | list[str]:
407
+ """Get prompt format."""
408
+ instruction = cls.PROMPT_TEMPLATE.format(
409
+ schema=table_text,
410
+ question=instruction
411
+ )
412
+ return instruction
413
+
414
+ class DuckDBInstFormatterLlama(RajkumarFormatter):
415
+ """DuckDB Inst class."""
416
+
417
+ PROMPT_TEMPLATE = """<|begin_of_text|>
418
+
419
+ Your task is to generate valid DuckDB SQL to answer the following question, given a DuckDB database schema.
420
+
421
+ ## DuckDB SQL syntax specifics you should be aware of:
422
+
423
+ ### Case Insensitivity and Quoting:
424
+
425
+ Identifiers (tables, columns): Case-insensitive, but DuckDB remembers the case you use. Use double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity.
426
+ ```
427
+ CREATE TABLE "My Table" ("column_name" VARCHAR); -- Spaces and mixed case
428
+ SELECT "column_name" FROM "My Table";
429
+ ```
430
+
431
+ ### String Literals: Always use single quotes (') to define string literals.
432
+ ```
433
+ SELECT 'This is a string' AS text;
434
+ ```
435
+
436
+ ### Direct File Querying: Query CSV, Parquet, and JSON files directly without loading them first.
437
+
438
+ ```
439
+ SELECT * FROM 'data.csv';
440
+ SELECT * FROM 'data.parquet';
441
+ SELECT * FROM 'data.json';
442
+ ```
443
+
444
+ ### CREATE TABLE AS (CTAS): Create tables from query results.
445
+
446
+ ```
447
+ CREATE TABLE squares AS SELECT i, i * i AS square FROM generate_series(1, 10) t(i);
448
+ ```
449
+
450
+ ### FROM-First Syntax (Optional SELECT): Start queries with FROM, and optionally omit SELECT *.
451
+
452
+ ```
453
+ FROM my_table WHERE condition; -- Equivalent to SELECT * FROM my_table WHERE condition
454
+ ```
455
+
456
+ ### SELECT without FROM: DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly.
457
+
458
+ ```
459
+ SELECT 1 + 1 AS result;
460
+ ```
461
+
462
+ ### GROUP BY ALL/ORDER BY ALL: Shorthand for grouping/ordering by all non-aggregated/all columns.
463
+
464
+ ```
465
+ SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;
466
+ SELECT * FROM my_table ORDER BY ALL;
467
+ ```
468
+
469
+ ### SELECT COLUMNS(): Powerful way to select or transform multiple columns using patterns or functions.
470
+
471
+ ```
472
+ -- Select columns matching a pattern
473
+ SELECT COLUMNS('sales_.*') FROM sales_data;
474
+
475
+ -- Transform multiple columns with a function
476
+ SELECT AVG(COLUMNS(*)) FROM sales_data;
477
+ ```
478
+
479
+ ### UNION BY NAME: Combine tables by matching column names, not just their positions.
480
+
481
+ ```
482
+ SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;
483
+ ```
484
+
485
+ ### Implicit/Explicit Casting: DuckDB is generally more lenient with implicit type conversions, but you can always be explicit using ::
486
+
487
+ ```
488
+ SELECT '42' + 1; -- Implicit cast, result is 43
489
+ SELECT '42'::INTEGER + 1; -- Explicit cast, result is 43
490
+ ```
491
+
492
+ ### String/List Slicing: Extract parts of strings and lists using [start:end] or [start:end:step] syntax.
493
+
494
+ ```
495
+ SELECT 'DuckDB'[1:4]; -- Returns 'Duck'
496
+ SELECT [1, 2, 3, 4][1:3]; -- Returns [1, 2, 3]
497
+ ```
498
+
499
+ ### Simple List/Struct/Map/Array Creation: Create complex types using intuitive syntax.
500
+
501
+ In a SELECT statement:
502
+ ```
503
+ SELECT [1, 2, 3] AS my_list, {{'a': 1, 'b': 'text'}} AS my_struct, MAP([1,2],['one','two']) as my_map;
504
+ ```
505
+
506
+ When creating a table:
507
+ ```
508
+ CREATE TABLE data (
509
+ my_list INTEGER[],
510
+ my_struct STRUCT(a INTEGER, b TEXT),
511
+ my_map MAP(INTEGER, VARCHAR),
512
+ my_array INTEGER[3]
513
+ );
514
+ ```
515
+
516
+ ### Timestamp Conversions and Extraction: Convert between text and timestamps, and extract date parts.
517
+
518
+ ```
519
+ SELECT strftime(NOW(), '%Y-%m-%d'); -- Current date as 'YYYY-MM-DD'
520
+ SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP; -- String to timestamp
521
+ SELECT EXTRACT(YEAR FROM DATE '2023-07-23'); -- Extract year
522
+ ```
523
+
524
+ ### Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses.
525
+
526
+ ```
527
+ SELECT a + b AS total
528
+ FROM my_table
529
+ WHERE total > 10
530
+ GROUP BY total
531
+ HAVING total < 20;
532
+ ```
533
+
534
+ ### List Comprehensions: Generate lists using expressions similar to Python list comprehensions.
535
+
536
+ ```
537
+ SELECT [x*2 FOR x IN [1, 2, 3]]; -- Returns [2, 4, 6]
538
+ ```
539
+
540
+ ### Function Chaining: Chain multiple function calls together using the dot (.) operator.
541
+
542
+ ```
543
+ SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB'
544
+ ```
545
+
546
+ ### Regular Expressions: DuckDB has built-in functions for regex matching, replacement, and extraction.
547
+
548
+ ```
549
+ SELECT regexp_matches('DuckDB', 'Duck'); -- Returns true
550
+ SELECT regexp_replace('DuckDB', 'Duck', 'Goose'); -- Returns 'GooseDB'
551
+ SELECT regexp_extract('DuckDB', '(\w+)(DB)', 1); -- Returns 'Duck'
552
+ ```
553
+
554
+ ### Sampling: Quickly get a subset of your data with SAMPLE or TABLESAMPLE.
555
+
556
+ ```
557
+ SELECT * FROM large_table USING SAMPLE 10%; -- Random 10% sample
558
+ SELECT * FROM large_table TABLESAMPLE BERNOULLI(10); -- Bernoulli sampling
559
+ ```
560
+
561
+ ### ATTACH and Access: Attach external databases and reference their objects using databasename.table_name syntax.
562
+
563
+ ```
564
+ ATTACH 'my_database.duckdb' AS mydb;
565
+ SELECT * FROM mydb.my_table;
566
+ ```
567
+
568
+ ### SUMMARIZE: Get summary statistics (min, max, unique count, average, standard deviation, quartiles, and count) of a table.
569
+
570
+ ```
571
+ SUMMARIZE table_name;
572
+ ```
573
+
574
+ ### DESCRIBE: Get schema of a table (column_name, column_type, null, key, default, extra).
575
+
576
+ ```
577
+ DESCRIBE table_name;
578
+ ```
579
+
580
+ Database Schema:
581
+ Here is the schema of the DuckDB database that the SQL query will run on:
582
+ {schema}
583
+
584
+ Question:
585
+ Here is the question or an instruction the user provided:
586
+ {question}
587
+
588
+ Task:
589
+ Write a DuckDB SQL query for the given question!
590
+
591
+ Here is the valid DuckDB SQL query:
592
+ ```
593
+ """
594
+
595
+ @classmethod
596
+ def format_retrieved_context(
597
+ cls,
598
+ context: list[str],
599
+ ) -> str:
600
+ """Format retrieved context."""
601
+ context_str = "\n--------\n".join(context)
602
+ return f"\n### Documentation:\n{context_str}\n"
603
+
604
+ @classmethod
605
+ def format_prompt(
606
+ cls,
607
+ instruction: str,
608
+ table_text: str,
609
+ context_text: str,
610
+ ) -> str | list[str]:
611
+ """Get prompt format."""
612
+ instruction = cls.PROMPT_TEMPLATE.format(
613
+ schema=table_text,
614
+ question=instruction
615
+ )
616
+ return instruction
617
+
618
+ class DuckDBInstFormatterGranite(RajkumarFormatter):
619
+ """DuckDB Inst class."""
620
+
621
+ PROMPT_TEMPLATE = """System:
622
+
623
+ Your task is to generate valid DuckDB SQL to answer the following question, given a DuckDB database schema.
624
+
625
+ ## DuckDB SQL syntax specifics you should be aware of:
626
+
627
+ ### Case Insensitivity and Quoting:
628
+
629
+ Identifiers (tables, columns): Case-insensitive, but DuckDB remembers the case you use. Use double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity.
630
+ ```
631
+ CREATE TABLE "My Table" ("column_name" VARCHAR); -- Spaces and mixed case
632
+ SELECT "column_name" FROM "My Table";
633
+ ```
634
+
635
+ ### String Literals: Always use single quotes (') to define string literals.
636
+ ```
637
+ SELECT 'This is a string' AS text;
638
+ ```
639
+
640
+ ### Direct File Querying: Query CSV, Parquet, and JSON files directly without loading them first.
641
+
642
+ ```
643
+ SELECT * FROM 'data.csv';
644
+ SELECT * FROM 'data.parquet';
645
+ SELECT * FROM 'data.json';
646
+ ```
647
+
648
+ ### CREATE TABLE AS (CTAS): Create tables from query results.
649
+
650
+ ```
651
+ CREATE TABLE squares AS SELECT i, i * i AS square FROM generate_series(1, 10) t(i);
652
+ ```
653
+
654
+ ### FROM-First Syntax (Optional SELECT): Start queries with FROM, and optionally omit SELECT *.
655
+
656
+ ```
657
+ FROM my_table WHERE condition; -- Equivalent to SELECT * FROM my_table WHERE condition
658
+ ```
659
+
660
+ ### SELECT without FROM: DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly.
661
+
662
+ ```
663
+ SELECT 1 + 1 AS result;
664
+ ```
665
+
666
+ ### GROUP BY ALL/ORDER BY ALL: Shorthand for grouping/ordering by all non-aggregated/all columns.
667
+
668
+ ```
669
+ SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;
670
+ SELECT * FROM my_table ORDER BY ALL;
671
+ ```
672
+
673
+ ### SELECT COLUMNS(): Powerful way to select or transform multiple columns using patterns or functions.
674
+
675
+ ```
676
+ -- Select columns matching a pattern
677
+ SELECT COLUMNS('sales_.*') FROM sales_data;
678
+
679
+ -- Transform multiple columns with a function
680
+ SELECT AVG(COLUMNS(*)) FROM sales_data;
681
+ ```
682
+
683
+ ### UNION BY NAME: Combine tables by matching column names, not just their positions.
684
+
685
+ ```
686
+ SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;
687
+ ```
688
+
689
+ ### Implicit/Explicit Casting: DuckDB is generally more lenient with implicit type conversions, but you can always be explicit using ::
690
+
691
+ ```
692
+ SELECT '42' + 1; -- Implicit cast, result is 43
693
+ SELECT '42'::INTEGER + 1; -- Explicit cast, result is 43
694
+ ```
695
+
696
+ ### String/List Slicing: Extract parts of strings and lists using [start:end] or [start:end:step] syntax.
697
+
698
+ ```
699
+ SELECT 'DuckDB'[1:4]; -- Returns 'Duck'
700
+ SELECT [1, 2, 3, 4][1:3]; -- Returns [1, 2, 3]
701
+ ```
702
+
703
+ ### Simple List/Struct/Map/Array Creation: Create complex types using intuitive syntax.
704
+
705
+ In a SELECT statement:
706
+ ```
707
+ SELECT [1, 2, 3] AS my_list, {{'a': 1, 'b': 'text'}} AS my_struct, MAP([1,2],['one','two']) as my_map;
708
+ ```
709
+
710
+ When creating a table:
711
+ ```
712
+ CREATE TABLE data (
713
+ my_list INTEGER[],
714
+ my_struct STRUCT(a INTEGER, b TEXT),
715
+ my_map MAP(INTEGER, VARCHAR),
716
+ my_array INTEGER[3]
717
+ );
718
+ ```
719
+
720
+ ### Timestamp Conversions and Extraction: Convert between text and timestamps, and extract date parts.
721
+
722
+ ```
723
+ SELECT strftime(NOW(), '%Y-%m-%d'); -- Current date as 'YYYY-MM-DD'
724
+ SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP; -- String to timestamp
725
+ SELECT EXTRACT(YEAR FROM DATE '2023-07-23'); -- Extract year
726
+ ```
727
+
728
+ ### Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses.
729
+
730
+ ```
731
+ SELECT a + b AS total
732
+ FROM my_table
733
+ WHERE total > 10
734
+ GROUP BY total
735
+ HAVING total < 20;
736
+ ```
737
+
738
+ ### List Comprehensions: Generate lists using expressions similar to Python list comprehensions.
739
+
740
+ ```
741
+ SELECT [x*2 FOR x IN [1, 2, 3]]; -- Returns [2, 4, 6]
742
+ ```
743
+
744
+ ### Function Chaining: Chain multiple function calls together using the dot (.) operator.
745
+
746
+ ```
747
+ SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB'
748
+ ```
749
+
750
+ ### Regular Expressions: DuckDB has built-in functions for regex matching, replacement, and extraction.
751
+
752
+ ```
753
+ SELECT regexp_matches('DuckDB', 'Duck'); -- Returns true
754
+ SELECT regexp_replace('DuckDB', 'Duck', 'Goose'); -- Returns 'GooseDB'
755
+ SELECT regexp_extract('DuckDB', '(\w+)(DB)', 1); -- Returns 'Duck'
756
+ ```
757
+
758
+ ### Sampling: Quickly get a subset of your data with SAMPLE or TABLESAMPLE.
759
+
760
+ ```
761
+ SELECT * FROM large_table USING SAMPLE 10%; -- Random 10% sample
762
+ SELECT * FROM large_table TABLESAMPLE BERNOULLI(10); -- Bernoulli sampling
763
+ ```
764
+
765
+ ### ATTACH and Access: Attach external databases and reference their objects using databasename.table_name syntax.
766
+
767
+ ```
768
+ ATTACH 'my_database.duckdb' AS mydb;
769
+ SELECT * FROM mydb.my_table;
770
+ ```
771
+
772
+ ### SUMMARIZE: Get summary statistics (min, max, unique count, average, standard deviation, quartiles, and count) of a table.
773
+
774
+ ```
775
+ SUMMARIZE table_name;
776
+ ```
777
+
778
+ ### DESCRIBE: Get schema of a table (column_name, column_type, null, key, default, extra).
779
+
780
+ ```
781
+ DESCRIBE table_name;
782
+ ```
783
+
784
+ Here is the schema of the DuckDB database that the SQL query will run on:
785
+ {schema}
786
+
787
+ Question:
788
+ Here is the question or an instruction the user provided:
789
+ {question}
790
+
791
+ Please write a DuckDB SQL query that answers the user's question or instruction. Use DuckDB-specific syntax if possible.
792
+
793
+ Answer:
794
+ ```
795
+ """
796
+
797
+ @classmethod
798
+ def format_retrieved_context(
799
+ cls,
800
+ context: list[str],
801
+ ) -> str:
802
+ """Format retrieved context."""
803
+ context_str = "\n--------\n".join(context)
804
+ return f"\n### Documentation:\n{context_str}\n"
805
+
806
+ @classmethod
807
+ def format_prompt(
808
+ cls,
809
+ instruction: str,
810
+ table_text: str,
811
+ context_text: str,
812
+ ) -> str | list[str]:
813
+ """Get prompt format."""
814
+ instruction = cls.PROMPT_TEMPLATE.format(
815
+ schema=table_text,
816
+ question=instruction
817
+ )
818
+ return instruction
819
+
820
+ class DuckDBInstFormatterPhi(RajkumarFormatter):
821
+ """DuckDB Inst class."""
822
+
823
+ PROMPT_TEMPLATE = """<|endoftext|><|user|>
824
+ Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
825
+
826
+ Here are some DuckDB SQL syntax specifics you should be aware of:
827
+
828
+
829
+ - DuckDB uses double quotes (") for identifiers that contain spaces or special characters, or to force case-sensitivity, and single quotes (') to define string literals
830
+ - DuckDB can query CSV, Parquet, and JSON directly without loading them first, e.g. `SELECT * FROM 'data.csv';`
831
+ - DuckDB supports CREATE TABLE AS (CTAS): `CREATE TABLE new_table AS SELECT * FROM old_table;`
832
+ - DuckDB queries can start with FROM, and optionally omit SELECT *, e.g. `FROM my_table WHERE condition;` is equivalent to `SELECT * FROM my_table WHERE condition;`
833
+ - DuckDB allows you to use SELECT without a FROM clause to generate a single row of results or to work with expressions directly, e.g. `SELECT 1 + 1 AS result;`
834
+ - DuckDB supports attaching multiple databases using the ATTACH statement: `ATTACH 'my_database.duckdb' AS mydb;`. Tables within attached databases can be accessed using dot notation (.), e.g. `SELECT * FROM mydb.table_name`. The default database doesn't require the dot notation to access tables. The default database can be changed with the USE statement, e.g. `USE my_db;`.
835
+ - DuckDB is generally more lenient with implicit type conversions (e.g. `SELECT '42' + 1;` - Implicit cast, result is 43), but you can always be explicit using `::`, e.g. `SELECT '42'::INTEGER + 1;`
836
+ - DuckDB can extract parts of strings and lists using [start:end] or [start:end:step] syntax. Indexes start at 1. String slicing: `SELECT 'DuckDB'[1:4];`. Array/List slicing: `SELECT [1, 2, 3, 4][1:3];`
837
+ - DuckDB has a powerful way to select or transform multiple columns using patterns or functions. You can select columns matching a pattern: `SELECT COLUMNS('sales_.*') FROM sales_data;` or transform multiple columns with a function: `SELECT AVG(COLUMNS('sales_.*')) FROM sales_data;`
838
+ - DuckDB has an easy way to include/exclude or modify columns when selecting all, e.g. Exclude: `SELECT * EXCLUDE (sensitive_data) FROM users;` Replace: `SELECT * REPLACE (UPPER(name) AS name) FROM users;`
839
+ - DuckDB has a shorthand for grouping/ordering by all non-aggregated/all columns, e.g. `SELECT category, SUM(sales) FROM sales_data GROUP BY ALL;` and `SELECT * FROM my_table ORDER BY ALL;`
840
+ - DuckDB can combine tables by matching column names, not just their positions using UNION BY NAME. E.g. `SELECT * FROM table1 UNION BY NAME SELECT * FROM table2;`
841
+ - DuckDB has an intuitive syntax to create List/Struct/Map and Array types. List: `SELECT [1, 2, 3] AS my_list;`, Struct: `{{'a': 1, 'b': 'text'}} AS my_struct;`, Map: `MAP([1,2],['one','two']) AS my_map;`. All types can also be nested into each other. Array types are fixed size, while list types have variable size. Compared to Structs, MAPs do not need to have the same keys present for each row, but keys can only be of type Integer or Varchar. Example: `CREATE TABLE example (my_list INTEGER[], my_struct STRUCT(a INTEGER, b TEXT), my_map MAP(INTEGER, VARCHAR), my_array INTEGER[3], my_nested_struct STRUCT(a INTEGER, b INTEGER[3]));`
842
+ - DuckDB has an intuitive syntax to access struct fields using dot notation (.) or brackets ([]) with the field name. Map fields can be accessed with brackets ([]).
843
+ - DuckDB can convert between text and timestamps, and extract date parts. Current date as 'YYYY-MM-DD': `SELECT strftime(NOW(), '%Y-%m-%d');`, String to timestamp: `SELECT strptime('2023-07-23', '%Y-%m-%d')::TIMESTAMP;`, Extract year from date: `SELECT EXTRACT(YEAR FROM DATE '2023-07-23');`
844
+ - Column Aliases in WHERE/GROUP BY/HAVING: You can use column aliases defined in the SELECT clause within the WHERE, GROUP BY, and HAVING clauses. E.g.: `SELECT a + b AS total FROM my_table WHERE total > 10 GROUP BY total HAVING total < 20;`
845
+ - DuckDB allows generating lists using expressions similar to Python list comprehensions. E.g. `SELECT [x*2 FOR x IN [1, 2, 3]];` Returns [2, 4, 6].
846
+ - DuckDB allows chaining multiple function calls together using the dot (.) operator. E.g.: `SELECT 'DuckDB'.replace('Duck', 'Goose').upper(); -- Returns 'GOOSEDB';`
847
+ - DuckDB has a JSON data type. It supports selecting fields from JSON using the arrow operator -> (returns JSON) or ->> (returns text) with JSONPath expressions. For example: `SELECT data->'$.user.id' AS user_id, data->>'$.event_type' AS event_type FROM events;`
848
+ - DuckDB has built-in regex functions: regexp_matches(column, regex), regexp_replace(column, regex, replacement), and regexp_extract(column, regex).
849
+ - DuckDB has a way to quickly get a subset of your data with `SELECT * FROM large_table USING SAMPLE 10%;`
850
+
851
+ Here is the schema of the DuckDB database that the SQL query will run on:
852
+ {schema}
853
+
854
+ Question:
855
+ Here is the question or an instruction the user provided:
856
+ {question}
857
+
858
+ Write a DuckDB SQL query for the given question!<|end|>
859
+ <|assistant|>
860
+ ```sql
861
+ """
862
+
863
+ @classmethod
864
+ def format_retrieved_context(
865
+ cls,
866
+ context: list[str],
867
+ ) -> str:
868
+ """Format retrieved context."""
869
+ context_str = "\n--------\n".join(context)
870
+ return f"\n### Documentation:\n{context_str}\n"
871
+
872
+ @classmethod
873
+ def format_prompt(
874
+ cls,
875
+ instruction: str,
876
+ table_text: str,
877
+ context_text: str,
878
+ ) -> str | list[str]:
879
+ """Get prompt format."""
880
+ instruction = cls.PROMPT_TEMPLATE.format(
881
+ schema=table_text,
882
+ question=instruction
883
+ )
884
+ return instruction
885
+
886
+ class DuckDBInstFormatterGPTmini(RajkumarFormatter):
887
+ """DuckDB Inst class."""
888
+
889
+ PROMPT_TEMPLATE = """Schema:
890
+ ```sql
891
+ {schema}
892
+ ```
893
+
894
+ Question:
895
+ {question}
896
+
897
+ Write a valid DuckDB SQL query to answer the question!
898
+ """
899
+
900
+ @classmethod
901
+ def format_retrieved_context(
902
+ cls,
903
+ context: list[str],
904
+ ) -> str:
905
+ """Format retrieved context."""
906
+ context_str = "\n--------\n".join(context)
907
+ return f"\n### Documentation:\n{context_str}\n"
908
+
909
+ @classmethod
910
+ def format_prompt(
911
+ cls,
912
+ instruction: str,
913
+ table_text: str,
914
+ context_text: str,
915
+ ) -> str | list[str]:
916
+ """Get prompt format."""
917
+ instruction = cls.PROMPT_TEMPLATE.format(
918
+ schema=table_text,
919
+ question=instruction
920
+ )
921
+ return instruction
922
+
923
+ class DuckDBInstFormatterPhiAzure(RajkumarFormatter):
924
+ """DuckDB Inst class."""
925
+
926
+ PROMPT_TEMPLATE = """Your task is to generate valid DuckDB SQL to answer the question that the user asks. You should only respond with a valid DuckDB SQL query.
927
+
928
+ Here is the schema of the DuckDB database that the SQL query will run on:
929
+ {schema}
930
+
931
+ Question:
932
+ Here is the question or an instruction the user provided:
933
+ {question}
934
+
935
+ Write a DuckDB SQL query for the given question!
936
+ """
937
+
938
+ @classmethod
939
+ def format_retrieved_context(
940
+ cls,
941
+ context: list[str],
942
+ ) -> str:
943
+ """Format retrieved context."""
944
+ context_str = "\n--------\n".join(context)
945
+ return f"\n### Documentation:\n{context_str}\n"
946
+
947
+ @classmethod
948
+ def format_prompt(
949
+ cls,
950
+ instruction: str,
951
+ table_text: str,
952
+ context_text: str,
953
+ ) -> str | list[str]:
954
+ """Get prompt format."""
955
+ instruction = cls.PROMPT_TEMPLATE.format(
956
+ schema=table_text,
957
+ question=instruction
958
+ )
959
+ return instruction
960
+
961
+ class DuckDBInstNoShorthandFormatter(DuckDBInstFormatter):
962
+ """DuckDB Inst class."""
963
+
964
+ PROMPT_TEMPLATE = """### Instruction:\n{instruction}\n\n### Input:\n{input}{context}\n### Question:\n{question}\n\n### Response:\n"""
965
+ INSTRUCTION_TEMPLATE = """Your task is to generate valid DuckDB SQL to answer the following question{has_schema}""" # noqa: E501
966
+
967
+
968
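The instruction formatters above all follow the same pattern: `format_all_tables` renders the schema, `format_retrieved_context` joins documentation snippets, and `format_prompt` fills `PROMPT_TEMPLATE` with the schema and question. A minimal usage sketch (the table text is a placeholder, and we assume `prompt_formatters` is importable from the eval directory):

```python
# Sketch only: render a prompt with one of the instruction formatters above.
from prompt_formatters import DuckDBInstFormatterPhi

table_text = 'CREATE TABLE taxi ("fare" FLOAT, "tip" FLOAT)'  # placeholder schema
prompt = DuckDBInstFormatterPhi.format_prompt(
    instruction="What is the average tip?",
    table_text=table_text,
    context_text="",  # no retrieved documentation in this sketch
)
print(prompt)  # the filled PROMPT_TEMPLATE, ending with the open SQL fence
```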
+ class DuckDBChat:
969
+ """DuckDB Inst class."""
970
+
971
+ table_sep: str = "\n\n"
972
+ shuffle_table_order: bool = True
973
+ _cache: dict[tuple[str, str, str], list[str]] = {}
974
+ clean_whitespace = False
975
+ model = None
976
+
977
+ @classmethod
978
+ def format_table(cls, table: Table) -> str:
979
+ """Get table format."""
980
+ table_fmt = []
981
+ for col in table.columns or []:
982
+ # This is technically an incorrect type, but it should be a catchall word
983
+ table_fmt.append(f" {col.name} {col.dtype or 'any'}")
984
+ if table_fmt:
985
+ all_cols = ",\n".join(table_fmt)
986
+ create_tbl = f"CREATE TABLE {table.name} (\n{all_cols}\n)"
987
+ else:
988
+ create_tbl = f"CREATE TABLE {table.name}"
989
+ return create_tbl
990
+
991
+ @classmethod
992
+ def format_all_tables(cls, tables: list[Table], instruction: str) -> list[dict]:
993
+ """Get all tables format."""
994
+ if not cls.model:
995
+ cls.model = Manifest(
996
+ engine="gpt-3.5-turbo",
997
+ client_name="openaichat",
998
+ cache_name="sqlite",
999
+ cache_connection=".manifest.sqlite",
1000
+ )
1001
+ table_texts = [cls.format_table(table) for table in tables]
1002
+ full_schema = cls.table_sep.join(table_texts)
1003
+ prompt = f"""SQL schema of my database:
1004
+ {full_schema}
1005
+ Explain in a few sentences what the data is about:
1006
+ """
1007
+ messages = [
1008
+ {
1009
+ "role": "system",
1010
+ "content": "You are a helpful assistant that can generate an human redable summary of database content based on the schema.",
1011
+ },
1012
+ {"role": "user", "content": prompt},
1013
+ ]
1014
+ explanation = cls.model.run(messages, temperature=0)
1015
+ messages.append({"role": "assistant", "content": explanation})
1016
+ return messages[1:]
1017
+
1018
+ @classmethod
1019
+ def format_retrieved_context(
1020
+ cls,
1021
+ context: list[str],
1022
+ ) -> str:
1023
+ """Format retrieved context."""
1024
+ context_str = "\n--------\n".join(context)
1025
+ return f"\n\nHere is additional documentation about DuckDB that could be useful.\n--------\n{context_str}\n--------\n"
1026
+
1027
+ @classmethod
1028
+ def format_prompt(
1029
+ cls,
1030
+ instruction: str,
1031
+ table_text: list[dict],
1032
+ context_text: str,
1033
+ ) -> str | list[str]:
1034
+ """Get prompt format."""
1035
+ prompt = f"""Now output a single SQL query without any explanation and do not add anything
1036
+ to the query that was not part of the question, also do not use markdown. Make sure to only
1037
+ use information provided in the prompt, or tables and columns from the schema above and write a query to answer the question.{context_text}\n\nMy question is \n`{instruction}`\n\nGenerate the DuckDB-specific SQL query:""" # noqa: E501
1038
+ messages = [
1039
+ {
1040
+ "role": "system",
1041
+ "content": "You are a helpful assistant that can generate DuckDB sql queries, which is a superset of Postgresql, based on the user input. You do not respond with any human readable text, only SQL code.",
1042
+ },
1043
+ *table_text,
1044
+ {"role": "user", "content": prompt},
1045
+ ]
1046
+ return messages
1047
+
1048
+ @classmethod
1049
+ def format_model_output(cls, output_sql: str, prompt: str) -> str:
1050
+ """Format model output."""
1051
+ return output_sql
1052
+
1053
+ @classmethod
1054
+ def format_gold_output(cls, output_sql: str) -> str:
1055
+ """Format gold output for demonstration."""
1056
+ return output_sql
duckdb-nsql/eval/schema.py ADDED
@@ -0,0 +1,115 @@
1
+ """Text2SQL schemas."""
2
+ import enum
3
+
4
+ from manifest.response import Usage
5
+ from pydantic import BaseModel
6
+
7
+ DEFAULT_TABLE_NAME: str = "db_table"
8
+
9
+
10
+ class Dialect(str, enum.Enum):
11
+ """SQGFluff and SQLGlot dialects.
12
+
13
+ Lucky for us, the dialects match both parsers.
14
+
15
+ Ref: https://github.com/sqlfluff/sqlfluff/blob/main/src/sqlfluff/core/dialects/__init__.py # noqa: E501
16
+ Ref: https://github.com/tobymao/sqlglot/blob/main/sqlglot/dialects/__init__.py # noqa: E501
17
+ """
18
+
19
+ SNOWFLAKE = "snowflake"
20
+ BIGQUERY = "bigquery"
21
+ REDSHIFT = "redshift"
22
+ POSTGRES = "postgres"
23
+ UNKNOWN = "unknown"
24
+
25
+ @property
26
+ def dialect_str(self) -> str | None:
27
+ """Get the dialect string for validation.
28
+
29
+ We need to pass in dialect = None for UNKNOWN dialects.
30
+ """
31
+ if self != Dialect.UNKNOWN:
32
+ return self.value
33
+ else:
34
+ return None
35
+
36
+ @property
37
+ def quote_str(self) -> str:
38
+ """Get the quote string for the dialect."""
39
+ if self == Dialect.SNOWFLAKE:
40
+ return '"'
41
+ elif self == Dialect.BIGQUERY:
42
+ return "`"
43
+ elif self == Dialect.REDSHIFT:
44
+ return '"'
45
+ elif self == Dialect.POSTGRES:
46
+ return '"'
47
+ elif self == Dialect.UNKNOWN:
48
+ return '"'
49
+ raise NotImplementedError(f"Quote string not implemented for dialect {self}")
50
+
51
+ def quote(self, string: str) -> str:
52
+ """Quote a string."""
53
+ return f"{self.quote_str}{string}{self.quote_str}"
54
+
55
+
56
+ class ColumnOrLiteral(BaseModel):
57
+ """Column that may or may not be a literal."""
58
+
59
+ name: str | None = None
60
+ literal: bool = False
61
+
62
+ def __hash__(self) -> int:
63
+ """Hash."""
64
+ return hash((self.name, self.literal))
65
+
66
+
67
+ class TableColumn(BaseModel):
68
+ """Table column."""
69
+
70
+ name: str
71
+ dtype: str | None
72
+
73
+
74
+ class ForeignKey(BaseModel):
75
+ """Foreign key."""
76
+
77
+ # Referenced column
78
+ column: TableColumn
79
+ # References table name
80
+ references_name: str
81
+ # References column
82
+ references_column: TableColumn
83
+
84
+
85
+ class Table(BaseModel):
86
+ """Table."""
87
+
88
+ name: str | None
89
+ columns: list[TableColumn] | None
90
+ pks: list[TableColumn] | None
91
+ # FK from this table to another column in another table
92
+ fks: list[ForeignKey] | None
93
+ examples: list[dict] | None
94
+ # Is the table a source or intermediate reference table
95
+ is_reference_table: bool = False
96
+
97
+
98
+ class TextToSQLParams(BaseModel):
99
+ """A text to sql request."""
100
+
101
+ instruction: str
102
+ database: str | None
103
+ # Default to unknown
104
+ dialect: Dialect = Dialect.UNKNOWN
105
+ tables: list[Table] | None
106
+
107
+
108
+ class TextToSQLModelResponse(BaseModel):
109
+ """Model for Autocomplete Responses."""
110
+
111
+ output: str
112
+ final_prompt: str | list[dict]
113
+ raw_output: str
114
+ usage: Usage
115
+ metadata: str | None = None
duckdb-nsql/eval/text_to_sql.py ADDED
@@ -0,0 +1,260 @@
1
+ """Text-to-SQL running."""
2
+ import asyncio
3
+ import json
4
+ import re
5
+ import time
6
+ from typing import cast
7
+ import duckdb
8
+
9
+ import structlog
10
+ from manifest import Manifest
11
+ from manifest.response import Response, Usage
12
+ from prompt_formatters import RajkumarFormatter, MotherDuckFormatter
13
+ from schema import DEFAULT_TABLE_NAME, TextToSQLModelResponse, TextToSQLParams
14
+ from tqdm.auto import tqdm
15
+
16
+ logger = structlog.get_logger()
17
+
18
+
19
+ def clean_whitespace(sql: str) -> str:
20
+ """Clean whitespace."""
21
+ return re.sub(r"[\t\n\s]+", " ", sql)
22
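`clean_whitespace` collapses any run of tabs, newlines, and spaces into a single space, e.g.:

```python
assert clean_whitespace("SELECT *\n\tFROM t") == "SELECT * FROM t"
```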
+
23
+
24
+ def instruction_to_sql(
25
+ params: TextToSQLParams,
26
+ extra_context: list[str],
27
+ manifest: Manifest,
28
+ prompt_formatter: RajkumarFormatter | None = None,
29
+ overwrite_manifest: bool = False,
30
+ max_tokens: int = 300,
31
+ temperature: float = 0.1,
32
+ stop_sequences: list[str] | None = None,
33
+ num_beams: int = 1,
34
+ ) -> TextToSQLModelResponse:
35
+ """Parse the instruction to a sql command."""
36
+ return instruction_to_sql_list(
37
+ params=[params],
38
+ extra_context=[extra_context],
39
+ manifest=manifest,
40
+ prompt_formatter=prompt_formatter,
41
+ overwrite_manifest=overwrite_manifest,
42
+ max_tokens=max_tokens,
43
+ temperature=temperature,
44
+ stop_sequences=stop_sequences,
45
+ num_beams=num_beams,
46
+ )[0]
47
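A call sketch for this single-request wrapper (the client, database, and formatter choices are placeholders; a real run needs working API credentials):

```python
# Sketch: one instruction -> one SQL string (delegates to instruction_to_sql_list below).
from manifest import Manifest
from prompt_formatters import DuckDBInstFormatter
from schema import TextToSQLParams

response = instruction_to_sql(
    params=TextToSQLParams(instruction="Count all laptops", database="laptop", tables=[]),
    extra_context=[],
    manifest=Manifest(client_name="openai"),
    prompt_formatter=DuckDBInstFormatter(),
    stop_sequences=[";"],
)
print(response.output)  # the generated SQL
```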
+
48
+ def run_motherduck_prompt_sql(params: list[TextToSQLParams]) -> list[TextToSQLModelResponse]:
49
+ results = []
50
+ for param in params:
51
+ con = duckdb.connect('md:')
52
+ try:
53
+ sql_query = con.execute("CALL prompt_sql(?);", [param.instruction]).fetchall()[0][0]
54
+ except Exception as e:
55
+ print(e)
56
+ sql_query = "SELECT * FROM hn.hacker_news LIMIT 1";
57
+ usage = Usage(
58
+ completion_tokens = 0,
59
+ prompt_tokens = 0,
60
+ total_tokens = 0
61
+ )
62
+ model_response = TextToSQLModelResponse(
63
+ output=sql_query,
64
+ raw_output=sql_query,
65
+ final_prompt=param.instruction,
66
+ usage=usage,
67
+ )
68
+ results.append(model_response)
69
+ return results
70
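The MotherDuck branch above delegates generation to the server-side `prompt_sql` function. A standalone sketch of that call (connecting to `md:` assumes a MotherDuck auth token is configured in your environment):

```python
# Sketch: calling MotherDuck's prompt_sql directly.
import duckdb

con = duckdb.connect("md:")  # assumes a MotherDuck token is configured
rows = con.execute("CALL prompt_sql(?);", ["How many stories mention DuckDB?"]).fetchall()
print(rows[0][0])  # the generated SQL string
```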
+
71
+
72
+
73
+ def instruction_to_sql_list(
74
+ params: list[TextToSQLParams],
75
+ extra_context: list[list[str]],
76
+ manifest: Manifest,
77
+ prompt_formatter: RajkumarFormatter | None = None,
78
+ overwrite_manifest: bool = False,
79
+ max_tokens: int = 300,
80
+ temperature: float = 0.1,
81
+ stop_sequences: list[str] | None = None,
82
+ num_beams: int = 1,
83
+ verbose: bool = False,
84
+ ) -> list[TextToSQLModelResponse]:
85
+ """Parse the list of instructions to sql commands.
86
+
87
+ Connector is used for default retry handlers only.
88
+ """
89
+ if type(prompt_formatter) is MotherDuckFormatter:
90
+ return run_motherduck_prompt_sql(params)
91
+
92
+ if prompt_formatter is None:
93
+ raise ValueError("Prompt formatter is required.")
94
+
95
+ def construct_params(
96
+ params: TextToSQLParams,
97
+ context: list[str],
98
+ ) -> str | list[dict]:
99
+ """Turn params into prompt."""
100
+ if prompt_formatter.clean_whitespace:
101
+ instruction = clean_whitespace(params.instruction)
102
+ else:
103
+ instruction = params.instruction
104
+
105
+ table_texts = prompt_formatter.format_all_tables(
106
+ params.tables, instruction=instruction
107
+ )
108
+ # table_texts can be list of chat messages. Only join list of str.
109
+ if table_texts:
110
+ if isinstance(table_texts[0], str):
111
+ table_text = prompt_formatter.table_sep.join(table_texts)
112
+ else:
113
+ table_text = table_texts
114
+ else:
115
+ table_text = ""
116
+
117
+ if context:
118
+ context_text = prompt_formatter.format_retrieved_context(context)
119
+ else:
120
+ context_text = "" if isinstance(table_text, str) else []
121
+ prompt = prompt_formatter.format_prompt(
122
+ instruction,
123
+ table_text,
124
+ context_text,
125
+ )
126
+ return prompt
127
+
128
+ # If no inputs, return nothing
129
+ if not params:
130
+ return []
131
+
132
+ # Stitch together demonstrations and params
133
+ prompts: list[str | list[dict]] = []
134
+ for i, param in tqdm(
135
+ enumerate(params),
136
+ total=len(params),
137
+ desc="Constructing prompts",
138
+ disable=not verbose,
139
+ ):
140
+ predict_str = construct_params(param, extra_context[i] if extra_context else [])
141
+ if isinstance(predict_str, str):
142
+ prompt = predict_str.lstrip()
143
+ else:
144
+ prompt = predict_str
145
+ prompts.append(prompt)
146
+
147
+ manifest_params = dict(
148
+ max_tokens=max_tokens,
149
+ overwrite_cache=overwrite_manifest,
150
+ num_beams=num_beams,
151
+ logprobs=5,
152
+ temperature=temperature,
153
+ do_sample=False if temperature <= 0 else True,
154
+ stop_sequences=stop_sequences or prompt_formatter.stop_sequences,
155
+ )
156
+
157
+ ret: list[TextToSQLModelResponse] = []
158
+ if len(params) == 1:
159
+ prompt = prompts[0]
160
+ success = False
161
+ retries = 0
162
+ while not success and retries < 5:
163
+ try:
164
+ model_response = _run_manifest(
165
+ prompt,
166
+ manifest_params,
167
+ prompt_formatter,
168
+ manifest,
169
+ stop_sequences=stop_sequences,
170
+ )
171
+ success = True
172
+ except Exception:
173
+ retries += 1
174
+
175
+ if not success:
176
+ raise RuntimeError("Model call failed after 5 retries.")
177
+ ret.append(model_response)
178
+ else:
179
+ # We do not handle retry logic on parallel requests right now
180
+ loop = asyncio.new_event_loop()
181
+ asyncio.set_event_loop(loop)
182
+ response = cast(
183
+ Response,
184
+ loop.run_until_complete(
185
+ manifest.arun_batch(
186
+ prompts,
187
+ **manifest_params, # type: ignore
188
+ ),
189
+ ),
190
+ )
191
+ loop.close()
192
+
193
+ response_usage = response.get_usage()
194
+ response_text = response.get_parsed_response()
195
+ for prompt, resp in zip(prompts, response_text):
196
+ # This will restitch the query in the case we force it to start with SELECT
197
+ sql_query = prompt_formatter.format_model_output(cast(str, resp), prompt)
198
+ for token in stop_sequences or []:
199
+ sql_query = sql_query.split(token)[0]
200
+ logger.info(f"FINAL OUTPUT: {sql_query}")
201
+ ret.append(
202
+ TextToSQLModelResponse(
203
+ output=sql_query,
204
+ raw_output=cast(str, resp),
205
+ final_prompt=prompt,
206
+ usage=response_usage,
207
+ )
208
+ )
209
+
210
+ return ret
211
+
212
+
213
+ def _run_manifest(
214
+ prompt: str | list[str],
215
+ manifest_params: dict,
216
+ prompt_formatter: RajkumarFormatter,
217
+ manifest: Manifest,
218
+ stop_sequences: list[str] | None = None,
219
+ ) -> TextToSQLModelResponse:
220
+ """Run manifest for prompt format."""
221
+ logger.info(f"PARAMS: {manifest_params}")
222
+ if isinstance(prompt, list):
223
+ for p in prompt:
224
+ logger.info(f"PROMPT: {p['role']}: {p['content']}")
225
+ else:
226
+ logger.info(f"PROMPT: {prompt}")
227
+ start_time = time.time()
228
+ # Run result
229
+ response = cast(
230
+ Response,
231
+ manifest.run(
232
+ prompt,
233
+ return_response=True,
234
+ client_timeout=1800,
235
+ **manifest_params, # type: ignore
236
+ ),
237
+ )
238
+ logger.info(f"TIME: {time.time() - start_time: .2f}")
239
+
240
+ response_usage = response.get_usage_obj()
241
+ summed_usage = Usage()
242
+ for usage in response_usage.usages:
243
+ summed_usage.completion_tokens += usage.completion_tokens
244
+ summed_usage.prompt_tokens += usage.prompt_tokens
245
+ summed_usage.total_tokens += usage.total_tokens
246
+ # This will restitch the query in the case we force it to start with SELECT
247
+ sql_query = prompt_formatter.format_model_output(
248
+ cast(str, response.get_response()), prompt
249
+ )
250
+
251
+ for token in stop_sequences or []:
252
+ sql_query = sql_query.split(token)[0]
253
+ logger.info(f"OUTPUT: {sql_query}")
254
+ model_response = TextToSQLModelResponse(
255
+ output=sql_query,
256
+ raw_output=cast(str, response.get_response()),
257
+ final_prompt=prompt,
258
+ usage=summed_usage,
259
+ )
260
+ return model_response
duckdb-nsql/manifest/.flake8 ADDED
@@ -0,0 +1,11 @@
1
+ # This is our code-style check. We currently allow the following exceptions:
2
+ # - E731: do not assign a lambda expression, use a def
3
+ # - E402: module level import not at top of file
4
+ # - W503: line break before binary operator
5
+ # - E203: whitespace before :
6
+
7
+ [flake8]
8
+ exclude = .git
9
+ max-line-length = 88
10
+ ignore = E731, E402, W503, E203, PAI100, PAI101, PAI201, PAI202, PAI203
11
+ per-file-ignores = __init__.py:F401, version.py:D100
duckdb-nsql/manifest/.pre-commit-config.yaml ADDED
@@ -0,0 +1,23 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v3.2.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-toml
9
+ - id: check-merge-conflict
10
+ - id: check-added-large-files
11
+ - repo: https://github.com/timothycrosley/isort
12
+ rev: 5.13.2
13
+ hooks:
14
+ - id: isort
15
+ - repo: https://github.com/psf/black
16
+ rev: 22.3.0
17
+ hooks:
18
+ - id: black
19
+ language_version: python3
20
+ - repo: https://github.com/PyCQA/flake8
21
+ rev: 6.0.0
22
+ hooks:
23
+ - id: flake8
duckdb-nsql/manifest/CHANGELOG.rst ADDED
@@ -0,0 +1,93 @@
1
+ 0.1.10 - Unreleased
2
+ ---------------------
3
+
4
+ 0.1.9 - 2024-01-22
5
+ ---------------------
6
+ Fixed
7
+ ^^^^^
8
+ * Added trust-code params for HF models
9
+ * Added LRU cache to HF model param calls to avoid extra calls
10
+ * Fixed pydantic type issue in HF model return
11
+ * Support for Python 3.10-3.11
12
+
13
+ 0.1.8 - 2023-05-22
14
+ ---------------------
15
+ Added
16
+ ^^^^^
17
+ * Azure model support (completion and chat)
18
+ * Google Vertex API model support (completion and chat)
19
+ * Streaming responses for LM Completions (set stream=True)
20
+
21
+ Fixed
22
+ ^^^^^
23
+ * `run` with batches now acts the same as async run except not async. We will batch requests into appropriate batch sizes.
24
+ * Refactored client so unified preprocess and postprocess of requests and responses to better support model variants in request/response format.
25
+
26
+ 0.1.7 - 2023-05-17
27
+ ---------------------
28
+ Fixed
29
+ ^^^^^
30
+ * `_run_chat` fixed bug where not passing in kwargs
31
+
32
+ 0.1.6 - 2023-05-16
33
+ ---------------------
34
+ Fixed
35
+ ^^^^^
36
+ * Unified `run` and `run_chat` methods so it's just `run` now.
37
+ * LLama HF models for eval
38
+
39
+ 0.1.5 - 2023-05-03
40
+ ---------------------
41
+ Added
42
+ ^^^^^
43
+ * Added chat input for chat models.
44
+
45
+ 0.1.4 - 2023-04-24
46
+ ---------------------
47
+ Added
48
+ ^^^^^
49
+ * Connection pools to swap between clients
50
+ * Chunksize param for async runs
51
+
52
+ Fixed
53
+ ^^^^^
54
+ * Determine cache and response by request type, not client name
55
+ * Refactor Response to use Pydantic types for Request and Response
56
+
57
+ 0.1.1
58
+ ---------------------
59
+ Added
60
+ ^^^^^
61
+ * Async support in arun_batch
62
+
63
+ Fixed
64
+ ^^^^^
65
+ * Batched runs now cache individual items
66
+ * Score prompt does not truncate outside token
67
+
68
+ Removed
69
+ ^^^^^
70
+ * Deprecated chatGPT in favor of openaichat which uses OpenAI completions
71
+ * Deprecated Sessions
72
+
73
+ 0.1.0 - 2022-01-31
74
+ ---------------------
75
+ Added
76
+ ^^^^^
77
+ * Batched inference support in `manifest.run`. No more separate `manifest.run_batch` method.
78
+ * Standard request base model for all language inputs.
79
+ * ChatGPT client. Requires CHATGPT_SESSION_KEY to be passed in.
80
+ * Diffusion model support
81
+ * Together model support
82
+
83
+ Removed
84
+ ^^^^^^^
85
+ * `Prompt` class
86
+ * `OPT` client - OPT is now available in HuggingFace
87
+
88
+ 0.0.1 - 2022-11-08
89
+ -------------------
90
+ First major pip release of Manifest. Install via `pip install manifest-ml`.
91
+
92
+
93
+ .. _@lorr1: https://github.com/lorr1
duckdb-nsql/manifest/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
duckdb-nsql/manifest/Makefile ADDED
@@ -0,0 +1,27 @@
1
+ dev:
2
+ pip install -e .[all]
3
+ pre-commit install
4
+
5
+ test: dev check
6
+ pytest tests
7
+
8
+ format:
9
+ isort --atomic manifest/ tests/ web_app/
10
+ black manifest/ tests/ web_app/
11
+
12
+ check:
13
+ isort -c manifest/ tests/ web_app/
14
+ black manifest/ tests/ web_app/ --check
15
+ flake8 manifest/ tests/ web_app/
16
+ mypy manifest/ tests/ web_app/
17
+
18
+ clean:
19
+ pip uninstall -y manifest
20
+ rm -rf src/manifest.egg-info
21
+ rm -rf build/ dist/
22
+
23
+ prune:
24
+ @bash -c "git fetch -p";
25
+ @bash -c "for branch in $(git branch -vv | grep ': gone]' | awk '{print $1}'); do git branch -d $branch; done";
26
+
27
+ .PHONY: dev test clean check prune
duckdb-nsql/manifest/README.md ADDED
@@ -0,0 +1,304 @@
1
+ # Manifest
2
+ How to make prompt programming with Foundation Models a little easier.
3
+
4
+
5
+ # Table of Contents
6
+ - [Install](#install)
7
+ - [Getting Started](#getting-started)
8
+ - [Manifest](#manifest-components)
9
+ - [Other Models Types](#other-models)
10
+ - [Local HuggingFace Models](#local-huggingface-models)
11
+ - [Chat Models](#chat-models)
12
+ - [Embedding Models](#embedding-models)
13
+ - [Road Map](#road-map)
14
+ - [Development](#development)
15
+ - [Cite](#cite)
16
+
17
+
18
+ # Install
19
+ Install:
20
+ ```bash
21
+ pip install manifest-ml
22
+ ```
23
+
24
+ Install with diffusion support:
25
+ ```bash
26
+ pip install manifest-ml[diffusers]
27
+ ```
28
+
29
+ Install with HuggingFace local model support:
30
+ ```bash
31
+ pip install manifest-ml[api]
32
+ ```
33
+
34
+ Dev Install:
35
+ ```bash
36
+ git clone [email protected]:HazyResearch/manifest.git
37
+ cd manifest
38
+ make dev
39
+ ```
40
+
41
+ # Getting Started
42
+ Getting started is simple. If using OpenAI, set `export OPENAI_API_KEY=<OPENAIKEY>` (or pass the key in through the variable `client_connection`), then run
43
+
44
+ ```python
45
+ from manifest import Manifest
46
+
47
+ # Start a manifest session to OpenAI - default `engine=text-davinci-003`
48
+ manifest = Manifest(
49
+ client_name = "openai",
50
+ )
51
+ manifest.run("Why is the grass green?")
52
+ ```
53
+
54
+ ## Examples
55
+ We have example notebooks and Python scripts located at [examples](examples). These show how to use different models, model types (i.e. text, diffusers, or embedding models), and async running.
56
+
57
+ # Manifest Components
58
+ Manifest is meant to be a very lightweight package to help with prompt design and iteration. Three key design decisions of Manifest are
59
+
60
+ * All models are behind APIs
61
+ * Supports caching of model inputs/outputs for iteration, reproducibility, and cost saving
62
+ * Unified API to support generate, score, and embed
63
+
64
+ ## Models
65
+ Manifest provides model clients for [OpenAI](https://openai.com/), [AI21](https://studio.ai21.com/), [Cohere](https://cohere.ai/), [Together](https://together.xyz/), and HuggingFace (see [below](#huggingface-models) for how to use locally hosted HuggingFace models). You can toggle between the models by changing `client_name` and `client_connection`. For example, if a HuggingFace model is loaded locally, run
66
+ ```python
67
+ manifest = Manifest(
68
+ client_name = "huggingface",
69
+ client_connection = "http://127.0.0.1:5000",
70
+ )
71
+ ```
72
+ If you want to use Cohere, run
73
+ ```python
74
+ manifest = Manifest(
75
+ client_name = "cohere",
76
+ client_connection = <COHERE_API_KEY>,
77
+ )
78
+ ```
79
+ You can also just set `export COHERE_API_KEY=<COHERE_API_KEY>` and not use `client_connection`.
80
+
81
+ If you want to use AI21 Labs, run
82
+ ```python
83
+ manifest = Manifest(
84
+ client_name = "ai21",
85
+ client_connection = <AI21_API_KEY>,
86
+ )
87
+ ```
88
+
89
+ You can see the model details and possible model inputs to `run()` via
90
+ ```python
91
+ print(manifest.client_pool.get_current_client().get_model_params())
92
+ print(manifest.client_pool.get_current_client().get_model_inputs())
93
+ ```
94
+
95
+ ## Global Cache
96
+ We support having queries and results stored in a global cache that can be shared across users. We treat inputs and outputs as key value pairs and support SQLite or Redis backends. To start with global caching using SQLite, run
97
+
98
+ ```python
99
+ manifest = Manifest(
100
+ client_name = "openai",
101
+ cache_name = "sqlite",
102
+ cache_connection = "mycache.sqlite",
103
+ )
104
+ ```
105
+ The cache will be saved in `mycache.sqlite`.
106
+
107
+ We also support a Redis backend.
108
+ ```python
109
+ manifest = Manifest(
110
+ client_name = "openai",
111
+ cache_name = "redis",
112
+ cache_connection = "localhost:6379"
113
+ )
114
+ ```
115
+ As a hint, if you want to get Redis running, see the `docker run` command below under development.
116
+
117
+ ## Running Queries
118
+ Once you have a session open, you can write and develop prompts.
119
+
120
+ ```python
121
+ result = manifest.run("Hello, my name is Laurel")
122
+ ```
123
+
124
+ You can also run over multiple examples if supported by the client.
125
+ ```python
126
+ results = manifest.run(["Where are the cats?", "Where are the dogs?"])
127
+ ```
128
+
129
+ We support async queries as well via
130
+ ```python
131
+ import asyncio
132
+ results = asyncio.run(manifest.arun_batch(["Where are the cats?", "Where are the dogs?"]))
133
+ ```
134
+
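+ If you are calling `arun_batch` from inside a Jupyter notebook, note that `asyncio.run` needs a patched event loop there; our example notebooks use `nest_asyncio` for this:
+ ```python
+ import nest_asyncio
+
+ # Required for asyncio.run(...) to work inside Jupyter notebooks
+ nest_asyncio.apply()
+ ```
+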
135
+ If something doesn't go right, you can also ask for the raw manifest `Response` object.
136
+ ```python
137
+ result_object = manifest.run(["Where are the cats?", "Where are the dogs?"], return_response=True)
138
+ print(result_object.get_request_obj())
139
+ print(result_object.is_cached())
140
+ print(result_object.get_response_obj())
141
+ ```
142
+
143
+ By default, we do not truncate results based on a stop token. You can change this by either passing a new stop token to a Manifest session or to a `run`.
144
+ ```python
145
+ result = manifest.run(prompt, "Laurel", stop_token="and")
146
+ ```
147
+
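+ The stop token can also be set once for the whole session. A sketch, assuming the constructor accepts `stop_token` as suggested above:
+ ```python
+ # Assumption: a session-level stop_token applies to every subsequent run
+ manifest = Manifest(
+     client_name = "openai",
+     stop_token = "and",
+ )
+ result = manifest.run("Hello, my name is Laurel")
+ ```
+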
148
+ If you want to change the default parameters of a model, pass those as `kwargs` to the client.
149
+ ```python
150
+ result = manifest.run(prompt, "Laurel", max_tokens=50)
151
+ ```
152
+
153
+ ## Streaming Queries
154
+ Manifest also supports streaming the model response back, assuming it's supported by the underlying client. When calling `run`, pass `stream=True` to get a streaming iterator in response.
155
+
156
+ ```python
157
+ result_iterator = manifest.run("Tell me a story. Once upon a time", max_tokens=100, stream=True)
158
+ for res_text in result_iterator:
159
+ print(res_text)
160
+ ```
161
+ Streaming responses are only supported for single string queries (not batch mode) for text completion models.
162
+
163
+ ## Model Pools
164
+ Manifest supports querying multiple models with different schedulers. This is very much a work in progress, but Manifest will round-robin select (or randomly select) among the clients you provide. You can use the same client multiple times with different connection strings (e.g., different API keys), or you can mix and match. The only requirement is that all clients have the same request type; i.e., you can't have a pool of both generation models and embedding models.
165
+
166
+ To query between a local model and OpenAI,
167
+ ```python
168
+ from manifest.connections.client_pool import ClientConnection
169
+ from manifest import Manifest
170
+
171
+ client_connection1 = ClientConnection(
172
+ client_name="huggingface",
173
+ client_connection="http://127.0.0.1:5000",
174
+ )
175
+ client_connection2 = ClientConnection(client_name="openai", engine="text-ada-001")
176
+ manifest = Manifest(
177
+ client_pool=[client_connection1, client_connection2],
178
+ cache_name="sqlite",
179
+ cache_connection=sqlite_cache,
180
+ )
181
+ manifest.run(...)
182
+ ```
183
+
184
+ The speed benefit comes in with async batched runs. When calling `arun_batch` with a list of prompts, Manifest supports a `chunk_size` param. This will break the prompts into `chunk_size` chunks to spread across the client pool. By default `chunk_size` is `-1`, which means only one client will get all the prompts to run asynchronously. You must set `chunk_size > 1` to distribute across the pool. There is a further `batch_size` param which controls the individual client `batch_size` to send to the model.
185
+
186
+ ```python
187
+ responses = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=20))
188
+ ```
189
+
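+ As a sketch, `chunk_size` and `batch_size` can be combined (assuming `batch_size` is passed alongside `chunk_size`; the numbers below are purely illustrative):
+ ```python
+ # Spread the prompts across the pool in chunks of 20, with each
+ # client sending batches of 10 prompts per model request
+ responses = asyncio.run(
+     manifest.arun_batch(prompts, max_tokens=30, chunk_size=20, batch_size=10)
+ )
+ ```
+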
190
+ # Other Models
191
+
192
+ ## Local Huggingface Models
193
+ To use a HuggingFace generative model, we provide a Flask application in `manifest/api` that hosts the models for you.
194
+
195
+ In a separate terminal or tmux/screen session, to load a 6B-parameter model, run
196
+ ```bash
197
+ python3 -m manifest.api.app \
198
+ --model_type huggingface \
199
+ --model_name_or_path EleutherAI/gpt-j-6B \
200
+ --device 0
201
+ ```
202
+ You will see the Flask session start and output a URL `http://127.0.0.1:5000`. Pass this in to Manifest. If you want to use a different port, set the `FLASK_PORT` environment variable.
203
+
204
+ ```python
205
+ manifest = Manifest(
206
+ client_name = "huggingface",
207
+ client_connection = "http://127.0.0.1:5000",
208
+ )
209
+ ```
210
+
211
+ If you have a custom model you trained, pass the model path to `--model_name_or_path`.
212
+
213
+ To help load larger models, we also support using `parallelize()` from HF, [accelerate](https://huggingface.co/docs/accelerate/index), [bitsandbytes](https://github.com/TimDettmers/bitsandbytes), and [deepspeed](https://github.com/microsoft/DeepSpeed). You will need to install these packages first via `pip install manifest-ml[api]`. We list the commands to load larger models below.
214
+
215
+ * T0pp
216
+ ```bash
217
+ python3 -m manifest.api.app \
218
+ --model_type huggingface \
219
+ --model_name_or_path bigscience/T0pp \
220
+ --use_hf_parallelize
221
+ ```
222
+
223
+ * NeoX 20B (requires at least 60GB of GPU memory)
224
+ ```bash
225
+ python3 -m manifest.api.app \
226
+ --model_type huggingface \
227
+ --model_name_or_path EleutherAI/gpt-neox-20b \
228
+ --use_accelerate_multigpu \
229
+ --percent_max_gpu_mem_reduction 0.75
230
+ ```
231
+ * Bloom 175B (requires at least 240GB of GPU memory)
232
+ ```bash
233
+ python3 -m manifest.api.app \
234
+ --model_type huggingface \
235
+ --model_name_or_path bigscience/bloom \
236
+ --use_bitsandbytes \
237
+ --percent_max_gpu_mem_reduction 0.85
238
+ ```
239
+
240
+ ## Chat Models
241
+ Manifest has specific support for executing against chat models in the more standard "system" / "user" dialogue. To pass in a dialogue history to Manifest, use the `run` command with a list of dictionary inputs with `role` and `content` keys using an associated chat model such as `openaichat`.
242
+
243
+ ```python
244
+ manifest = Manifest(client_name="openaichat")
245
+ dialogue = [
246
+ {"role": "system", "content": "You are a helpful assistant who also responds in rhymes"},
247
+ {"role": "user", "content": "What is the date?"},
248
+ ]
249
+ res = manifest.run(dialogue, max_tokens=100)
250
+ ```
251
+
252
+ ## Embedding Models
253
+ Manifest also supports getting embeddings from models and available APIs. We do this by changing the `client_name` argument. You still use `run` and `arun_batch`.
254
+
255
+ To use OpenAI's embedding models, simply run
256
+ ```python
257
+ manifest = Manifest(client_name="openaiembedding")
258
+ embedding_as_np = manifest.run("Get me an embedding for a bunny")
259
+ ```
260
+
261
+ As explained above, you can load local HuggingFace models that give you embeddings, too. If you want to use a standard generative model, load the model as above and use `client_name="huggingfaceembedding"`. If you want to use a standard embedding model, like those from SentenceTransformers, load your local model via
262
+ ```bash
263
+ python3 -m manifest.api.app \
264
+ --model_type sentence_transformers \
265
+ --model_name_or_path all-mpnet-base-v2 \
266
+ --device 0
267
+ ```
268
+
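+ Then connect to it as with any locally hosted model, using the embedding client. A small sketch (mirroring the embedding example notebook):
+ ```python
+ manifest = Manifest(
+     client_name = "huggingfaceembedding",
+     client_connection = "http://127.0.0.1:5000",
+ )
+ emb = manifest.run("Get me an embedding for a bunny")
+ ```
+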
269
+ # Road Map
270
+ Here's what's coming up next
271
+ - [ ] Clients
272
+ - [ ] HuggingFace Hub
273
+ - [x] Azure OpenAI
274
+ - [x] Google Vertex
275
+ - [ ] Anthropic
276
+ - [x] Streaming Support Completions
277
+ - [ ] Streaming Support Chat Models
278
+ - [ ] Data Types
279
+ - [ ] Diffusion Models
280
+ - [x] Orchestration
281
+ - [x] Connection pools
282
+ - [ ] Local Inference
283
+ - [ ] FlexGen
284
+
285
+ # Development
286
+ Before submitting a PR, run
287
+ ```bash
288
+ export REDIS_PORT="6379" # or whatever PORT local redis is running for those tests
289
+ cd <REDIS_PATH>
290
+ docker run -d -p 127.0.0.1:${REDIS_PORT}:6379 -v `pwd`:`pwd` -w `pwd` --name manifest_redis_test redis
291
+ make test
292
+ ```
293
+
294
+ # Cite
295
+ Please cite Manifest if you use it in any publications. Thanks!!
296
+ ```
297
+ @misc{orr2022manifest,
298
+ author = {Orr, Laurel},
299
+ title = {Manifest},
300
+ year = {2022},
301
+ publisher = {GitHub},
302
+ howpublished = {\url{https://github.com/HazyResearch/manifest}},
303
+ }
304
+ ```
duckdb-nsql/manifest/examples/langchain_chatgpt.ipynb ADDED
@@ -0,0 +1,455 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "id": "b253f4d5",
7
+ "metadata": {},
8
+ "source": [
9
+ "# ChatGPT Clone using TOMA GPT-JT-6B\n",
10
+ "(adopted from ChatGPT Clone [notebook](https://github.com/hwchase17/langchain/blob/master/docs/examples/chains/chatgpt_clone.ipynb))"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "b0302886",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "env: TOMA_URL=https://staging.together.xyz/api\n"
24
+ ]
25
+ }
26
+ ],
27
+ "source": [
28
+ "%env TOMA_URL=https://staging.together.xyz/api"
29
+ ]
30
+ },
31
+ {
32
+ "attachments": {},
33
+ "cell_type": "markdown",
34
+ "id": "93a18ea6",
35
+ "metadata": {},
36
+ "source": [
37
+ "Make sure you have langchain installed and manifest. For the most recent versions, run\n",
38
+ "```\n",
39
+ "pip install git+https://github.com/hwchase17/langchain.git\n",
40
+ "pip install git+https://github.com/HazyResearch/manifest.git\n",
41
+ "```"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 35,
47
+ "id": "a99acd89",
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "name": "stdout",
52
+ "output_type": "stream",
53
+ "text": [
54
+ "\n",
55
+ "\n",
56
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
57
+ "Prompt after formatting:\n",
58
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
59
+ "\n",
60
+ "\n",
61
+ "Input: Classes are \"positive\" and \"negative\". For example given\n",
62
+ "Input: I love this product!\n",
63
+ "Output: positive.\n",
64
+ "I think this movie was one of the worst of the year. Script was boring!\n",
65
+ "Output:\u001b[0m\n",
66
+ "\n",
67
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
68
+ "negative.\n"
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "from manifest import Manifest\n",
74
+ "from langchain.llms.manifest import ManifestWrapper\n",
75
+ "from langchain import ConversationChain, LLMChain, PromptTemplate\n",
76
+ "from langchain.chains.conversation.memory import ConversationalBufferWindowMemory\n",
77
+ "\n",
78
+ "\n",
79
+ "template = \"\"\"I am a classification model. It will try to classify your input.\n",
80
+ "\n",
81
+ "{history}\n",
82
+ "Input: {human_input}\n",
83
+ "Output:\"\"\"\n",
84
+ "\n",
85
+ "prompt = PromptTemplate(\n",
86
+ " input_variables=[\"history\", \"human_input\"], \n",
87
+ " template=template\n",
88
+ ")\n",
89
+ "\n",
90
+ "manifest = Manifest(\n",
91
+ " client_name=\"toma\",\n",
92
+ " engine=\"Together-gpt-JT-6B-v1\",\n",
93
+ " max_tokens=150,\n",
94
+ " top_p=0.9,\n",
95
+ " top_k=40,\n",
96
+ " stop_sequences=[\"\\n\"],\n",
97
+ ")\n",
98
+ "\n",
99
+ "chatgpt_chain = LLMChain(\n",
100
+ " llm=ManifestWrapper(client=manifest), \n",
101
+ " prompt=prompt, \n",
102
+ " verbose=True, \n",
103
+ " memory=ConversationalBufferWindowMemory(k=8),\n",
104
+ ")\n",
105
+ "\n",
106
+ "output = chatgpt_chain.predict(human_input=\"Classes are \\\"positive\\\" and \\\"negative\\\". For example given\\nInput: I love this product!\\nOutput: positive.\\nI think this movie was one of the worst of the year. Script was boring!\")\n",
107
+ "print(output)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 36,
113
+ "id": "4ef711d6",
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "name": "stdout",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "\n",
121
+ "\n",
122
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
123
+ "Prompt after formatting:\n",
124
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
125
+ "\n",
126
+ "Human: Classes are \"positive\" and \"negative\". For example given\n",
127
+ "Input: I love this product!\n",
128
+ "Output: positive.\n",
129
+ "I think this movie was one of the worst of the year. Script was boring!\n",
130
+ "AI: negative.\n",
131
+ "Input: So awesome! I wish I could have gone\n",
132
+ "Output:\u001b[0m\n",
133
+ "\n",
134
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
135
+ "positive.\n"
136
+ ]
137
+ }
138
+ ],
139
+ "source": [
140
+ "output = chatgpt_chain.predict(human_input=\"So awesome! I wish I could have gone\")\n",
141
+ "print(output)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 37,
147
+ "id": "a5d6dac2",
148
+ "metadata": {},
149
+ "outputs": [
150
+ {
151
+ "name": "stdout",
152
+ "output_type": "stream",
153
+ "text": [
154
+ "\n",
155
+ "\n",
156
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
157
+ "Prompt after formatting:\n",
158
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
159
+ "\n",
160
+ "Human: Classes are \"positive\" and \"negative\". For example given\n",
161
+ "Input: I love this product!\n",
162
+ "Output: positive.\n",
163
+ "I think this movie was one of the worst of the year. Script was boring!\n",
164
+ "AI: negative.\n",
165
+ "Human: So awesome! I wish I could have gone\n",
166
+ "AI: positive.\n",
167
+ "Input: Hate it.\n",
168
+ "Output:\u001b[0m\n",
169
+ "\n",
170
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
171
+ "negative.\n"
172
+ ]
173
+ }
174
+ ],
175
+ "source": [
176
+ "output = chatgpt_chain.predict(human_input=\"Hate it.\")\n",
177
+ "print(output)"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 43,
183
+ "id": "b9283077",
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "name": "stdout",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "\n",
191
+ "\n",
192
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
193
+ "Prompt after formatting:\n",
194
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
195
+ "\n",
196
+ "\n",
197
+ "Input: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
198
+ "Input: This fruit rippens off of the tree.\n",
199
+ "Output: banana.\n",
200
+ "Often comes in bosc and bartlett varieties.\n",
201
+ "Output:\u001b[0m\n",
202
+ "\n",
203
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
204
+ "apple.\n"
205
+ ]
206
+ }
207
+ ],
208
+ "source": [
209
+ "chatgpt_chain.memory.clear()\n",
210
+ "output = chatgpt_chain.predict(human_input=\"Classes are fruits \\\"apple\\\", \\\"banana\\\", \\\"orange\\\", \\\"pear\\\". For example given\\nInput: This fruit rippens off of the tree.\\nOutput: banana.\\nOften comes in bosc and bartlett varieties.\")\n",
211
+ "print(output)"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 44,
217
+ "id": "cd0a23d9",
218
+ "metadata": {
219
+ "scrolled": true
220
+ },
221
+ "outputs": [
222
+ {
223
+ "name": "stdout",
224
+ "output_type": "stream",
225
+ "text": [
226
+ "\n",
227
+ "\n",
228
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
229
+ "Prompt after formatting:\n",
230
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
231
+ "\n",
232
+ "Human: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
233
+ "Input: This fruit rippens off of the tree.\n",
234
+ "Output: banana.\n",
235
+ "Often comes in bosc and bartlett varieties.\n",
236
+ "AI: apple.\n",
237
+ "Input: Often associated with monkeys\n",
238
+ "Output:\u001b[0m\n",
239
+ "\n",
240
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
241
+ "banana.\n"
242
+ ]
243
+ }
244
+ ],
245
+ "source": [
246
+ "output = chatgpt_chain.predict(human_input=\"Often associated with monkeys\")\n",
247
+ "print(output)"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 45,
253
+ "id": "90db6eb2",
254
+ "metadata": {},
255
+ "outputs": [
256
+ {
257
+ "name": "stdout",
258
+ "output_type": "stream",
259
+ "text": [
260
+ "\n",
261
+ "\n",
262
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
263
+ "Prompt after formatting:\n",
264
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
265
+ "\n",
266
+ "Human: Classes are fruits \"apple\", \"banana\", \"orange\", \"pear\". For example given\n",
267
+ "Input: This fruit rippens off of the tree.\n",
268
+ "Output: banana.\n",
269
+ "Often comes in bosc and bartlett varieties.\n",
270
+ "AI: apple.\n",
271
+ "Human: Often associated with monkeys\n",
272
+ "AI: banana.\n",
273
+ "Input: Is the color red and often delicious.\n",
274
+ "Output:\u001b[0m\n",
275
+ "\n",
276
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
277
+ "apple.\n"
278
+ ]
279
+ }
280
+ ],
281
+ "source": [
282
+ "output = chatgpt_chain.predict(human_input=\"Is the color red and often delicious.\")\n",
283
+ "print(output)"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": 48,
289
+ "id": "c3806f89",
290
+ "metadata": {},
291
+ "outputs": [
292
+ {
293
+ "name": "stdout",
294
+ "output_type": "stream",
295
+ "text": [
296
+ "\n",
297
+ "\n",
298
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
299
+ "Prompt after formatting:\n",
300
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
301
+ "\n",
302
+ "\n",
303
+ "Input: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
304
+ "Input: The color of a school bus.\n",
305
+ "Output: yellow.\n",
306
+ "Is the color of the sky\n",
307
+ "Output:\u001b[0m\n",
308
+ "\n",
309
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
310
+ "blue.\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "chatgpt_chain.memory.clear()\n",
316
+ "output = chatgpt_chain.predict(human_input=\"Classes are colors \\\"red\\\", \\\"green\\\", \\\"blue\\\", \\\"yellow\\\". For example given\\nInput: The color of a school bus.\\nOutput: yellow.\\nIs the color of the sky\")\n",
317
+ "print(output)"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 49,
323
+ "id": "f508f597",
324
+ "metadata": {},
325
+ "outputs": [
326
+ {
327
+ "name": "stdout",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "\n",
331
+ "\n",
332
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
333
+ "Prompt after formatting:\n",
334
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
335
+ "\n",
336
+ "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
337
+ "Input: The color of a school bus.\n",
338
+ "Output: yellow.\n",
339
+ "Is the color of the sky\n",
340
+ "AI: blue.\n",
341
+ "Input: Color of a banana.\n",
342
+ "Output:\u001b[0m\n",
343
+ "\n",
344
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
345
+ "yellow.\n"
346
+ ]
347
+ }
348
+ ],
349
+ "source": [
350
+ "output = chatgpt_chain.predict(human_input=\"Color of a banana.\")\n",
351
+ "print(output)"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": 50,
357
+ "id": "cbd607f4",
358
+ "metadata": {},
359
+ "outputs": [
360
+ {
361
+ "name": "stdout",
362
+ "output_type": "stream",
363
+ "text": [
364
+ "\n",
365
+ "\n",
366
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
367
+ "Prompt after formatting:\n",
368
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
369
+ "\n",
370
+ "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
371
+ "Input: The color of a school bus.\n",
372
+ "Output: yellow.\n",
373
+ "Is the color of the sky\n",
374
+ "AI: blue.\n",
375
+ "Human: Color of a banana.\n",
376
+ "AI: yellow.\n",
377
+ "Input: When someone is sick they are this color.\n",
378
+ "Output:\u001b[0m\n",
379
+ "\n",
380
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
381
+ "green.\n"
382
+ ]
383
+ }
384
+ ],
385
+ "source": [
386
+ "output = chatgpt_chain.predict(human_input=\"When someone is sick they are this color.\")\n",
387
+ "print(output)"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 51,
393
+ "id": "d33e0e28",
394
+ "metadata": {},
395
+ "outputs": [
396
+ {
397
+ "name": "stdout",
398
+ "output_type": "stream",
399
+ "text": [
400
+ "\n",
401
+ "\n",
402
+ "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
403
+ "Prompt after formatting:\n",
404
+ "\u001b[32;1m\u001b[1;3mI am a classification model. It will try to classify your input.\n",
405
+ "\n",
406
+ "Human: Classes are colors \"red\", \"green\", \"blue\", \"yellow\". For example given\n",
407
+ "Input: The color of a school bus.\n",
408
+ "Output: yellow.\n",
409
+ "Is the color of the sky\n",
410
+ "AI: blue.\n",
411
+ "Human: Color of a banana.\n",
412
+ "AI: yellow.\n",
413
+ "Human: When someone is sick they are this color.\n",
414
+ "AI: green.\n",
415
+ "Input: Color of anger.\n",
416
+ "Output:\u001b[0m\n",
417
+ "\n",
418
+ "\u001b[1m> Finished LLMChain chain.\u001b[0m\n",
419
+ "red.\n"
420
+ ]
421
+ }
422
+ ],
423
+ "source": [
424
+ "output = chatgpt_chain.predict(human_input=\"Color of anger.\")\n",
425
+ "print(output)"
426
+ ]
427
+ }
428
+ ],
429
+ "metadata": {
430
+ "kernelspec": {
431
+ "display_name": "bootleg",
432
+ "language": "python",
433
+ "name": "python3"
434
+ },
435
+ "language_info": {
436
+ "codemirror_mode": {
437
+ "name": "ipython",
438
+ "version": 3
439
+ },
440
+ "file_extension": ".py",
441
+ "mimetype": "text/x-python",
442
+ "name": "python",
443
+ "nbconvert_exporter": "python",
444
+ "pygments_lexer": "ipython3",
445
+ "version": "3.8.12 | packaged by conda-forge | (default, Jan 30 2022, 23:36:06) \n[Clang 11.1.0 ]"
446
+ },
447
+ "vscode": {
448
+ "interpreter": {
449
+ "hash": "7a3f97ab0465937066e9b79893b779dfc8a12d73c41f9d98a7bf05133c798250"
450
+ }
451
+ }
452
+ },
453
+ "nbformat": 4,
454
+ "nbformat_minor": 5
455
+ }
duckdb-nsql/manifest/examples/manifest_async.py ADDED
@@ -0,0 +1,27 @@
1
+ import asyncio
2
+ import time
3
+
4
+ from manifest import Manifest
5
+
6
+
7
+ def main():
8
+
9
+ manifest = Manifest(
10
+ client_name="openaichat",
11
+ )
12
+
13
+ print("Running in serial")
14
+ prompts = [f"Tell me something interesting about {i}" for i in range(50)]
15
+ st = time.time()
16
+ for pmt in prompts:
17
+ _ = manifest.run(pmt)
18
+ print(f"For loop: {time.time() - st :.2f}")
19
+
20
+ print("Running with async")
21
+ st = time.time()
22
+ _ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30))
23
+ print(f"Async loop: {time.time() - st :.2f}")
24
+
25
+
26
+ if __name__ == "__main__":
27
+ main()
duckdb-nsql/manifest/examples/manifest_azure.ipynb ADDED
@@ -0,0 +1,149 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "AZURE_KEY = \"API_KEY::URL\"\n",
20
+ "OPENAI_KEY = \"sk-XXX\""
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "metadata": {},
27
+ "source": [
28
+ "## Use Azure and OpenAI models"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "from manifest import Manifest\n",
38
+ "from manifest.connections.client_pool import ClientConnection\n",
39
+ "from pathlib import Path\n",
40
+ "\n",
41
+ "cache_path = Path(\"manifest.db\")\n",
42
+ "if cache_path.exists():\n",
43
+ " cache_path.unlink()\n",
44
+ "\n",
45
+ "\n",
46
+ "azure = ClientConnection(\n",
47
+ " client_name=\"azureopenai\",\n",
48
+ " client_connection=AZURE_KEY,\n",
49
+ " engine=\"text-davinci-003\",\n",
50
+ ")\n",
51
+ "\n",
52
+ "manifest = Manifest(client_pool=[azure], \n",
53
+ " cache_name=\"sqlite\",\n",
54
+ " cache_connection=\"manifest.db\"\n",
55
+ ")\n",
56
+ "\n",
57
+ "\n",
58
+ "openai = ClientConnection(\n",
59
+ " client_name=\"openai\",\n",
60
+ " client_connection=OPENAI_KEY,\n",
61
+ " engine=\"text-davinci-003\",\n",
62
+ ")\n",
63
+ "\n",
64
+ "manifest_openai_nocache = Manifest(client_pool=[openai])\n",
65
+ "\n",
66
+ "manifest_openai = Manifest(client_pool=[openai], \n",
67
+ " cache_name=\"sqlite\",\n",
68
+ " cache_connection=\"manifest.db\"\n",
69
+ ")"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "# Show caches are the same\n",
79
+ "text = \"What is the meaning of life?\"\n",
80
+ "res = manifest.run(text, max_tokens=100, temperature=0.7, return_response=True)\n",
81
+ "print(res.get_response())\n",
82
+ "print(res.is_cached())\n",
83
+ "res2 = manifest_openai.run(text, max_tokens=100, temperature=0.7, return_response=True)\n",
84
+ "print(res2.is_cached())\n",
85
+ "\n",
86
+ "assert res2.get_response() == res.get_response()"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "azure_chat = ClientConnection(\n",
96
+ " client_name=\"azureopenaichat\",\n",
97
+ " client_connection=AZURE_KEY,\n",
98
+ " engine=\"gpt-3.5-turbo\",\n",
99
+ ")\n",
100
+ "\n",
101
+ "manifest = Manifest(client_pool=[azure_chat])"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "print(manifest.run(\"What do you think is the best food?\", max_tokens=100))\n",
111
+ "\n",
112
+ "chat_dict = [\n",
113
+ " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
114
+ " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
115
+ " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
116
+ " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
117
+ "]\n",
118
+ "print(manifest.run(chat_dict, max_tokens=100))"
119
+ ]
120
+ }
121
+ ],
122
+ "metadata": {
123
+ "kernelspec": {
124
+ "display_name": "manifest",
125
+ "language": "python",
126
+ "name": "python3"
127
+ },
128
+ "language_info": {
129
+ "codemirror_mode": {
130
+ "name": "ipython",
131
+ "version": 3
132
+ },
133
+ "file_extension": ".py",
134
+ "mimetype": "text/x-python",
135
+ "name": "python",
136
+ "nbconvert_exporter": "python",
137
+ "pygments_lexer": "ipython3",
138
+ "version": "3.10.4"
139
+ },
140
+ "orig_nbformat": 4,
141
+ "vscode": {
142
+ "interpreter": {
143
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
144
+ }
145
+ }
146
+ },
147
+ "nbformat": 4,
148
+ "nbformat_minor": 2
149
+ }
duckdb-nsql/manifest/examples/manifest_chatgpt.ipynb ADDED
@@ -0,0 +1,101 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENAI_KEY = \"sk-XXX\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use ChatOpenAI\n",
28
+ "\n",
29
+ "Set you `OPENAI_API_KEY` environment variable."
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "from manifest import Manifest\n",
39
+ "from manifest.connections.client_pool import ClientConnection\n",
40
+ "\n",
41
+ "openai_chat = ClientConnection(\n",
42
+ " client_name=\"openaichat\",\n",
43
+ " client_connection=OPENAI_KEY,\n",
44
+ " engine=\"gpt-3.5-turbo\"\n",
45
+ ")\n",
46
+ "\n",
47
+ "manifest = Manifest(client_pool=[openai_chat])"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Simple question\n",
57
+ "chat_dict = [\n",
58
+ " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
59
+ " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
60
+ " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
61
+ " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
62
+ "]\n",
63
+ "print(manifest.run(chat_dict, max_tokens=100))"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": []
72
+ }
73
+ ],
74
+ "metadata": {
75
+ "kernelspec": {
76
+ "display_name": "manifest",
77
+ "language": "python",
78
+ "name": "python3"
79
+ },
80
+ "language_info": {
81
+ "codemirror_mode": {
82
+ "name": "ipython",
83
+ "version": 3
84
+ },
85
+ "file_extension": ".py",
86
+ "mimetype": "text/x-python",
87
+ "name": "python",
88
+ "nbconvert_exporter": "python",
89
+ "pygments_lexer": "ipython3",
90
+ "version": "3.10.4"
91
+ },
92
+ "orig_nbformat": 4,
93
+ "vscode": {
94
+ "interpreter": {
95
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
96
+ }
97
+ }
98
+ },
99
+ "nbformat": 4,
100
+ "nbformat_minor": 2
101
+ }
duckdb-nsql/manifest/examples/manifest_connection_pool.ipynb ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENAI_KEY1 = \"sk-XXX\"\n",
20
+ "OPENAI_KEY2 = \"sk-XX\""
21
+ ]
22
+ },
23
+ {
24
+ "attachments": {},
25
+ "cell_type": "markdown",
26
+ "metadata": {},
27
+ "source": [
28
+ "## Use OpenAI\n",
29
+ "\n",
30
+ "Set you `OPENAI_API_KEY` environment variable."
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from manifest import Manifest\n",
40
+ "from manifest.connections.client_pool import ClientConnection\n",
41
+ "\n",
42
+ "openai_ada = ClientConnection(\n",
43
+ " client_name=\"openai\",\n",
44
+ " client_connection=OPENAI_KEY1,\n",
45
+ " engine=\"text-ada-001\"\n",
46
+ ")\n",
47
+ "\n",
48
+ "openai_curie = ClientConnection(\n",
49
+ " client_name=\"openai\",\n",
50
+ " client_connection=OPENAI_KEY2,\n",
51
+ " engine=\"text-curie-001\"\n",
52
+ ")\n",
53
+ "\n",
54
+ "manifest = Manifest(client_pool=[openai_ada, openai_curie], client_pool_schedule=\"round_robin\")"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 3,
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "name": "stdout",
64
+ "output_type": "stream",
65
+ "text": [
66
+ "0\n",
67
+ "I am a model.\n",
68
+ "1\n",
69
+ "I am a MacBook Pro with a retina\n"
70
+ ]
71
+ }
72
+ ],
73
+ "source": [
74
+ "res = manifest.run(\"What model are you?\", temperature=0.0)\n",
75
+ "print(manifest.client_pool.current_client_id)\n",
76
+ "print(res)\n",
77
+ "res = manifest.run(\"What model are you?\", temperature=0.0)\n",
78
+ "print(manifest.client_pool.current_client_id)\n",
79
+ "print(res)"
80
+ ]
81
+ },
82
+ {
83
+ "attachments": {},
84
+ "cell_type": "markdown",
85
+ "metadata": {},
86
+ "source": [
87
+ "## With Async"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 4,
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "import nest_asyncio\n",
97
+ "# This is required for asyncio.run(...) to work in Jupyter notebooks.\n",
98
+ "nest_asyncio.apply()"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 5,
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "from manifest import Manifest\n",
108
+ "from manifest.connections.client_pool import ClientConnection\n",
109
+ "\n",
110
+ "openai_ada = ClientConnection(\n",
111
+ " client_name=\"openai\",\n",
112
+ " client_connection=OPENAI_KEY1,\n",
113
+ " engine=\"text-ada-001\"\n",
114
+ ")\n",
115
+ "\n",
116
+ "openai_babbage = ClientConnection(\n",
117
+ " client_name=\"openai\",\n",
118
+ " client_connection=OPENAI_KEY2,\n",
119
+ " engine=\"text-babbage-001\"\n",
120
+ ")\n",
121
+ "\n",
122
+ "openai_curie = ClientConnection(\n",
123
+ " client_name=\"openai\",\n",
124
+ " client_connection=OPENAI_KEY2,\n",
125
+ " engine=\"text-curie-001\"\n",
126
+ ")\n",
127
+ "\n",
128
+ "manifest = Manifest(client_pool=[openai_ada, openai_babbage, openai_curie], client_pool_schedule=\"round_robin\")\n",
129
+ "manifest_single_client = Manifest(client_pool=[openai_babbage])"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 6,
135
+ "metadata": {},
136
+ "outputs": [
137
+ {
138
+ "name": "stdout",
139
+ "output_type": "stream",
140
+ "text": [
141
+ "For loop: 128.68\n",
142
+ "Running with async single client\n",
143
+ "Running 1 tasks across all clients.\n",
144
+ "Async loop: 4.02\n",
145
+ "Running with async two clients but not chunking\n",
146
+ "Running 1 tasks across all clients.\n",
147
+ "Async loop: 3.92\n",
148
+ "Running with async two clients and chunk size\n",
149
+ "Running 20 tasks across all clients.\n",
150
+ "Async loop: 1.44\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "import time\n",
156
+ "import asyncio\n",
157
+ "\n",
158
+ "prompts = [f\"Tell me something interesting about {i}\" for i in range(400)]\n",
159
+ "st = time.time()\n",
160
+ "for pmt in prompts:\n",
161
+ " _ = manifest_single_client.run(pmt, max_tokens=30)\n",
162
+ "print(f\"For loop: {time.time() - st :.2f}\")\n",
163
+ "\n",
164
+ "print(\"Running with async single client\")\n",
165
+ "st = time.time()\n",
166
+ "_ = asyncio.run(manifest_single_client.arun_batch(prompts, max_tokens=30, chunk_size=-1))\n",
167
+ "print(f\"Async loop: {time.time() - st :.2f}\")\n",
168
+ "\n",
169
+ "print(\"Running with async two clients but not chunking\")\n",
170
+ "st = time.time()\n",
171
+ "_ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=-1))\n",
172
+ "print(f\"Async loop: {time.time() - st :.2f}\")\n",
173
+ "\n",
174
+ "print(\"Running with async two clients and chunk size\")\n",
175
+ "st = time.time()\n",
176
+ "_ = asyncio.run(manifest.arun_batch(prompts, max_tokens=30, chunk_size=20))\n",
177
+ "print(f\"Async loop: {time.time() - st :.2f}\")"
178
+ ]
179
+ }
180
+ ],
181
+ "metadata": {
182
+ "kernelspec": {
183
+ "display_name": "manifest",
184
+ "language": "python",
185
+ "name": "python3"
186
+ },
187
+ "language_info": {
188
+ "codemirror_mode": {
189
+ "name": "ipython",
190
+ "version": 3
191
+ },
192
+ "file_extension": ".py",
193
+ "mimetype": "text/x-python",
194
+ "name": "python",
195
+ "nbconvert_exporter": "python",
196
+ "pygments_lexer": "ipython3",
197
+ "version": "3.10.4"
198
+ },
199
+ "orig_nbformat": 4,
200
+ "vscode": {
201
+ "interpreter": {
202
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
203
+ }
204
+ }
205
+ },
206
+ "nbformat": 4,
207
+ "nbformat_minor": 2
208
+ }
duckdb-nsql/manifest/examples/manifest_diffusers.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
duckdb-nsql/manifest/examples/manifest_embedding.ipynb ADDED
@@ -0,0 +1,156 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "attachments": {},
15
+ "cell_type": "markdown",
16
+ "metadata": {},
17
+ "source": [
18
+ "## Use OpenAI\n",
19
+ "\n",
20
+ "Set you `OPENAI_API_KEY` environment variable."
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "name": "stdout",
30
+ "output_type": "stream",
31
+ "text": [
32
+ "{'model_name': 'openaiembedding', 'engine': 'text-embedding-ada-002'}\n"
33
+ ]
34
+ }
35
+ ],
36
+ "source": [
37
+ "from manifest import Manifest\n",
38
+ "\n",
39
+ "manifest = Manifest(client_name=\"openaiembedding\")\n",
40
+ "print(manifest.client_pool.get_next_client().get_model_params())"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 3,
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "(1536,)\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "emb = manifest.run(\"Is this an embedding?\")\n",
58
+ "print(emb.shape)"
59
+ ]
60
+ },
61
+ {
62
+ "attachments": {},
63
+ "cell_type": "markdown",
64
+ "metadata": {},
65
+ "source": [
66
+ "### Using Locally Hosted Huggingface LM\n",
67
+ "\n",
68
+ "Run\n",
69
+ "```\n",
70
+ "python3 manifest/api/app.py --model_type huggingface --model_name_or_path EleutherAI/gpt-neo-125M --device 0\n",
71
+ "```\n",
72
+ "or\n",
73
+ "```\n",
74
+ "python3 manifest/api/app.py --model_type sentence_transformers --model_name_or_path all-mpnet-base-v2 --device 0\n",
75
+ "```\n",
76
+ "\n",
77
+ "in a separate `screen` or `tmux`. Make sure to note the port. You can change this with `export FLASK_PORT=<port>`."
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 1,
83
+ "metadata": {},
84
+ "outputs": [
85
+ {
86
+ "name": "stdout",
87
+ "output_type": "stream",
88
+ "text": [
89
+ "{'model_name': 'all-mpnet-base-v2', 'model_path': 'all-mpnet-base-v2', 'client_name': 'huggingfaceembedding'}\n"
90
+ ]
91
+ }
92
+ ],
93
+ "source": [
94
+ "from manifest import Manifest\n",
95
+ "\n",
96
+ "# Local hosted GPT Neo 125M\n",
97
+ "manifest = Manifest(\n",
98
+ " client_name=\"huggingfaceembedding\",\n",
99
+ " client_connection=\"http://127.0.0.1:6000\",\n",
100
+ " cache_name=\"sqlite\",\n",
101
+ " cache_connection=\"my_sqlite_manifest.sqlite\"\n",
102
+ ")\n",
103
+ "print(manifest.client_pool.get_next_client().get_model_params())"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": 4,
109
+ "metadata": {},
110
+ "outputs": [
111
+ {
112
+ "name": "stdout",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "(768,)\n",
116
+ "(768,) (768,)\n"
117
+ ]
118
+ }
119
+ ],
120
+ "source": [
121
+ "emb = manifest.run(\"Is this an embedding?\")\n",
122
+ "print(emb.shape)\n",
123
+ "\n",
124
+ "emb = manifest.run([\"Is this an embedding?\", \"Bananas!!!\"])\n",
125
+ "print(emb[0].shape, emb[1].shape)"
126
+ ]
127
+ }
128
+ ],
129
+ "metadata": {
130
+ "kernelspec": {
131
+ "display_name": "manifest",
132
+ "language": "python",
133
+ "name": "python3"
134
+ },
135
+ "language_info": {
136
+ "codemirror_mode": {
137
+ "name": "ipython",
138
+ "version": 3
139
+ },
140
+ "file_extension": ".py",
141
+ "mimetype": "text/x-python",
142
+ "name": "python",
143
+ "nbconvert_exporter": "python",
144
+ "pygments_lexer": "ipython3",
145
+ "version": "3.10.4"
146
+ },
147
+ "orig_nbformat": 4,
148
+ "vscode": {
149
+ "interpreter": {
150
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
151
+ }
152
+ }
153
+ },
154
+ "nbformat": 4,
155
+ "nbformat_minor": 2
156
+ }
duckdb-nsql/manifest/examples/manifest_google.ipynb ADDED
@@ -0,0 +1,117 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "GOOGLE_KEY = \"KEY::PROJECT_ID\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use GoogleVertexAPI"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "from manifest import Manifest\n",
37
+ "from manifest.connections.client_pool import ClientConnection\n",
38
+ "\n",
39
+ "google_bison = ClientConnection(\n",
40
+ " client_name=\"google\",\n",
41
+ " client_connection=GOOGLE_KEY\n",
42
+ ")\n",
43
+ "\n",
44
+ "manifest = Manifest(client_pool=[google_bison])"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "# Simple question\n",
54
+ "print(manifest.run(\"What is your name\", max_tokens=40))"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "from manifest import Manifest\n",
64
+ "from manifest.connections.client_pool import ClientConnection\n",
65
+ "\n",
66
+ "google_bison = ClientConnection(\n",
67
+ " client_name=\"googlechat\",\n",
68
+ " client_connection=GOOGLE_KEY\n",
69
+ ")\n",
70
+ "\n",
71
+ "manifest = Manifest(client_pool=[google_bison])"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "chat_dict = [\n",
81
+ " # {\"author\": \"bot\", \"content\": \"You are a helpful assistant.\"},\n",
82
+ " {\"author\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
83
+ " {\"author\": \"bot\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
84
+ " {\"author\": \"user\", \"content\": \"Where was it played?\"}\n",
85
+ "]\n",
86
+ "print(manifest.run(chat_dict, max_tokens=8))"
87
+ ]
88
+ }
89
+ ],
90
+ "metadata": {
91
+ "kernelspec": {
92
+ "display_name": "manifest",
93
+ "language": "python",
94
+ "name": "python3"
95
+ },
96
+ "language_info": {
97
+ "codemirror_mode": {
98
+ "name": "ipython",
99
+ "version": 3
100
+ },
101
+ "file_extension": ".py",
102
+ "mimetype": "text/x-python",
103
+ "name": "python",
104
+ "nbconvert_exporter": "python",
105
+ "pygments_lexer": "ipython3",
106
+ "version": "3.10.4"
107
+ },
108
+ "orig_nbformat": 4,
109
+ "vscode": {
110
+ "interpreter": {
111
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
112
+ }
113
+ }
114
+ },
115
+ "nbformat": 4,
116
+ "nbformat_minor": 2
117
+ }
duckdb-nsql/manifest/examples/manifest_openrouter.ipynb ADDED
@@ -0,0 +1,108 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 4,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENROUTER_API_KEY = \"sk-...\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use ChatOpenAI\n",
28
+ "\n",
29
+ "Set you `OPENROUTER_API_KEY` environment variable."
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 5,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "from manifest import Manifest\n",
39
+ "from manifest.connections.client_pool import ClientConnection\n",
40
+ "\n",
41
+ "openai_chat = ClientConnection(\n",
42
+ " client_name=\"openrouter\",\n",
43
+ " client_connection=OPENROUTER_API_KEY,\n",
44
+ " engine=\"meta-llama/codellama-70b-instruct\"\n",
45
+ ")\n",
46
+ "\n",
47
+ "manifest = Manifest(client_pool=[openai_chat])"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": 6,
53
+ "metadata": {},
54
+ "outputs": [
55
+ {
56
+ "name": "stdout",
57
+ "output_type": "stream",
58
+ "text": [
59
+ "2020 World Series was played at the Globe Life Field in Arlington, Texas.\n"
60
+ ]
61
+ }
62
+ ],
63
+ "source": [
64
+ "# Simple question\n",
65
+ "chat_dict = [\n",
66
+ " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
67
+ " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
68
+ " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
69
+ " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
70
+ "]\n",
71
+ "print(manifest.run(chat_dict, max_tokens=100))"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": []
80
+ }
81
+ ],
82
+ "metadata": {
83
+ "kernelspec": {
84
+ "display_name": "Python 3 (ipykernel)",
85
+ "language": "python",
86
+ "name": "python3"
87
+ },
88
+ "language_info": {
89
+ "codemirror_mode": {
90
+ "name": "ipython",
91
+ "version": 3
92
+ },
93
+ "file_extension": ".py",
94
+ "mimetype": "text/x-python",
95
+ "name": "python",
96
+ "nbconvert_exporter": "python",
97
+ "pygments_lexer": "ipython3",
98
+ "version": "3.11.5"
99
+ },
100
+ "vscode": {
101
+ "interpreter": {
102
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
103
+ }
104
+ }
105
+ },
106
+ "nbformat": 4,
107
+ "nbformat_minor": 4
108
+ }
duckdb-nsql/manifest/examples/manifest_streaming.ipynb ADDED
@@ -0,0 +1,105 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "OPENAI_KEY = \"sk-XXX\""
20
+ ]
21
+ },
22
+ {
23
+ "attachments": {},
24
+ "cell_type": "markdown",
25
+ "metadata": {},
26
+ "source": [
27
+ "## Use ChatOpenAI\n",
28
+ "\n",
29
+ "Set you `OPENAI_API_KEY` environment variable."
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "from manifest import Manifest\n",
39
+ "from manifest.connections.client_pool import ClientConnection\n",
40
+ "\n",
41
+ "openai_chat = ClientConnection(\n",
42
+ " client_name=\"openaichat\",\n",
43
+ " client_connection=OPENAI_KEY,\n",
44
+ " engine=\"gpt-3.5-turbo\"\n",
45
+ ")\n",
46
+ "\n",
47
+ "manifest = Manifest(client_pool=[openai_chat])"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "manifest_iterator = manifest.run(\"Tell me a story about a fat cat.\\n\\nOnce upon a time\", max_tokens=200, stream=True)"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "import sys\n",
66
+ "\n",
67
+ "cur_line_length = 0\n",
68
+ "# Iterate over stream\n",
69
+ "for res in manifest_iterator:\n",
70
+ " sys.stdout.write(res)\n",
71
+ " cur_line_length += len(res)\n",
72
+ " if cur_line_length > 80:\n",
73
+ " sys.stdout.write(\"\\n\")\n",
74
+ " cur_line_length = 0"
75
+ ]
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "manifest",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 3
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython3",
94
+ "version": "3.10.4"
95
+ },
96
+ "orig_nbformat": 4,
97
+ "vscode": {
98
+ "interpreter": {
99
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
100
+ }
101
+ }
102
+ },
103
+ "nbformat": 4,
104
+ "nbformat_minor": 2
105
+ }
duckdb-nsql/manifest/examples/manifest_together.ipynb ADDED
@@ -0,0 +1,106 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "env: TOMA_URL=<TOMA_URL>\n"
13
+ ]
14
+ }
15
+ ],
16
+ "source": [
17
+ "%load_ext autoreload\n",
18
+ "%autoreload 2\n",
19
+ "\n",
20
+ "%env TOMA_URL=<TOMA_URL>"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "from manifest import Manifest\n",
30
+ "\n",
31
+ "# The responses are not fast\n",
32
+ "manifest = Manifest(\n",
33
+ " client_name=\"toma\",\n",
34
+ ")\n",
35
+ "\n",
36
+ "print(manifest.run(\"What is the color of an apple?\"))"
37
+ ]
38
+ },
39
+ {
40
+ "attachments": {},
41
+ "cell_type": "markdown",
42
+ "metadata": {},
43
+ "source": [
44
+ "With a cache"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "from manifest import Manifest\n",
54
+ "\n",
55
+ "# The responses are not fast\n",
56
+ "manifest = Manifest(\n",
57
+ " client_name=\"toma\",\n",
58
+ " cache_name=\"sqlite\",\n",
59
+ " cache_connection=\"my_manifest_cache.sqlite\",\n",
60
+ ")\n",
61
+ "\n",
62
+ "res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
63
+ "print(res.get_response())\n",
64
+ "print(\"Is Cached?\", res.is_cached())\n",
65
+ "\n",
66
+ "res = manifest.run(\"What is the color of an apple?\", return_response=True)\n",
67
+ "print(res.get_response())\n",
68
+ "print(\"Is Cached?\", res.is_cached())"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": []
77
+ }
78
+ ],
79
+ "metadata": {
80
+ "kernelspec": {
81
+ "display_name": "manifest",
82
+ "language": "python",
83
+ "name": "python3"
84
+ },
85
+ "language_info": {
86
+ "codemirror_mode": {
87
+ "name": "ipython",
88
+ "version": 3
89
+ },
90
+ "file_extension": ".py",
91
+ "mimetype": "text/x-python",
92
+ "name": "python",
93
+ "nbconvert_exporter": "python",
94
+ "pygments_lexer": "ipython3",
95
+ "version": "3.10.4"
96
+ },
97
+ "orig_nbformat": 4,
98
+ "vscode": {
99
+ "interpreter": {
100
+ "hash": "fddffe4ac3b9f00470127629076101c1b5f38ecb1e7358b567d19305425e9491"
101
+ }
102
+ }
103
+ },
104
+ "nbformat": 4,
105
+ "nbformat_minor": 2
106
+ }
duckdb-nsql/manifest/manifest/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Manifest init."""
2
+ from manifest.manifest import Manifest
3
+ from manifest.request import Request
4
+ from manifest.response import Response
5
+
6
+ __all__ = ["Manifest", "Response", "Request"]
duckdb-nsql/manifest/manifest/api/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Api init."""
duckdb-nsql/manifest/manifest/api/app.py ADDED
@@ -0,0 +1,301 @@
1
+ """Flask app."""
2
+ import argparse
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import socket
8
+ from typing import Dict
9
+
10
+ import pkg_resources
11
+ from flask import Flask, Response, request
12
+
13
+ from manifest.api.models.diffuser import DiffuserModel
14
+ from manifest.api.models.huggingface import (
15
+ MODEL_GENTYPE_REGISTRY,
16
+ CrossModalEncoderModel,
17
+ TextGenerationModel,
18
+ )
19
+ from manifest.api.models.sentence_transformer import SentenceTransformerModel
20
+ from manifest.api.response import ModelResponse
21
+
22
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
23
+
24
+ logger = logging.getLogger(__name__)
25
+ app = Flask(__name__) # define app using Flask
26
+ # Will be global
27
+ model = None
28
+ model_type = None
29
+ PORT = int(os.environ.get("FLASK_PORT", 5000))
30
+ MODEL_CONSTRUCTORS = {
31
+ "huggingface": TextGenerationModel,
32
+ "sentence_transformers": SentenceTransformerModel,
33
+ "huggingface_crossmodal": CrossModalEncoderModel,
34
+ "diffuser": DiffuserModel,
35
+ }
36
+
37
+
38
+ def parse_args() -> argparse.Namespace:
39
+ """Generate args."""
40
+ parser = argparse.ArgumentParser(description="Model args")
41
+ parser.add_argument(
42
+ "--model_type",
43
+ default=None,
44
+ type=str,
45
+ required=True,
46
+ help="Model type used for finding constructor.",
47
+ choices=MODEL_CONSTRUCTORS.keys(),
48
+ )
49
+ parser.add_argument(
50
+ "--model_generation_type",
51
+ default=None,
52
+ type=str,
53
+ help="Model generation type.",
54
+ choices=MODEL_GENTYPE_REGISTRY.keys(),
55
+ )
56
+ parser.add_argument(
57
+ "--model_name_or_path",
58
+ default=None,
59
+ type=str,
60
+ help="Name of model or path to model. Used in initialize of model class.",
61
+ )
62
+ parser.add_argument(
63
+ "--cache_dir", default=None, type=str, help="Cache directory for models."
64
+ )
65
+ parser.add_argument(
66
+ "--device", type=int, default=0, help="Model device. -1 for CPU."
67
+ )
68
+ parser.add_argument(
69
+ "--fp16", action="store_true", help="Force use fp16 for model params."
70
+ )
71
+ parser.add_argument(
72
+ "--percent_max_gpu_mem_reduction",
73
+ type=float,
74
+ default=0.85,
75
+ help="Used with accelerate multigpu. Scales down max memory.",
76
+ )
77
+ parser.add_argument(
78
+ "--use_bitsandbytes",
79
+ action="store_true",
80
+ help=("Use bits and bytes. " "This will override --device parameter."),
81
+ )
82
+ parser.add_argument(
83
+ "--use_accelerate_multigpu",
84
+ action="store_true",
85
+ help=(
86
+ "Use accelerate for multi gpu inference. "
87
+ "This will override --device parameter."
88
+ ),
89
+ )
90
+ parser.add_argument(
91
+ "--use_hf_parallelize",
92
+ action="store_true",
93
+ help=(
94
+ "Use HF parallelize for multi gpu inference. "
95
+ "This will override --device parameter."
96
+ ),
97
+ )
98
+ parser.add_argument(
99
+ "--use_deepspeed",
100
+ action="store_true",
101
+ help=("Use deepspeed. This will override --device parameter."),
102
+ )
103
+ args = parser.parse_args()
104
+ return args
105
+
+
+ def is_port_in_use(port: int) -> bool:
+     """Check if port is in use."""
+     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+         return s.connect_ex(("localhost", port)) == 0
+
+
+ def main() -> None:
+     """Run main."""
+     kwargs = parse_args()
+     if is_port_in_use(PORT):
+         raise ValueError(f"Port {PORT} is already in use.")
+     global model_type
+     model_type = kwargs.model_type
+     model_gen_type = kwargs.model_generation_type
+     model_name_or_path = kwargs.model_name_or_path
+     if not model_name_or_path:
+         raise ValueError("Must provide model_name_or_path.")
+     if kwargs.use_accelerate_multigpu:
+         logger.info("Using accelerate. Overriding --device argument.")
+     if (
+         kwargs.percent_max_gpu_mem_reduction <= 0
+         or kwargs.percent_max_gpu_mem_reduction > 1
+     ):
+         raise ValueError("percent_max_gpu_mem_reduction must be in (0, 1].")
+     # The multi-GPU / quantization strategies are mutually exclusive.
+     if (
+         sum(
+             [
+                 kwargs.use_accelerate_multigpu,
+                 kwargs.use_hf_parallelize,
+                 kwargs.use_bitsandbytes,
+                 kwargs.use_deepspeed,
+             ]
+         )
+         > 1
+     ):
+         raise ValueError(
+             "Only one of use_accelerate_multigpu, use_hf_parallelize, "
+             "use_bitsandbytes, and use_deepspeed can be set."
+         )
+     # Global model
+     global model
+     model = MODEL_CONSTRUCTORS[model_type](
+         model_name_or_path,
+         model_type=model_gen_type,
+         cache_dir=kwargs.cache_dir,
+         device=kwargs.device,
+         use_accelerate=kwargs.use_accelerate_multigpu,
+         use_parallelize=kwargs.use_hf_parallelize,
+         use_bitsandbytes=kwargs.use_bitsandbytes,
+         use_deepspeed=kwargs.use_deepspeed,
+         perc_max_gpu_mem_red=kwargs.percent_max_gpu_mem_reduction,
+         use_fp16=kwargs.fp16,
+     )
+     app.run(host="0.0.0.0", port=PORT)
+
+
+ @app.route("/completions", methods=["POST"])
+ def completions() -> Response:
+     """Get completions for generation."""
+     prompt = request.json["prompt"]
+     # request.json is cached, so deleting the prompt leaves only generation kwargs
+     del request.json["prompt"]
+     generation_args = request.json
+
+     if not isinstance(prompt, (str, list)):
+         raise ValueError("Prompt must be a str or list of str")
+     try:
+         result_gens = []
+         for generations in model.generate(prompt, **generation_args):
+             result_gens.append(generations)
+         if model_type == "diffuser":
+             # Assign None logprob as it's not supported in diffusers
+             results = [
+                 {"array": r[0], "logprob": None, "tokens": None, "token_logprobs": None}
+                 for r in result_gens
+             ]
+             res_type = "image_generation"
+         else:
+             results = [
+                 {"text": r[0], "logprob": r[1], "tokens": r[2], "token_logprobs": r[3]}
+                 for r in result_gens
+             ]
+             res_type = "text_completion"
+         # Transform the result into the OpenAI response format
+         return Response(
+             json.dumps(ModelResponse(results, response_type=res_type).__dict__()),
+             status=200,
+         )
+     except Exception as e:
+         logger.error(e)
+         return Response(
+             json.dumps({"message": str(e)}),
+             status=400,
+         )
+
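+ # A minimal client sketch for the /completions route above, assuming the
+ # server is up on the default port; extra JSON keys (here max_tokens) are
+ # passed straight through to model.generate as generation kwargs:
+ #
+ #   import requests
+ #   resp = requests.post(
+ #       "http://localhost:5000/completions",
+ #       json={"prompt": "SELECT * FROM taxi LIMIT", "max_tokens": 16},
+ #   )
+ #   print(resp.json())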
+
+ @app.route("/embed", methods=["POST"])
+ def embed() -> Response:
+     """Get embeddings for the prompt."""
+     if "modality" in request.json:
+         modality = request.json["modality"]
+     else:
+         modality = "text"
+     if modality == "text":
+         prompts = request.json["prompt"]
+     elif modality == "image":
+         import base64
+
+         from PIL import Image
+
+         prompts = [
+             Image.open(io.BytesIO(base64.b64decode(data)))
+             for data in request.json["prompt"]
+         ]
+     else:
+         raise ValueError("modality must be text or image")
+
+     try:
+         results = []
+         embeddings = model.embed(prompts)
+         for embedding in embeddings:
+             results.append(
+                 {
+                     "array": embedding,
+                     "logprob": None,
+                     "tokens": None,
+                     "token_logprobs": None,
+                 }
+             )
+
+         return Response(
+             json.dumps(
+                 ModelResponse(results, response_type="embedding_generation").__dict__()
+             ),
+             status=200,
+         )
+     except Exception as e:
+         logger.error(e)
+         return Response(
+             json.dumps({"message": str(e)}),
+             status=400,
+         )
+
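+ # A minimal client sketch for /embed (text modality is the default; for
+ # images, send base64-encoded bytes and "modality": "image"):
+ #
+ #   import requests
+ #   resp = requests.post(
+ #       "http://localhost:5000/embed",
+ #       json={"prompt": ["first sentence", "second sentence"]},
+ #   )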
+
+ @app.route("/score_sequence", methods=["POST"])
+ def score_sequence() -> Response:
+     """Get logprob of prompt."""
+     prompt = request.json["prompt"]
+     del request.json["prompt"]
+     generation_args = request.json
+
+     if not isinstance(prompt, (str, list)):
+         raise ValueError("Prompt must be a str or list of str")
+
+     try:
+         score_list = model.score_sequence(prompt, **generation_args)
+         results = [
+             {
+                 "text": prompt if isinstance(prompt, str) else prompt[i],
+                 "logprob": r[0],
+                 "tokens": r[1],
+                 "token_logprobs": r[2],
+             }
+             for i, r in enumerate(score_list)
+         ]
+         # Transform the result into the OpenAI response format
+         return Response(
+             json.dumps(
+                 ModelResponse(results, response_type="prompt_logit_score").__dict__()
+             ),
+             status=200,
+         )
+     except Exception as e:
+         logger.error(e)
+         return Response(
+             json.dumps({"message": str(e)}),
+             status=400,
+         )
+
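+ # A minimal client sketch for /score_sequence: unlike /completions, this
+ # returns the log probability the model assigns to each prompt itself:
+ #
+ #   import requests
+ #   resp = requests.post(
+ #       "http://localhost:5000/score_sequence",
+ #       json={"prompt": ["The cat sat on the mat."]},
+ #   )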
+
+ @app.route("/params", methods=["POST"])
+ def params() -> Dict:
+     """Get model params."""
+     return model.get_init_params()
+
+
+ @app.route("/")
+ def index() -> str:
+     """Serve the index page."""
+     # NOTE: loads index.html from the installed metaseq package, so this
+     # route only works when metaseq is available.
+     fn = pkg_resources.resource_filename("metaseq", "service/index.html")
+     with open(fn) as f:
+         return f.read()
+
+
+ if __name__ == "__main__":
+     main()