Spaces:

thisispaul
/

data-test

Sleeping

App Files Files Community

thisispaul committed on Jan 16

Commit

e60265c

verified ·

1 Parent(s): be0669d

Update app.py

Browse files

Files changed (1) hide show

app.py +120 -3

app.py CHANGED Viewed

@@ -1,4 +1,121 @@
-import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

+import os
+import duckdb
+import gradio as gr
+from httpx import Client
+from huggingface_hub import HfApi
+import pandas as pd
+from gradio_huggingfacehub_search import HuggingfaceHubSearch
+import spaces
+from llama_cpp import Llama
+# Root URL of the datasets-server REST API that get_first_parquet() queries.
+BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
+# Default headers sent with every request made through `client`.
+headers = {
+    "Accept": "application/json",
+    "Content-Type": "application/json",
+}
+# Shared HTTP client and Hub API handle, created once at import time.
+client = Client(headers=headers)
+api = HfApi()
+# The GGUF model file is loaded once at import time and reused by generate_sql().
+# NOTE(review): per llama-cpp-python, n_ctx is the context size in tokens and
+# n_gpu_layers the number of layers offloaded to the GPU - confirm for the pinned version.
+llama = Llama(
+    model_path="DuckDB-NSQL-7B-v0.1-q8_0.gguf",
+    n_ctx=2048,
+    n_gpu_layers=50,
+)
+@spaces.GPU
+def generate_sql(prompt):
+    """Return the text the module-level `llama` model generates for `prompt`.
+
+    The model is called with temperature 0.1 and at most 1000 new tokens;
+    only the text of the first returned choice is kept.
+    """
+    completion = llama(prompt, temperature=0.1, max_tokens=1000)
+    first_choice = completion["choices"][0]
+    return first_choice["text"]
+def get_first_parquet(dataset: str):
+    """Return the first entry of the "parquet_files" list served for `dataset`.
+
+    Args:
+        dataset: Hub dataset id, e.g. "jamescalam/world-cities-geo".
+
+    Returns:
+        The first element of the response's "parquet_files" list; text2sql()
+        reads its "url" key.
+
+    Raises:
+        httpx.HTTPStatusError: the datasets server answered with a 4xx/5xx status.
+        KeyError: the response JSON has no "parquet_files" key.
+        IndexError: the server listed no Parquet files for the dataset.
+    """
+    # Pass the id through `params` so httpx URL-encodes it instead of
+    # splicing raw user input into the query string.
+    resp = client.get(
+        f"{BASE_DATASETS_SERVER_URL}/parquet",
+        params={"dataset": dataset},
+    )
+    # Fail with the HTTP status rather than with an opaque KeyError below.
+    resp.raise_for_status()
+    parquet_files = resp.json()["parquet_files"]
+    if not parquet_files:
+        # Same exception type as indexing an empty list, but with a usable message.
+        raise IndexError(f"no parquet files listed for dataset {dataset!r}")
+    return parquet_files[0]
+def _text2sql_outputs(schema, prompt, query, frame):
+    """Map the four Gradio output components to the values they should display."""
+    return {
+        schema_output: schema,
+        prompt_output: prompt,
+        query_output: query,
+        df: frame,
+    }
+def _error_frame(message):
+    """Wrap `message` in a one-row DataFrame with a single "error" column."""
+    return pd.DataFrame([{"error": message}])
+def _build_prompt(ddl_create, query_input):
+    """Fill the instruction prompt template with the schema DDL and the question."""
+    return f"""### Instruction:
+    Your task is to generate valid duckdb SQL to answer the following question.
+    ### Input:
+    Here is the database schema that the SQL query will run on:
+    {ddl_create}
+    ### Question:
+    {query_input}
+    ### Response (use duckdb shorthand if possible):
+    """
+def text2sql(dataset_name, query_input):
+    """Generate a DuckDB query for `query_input` and run it on the dataset's first Parquet file.
+
+    Steps: look up the first Parquet file of `dataset_name`, derive a CREATE
+    TABLE statement from it with DuckDB, ask generate_sql() for a query, point
+    that query at the Parquet URL and execute it.
+
+    Args:
+        dataset_name: Hub dataset id passed to get_first_parquet().
+        query_input: Natural-language question inserted into the prompt.
+
+    Returns:
+        A dict keyed by the Gradio components schema_output, prompt_output,
+        query_output and df. Failures at any step are reported as a one-row
+        DataFrame with an "error" column instead of being raised.
+    """
+    print(f"start text2sql for {dataset_name}")
+    try:
+        first_parquet_url = get_first_parquet(dataset_name)["url"]
+    except Exception as error:
+        return _text2sql_outputs(
+            "", "", "", _error_frame(f"❌ Could not get dataset schema. {error=}")
+        )
+    print(f"getting schema from {first_parquet_url}")
+    # Double any single quote so the URL always stays one SQL string literal.
+    quoted_url = first_parquet_url.replace("'", "''")
+    con = duckdb.connect()
+    # The finally clause closes the connection on every exit path, including
+    # the early error returns below.
+    try:
+        try:
+            con.execute("INSTALL 'httpfs'; LOAD httpfs;")
+            # could get from Parquet instead?
+            con.execute(f"CREATE TABLE data as SELECT * FROM '{quoted_url}' LIMIT 1;")
+            result = con.sql("SELECT sql FROM duckdb_tables() where table_name ='data';").df()
+            ddl_create = result.iloc[0, 0]
+        except Exception as error:
+            return _text2sql_outputs(
+                "", "", "", _error_frame(f"❌ Could not get dataset schema. {error=}")
+            )
+        text = _build_prompt(ddl_create, query_input)
+        try:
+            sql_output = generate_sql(text)
+        except Exception as error:
+            return _text2sql_outputs(
+                ddl_create,
+                text,
+                "",
+                _error_frame(f"❌ Unable to get the SQL query based on the text. {error=}"),
+            )
+        # Should be replaced by the prompt but not working
+        sql_output = sql_output.replace("FROM data", f"FROM '{quoted_url}'")
+        # NOTE(review): this executes model-generated SQL as-is on the
+        # connection opened above; confirm that is acceptable for this deployment.
+        try:
+            query_result = con.sql(sql_output).df()
+        except Exception as error:
+            query_result = _error_frame(f"❌ Could not execute SQL query {error=}")
+        return _text2sql_outputs(ddl_create, text, sql_output, query_result)
+    finally:
+        con.close()
+# Build the UI. The component names assigned below (schema_output,
+# prompt_output, query_output, df) are the keys of the dict text2sql() returns,
+# so they must keep these names.
+with gr.Blocks() as demo:
+    gr.Markdown("# 💫 Generate SQL queries based on a given text for your Hugging Face Dataset 💫")
+    # Dataset picker, pre-filled with a sample dataset id.
+    dataset_name = HuggingfaceHubSearch(
+            label="Hub Dataset ID",
+            placeholder="Search for dataset id on Huggingface",
+            search_type="dataset",
+            value="jamescalam/world-cities-geo",
+        )
+    # dataset_name = gr.Textbox("jamescalam/world-cities-geo", label="Dataset Name")
+    # Free-text question; the default matches the first example below.
+    query_input = gr.Textbox("Cities from Albania country", label="Ask something about your data")
+    # Sample questions; each inner list supplies the value for query_input.
+    examples = [
+                ["Cities from Albania country"],
+                ["The continent with the most number of countries"],
+                ["Cities that start with 'A'"],
+                ["Cities by region"],
+            ]
+    gr.Examples(examples=examples, inputs=[query_input],outputs=[])
+    btn = gr.Button("Generate SQL")
+    # Read-only outputs: the generated SQL and the query result table.
+    query_output = gr.Textbox(label="Output SQL", interactive= False)
+    df = gr.DataFrame(datatype="markdown")
+    # Collapsed by default: the schema DDL and the full prompt sent to the model.
+    with gr.Accordion("Open for prompt details", open=False):
+        #with gr.Column(scale=1, min_width=600):
+        schema_output = gr.Textbox(label="Parquet Schema as CREATE DDL", interactive= False)
+        prompt_output = gr.Textbox(label="Generated prompt", interactive= False)
+    # Clicking the button runs text2sql on the two inputs and fills the four outputs.
+    btn.click(text2sql, inputs=[dataset_name, query_input], outputs=[schema_output, prompt_output, query_output,df])
+# Start the app at import time, with debug mode enabled.
+demo.launch(debug=True)