# datasets-ai / app.py
# Caleb Fahlgren
# add basic querying with duckdb
# a00be78
# raw
# history blame
# 1.46 kB
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfApi
import pandas as pd
import gradio as gr
import duckdb
import requests
# Base URL of the datasets-server API; query_dataset() appends the
# "/parquet?dataset=..." path to it to list a dataset's parquet files.
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
# Hub API client. NOTE(review): not referenced anywhere in the visible code —
# confirm whether it is still needed.
hf_api = HfApi()
# Single DuckDB connection created at import time and shared by every
# query_dataset() call. No database path is given, which per the DuckDB docs
# means an in-memory database.
conn = duckdb.connect()
def query_dataset(dataset_id: str, query: str) -> pd.DataFrame:
    """Return the first 100 rows of a Hub dataset's first parquet file.

    The datasets-server ``/parquet`` endpoint is asked for the parquet files
    of ``dataset_id``; the first listed file is then read remotely through
    the module-level DuckDB connection.

    Args:
        dataset_id: Hub dataset identifier, e.g. ``"user/dataset"``.
        query: Text typed by the user. It is accepted because the UI passes
            it in, but it is not used by the current implementation.

    Returns:
        A DataFrame holding at most 100 rows of the first parquet file.

    Raises:
        ValueError: If ``dataset_id`` is empty, the dataset exposes no
            parquet files, or the first file entry has no URL.
        requests.RequestException: If the datasets-server request fails,
            times out, or returns an error status.
    """
    if not dataset_id:
        raise ValueError("A dataset ID is required.")

    # Let requests build the query string so ids containing characters such
    # as '/', '&' or spaces are encoded correctly, and bound the wait so a
    # stalled server cannot hang the UI callback forever.
    response = requests.get(
        f"{BASE_DATASETS_SERVER_URL}/parquet",
        params={"dataset": dataset_id},
        timeout=30,
    )
    response.raise_for_status()  # Check if the request was successful

    # A missing or empty list would otherwise surface as a bare IndexError.
    parquet_files = response.json().get("parquet_files") or []
    if not parquet_files:
        raise ValueError(f"No parquet files found for dataset '{dataset_id}'.")

    first_parquet_url = parquet_files[0].get("url")
    if not first_parquet_url:
        raise ValueError("No valid URL found for the first parquet file.")

    # The URL is embedded in a SQL string literal: double any single quote
    # so it cannot terminate the literal early.
    escaped_url = first_parquet_url.replace("'", "''")
    sql_query = f"SELECT * FROM read_parquet('{escaped_url}') limit 100;"
    return conn.execute(sql_query).fetchdf()
# Gradio UI: a Hub dataset picker, a free-text question box and a button
# whose click runs query_dataset() and shows the result in a table.
with gr.Blocks() as demo:
    # Emoji restored from mojibake (UTF-8 bytes shown as single-byte chars).
    gr.Markdown("# Query your HF Datasets with Natural Language 📈📊")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Find your favorite dataset...",
        search_type="dataset",
        value="jamescalam/world-cities-geo",
    )
    query_input = gr.Textbox("", label="Ask anything...")
    btn = gr.Button("Ask 🪄")
    df = gr.DataFrame(datatype="markdown")

    # Event wiring has to happen inside the Blocks context.
    btn.click(
        query_dataset,
        inputs=[dataset_name, query_input],
        outputs=[df],
    )

# Start the web server only when the file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()