In [None]:
import duckdb

# Pull U.S. locality and neighborhood divisions from the Overture Maps
# 2024-09-18.0 release on S3 and cache them in a local Parquet file, so the
# later cells can work from disk instead of re-reading the remote bucket.
con = duckdb.connect()

# INSTALL does nothing when the extension is already present; without it,
# LOAD fails in an environment where the extension was never installed.
con.execute("INSTALL spatial;")
con.execute("LOAD spatial;")
con.execute("INSTALL httpfs;")
con.execute("LOAD httpfs;")

# s3_region is a setting provided by httpfs, so set it only after the
# extension has been loaded.
con.execute("SET s3_region='us-west-2';")

query = """
    COPY (
        SELECT * 
        FROM read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=divisions/*/*')
        WHERE country = 'US' AND subtype IN ('locality', 'neighborhood')
    ) TO 'us_localities_neighborhoods.parquet' (FORMAT 'parquet');
"""
con.execute(query)



In [None]:
import ibis
from ibis import _

conn = ibis.duckdb.connect(extensions=["spatial"])

# Start from the locally cached extract written by the DuckDB cell.
raw_divisions = conn.read_parquet("us_localities_neighborhoods.parquet")

# Keep locality-level divisions only and derive a readable
# "<name>-<county>-<state>" key for each of them.
df = (
    raw_divisions
    .cast({"geometry": "geometry"})
    .filter(_["type"] == "division", _["subtype"] == "locality")
    .mutate(
        name=_.names["primary"],
        state_id=_.region.replace("US-", ""),
        county=_.hierarchies[0][2]["name"],
    )
    .mutate(
        key_long=_.name + ibis.literal("-") + _.county + ibis.literal("-") + _.state_id
    )
    .select("key_long", "name", "county", "state_id", "geometry")
)

# A (name, state) pair that shows up under more than one county is ambiguous:
# count the distinct counties per pair and keep only the pairs with exactly one.
county_count = df.group_by(["name", "state_id"]).aggregate(
    county_count=_.county.nunique()
)
valid_names = (
    county_count
    .filter(county_count["county_count"] == 1)
    .select("name", "state_id")
)
df_filtered = df.join(valid_names, predicates=["name", "state_id"], how="inner")

# Several records can share one key while carrying different geometries;
# collapse each key to a single row by taking the first value of every column.
first_value_columns = ("name", "county", "state_id", "geometry")
df_first = df_filtered.group_by("key_long").aggregate(
    **{column: df_filtered[column].first() for column in first_value_columns}
)

# Materialize the result locally and write it out as Parquet.
localities_frame = df_first.execute()
localities_frame.to_parquet("us_localities.parquet")


In [None]:
import subprocess
import os
from huggingface_hub import HfApi, login
import streamlit as st

# Authenticate against the Hugging Face Hub with the token kept in the
# Streamlit secrets store, then create the API client used for uploads.
hf_token = st.secrets["HF_TOKEN"]
login(token=hf_token)
api = HfApi()

def hf_upload(file, repo_id):
    """Upload a local file to a Hugging Face dataset repository.

    The file is stored in the repository under the same path it has locally.

    Args:
        file: Path of the local file to upload; also used as the path inside
            the repository.
        repo_id: Identifier of the target dataset repository ("owner/name").

    Returns:
        Whatever ``api.upload_file`` returns for the upload, so the caller
        can inspect or log the outcome instead of it being discarded.
    """
    # Relies on the module-level `api` client created after login.
    return api.upload_file(
        path_or_fileobj=file,
        path_in_repo=file,
        repo_id=repo_id,
        repo_type="dataset",
    )
# Publish the de-duplicated localities file to the project's dataset repo.
hf_upload(file="us_localities.parquet", repo_id="boettiger-lab/landvote")

