{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4bfd881f-5889-4a26-b003-e2611708ad2a",
   "metadata": {},
   "source": [
    "# Getting city polygons from Overture Maps\n",
    "\n",
    "Copies the US `locality` records of the Overture Maps divisions theme (release 2024-09-18.0) to a local parquet file, keeps one buffered geometry per municipality-county-state key, and uploads the result to Hugging Face."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e5756d2-382b-49e9-93b5-2ecf6d0eb812",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "\n",
    "con = duckdb.connect()\n",
    "\n",
    "# Install (a no-op when already present) and load the extensions first.\n",
    "# `s3_region` is an httpfs setting, so httpfs has to be available before the\n",
    "# SET below; setting it first only works when DuckDB autoloads the extension.\n",
    "con.execute(\"INSTALL spatial;\")\n",
    "con.execute(\"LOAD spatial;\")\n",
    "con.execute(\"INSTALL httpfs;\")\n",
    "con.execute(\"LOAD httpfs;\")\n",
    "con.execute(\"SET s3_region='us-west-2';\")\n",
    "\n",
    "# Copy every US record of subtype 'locality' from the divisions theme into a\n",
    "# local parquet file, so the next cell reads from disk instead of S3.\n",
    "query = \"\"\"\n",
    " COPY (\n",
    " SELECT * \n",
    " FROM read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=divisions/*/*')\n",
    " WHERE country = 'US' AND subtype IN ('locality')\n",
    " ) TO 'us_localities_raw.parquet' (FORMAT 'parquet');\n",
    "\"\"\"\n",
    "con.execute(query)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25f62dd7-5539-438b-8f0a-1d85c9bc78ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ibis\n",
    "from ibis import _\n",
    "\n",
    "# Distance passed to `geometry.buffer`, expressed in the units of the geometry's\n",
    "# coordinates (presumably degrees of lon/lat -- TODO confirm).\n",
    "BUFFER_DISTANCE = .07\n",
    "\n",
    "conn = ibis.duckdb.connect(extensions=[\"spatial\"])\n",
    "\n",
    "df = (\n",
    "    conn.read_parquet(\"us_localities_raw.parquet\")\n",
    "    .cast({\"geometry\": \"geometry\"})\n",
    "    .filter(_[\"type\"] == \"division\")\n",
    "    .mutate(municipal=_.names[\"primary\"])\n",
    "    .mutate(state=_.region.replace(\"US-\", \"\"))\n",
    "    .mutate(county=_.hierarchies[0][2][\"name\"])  # extract county name from the nested hierarchies field\n",
    "    .mutate(key_long=_.municipal + ibis.literal(\"-\") + _.county + ibis.literal(\"-\") + _.state)\n",
    "    .select(\"key_long\", \"municipal\", \"county\", \"state\", \"geometry\")\n",
    ")\n",
    "\n",
    "# Drop rows whose (municipal, state) pair occurs with differing counties:\n",
    "# landvote doesn't specify a county for cities, so we drop these to avoid duplicates.\n",
    "county_count = (\n",
    "    df.group_by([\"municipal\", \"state\"])\n",
    "    .aggregate(county_count=_.county.nunique())  # unique counties for each (municipal, state)\n",
    ")\n",
    "valid_names = county_count.filter(county_count.county_count == 1).select(\"municipal\", \"state\")\n",
    "df_filtered = df.join(valid_names, [\"municipal\", \"state\"], how=\"inner\")\n",
    "\n",
    "# If several records share the same key but have different geometries, keep only the first one.\n",
    "# NOTE(review): a NULL county presumably makes key_long NULL, so those rows would\n",
    "# all fall into a single group -- confirm that this is acceptable.\n",
    "df_unique = (\n",
    "    df_filtered.group_by(\"key_long\")\n",
    "    .aggregate(\n",
    "        municipal=_.municipal.first(),\n",
    "        county=_.county.first(),\n",
    "        state=_.state.first(),\n",
    "        geometry=_.geometry.first(),\n",
    "    )\n",
    "    .mutate(geometry=_.geometry.buffer(BUFFER_DISTANCE))\n",
    "    .select(\"state\", \"county\", \"municipal\", \"geometry\")\n",
    ")\n",
    "\n",
    "df_unique.execute().to_parquet(\"us_localities.parquet\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fce9fe8-584f-4260-9217-3aade9e71eef",
   "metadata": {},
   "source": [
    "# Uploading city polygons to Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca02743f-0bf4-46e5-91fd-a5fe37519ecd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "import os\n",
    "from huggingface_hub import HfApi, login\n",
    "import streamlit as st\n",
    "\n",
    "# The token is read from Streamlit's secrets so it is never hardcoded in the notebook.\n",
    "login(st.secrets[\"HF_TOKEN\"])\n",
    "api = HfApi()\n",
    "\n",
    "def hf_upload(file, repo_id):\n",
    "    \"\"\"Upload a local file to a Hugging Face dataset repository.\n",
    "\n",
    "    The file keeps its local path as its path inside the repository.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file : path of the local file to upload.\n",
    "    repo_id : id of the target dataset repository.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Whatever `HfApi.upload_file` returns for the upload.\n",
    "    \"\"\"\n",
    "    return api.upload_file(\n",
    "        path_or_fileobj=file,\n",
    "        path_in_repo=file,\n",
    "        repo_id=repo_id,\n",
    "        repo_type=\"dataset\",\n",
    "    )\n",
    "\n",
    "# The trailing `;` keeps the returned value out of the cell output.\n",
    "hf_upload(\"us_localities.parquet\", \"boettiger-lab/landvote\");\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ddadf0a-5b45-487f-a664-0c0696f75579",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}