File size: 4,634 Bytes
02c6b99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4bfd881f-5889-4a26-b003-e2611708ad2a",
   "metadata": {},
   "source": [
    "# Getting city polygons from Overture Maps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e5756d2-382b-49e9-93b5-2ecf6d0eb812",
   "metadata": {},
   "outputs": [],
   "source": [
    "import duckdb\n",
    "\n",
    "con = duckdb.connect()\n",
    "\n",
    "con.execute(\"SET s3_region='us-west-2';\")\n",
    "con.execute(\"LOAD spatial;\")\n",
    "con.execute(\"LOAD httpfs;\")\n",
    "\n",
    "# getting polygons of localities in the US.\n",
    "query = \"\"\"\n",
    "    COPY (\n",
    "        SELECT * \n",
    "        FROM read_parquet('s3://overturemaps-us-west-2/release/2024-09-18.0/theme=divisions/*/*')\n",
    "        WHERE country = 'US' AND subtype IN ('locality')\n",
    "    ) TO 'us_localities_raw.parquet' (FORMAT 'parquet');\n",
    "\"\"\"\n",
    "con.execute(query)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25f62dd7-5539-438b-8f0a-1d85c9bc78ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ibis\n",
    "from ibis import _\n",
    "\n",
    "conn = ibis.duckdb.connect(extensions=[\"spatial\"])\n",
    "\n",
    "df = (conn\n",
    "      .read_parquet(\"us_localities_raw.parquet\")\n",
    "      .cast({\"geometry\": \"geometry\"})\n",
    "      .filter(_[\"type\"] == \"division\")\n",
    "      .mutate(municipal = _.names[\"primary\"])\n",
    "      .mutate(state = _.region.replace(\"US-\", \"\")) \n",
    "      .mutate(county = _.hierarchies[0][2]['name'] ) #extract county from nested dictionary \n",
    "      .mutate(key_long = _.municipal + ibis.literal('-') + _.county + ibis.literal('-') + _.state)\n",
    "      .select(\"key_long\",\"municipal\", \"county\",\"state\" ,\"geometry\")\n",
    "     )\n",
    "\n",
    "\n",
    "## Dropping rows with same locality and state, with differing counties - landvote doesn't specify county for cities so we are dropping these to avoid duplicates. \n",
    "county_count = (\n",
    "    df.group_by([\"municipal\", \"state\"])\n",
    "    .aggregate(county_count=_.county.nunique())  # Count unique counties for each group\n",
    ") \n",
    "valid_names = county_count.filter(county_count.county_count == 1).select(\"municipal\", \"state\")\n",
    "df_filtered = df.join(valid_names, [\"municipal\", \"state\"], how=\"inner\")\n",
    "\n",
    "\n",
    "# if two records have the same name but different geometries, only keep the first one. \n",
    "df_unique = (\n",
    "    df_filtered.group_by(\"key_long\")\n",
    "    .aggregate(\n",
    "        municipal=df_filtered.municipal.first(),\n",
    "        county=df_filtered.county.first(),\n",
    "        state=df_filtered.state.first(),\n",
    "        geometry=df_filtered.geometry.first()\n",
    "    )\n",
    "    .mutate(geometry = _.geometry.buffer(.07))\n",
    "    .select(\"state\",\"county\",\"municipal\",\"geometry\")\n",
    ")\n",
    "\n",
    "df_unique.execute().to_parquet(\"us_localities.parquet\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fce9fe8-584f-4260-9217-3aade9e71eef",
   "metadata": {},
   "source": [
    "# Uploading city polygons to Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca02743f-0bf4-46e5-91fd-a5fe37519ecd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import subprocess\n",
    "import os\n",
    "from huggingface_hub import HfApi, login\n",
    "import streamlit as st\n",
    "\n",
    "login(st.secrets[\"HF_TOKEN\"])\n",
    "api = HfApi()\n",
    "\n",
    "def hf_upload(file, repo_id):\n",
    "    info = api.upload_file(\n",
    "            path_or_fileobj=file,\n",
    "            path_in_repo=file,\n",
    "            repo_id=repo_id,\n",
    "            repo_type=\"dataset\",\n",
    "        )\n",
    "hf_upload(\"us_localities.parquet\", \"boettiger-lab/landvote\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ddadf0a-5b45-487f-a664-0c0696f75579",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}