cboettig committed on
Commit
bc52b5c
•
1 Parent(s): 458900b

so it begins...

Browse files
Files changed (4) hide show
  1. .gitignore +11 -0
  2. US_NE.parquet +3 -0
  3. app.py +278 -0
  4. requirements.txt +18 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .Rproj.user
2
+ .Rhistory
3
+ .RData
4
+ .Ruserdata
5
+ .ipynb_checkpoints
6
+ *.Rproj
7
+ *.duckdb
8
+ *.wal
9
+ *.vrt
10
+ .streamlit
11
+ __pycache__
US_NE.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1e5c6b7a344e1eac12daf0e9cd4cf6155df8c4943eb4fd9af0899d7db16cd2e
3
+ size 21506681033
app.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import leafmap.maplibregl as leafmap
3
+ import pandas as pd
4
+ import numpy as np
5
+ from matplotlib import cm
6
+ import ibis
7
+ from ibis import _
8
+ from huggingface_hub import HfApi, login
9
+ from langchain_openai import ChatOpenAI
10
+ from langchain_community.utilities import SQLDatabase
11
+ from langchain.chains import create_sql_query_chain
12
+
13
+
14
+ # +
15
+ ## Benchmark possible access locations, but local blows everything else away
16
+ # h3_parquet = "https://huggingface.co/datasets/boettiger-lab/gbif/resolve/main/gbif_ca.geoparquet"
17
+
18
+ h3_parquet = "US_NE.parquet"
19
+ # with more local storage space, full US works fine.
20
+ #h3_parquet = "/home/rstudio/source.coop/cboettig/gbif/gbif_us_h3.parquet"
21
+
22
+ con = ibis.duckdb.connect(extensions=["spatial", "httpfs"])
23
+ gbif_h3 = con.read_parquet(h3_parquet, "gbif_h3")
24
+ # -
25
+
26
+
27
+ st.set_page_config(page_title="GBIF Observations Explorer", layout="wide")
28
+ st.header("GBIF Observations Explorer", divider="rainbow")
29
+
30
+
31
+ ## We have actually pre-calculated most of these so we don't need them here.
32
@ibis.udf.scalar.builtin
def h3_latlng_to_cell(lat: float, lng: float, zoom: int) -> int:
    """Declare the backend's ``h3_latlng_to_cell`` function for use in ibis.

    The Python body is only a placeholder: the ``builtin`` decorator maps
    this signature onto a function that already exists in the database.
    Some DuckDB versions need the ``h3`` community extension installed and
    loaded manually before this can be called.
    """
35
+
36
@ibis.udf.scalar.builtin
def hex(array) -> str:
    """Declare the backend's ``hex`` function for use in ibis expressions.

    The Python body is only a placeholder; the database supplies the
    implementation.

    NOTE(review): this shadows Python's builtin ``hex`` on purpose --
    presumably the Python name has to match the backend function name.
    Confirm before renaming.
    """
39
+
40
@ibis.udf.scalar.builtin
def h3_cell_to_boundary_wkt(array) -> str:
    """Declare the backend's ``h3_cell_to_boundary_wkt`` function for ibis.

    The Python body is only a placeholder; the database supplies the
    implementation. Some DuckDB versions need the ``h3`` community
    extension installed and loaded manually before this can be called.
    """
43
+ ## some versions need this manual install of h3 duckdb extension
44
+ # con.raw_sql('''
45
+ # INSTALL h3 FROM community;
46
+ # LOAD h3;
47
+ # ''')
48
+
49
+
50
+ # -
51
+
52
def filter_gbif(_df, species="Canis lupus", bbox=(-130., 30., -90., 60.)):
    """Restrict an occurrence table to one species inside a bounding box.

    Args:
        _df: Table expression exposing ``decimallongitude``,
            ``decimallatitude`` and ``species`` columns.
        species: Value compared for equality against the ``species`` column.
        bbox: Indexable sequence read as
            ``[min_lng, min_lat, max_lng, max_lat]``. Lower bounds are
            inclusive, upper bounds exclusive. The default is a tuple so
            that no mutable object is shared between calls; lists are
            still accepted.

    Returns:
        The result of ``_df.filter(...)`` with all five conditions applied.
    """
    min_lng, min_lat, max_lng, max_lat = bbox[0], bbox[1], bbox[2], bbox[3]
    return _df.filter(
        _.decimallongitude >= min_lng,
        _.decimallongitude < max_lng,
        _.decimallatitude >= min_lat,
        _.decimallatitude < max_lat,
        _.species == species,
    )
61
+
62
def get_h3point_df(_df, resolution: int) -> pd.DataFrame:
    """Count records per H3 cell and attach viridis colors for plotting.

    Args:
        _df: Table expression that has a pre-computed cell column named
            ``h<resolution>`` (for example ``h6``).
        resolution: H3 resolution selecting which ``h<resolution>`` column
            to aggregate on. It is coerced with ``int()`` so that a value
            such as ``6.0`` still resolves to ``"h6"`` rather than ``"h6.0"``.

    Returns:
        A pandas DataFrame with one row per cell and the columns ``hex``,
        ``n`` (record count), ``v`` (``log(n)``), ``normalized_values``
        (``v`` divided by its maximum), ``rgb`` (list of 0-255 RGBA
        integers) and ``viridis_hex`` (``#rrggbb`` string).
    """
    column = f"h{int(resolution)}"
    df = (
        _df
        .rename(hex=column)
        .group_by(_.hex)
        .agg(n=_.count())
        # .mutate(wkt = h3_cell_to_boundary_wkt(_.hex))
        .mutate(v=_.n.log())
        .mutate(normalized_values=_.v / _.v.max())
        .to_pandas()
    )

    # If every cell holds exactly one record, v is 0 everywhere and
    # v / max(v) is undefined. Treat those cells like any other v == 0 cell
    # instead of letting missing values reach the colormap and the layer.
    df["normalized_values"] = df["normalized_values"].fillna(0.0)

    rgba = cm.viridis(df["normalized_values"].to_numpy())
    rgba_255 = np.round(rgba * 255).astype(int).clip(0, 255).tolist()
    df["rgb"] = rgba_255
    # Build the hex string from the same rounded integers as ``rgb`` so the
    # two color columns always agree.
    df["viridis_hex"] = [
        f"#{red:02x}{green:02x}{blue:02x}"
        for red, green, blue, _alpha in rgba_255
    ]
    return df
79
+
80
+
81
+ # +
82
+ login(st.secrets["HF_TOKEN"])
83
def host_df(df, filename = "live.json", repo_id="boettiger-lab/gbif"):
    """Upload a DataFrame as JSON to a Hugging Face dataset repo.

    The frame is first written to ``.static/<filename>`` as a JSON array of
    records, then uploaded to ``live/<filename>`` in the dataset ``repo_id``.

    Args:
        df: Object providing ``to_json`` (a pandas DataFrame in this app).
        filename: File name used both locally and inside the repo.
        repo_id: Target Hugging Face dataset repository.

    Returns:
        A ``resolve`` URL pinned to the commit created by the upload, so
        that clients never receive a cached copy of an older file.
    """
    from pathlib import Path

    local_path = Path(".static") / filename
    # The staging directory is not tracked in the repo; create it on demand
    # so the first run on a fresh checkout does not fail.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_json(str(local_path), orient='records', indent=2)

    api = HfApi()
    info = api.upload_file(
        path_or_fileobj=str(local_path),
        path_in_repo="live/" + filename,
        repo_id=repo_id,
        repo_type="dataset",
    )
    # to avoid cache, use unique commit url
    commit_hash = info.oid
    return f"https://huggingface.co/datasets/{repo_id}/resolve/{commit_hash}/live/{filename}"
95
+
96
def hex_layer(m, df: pd.DataFrame, v_scale = 1):
    """Publish ``df`` and add it to map ``m`` as an H3 hexagon deck layer.

    Args:
        m: Map object providing ``add_deck_layers``.
        df: Frame passed to ``host_df``; the layer reads its ``hex``,
            ``rgb`` and ``normalized_values`` fields.
        v_scale: Power of ten applied to the base elevation scale of 5000.

    Returns:
        Whatever ``m.add_deck_layers`` returns.
    """
    # The layer loads its data from a URL, so the frame is hosted first.
    data_url = host_df(df)
    elevation_scale = 5000 * 10 ** v_scale

    layer_spec = {}
    layer_spec["@@type"] = "H3HexagonLayer"
    layer_spec["id"] = "my-layer"
    layer_spec["data"] = data_url
    layer_spec["getHexagon"] = "@@=hex"
    layer_spec["getFillColor"] = "@@=rgb"
    layer_spec["getElevation"] = "@@=normalized_values"
    layer_spec["elevationScale"] = elevation_scale
    layer_spec["elevationRange"] = [0,1]

    return m.add_deck_layers([layer_spec], "occurrences")
110
+
111
+
112
+ # +
113
+ # #%%time
114
def local_test():
    """Build a small demo map of Canis lupus records for manual checking.

    Returns:
        A leafmap map with the hexagon layer added.
    """
    # NOTE(review): the map center (-121.4, 37.74) lies outside the
    # longitude range of this bounding box (-120 to -118) -- confirm intended.
    test_bbox = [-120, 37, -118, 39]
    hex_counts = get_h3point_df(
        filter_gbif(gbif_h3, species = "Canis lupus", bbox = test_bbox),
        6,
    )
    demo_map = leafmap.Map(style="openstreetmap", center=(-121.4, 37.74), zoom=7,)
    hex_layer(demo_map, hex_counts)
    return demo_map
121
+
122
+
123
+ #local_test()
124
+
125
+ # +
126
+ import os
127
+ import streamlit as st
128
+ os.environ["MAPTILER_KEY"] = st.secrets["MAPTILER_KEY"]
129
+
130
+ import leafmap.maplibregl as leafmap
131
+ m = leafmap.Map(style="positron", center=(-121.4, 37.50), zoom=7,)
132
+ #m
133
+
134
+
135
+ # +
136
+ # Set up Langchain SQL access
137
+ db = SQLDatabase.from_uri("duckdb:///tmp.duckdb", view_support=True)
138
+ db.run(f"create or replace view gbif_h3 as select * from read_parquet('{h3_parquet}');")
139
+ llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])
140
+ chain = create_sql_query_chain(llm, db)
141
+
142
+ # FIXME Move additional advice into system prompt
143
+
144
+ example_question = "Show me all birds"
145
+ additional_advice = '''
146
+ . Return all matching columns using SELECT * in the query.
147
+ You must use only the space-separated two-word binomial scientific name as the "species" column,
148
+ and not the "genus" column, such as "species"="Homo sapiens".
149
+ Avoid double quoting. Do not use LIMIT, always return all results.
150
+ Do not include explanations of queries.
151
+ '''
152
+
153
+
154
+ #@st.cache_data
155
def manual_query(species, zoom):
    """Return per-cell counts for ``species`` at H3 resolution ``zoom``.

    Filters the module-level ``gbif_h3`` table with ``filter_gbif`` (default
    bounding box) and aggregates it with ``get_h3point_df``.
    """
    return get_h3point_df(filter_gbif(gbif_h3, species), zoom)
159
+
160
+
161
+ #@st.cache_data
162
def chat_query(query, zoom):
    """Run a SQL string and return per-cell counts at H3 resolution ``zoom``.

    Args:
        query: SQL text executed on the module-level connection ``con``.
        zoom: H3 resolution passed on to ``get_h3point_df``.
    """
    # NOTE(review): in this app ``query`` is produced by the LLM chain and is
    # executed as-is -- consider validating it before running.
    return get_h3point_df(con.sql(query), zoom)
166
+
167
+
168
+ source = {
169
+ "url": "https://data.source.coop/cboettig/us-boundaries/mappinginequality.json",
170
+ "type": "vector",
171
+ }
172
+
173
+ layer = {
174
+ "id": "mappinginequality",
175
+ "source": "mappinginequality",
176
+ "source-layer": "mappinginequality",
177
+ "type": "fill",
178
+ "min-zoom": 15,
179
+ "paint": {"fill-color": ["get", "fill"], "fill-opacity": 0.8},
180
+ }
181
+
182
+
183
+
184
+ # +
185
+ col1, col2, col3 = st.columns(3)
186
+
187
+ with col1:
188
+ zoom = st.slider("H3 resolution", min_value=2, max_value=11, value=9)
189
+ v_scale = st.slider("vertical scale", min_value=-3, max_value=3, value=0)
190
+
191
+ with col2:
192
+ "🌍 Data Layers"
193
+
194
+ if st.toggle("satellite"):
195
+ m.add_basemap("satellite")
196
+
197
+ if st.toggle("redlining"):
198
+ # redlining = "https://dsl.richmond.edu/panorama/redlining/static/mappinginequality.json"
199
+ # redlining = "https://dsl.richmond.edu/panorama/redlining/static/citiesData/CASanFrancisco1937/geojson.json"
200
+ # redlining = "https://data.source.coop/cboettig/us-boundaries/mappinginequality.json"
201
+ #redlining = "https://data.source.coop/cboettig/us-boundaries/mappinginequality.pmtiles"
202
+ redlining = "https://dsl.richmond.edu/panorama/redlining/static/citiesData/CTNewHaven1937/geojson.json"
203
+
204
+ paint = {"fill-color": ["get", "fill"], "fill-opacity": 0.8}
205
+ m.add_geojson(redlining, layer_type="fill", name = "redlining", paint=paint)
206
+ # m.add_pmtiles(redlining, layer_type="fill", name = "redlining", paint=paint, fit_bounds = False)
207
+ # m.add_source("mappinginequality", source)
208
+ # m.add_layer(layer)
209
+ # if st.toggle("Threatened Species Richness"):
210
+ # m.add_tile_layer(url="https://data.source.coop/cboettig/mobi/tiles/red/species-richness-all/{z}/{x}/{y}.png",
211
+ # name="MOBI Species Richness",
212
+ # attribution="NatureServe",
213
+ # opacity=0.9
214
+ # )
215
+
216
+ with col3:
217
+ species = st.text_input("Species name:", "Canis latrans")
218
+ df = manual_query(species, zoom)
219
+
220
+ chatbox = st.container()
221
+ st.markdown("🦜 Or try our chat-based query:")
222
+ if prompt := st.chat_input(example_question, key="chain"):
223
+ st.chat_message("user").write(prompt)
224
+ with st.chat_message("assistant"):
225
+ query = chain.invoke({"question": prompt + additional_advice})
226
+ st.write(query)
227
+ df = chat_query(query, zoom)
228
+ # if st.button("refresh"):
229
+ # chat_query.clear()
230
+
231
+ " "
232
+ st.divider()
233
+
234
+
235
+
236
+ # +
237
+ # with col2:
238
+ # min_lng, max_lng = st.slider(
239
+ # "Select longitude range",
240
+ # min_value=-130.0,
241
+ # max_value=-65.0,
242
+ # value=(-128.0, -115.0), # Default selected range
243
+ # step=0.1)
244
+ # min_lat, max_lat = st.slider(
245
+ # "Select latitude range",
246
+ # min_value=20.0,
247
+ # max_value=70.0,
248
+ # value=(30.0, 42.0), # Default selected range
249
+ # step=0.1)
250
+ # -
251
+
252
+
253
+
254
+
255
+
256
+ # +
257
# Container that holds the rendered map. Named ``map_container`` rather than
# ``map`` so the Python builtin of that name is not shadowed at module level.
map_container = st.container()

with map_container:
    # Add the occurrence hexagons, then the layer control, then render.
    hex_layer(m, df, v_scale)
    m.add_layer_control(position="top-left")
    m.to_streamlit()
263
+
264
+
265
+
266
+ # +
267
+
268
+
269
+ st.divider()
270
+
271
+ '''
272
+
273
+ ## Credits
274
+
275
+ DRAFT. Open Source Software developed at UC Berkeley.
276
+
277
+ '''
278
+
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ duckdb==1.0.0
2
+ pandas==2.2.2
3
+ git+https://github.com/eodaGmbH/py-maplibregl@feature/color-utils
4
+ leafmap[maplibre]
5
+ ibis-framework[duckdb]==9.1.0
6
+ streamlit==1.35.0
7
+ streamlit_folium==0.20.0
8
+ altair==5.3.0
9
+ referencing==0.35.1
10
+ rasterio==1.3.10
11
+ shapely==2.0.4
12
+ shiny==0.10.2
13
+ huggingface_hub
14
+ duckdb-engine
15
+ langchain
16
+ langchain-community
17
+ langchain-openai
18
+ streamlit