# redlining / app.py
# cboettig's picture
# so it begins...
# bc52b5c
import streamlit as st
import leafmap.maplibregl as leafmap
import pandas as pd
import numpy as np
from matplotlib import cm
import ibis
from ibis import _
from huggingface_hub import HfApi, login
from langchain_openai import ChatOpenAI
from langchain_community.utilities import SQLDatabase
from langchain.chains import create_sql_query_chain
# +
## Data source for the app. Several access locations were benchmarked; a local
## parquet file is far faster than any of the remote alternatives.
# h3_parquet = "https://huggingface.co/datasets/boettiger-lab/gbif/resolve/main/gbif_ca.geoparquet"
h3_parquet = "US_NE.parquet"
# with more local storage space, full US works fine.
#h3_parquet = "/home/rstudio/source.coop/cboettig/gbif/gbif_us_h3.parquet"
# DuckDB connection (through ibis) with the spatial and httpfs extensions requested.
con = ibis.duckdb.connect(extensions=["spatial", "httpfs"])
# Register the parquet file on that connection under the table name "gbif_h3".
gbif_h3 = con.read_parquet(h3_parquet, "gbif_h3")
# -
# Page-level Streamlit configuration, followed by the page header.
st.set_page_config(page_title="GBIF Observations Explorer", layout="wide")
st.header("GBIF Observations Explorer", divider="rainbow")
## We have actually pre-calculated most of these so we don't need them here.
@ibis.udf.scalar.builtin
def h3_latlng_to_cell(lat: float, lng: float, zoom: int) -> int:
    """Declare the backend function ``h3_latlng_to_cell`` for use in ibis expressions.

    The body is a placeholder (``...``); only the name and the annotated
    signature are declared here.
    NOTE(review): presumably supplied by the DuckDB h3 extension and mapping a
    latitude/longitude pair to a cell id at the given resolution -- confirm.
    """
    ...
@ibis.udf.scalar.builtin
def hex(array) -> str:
    """Declare the backend function ``hex`` for use in ibis expressions.

    The body is a placeholder (``...``). The argument is deliberately left
    un-annotated, exactly as in the declaration.
    Note: this module-level definition shadows Python's builtin ``hex`` for
    the rest of the file.
    """
    ...
@ibis.udf.scalar.builtin
def h3_cell_to_boundary_wkt (array) -> str:
    """Declare the backend function ``h3_cell_to_boundary_wkt`` for ibis expressions.

    The body is a placeholder (``...``).
    NOTE(review): presumably supplied by the DuckDB h3 extension and returning
    the cell outline as WKT text -- confirm. Its only use in this file is in a
    commented-out ``mutate`` call.
    """
    ...
## some versions need this manual install of h3 duckdb extension
# con.raw_sql('''
# INSTALL h3 FROM community;
# LOAD h3;
# ''')
# -
def filter_gbif(_df, species="Canis lupus", bbox=(-130.0, 30.0, -90.0, 60.0)):
    """Restrict an occurrence table to one species inside a bounding box.

    Args:
        _df: Table expression exposing ``decimallongitude``,
            ``decimallatitude`` and ``species`` columns.
        species: Value compared for equality against the ``species`` column.
        bbox: Four-element sequence ``(min_lng, min_lat, max_lng, max_lat)``.
            Lower bounds are inclusive, upper bounds are exclusive. The
            default is an immutable tuple so it cannot be altered between
            calls; lists are still accepted.

    Returns:
        The result of ``_df.filter(...)`` with all five conditions applied.
    """
    min_lng, min_lat, max_lng, max_lat = bbox[0], bbox[1], bbox[2], bbox[3]
    return _df.filter(
        _.decimallongitude >= min_lng,
        _.decimallongitude < max_lng,
        _.decimallatitude >= min_lat,
        _.decimallatitude < max_lat,
        _.species == species,
    )
def get_h3point_df(_df, resolution: int) -> pd.DataFrame:
    """Count records per H3 cell and attach a viridis colour to every cell.

    Args:
        _df: Table expression containing a cell column named
            ``h<resolution>`` (for example ``h6``).
        resolution: Integer suffix selecting which ``h<resolution>`` column
            is aggregated.

    Returns:
        A pandas DataFrame with one row per cell and the columns ``hex``
        (the cell id), ``n`` (record count), ``v`` (log of ``n``),
        ``normalized_values`` (``v`` divided by the largest ``v``), ``rgb``
        (list of 0-255 integer colour channels) and ``viridis_hex``
        (``#rrggbb`` string built from the same integers as ``rgb``).
    """
    column = f"h{resolution}"
    df = (
        _df.rename(hex=column)
        .group_by(_.hex)
        .agg(n=_.count())
        # .mutate(wkt = h3_cell_to_boundary_wkt(_.hex))
        .mutate(v=_.n.log())
        .mutate(normalized_values=_.v / _.v.max())
        .to_pandas()
    )

    # If every cell holds exactly one record, every log-count is 0 and the
    # ratio above is 0/0. Treat those cells like any other single-record cell
    # (value 0) instead of letting missing values reach the colour map and
    # the JSON output.
    df["normalized_values"] = df["normalized_values"].fillna(0.0)

    # Colour-map output is scaled to 0-255 integers once; both colour columns
    # are derived from it so they always agree.
    channels = (
        np.round(cm.viridis(df["normalized_values"]) * 255)
        .astype(int)
        .clip(0, 255)
        .tolist()
    )
    df["rgb"] = channels
    df["viridis_hex"] = [f"#{c[0]:02x}{c[1]:02x}{c[2]:02x}" for c in channels]
    return df
# +
# Authenticate with the Hugging Face Hub using the token stored in Streamlit secrets.
login(st.secrets["HF_TOKEN"])


def host_df(df, filename="live.json", repo_id="boettiger-lab/gbif"):
    """Write ``df`` to JSON, upload it to a Hub dataset repo and return its URL.

    Args:
        df: DataFrame to serialise (records orientation, 2-space indent).
        filename: File name used both locally under ``.static/`` and in the
            repository under ``live/``.
        repo_id: Target dataset repository.

    Returns:
        A ``resolve`` URL that embeds the commit id returned by the upload,
        so each upload gets a unique address that bypasses caches.
    """
    import os  # function-scope import keeps this block self-contained

    local_path = os.path.join(".static", filename)
    # The staging directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    df.to_json(local_path, orient="records", indent=2)

    info = HfApi().upload_file(
        path_or_fileobj=local_path,
        path_in_repo="live/" + filename,
        repo_id=repo_id,
        repo_type="dataset",
    )
    # to avoid cache, use unique commit url
    commit_hash = info.oid
    return f"https://huggingface.co/datasets/{repo_id}/resolve/{commit_hash}/live/{filename}"
def hex_layer(m, df: pd.DataFrame, v_scale=1):
    """Publish ``df`` and add it to map ``m`` as an extruded H3 hexagon layer.

    Args:
        m: Map object providing ``add_deck_layers``.
        df: Per-cell table; the layer reads its ``hex``, ``rgb`` and
            ``normalized_values`` fields.
        v_scale: Power-of-ten exponent applied to the base elevation scale
            of 5000.

    Returns:
        Whatever ``m.add_deck_layers`` returns.
    """
    # The layer loads its data by URL, so the frame is uploaded first.
    data_url = host_df(df)
    elevation_scale = 5000 * 10 ** v_scale
    layers = [
        {
            "@@type": "H3HexagonLayer",
            "id": "my-layer",
            "data": data_url,
            "getHexagon": "@@=hex",
            "getFillColor": "@@=rgb",
            "getElevation": "@@=normalized_values",
            "elevationScale": elevation_scale,
            "elevationRange": [0, 1],
        }
    ]
    return m.add_deck_layers(layers, "occurrences")
# +
# #%%time
def local_test():
    """Build and return a sample map for one species in a fixed bounding box.

    Runs the same filter -> aggregate -> layer steps as the app, using
    "Canis lupus", the box [-120, 37, -118, 39] and the ``h6`` column.
    Because it goes through ``hex_layer``, it also uploads the data.
    """
    subset = filter_gbif(gbif_h3, species="Canis lupus", bbox=[-120, 37, -118, 39])
    cells = get_h3point_df(subset, 6)
    test_map = leafmap.Map(
        style="openstreetmap",
        center=(-121.4, 37.74),
        zoom=7,
    )
    hex_layer(test_map, cells)
    return test_map
#local_test()
# +
import os
# streamlit and leafmap are already imported at the top of the file; repeating
# the imports here keeps this cell self-contained but does not reload them.
import streamlit as st
# Expose the MapTiler API key from Streamlit secrets through the environment.
# NOTE(review): presumably read by leafmap when the "positron" style is used -- confirm.
os.environ["MAPTILER_KEY"] = st.secrets["MAPTILER_KEY"]
import leafmap.maplibregl as leafmap
# Base map; overlays and the hexagon layer are added to it further down.
m = leafmap.Map(style="positron", center=(-121.4, 37.50), zoom=7,)
#m
# +
# Set up Langchain SQL access
# LangChain gets its own database handle (file "tmp.duckdb"), separate from `con`.
db = SQLDatabase.from_uri("duckdb:///tmp.duckdb", view_support=True)
# Expose the same parquet file there as a view named gbif_h3, matching the
# table name registered on `con`, where the generated SQL is later executed.
db.run(f"create or replace view gbif_h3 as select * from read_parquet('{h3_parquet}');")
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])
# Chain that turns a natural-language question into a SQL query string.
chain = create_sql_query_chain(llm, db)
# FIXME Move additional advice into system prompt
# Shown as the placeholder text of the chat input.
example_question = "Show me all birds"
# Appended verbatim to every user question before it is sent to the chain;
# the leading ". " terminates the user's sentence.
additional_advice = '''
. Return all matching columns using SELECT * in the query.
You must use only the space-separated two-word binomial scientific name as the "species" column,
and not the "genus" column, such as "species"="Homo sapiens".
Avoid double quoting. Do not use LIMIT, always return all results.
Do not include explanations of queries.
'''
#@st.cache_data
def manual_query(species, zoom):
    """Return per-cell counts for ``species`` at H3 resolution ``zoom``.

    Filters the module-level ``gbif_h3`` table with the default bounding box
    of ``filter_gbif`` and aggregates the result with ``get_h3point_df``.
    """
    return get_h3point_df(filter_gbif(gbif_h3, species), zoom)
#@st.cache_data
def chat_query(query, zoom):
    """Run a generated SQL query on ``con`` and aggregate its rows per H3 cell.

    Args:
        query: SQL text as returned by the LangChain query chain.
        zoom: H3 resolution passed on to ``get_h3point_df``; the query result
            must therefore contain the matching ``h<zoom>`` column.

    Returns:
        The DataFrame produced by ``get_h3point_df``.
    """
    import re  # function-scope import keeps this block self-contained

    # Model output may carry decoration that is not part of the statement:
    # a "SQLQuery:" label, a Markdown code fence, or a trailing semicolon.
    sql = query.strip()
    if sql.lower().startswith("sqlquery:"):
        sql = sql[len("sqlquery:"):].strip()
    sql = re.sub(r"^```(?:sql)?\s*|\s*```$", "", sql, flags=re.IGNORECASE)
    sql = sql.strip().rstrip("; \t\r\n")

    # NOTE(security): this statement is derived from free-form chat input and
    # is executed as-is on the app's connection. Consider restricting it
    # (read-only connection, statement allow-list) before exposing the app
    # to untrusted users.
    result = con.sql(sql)
    return get_h3point_df(result, zoom)
# Vector source and fill layer description for the "mappinginequality" data.
# Both dicts are currently referenced only by commented-out
# m.add_source(...) / m.add_layer(...) calls in the redlining toggle.
source = {
    "url": "https://data.source.coop/cboettig/us-boundaries/mappinginequality.json",
    "type": "vector",
}
layer = {
    "id": "mappinginequality",
    "source": "mappinginequality",
    "source-layer": "mappinginequality",
    "type": "fill",
    # NOTE(review): MapLibre layer definitions normally spell this key
    # "minzoom"; confirm "min-zoom" is honoured before enabling this layer.
    "min-zoom": 15,
    # Fill colour comes from each feature's own "fill" property.
    "paint": {"fill-color": ["get", "fill"], "fill-opacity": 0.8},
}
# +
# Three-column control strip above the map.
col1, col2, col3 = st.columns(3)
# Column 1: H3 resolution (selects the h<zoom> column) and elevation exponent.
with col1:
    zoom = st.slider("H3 resolution", min_value=2, max_value=11, value=9)
    v_scale = st.slider("vertical scale", min_value=-3, max_value=3, value=0)
# Column 2: optional overlays added directly to the base map `m`.
with col2:
    # Bare string expression; presumably displayed through Streamlit's "magic" rendering.
    "🌍 Data Layers"
    if st.toggle("satellite"):
        m.add_basemap("satellite")
    if st.toggle("redlining"):
        # Alternative redlining sources that were tried; only the New Haven
        # GeoJSON below is active.
        # redlining = "https://dsl.richmond.edu/panorama/redlining/static/mappinginequality.json"
        # redlining = "https://dsl.richmond.edu/panorama/redlining/static/citiesData/CASanFrancisco1937/geojson.json"
        # redlining = "https://data.source.coop/cboettig/us-boundaries/mappinginequality.json"
        #redlining = "https://data.source.coop/cboettig/us-boundaries/mappinginequality.pmtiles"
        redlining = "https://dsl.richmond.edu/panorama/redlining/static/citiesData/CTNewHaven1937/geojson.json"
        # Fill colour is taken from each feature's own "fill" property.
        paint = {"fill-color": ["get", "fill"], "fill-opacity": 0.8}
        m.add_geojson(redlining, layer_type="fill", name = "redlining", paint=paint)
        # m.add_pmtiles(redlining, layer_type="fill", name = "redlining", paint=paint, fit_bounds = False)
        # m.add_source("mappinginequality", source)
        # m.add_layer(layer)
    # if st.toggle("Threatened Species Richness"):
    # m.add_tile_layer(url="https://data.source.coop/cboettig/mobi/tiles/red/species-richness-all/{z}/{x}/{y}.png",
    # name="MOBI Species Richness",
    # attribution="NatureServe",
    # opacity=0.9
    # )
# Column 3: manual species lookup; always computed, so `df` is defined even
# when no chat prompt has been submitted.
with col3:
    species = st.text_input("Species name:", "Canis latrans")
    df = manual_query(species, zoom)
# NOTE(review): the chat widgets are assumed to sit at top level rather than
# inside column 3 -- confirm against the deployed layout.
chatbox = st.container()
st.markdown("🦜 Or try our chat-based query:")
# A submitted prompt replaces `df` with the result of the generated SQL.
if prompt := st.chat_input(example_question, key="chain"):
    st.chat_message("user").write(prompt)
    with st.chat_message("assistant"):
        query = chain.invoke({"question": prompt + additional_advice})
        # Show the generated SQL to the user before running it.
        st.write(query)
        df = chat_query(query, zoom)
# if st.button("refresh"):
# chat_query.clear()
" "
st.divider()
# +
# with col2:
# min_lng, max_lng = st.slider(
# "Select longitude range",
# min_value=-130.0,
# max_value=-65.0,
# value=(-128.0, -115.0), # Default selected range
# step=0.1)
# min_lat, max_lat = st.slider(
# "Select latitude range",
# min_value=20.0,
# max_value=70.0,
# value=(30.0, 42.0), # Default selected range
# step=0.1)
# -
# +
# Render the map. The container is named map_container rather than `map` so
# that Python's builtin map() is not shadowed at module level.
map_container = st.container()
with map_container:
    # `df` is the manual-query result unless a chat prompt replaced it above.
    hex_layer(m, df, v_scale)
    m.add_layer_control(position="top-left")
    m.to_streamlit()
# +
st.divider()
# Page footer, written as a bare string expression; presumably displayed
# through Streamlit's "magic" rendering.
'''
## Credits
DRAFT. Open Source Software developed at UC Berkeley.
'''