Spaces:

boettiger-lab
/

sql-chatbot

Sleeping

App Files Files Community

cboettig committed on Apr 19, 2024

Commit

057cfd5

1 Parent(s): 4dbbaf7

wip

Browse files

Files changed (3) hide show

app.py +172 -66
minimal-example.py +7 -1
minimal-requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,87 +1,193 @@
-import streamlit as st
-from pathlib import Path
-from langchain.llms.openai import OpenAI
-from langchain.agents import create_sql_agent
-from langchain.sql_database import SQLDatabase
-from langchain.agents.agent_types import AgentType
-from langchain_community.callbacks import StreamlitCallbackHandler
-from langchain.agents.agent_toolkits import SQLDatabaseToolkit
-from sqlalchemy import create_engine
-import sqlite3
 import os
-from langchain_openai import ChatOpenAI
-os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
-st.set_page_config(page_title="Protected Areas Database Chat", page_icon="🦜", layout="wide")
-st.title("🦜 Protected Areas Database Chat")
-#db_uri = "duckdb:///:memory:"
 db_uri = "duckdb:///pad.duckdb"
-engine = create_engine(db_uri)
-from sqlalchemy import text
-con = engine.connect()
-#con.execute(text("create or replace view agency_name as select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-name.parquet'"))
-#con.execute(text("create or replace view agency_name as select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-name.parquet'"))
-#con.execute(text("create or replace view agency_type as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-agency-type.parquet'"))
-#con.execute(text("create or replace view category as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-category.parquet'"))
-#con.execute(text("create or replace view designation_type as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-desgination-type.parquet'"))
-#con.execute(text("create or replace view easement as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-easement.parquet'"))
-#con.execute(text("create or replace view fee as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-fee.parquet'"))
-#con.execute(text("create or replace view marine as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-marine.parquet'"))
-#con.execute(text("create or replace view iucn as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-iucn.parquet'"))
-#con.execute(text("create or replace view public_access as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-public-access.parquet'"))
-#con.execute(text("create or replace view state_name as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-state-name.parquet'"))
-#con.execute(text("create or replace view combined as  select * from 'https://huggingface.co/datasets/boettiger-lab/pad-us-3/resolve/main/parquet/pad-combined.parquet'"))
 db = SQLDatabase(engine, view_support=True)
-db.get_usable_table_names()
-# User inputs
-radio_opt = ["US Protected Areas v3"]
-selected_opt = st.sidebar.radio(label="Choose suitable option", options=radio_opt)
-llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
-agent = create_sql_agent(llm, db=db, agent_type="openai-tools", verbose=True)
-def handle_user_input(user_query):
-    with history:
-        st.session_state.messages.append({"role": "user", "content": user_query})
-        #st.chat_message("user").write(user_query)
-        with st.chat_message("assistant"):
-            st_cb = StreamlitCallbackHandler(st.container())
-            response = agent.run(user_query, callbacks=[st_cb])
-            st.session_state.messages.append({"role": "assistant", "content": response})
-        #    st.write(response) # thinking is only shown transiently this way
-if "messages" not in st.session_state:
-    st.session_state["messages"] = []
 main = st.container()
 with main:
-    history = st.container(height=400)
-    # stores all questions and responses, but not the 'thinking'
-    with history:
-        for msg in st.session_state.messages:
-            st.chat_message(msg["role"]).write(msg["content"])
-    if user_query := st.chat_input(placeholder="Ask me about US Protected areas!"):
-        handle_user_input(user_query)
-    st.markdown("\n") #add some space for iphone users
-EXAMPLE_PROMPTS = ["What is the total area in each GAP_Sts category in the fee table?",
-                   "List the name of each table in the database",
-                   "How much BLM land (BLM is a Mang_Name in the fee table) is in each GAP_Sts category?",
-                   "Federal agencies are identified as 'FED' in the Mang_Type column in the 'combined' data table. The Mang_Name column indicates the different agencies. The full name of each agency is given in the agency_name table. Which federal agencies, by full name, manage the greatest area of GAP_Sts 1 or 2 land?"]
-with st.sidebar:
-    with st.container():
-        st.title("Examples")
-        for prompt in EXAMPLE_PROMPTS:
-            st.button(prompt, args=(prompt,), on_click=handle_user_input)

+# This example does not use a langchain agent,
+# The langchain sql chain has knowledge of the database, but doesn't interact with it beyond initialization.
+# The output of the sql chain is parsed separately and passed to `duckdb.sql()` by streamlit
 import os
+os.environ["WEBSOCKET_TIMEOUT_MS"] = "300000" # no effect
+import streamlit as st
+import geopandas as gpd
+from shapely import wkb
+import leafmap.foliumap as leafmap
+# Helper plotting functions
+import pydeck as pdk
+def deck_map(gdf):
+    st.write(
+        pdk.Deck(
+            map_style="mapbox://styles/mapbox/light-v9",
+            initial_view_state={
+                "latitude": 35,
+                "longitude": -100,
+                "zoom": 3,
+                "pitch": 50,
+            },
+            layers=[
+                pdk.Layer(
+                    "GeoJsonLayer",
+                    gdf,
+                    pickable=True,
+                    stroked=True,
+                    filled=True,
+                    extruded=True,
+                    elevation_scale=10,
+                    get_fill_color=[2, 200, 100],
+                    get_line_color=[0,0,0],
+                    line_width_min_pixels=0,
+                ),
+            ],
+        )
+    )
+def leaf_map(gdf):
+    m = leafmap.Map(center=[35, -100], zoom=4, layers_control=True)
+    m.add_gdf(gdf)
+    return m.to_streamlit()
+@st.cache_data
+def query_database(response):
+    return con.sql(response).to_pandas().head(25)
+@st.cache_data
+def get_geom(tbl):
+    tbl['geometry'] = tbl['geometry'].apply(wkb.loads)
+    gdf = gpd.GeoDataFrame(tbl, geometry='geometry')
+    return gdf
+## Database connection
+from sqlalchemy import create_engine
+from langchain.sql_database import SQLDatabase
 db_uri = "duckdb:///pad.duckdb"
+engine = create_engine(db_uri, connect_args={'read_only': True})
 db = SQLDatabase(engine, view_support=True)
+import ibis
+con = ibis.connect("duckdb://pad.duckdb", read_only=True)
+con.load_extension("spatial")
+## ChatGPT Connection
+from langchain_openai import ChatOpenAI
+# Requires ollama server running locally
+from langchain_community.llms import Ollama
+## should we use ChatOllama instead?
+# from langchain_community.llms import ChatOllama
+models = {"chatgpt3.5": ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=st.secrets["OPENAI_API_KEY"])}
+other_models = {
+          "chatgpt4": ChatOpenAI(model="gpt-4", temperature=0, api_key=st.secrets["OPENAI_API_KEY"]),
+          "duckdb-nsql": Ollama(model="duckdb-nsql", temperature=0),
+          "command-r-plus": Ollama(model="command-r-plus", temperature=0),
+          "mixtral:8x22b":  Ollama(model="mixtral:8x22b", temperature=0),
+          "wizardlm2:8x22b":  Ollama(model="wizardlm2:8x22b", temperature=0),
+          "sqlcoder": Ollama(model="sqlcoder", temperature=0),
+          "zephyr": Ollama(model="zephyr", temperature=0),
+          "gemma:7b": Ollama(model="gemma:7b", temperature=0),
+          "codegemma": Ollama(model="codegemma", temperature=0),
+          "llama2": Ollama(model="llama2", temperature=0),
+         }
+st.set_page_config(page_title="Protected Areas Database Chat", page_icon="🦜", layout="wide")
+st.title("Protected Areas Database Chat")
+map_tool = {"leafmap": leaf_map,
+            "deckgl": deck_map
+           }
+with st.sidebar:
+    choice = st.radio("Select an LLM:", models)
+    llm = models[choice]
+    map_choice = st.radio("Select mapping tool", map_tool)
+    mapper = map_tool[map_choice]
+## A SQL Chain
+from langchain.chains import create_sql_query_chain
+chain = create_sql_query_chain(llm, db)
 main = st.container()
+## Does not preserve history
 with main:
+    '''
+    The Protected Areas Database of the United States (PAD-US) is the official national inventory of
+    America’s parks and other protected lands, and is published by the USGS Gap Analysis Project,
+    [https://doi.org/10.5066/P9Q9LQ4B](https://doi.org/10.5066/P9Q9LQ4B).
+    This interactive tool allows users to explore the dataset, as well as a range of biodiversity
+    and climate indicators associated with each protected area. These indicators are integrated into
+    a single table format shown below.  The chatbot assistant can turn natural language queries into
+    SQL queries based on the table schema.
+    See our [Protected Areas Explorer](https://huggingface.co/spaces/boettiger-lab/pad-us) for a companion non-chat-based tool.
+    ##### Example Queries returning summary tables
+    - What is the percent area in each gap code as a fraction of the total protected area?
+    - The manager_type column indicates whether a manager is federal, state, local, private, or NGO.
+      the manager_name column indicates the responsible agency (National Park Service, Bureau of Land Management,
+      etc) in the case of federal manager types.  Which of the federal managers manage the most land in
+      gap_code 1 or 2, as a fraction of the total area?
+    When queries refer to specific managed areas, the chatbot can show those areas on an interactive map.
+    Due to software limitations, these maps will show no more than 25 polygons, even if more areas match the
+    requested search. The chatbot sometimes requires help identifying the right columns.  In order to create
+    a map, the SQL query must also return the geometry column.  Consider the following examples:
+    ##### Example queries returning maps + tables
+    - Show me all the national monuments (designation_type) in Utah. Include the geometry column
+    - Show examples of Bureau of Land Management (manager_name) with the highest species richness? Include the geometry column
+    - Which site has the overall highest range-size-rarity? Include the geometry column, manager_name, and IUCN category.
+    '''
+    st.markdown("## 🦜 Chatbot:")
+    chatbox = st.container()
+    with chatbox:
+        if prompt := st.chat_input(key="chain"):
+            st.chat_message("user").write(prompt)
+            with st.chat_message("assistant"):
+                response = chain.invoke({"question": prompt})
+                st.write(response)
+                tbl = query_database(response)
+                if 'geometry' in tbl:
+                    gdf = get_geom(tbl)
+                    mapper(gdf)
+                    n = len(gdf)
+                    st.write(f"matching features: {n}")
+                st.dataframe(tbl)
+st.divider()
+with st.container():
+    st.text("Database schema (top 3 rows)")
+    tbl = tbl = query_database("select * from pad limit 3")
+    st.dataframe(tbl)
+st.divider()
+'''
+Experimental prototype.
+- Author: [Carl Boettiger](https://carlboettiger.info)
+- For data sources and processing, see: https://beta.source.coop/repositories/cboettig/pad-us-3/description/
+'''
+# duckdb_sql fails but chatgpt3.5 succeeds with a query like:
+# use the st_area function and st_GeomFromWKB functions to compute the area of the Shape column in the fee table, and then use that to compute the total area under each GAP_Sts category
+# For most queries, duckdb_sql does much better than alternative open models though
+# Federal agencies are identified as 'FED' in the Mang_Type column in the 'combined' data table. The Mang_Name column indicates the different agencies. Which federal agencies manage the greatest area of GAP_Sts 1 or 2 land?
+# Federal agencies are identified as 'FED' in the Mang_Type column in the table named "fee". The Mang_Name column indicates the different agencies. List which managers manage the largest total areas that identified as GAP_Sts '1' or '2' ?

minimal-example.py CHANGED Viewed

@@ -29,7 +29,10 @@ from langchain_community.llms import Ollama
 models = {"duckdb-nsql": Ollama(model="duckdb-nsql", temperature=0),
           "sqlcoder": Ollama(model="sqlcoder", temperature=0),
-          "gemma": Ollama(model="gemma", temperature=0),
           "chatgpt3.5": chatgpt_llm,
           "chatgpt4": chatgpt4_llm}
 with st.sidebar:
@@ -57,5 +60,8 @@ if prompt := st.chat_input():
 # use the st_area function and st_GeomFromWKB functions to compute the area of the Shape column in the fee table, and then use that to compute the total area under each GAP_Sts category
 # Federal agencies are identified as 'FED' in the Mang_Type column in the 'combined' data table. The Mang_Name column indicates the different agencies. Which federal agencies manage the greatest area of GAP_Sts 1 or 2 land?

 models = {"duckdb-nsql": Ollama(model="duckdb-nsql", temperature=0),
           "sqlcoder": Ollama(model="sqlcoder", temperature=0),
+          "zephyr": Ollama(model="zephyr", temperature=0),
+          "gemma:7b": Ollama(model="gemma:7b", temperature=0),
+          "codegemma": Ollama(model="codegemma", temperature=0),
+          "llama2:70b": Ollama(model="llama2:70b", temperature=0),
           "chatgpt3.5": chatgpt_llm,
           "chatgpt4": chatgpt4_llm}
 with st.sidebar:
 # use the st_area function and st_GeomFromWKB functions to compute the area of the Shape column in the fee table, and then use that to compute the total area under each GAP_Sts category
+# For most queries, duckdb_sql does much better than alternative open models though
 # Federal agencies are identified as 'FED' in the Mang_Type column in the 'combined' data table. The Mang_Name column indicates the different agencies. Which federal agencies manage the greatest area of GAP_Sts 1 or 2 land?
+# Federal agencies are identified as 'FED' in the Mang_Type column in the table named "fee". The Mang_Name column indicates the different agencies. List which managers manage the largest total areas that identified as GAP_Sts '1' or '2' ?

minimal-requirements.txt CHANGED Viewed

@@ -4,4 +4,4 @@ langchain
 langchain-community
 langchain-openai
 SQLAlchemy==1.4.52
-streamlit

 langchain-community
 langchain-openai
 SQLAlchemy==1.4.52
+streamlit