GenBIChatbotllama

Runtime error

App Files Files Community

arithescientist committed on Sep 30, 2024

Commit

1746d1f

verified ·

1 Parent(s): e84e3a8

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -60

app.py CHANGED Viewed

@@ -2,45 +2,40 @@ import os
 import streamlit as st
 import pandas as pd
 import sqlite3
-from langchain import OpenAI, LLMChain, PromptTemplate
-from transformers import LlamaForCausalLM, LlamaTokenizer
-import torch
 import sqlparse
 import logging
 # Initialize conversation history
 if 'history' not in st.session_state:
     st.session_state.history = []
-# OpenAI API key (ensure it is securely stored)
-openai_api_key = os.getenv("OPENAI_API_KEY")
-# Check if the API key is set
-if not openai_api_key:
-    st.error("OpenAI API key is not set. Please set the OPENAI_API_KEY environment variable.")
-    st.stop()
-# Load the LLaMA model and tokenizer
-model_name = "meta-llama/Llama-2-7b-hf"  # Adjust to the LLaMA model you want
-device = "cuda" if torch.cuda.is_available() else "cpu"
-try:
-    llama_tokenizer = LlamaTokenizer.from_pretrained(model_name)
-    llama_model = LlamaForCausalLM.from_pretrained(model_name).to(device)
-except Exception as e:
-    st.error(f"Error loading LLaMA model: {e}")
-    llama_tokenizer = None
-    llama_model = None
-# Function to generate responses using LLaMA
-def generate_llama_response(prompt):
-    if llama_tokenizer and llama_model:
-        inputs = llama_tokenizer(prompt, return_tensors="pt").to(device)
-        outputs = llama_model.generate(inputs.input_ids, max_length=200)
-        return llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    else:
-        return "LLaMA model is not available."
 # Step 1: Upload CSV data file (or use default)
 st.title("Natural Language to SQL Query App with Enhanced Insights")
@@ -66,10 +61,14 @@ data.to_sql(table_name, conn, index=False, if_exists='replace')
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")
-# Step 3: Set up the LLM Chains (SQL generation with OpenAI, insights with LLaMA)
-# SQL Generation Chain with OpenAI
 sql_template = """
-You are an expert data scientist. Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
 Ensure that:
@@ -87,24 +86,46 @@ Table name: {table_name}
 Valid columns: {columns}
 SQL Query:
 """
 sql_prompt = PromptTemplate(template=sql_template, input_variables=['question', 'table_name', 'columns'])
-sql_llm = OpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens=180)
-sql_generation_chain = LLMChain(llm=sql_llm, prompt=sql_prompt)
-# General Insights and Recommendations Chain with LLaMA
-def generate_insights_llama(question, data_summary):
-    insights_template = f"""
-    You are an expert data scientist. Based on the user's question and the dataset summary provided below, generate concise data insights and actionable recommendations.
-    User's Question: {question}
-    Dataset Summary:
-    {data_summary}
-    Concise Insights and Recommendations:
-    """
-    return generate_llama_response(insights_template)
 # Optional: Clean up function to remove incorrect COLLATE NOCASE usage
 def clean_sql_query(query):
@@ -132,16 +153,19 @@ def clean_sql_query(query):
 def classify_query(question):
     """Classify the user query as either 'SQL' or 'INSIGHTS'."""
     classification_template = """
-    You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
-    Determine the appropriate category for the following user question.
-    Question: "{question}"
-    Category (SQL/INSIGHTS):
-    """
     classification_prompt = PromptTemplate(template=classification_template, input_variables=['question'])
-    classification_chain = LLMChain(llm=sql_llm, prompt=classification_prompt)
     category = classification_chain.run({'question': question}).strip().upper()
     if category.startswith('SQL'):
         return 'SQL'
@@ -151,7 +175,22 @@ def classify_query(question):
 # Function to generate dataset summary
 def generate_dataset_summary(data):
     """Generate a summary of the dataset for general insights."""
-    summary = f"Number of records: {len(data)}, Number of columns: {len(data.columns)}, Columns: {list(data.columns)}"
     return summary
 # Define the callback function
@@ -179,9 +218,21 @@ def process_input():
                 }).strip()
                 if generated_sql.upper() == "NO_SQL":
-                    assistant_response = "No SQL query could be generated."
-                    st.session_state.history.append({"role": "assistant", "content": assistant_response})
                 else:
                     cleaned_sql = clean_sql_query(generated_sql)
                     logging.info(f"Generated SQL Query: {cleaned_sql}")
@@ -193,17 +244,34 @@ def process_input():
                             assistant_response = "The query returned no results. Please try a different question."
                             st.session_state.history.append({"role": "assistant", "content": assistant_response})
                         else:
-                            # Display query results
                             st.session_state.history.append({"role": "assistant", "content": result})
                     except Exception as e:
                         logging.error(f"An error occurred during SQL execution: {e}")
                         assistant_response = f"Error executing SQL query: {e}"
                         st.session_state.history.append({"role": "assistant", "content": assistant_response})
             else:  # INSIGHTS category
                 dataset_summary = generate_dataset_summary(data)
-                insights = generate_insights_llama(user_prompt, dataset_summary)
-                st.session_state.history.append({"role": "assistant", "content": insights})
         except Exception as e:
             logging.error(f"An error occurred: {e}")
@@ -213,7 +281,6 @@ def process_input():
         # Reset the user_input in session state
         st.session_state['user_input'] = ''
 # Display the conversation history
 for message in st.session_state.history:
     if message['role'] == 'user':

 import streamlit as st
 import pandas as pd
 import sqlite3
+from langchain import LLMChain, PromptTemplate
 import sqlparse
 import logging
+# Import necessary modules from transformers and langchain
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from langchain.llms import HuggingFacePipeline
 # Initialize conversation history
 if 'history' not in st.session_state:
     st.session_state.history = []
+# Set up the Llama-2-7b-chat-hf model
+model_id = "meta-llama/Llama-2-7b-chat-hf"
+# Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype='auto')  # Adjust device_map and torch_dtype as needed
+# Create the text-generation pipeline with appropriate parameters
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_new_tokens=512,
+    temperature=0.1,
+    repetition_penalty=1.1,
+    do_sample=True,  # Use sampling to introduce some randomness
+    eos_token_id=tokenizer.eos_token_id,
+    pad_token_id=tokenizer.eos_token_id
+)
+# Wrap the pipeline with HuggingFacePipeline for use in LangChain
+llm = HuggingFacePipeline(pipeline=pipe)
 # Step 1: Upload CSV data file (or use default)
 st.title("Natural Language to SQL Query App with Enhanced Insights")
 valid_columns = list(data.columns)
 st.write(f"Valid columns: {valid_columns}")
+# Step 3: Set up the LLM Chains with adjusted prompts
+# SQL Generation Chain
 sql_template = """
+[INST] <<SYS>>
+You are an expert data scientist.
+<</SYS>>
+Given a natural language question, the name of the table, and a list of valid columns, generate a valid SQL query that answers the question.
 Ensure that:
 Valid columns: {columns}
 SQL Query:
+[/INST]
 """
 sql_prompt = PromptTemplate(template=sql_template, input_variables=['question', 'table_name', 'columns'])
+sql_generation_chain = LLMChain(llm=llm, prompt=sql_prompt)
+# Insights Generation Chain
+insights_template =  """
+[INST] <<SYS>>
+You are an expert data scientist.
+<</SYS>>
+Based on the user's question and the SQL query result provided below, generate a concise analysis that includes key data insights and actionable recommendations. Limit the response to a maximum of 150 words.
+User's Question: {question}
+SQL Query Result:
+{result}
+Concise Analysis (max 200 words):
+[/INST]
+"""
+insights_prompt = PromptTemplate(template=insights_template, input_variables=['question', 'result'])
+insights_chain = LLMChain(llm=llm, prompt=insights_prompt)
+# General Insights and Recommendations Chain
+general_insights_template = """
+[INST] <<SYS>>
+You are an expert data scientist.
+<</SYS>>
+Based on the entire dataset provided below, generate a concise analysis with key insights and recommendations. Limit the response to 150 words.
+Dataset Summary:
+{dataset_summary}
+Concise Analysis and Recommendations (max 150 words):
+[/INST]
+"""
+general_insights_prompt = PromptTemplate(template=general_insights_template, input_variables=['dataset_summary'])
+general_insights_chain = LLMChain(llm=llm, prompt=general_insights_prompt)
 # Optional: Clean up function to remove incorrect COLLATE NOCASE usage
 def clean_sql_query(query):
 def classify_query(question):
     """Classify the user query as either 'SQL' or 'INSIGHTS'."""
     classification_template = """
+[INST] <<SYS>>
+You are an AI assistant that classifies user queries into two categories: 'SQL' for specific data retrieval queries and 'INSIGHTS' for general analytical or recommendation queries.
+<</SYS>>
+Determine the appropriate category for the following user question.
+Question: "{question}"
+Category (SQL/INSIGHTS):
+[/INST]
+"""
     classification_prompt = PromptTemplate(template=classification_template, input_variables=['question'])
+    classification_chain = LLMChain(llm=llm, prompt=classification_prompt)
     category = classification_chain.run({'question': question}).strip().upper()
     if category.startswith('SQL'):
         return 'SQL'
 # Function to generate dataset summary
 def generate_dataset_summary(data):
     """Generate a summary of the dataset for general insights."""
+    summary_template = """
+[INST] <<SYS>>
+You are an expert data scientist.
+<</SYS>>
+Based on the dataset provided below, generate a concise summary that includes the number of records, number of columns, data types, and any notable features.
+Dataset:
+{data}
+Dataset Summary:
+[/INST]
+"""
+    summary_prompt = PromptTemplate(template=summary_template, input_variables=['data'])
+    summary_chain = LLMChain(llm=llm, prompt=summary_prompt)
+    summary = summary_chain.run({'data': data.head().to_string(index=False)})
     return summary
 # Define the callback function
                 }).strip()
                 if generated_sql.upper() == "NO_SQL":
+                    # Handle cases where no SQL should be generated
+                    assistant_response = "Sure, let's discuss some general insights and recommendations based on the data."
+                    # Generate dataset summary
+                    dataset_summary = generate_dataset_summary(data)
+                    # Generate general insights and recommendations
+                    general_insights = general_insights_chain.run({
+                        'dataset_summary': dataset_summary
+                    })
+                    # Append the assistant's insights to the history
+                    st.session_state.history.append({"role": "assistant", "content": general_insights})
                 else:
+                    # Clean the SQL query
                     cleaned_sql = clean_sql_query(generated_sql)
                     logging.info(f"Generated SQL Query: {cleaned_sql}")
                             assistant_response = "The query returned no results. Please try a different question."
                             st.session_state.history.append({"role": "assistant", "content": assistant_response})
                         else:
+                            # Convert the result to a string for the insights prompt
+                            result_str = result.head(10).to_string(index=False)  # Limit to first 10 rows
+                            # Generate insights and recommendations based on the query result
+                            insights = insights_chain.run({
+                                'question': user_prompt,
+                                'result': result_str
+                            })
+                            # Append the assistant's insights to the history
+                            st.session_state.history.append({"role": "assistant", "content": insights})
+                            # Append the result DataFrame to the history
                             st.session_state.history.append({"role": "assistant", "content": result})
                     except Exception as e:
                         logging.error(f"An error occurred during SQL execution: {e}")
                         assistant_response = f"Error executing SQL query: {e}"
                         st.session_state.history.append({"role": "assistant", "content": assistant_response})
             else:  # INSIGHTS category
+                # Generate dataset summary
                 dataset_summary = generate_dataset_summary(data)
+                # Generate general insights and recommendations
+                general_insights = general_insights_chain.run({
+                    'dataset_summary': dataset_summary
+                })
+                # Append the assistant's insights to the history
+                st.session_state.history.append({"role": "assistant", "content": general_insights})
         except Exception as e:
             logging.error(f"An error occurred: {e}")
         # Reset the user_input in session state
         st.session_state['user_input'] = ''
 # Display the conversation history
 for message in st.session_state.history:
     if message['role'] == 'user':