Spaces:

sammyview80
/

db_bot

Sleeping

App Files Files Community

saman shrestha committed on Sep 22, 2024

Commit

6812fa3

1 Parent(s): 80a80a1

initial commit

Browse files

Files changed (9) hide show

.gitignore +78 -0
Dockerfile +31 -0
flask_app.py +162 -0
helpers/GROQ.py +115 -0
helpers/postgres.py +45 -0
helpers/prompts.py +11 -0
prompts/base_prompts.txt +51 -0
requirement.txt +0 -0
requirements.txt +59 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,78 @@

+# Ignore all env directories
+env/
+venv/
+.env/
+.venv/
+# Ignore environment-related files
+*.env
+.env.*
+.envrc
+# Ignore Python virtual environment files
+pyvenv.cfg
+# Ignore Python bytecode files
+__pycache__/
+*.py[cod]
+*$py.class
+# Ignore Python distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# Ignore pip logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Ignore Python testing
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Ignore Jupyter Notebook
+.ipynb_checkpoints
+# Ignore IPython
+profile_default/
+ipython_config.py
+# Ignore mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Ignore Pylint
+.pylintrc
+# Ignore Python rope project settings
+.ropeproject
+# Ignore mkdocs documentation
+/site
+# Ignore Sphinx documentation
+docs/_build/

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+FROM python:3.10-slim-buster
+### Set up user with permissions
+# Run as a dedicated non-root user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user for every following instruction
+USER user
+# Set home to the user's home directory and expose user-level pip scripts on PATH
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Set the working directory to the app folder inside the user's home directory
+WORKDIR $HOME/app
+### Set up app-specific content
+# Copy only the dependency list first so the install layer is reused when just the code changes
+COPY --chown=user requirements.txt requirements.txt
+# gunicorn is installed explicitly: CMD below needs it and requirements.txt does not list it
+RUN pip3 install --no-cache-dir -r requirements.txt gunicorn
+# Copy the application once, owned by "user", so no root-level "chmod 777" pass is required
+COPY --chown=user . .
+CMD ["gunicorn", "-b", "0.0.0.0:7860", "flask_app:app"]

flask_app.py ADDED Viewed

	@@ -0,0 +1,162 @@

+from flask import Flask, g, render_template, request, jsonify, session
+import os
+from helpers.GROQ import ConversationGROQ
+from helpers.postgres import DatabaseConnection
+from helpers.prompts import PromptManager
+import re
+import pandas as pd
+app = Flask(__name__)
+app.secret_key = os.urandom(24)  # Set a secret key for session encryption
+prompt_manager = PromptManager()
+prompt_manager.load_prompt('base_schema', 'prompts/base_prompts.txt')
+base_prompt = prompt_manager.get_prompt('base_schema')
+def extract_sql_regex(input_string) -> str | None:
+    # Pattern to match SQL query within double quotes after "sql":
+    pattern = r'"sql":\s*"(.*?)"'
+    match = re.search(pattern, input_string)
+    if match:
+        return match.group(1)
+    else:
+        return None
+# Add a function to get or create the database connection
+def get_db():
+    if 'db' not in g:
+        return DatabaseConnection(
+            db_host=session['DB_HOST'],
+            db_port=session['DB_PORT'],
+            db_name=session['DB_NAME'],
+            db_user=session['DB_USER'],
+            db_password=session['DB_PASSWORD']
+        )
+    return None
+@app.route('/', methods=['POST'])
+def index():
+    data = request.json
+    db_user = data.get('DB_USER', '')
+    db_host = data.get('DB_HOST', '')
+    db_port = data.get('DB_PORT', '')
+    db_name = data.get('DB_NAME', '')
+    db_password = data.get('DB_PASSWORD', '')
+    missing_fields = []
+    if not db_user:
+        missing_fields.append('DB_USER')
+    if not db_host:
+        missing_fields.append('DB_HOST')
+    if not db_port:
+        missing_fields.append('DB_PORT')
+    if not db_name:
+        missing_fields.append('DB_NAME')
+    if not db_password:
+        missing_fields.append('DB_PASSWORD')
+    if missing_fields:
+        return jsonify({
+            "error": f"Missing credentials: {', '.join(missing_fields)}",
+            "format": "json"
+        }), 400
+    # Store database credentials in session
+    session['DB_HOST'] = db_host
+    session['DB_PORT'] = db_port
+    session['DB_NAME'] = db_name
+    session['DB_USER'] = db_user
+    session['DB_PASSWORD'] = db_password
+    # Test the connection
+    try:
+        db = get_db()
+        if db is None:
+            return jsonify({"error": "Database connection failed", "format": "json"}), 500
+        return jsonify({"message": "Database connection successful", "format": "json"}), 200
+    except Exception as e:
+        return jsonify({"error": f"Database connection failed: {str(e)}", "format": "json"}), 500
+@app.route('/chat', methods=['POST'])
+def chat():
+    data = request.json
+    if 'DB_HOST' not in session:
+        return jsonify({"error": "Database connection not established", "format": "json"}), 400
+    prompt = data.get('prompt', '')
+    if not prompt:
+        return jsonify({"error": "Prompt is required", "format": "json"}), 400
+    db = get_db()
+    schema = db.execute_query('SELECT schema_name FROM information_schema.schemata;').fetchall()
+    schema = [schema[0] for schema in schema]
+    tables = db.execute_query('''SELECT
+        table_name,
+        json_object_agg(column_name, data_type) AS columns
+    FROM
+        information_schema.columns
+    WHERE
+        table_schema = 'public'
+    GROUP BY
+        table_name
+    ORDER BY
+        table_name;''').fetchall()
+    table_info = {table[0]: table[1] for table in tables}
+    full_prompt = base_prompt.format(schema_list=schema, tables=tables, table_info=table_info, user_question=prompt)
+    groq = ConversationGROQ(api_key='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')
+    groq.create_conversation(full_prompt)
+    response = groq.chat(prompt)
+    sql_query =  extract_sql_regex(response)
+    if(sql_query is None):
+        print("No SQL query found")
+        return jsonify({"message": response, "response":  response, "Sql": sql_query,"format": "json"}), 200
+    result = db.execute_query(sql_query)
+    print(sql_query, 'result')
+    row = result.fetchall()
+    df = pd.DataFrame(row, columns=[desc[0] for desc in result.description])
+    df = df.reset_index(drop=True)
+    print(df.to_markdown(index=False))
+    prompt = """
+    A user asked the following question:
+    {user_question}
+    Based on this question, a query was executed and returned the following data:
+    {df}
+    Please provide a clear and concise summary of this data in non-technical language.
+    Focus on the key insights and how they relate to the user's question.
+    Avoid using technical terms and present the information in a way that's easy for anyone to understand.
+    If there are any notable trends, patterns, or important points in the data, please highlight them.
+    If the data includes price or amount information, please also provide a brief comparison. For example, highlight the highest and lowest values, or compare average prices/amounts between different categories if applicable.
+    Additionally, if the data spans multiple time periods (e.g., different dates or years), please provide a brief overview of any trends or changes over time.
+    If applicable, include any relevant statistics or figures, but explain them in simple terms.
+    Your summary should be informative yet accessible to someone without a technical background.
+    """.format(user_question=prompt, df=df)
+    final_response = groq.chat(prompt)
+    print(final_response)
+    return jsonify({"message": final_response, "df": df.to_html(),"response":  response, "sql": sql_query,"format": "json"}), 200
+@app.route('/chat', methods=['POST'])
+def query():
+    data = request.json
+    # Process the query here
+    print(data)
+    # For now, we'll just echo back the received data
+    return jsonify({"response": f"Received: {data}"})
+@app.teardown_appcontext
+def close_db(error):
+    db = g.pop('db', None)
+    if db is not None:
+        db.close()
+if __name__ == '__main__':
+    app.run(debug=True, port=5001)

helpers/GROQ.py ADDED Viewed

	@@ -0,0 +1,115 @@

+from groq import Groq
+from langchain_groq import ChatGroq
+from langchain_core.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    MessagesPlaceholder,
+)
+from langchain.chains import LLMChain
+from langchain_core.messages import SystemMessage
+from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+from typing import Dict, Optional
+import pandas as pd
+class GROQ:
+    def __init__(self, api_key: str = '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'):
+        self.client: Groq = Groq(
+            api_key=api_key
+        )
+    def chat(self, prompt: str, model: str, response_format: Optional[Dict]) -> str:
+        completion = self.client.chat.completions.create(
+            model=model, messages=[{"role": "user", "content": prompt}], response_format=response_format)
+        return completion.choices[0].message.content
+    def errorChat(self, user_question: str, sql_query: str, error: str, model: str) -> str:
+        #  Check the ai need user feedback or not
+        prompt = """
+        User question: {user_question}
+        Error: {error}
+        Error Occured in thisSQL Query: {sql_query}
+        Update the SQL query to fix the error.
+        if its need user feedback, return the feedback prompt. If not, return None.
+        Response in json {{"sql": <sql query here>, "feedback": <feedback prompt here>, "summarization": <summarization prompt here>,
+        "user_feedback": boolean if true send {{"user_feedback": true}} if false send {{"user_feedback": false}}
+          """.format(user_question = user_question, sql_query = sql_query, error = error)
+        return self.chat(prompt, model, None)
+    def get_summarization(self, user_question: str, df: pd.DataFrame, model: str) -> str:
+        """
+        This function generates a summarization prompt based on the user's question and the resulting data.
+        It then sends this summarization prompt to the Groq API and retrieves the AI's response.
+        Parameters:
+        client (Groqcloud): The Groq API client.
+        user_question (str): The user's question.
+        df (DataFrame): The DataFrame resulting from the SQL query.
+        model (str): The AI model to use for the response.
+        Returns:
+        str: The content of the AI's response to the summarization prompt.
+        """
+        prompt = '''
+          A user asked the following question pertaining to local database tables:
+          {user_question}
+          To answer the question, a dataframe was returned:
+          Dataframe should be shown as a table.
+          * Dataframe is structured as easy to read table.
+          * Dataframe is clean and dates are converted to a readable format.
+          Dataframe:
+          {df}
+          * Ensure all numeric values are formatted with appropriate precision.
+          * If there are any percentages, display them with the % symbol.
+          * Format any currency values with the appropriate currency symbol and decimal places.
+          * If there are any date columns, format them as 'YYYY-MM-DD' for clarity.
+          * If the dataframe has more than 10 rows, show only the first 10 rows and indicate there are more.
+          * Include the total number of rows in the dataframe.
+        In a few sentences and show the dataframe, summarize the data in the table as it pertains to the original user question. Avoid qualifiers like "based on the data" and do not comment on the structure or metadata of the table itself
+      '''.format(user_question = user_question, df = df.to_markdown(index=False))
+        # Response format is set to 'None'
+        return self.chat(prompt,model,None)
+class ConversationGROQ:
+    def __init__(self, conversational_memory_length: int = 10, api_key: str = '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', model: str = 'llama3-8b-8192'):
+        self.client: ChatGroq = ChatGroq(
+            groq_api_key=api_key,
+            model=model
+        )
+        self.memory: ConversationBufferWindowMemory = ConversationBufferWindowMemory(k=conversational_memory_length, memory_key="chat_history", return_messages=True)
+        self.conversation: Optional[LLMChain] = None
+    def create_template(self, base_prompt: str) -> ChatPromptTemplate:
+        return ChatPromptTemplate.from_messages([
+                    SystemMessage(
+                        content=base_prompt
+                    ),  # This is the persistent system prompt that is always included at the start of the chat.
+                    MessagesPlaceholder(
+                        variable_name="chat_history"
+                    ),  # This placeholder will be replaced by the actual chat history during the conversation. It helps in maintaining context.
+                    HumanMessagePromptTemplate.from_template(
+                        "{human_input}"
+                    ),  # This template is where the user's current input will be injected into the prompt.
+                ])
+    def create_conversation(self, prompt: str) -> LLMChain:
+        self.conversation = LLMChain(
+            llm=self.client,
+            memory=self.memory,
+            prompt=self.create_template(prompt),
+            verbose=True
+        )
+        return self.conversation
+    def chat(self, user_input: str) -> str:
+        if self.conversation is None:
+            raise ValueError("Conversation not initialized. Call create_conversation() first.")
+        return self.conversation.predict(human_input =user_input)

helpers/postgres.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import os
+import psycopg2
+from psycopg2 import pool,  OperationalError
+from dotenv import load_dotenv
+from typing import Optional, Union, List, Dict, Tuple
+import psycopg2.extensions
+# Load environment variables
+load_dotenv()
+def log_method(func):
+    def wrapper(*args, **kwargs):
+        print(f"Calling method {func.__name__}")
+        return func(*args, **kwargs)
+    return wrapper
+class DatabaseConnection:
+    _instance = None  # For Singleton pattern (optional)
+    def __init__(self, db_user, db_password, db_host, db_port, db_name):
+        self.db_user = db_user
+        self.db_password = db_password
+        self.db_host = db_host
+        self.db_port = db_port
+        self.db_name = db_name
+        try:
+            # Create a connection pool (min and max connection count)
+            self.connection_pool = psycopg2.pool.SimpleConnectionPool(
+                1, 10,  # Min and max number of connections
+                user=db_user,
+                password=db_password,
+                host=db_host,
+                port=db_port,
+                database=db_name
+            )
+            if self.connection_pool:
+                print("Connection pool created successfully")
+        except OperationalError as e:
+            print(f"Error while connecting to PostgreSQL: {e}")
+    def execute_query(self, query: str, params: Optional[Union[List, Dict]] = None, connection: Optional[psycopg2.extensions.connection] = None) -> List[Tuple]:
+        cursor = self.connection_pool.getconn().cursor()
+        cursor.execute(query, params)
+        return cursor

helpers/prompts.py ADDED Viewed

	@@ -0,0 +1,11 @@

+class PromptManager:
+    def __init__(self):
+        self.prompts = {}
+    def load_prompt(self, name, file_path):
+        with open(file_path, 'r') as file:
+            self.prompts[name] = file.read()
+    def get_prompt(self, name):
+        return self.prompts.get(name, '')

prompts/base_prompts.txt ADDED Viewed

	@@ -0,0 +1,51 @@

+You are SQL Advisor, tasked with generating SQL queries for PostgreSQL based on user questions about data stored in these schemas:
+Perform a detailed analysis of the following PostgreSQL database schemas. For each schema, determine its purpose, possible relationships with other schemas, and its overall role within the database system. Categorize schemas into system-level (e.g., those related to database management) and user-defined schemas (e.g., for specific business processes). Provide recommendations for improving schema organization, minimizing redundancy, and optimizing performance.
+The schema list to analyze is: {schema_list}.
+Key Areas to Focus On:
+Categorizing schemas into system-level and user-defined.
+Analyzing the function and use case of each schema.
+Offering suggestions for schema optimization and best practices for management.
+Important Notice:
+* This system is designed for read-only operations. Queries that modify data (INSERT, UPDATE, DELETE) are not permitted.
+* If a user requests data modification, respond with an error message explaining that such operations are restricted.
+Read this table and remember the table name and all the columns you need to search for the user's question.
+Here is the table format:
+Format: {{
+  "table_name": {{
+    "column_name": "data_type",
+    ...
+  }},
+  ...
+}} to understand the table structure.
+Table information is :
+{table_info}
+Reminder:
+* If the user asks for a greeting or introduction then respond with some greetings.
+Given a user's question about data in a specific schema, write a valid PostgreSQL SQL query that accurately extracts or calculates the requested information from the tables in that schema, adhering to SQL best practices for PostgreSQL, optimizing for readability and performance where applicable.
+Here are some tips for writing PostgreSQL queries:
+* Use standard SQL syntax for querying tables
+* Include the schema name when referencing tables (e.g., schema_name.table_name)
+* Include appropriate JOIN clauses when querying across multiple tables
+* Use CURRENT_DATE to get today's date
+* Alias aggregated fields like COUNT(*) for clarity
+Question:
+--------
+{user_question}
+--------
+Reminder: Generate a PostgreSQL SQL query to answer the question:
+* respond as a valid JSON Document
+* [Best] If the question can be answered with the available tables: {{"sql": <sql here>}}
+* If the question cannot be answered with the available tables: {{"error": <explanation here>}}
+* Ensure that the entire output is returned on only one single line
+* Keep your query as simple and straightforward as possible; avoid unnecessary subqueries

requirement.txt ADDED Viewed

File without changes

requirements.txt ADDED Viewed

	@@ -0,0 +1,59 @@

+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==4.6.0
+async-timeout==4.0.3
+attrs==24.2.0
+blinker==1.8.2
+certifi==2024.8.30
+charset-normalizer==3.3.2
+click==8.1.7
+dataclasses-json==0.6.7
+distro==1.9.0
+exceptiongroup==1.2.2
+Flask==3.0.3
+frozenlist==1.4.1
+groq==0.11.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.10
+itsdangerous==2.2.0
+Jinja2==3.1.4
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==0.1.16
+langchain-community==0.0.38
+langchain-core==0.1.52
+langchain-groq==0.1.5
+langchain-text-splitters==0.0.2
+langsmith==0.1.125
+load-dotenv==0.1.0
+MarkupSafe==2.1.5
+marshmallow==3.22.0
+multidict==6.1.0
+mypy-extensions==1.0.0
+numpy==1.26.4
+orjson==3.10.7
+packaging==23.2
+pandas==2.2.3
+psycopg2-binary==2.9.9
+pydantic==2.9.2
+pydantic_core==2.23.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.2
+PyYAML==6.0.2
+requests==2.32.3
+six==1.16.0
+sniffio==1.3.1
+SQLAlchemy==2.0.35
+tabulate==0.9.0
+tenacity==8.5.0
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.3
+Werkzeug==3.0.4
+yarl==1.11.1