Spaces:

DrishtiSharma
/

sql-rag

Sleeping

App Files Files Community

DrishtiSharma committed on Jan 14

Commit

e4ab33c

verified ·

1 Parent(s): 40b3f9c

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -19

app.py CHANGED Viewed

@@ -110,6 +110,25 @@ if st.session_state.df is not None and st.session_state.show_preview:
 #        st.error("⚠️ GPT-4o failed to generate a valid suggestion.")
 #        return None
 def ask_gpt4o_for_visualization(query, df, llm, retries=2):
     import json
@@ -117,14 +136,15 @@ def ask_gpt4o_for_visualization(query, df, llm, retries=2):
     numeric_columns = df.select_dtypes(include='number').columns.tolist()
     categorical_columns = df.select_dtypes(exclude='number').columns.tolist()
-    # Enhanced Prompt with Clear Instructions
     prompt = f"""
     Analyze the following query and suggest the most suitable visualization(s) using the dataset.
     **Query:** "{query}"
-    **Numeric Columns (for Y-axis):** {', '.join(numeric_columns) if numeric_columns else 'None'}
-    **Categorical Columns (for X-axis or grouping):** {', '.join(categorical_columns) if categorical_columns else 'None'}
     Suggest visualizations in this exact JSON format:
     [
@@ -138,28 +158,85 @@ def ask_gpt4o_for_visualization(query, df, llm, retries=2):
       }}
     ]
-    **Examples:**
-    - For salary distribution:
       {{
         "chart_type": "box",
         "x_axis": "job_title",
         "y_axis": "salary_in_usd",
         "group_by": "experience_level",
         "title": "Salary Distribution by Job Title and Experience",
-        "description": "A box plot showing salary ranges across job titles and experience levels."
       }}
-    - For trend analysis:
       {{
         "chart_type": "line",
         "x_axis": "year",
         "y_axis": "revenue",
         "group_by": null,
-        "title": "Revenue Growth Over Years",
-        "description": "A line chart showing the trend of revenue over the years."
       }}
-    Only suggest visualizations that make sense for the data and the query.
     """
     for attempt in range(retries + 1):
@@ -170,11 +247,9 @@ def ask_gpt4o_for_visualization(query, df, llm, retries=2):
             # Load JSON response
             suggestions = json.loads(response)
-            # Validate response structure
             if isinstance(suggestions, list):
-                valid_suggestions = [
-                    s for s in suggestions if all(k in s for k in ["chart_type", "x_axis", "y_axis"])
-                ]
                 if valid_suggestions:
                     return valid_suggestions
                 else:
@@ -182,18 +257,17 @@ def ask_gpt4o_for_visualization(query, df, llm, retries=2):
                     return None
             elif isinstance(suggestions, dict):
-                if all(k in suggestions for k in ["chart_type", "x_axis", "y_axis"]):
                     return [suggestions]
                 else:
-                    st.warning("⚠️ GPT-4o's suggestion is incomplete.")
                     return None
         except json.JSONDecodeError:
             st.warning(f"⚠️ Attempt {attempt + 1}: GPT-4o returned invalid JSON.")
         except Exception as e:
             st.error(f"⚠️ Error during GPT-4o call: {e}")
-        # Retry if necessary
         if attempt < retries:
             st.info("🔄 Retrying visualization suggestion...")
@@ -201,7 +275,6 @@ def ask_gpt4o_for_visualization(query, df, llm, retries=2):
     return None
 def add_stats_to_figure(fig, df, y_axis, chart_type):
     """
     Add relevant statistical annotations to the visualization

 #        st.error("⚠️ GPT-4o failed to generate a valid suggestion.")
 #        return None
+# Helper Function for Validation
def is_valid_suggestion(suggestion):
    """Return True if a visualization suggestion has the keys its chart type needs.

    The chart type is matched case-insensitively. "bar", "line", "box",
    "scatter" and "heatmap" require the keys "chart_type", "x_axis" and
    "y_axis"; "pie" requires only "chart_type" and "x_axis". Any other
    chart type is rejected.

    Only key *presence* is checked, not the values stored under the keys.

    Args:
        suggestion: A single parsed suggestion, expected to be a dict.

    Returns:
        bool: True when the suggestion is a dict with a supported chart type
        and all keys required for that type; False otherwise. Malformed
        input (a non-dict suggestion, or a missing / non-string
        "chart_type" such as a JSON null) yields False instead of raising.
    """
    # A parsed JSON list may contain arbitrary items (strings, numbers,
    # nested lists); reject them rather than failing on `.get`.
    if not isinstance(suggestion, dict):
        return False

    # JSON null is parsed as None, which has no `.lower()`.
    chart_type = suggestion.get("chart_type", "")
    if not isinstance(chart_type, str):
        return False

    chart_type = chart_type.lower()
    if chart_type in ("bar", "line", "box", "scatter", "heatmap"):
        required_keys = ("chart_type", "x_axis", "y_axis")
    elif chart_type == "pie":
        # Pie charts have no Y axis, so "y_axis" is optional.
        required_keys = ("chart_type", "x_axis")
    else:
        return False

    return all(key in suggestion for key in required_keys)
 def ask_gpt4o_for_visualization(query, df, llm, retries=2):
     import json
     numeric_columns = df.select_dtypes(include='number').columns.tolist()
     categorical_columns = df.select_dtypes(exclude='number').columns.tolist()
+    # Enhanced Prompt with Diverse, Query-Based Examples
     prompt = f"""
     Analyze the following query and suggest the most suitable visualization(s) using the dataset.
     **Query:** "{query}"
+    **Dataset Overview:**
+    - **Numeric Columns (for Y-axis):** {', '.join(numeric_columns) if numeric_columns else 'None'}
+    - **Categorical Columns (for X-axis or grouping):** {', '.join(categorical_columns) if categorical_columns else 'None'}
     Suggest visualizations in this exact JSON format:
     [
       }}
     ]
+    **Query-Based Examples:**
+    - **Query:** "What is the salary distribution across different job titles?"
+      **Suggested Visualization:**
       {{
         "chart_type": "box",
         "x_axis": "job_title",
         "y_axis": "salary_in_usd",
         "group_by": "experience_level",
         "title": "Salary Distribution by Job Title and Experience",
+        "description": "A box plot to show how salaries vary across different job titles and experience levels."
       }}
+    - **Query:** "Show the average salary by company size and industry."
+      **Suggested Visualizations:**
+      [
+        {{
+          "chart_type": "bar",
+          "x_axis": "company_size",
+          "y_axis": "salary_in_usd",
+          "group_by": "industry",
+          "title": "Average Salary by Company Size and Industry",
+          "description": "A grouped bar chart comparing average salaries across company sizes and industries."
+        }},
+        {{
+          "chart_type": "heatmap",
+          "x_axis": "industry",
+          "y_axis": "company_size",
+          "group_by": null,
+          "title": "Salary Heatmap by Industry and Company Size",
+          "description": "A heatmap showing salary concentration across industries and company sizes."
+        }}
+      ]
+    - **Query:** "How has the company's revenue changed over the years?"
+      **Suggested Visualization:**
       {{
         "chart_type": "line",
         "x_axis": "year",
         "y_axis": "revenue",
         "group_by": null,
+        "title": "Yearly Revenue Growth",
+        "description": "A line chart showing revenue growth over time."
       }}
+    - **Query:** "What is the market share of each product category?"
+      **Suggested Visualization:**
+      {{
+        "chart_type": "pie",
+        "x_axis": "product_category",
+        "y_axis": null,
+        "group_by": null,
+        "title": "Market Share by Product Category",
+        "description": "A pie chart to show the market share distribution across different product categories."
+      }}
+    - **Query:** "Is there a correlation between years of experience and salary?"
+      **Suggested Visualization:**
+      {{
+        "chart_type": "scatter",
+        "x_axis": "years_of_experience",
+        "y_axis": "salary_in_usd",
+        "group_by": "job_title",
+        "title": "Experience vs Salary by Job Title",
+        "description": "A scatter plot to analyze the relationship between experience and salary across different job titles."
+      }}
+    - **Query:** "Which departments have the highest concentration of employees across regions?"
+      **Suggested Visualization:**
+      {{
+        "chart_type": "heatmap",
+        "x_axis": "department",
+        "y_axis": "region",
+        "group_by": null,
+        "title": "Employee Distribution by Department and Region",
+        "description": "A heatmap to visualize employee density across departments and regions."
+      }}
+    Only suggest visualizations that logically match the query and dataset.
     """
     for attempt in range(retries + 1):
             # Load JSON response
             suggestions = json.loads(response)
+            # Validate response structure using the helper function
             if isinstance(suggestions, list):
+                valid_suggestions = [s for s in suggestions if is_valid_suggestion(s)]
                 if valid_suggestions:
                     return valid_suggestions
                 else:
                     return None
             elif isinstance(suggestions, dict):
+                if is_valid_suggestion(suggestions):
                     return [suggestions]
                 else:
+                    st.warning("⚠️ GPT-4o's suggestion is incomplete or invalid.")
                     return None
         except json.JSONDecodeError:
             st.warning(f"⚠️ Attempt {attempt + 1}: GPT-4o returned invalid JSON.")
         except Exception as e:
             st.error(f"⚠️ Error during GPT-4o call: {e}")
         if attempt < retries:
             st.info("🔄 Retrying visualization suggestion...")
     return None
 def add_stats_to_figure(fig, df, y_axis, chart_type):
     """
     Add relevant statistical annotations to the visualization