Spaces:

CuongProjects
/

Course_query_system

Sleeping

App Files Files Community

CD17 committed on Dec 18, 2024

Commit

6340cf4

verified ·

1 Parent(s): c67f444

Upload 7 files

Browse files

Files changed (7) hide show

FTCM_Course_List_Spring2025.xlsx +0 -0
Spring_2025_courses.db +0 -0
app.py +21 -56
create_course_dataframe.py +101 -0
df2sqlite.py +58 -0
functions_huggingface.py +109 -0
process_xlsx.py +112 -0

FTCM_Course_List_Spring2025.xlsx ADDED Viewed

Binary file (45.5 kB). View file

Spring_2025_courses.db ADDED Viewed

Binary file (36.9 kB). View file

app.py CHANGED Viewed

@@ -1,64 +1,29 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
+from functions import safe_executor, example_queries
def execute_query(query):
    """Run *query* through ``safe_executor`` and return the text to display.

    The executor's response is a mapping; its ``'result'`` entry is returned
    when ``'success'`` equals ``True``, otherwise its ``'error'`` entry.
    """
    outcome = safe_executor.execute_safe_query(query)
    field = 'result' if outcome['success'] == True else 'error'
    return outcome[field]
def update_textbox(prompt):
    """Build the Gradio update that copies *prompt* into the output component."""
    textbox_update = gr.update(value=prompt)
    return textbox_update
# UI layout: response on top, free-text input below, then a row holding the
# premade-prompt dropdown and the submit button, and finally a clear button.
with gr.Blocks() as demo:
    response_box = gr.Textbox(label="Response", interactive=False)
    msg = gr.Textbox(label="Type your message or select a prompt")
    with gr.Row():
        prompt_dropdown = gr.Dropdown(
            choices=[""] + example_queries,
            label="Select a premade prompt",
            value="",
        )
        submit = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, response_box])

    # Picking a premade prompt copies it into the message box.
    prompt_dropdown.change(fn=update_textbox, inputs=[prompt_dropdown], outputs=[msg])
    # Pressing Enter in the message box and clicking Submit both run the query.
    msg.submit(fn=execute_query, inputs=[msg], outputs=[response_box])
    submit.click(fn=execute_query, inputs=[msg], outputs=[response_box])
    clear.click(fn=lambda: None, inputs=None, outputs=[msg, response_box], queue=False)
 if __name__ == "__main__":
+    demo.launch()

create_course_dataframe.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from process_xlsx import process_xlsx
+import pandas as pd
+import re
+# Step 1: Convert to DataFrame
def create_course_dataframe(cleaned_column_names, column_names, department_program_courses):
    """Flatten the department -> program -> courses mapping into a DataFrame.

    Args:
        cleaned_column_names: Names of the columns to keep, in output order.
            Each name is whitespace-stripped before being matched against
            the normalized header names.
        column_names: Raw header cells, one per value in each course row.
            Every cell is converted to ``str`` and has surrounding
            whitespace, spaces and newlines removed.
        department_program_courses: ``{department: {program: [row, ...]}}``
            where each row is a sequence aligned with ``column_names``.

    Returns:
        A DataFrame with one row per course, restricted to the requested
        columns. ``Department`` and ``Program`` columns are appended to every
        row; a falsy program key is stored as ``"N/A"``.

    Raises:
        KeyError: if a requested column does not exist after normalization.
    """
    def _normalize(name):
        # Coerce to str first so non-string header cells (numbers, None)
        # cannot break the string clean-up below.
        return str(name).strip().replace(' ', '').replace('\n', '').strip()

    # Build new row lists so the caller's rows are never mutated.
    data = [
        [*course, department, program if program else "N/A"]
        for department, programs in department_program_courses.items()
        for program, courses in programs.items()
        for course in courses
    ]

    extended_column_names = [_normalize(name) for name in column_names] + ['Department', 'Program']
    df = pd.DataFrame(data, columns=extended_column_names)

    wanted_columns = [col.strip() for col in cleaned_column_names]
    missing_columns = [col for col in wanted_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"Columns not found after normalization: {missing_columns}; "
            f"available columns: {list(df.columns)}"
        )
    return df[wanted_columns]
+# Step 2: Diagnose Inconsistencies in Data
def diagnose_inconsistencies(df):
    """Print a data-quality report for the course DataFrame.

    The report lists missing values, per-column unique counts, course codes
    that do not look like ``MAT101`` or ``STA421/521``, repeated instructor
    and room values, rows missing key fields, and min/max of numeric columns.

    Args:
        df: DataFrame containing at least the ``CourseCode``, ``CourseTitle``,
            ``Cr``, ``Instructor`` and ``Room`` columns.

    Returns:
        The same DataFrame object. Note that its ``Instructor`` column is
        whitespace-stripped and title-cased in place as part of the check.
    """
    # Report missing values
    missing_values = df.isnull().sum()
    print("\nMissing Values Per Column:")
    print(missing_values[missing_values > 0])

    # Check unique value counts to spot potential inconsistencies
    print("\nUnique Value Counts Per Column:")
    for column in df.columns:
        unique_vals = df[column].nunique()
        print(f"{column}: {unique_vals} unique values")

    print("\nInconsistent Patterns and Values:")

    # Pattern check for course codes. Missing or non-string codes are treated
    # as inconsistent; without this the NaN results of ``.str.match`` make
    # the ``~`` negation raise.
    course_codes = df['CourseCode']
    well_formed = course_codes.notna() & course_codes.astype(str).str.match(
        r'^[A-Z]{3}\d{3}(/\d{3})?$', na=False
    )
    inconsistent_course_codes = df[~well_formed]
    if not inconsistent_course_codes.empty:
        print("\nInconsistent Course Codes:")
        print(inconsistent_course_codes[['CourseCode']].drop_duplicates())

    # Normalize capitalization so differently-cased spellings collapse together
    df['Instructor'] = df['Instructor'].str.strip().str.title()
    instructor_inconsistencies = df['Instructor'].value_counts()
    print("\nInstructor Inconsistencies:")
    print(instructor_inconsistencies[instructor_inconsistencies > 1])

    # Check for possible misspellings or variations in Room
    print("\nRoom Variations:")
    room_variations = df['Room'].value_counts()
    print(room_variations[room_variations > 1])

    # Identify rows with missing key fields that should generally be non-null
    key_columns = ['CourseCode', 'CourseTitle', 'Cr', 'Instructor']
    missing_key_fields = df[df[key_columns].isnull().any(axis=1)]
    if not missing_key_fields.empty:
        print("\nRows with Missing Key Fields:")
        print(missing_key_fields[key_columns])

    # Display data types and any anomalies in numeric fields
    print("\nData Types and Anomalies in Numeric Fields:")
    for column in df.select_dtypes(include=['number']).columns:
        print(f"{column} - Min: {df[column].min()}, Max: {df[column].max()}, Unique Values: {df[column].nunique()}")
    return df
# Workbook to load and the (normalized) columns to keep, in output order.
file_path = "data/FTCM_Course_List_Spring2025.xlsx"
cleaned_column_names = ['CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)',
    'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo',
    'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo',
    'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program']

# Sample usage. Guarded so that importing this module (df2sqlite.py does) only
# defines the helpers instead of parsing the workbook and printing a report.
if __name__ == "__main__":
    result = process_xlsx(file_path)
    if result:
        column_names, department_program_courses = result
        print(f"Column Names:{column_names}")
        df = create_course_dataframe(cleaned_column_names, column_names, department_program_courses)
        # One pass is enough: the function normalizes Instructor in place and
        # a second run would print the same report again.
        df_cleaned = diagnose_inconsistencies(df)
    else:
        # Nothing to build a DataFrame from; stop here instead of failing
        # later on undefined names.
        print(f"Error processing file. {file_path}")

df2sqlite.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import sqlite3
+import pandas as pd
+import logging
+from create_course_dataframe import create_course_dataframe, process_xlsx
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[logging.StreamHandler()]
+)
def save_to_sqlite(df, database_name='Spring_2025_courses.db', table_name='Spring_2025_courses'):
    """Write *df* to a SQLite table, replacing the table if it already exists.

    Args:
        df: DataFrame to store (written without its index).
        database_name: Path of the SQLite database file.
        table_name: Name of the destination table.

    Returns:
        True when the data was written and the row count could be read back,
        False when any step failed (the error is logged).
    """
    conn = None
    try:
        logging.info(f"Connecting to database: {database_name}")
        conn = sqlite3.connect(database_name)
        logging.info(f"Saving DataFrame to table: {table_name}")
        df.to_sql(table_name, conn, if_exists='replace', index=False)
        logging.info("Data successfully saved to SQLite database")
        # Verify the data. The identifier is quoted so table names containing
        # spaces, keywords or quotes do not break the verification query.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        row_count = conn.execute(f"SELECT COUNT(*) FROM {quoted_table}").fetchone()[0]
        logging.info(f"Verified {row_count} rows in table {table_name}")
        return True
    except Exception as e:
        logging.error(f"Error saving to database: {str(e)}")
        return False
    finally:
        # Close on both the success and the failure path.
        if conn is not None:
            conn.close()
if __name__ == "__main__":
    # Parse the workbook, build the course DataFrame and store it in SQLite.
    file_path = "data/FTCM_Course_List_Spring2025.xlsx"
    result = process_xlsx(file_path)
    if not result:
        logging.error("Failed to process XLSX file")
    else:
        column_names, department_program_courses = result
        # Header cells without spaces or line breaks; empty cells pass through.
        modified_column_names = [
            header.replace(' ', '').replace('\n', '') if header else header
            for header in column_names
        ]
        cleaned_column_names = [
            'CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)',
            'Instructor', 'Major/GE/Elective', 'Format',
            'Mon', 'MonTo', 'Tue', 'TueTo', 'Wed', 'WedTo',
            'Thu', 'ThuTo', 'Fri', 'FriTo', 'Sat', 'SatTo',
            'Platform', 'New/Repeat', 'Room', 'Department', 'Program',
        ]
        df = create_course_dataframe(
            cleaned_column_names,
            modified_column_names,
            department_program_courses,
        )
        if not save_to_sqlite(df):
            logging.error("Failed to save data to SQLite")
        else:
            logging.info("Process completed successfully")

functions_huggingface.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+import sqlite3
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables import RunnablePassthrough
+import re
+import gradio as gr
# Load the Llama model and tokenizer (done once, at import time)
model_name = "meta-llama/Llama-3.3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
# Initialize database connection
db_path = "Spring_2025_courses.db"
# The connection is created on the importing thread but used by request
# handlers; sqlite3 refuses cross-thread use unless check_same_thread=False.
# NOTE(review): assumes handlers only read from the database - confirm before
# allowing any write path to share this connection.
connection = sqlite3.connect(db_path, check_same_thread=False)
def get_schema(_=None):
    """Return the database schema as ``{table_name: [column_name, ...]}``.

    Args:
        _: Ignored. Accepted so the function can be used both directly
            (``get_schema()``) and as a chain step that is invoked with the
            chain input, as in ``RunnablePassthrough.assign(schema=get_schema)``.

    Returns:
        A dict mapping every table listed in ``sqlite_master`` to the list of
        its column names, as reported by ``PRAGMA table_info``.
    """
    cursor = connection.cursor()
    try:
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names = [row[0] for row in cursor.fetchall()]
        schema = {}
        for table_name in table_names:
            # Quote the identifier so unusual table names stay valid SQL.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"PRAGMA table_info({quoted_name});")
            # Column 1 of each table_info row is the column name.
            schema[table_name] = [column[1] for column in cursor.fetchall()]
        return schema
    finally:
        cursor.close()
def run_query(query):
    """Run *query* on the shared connection and return every resulting row."""
    rows = connection.cursor().execute(query).fetchall()
    return rows
+# Prompt templates
+system_prompt = """
+You are a SQLite expert. Given an input question, create one syntactically correct SQLite query to run. Generate only one query. No preamble.
+Here is the relevant table information:
+schema: {schema}
+Tips:
+- Use LIKE instead of = in the queries
+Write only one SQLite query that would answer the user's question.
+"""
+human_prompt = """Based on the table schema below, write a SQL query that would answer the user's question:
+{schema}
+Question: {question}
+SQL Query:"""
+prompt = ChatPromptTemplate.from_messages([
+    ("system", system_prompt),
+    ("human", human_prompt),
+])
+# Build query generation chain
+sql_generator = (
+    RunnablePassthrough.assign(schema=get_schema)
+    | prompt
+    | StrOutputParser()
+)
def generate_sql(question):
    """Generate a SQL query for a natural-language *question*.

    The prompt is the system instructions followed by the human template,
    which is the part that actually carries the question.

    Args:
        question: The user's question about the course database.

    Returns:
        The model's completion with surrounding whitespace removed. If the
        completion wraps the query in a Markdown code fence, only the fenced
        content is returned.
    """
    schema = get_schema()
    # system_prompt has no {question} placeholder, so the question has to
    # come from human_prompt; formatting system_prompt alone drops it.
    input_prompt = (
        system_prompt.format(schema=schema)
        + "\n"
        + human_prompt.format(schema=schema, question=question)
    )
    response = generator(
        input_prompt,
        # Bound the completion only; max_length would also count the prompt.
        max_new_tokens=256,
        num_return_sequences=1,
        # Return just the completion so the result does not begin with the
        # prompt text (which would never pass the SELECT check downstream).
        return_full_text=False,
    )
    generated = response[0]['generated_text'].strip()
    fenced = re.search(r"```(?:sqlite|sql)?\s*(.*?)```", generated, re.DOTALL | re.IGNORECASE)
    if fenced:
        generated = fenced.group(1).strip()
    return generated
def execute_safe_query(question):
    """Translate *question* to SQL and run it if it is a SELECT statement.

    Args:
        question: Natural-language question to answer from the database.

    Returns:
        A dict with these keys:
            ``success``: True only when the query ran without error.
            ``error``: None on success, otherwise a message.
            ``query``: The generated SQL, or None if generation itself failed.
            ``result``: The fetched rows on success, otherwise None.
    """
    sql_query = None
    try:
        # Generate SQL query
        sql_query = generate_sql(question)
        # Validate SQL query
        if not sql_query.strip().lower().startswith("select"):
            return {
                "success": False,
                "error": "Only SELECT queries are allowed.",
                "query": sql_query,
                "result": None,
            }
        # Execute query
        result = run_query(sql_query)
        return {"success": True, "error": None, "query": sql_query, "result": result}
    except Exception as e:
        # Keep whatever query was generated so the caller can show it.
        return {"success": False, "error": str(e), "query": sql_query, "result": None}
+# Deploy using Gradio
def query_interface(question):
    """Answer *question* and format the outcome as text for the Gradio UI.

    Shows the error together with the generated query when execution failed,
    otherwise the query followed by its result rows.
    """
    response = execute_safe_query(question)
    error, query = response['error'], response['query']
    if not error:
        return f"Query: {query}\nResult: {response['result']}"
    return f"Error: {error}\nGenerated Query: {query}"
+iface = gr.Interface(
+    fn=query_interface,
+    inputs="text",
+    outputs="text",
+    title="SQLite Query Generator with Llama 3.3",
+    description="Ask a natural language question about the Spring 2025 courses database and get the SQL query and results.",
+)
+if __name__ == "__main__":
+    iface.launch()

process_xlsx.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import logging
+import re
+from openpyxl import load_workbook
+# Debug Mode (Set to False for production)
+DEBUG_MODE = True
+# Logging Configuration
+logging.basicConfig(
+    level=logging.DEBUG if DEBUG_MODE else logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[
+        logging.FileHandler("debug.log"),
+        logging.StreamHandler()
+    ]
+)
def _split_course_codes(raw_code):
    """Expand a combined code such as 'STA421/521' into individual codes."""
    if not isinstance(raw_code, str) or "/" not in raw_code:
        return [raw_code]
    first, *others = [part.strip() for part in raw_code.split("/")]
    codes = [first]
    for part in others:
        if not part:
            continue
        # A bare number reuses the subject prefix of the first code; a part
        # that already starts with letters is taken as a complete code.
        codes.append(part if part[0].isalpha() else first[:3] + part)
    return codes


def _parse_course_rows(xlsx_content):
    """Group sheet rows into ``{department: {program: [course_row, ...]}}``."""
    header_index = next(
        (i for i, row in enumerate(xlsx_content) if "Course Code" in row),
        None,
    )
    if header_index is None:
        logging.error("Could not find header row containing 'Course Code'.")
        return None
    column_names = xlsx_content[header_index]

    cr_index = next(
        (i for i, x in enumerate(column_names) if re.match(r"Cr", str(x))),
        None,
    )
    if cr_index is None:
        logging.error("Could not find 'Cr' column index.")
        return None

    department_program_courses = {}
    current_department = None
    current_program = None

    for row in xlsx_content[header_index + 1:]:
        label = row[0]
        if not label:
            continue
        cr_value = row[cr_index]

        # Department Row Detection (Loose pattern for "Cr")
        if cr_value and re.match(r"cr", str(cr_value), re.IGNORECASE):
            current_department = label
            department_program_courses.setdefault(current_department, {})
            current_program = None
            logging.debug(f"Detected Department: {current_department}")

        # Course Row Detection (Numeric "Cr" value). Checked before the
        # empty-cell test so a 0-credit course is not mistaken for a program.
        elif isinstance(cr_value, (int, float)):
            if current_department is None:
                logging.warning(f"Skipping course row before any department: {repr(row)}")
                continue
            course_codes = _split_course_codes(label)
            if len(course_codes) > 1:
                logging.info(f"Splitting course for row: {repr(row)}")
                logging.info(f"course_codes: {course_codes}")
            # Courses go under the current program if there is one, otherwise
            # under a program entry named after the department itself.
            program_key = current_program if current_program else current_department
            bucket = department_program_courses[current_department].setdefault(program_key, [])
            for code in course_codes:
                new_row = list(row)  # Copy original row
                new_row[0] = code  # Update course code for each split course
                bucket.append(new_row)
            logging.debug(f"Added Course(s) {course_codes} under {current_program or 'directly in department'} in {current_department}")

        # Program Row Detection (Empty "Cr" column)
        elif not cr_value:
            if current_department is None:
                logging.warning(f"Skipping program row before any department: {repr(row)}")
                continue
            current_program = label
            department_program_courses[current_department].setdefault(current_program, [])
            logging.debug(f"Detected Program under {current_department}: {current_program}")

        else:
            logging.info(f"Skipping row: {repr(row)}")

    return (column_names, department_program_courses)


def process_xlsx(file_path):
    """Parse the course-list workbook into headers and grouped course rows.

    The active sheet is scanned for the header row (the first row containing
    a "Course Code" cell). Rows after it are classified by their "Cr" cell:
    text starting with "cr" marks a department row, a number marks a course
    row, and an empty cell marks a program row. Rows with an empty first
    cell are ignored.

    Args:
        file_path: Path of the ``.xlsx`` file to read.

    Returns:
        ``(column_names, department_program_courses)`` where ``column_names``
        is the raw header row and ``department_program_courses`` maps
        ``department -> program -> list of course rows``. Returns ``None``
        when the file cannot be read or the header / "Cr" column is missing;
        the reason is logged.
    """
    logging.info(f"Processing XLSX file: {file_path}")
    try:
        # Load XLSX content
        wb = load_workbook(filename=file_path)
        sheet = wb.active
        xlsx_content = [[cell.value for cell in row] for row in sheet.rows]
        return _parse_course_rows(xlsx_content)
    except Exception as e:
        # Boundary handler: callers rely on None rather than an exception.
        logging.error(f"An error occurred: {str(e)}")
        return None
if __name__ == "__main__":
    # Manual check: parse the workbook and dump what was recognized.
    file_path = "data/FTCM_Course_List_Spring2025.xlsx"
    result = process_xlsx(file_path)
    if not result:
        print("Failed to process XLSX file")
    else:
        column_names, department_program_courses = result
        print("Column Names:")
        # Header cells with spaces and line breaks removed; empty cells pass through.
        modified_column_names = [
            header.replace(' ', '').replace('\n', '') if header else header
            for header in column_names
        ]
        print(modified_column_names)
        print("\nDepartment, Program, Courses:")
        for department in department_program_courses:
            programs = department_program_courses[department]
            print(f"**Department: {department}**")
            for program in programs:
                courses = programs[program]
                print(f"  Program: *{program}")
                for course in courses:
                    print(f"    - Course: {course}")