CD17 committed on
Commit
6340cf4
·
verified ·
1 Parent(s): c67f444

Upload 7 files

Browse files
FTCM_Course_List_Spring2025.xlsx ADDED
Binary file (45.5 kB). View file
 
Spring_2025_courses.db ADDED
Binary file (36.9 kB). View file
 
app.py CHANGED
@@ -1,64 +1,29 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
 
 
 
 
 
9
 
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
 
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
 
 
 
 
 
62
 
63
  if __name__ == "__main__":
64
- demo.launch()
 
1
  import gradio as gr
2
+ from functions import safe_executor, example_queries
3
 
4
def execute_query(query):
    """Run a natural-language query through ``safe_executor`` and return display text.

    Args:
        query: The text typed into the message box (or copied from the dropdown).

    Returns:
        ``result['result']`` when the executor reports success, otherwise
        ``result['error']``.
    """
    result = safe_executor.execute_safe_query(query)

    # Plain truthiness test instead of ``== True`` (PEP 8); the executor result
    # is read through its 'success', 'result' and 'error' keys.
    if result['success']:
        return result['result']
    return result['error']
11
 
12
def update_textbox(prompt):
    """Build the Gradio update that copies the selected premade prompt into a textbox.

    Args:
        prompt: The value currently chosen in the dropdown.

    Returns:
        The object produced by ``gr.update(value=prompt)``.
    """
    textbox_update = gr.update(value=prompt)
    return textbox_update
 
 
 
 
 
 
 
14
 
15
# UI layout: a read-only response box, a free-text message box, then a row
# holding the premade-prompt dropdown and the Submit / Clear buttons.
# Components are created in the same order as before, so the rendered layout
# is unchanged.
with gr.Blocks() as demo:
    response_box = gr.Textbox(interactive=False, label="Response")
    msg = gr.Textbox(label="Type your message or select a prompt")
    with gr.Row():
        prompt_dropdown = gr.Dropdown(
            label="Select a premade prompt",
            choices=[""] + example_queries,
            value="",
        )
        submit = gr.Button("Submit")
        clear = gr.ClearButton([msg, response_box])

    # Picking a premade prompt copies it into the message box.
    prompt_dropdown.change(fn=update_textbox, inputs=[prompt_dropdown], outputs=[msg])
    # Both the button and pressing Enter in the message box run the query.
    submit.click(fn=execute_query, inputs=[msg], outputs=[response_box])
    msg.submit(fn=execute_query, inputs=[msg], outputs=[response_box])
    clear.click(fn=lambda: None, inputs=None, outputs=[msg, response_box], queue=False)

if __name__ == "__main__":
    demo.launch()
create_course_dataframe.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from process_xlsx import process_xlsx
2
+ import pandas as pd
3
+ import re
4
+
5
+ # Step 1: Convert to DataFrame
6
def create_course_dataframe(cleaned_column_names, column_names, department_program_courses):
    """Flatten the department/program/course mapping into a DataFrame.

    Args:
        cleaned_column_names: Names of the columns to keep, in output order,
            written in the whitespace-free form (e.g. ``'CourseCode'``).
        column_names: Raw header cells, positionally aligned with each course
            row. May be a list or a tuple and may contain non-string cells.
        department_program_courses: Mapping
            ``{department: {program: [course_row, ...]}}`` where each course
            row is a list or tuple of cell values.

    Returns:
        A new DataFrame with one row per course, restricted to
        ``cleaned_column_names`` and including the appended ``Department`` and
        ``Program`` columns. A falsy program key is reported as ``"N/A"``.

    Raises:
        KeyError: If a requested column does not exist after the header names
            have been normalised.
    """
    def _normalize(name):
        # Same clean-up as the pandas ``.str`` chain (strip, drop spaces and
        # newlines), done in plain Python so non-string header cells such as
        # ``None`` are stringified instead of breaking the ``.str`` accessor.
        return str(name).strip().replace(' ', '').replace('\n', '').strip()

    data = []
    for department, programs in department_program_courses.items():
        for program, courses in programs.items():
            program_label = program if program else "N/A"
            for course in courses:
                # Unpacking builds a fresh list, so the caller's row is never
                # mutated and tuple rows are accepted as well.
                data.append([*course, department, program_label])

    # Add Department and Program to the (normalised) column names.
    extended_column_names = [_normalize(name) for name in column_names]
    extended_column_names += ['Department', 'Program']

    df = pd.DataFrame(data, columns=extended_column_names)

    # Select the requested columns, tolerating stray whitespace in the request.
    wanted_columns = [col.strip() for col in cleaned_column_names]
    return df[wanted_columns]
34
+
35
+ # Step 2: Diagnose Inconsistencies in Data
36
def diagnose_inconsistencies(df):
    """Print a data-quality report for the course DataFrame.

    The report covers missing values, unique-value counts, course codes that do
    not match the expected pattern, instructor and room frequency counts, rows
    missing key fields, and min/max of numeric columns.

    Args:
        df: DataFrame containing at least the ``CourseCode``, ``CourseTitle``,
            ``Cr``, ``Instructor`` and ``Room`` columns.

    Returns:
        The same DataFrame object. Its ``Instructor`` column is modified in
        place (stripped and title-cased).
    """
    # Report missing values
    missing_values = df.isnull().sum()
    print("\nMissing Values Per Column:")
    print(missing_values[missing_values > 0])

    # Check unique value counts to spot potential inconsistencies
    print("\nUnique Value Counts Per Column:")
    for column in df.columns:
        unique_vals = df[column].nunique()
        print(f"{column}: {unique_vals} unique values")

    print("\nInconsistent Patterns and Values:")

    # Pattern check for course codes (expected: 'MAT101' or 'STA421/521').
    # Values are cast to str and NA is treated as "no match" so that missing or
    # non-string codes are reported as inconsistent instead of making the
    # boolean inversion below raise a TypeError.
    code_is_valid = df['CourseCode'].astype(str).str.match(
        r'^[A-Z]{3}\d{3}(/\d{3})?$', na=False
    )
    inconsistent_course_codes = df[~code_is_valid]
    if not inconsistent_course_codes.empty:
        print("\nInconsistent Course Codes:")
        print(inconsistent_course_codes[['CourseCode']].drop_duplicates())

    # Normalise instructor names, then list the names that occur more than once.
    df['Instructor'] = df['Instructor'].str.strip().str.title()
    instructor_inconsistencies = df['Instructor'].value_counts()
    print("\nInstructor Inconsistencies:")
    print(instructor_inconsistencies[instructor_inconsistencies > 1])

    # List rooms that occur more than once to help spot spelling variations.
    print("\nRoom Variations:")
    room_variations = df['Room'].value_counts()
    print(room_variations[room_variations > 1])

    # Identify rows with missing key fields that should generally be non-null
    key_columns = ['CourseCode', 'CourseTitle', 'Cr', 'Instructor']
    missing_key_fields = df[df[key_columns].isnull().any(axis=1)]
    if not missing_key_fields.empty:
        print("\nRows with Missing Key Fields:")
        print(missing_key_fields[key_columns])

    # Display range and cardinality of numeric fields
    print("\nData Types and Anomalies in Numeric Fields:")
    for column in df.select_dtypes(include=['number']).columns:
        print(f"{column} - Min: {df[column].min()}, Max: {df[column].max()}, Unique Values: {df[column].nunique()}")

    return df
82
+
83
+
84
def main():
    """Build the course DataFrame from the workbook and print diagnostics.

    Kept behind the ``__main__`` guard so that importing this module (for
    example to reuse ``create_course_dataframe``) no longer reads the workbook
    or prints a report as an import side effect.

    Raises:
        SystemExit: If the workbook could not be processed.
    """
    file_path = "data/FTCM_Course_List_Spring2025.xlsx"
    result = process_xlsx(file_path)

    if not result:
        print(f"Error processing file. {file_path}")
        # Nothing was parsed, so there are no headers or rows to build from;
        # stop here instead of failing later on undefined names.
        raise SystemExit(1)

    column_names, department_program_courses = result
    print(f"Column Names:{column_names}")

    cleaned_column_names = ['CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)',
                            'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo',
                            'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo',
                            'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program']

    df = create_course_dataframe(cleaned_column_names, column_names, department_program_courses)
    # First pass normalises instructor names; the second pass reports on the
    # normalised data.
    df_cleaned = diagnose_inconsistencies(df)
    diagnose_inconsistencies(df_cleaned)


if __name__ == "__main__":
    main()
df2sqlite.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import pandas as pd
3
+ import logging
4
+ from create_course_dataframe import create_course_dataframe, process_xlsx
5
+
6
# Root logger: timestamped records at INFO and above, written to a stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
11
+
12
def save_to_sqlite(df, database_name='Spring_2025_courses.db', table_name='Spring_2025_courses'):
    """Save a DataFrame to a SQLite table, replacing any existing table.

    Args:
        df: The DataFrame to store.
        database_name: Path of the SQLite database file.
        table_name: Name of the table to (re)create.

    Returns:
        True when the data was written and the row count could be read back,
        False when any step failed (the error is logged).
    """
    conn = None
    try:
        logging.info(f"Connecting to database: {database_name}")
        conn = sqlite3.connect(database_name)

        logging.info(f"Saving DataFrame to table: {table_name}")
        df.to_sql(table_name, conn, if_exists='replace', index=False)

        logging.info("Data successfully saved to SQLite database")

        # Verify the data. The identifier is double-quoted (with embedded
        # quotes doubled) so table names containing spaces or keywords do not
        # turn a successful save into a reported failure.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        row_count = conn.execute(f"SELECT COUNT(*) FROM {quoted_table}").fetchone()[0]
        logging.info(f"Verified {row_count} rows in table {table_name}")

        return True

    except Exception as e:
        logging.error(f"Error saving to database: {str(e)}")
        return False

    finally:
        # Close the connection on both the success and the failure path.
        if conn is not None:
            conn.close()
35
+
36
def _main():
    """Parse the workbook, build the course DataFrame and store it in SQLite."""
    file_path = "data/FTCM_Course_List_Spring2025.xlsx"
    result = process_xlsx(file_path)

    if not result:
        logging.error("Failed to process XLSX file")
        return

    column_names, department_program_courses = result

    # Header cells with spaces and line breaks removed; empty cells are kept as-is.
    modified_column_names = []
    for name in column_names:
        modified_column_names.append(name.replace(' ', '').replace('\n', '') if name else name)

    cleaned_column_names = [
        'CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)',
        'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo',
        'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo',
        'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program',
    ]

    df = create_course_dataframe(cleaned_column_names, modified_column_names, department_program_courses)

    saved = save_to_sqlite(df)
    if saved:
        logging.info("Process completed successfully")
    else:
        logging.error("Failed to save data to SQLite")


if __name__ == "__main__":
    _main()
functions_huggingface.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
2
+ import sqlite3
3
+ from langchain_core.prompts import ChatPromptTemplate
4
+ from langchain_core.output_parsers import StrOutputParser
5
+ from langchain_core.runnables import RunnablePassthrough
6
+ import re
7
+ import gradio as gr
8
+
9
+ # Load the Llama model and tokenizer
10
# Model, tokenizer and text-generation pipeline are created once at import time.
model_name = "meta-llama/Llama-3.3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Initialize database connection
db_path = "Spring_2025_courses.db"
# check_same_thread=False: this connection is created on the importing thread
# but used inside the Gradio handler, which may run on a worker thread. With
# the default setting sqlite3 raises ProgrammingError for such cross-thread use.
# NOTE(review): only SELECT statements are meant to run on this connection;
# writes would need external serialisation.
connection = sqlite3.connect(db_path, check_same_thread=False)
18
+
19
def get_schema(_=None):
    """Retrieve the database schema.

    Args:
        _: Ignored. Accepted so the function also works as the callable in
            ``RunnablePassthrough.assign(schema=get_schema)``, which is
            expected to pass the chain input as a positional argument.

    Returns:
        A dict mapping each table name to the list of its column names.
    """
    cursor = connection.cursor()
    try:
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names = [row[0] for row in cursor.fetchall()]

        schema = {}
        for table_name in table_names:
            # Double-quote the identifier so names with spaces or keywords work.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"PRAGMA table_info({quoted_name});")
            # Index 1 of each table_info row holds the column name.
            schema[table_name] = [column[1] for column in cursor.fetchall()]
        return schema
    finally:
        cursor.close()
31
+
32
def run_query(query):
    """Run ``query`` on the module-level connection and return all result rows."""
    rows = connection.cursor().execute(query).fetchall()
    return rows
37
+
38
+ # Prompt templates
39
# System instructions for the model; ``{schema}`` is filled in at call time.
system_prompt = """
You are a SQLite expert. Given an input question, create one syntactically correct SQLite query to run. Generate only one query. No preamble.

Here is the relevant table information:
schema: {schema}

Tips:
- Use LIKE instead of = in the queries

Write only one SQLite query that would answer the user's question.
"""

# Per-question message; expects ``{schema}`` and ``{question}``.
human_prompt = """Based on the table schema below, write a SQL query that would answer the user's question:
{schema}

Question: {question}
SQL Query:"""

prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", human_prompt)]
)

# Query generation chain: add the schema to the input, render the prompt,
# parse the output as a string.
# NOTE(review): there is no model step between the prompt and the parser, and
# generate_sql() does not use this chain — confirm whether it is still needed.
sql_generator = RunnablePassthrough.assign(schema=get_schema) | prompt | StrOutputParser()
68
+
69
def generate_sql(question):
    """Generate a SQL query for ``question`` with the text-generation pipeline.

    Args:
        question: The user's natural-language question.

    Returns:
        The model's completion only (prompt excluded), with surrounding
        whitespace stripped.
    """
    schema = get_schema()

    # system_prompt has no {question} placeholder, so formatting it alone drops
    # the question. Append the human prompt so the model sees what to answer.
    input_prompt = (
        system_prompt.format(schema=schema)
        + "\n"
        + human_prompt.format(schema=schema, question=question)
    )

    response = generator(
        input_prompt,
        # Bound the completion itself; max_length would also count the prompt
        # (which embeds the whole schema) against the limit.
        max_new_tokens=256,
        num_return_sequences=1,
        # Without this the returned text begins with the prompt, so the
        # caller's "starts with SELECT" validation could never pass.
        return_full_text=False,
    )
    return response[0]['generated_text'].strip()
75
+
76
def execute_safe_query(question):
    """Generate SQL for a natural-language question and run it if it is a SELECT.

    Args:
        question: The user's natural-language question.

    Returns:
        A dict with the keys ``"error"``, ``"query"`` and ``"result"``.
        On success ``error`` is None and ``result`` holds the fetched rows.
        On failure ``error`` holds a message, ``result`` is None, and ``query``
        holds the generated SQL when generation got that far (otherwise None).
    """
    sql_query = None
    try:
        # Generate SQL query
        sql_query = generate_sql(question)

        # Validate SQL query: anything that does not start with SELECT is refused.
        if not sql_query.strip().lower().startswith("select"):
            return {"error": "Only SELECT queries are allowed.", "query": sql_query, "result": None}

        # Execute query
        result = run_query(sql_query)
        return {"error": None, "query": sql_query, "result": result}

    except Exception as e:
        # Keep whatever SQL was generated so the caller can show what failed,
        # instead of always reporting the query as None.
        return {"error": str(e), "query": sql_query, "result": None}
92
+
93
+ # Deploy using Gradio
94
def query_interface(question):
    """Format the outcome of ``execute_safe_query`` as text for the Gradio UI.

    Args:
        question: The user's natural-language question.

    Returns:
        The generated query plus its result, or the error message plus the
        generated query when an error was reported.
    """
    response = execute_safe_query(question)
    error, query = response['error'], response['query']
    if not error:
        return f"Query: {query}\nResult: {response['result']}"
    return f"Error: {error}\nGenerated Query: {query}"
99
+
100
# Single text-in / text-out interface around query_interface.
iface = gr.Interface(
    title="SQLite Query Generator with Llama 3.3",
    description="Ask a natural language question about the Spring 2025 courses database and get the SQL query and results.",
    fn=query_interface,
    inputs="text",
    outputs="text",
)

if __name__ == "__main__":
    iface.launch()
process_xlsx.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from openpyxl import load_workbook
4
+
5
+ # Debug Mode (Set to False for production)
6
DEBUG_MODE = True

# Logging Configuration: DEBUG level while DEBUG_MODE is on, INFO otherwise;
# records go both to debug.log and to a stream handler.
logging.basicConfig(
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO if not DEBUG_MODE else logging.DEBUG,
)
17
+
18
def _split_course_codes(raw_code):
    """Expand a combined code such as "STA421/521" into individual course codes."""
    if not isinstance(raw_code, str) or "/" not in raw_code:
        # Non-string codes cannot be split; keep them as a single entry.
        return [raw_code]
    first, *suffixes = raw_code.split("/")
    # Every suffix reuses the first three characters of the leading code.
    return [first] + [first[:3] + suffix for suffix in suffixes]


def process_xlsx(file_path):
    """Parse the course-list workbook into headers plus a nested course mapping.

    Rows after the header are classified by their first cell and "Cr" cell:
    a "Cr" cell starting with "cr" (any case) marks a department row, an empty
    "Cr" cell marks a program row, and a numeric "Cr" cell marks a course row.
    Other rows with a non-empty first cell are logged and skipped.

    Args:
        file_path: Path of the ``.xlsx`` file; the active sheet is read.

    Returns:
        ``(column_names, department_program_courses)`` where ``column_names``
        is the header row and ``department_program_courses`` maps
        ``department -> program -> [course_row, ...]``. Courses that are not
        under a program are stored under a key equal to the department name.
        Returns None when the header row or the "Cr" column cannot be found,
        or when any other error occurs (the error is logged).
    """
    logging.info(f"Processing XLSX file: {file_path}")

    try:
        # Load XLSX content
        wb = load_workbook(filename=file_path)
        sheet = wb.active
        xlsx_content = [[cell.value for cell in row] for row in sheet.rows]

        # Locate the header row explicitly so a missing header is reported as
        # such instead of surfacing as an IndexError.
        header_index = next(
            (i for i, row in enumerate(xlsx_content) if "Course Code" in row),
            None,
        )
        if header_index is None:
            logging.error("Could not find 'Course Code' header row.")
            return None

        column_names = xlsx_content[header_index]
        cr_index = next(
            (i for i, x in enumerate(column_names) if re.match(r"Cr", str(x))),
            None,
        )
        if cr_index is None:
            logging.error("Could not find 'Cr' column index.")
            return None

        department_program_courses = {}
        current_department = None
        current_program = None

        # Process the rows that follow the header
        for row in xlsx_content[header_index + 1:]:
            first_cell = row[0]
            if not first_cell:
                continue
            cr_value = row[cr_index]

            # Department Row Detection (Loose pattern for "Cr")
            if cr_value and re.match(r"cr", str(cr_value), re.IGNORECASE):
                current_department = first_cell
                department_program_courses.setdefault(current_department, {})
                current_program = None
                logging.debug(f"Detected Department: {current_department}")

            # Program Row Detection (Empty "Cr" column). Only a truly empty
            # cell counts, so a numeric 0 falls through to the course branch.
            elif cr_value is None or cr_value == "":
                if current_department is None:
                    logging.warning(f"Program row appears before any department row: {repr(row)}")
                current_program = first_cell
                # setdefault on the department too, so a program row before any
                # department row does not abort the whole file with a KeyError.
                programs = department_program_courses.setdefault(current_department, {})
                programs.setdefault(current_program, [])
                logging.debug(f"Detected Program under {current_department}: {current_program}")

            # Course Row Detection (Numeric "Cr" value)
            elif isinstance(cr_value, (int, float)):
                if current_department is None:
                    logging.warning(f"Course row appears before any department row: {repr(row)}")
                course_codes = _split_course_codes(first_cell)
                if len(course_codes) > 1:
                    logging.info(f"Splitting course for row: {repr(row)}")
                    logging.info(f"course_codes: {course_codes}")

                # Assign courses to the program if one is active, otherwise
                # directly to the department (keyed by the department name).
                programs = department_program_courses.setdefault(current_department, {})
                bucket = programs.setdefault(current_program or current_department, [])
                for code in course_codes:
                    new_row = row[:]   # Copy original row
                    new_row[0] = code  # One row per split course code
                    bucket.append(new_row)
                logging.debug(f"Added Course(s) {course_codes} under {current_program or 'directly in department'} in {current_department}")

            else:
                logging.info(f"Skipping row: {repr(row)}")

        return (column_names, department_program_courses)

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        return None
93
+
94
if __name__ == "__main__":
    # Manual check: parse the workbook and dump headers plus the nested mapping.
    file_path = "data/FTCM_Course_List_Spring2025.xlsx"
    result = process_xlsx(file_path)

    if not result:
        print("Failed to process XLSX file")
    else:
        column_names, department_program_courses = result
        print("Column Names:")
        # Header cells with spaces and line breaks removed; empty cells stay as-is.
        modified_column_names = []
        for name in column_names:
            modified_column_names.append(name.replace(' ', '').replace('\n', '') if name else name)
        print(modified_column_names)
        print("\nDepartment, Program, Courses:")
        for department, programs in department_program_courses.items():
            print(f"**Department: {department}**")
            for program, courses in programs.items():
                print(f" Program: *{program}")
                for course in courses:
                    print(f" - Course: {course}")