Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- FTCM_Course_List_Spring2025.xlsx +0 -0
- Spring_2025_courses.db +0 -0
- app.py +21 -56
- create_course_dataframe.py +101 -0
- df2sqlite.py +58 -0
- functions_huggingface.py +109 -0
- process_xlsx.py +112 -0
FTCM_Course_List_Spring2025.xlsx
ADDED
Binary file (45.5 kB). View file
|
|
Spring_2025_courses.db
ADDED
Binary file (36.9 kB). View file
|
|
app.py
CHANGED
@@ -1,64 +1,29 @@
|
|
1 |
import gradio as gr
|
2 |
-
from
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
"""
|
7 |
-
client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
|
8 |
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
def
|
11 |
-
|
12 |
-
history: list[tuple[str, str]],
|
13 |
-
system_message,
|
14 |
-
max_tokens,
|
15 |
-
temperature,
|
16 |
-
top_p,
|
17 |
-
):
|
18 |
-
messages = [{"role": "system", "content": system_message}]
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
response = ""
|
29 |
-
|
30 |
-
for message in client.chat_completion(
|
31 |
-
messages,
|
32 |
-
max_tokens=max_tokens,
|
33 |
-
stream=True,
|
34 |
-
temperature=temperature,
|
35 |
-
top_p=top_p,
|
36 |
-
):
|
37 |
-
token = message.choices[0].delta.content
|
38 |
-
|
39 |
-
response += token
|
40 |
-
yield response
|
41 |
-
|
42 |
-
|
43 |
-
"""
|
44 |
-
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
|
45 |
-
"""
|
46 |
-
demo = gr.ChatInterface(
|
47 |
-
respond,
|
48 |
-
additional_inputs=[
|
49 |
-
gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
|
50 |
-
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
|
51 |
-
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
|
52 |
-
gr.Slider(
|
53 |
-
minimum=0.1,
|
54 |
-
maximum=1.0,
|
55 |
-
value=0.95,
|
56 |
-
step=0.05,
|
57 |
-
label="Top-p (nucleus sampling)",
|
58 |
-
),
|
59 |
-
],
|
60 |
-
)
|
61 |
|
|
|
|
|
|
|
|
|
62 |
|
63 |
if __name__ == "__main__":
|
64 |
-
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from functions import safe_executor, example_queries
|
3 |
|
4 |
+
def execute_query(query):
    """Run the user's text through the safe executor and return text to display.

    Args:
        query: Text taken from the message box.

    Returns:
        ``result['result']`` when the executor reports success, otherwise
        ``result['error']``.
    """
    result = safe_executor.execute_safe_query(query)

    # Test the flag's truthiness instead of comparing it to True with ``==``.
    if result['success']:
        return result['result']
    return result['error']
|
11 |
|
12 |
+
def update_textbox(prompt):
    """Return a Gradio update that sets the target component's value to *prompt*."""
    textbox_update = gr.update(value=prompt)
    return textbox_update
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
# Build the UI: a read-only response box, a free-text message box, and a row
# of controls (premade-prompt dropdown, submit button, clear button).
# NOTE(review): the nesting under gr.Row() is assumed -- confirm which
# components are meant to share the row.
with gr.Blocks() as demo:
    response_box = gr.Textbox(label="Response", interactive=False)
    msg = gr.Textbox(label="Type your message or select a prompt")
    with gr.Row():
        # The leading "" entry is the "nothing selected" choice and the default.
        prompt_dropdown = gr.Dropdown(choices=[""] + example_queries, label="Select a premade prompt", value="")
        submit = gr.Button("Submit")
        clear = gr.ClearButton([msg, response_box])

    # Choosing a premade prompt copies it into the message box.
    prompt_dropdown.change(update_textbox, inputs=[prompt_dropdown], outputs=[msg])
    # Both the button and pressing Enter in the message box run the query.
    submit.click(execute_query, inputs=[msg], outputs=[response_box])
    msg.submit(execute_query, inputs=[msg], outputs=[response_box])
    # NOTE(review): ClearButton is already constructed with [msg, response_box];
    # this extra handler looks redundant -- confirm before removing.
    clear.click(lambda: None, None, [msg, response_box], queue=False)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
create_course_dataframe.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from process_xlsx import process_xlsx
|
2 |
+
import pandas as pd
|
3 |
+
import re
|
4 |
+
|
5 |
+
# Step 1: Convert to DataFrame
|
6 |
+
def create_course_dataframe(cleaned_column_names, column_names, department_program_courses):
    """Flatten the department -> program -> courses mapping into a DataFrame.

    Args:
        cleaned_column_names: Names (after normalisation) of the columns to
            keep, in output order.
        column_names: Raw header labels for the cells of each course row.
        department_program_courses: Mapping of department to a mapping of
            program to a list of course rows (each row a list of cells).

    Returns:
        DataFrame with one row per course, restricted to
        ``cleaned_column_names``. 'Department' and 'Program' columns are
        appended to every row; a falsy program is stored as "N/A".
    """
    rows = []
    for department, programs in department_program_courses.items():
        for program, courses in programs.items():
            for course in courses:
                # Build a fresh list so the caller's course row is not mutated.
                rows.append(list(course) + [department, program or "N/A"])

    frame = pd.DataFrame(rows, columns=column_names + ['Department', 'Program'])

    # Normalise header labels: trim, then drop inner spaces and newlines.
    normalised = frame.columns.str.strip().str.replace(' ', '').str.replace('\n', '')
    frame.columns = normalised

    # Force every label to a stripped string (non-string labels included).
    frame = frame.rename(columns=lambda label: str(label).strip())

    # Trim the requested names the same way before selecting.
    wanted = [name.strip() for name in cleaned_column_names]
    return frame[wanted]
|
34 |
+
|
35 |
+
# Step 2: Diagnose Inconsistencies in Data
|
36 |
+
def diagnose_inconsistencies(df):
    """Print a data-quality report for the course DataFrame.

    The report covers missing values, unique-value counts, course codes that
    do not match the expected pattern, repeated instructor and room values,
    rows missing key fields, and min/max of numeric columns.

    Side effect: the 'Instructor' column of *df* is normalised in place
    (stripped and title-cased).

    Args:
        df: DataFrame with at least the columns 'CourseCode', 'CourseTitle',
            'Cr', 'Instructor' and 'Room'.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Report missing values
    missing_values = df.isnull().sum()
    print("\nMissing Values Per Column:")
    print(missing_values[missing_values > 0])

    # Check unique value counts to spot potential inconsistencies
    print("\nUnique Value Counts Per Column:")
    for column in df.columns:
        unique_vals = df[column].nunique()
        print(f"{column}: {unique_vals} unique values")

    print("\nInconsistent Patterns and Values:")

    # Pattern check for Course Code (e.g. 'MAT101', 'STA421/521').
    # na=False makes missing codes count as "no match"; without it the mask
    # contains NaN and the ``~`` negation raises TypeError.
    matches_pattern = df['CourseCode'].str.match(r'^[A-Z]{3}\d{3}(/\d{3})?$', na=False)
    inconsistent_course_codes = df[~matches_pattern]
    if not inconsistent_course_codes.empty:
        print("\nInconsistent Course Codes:")
        print(inconsistent_course_codes[['CourseCode']].drop_duplicates())

    # Normalise instructor names, then list the names that occur more than once.
    df['Instructor'] = df['Instructor'].str.strip().str.title()
    instructor_inconsistencies = df['Instructor'].value_counts()
    print("\nInstructor Inconsistencies:")
    print(instructor_inconsistencies[instructor_inconsistencies > 1])

    # List room values that occur more than once.
    print("\nRoom Variations:")
    room_variations = df['Room'].value_counts()
    print(room_variations[room_variations > 1])

    # Identify rows with missing key fields that should generally be non-null
    key_columns = ['CourseCode', 'CourseTitle', 'Cr', 'Instructor']
    missing_key_fields = df[df[key_columns].isnull().any(axis=1)]
    if not missing_key_fields.empty:
        print("\nRows with Missing Key Fields:")
        print(missing_key_fields[key_columns])

    # Display range and cardinality of numeric fields
    print("\nData Types and Anomalies in Numeric Fields:")
    for column in df.select_dtypes(include=['number']).columns:
        print(f"{column} - Min: {df[column].min()}, Max: {df[column].max()}, Unique Values: {df[column].nunique()}")

    return df
|
82 |
+
|
83 |
+
|
84 |
+
# NOTE(review): these statements run at import time, so any module that
# imports from this file (df2sqlite.py does) also triggers the workbook
# processing and the printed report -- consider a __main__ guard.
file_path = "data/FTCM_Course_List_Spring2025.xlsx"
result = process_xlsx(file_path)

# Columns kept in the final DataFrame, in output order.
cleaned_column_names = ['CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)',
                        'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo',
                        'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo',
                        'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program']

if result:
    column_names, department_program_courses = result
    print(f"Column Names:{column_names}")

    # Sample usage. This only runs when the workbook was parsed; previously a
    # failed parse fell through to a NameError on column_names.
    df = create_course_dataframe(cleaned_column_names, column_names, department_program_courses)
    df_cleaned = diagnose_inconsistencies(df)
    # Second pass reports on the data after the instructor normalisation.
    diagnose_inconsistencies(df_cleaned)
else:
    print(f"Error processing file. {file_path}")
|
df2sqlite.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sqlite3
|
2 |
+
import pandas as pd
|
3 |
+
import logging
|
4 |
+
from create_course_dataframe import create_course_dataframe, process_xlsx
|
5 |
+
|
6 |
+
# Root logger setup: timestamped records at INFO and above, sent to a stream handler.
_CONSOLE_HANDLER = logging.StreamHandler()
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
    handlers=[_CONSOLE_HANDLER],
)
|
11 |
+
|
12 |
+
def save_to_sqlite(df, database_name='Spring_2025_courses.db', table_name='Spring_2025_courses'):
    """Save DataFrame to SQLite Database.

    The table is replaced if it already exists. After writing, the row count
    is read back and logged as a sanity check.

    Args:
        df: DataFrame to persist.
        database_name: Path of the SQLite database file (or ":memory:").
        table_name: Name of the destination table.

    Returns:
        True when the data was written and verified, False on any error
        (the error is logged).
    """
    conn = None
    try:
        logging.info(f"Connecting to database: {database_name}")
        conn = sqlite3.connect(database_name)

        logging.info(f"Saving DataFrame to table: {table_name}")
        df.to_sql(table_name, conn, if_exists='replace', index=False)

        logging.info("Data successfully saved to SQLite database")

        # Verify the data. The identifier is double-quoted (with embedded
        # quotes doubled) so names containing spaces or keywords still work.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        row_count = pd.read_sql(f"SELECT COUNT(*) FROM {quoted_table}", conn).iloc[0, 0]
        logging.info(f"Verified {row_count} rows in table {table_name}")

        return True

    except Exception as e:
        logging.error(f"Error saving to database: {str(e)}")
        return False

    finally:
        # Close on every path; previously a failure left the connection open.
        if conn is not None:
            conn.close()
|
35 |
+
|
36 |
+
if __name__ == "__main__":
    # Parse the workbook, build the DataFrame, then persist it.
    file_path = "data/FTCM_Course_List_Spring2025.xlsx"
    result = process_xlsx(file_path)

    if not result:
        logging.error("Failed to process XLSX file")
    else:
        column_names, department_program_courses = result

        # Header labels with spaces and newlines removed; empty labels pass through.
        modified_column_names = []
        for name in column_names:
            modified_column_names.append(name.replace(' ', '').replace('\n', '') if name else name)

        # Columns to keep, in output order.
        cleaned_column_names = [
            'CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)',
            'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo',
            'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo',
            'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program',
        ]

        df = create_course_dataframe(cleaned_column_names, modified_column_names, department_program_courses)

        saved = save_to_sqlite(df)
        if saved:
            logging.info("Process completed successfully")
        else:
            logging.error("Failed to save data to SQLite")
|
functions_huggingface.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
|
2 |
+
import sqlite3
|
3 |
+
from langchain_core.prompts import ChatPromptTemplate
|
4 |
+
from langchain_core.output_parsers import StrOutputParser
|
5 |
+
from langchain_core.runnables import RunnablePassthrough
|
6 |
+
import re
|
7 |
+
import gradio as gr
|
8 |
+
|
9 |
+
# Load the Llama model and tokenizer.
# NOTE: these statements execute at import time, so importing this module
# loads the tokenizer and model for the checkpoint named below.
model_name = "meta-llama/Llama-3.3-70B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
# Text-generation pipeline; generate_sql() calls this with the prompt text.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
|
14 |
+
|
15 |
+
# Initialize database connection.
# The connection is created once at import time and shared by get_schema()
# and run_query(). sqlite3 by default refuses to use a connection from any
# thread other than the one that created it, and the UI callbacks may run on
# a different thread, so the same-thread check is disabled here.
db_path = "Spring_2025_courses.db"
connection = sqlite3.connect(db_path, check_same_thread=False)
|
18 |
+
|
19 |
+
def get_schema(_=None):
    """Retrieve database schema.

    Args:
        _: Ignored. Accepted so the function also works as a one-argument
            callable; it is passed to ``RunnablePassthrough.assign``, which
            presumably invokes it with the chain input.

    Returns:
        Dict mapping each table name to the list of its column names.
    """
    cursor = connection.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    schema = {}
    for (table_name,) in cursor.fetchall():
        # Double-quote the identifier so table names containing spaces or
        # keywords do not break the PRAGMA statement.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        cursor.execute(f"PRAGMA table_info({quoted_name});")
        # Index 1 of each table_info row is the column name.
        schema[table_name] = [column[1] for column in cursor.fetchall()]
    return schema
|
31 |
+
|
32 |
+
def run_query(query):
    """Run *query* on the shared connection and return every resulting row."""
    # Cursor.execute returns the cursor itself, so the calls can be chained.
    return connection.cursor().execute(query).fetchall()
|
37 |
+
|
38 |
+
# Prompt templates
# system_prompt carries the instructions and a {schema} placeholder only;
# it has no {question} placeholder.
system_prompt = """
You are a SQLite expert. Given an input question, create one syntactically correct SQLite query to run. Generate only one query. No preamble.

Here is the relevant table information:
schema: {schema}

Tips:
- Use LIKE instead of = in the queries

Write only one SQLite query that would answer the user's question.
"""

# human_prompt carries both the {schema} and the user's {question}.
human_prompt = """Based on the table schema below, write a SQL query that would answer the user's question:
{schema}

Question: {question}
SQL Query:"""

# Two-message chat template: system instructions followed by the user turn.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", human_prompt),
])

# Build query generation chain
# NOTE(review): this chain has no model step between the prompt and the
# output parser, and nothing in the code shown here invokes it -- confirm
# whether it is still needed.
sql_generator = (
    RunnablePassthrough.assign(schema=get_schema)
    | prompt
    | StrOutputParser()
)
|
68 |
+
|
69 |
+
def generate_sql(question):
    """Generate SQL query from question.

    Args:
        question: Natural-language question about the database.

    Returns:
        The text produced by the model, stripped of surrounding whitespace.
    """
    schema = get_schema()

    # system_prompt has no {question} placeholder, so formatting it alone
    # never showed the question to the model. Append the human prompt, which
    # carries the question and ends with the "SQL Query:" cue.
    input_prompt = (
        system_prompt.format(schema=schema)
        + "\n"
        + human_prompt.format(schema=schema, question=question)
    )

    response = generator(
        input_prompt,
        # Budget for generated tokens only; max_length would also count the
        # prompt, which grows with the schema.
        max_new_tokens=512,
        num_return_sequences=1,
        # Return only the completion. By default the pipeline echoes the
        # prompt, so the result could never start with "select".
        return_full_text=False,
    )
    return response[0]['generated_text'].strip()
|
75 |
+
|
76 |
+
def execute_safe_query(question):
    """Safely execute a natural language query.

    Generates SQL for *question*, refuses anything that does not start with
    SELECT, and runs the query otherwise.

    Args:
        question: Natural-language question about the database.

    Returns:
        Dict with keys "error" (message or None), "query" (the generated SQL,
        or None if generation itself failed) and "result" (rows or None).
    """
    # Kept outside the try so the except branch can still report the SQL
    # when execution (rather than generation) is what failed.
    sql_query = None
    try:
        # Generate SQL query
        sql_query = generate_sql(question)

        # Validate SQL query
        if not sql_query.strip().lower().startswith("select"):
            return {"error": "Only SELECT queries are allowed.", "query": sql_query, "result": None}

        # Execute query
        result = run_query(sql_query)
        return {"error": None, "query": sql_query, "result": result}

    except Exception as e:
        return {"error": str(e), "query": sql_query, "result": None}
|
92 |
+
|
93 |
+
# Deploy using Gradio
|
94 |
+
def query_interface(question):
    """Format the outcome of execute_safe_query as display text.

    Returns an "Error: ..." message with the generated query when the
    response carries an error, otherwise the query followed by its result.
    """
    outcome = execute_safe_query(question)
    error = outcome['error']
    if not error:
        return f"Query: {outcome['query']}\nResult: {outcome['result']}"
    return f"Error: {error}\nGenerated Query: {outcome['query']}"
|
99 |
+
|
100 |
+
# Text-in / text-out Gradio interface wrapping query_interface.
iface = gr.Interface(
    fn=query_interface,
    inputs="text",
    outputs="text",
    title="SQLite Query Generator with Llama 3.3",
    description="Ask a natural language question about the Spring 2025 courses database and get the SQL query and results.",
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|
process_xlsx.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import re
|
3 |
+
from openpyxl import load_workbook
|
4 |
+
|
5 |
+
# Debug Mode (Set to False for production)
DEBUG_MODE = True

# Logging Configuration: DEBUG level while DEBUG_MODE is on, INFO otherwise;
# records go to both "debug.log" and a stream handler.
_LOG_LEVEL = logging.DEBUG if DEBUG_MODE else logging.INFO
_LOG_HANDLERS = [
    logging.FileHandler("debug.log"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=_LOG_LEVEL,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=_LOG_HANDLERS,
)
|
17 |
+
|
18 |
+
def _split_course_codes(raw_code):
    """Expand a cross-listed code such as "STA421/521" into one code per listing."""
    if not isinstance(raw_code, str) or "/" not in raw_code:
        return [raw_code]

    parts = [part.strip() for part in raw_code.split("/") if part.strip()]
    if not parts:
        return [raw_code]

    first = parts[0]
    # The subject prefix is the leading letters of the first code ("STA" in
    # "STA421"); fall back to the first three characters if there are none.
    prefix_match = re.match(r"[A-Za-z]+", first)
    prefix = prefix_match.group(0) if prefix_match else first[:3]

    codes = [first]
    for part in parts[1:]:
        # A part that already starts with letters is a complete code.
        codes.append(part if part[0].isalpha() else prefix + part)
    return codes


def process_xlsx(file_path):
    """Parse the course-list workbook into a department/program/course mapping.

    The active sheet is scanned for the header row (the first row containing
    a "Course Code" cell). Every later row with a non-empty first cell is
    classified by its "Cr" cell:

    * text starting with "cr" (any case)  -> department row
    * empty                               -> program row
    * numeric                             -> course row
    * anything else                       -> skipped (logged)

    Courses that appear before any program row are stored under a program key
    equal to the department name. Cross-listed codes ("STA421/521") produce
    one row per code.

    Args:
        file_path: Path to the .xlsx file.

    Returns:
        Tuple ``(column_names, department_program_courses)`` where
        ``column_names`` is the raw header row and
        ``department_program_courses`` maps department -> program -> list of
        course rows; or None if the file cannot be read or has no usable
        header.
    """
    logging.info(f"Processing XLSX file: {file_path}")

    try:
        # Load XLSX content
        wb = load_workbook(filename=file_path)
        sheet = wb.active
        xlsx_content = [[cell.value for cell in row] for row in sheet.rows]

        # Locate the header row explicitly instead of running off the end of
        # the sheet and failing with an IndexError.
        header_index = None
        for index, row in enumerate(xlsx_content):
            if any(isinstance(cell, str) and cell.strip() == "Course Code" for cell in row):
                header_index = index
                break
        if header_index is None:
            logging.error("Could not find the 'Course Code' header row.")
            return None
        column_names = xlsx_content[header_index]

        cr_index = next(
            (i for i, x in enumerate(column_names) if re.match(r"Cr", str(x))),
            None,
        )
        if cr_index is None:
            logging.error("Could not find 'Cr' column index.")
            return None

        department_program_courses = {}
        current_department = None
        current_program = None

        # Process rows below the header
        for row in xlsx_content[header_index + 1:]:
            label = row[0]
            if not label:
                continue
            cr_value = row[cr_index]

            # Department Row Detection (Loose pattern for "Cr")
            if cr_value and re.match(r"cr", str(cr_value), re.IGNORECASE):
                current_department = label
                department_program_courses.setdefault(current_department, {})
                current_program = None
                logging.debug(f"Detected Department: {current_department}")

            # Program Row Detection (Empty "Cr" column). Only a truly empty
            # cell counts: a numeric 0 is a zero-credit course, not a program.
            elif cr_value is None or cr_value == "":
                if current_department is None:
                    logging.warning(f"Skipping program row before any department: {repr(row)}")
                    continue
                current_program = label
                department_program_courses[current_department].setdefault(current_program, [])
                logging.debug(f"Detected Program under {current_department}: {current_program}")

            # Course Row Detection (Numeric "Cr" value)
            elif isinstance(cr_value, (int, float)):
                if current_department is None:
                    logging.warning(f"Skipping course row before any department: {repr(row)}")
                    continue

                course_codes = _split_course_codes(label)
                if len(course_codes) > 1:
                    logging.info(f"Splitting course for row: {repr(row)}")
                    logging.info(f"course_codes: {course_codes}")

                # Courses with no program are filed under the department name.
                program_key = current_program if current_program else current_department
                bucket = department_program_courses[current_department].setdefault(program_key, [])
                for code in course_codes:
                    new_row = row[:]  # Copy original row
                    new_row[0] = code  # Update course code for each split course
                    bucket.append(new_row)
                logging.debug(f"Added Course(s) {course_codes} under {current_program or 'directly in department'} in {current_department}")

            else:
                logging.info(f"Skipping row: {repr(row)}")

        return (column_names, department_program_courses)

    except Exception as e:
        # Top-level boundary: callers rely on None to signal failure.
        logging.error(f"An error occurred: {str(e)}")
        return None
|
93 |
+
|
94 |
+
if __name__ == "__main__":
    # Manual check: parse the workbook and dump what was found.
    file_path = "data/FTCM_Course_List_Spring2025.xlsx"
    result = process_xlsx(file_path)

    if not result:
        print("Failed to process XLSX file")
    else:
        column_names, department_program_courses = result
        print("Column Names:")
        # Header labels with spaces and line breaks removed; empty labels pass through.
        modified_column_names = []
        for name in column_names:
            modified_column_names.append(name.replace(' ', '').replace('\n', '') if name else name)
        print(modified_column_names)
        print("\nDepartment, Program, Courses:")
        for department, programs in department_program_courses.items():
            print(f"**Department: {department}**")
            for program, courses in programs.items():
                print(f" Program: *{program}")
                for course in courses:
                    print(f" - Course: {course}")
|