Course_query_system / create_course_dataframe.py
CD17's picture
Upload 7 files
6340cf4 verified
from process_xlsx import process_xlsx
import pandas as pd
import re
# Step 1: Convert to DataFrame
def create_course_dataframe(cleaned_column_names, column_names, department_program_courses):
data = []
for department, programs in department_program_courses.items():
for program, courses in programs.items():
for course in courses:
row_data = course[:] # Copy original row
row_data.append(department) # Add Department column
row_data.append(program if program else "N/A") # Add Program column
data.append(row_data)
# Add Department and Program to the column names
extended_column_names = column_names + ['Department', 'Program']
# Create DataFrame
df = pd.DataFrame(data, columns=extended_column_names)
# Strip trailing spaces from column names and remove spaces/newlines
df.columns = df.columns.str.strip().str.replace(' ', '').str.replace('\n', '')
# Ensure all column names are strings before stripping
df.rename(columns=lambda x: str(x).strip(), inplace=True)
# Clean the cleaned_column_names to match the stripped column names
cleaned_column_names = [col.strip() for col in cleaned_column_names]
# Select columns based on cleaned_column_names
df = df[cleaned_column_names]
return df
# Step 2: Diagnose Inconsistencies in Data
def diagnose_inconsistencies(df):
# Report missing values
missing_values = df.isnull().sum()
print("\nMissing Values Per Column:")
print(missing_values[missing_values > 0])
# Check unique value counts to spot potential inconsistencies
print("\nUnique Value Counts Per Column:")
for column in df.columns:
unique_vals = df[column].nunique()
print(f"{column}: {unique_vals} unique values")
# Identify potential misspellings and inconsistent values in key columns
# Example: Checking for inconsistencies in 'Course Code', 'Instructor', 'Room', etc.
print("\nInconsistent Patterns and Values:")
# Pattern checks for Course Code (e.g., expecting format like 'MAT101', 'STA421/521')
inconsistent_course_codes = df[~df['CourseCode'].str.match(r'^[A-Z]{3}\d{3}(/\d{3})?$')]
if not inconsistent_course_codes.empty:
print("\nInconsistent Course Codes:")
print(inconsistent_course_codes[['CourseCode']].drop_duplicates())
# Check for inconsistent capitalization in 'Instructor' column
df['Instructor'] = df['Instructor'].str.strip().str.title()
instructor_inconsistencies = df['Instructor'].value_counts()
print("\nInstructor Inconsistencies:")
print(instructor_inconsistencies[instructor_inconsistencies > 1])
# Check for possible misspellings or variations in Room
print("\nRoom Variations:")
room_variations = df['Room'].value_counts()
print(room_variations[room_variations > 1])
# Identify rows with missing key fields that should generally be non-null
key_columns = ['CourseCode', 'CourseTitle', 'Cr', 'Instructor']
missing_key_fields = df[df[key_columns].isnull().any(axis=1)]
if not missing_key_fields.empty:
print("\nRows with Missing Key Fields:")
print(missing_key_fields[key_columns])
# Display data types and any anomalies in numeric fields
print("\nData Types and Anomalies in Numeric Fields:")
for column in df.select_dtypes(include=['number']).columns:
print(f"{column} - Min: {df[column].min()}, Max: {df[column].max()}, Unique Values: {df[column].nunique()}")
return df
file_path = "data/FTCM_Course_List_Spring2025.xlsx"
result = process_xlsx(file_path)
if result:
column_names, department_program_courses = result
print(f"Column Names:{column_names}")
else:
print(f"Error processing file. {file_path}")
cleaned_column_names = ['CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)',
'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo',
'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo',
'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program']
# Sample usage
# Assuming column_names and department_program_courses are already defined
df = create_course_dataframe(cleaned_column_names, column_names, department_program_courses)
df_cleaned = diagnose_inconsistencies(df)
diagnose_inconsistencies(df_cleaned)