Spaces:
Sleeping
Sleeping
from process_xlsx import process_xlsx | |
import pandas as pd | |
import re | |
# Step 1: Convert to DataFrame | |
def create_course_dataframe(cleaned_column_names, column_names, department_program_courses): | |
data = [] | |
for department, programs in department_program_courses.items(): | |
for program, courses in programs.items(): | |
for course in courses: | |
row_data = course[:] # Copy original row | |
row_data.append(department) # Add Department column | |
row_data.append(program if program else "N/A") # Add Program column | |
data.append(row_data) | |
# Add Department and Program to the column names | |
extended_column_names = column_names + ['Department', 'Program'] | |
# Create DataFrame | |
df = pd.DataFrame(data, columns=extended_column_names) | |
# Strip trailing spaces from column names and remove spaces/newlines | |
df.columns = df.columns.str.strip().str.replace(' ', '').str.replace('\n', '') | |
# Ensure all column names are strings before stripping | |
df.rename(columns=lambda x: str(x).strip(), inplace=True) | |
# Clean the cleaned_column_names to match the stripped column names | |
cleaned_column_names = [col.strip() for col in cleaned_column_names] | |
# Select columns based on cleaned_column_names | |
df = df[cleaned_column_names] | |
return df | |
# Step 2: Diagnose Inconsistencies in Data | |
def diagnose_inconsistencies(df): | |
# Report missing values | |
missing_values = df.isnull().sum() | |
print("\nMissing Values Per Column:") | |
print(missing_values[missing_values > 0]) | |
# Check unique value counts to spot potential inconsistencies | |
print("\nUnique Value Counts Per Column:") | |
for column in df.columns: | |
unique_vals = df[column].nunique() | |
print(f"{column}: {unique_vals} unique values") | |
# Identify potential misspellings and inconsistent values in key columns | |
# Example: Checking for inconsistencies in 'Course Code', 'Instructor', 'Room', etc. | |
print("\nInconsistent Patterns and Values:") | |
# Pattern checks for Course Code (e.g., expecting format like 'MAT101', 'STA421/521') | |
inconsistent_course_codes = df[~df['CourseCode'].str.match(r'^[A-Z]{3}\d{3}(/\d{3})?$')] | |
if not inconsistent_course_codes.empty: | |
print("\nInconsistent Course Codes:") | |
print(inconsistent_course_codes[['CourseCode']].drop_duplicates()) | |
# Check for inconsistent capitalization in 'Instructor' column | |
df['Instructor'] = df['Instructor'].str.strip().str.title() | |
instructor_inconsistencies = df['Instructor'].value_counts() | |
print("\nInstructor Inconsistencies:") | |
print(instructor_inconsistencies[instructor_inconsistencies > 1]) | |
# Check for possible misspellings or variations in Room | |
print("\nRoom Variations:") | |
room_variations = df['Room'].value_counts() | |
print(room_variations[room_variations > 1]) | |
# Identify rows with missing key fields that should generally be non-null | |
key_columns = ['CourseCode', 'CourseTitle', 'Cr', 'Instructor'] | |
missing_key_fields = df[df[key_columns].isnull().any(axis=1)] | |
if not missing_key_fields.empty: | |
print("\nRows with Missing Key Fields:") | |
print(missing_key_fields[key_columns]) | |
# Display data types and any anomalies in numeric fields | |
print("\nData Types and Anomalies in Numeric Fields:") | |
for column in df.select_dtypes(include=['number']).columns: | |
print(f"{column} - Min: {df[column].min()}, Max: {df[column].max()}, Unique Values: {df[column].nunique()}") | |
return df | |
file_path = "data/FTCM_Course_List_Spring2025.xlsx" | |
result = process_xlsx(file_path) | |
if result: | |
column_names, department_program_courses = result | |
print(f"Column Names:{column_names}") | |
else: | |
print(f"Error processing file. {file_path}") | |
cleaned_column_names = ['CourseCode', 'CourseTitle', 'Cr', 'Prereq(s)', | |
'Instructor', 'Major/GE/Elective', 'Format', 'Mon', 'MonTo', | |
'Tue', 'TueTo', 'Wed', 'WedTo', 'Thu', 'ThuTo', | |
'Fri', 'FriTo', 'Sat', 'SatTo', 'Platform', 'New/Repeat', 'Room', 'Department', 'Program'] | |
# Sample usage | |
# Assuming column_names and department_program_courses are already defined | |
df = create_course_dataframe(cleaned_column_names, column_names, department_program_courses) | |
df_cleaned = diagnose_inconsistencies(df) | |
diagnose_inconsistencies(df_cleaned) |