AI-Data-Cleaner / manage_schema.py
reab5555's picture
Upload 5 files
1853d90 verified
raw
history blame contribute delete
No virus
1.78 kB
import pandas as pd
import numpy as np
import json
from llm_config import generate_llm_response
from llm_prompts import DETERMINE_DTYPE_PROMPT
# Upper bound on how many values of a column are randomly sampled and embedded
# in the LLM prompt when detecting the column's data type.
SAMPLE_SIZE = 200
def determine_column_type(df, column):
    """Ask the LLM to classify a column's data type from a random sample.

    Up to ``SAMPLE_SIZE`` values are drawn from ``df[column]`` (fixed seed, so
    the sample is reproducible) and formatted into ``DETERMINE_DTYPE_PROMPT``.
    The response is expected to be a JSON object with the keys
    ``column_type`` and ``invalid_indices``.

    Args:
        df: DataFrame containing the column.
        column: Label of the column to inspect.

    Returns:
        A ``(column_type, invalid_indices)`` tuple. ``invalid_indices`` is a
        list of *DataFrame index labels* for the sampled rows the LLM flagged.
        If the response cannot be parsed, ``('string', [])`` is returned.
    """
    # Keep the sampled Series (not only its values) so positions in the list
    # shown to the LLM can be translated back to DataFrame index labels.
    sampled = df[column].sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
    sample = sampled.tolist()
    prompt = DETERMINE_DTYPE_PROMPT.format(sample_values=str(sample))
    response = generate_llm_response(prompt)

    try:
        result = json.loads(response)
        column_type = result['column_type']
        reported = result['invalid_indices']
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers a non-string response (e.g. None) and JSON that
        # decodes to something other than an object (e.g. a list).
        print(f"Error parsing LLM response for column {column}")
        return 'string', []

    if not isinstance(reported, (list, tuple)):
        reported = []

    # The LLM only sees the sampled values, so any index it reports refers to
    # a position in that list, not to a row label of ``df``. Map positions to
    # labels and drop anything that is not a valid position.
    # NOTE(review): assumes the reported positions are 0-based - confirm
    # against DETERMINE_DTYPE_PROMPT.
    sample_len = len(sampled)
    invalid_labels = [
        sampled.index[pos]
        for pos in reported
        if isinstance(pos, int)
        and not isinstance(pos, bool)
        and 0 <= pos < sample_len
    ]
    return column_type, invalid_labels
def enforce_column_type(df, column, column_type, invalid_indices):
    """Convert ``df[column]`` to ``column_type`` and blank out flagged rows.

    ``'float'``, ``'integer'`` and ``'date'`` trigger a conversion in which
    unparseable values are coerced to missing; any other ``column_type``
    leaves the column's values as they are. Afterwards the rows listed in
    ``invalid_indices`` are set to NaN in that column.

    Args:
        df: DataFrame to modify; the column is reassigned on this object.
        column: Label of the column to convert.
        column_type: Target type name (``'float'``, ``'integer'``, ``'date'``
            or anything else for "no conversion").
        invalid_indices: Index labels of rows whose value should be nulled.
            Labels that are not present in ``df.index`` are ignored.

    Returns:
        The same DataFrame object, after modification.
    """
    if column_type == 'float':
        df[column] = pd.to_numeric(df[column], errors='coerce')
    elif column_type == 'integer':
        # Nullable Int64 keeps integer values while still allowing missing ones.
        # NOTE(review): astype('Int64') is expected to fail if the column
        # holds non-integral numbers (e.g. 1.5) - confirm desired handling.
        df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')
    elif column_type == 'date':
        df[column] = pd.to_datetime(df[column], errors='coerce')

    # The labels come from an LLM response, so some may not exist in the
    # frame; restrict to known labels because a list-like .loc assignment
    # with unknown labels fails with KeyError.
    known_labels = [label for label in invalid_indices if label in df.index]
    if known_labels:
        # Set invalid values to NaN
        df.loc[known_labels, column] = np.nan
    return df
def process_dataframe(df):
    """Detect and enforce a data type for each column of ``df``.

    For every column, the type and flagged rows are obtained from
    ``determine_column_type`` and applied with ``enforce_column_type``.
    Progress, the detected type, the number of flagged rows and the share of
    non-null values after enforcement are printed along the way.

    Args:
        df: DataFrame whose columns should be processed.

    Returns:
        The DataFrame returned by the last ``enforce_column_type`` call, or
        ``df`` itself when it has no columns.
    """
    print("Determining and enforcing column data types...")

    for column in df.columns:
        print(f"\nProcessing column: {column}")

        detection = determine_column_type(df, column)
        column_type, invalid_indices = detection
        print(f" Detected type: {column_type}")
        print(f" Number of invalid values: {len(invalid_indices)}")

        df = enforce_column_type(df, column, column_type, invalid_indices)

        # count() skips missing entries, so this is the non-null share.
        non_null = df[column].count()
        valid_percentage = (non_null / len(df)) * 100
        print(f" Percentage of valid values after type enforcement: {valid_percentage:.2f}%")

    return df