import os
import pickle
from typing import Dict

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.nn import CrossEntropyLoss
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Cap OpenMP threads to avoid oversubscription during clustering and tokenization.
os.environ['OMP_NUM_THREADS'] = '7'


class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs: bool = False, num_items_in_batch: int = None):
        """Compute cross-entropy loss, weighting each example by its 'weight' column."""
        labels = inputs.get("labels")
        weights = inputs.get("weight")

        # Forward pass without the extra columns the model does not accept.
        outputs = model(**{k: v for k, v in inputs.items()
                           if k not in ["weight", "labels"]})
        logits = outputs.get("logits")

        outputs["labels"] = labels

        if weights is not None:
            weights = weights.to(logits.device)
            loss_fct = CrossEntropyLoss(reduction='none')
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.view(-1))

            # Keep the per-example losses and weights the same length if the
            # batch was truncated.
            if num_items_in_batch:
                weights = weights[:num_items_in_batch]
                loss = loss[:num_items_in_batch]

            loss = (loss * weights.view(-1)).mean()
        else:
            # Fall back to an unweighted loss with light label smoothing.
            loss_fct = CrossEntropyLoss(label_smoothing=0.1)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.view(-1))

        outputs["loss"] = loss
        return (loss, outputs) if return_outputs else loss
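
# Worked example of the weighted reduction above (hypothetical numbers):
#   per-example losses [0.70, 0.20, 1.10] with weights [0.50, 0.30, 0.20]
#   give mean([0.35, 0.06, 0.22]) = 0.21, so high-weight rows dominate the loss.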


def create_feature_vector(df):
    """Create a numerical feature matrix for clustering, handling missing labels."""
    le_gender = LabelEncoder()
    le_race = LabelEncoder()
    le_risk = LabelEncoder()

    # Fill missing values before fitting so 'Unknown' is always a known class.
    gender_encoded = le_gender.fit_transform(df['Gender'].fillna('Unknown'))
    race_encoded = le_race.fit_transform(df['RaceEthnicity'].fillna('Unknown'))
    risk_encoded = le_risk.fit_transform(df['RiskFactor'].fillna('Unknown'))

    # Ordinal-encode age bands; unseen categories map to -1.
    age_map = {
        '12-17 years': 0,
        '18-39 years': 1,
        '40-64 years': 2,
        '65-79 years': 3,
        '80 years and older': 4
    }
    age_encoded = df['Age'].map(lambda x: age_map.get(x, -1))

    features = np.column_stack([
        age_encoded,
        gender_encoded,
        race_encoded,
        risk_encoded,
        df['Sample_Size'].values
    ])

    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    return features_scaled, scaler
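
# StandardScaler z-scores each column (x_scaled = (x - mean) / std), so the
# ordinal ages, label codes, and raw sample sizes contribute comparably to the
# Euclidean distances used by the clustering below.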


def weighted_kmeans(X, sample_weights, n_clusters, max_iter=300, random_state=42):
    """Custom K-means implementation whose centroid updates use sample weights."""
    n_samples = X.shape[0]

    # Initialize centroids by weighted sampling without replacement, so larger
    # strata are more likely to seed a cluster and no centroid is duplicated.
    rng = np.random.RandomState(random_state)
    weighted_indices = rng.choice(n_samples, size=n_clusters, replace=False,
                                  p=sample_weights / sample_weights.sum())
    centroids = X[weighted_indices]

    for _ in range(max_iter):
        # Assign each point to its nearest centroid.
        distances = np.sqrt(((X[:, np.newaxis] - centroids) ** 2).sum(axis=2))
        labels = np.argmin(distances, axis=1)

        # Recompute each centroid as the weighted mean of its members; an empty
        # cluster keeps its previous centroid instead of collapsing to zero.
        new_centroids = np.zeros_like(centroids)
        for k in range(n_clusters):
            mask = labels == k
            if mask.any():
                new_centroids[k] = np.average(X[mask], axis=0, weights=sample_weights[mask])
            else:
                new_centroids[k] = centroids[k]

        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    return labels, centroids
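
# Minimal usage sketch with synthetic data (illustrative only):
#   X = np.random.rand(100, 5)
#   w = np.random.rand(100)
#   labels, centroids = weighted_kmeans(X, w, n_clusters=3)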


def prepare_data(file_path='data/Vision_Survey_Cleaned.csv'):
    """Load and prepare the vision health dataset with sample-size-aware clustering."""
    print("\nLoading and preparing data...")
    df = pd.read_csv(file_path)

    # Keep only best-corrected visual acuity rows and drop aggregate categories.
    vision_cat = ['Best-corrected visual acuity']
    df = df[df['Question'].isin(vision_cat)].copy()
    df = df[df["RiskFactor"] != "All participants"]
    df = df[df["RiskFactorResponse"] != "Total"]

    df = df.reset_index(drop=True)

    features_scaled, scaler = create_feature_vector(df)

    # Normalize sample sizes into a probability vector for the clustering.
    sample_weights = df['Sample_Size'].values
    sample_weights = sample_weights / sample_weights.sum()

    n_clusters = min(5, len(df))
    clusters, centroids = weighted_kmeans(
        features_scaled,
        sample_weights,
        n_clusters=n_clusters
    )

    df['cluster'] = clusters

    # Weight each cluster by the share of all surveyed individuals it holds.
    cluster_total_samples = df.groupby('cluster')['Sample_Size'].sum()
    cluster_weights = cluster_total_samples / cluster_total_samples.sum()

    # Render each row as a short text document for the sequence classifier.
    df['doc'] = df.apply(
        lambda x: f"""
Patient Demographics:
- Age Category: {x['Age']}
- Gender: {x['Gender']}
- Race/Ethnicity: {x['RaceEthnicity']}

Risk Factors:
- {x['RiskFactor']}: {x['RiskFactorResponse']}

Additional Information:
- Sample Size: {x['Sample_Size']}
- Cluster Profile: {x['cluster']} (Weight: {cluster_weights.get(x['cluster'], 0):.3f})
""".strip(),
        axis=1
    )

    le = LabelEncoder()
    df['labels'] = le.fit_transform(df['Response'].astype(str))

    # Composite per-row weight: dataset share of individuals times cluster weight.
    df['weight'] = df.apply(
        lambda x: (x['Sample_Size'] / df['Sample_Size'].sum()) *
                  cluster_weights.get(x['cluster'], 0),
        axis=1
    )
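
    # Worked example (hypothetical numbers): a row covering 2,000 of 100,000
    # surveyed individuals (share 0.02) in a cluster holding 30% of all
    # individuals gets weight 0.02 * 0.30 = 0.006.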

    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        stratify=df['labels'],
        random_state=42
    )

    train_data = {
        'doc': train_df['doc'].tolist(),
        'labels': train_df['labels'].tolist(),
        'weight': train_df['weight'].tolist()
    }

    test_data = {
        'doc': test_df['doc'].tolist(),
        'labels': test_df['labels'].tolist(),
        'weight': test_df['weight'].tolist()
    }

    train_dataset = Dataset.from_dict(train_data)
    test_dataset = Dataset.from_dict(test_data)

    dataset_dict = {
        'train': train_dataset,
        'test': test_dataset
    }

    print("\nDataset Summary:")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Test samples: {len(test_dataset)}")

    print("\nCluster Distribution:")
    for i in range(n_clusters):
        cluster_mask = df['cluster'] == i
        cluster_samples = df[cluster_mask]['Sample_Size'].sum()
        print(f"\nCluster {i} (Total samples: {cluster_samples:,}, Weight: {cluster_weights.get(i, 0):.3f}):")
        print("Most common characteristics:")
        for col in ['Age', 'Gender', 'RaceEthnicity', 'RiskFactor']:
            values = df[col][cluster_mask].value_counts().head(3)
            samples = df[cluster_mask].groupby(col)['Sample_Size'].sum().sort_values(ascending=False).head(3)
            print(f"{col}:")
            for val, count in values.items():
                sample_count = samples.get(val, 0)
                print(f"  - {val}: {count} groups ({sample_count:,} individuals)")

    print("\nLabel Distribution:")
    for idx, label in enumerate(le.classes_):
        count = (df['labels'] == idx).sum()
        total_size = df[df['labels'] == idx]['Sample_Size'].sum()
        print(f"{label}: {count} groups, {total_size:,} individuals")

    return dataset_dict, le


def main():
    output_dir = "models/vision-classifier"
    os.makedirs(output_dir, exist_ok=True)

    dataset_dict, label_encoder = prepare_data()

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_function(examples):
        """Tokenize the input texts while keeping the label and weight columns."""
        tokenized = tokenizer(
            examples["doc"],
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors=None
        )
        tokenized['labels'] = examples['labels']
        tokenized['weight'] = examples['weight']
        return tokenized

    tokenized_datasets = {}
    for split, dataset in dataset_dict.items():
        tokenized_datasets[split] = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['doc']
        )

    print("\nSample tokenized data:", tokenized_datasets["train"][0])

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(label_encoder.classes_),
        id2label={i: label for i, label in enumerate(label_encoder.classes_)},
        label2id={label: i for i, label in enumerate(label_encoder.classes_)},
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nTraining on device: {device}")

    model.to(device)
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=7,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # Keep the extra 'weight' column so WeightedTrainer can see it.
        remove_unused_columns=False,
        push_to_hub=False,  # set True only after authenticating with the Hub
    )
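
    # Note: load_best_model_at_end requires eval_strategy and save_strategy to
    # match, as they do here (both "epoch").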

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
    )

    print("\nStarting training...")
    trainer.train()

    print("\nSaving model...")
    trainer.save_model(output_dir=os.path.join(output_dir, "model"))
    tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))

    # Persist the label encoder so predictions can be decoded later.
    label_encoder_path = os.path.join(output_dir, "label_encoder.pkl")
    with open(label_encoder_path, 'wb') as f:
        pickle.dump(label_encoder, f)

    return trainer, model, tokenizer, label_encoder


def evaluate_model(model, eval_dataset, tokenizer, label_encoder, device) -> Dict:
    """Evaluate model performance using multiple metrics."""
    model.eval()
    all_predictions = []
    all_labels = []

    for item in eval_dataset:
        inputs = tokenizer(
            item['doc'],
            truncation=True,
            padding=True,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.append(item['labels'])

    accuracy = accuracy_score(all_labels, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        all_predictions,
        average='weighted',
        zero_division=0
    )

    # Force metrics for every known class so the zip with classes_ below lines
    # up even when some classes are absent from the test split.
    class_ids = np.arange(len(label_encoder.classes_))
    per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(
        all_labels,
        all_predictions,
        labels=class_ids,
        average=None,
        zero_division=0
    )

    conf_matrix = confusion_matrix(all_labels, all_predictions, labels=class_ids)

    metrics = {
        'accuracy': accuracy,
        'weighted_precision': precision,
        'weighted_recall': recall,
        'weighted_f1': f1,
        'confusion_matrix': conf_matrix,
        'per_class_metrics': {
            label: {
                'precision': p,
                'recall': r,
                'f1': f
            } for label, p, r, f in zip(
                label_encoder.classes_,
                per_class_precision,
                per_class_recall,
                per_class_f1
            )
        }
    }

    return metrics
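
# Note: sklearn's 'weighted' averages are support-weighted means over classes,
# e.g. weighted_f1 = sum_c (support_c / N) * f1_c.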


def print_evaluation_report(metrics: Dict, label_encoder):
    """Print a formatted evaluation report."""
    print("\n" + "=" * 50)
    print("MODEL EVALUATION REPORT")
    print("=" * 50)

    print("\nOverall Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Weighted Precision: {metrics['weighted_precision']:.4f}")
    print(f"Weighted Recall: {metrics['weighted_recall']:.4f}")
    print(f"Weighted F1-Score: {metrics['weighted_f1']:.4f}")

    print("\nPer-Class Metrics:")
    print("-" * 50)
    print(f"{'Class':<30} {'Precision':>10} {'Recall':>10} {'F1-Score':>10}")
    print("-" * 50)

    for label, class_metrics in metrics['per_class_metrics'].items():
        print(f"{label:<30} {class_metrics['precision']:>10.4f} "
              f"{class_metrics['recall']:>10.4f} {class_metrics['f1']:>10.4f}")

    print("\nConfusion Matrix:")
    print("-" * 50)
    print(metrics['confusion_matrix'])


if __name__ == "__main__":
    output_dir = "models/vision-classifier"
    model_path = os.path.join(output_dir, "model")
    tokenizer_path = os.path.join(output_dir, "tokenizer")

    if os.path.exists(model_path):
        print("\nLoading pre-trained model...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
            model = AutoModelForSequenceClassification.from_pretrained(model_path)
            label_encoder_path = os.path.join(output_dir, "label_encoder.pkl")
            if os.path.exists(label_encoder_path):
                with open(label_encoder_path, 'rb') as f:
                    label_encoder = pickle.load(f)
            else:
                print("Warning: Label encoder not found. Running full training...")
                trainer, model, tokenizer, label_encoder = main()

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model.to(device)
            print(f"Model loaded successfully and moved to {device}")

            dataset_dict, _ = prepare_data()

            print("\nEvaluating model performance...")
            eval_metrics = evaluate_model(
                model,
                dataset_dict['test'],
                tokenizer,
                label_encoder,
                device
            )

            print_evaluation_report(eval_metrics, label_encoder)

        except Exception as e:
            print(f"Error loading model: {e}")
            print("Running full training instead...")
            trainer, model, tokenizer, label_encoder = main()
    else:
        print("\nNo pre-trained model found. Running training...")
        trainer, model, tokenizer, label_encoder = main()

    def predict_vision_status(text, model, tokenizer, label_encoder):
        """Make a prediction with the loaded/trained model."""
        inputs = tokenizer(
            text,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Run inference on whatever device the model lives on.
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        probabilities = probabilities.cpu().numpy()[0]

        # Pair each class label with its probability, highest first.
        predictions = []
        for idx, prob in enumerate(probabilities):
            label = label_encoder.inverse_transform([idx])[0]
            predictions.append((label, float(prob)))

        predictions.sort(key=lambda x: x[1], reverse=True)

        return predictions

    example_text = "Age: 40-64 years, Gender: Female, Race: White, non-Hispanic, Diabetes: No"
    predictions = predict_vision_status(example_text, model, tokenizer, label_encoder)

    print(f"\nPredictions for: {example_text}")
    print("\nLabel Confidence Scores:")
    print("-" * 50)
    for label, confidence in predictions:
        print(f"{label:<30} {confidence:.2%}")