import os
import pickle
from typing import Dict

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.nn import CrossEntropyLoss
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Cap OpenMP threads to avoid oversubscription during clustering and tokenization.
os.environ['OMP_NUM_THREADS'] = '7'


class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs: bool = False, num_items_in_batch: int = None):
        """Compute cross-entropy loss, weighting each example by its 'weight' column."""
        labels = inputs.get("labels")
        weights = inputs.get("weight")

        # Forward pass without the extra columns the model does not accept.
        outputs = model(**{k: v for k, v in inputs.items()
                           if k not in ["weight", "labels"]})
        logits = outputs.get("logits")

        outputs["labels"] = labels

        if weights is not None:
            weights = weights.to(logits.device)
            loss_fct = CrossEntropyLoss(reduction='none')
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.view(-1))

            # Keep the per-example losses and weights the same length if the
            # batch was truncated.
            if num_items_in_batch:
                weights = weights[:num_items_in_batch]
                loss = loss[:num_items_in_batch]

            loss = (loss * weights.view(-1)).mean()
        else:
            # Fall back to an unweighted loss with light label smoothing.
            loss_fct = CrossEntropyLoss(label_smoothing=0.1)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.view(-1))

        outputs["loss"] = loss
        return (loss, outputs) if return_outputs else loss
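
# Worked example of the weighted reduction above (hypothetical numbers):
#   per-example losses [0.70, 0.20, 1.10] with weights [0.50, 0.30, 0.20]
#   give mean([0.35, 0.06, 0.22]) = 0.21, so high-weight rows dominate the loss.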


def create_feature_vector(df):
    """Create a numerical feature matrix for clustering, handling missing labels."""
    le_gender = LabelEncoder()
    le_race = LabelEncoder()
    le_risk = LabelEncoder()

    # Fill missing values before fitting so 'Unknown' is always a known class.
    gender_encoded = le_gender.fit_transform(df['Gender'].fillna('Unknown'))
    race_encoded = le_race.fit_transform(df['RaceEthnicity'].fillna('Unknown'))
    risk_encoded = le_risk.fit_transform(df['RiskFactor'].fillna('Unknown'))

    # Ordinal-encode age bands; unseen categories map to -1.
    age_map = {
        '12-17 years': 0,
        '18-39 years': 1,
        '40-64 years': 2,
        '65-79 years': 3,
        '80 years and older': 4
    }
    age_encoded = df['Age'].map(lambda x: age_map.get(x, -1))

    features = np.column_stack([
        age_encoded,
        gender_encoded,
        race_encoded,
        risk_encoded,
        df['Sample_Size'].values
    ])

    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    return features_scaled, scaler
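
# StandardScaler z-scores each column (x_scaled = (x - mean) / std), so the
# ordinal ages, label codes, and raw sample sizes contribute comparably to the
# Euclidean distances used by the clustering below.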


def weighted_kmeans(X, sample_weights, n_clusters, max_iter=300, random_state=42):
    """Custom K-means implementation whose centroid updates use sample weights."""
    n_samples = X.shape[0]

    # Initialize centroids by weighted sampling without replacement, so larger
    # strata are more likely to seed a cluster and no centroid is duplicated.
    rng = np.random.RandomState(random_state)
    weighted_indices = rng.choice(n_samples, size=n_clusters, replace=False,
                                  p=sample_weights / sample_weights.sum())
    centroids = X[weighted_indices]

    for _ in range(max_iter):
        # Assign each point to its nearest centroid.
        distances = np.sqrt(((X[:, np.newaxis] - centroids) ** 2).sum(axis=2))
        labels = np.argmin(distances, axis=1)

        # Recompute each centroid as the weighted mean of its members; an empty
        # cluster keeps its previous centroid instead of collapsing to zero.
        new_centroids = np.zeros_like(centroids)
        for k in range(n_clusters):
            mask = labels == k
            if mask.any():
                new_centroids[k] = np.average(X[mask], axis=0, weights=sample_weights[mask])
            else:
                new_centroids[k] = centroids[k]

        if np.allclose(centroids, new_centroids):
            break

        centroids = new_centroids

    return labels, centroids
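
# Minimal usage sketch with synthetic data (illustrative only):
#   X = np.random.rand(100, 5)
#   w = np.random.rand(100)
#   labels, centroids = weighted_kmeans(X, w, n_clusters=3)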


def prepare_data(file_path='data/Vision_Survey_Cleaned.csv'):
    """Load and prepare the vision health dataset with sample-size-aware clustering."""
    print("\nLoading and preparing data...")
    df = pd.read_csv(file_path)

    # Keep only best-corrected visual acuity rows and drop aggregate categories.
    vision_cat = ['Best-corrected visual acuity']
    df = df[df['Question'].isin(vision_cat)].copy()
    df = df[df["RiskFactor"] != "All participants"]
    df = df[df["RiskFactorResponse"] != "Total"]

    df = df.reset_index(drop=True)

    features_scaled, scaler = create_feature_vector(df)

    # Normalize sample sizes into a probability vector for the clustering.
    sample_weights = df['Sample_Size'].values
    sample_weights = sample_weights / sample_weights.sum()

    n_clusters = min(5, len(df))
    clusters, centroids = weighted_kmeans(
        features_scaled,
        sample_weights,
        n_clusters=n_clusters
    )

    df['cluster'] = clusters

    # Weight each cluster by the share of all surveyed individuals it holds.
    cluster_total_samples = df.groupby('cluster')['Sample_Size'].sum()
    cluster_weights = cluster_total_samples / cluster_total_samples.sum()

    # Render each row as a short text document for the sequence classifier.
    df['doc'] = df.apply(
        lambda x: f"""
Patient Demographics:
- Age Category: {x['Age']}
- Gender: {x['Gender']}
- Race/Ethnicity: {x['RaceEthnicity']}

Risk Factors:
- {x['RiskFactor']}: {x['RiskFactorResponse']}

Additional Information:
- Sample Size: {x['Sample_Size']}
- Cluster Profile: {x['cluster']} (Weight: {cluster_weights.get(x['cluster'], 0):.3f})
""".strip(),
        axis=1
    )

    le = LabelEncoder()
    df['labels'] = le.fit_transform(df['Response'].astype(str))

    # Composite per-row weight: dataset share of individuals times cluster weight.
    df['weight'] = df.apply(
        lambda x: (x['Sample_Size'] / df['Sample_Size'].sum()) *
                  cluster_weights.get(x['cluster'], 0),
        axis=1
    )
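
    # Worked example (hypothetical numbers): a row covering 2,000 of 100,000
    # surveyed individuals (share 0.02) in a cluster holding 30% of all
    # individuals gets weight 0.02 * 0.30 = 0.006.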

    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        stratify=df['labels'],
        random_state=42
    )

    train_data = {
        'doc': train_df['doc'].tolist(),
        'labels': train_df['labels'].tolist(),
        'weight': train_df['weight'].tolist()
    }

    test_data = {
        'doc': test_df['doc'].tolist(),
        'labels': test_df['labels'].tolist(),
        'weight': test_df['weight'].tolist()
    }

    train_dataset = Dataset.from_dict(train_data)
    test_dataset = Dataset.from_dict(test_data)

    dataset_dict = {
        'train': train_dataset,
        'test': test_dataset
    }

    print("\nDataset Summary:")
    print(f"Training samples: {len(train_dataset)}")
    print(f"Test samples: {len(test_dataset)}")

    print("\nCluster Distribution:")
    for i in range(n_clusters):
        cluster_mask = df['cluster'] == i
        cluster_samples = df[cluster_mask]['Sample_Size'].sum()
        print(f"\nCluster {i} (Total samples: {cluster_samples:,}, Weight: {cluster_weights.get(i, 0):.3f}):")
        print("Most common characteristics:")
        for col in ['Age', 'Gender', 'RaceEthnicity', 'RiskFactor']:
            values = df[col][cluster_mask].value_counts().head(3)
            samples = df[cluster_mask].groupby(col)['Sample_Size'].sum().sort_values(ascending=False).head(3)
            print(f"{col}:")
            for val, count in values.items():
                sample_count = samples.get(val, 0)
                print(f"  - {val}: {count} groups ({sample_count:,} individuals)")

    print("\nLabel Distribution:")
    for idx, label in enumerate(le.classes_):
        count = (df['labels'] == idx).sum()
        total_size = df[df['labels'] == idx]['Sample_Size'].sum()
        print(f"{label}: {count} groups, {total_size:,} individuals")

    return dataset_dict, le


def main():
    output_dir = "models/vision-classifier"
    os.makedirs(output_dir, exist_ok=True)

    dataset_dict, label_encoder = prepare_data()

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def tokenize_function(examples):
        """Tokenize the input texts while keeping the label and weight columns."""
        tokenized = tokenizer(
            examples["doc"],
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors=None
        )
        tokenized['labels'] = examples['labels']
        tokenized['weight'] = examples['weight']
        return tokenized

    tokenized_datasets = {}
    for split, dataset in dataset_dict.items():
        tokenized_datasets[split] = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=['doc']
        )

    print("\nSample tokenized data:", tokenized_datasets["train"][0])

    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=len(label_encoder.classes_),
        id2label={i: label for i, label in enumerate(label_encoder.classes_)},
        label2id={label: i for i, label in enumerate(label_encoder.classes_)},
    )

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\nTraining on device: {device}")

    model.to(device)
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=3e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=7,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        # Keep the extra 'weight' column so WeightedTrainer can see it.
        remove_unused_columns=False,
        push_to_hub=False,  # set True only after authenticating with the Hub
    )
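
    # Note: load_best_model_at_end requires eval_strategy and save_strategy to
    # match, as they do here (both "epoch").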

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
    )

    print("\nStarting training...")
    trainer.train()

    print("\nSaving model...")
    trainer.save_model(output_dir=os.path.join(output_dir, "model"))
    tokenizer.save_pretrained(os.path.join(output_dir, "tokenizer"))

    # Persist the label encoder so predictions can be decoded later.
    label_encoder_path = os.path.join(output_dir, "label_encoder.pkl")
    with open(label_encoder_path, 'wb') as f:
        pickle.dump(label_encoder, f)

    return trainer, model, tokenizer, label_encoder


def evaluate_model(model, eval_dataset, tokenizer, label_encoder, device) -> Dict:
    """Evaluate model performance using multiple metrics."""
    model.eval()
    all_predictions = []
    all_labels = []

    for item in eval_dataset:
        inputs = tokenizer(
            item['doc'],
            truncation=True,
            padding=True,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.append(item['labels'])

    accuracy = accuracy_score(all_labels, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        all_predictions,
        average='weighted',
        zero_division=0
    )

    # Force metrics for every known class so the zip with classes_ below lines
    # up even when some classes are absent from the test split.
    class_ids = np.arange(len(label_encoder.classes_))
    per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(
        all_labels,
        all_predictions,
        labels=class_ids,
        average=None,
        zero_division=0
    )

    conf_matrix = confusion_matrix(all_labels, all_predictions, labels=class_ids)

    metrics = {
        'accuracy': accuracy,
        'weighted_precision': precision,
        'weighted_recall': recall,
        'weighted_f1': f1,
        'confusion_matrix': conf_matrix,
        'per_class_metrics': {
            label: {
                'precision': p,
                'recall': r,
                'f1': f
            } for label, p, r, f in zip(
                label_encoder.classes_,
                per_class_precision,
                per_class_recall,
                per_class_f1
            )
        }
    }

    return metrics
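
# Note: sklearn's 'weighted' averages are support-weighted means over classes,
# e.g. weighted_f1 = sum_c (support_c / N) * f1_c.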


def print_evaluation_report(metrics: Dict, label_encoder):
    """Print a formatted evaluation report."""
    print("\n" + "=" * 50)
    print("MODEL EVALUATION REPORT")
    print("=" * 50)

    print("\nOverall Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Weighted Precision: {metrics['weighted_precision']:.4f}")
    print(f"Weighted Recall: {metrics['weighted_recall']:.4f}")
    print(f"Weighted F1-Score: {metrics['weighted_f1']:.4f}")

    print("\nPer-Class Metrics:")
    print("-" * 50)
    print(f"{'Class':<30} {'Precision':>10} {'Recall':>10} {'F1-Score':>10}")
    print("-" * 50)

    for label, class_metrics in metrics['per_class_metrics'].items():
        print(f"{label:<30} {class_metrics['precision']:>10.4f} "
              f"{class_metrics['recall']:>10.4f} {class_metrics['f1']:>10.4f}")

    print("\nConfusion Matrix:")
    print("-" * 50)
    print(metrics['confusion_matrix'])


if __name__ == "__main__":
    output_dir = "models/vision-classifier"
    model_path = os.path.join(output_dir, "model")
    tokenizer_path = os.path.join(output_dir, "tokenizer")

    if os.path.exists(model_path):
        print("\nLoading pre-trained model...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
            model = AutoModelForSequenceClassification.from_pretrained(model_path)
            label_encoder_path = os.path.join(output_dir, "label_encoder.pkl")
            if os.path.exists(label_encoder_path):
                with open(label_encoder_path, 'rb') as f:
                    label_encoder = pickle.load(f)
            else:
                print("Warning: Label encoder not found. Running full training...")
                trainer, model, tokenizer, label_encoder = main()

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model.to(device)
            print(f"Model loaded successfully and moved to {device}")

            dataset_dict, _ = prepare_data()

            print("\nEvaluating model performance...")
            eval_metrics = evaluate_model(
                model,
                dataset_dict['test'],
                tokenizer,
                label_encoder,
                device
            )

            print_evaluation_report(eval_metrics, label_encoder)

        except Exception as e:
            print(f"Error loading model: {e}")
            print("Running full training instead...")
            trainer, model, tokenizer, label_encoder = main()
    else:
        print("\nNo pre-trained model found. Running training...")
        trainer, model, tokenizer, label_encoder = main()

    def predict_vision_status(text, model, tokenizer, label_encoder):
        """Make a prediction with the loaded/trained model."""
        inputs = tokenizer(
            text,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Run inference on whatever device the model lives on.
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
        probabilities = probabilities.cpu().numpy()[0]

        # Pair each class label with its probability, highest first.
        predictions = []
        for idx, prob in enumerate(probabilities):
            label = label_encoder.inverse_transform([idx])[0]
            predictions.append((label, float(prob)))

        predictions.sort(key=lambda x: x[1], reverse=True)

        return predictions

    example_text = "Age: 40-64 years, Gender: Female, Race: White, non-Hispanic, Diabetes: No"
    predictions = predict_vision_status(example_text, model, tokenizer, label_encoder)

    print(f"\nPredictions for: {example_text}")
    print("\nLabel Confidence Scores:")
    print("-" * 50)
    for label, confidence in predictions:
        print(f"{label:<30} {confidence:.2%}")