import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from tqdm import tqdm
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import os
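
# Cell-membrane localization benchmark (DeepLoc 2.0 split): for each embedding type
# (ESM-2, fine-tuned MLM, MDLM), per-residue embeddings are scored by a small MLP head,
# mean-pooled over the sequence, and trained with binary cross-entropy.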

path = "/workspace/sg666/MDpLM"

hyperparams = {
    "batch_size": 1,
    "learning_rate": 5e-4,
    "num_epochs": 5,
    "esm_model_path": "facebook/esm2_t33_650M_UR50D",
    "mlm_model_path": path + "/benchmarks/MLM/model_ckpts/best_model_epoch",
    "mdlm_model_path": path + "/checkpoints/membrane_automodel/epochs30_lr3e-4_bsz16_gradclip1_beta-one0.9_beta-two0.999_bf16_all-params",
    "train_data": path + "/benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_train-val.csv",
    "test_data": path + "/benchmarks/Supervised/Localization/true_deeploc2.0_cell-local_test.csv",
}


def load_models(esm_model_path, mlm_model_path, mdlm_model_path, device):
    # Load the shared ESM-2 tokenizer and move all three encoders to the target device
    esm_tokenizer = AutoTokenizer.from_pretrained(esm_model_path)
    esm_model = AutoModel.from_pretrained(esm_model_path).to(device)
    mlm_model = AutoModel.from_pretrained(mlm_model_path).to(device)
    mdlm_model = AutoModel.from_pretrained(mdlm_model_path).to(device)

    return esm_tokenizer, esm_model, mlm_model, mdlm_model
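
# get_latents (below) returns token-level hidden states for a single sequence.
# The ESM branch passes the full tokenizer output, while the MLM/MDLM branches
# forward only the input_ids; all three run under torch.no_grad().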

def get_latents(embedding_type, tokenizer, esm_model, mlm_model, mdlm_model, sequence, device):
    if embedding_type == "esm":
        inputs = tokenizer(sequence, return_tensors='pt').to(device)
        with torch.no_grad():
            embeddings = esm_model(**inputs).last_hidden_state.squeeze(0)

    elif embedding_type == "mlm":
        inputs = tokenizer(sequence, return_tensors='pt')['input_ids'].to(device)
        with torch.no_grad():
            embeddings = mlm_model(inputs).last_hidden_state.squeeze(0)

    elif embedding_type == "mdlm":
        inputs = tokenizer(sequence, return_tensors='pt')['input_ids'].to(device)
        with torch.no_grad():
            embeddings = mdlm_model(inputs).last_hidden_state.squeeze(0)

    else:
        raise ValueError(f"Unknown embedding_type: {embedding_type}")

    return embeddings


class LocalizationDataset(Dataset):
    def __init__(self, embedding_type, csv_file, esm_model_path, mlm_model_path, mdlm_model_path, device):
        self.data = pd.read_csv(csv_file)
        # Keep only sequences that fit within the ESM-2 context window
        self.data = self.data[self.data['Sequence'].apply(len) < 1024].reset_index(drop=True)
        self.embedding_type = embedding_type
        self.tokenizer, self.esm_model, self.mlm_model, self.mdlm_model = load_models(
            esm_model_path, mlm_model_path, mdlm_model_path, device)
        self.device = device

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence = self.data.iloc[idx]['Sequence']
        # Argument order matches get_latents' signature: esm_model before mlm_model
        embeddings = get_latents(self.embedding_type, self.tokenizer, self.esm_model, self.mlm_model,
                                 self.mdlm_model, sequence, self.device)

        # Binary target: 1 if the protein is annotated as cell-membrane localized, else 0
        label = 0 if self.data.iloc[idx]['Cell membrane'] == 0 else 1
        labels = torch.tensor([label], dtype=torch.float32)

        return embeddings, labels
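
# Classifier head: a per-residue Linear -> ReLU -> Linear scorer whose outputs are
# mean-pooled across residues and squashed with a sigmoid into a membrane probability.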

class LocalizationPredictor(nn.Module):
    def __init__(self, input_dim):
        super(LocalizationPredictor, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, 640),
            nn.ReLU(),
            nn.Linear(640, 1)
        )

    def forward(self, embeddings):
        # Score each residue, then mean-pool the logits over the sequence dimension
        logits = self.classifier(embeddings)
        logits = torch.mean(logits, dim=1)
        # Sigmoid (not softmax) for a single-logit binary output; pairs with nn.BCELoss
        probs = torch.sigmoid(logits)
        return probs


def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for embeddings, labels in tqdm(dataloader):
        embeddings, labels = embeddings.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)
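
# Evaluation collects raw per-batch probabilities and labels; thresholding and
# metric computation happen afterwards in calculate_metrics.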

def evaluate(model, dataloader, device):
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for embeddings, labels in tqdm(dataloader):
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = model(embeddings)
            preds.append(outputs.cpu().numpy())
            true_labels.append(labels.cpu().numpy())
    return preds, true_labels


def calculate_metrics(preds, labels, threshold=0.5):
    all_metrics = []
    for pred, label in zip(preds, labels):
        pred = (pred > threshold).astype(int)

        accuracy = accuracy_score(label, pred)
        precision = precision_score(label, pred, average='macro')
        recall = recall_score(label, pred, average='macro')
        f1_macro = f1_score(label, pred, average='macro')
        f1_micro = f1_score(label, pred, average='micro')

        all_metrics.append([accuracy, precision, recall, f1_macro, f1_micro])

    avg_metrics = np.mean(all_metrics, axis=0)
    print(avg_metrics)
    return avg_metrics
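
# Main loop: each embedding type gets its own classifier head, checkpoint directory,
# and test-results file under the shared hyperparameter settings above.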

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for embedding_type in ['mdlm', 'esm', 'mlm']:

        train_dataset = LocalizationDataset(embedding_type,
                                            hyperparams['train_data'],
                                            hyperparams['esm_model_path'],
                                            hyperparams['mlm_model_path'],
                                            hyperparams['mdlm_model_path'],
                                            device)
        test_dataset = LocalizationDataset(embedding_type,
                                           hyperparams['test_data'],
                                           hyperparams['esm_model_path'],
                                           hyperparams['mlm_model_path'],
                                           hyperparams['mdlm_model_path'],
                                           device)

        train_dataloader = DataLoader(train_dataset, batch_size=hyperparams["batch_size"], shuffle=True)
        test_dataloader = DataLoader(test_dataset, batch_size=hyperparams["batch_size"], shuffle=False)

        input_dim = 640 if embedding_type == "mdlm" else 1280
        model = LocalizationPredictor(input_dim=input_dim).to(device)
        optimizer = optim.Adam(model.parameters(), lr=hyperparams["learning_rate"])
        criterion = nn.BCELoss()

        base_checkpoint_dir = f"{path}/benchmarks/Supervised/Localization/model_checkpoints/{embedding_type}"
        hyperparam_str = f"batch_{hyperparams['batch_size']}_lr_{hyperparams['learning_rate']}_epochs_{hyperparams['num_epochs']}"
        model_checkpoint_dir = os.path.join(base_checkpoint_dir, hyperparam_str)
        os.makedirs(model_checkpoint_dir, exist_ok=True)

        for epoch in range(hyperparams["num_epochs"]):
            train_loss = train(model, train_dataloader, optimizer, criterion, device)
            print(f"EPOCH {epoch+1}/{hyperparams['num_epochs']}")
            print(f"TRAIN LOSS: {train_loss:.4f}")
            print("\n")

            checkpoint_path = os.path.join(model_checkpoint_dir, f"epoch{epoch + 1}.pth")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': train_loss,
            }, checkpoint_path)
            print(f"Checkpoint saved at {checkpoint_path}\n")

            if epoch == 0:
                hyperparams_file = os.path.join(model_checkpoint_dir, "hyperparams.txt")
                with open(hyperparams_file, 'w') as f:
                    for key, value in hyperparams.items():
                        f.write(f"{key}: {value}\n")
                print(f"Hyperparameters saved at {hyperparams_file}\n")

        print("Test set")
        test_preds, test_labels = evaluate(model, test_dataloader, device)
        test_metrics = calculate_metrics(test_preds, test_labels)
        print("TEST METRICS:")
        print(f"Accuracy: {test_metrics[0]:.4f}")
        print(f"Precision: {test_metrics[1]:.4f}")
        print(f"Recall: {test_metrics[2]:.4f}")
        print(f"F1 Macro Score: {test_metrics[3]:.4f}")
        print(f"F1 Micro Score: {test_metrics[4]:.4f}")

        test_results_file = os.path.join(model_checkpoint_dir, "test_results.txt")
        with open(test_results_file, 'w') as f:
            f.write("TEST METRICS:\n")
            f.write(f"Accuracy: {test_metrics[0]:.4f}\n")
            f.write(f"Precision: {test_metrics[1]:.4f}\n")
            f.write(f"Recall: {test_metrics[2]:.4f}\n")
            f.write(f"F1 Macro Score: {test_metrics[3]:.4f}\n")
            f.write(f"F1 Micro Score: {test_metrics[4]:.4f}\n")
        print(f"Test results saved at {test_results_file}\n")