import sys

import pandas as pd
import torch
from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer

import config
from mlm_generate_utils import (
    calculate_cosine_sim,
    calculate_hamming_dist,
    calculate_perplexity,
    generate_scaffold,
)


def motif_benchmarking():
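    """Benchmark scaffold generation by the fine-tuned MLM on the membrane test set.

    For each held-out sequence, a scaffold is generated under both the
    'uppercase' and 'lowercase' cases, then scored by perplexity, ESM-2
    embedding cosine similarity, and Hamming distance to the original.
    """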
    path = "/workspace/sg666/MDpLM"

    # Held-out membrane-protein test sequences
    test_sequences = pd.read_csv(path + "/data/membrane/test.csv")['Sequence'].tolist()

    # Fine-tuned MLM checkpoint for scaffold generation, plus ESM-2 for
    # embedding-based cosine similarity
    tokenizer = AutoTokenizer.from_pretrained(config.CKPT_DIR + "/best_model_epoch")
    mlm_model = AutoModelForMaskedLM.from_pretrained(config.CKPT_DIR + "/best_model_epoch")
    esm_model = AutoModel.from_pretrained("facebook/esm2_t36_3B_UR50D")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mlm_model.to(device)
    esm_model.to(device)

    # Inference only: switch off dropout for deterministic scoring
    mlm_model.eval()
    esm_model.eval()

    # Run the benchmark for both generation cases
    for generate_case in ['uppercase', 'lowercase']:
        case_results = []
        for original_sequence in test_sequences:
            # Regenerate the masked region, then score the result against
            # the original sequence
            generated_sequence, mask_token_idx = generate_scaffold(original_sequence, generate_case, tokenizer, mlm_model)
            perplexity = calculate_perplexity(mlm_model, tokenizer, generated_sequence, mask_token_idx)
            cos_sim = calculate_cosine_sim(original_sequence, generated_sequence, tokenizer, esm_model, device)
            hamming_distance = calculate_hamming_dist(original_sequence, generated_sequence)

            case_results.append([original_sequence, generated_sequence, perplexity, cos_sim, hamming_distance])

            # Print only the latest result as a progress log (the original
            # re-printed the entire accumulated list every iteration)
            print(case_results[-1])
            sys.stdout.flush()

        # Persist one results table per generation case
        df = pd.DataFrame(case_results, columns=['Original Sequence', 'Generated Sequence', 'Perplexity', 'Cosine Similarity', 'Hamming Distance'])
        df.to_csv(path + f'/benchmarks/MLM/mlm_{generate_case}_results.csv', index=False)


if __name__ == "__main__":
    motif_benchmarking()