ProGen2-xlarge

HF mirror of ProGen2-xlarge for protein engineering

Official GitHub of ProGen2 by Nijkamp et al.

The ProGen2 suite of protein language models is scaled up to 6.4B parameters.
Models with increased scale better capture the distribution of protein sequences.
ProGen2 models generate novel protein sequences adopting natural folds.
ProGen2 model likelihoods are effective for zero-shot fitness prediction.

import torch
from faesm.progen2 import ProGenForCausalLM
from transformers import AutoTokenizer
# Sanity check: score one reference sequence with the checkpoint and compare
# the mean next-token cross-entropy against the expected value.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = ProGenForCausalLM.from_pretrained("jinyuan22/ProGen2-xlarge").to(torch.float16).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained("jinyuan22/ProGen2-xlarge")

# Alternative input (1002 tokens), kept for reference:
# sequence = "1" + "ACDEFGHIKLMNPQRSTVWY" * 50 + "2"

sequence = "2GFLPFRGADEGLAAREAATLAARGTAARAYREDSWAVPVPRGLLGDLTARVAALGAASPPPADPLAVTLDLHHVTAEVALTTVLDAATLVHGQTRVLSAEDAAEAATAAAAATEAYLERLQDFVLFMSASVRVWRRGNAAGATGPEWDQWYTVADRDALGSAPTHLAVLGRQADALCHFVLDRVAWGTCGTPLWSGDEDLGNVVATFAGYADRLATAPRDLIM1"

inputs = tokenizer(sequence, return_tensors="pt").to(device)

# Only the logits are needed here: the loss is recomputed below on a restricted
# vocabulary, so no `labels` are passed and the model skips its own loss.
with torch.no_grad():
    logits = model(inputs.input_ids).logits

# Align for next-token prediction: the logits at position i are scored against
# the token at position i + 1, so drop the last logit row and the first token.
logits = logits[0][:-1, ...]
target = inputs.input_ids[0, 1:]

# remove unused logits: keep only the vocabulary columns with ids
# first_token..last_token (inclusive) and shift the targets to match.
# NOTE(review): presumably these ids cover every token that can occur in
# `sequence` -- confirm against the tokenizer vocabulary.
first_token, last_token = 5, 29
logits = logits[:, first_token:(last_token + 1)]
target = target - first_token

# Compute the loss in float32: the model runs in float16, and log-softmax in
# half precision loses accuracy (the cast is a no-op if logits are already
# float32).
ce_eval = torch.nn.functional.cross_entropy(input=logits.float(), target=target, reduction="mean").item()
print(ce_eval)

# Reference cross-entropy for this checkpoint/sequence pair and the accepted
# absolute deviation.
expected_ce, tolerance = 1.0, 0.1
assert abs(ce_eval - expected_ce) < tolerance, (
    f"cross-entropy {ce_eval:.4f} deviates from expected {expected_ce} by {tolerance} or more"
)