|
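"""InstructScore: explainable evaluation of generated text.

Prompts a fine-tuned LLaMA model (xu1998hz/InstructScore) to list the
errors in a candidate translation, then converts the listed major/minor
errors into a single numeric score.
"""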
from typing import Dict

import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer
|
|
# Special-token strings used when patching the tokenizer; they must match
# the tokens the checkpoint was fine-tuned with (note that bos/unk reuse
# "</s>" here).
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"

# Token budgets for the prompt (source) and the generated diagnosis (target).
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 512
print("Max source length: ", MAX_SOURCE_LENGTH)
print("Max target length: ", MAX_TARGET_LENGTH)
|
|
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
):
    """Register special tokens on the tokenizer.

    Note: despite the name, this unoptimized version only updates the
    tokenizer; if the new tokens grow the vocabulary, the caller must
    resize the model's embedding matrix, whose size may then not be
    divisible by 64.
    """
    # Add the caller-supplied tokens (e.g. the pad token) together with the
    # eos/bos/unk defaults in a single pass.
    tokenizer.add_special_tokens(
        {
            **special_tokens_dict,
            "eos_token": DEFAULT_EOS_TOKEN,
            "bos_token": DEFAULT_BOS_TOKEN,
            "unk_token": DEFAULT_UNK_TOKEN,
        }
    )
|
|
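# Use the GPU when one is available, otherwise fall back to the CPU.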
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
class InstructScore:
    """Scores model outputs against references with the InstructScore model."""

    def __init__(self):
        # Slow (SentencePiece) tokenizer; pad on the left so every prompt in
        # a batch ends at the same position when generation starts.
        self.tokenizer = LlamaTokenizer.from_pretrained(
            "xu1998hz/InstructScore", model_max_length=MAX_SOURCE_LENGTH, use_fast=False
        )
        self.tokenizer.padding_side = "left"

        # Llama has no pad token by default; register one before batching.
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=self.tokenizer,
        )
        self.model = LlamaForCausalLM.from_pretrained("xu1998hz/InstructScore").to(
            device
        )
        self.model.eval()
|
    def score(self, ref_ls, out_ls):
        """Return raw diagnoses and numeric scores for reference/output pairs."""
        # One diagnostic prompt per (reference, output) pair; the wording
        # matches the template the model was fine-tuned on.
        prompt_ls = [
            'You are evaluating Chinese-to-English Machine translation task. '
            f'The correct translation is "{ref}". '
            f'The model generated translation is "{out}". '
            'Please identify all errors within each model output, up to a '
            'maximum of five. For each error, please give me the corresponding '
            'error type, major/minor label, error location of the model '
            'generated translation and explanation for the error. Major errors '
            'can confuse or mislead the reader due to significant change in '
            "meaning, while minor errors don't lead to loss of meaning but "
            'will be noticed.'
            for ref, out in zip(ref_ls, out_ls)
        ]
|
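        # Tokenize the prompts, generate diagnoses, and decode them with
        # gradients disabled.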
        with torch.no_grad():
            inputs = self.tokenizer(
                prompt_ls,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_SOURCE_LENGTH,
            )
            outputs = self.model.generate(
                inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                max_new_tokens=MAX_TARGET_LENGTH,
            )
            batch_outputs = self.tokenizer.batch_decode(
                outputs,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
|
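            # Each minor error costs 1 point and each major error costs 5, so
            # scores are non-positive (0 means no errors were reported).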
            scores_ls = [
                (-1) * output.count("Major/minor: Minor")
                + (-5) * output.count("Major/minor: Major")
                for output in batch_outputs
            ]
            return batch_outputs, scores_ls
|
|
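# Minimal usage example: score two candidate translations against references.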
def main():
    refs = [
        "SEScore is a simple but effective next generation text generation evaluation metric",
        "SEScore it really works",
    ]
    outs = [
        "SEScore is a simple effective text evaluation metric for next generation",
        "SEScore is not working",
    ]

    scorer = InstructScore()
    batch_outputs, scores_ls = scorer.score(refs, outs)
    print(batch_outputs)
    print(scores_ls)
|
|
if __name__ == "__main__":
    main()