import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
from cog import BasePredictor, Input


class Predictor(BasePredictor):
    def setup(self):
        """Load the fine-tuned model and tokenizer into memory."""
        model_id = "pbevan11/llama-3-8b-ocr-correction"
        # peft does not export AutoModelForCausalLM; AutoPeftModelForCausalLM
        # loads the base Llama 3 weights and applies the adapter in one call.
        # device_map="auto" places the 8-bit weights on the available GPU.
        self.model = AutoPeftModelForCausalLM.from_pretrained(
            model_id, load_in_8bit=True, device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Llama 3 ships without a pad token, so reuse the EOS token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
    def predict(
        self,
        instruction: str = Input(description="Instruction for the model"),
        inp: str = Input(description="Input text to correct"),
    ) -> str:
        prompt = self.create_prompt(instruction, inp)
        input_ids = self.tokenizer(
            prompt, return_tensors="pt", truncation=True
        ).input_ids.cuda()
        # Greedy decoding (do_sample=False) keeps corrections deterministic.
        out_ids = self.model.generate(
            input_ids=input_ids, max_new_tokens=5000, do_sample=False
        )
        full_output = self.tokenizer.batch_decode(
            out_ids.detach().cpu().numpy(), skip_special_tokens=True
        )[0]
        # The decoded text echoes the prompt, so return only what follows the
        # "### Response:" marker, falling back to slicing off the prompt.
        response_start = full_output.find("### Response:")
        if response_start != -1:
            return full_output[response_start + len("### Response:"):].strip()
        else:
            return full_output[len(prompt):]

    def create_prompt(self, instruction, inp):
        # Alpaca-style prompt template the model was fine-tuned on.
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{inp}

### Response:
"""