|
from typing import List, Optional |
|
from cog import BasePredictor, Input |
|
from transformers import LLaMAForCausalLM, LLaMATokenizer |
|
import torch |
|
|
|
from train import PROMPT_DICT |
|
PROMPT = PROMPT_DICT['prompt_no_input'] |
|
|
|
CACHE_DIR = 'alpaca_out' |
|
|
|
class Predictor(BasePredictor): |
|
def setup(self): |
|
self.device = 'cuda' if torch.cuda.is_available() else 'cpu' |
|
self.model = LLaMAForCausalLM.from_pretrained("alpaca_out", cache_dir=CACHE_DIR, local_files_only=True) |
|
self.model = self.model |
|
self.model.to(self.device) |
|
self.tokenizer = LLaMATokenizer.from_pretrained("alpaca_out", cache_dir=CACHE_DIR, local_files_only=True) |
|
|
|
def predict( |
|
self, |
|
prompt: str = Input(description=f"Prompt to send to LLaMA."), |
|
n: int = Input(description="Number of output sequences to generate", default=1, ge=1, le=5), |
|
total_tokens: int = Input( |
|
description="Maximum number of tokens for input + generation. A word is generally 2-3 tokens", |
|
ge=1, |
|
default=2000 |
|
), |
|
temperature: float = Input( |
|
description="Adjusts randomness of outputs, greater than 1 is random and 0 is deterministic, 0.75 is a good starting value.", |
|
ge=0.01, |
|
le=5, |
|
default=0.75, |
|
), |
|
top_p: float = Input( |
|
description="When decoding text, samples from the top p percentage of most likely tokens; lower to ignore less likely tokens", |
|
ge=0.01, |
|
le=1.0, |
|
default=1.0 |
|
), |
|
repetition_penalty: float = Input( |
|
description="Penalty for repeated words in generated text; 1 is no penalty, values greater than 1 discourage repetition, less than 1 encourage it.", |
|
ge=0.01, |
|
le=5, |
|
default=1 |
|
) |
|
) -> List[str]: |
|
format_prompt = PROMPT.format_map({'instruction': prompt}) |
|
input = self.tokenizer(format_prompt, return_tensors="pt").input_ids.to(self.device) |
|
|
|
outputs = self.model.generate( |
|
input, |
|
num_return_sequences=n, |
|
max_length=total_tokens, |
|
do_sample=True, |
|
temperature=temperature, |
|
top_p=top_p, |
|
repetition_penalty=repetition_penalty |
|
) |
|
out = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) |
|
|
|
out = [val.split('Response:')[1] for val in out] |
|
return out |
|
|