# Llama-3.2-1B / handler.py
# taylorj94's picture
# Update handler.py
# 492b234 verified
# raw
# history blame
# 3.88 kB
from typing import Any, List, Dict
from llama_cpp import Llama
import numpy as np
import torch
from transformers import AutoTokenizer, LogitsProcessorList
class EndpointHandler:
    """Inference handler that runs a GGUF Llama model through llama_cpp.

    A request may carry a ``vocab_list``; when it does, generation is
    restricted to the tokens that make up those words (in a few casing and
    spacing variants) by masking every other logit to ``-inf``.
    """

    # Repository and file the model weights are loaded from.
    MODEL_REPO_ID = "bartowski/Llama-3.3-70B-Instruct-GGUF"
    MODEL_FILENAME = "Llama-3.3-70B-Instruct-IQ4_XS.gguf"

    def __init__(self, path=""):
        """
        Initialize the model handler using llama_cpp.

        Args:
            path: Accepted for interface compatibility; this implementation
                does not read it and always loads ``MODEL_REPO_ID``.
        """
        self.model = Llama.from_pretrained(
            repo_id=self.MODEL_REPO_ID,
            filename=self.MODEL_FILENAME,
        )
        # NOTE(review): the tokenizer is loaded from the same GGUF repo id;
        # confirm that repo actually ships tokenizer files usable by
        # AutoTokenizer and that its ids match the GGUF vocabulary.
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_REPO_ID)

    def get_allowed_token_ids(self, vocab_list: List[str]) -> set[int]:
        """
        Generate a set of token IDs for a given list of allowed words.

        Each word is encoded in six forms: as given, capitalized and
        upper-cased, each with and without a leading space. Every token ID
        produced by any of those encodings is allowed.

        Args:
            vocab_list: Words the model is allowed to produce.

        Returns:
            The union of token IDs over all variations of all words.
        """
        allowed_ids: set[int] = set()
        for word in vocab_list:
            # A set drops duplicate variations (e.g. an already upper-case word).
            variations = {
                prefix + form
                for form in (word, word.capitalize(), word.upper())
                for prefix in ("", " ")
            }
            for variation in variations:
                allowed_ids.update(
                    self.tokenizer.encode(variation, add_special_tokens=False)
                )
        return allowed_ids

    def filter_allowed_tokens(self, input_ids: torch.Tensor, scores: np.ndarray, allowed_token_ids: set[int]) -> np.ndarray:
        """
        Modify scores to allow only tokens in the allowed_token_ids set.

        Handles both 1D ``(vocab,)`` and 2D ``(rows, vocab)`` score arrays;
        the token axis is always the last one.

        Args:
            input_ids: Unused; present so the method fits the
                ``(input_ids, scores)`` logits-processor call signature.
            scores: Logits to filter. Modified in place.
            allowed_token_ids: Token IDs that keep their score. IDs outside
                ``[0, vocab)`` are ignored.

        Returns:
            The same ``scores`` array, with every non-allowed entry set to
            ``-inf``.

        Raises:
            ValueError: If ``scores`` is neither 1D nor 2D.
        """
        if scores.ndim not in (1, 2):
            raise ValueError(f"Unsupported scores dimension: {scores.ndim}")

        vocab_size = scores.shape[-1]
        ids = np.fromiter(allowed_token_ids, dtype=np.int64)
        # Drop IDs that do not index into this vocabulary so they are
        # ignored instead of raising an IndexError (or wrapping around
        # for negative values).
        ids = ids[(ids >= 0) & (ids < vocab_size)]

        # Build the mask once; it is identical for every row.
        blocked = np.ones(vocab_size, dtype=bool)
        blocked[ids] = False

        # The ellipsis makes the same statement cover the 1D and 2D cases.
        scores[..., blocked] = float("-inf")
        return scores

    def __call__(self, data: Any) -> List[Dict[str, str]]:
        """
        Handle the request, performing inference with a restricted vocabulary.

        Args:
            data: Request mapping with the keys:
                ``inputs`` (required): the user message.
                ``parameters`` (optional): may contain ``max_length``
                    (default 30), ``temperature`` (default 1) and
                    ``repeat_penalty`` (default 1.0).
                ``vocab_list`` (optional): words the output is restricted to.

        Returns:
            A single-element list ``[{"generated_text": <model reply>}]``.

        Raises:
            ValueError: If ``inputs`` is missing or empty.
        """
        inputs = data.get("inputs", None)
        # An explicit ``"parameters": null`` in the request must behave like
        # an absent key rather than crash on ``None.get``.
        parameters = data.get("parameters") or {}
        vocab_list = data.get("vocab_list", None)

        if not inputs:
            raise ValueError("The 'inputs' field is required.")

        # Without a vocab_list, generation is unrestricted.
        logits_processors = None
        if vocab_list:
            allowed_token_ids = self.get_allowed_token_ids(vocab_list)
            # NOTE(review): the allowed set comes only from vocab_list, so
            # end-of-sequence tokens are masked as well; generation then
            # presumably always runs to max_tokens. Confirm this is intended.
            logits_processors = LogitsProcessorList([
                lambda input_ids, scores: self.filter_allowed_tokens(input_ids, scores, allowed_token_ids)
            ])

        response = self.model.create_chat_completion(
            messages=[
                {"role": "user", "content": inputs}
            ],
            max_tokens=parameters.get("max_length", 30),
            logits_processor=logits_processors,
            temperature=parameters.get("temperature", 1),
            repeat_penalty=parameters.get("repeat_penalty", 1.0),
        )

        generated_text = response["choices"][0]["message"]["content"]
        return [{"generated_text": generated_text}]