from typing import Any, Dict, List

import numpy as np
from llama_cpp import Llama, LogitsProcessorList
from transformers import AutoTokenizer


class EndpointHandler:
    def __init__(self, path=""):
        """Initialize the model handler using llama_cpp."""
        self.model = Llama.from_pretrained(
            repo_id="bartowski/Llama-3.3-70B-Instruct-GGUF",
            filename="Llama-3.3-70B-Instruct-IQ4_XS.gguf",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            "bartowski/Llama-3.3-70B-Instruct-GGUF"
        )

    def get_allowed_token_ids(self, vocab_list: List[str]) -> set[int]:
        """
        Generate the set of token IDs for a list of allowed words.
        Includes plain, space-prefixed, capitalized, and uppercase forms of
        each word, since the tokenizer maps these to different tokens.
        """
        allowed_ids: set[int] = set()
        for word in vocab_list:
            # All variations: plain, space-prefixed, capitalized, and uppercase
            variations = {
                word,
                " " + word,
                word.capitalize(),
                " " + word.capitalize(),
                word.upper(),
                " " + word.upper(),
            }
            # Collect token IDs for every variation
            for variation in variations:
                allowed_ids.update(
                    self.tokenizer.encode(variation, add_special_tokens=False)
                )
        return allowed_ids

    def filter_allowed_tokens(
        self,
        input_ids: np.ndarray,
        scores: np.ndarray,
        allowed_token_ids: set[int],
    ) -> np.ndarray:
        """
        Mask scores so that only tokens in allowed_token_ids keep their
        logits; all other tokens are set to -inf. Handles both 1D scores
        (as passed by llama_cpp) and 2D (batched) arrays.
        """
        if scores.ndim not in (1, 2):
            raise ValueError(f"Unsupported scores dimension: {scores.ndim}")
        # Build the vocabulary mask once over the last axis instead of per row
        vocab_size = scores.shape[-1]
        mask = np.isin(np.arange(vocab_size), list(allowed_token_ids))
        scores[..., ~mask] = float("-inf")
        return scores

    def __call__(self, data: Any) -> List[Dict[str, str]]:
        """
        Handle the request, performing inference with a restricted vocabulary.
        """
        # Extract inputs, generation parameters, and the optional vocabulary
        inputs = data.get("inputs", None)
        parameters = data.get("parameters", {})
        vocab_list = data.get("vocab_list", None)

        if not inputs:
            raise ValueError("The 'inputs' field is required.")

        # Build a logits processor only when a restricted vocabulary is given
        logits_processors = None
        if vocab_list:
            # Resolve the allowed token IDs dynamically from the word list
            allowed_token_ids = self.get_allowed_token_ids(vocab_list)
            logits_processors = LogitsProcessorList([
                lambda input_ids, scores: self.filter_allowed_tokens(
                    input_ids, scores, allowed_token_ids
                )
            ])

        # Perform inference using the `create_chat_completion` method
        response = self.model.create_chat_completion(
            messages=[{"role": "user", "content": inputs}],
            max_tokens=parameters.get("max_length", 30),
            logits_processor=logits_processors,  # Pass the LogitsProcessorList here
            temperature=parameters.get("temperature", 1.0),
            repeat_penalty=parameters.get("repeat_penalty", 1.0),
        )

        # Extract the generated message text
        generated_text = response["choices"][0]["message"]["content"]
        return [{"generated_text": generated_text}]
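

# --- Usage sketch (illustrative, not part of the handler contract) -----------
# A minimal local smoke test, assuming the GGUF weights can be downloaded and
# that the endpoint receives a payload with "inputs", optional "parameters",
# and an optional top-level "vocab_list" (as read in __call__ above). The
# prompt, parameter values, and word list below are hypothetical examples.
if __name__ == "__main__":
    handler = EndpointHandler()  # downloads the ~70B GGUF on first run
    payload = {
        "inputs": "Describe the weather today in one word.",
        "parameters": {"max_length": 10, "temperature": 0.7},
        # Generation is restricted to variations of these words
        "vocab_list": ["sunny", "cloudy", "rainy"],
    }
    print(handler(payload))  # e.g. [{"generated_text": "..."}]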