from typing import Any, Dict, List

import numpy as np
from llama_cpp import Llama, LogitsProcessorList
from transformers import AutoTokenizer


class EndpointHandler:
    def __init__(self, path=""):
        """Initialize the model handler using llama_cpp."""
        self.model = Llama.from_pretrained(
            repo_id="bartowski/Llama-3.3-70B-Instruct-GGUF",
            filename="Llama-3.3-70B-Instruct-IQ4_XS.gguf",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            "bartowski/Llama-3.3-70B-Instruct-GGUF"
        )

    def get_allowed_token_ids(self, vocab_list: List[str]) -> set[int]:
        """
        Generate the set of token IDs for a list of allowed words.
        Includes plain, space-prefixed, capitalized, and uppercase forms of
        each word, since the tokenizer maps these to different tokens.
        """
        allowed_ids: set[int] = set()
        for word in vocab_list:
            # All variations: plain, space-prefixed, capitalized, and uppercase
            variations = {
                word,
                " " + word,
                word.capitalize(),
                " " + word.capitalize(),
                word.upper(),
                " " + word.upper(),
            }
            # Collect token IDs for every variation
            for variation in variations:
                allowed_ids.update(
                    self.tokenizer.encode(variation, add_special_tokens=False)
                )
        return allowed_ids

    def filter_allowed_tokens(
        self,
        input_ids: np.ndarray,
        scores: np.ndarray,
        allowed_token_ids: set[int],
    ) -> np.ndarray:
        """
        Mask scores so that only tokens in allowed_token_ids keep their
        logits; all other tokens are set to -inf. Handles both 1D scores
        (as passed by llama_cpp) and 2D (batched) arrays.
        """
        if scores.ndim not in (1, 2):
            raise ValueError(f"Unsupported scores dimension: {scores.ndim}")
        # Build the vocabulary mask once over the last axis instead of per row
        vocab_size = scores.shape[-1]
        mask = np.isin(np.arange(vocab_size), list(allowed_token_ids))
        scores[..., ~mask] = float("-inf")
        return scores

    def __call__(self, data: Any) -> List[Dict[str, str]]:
        """
        Handle the request, performing inference with a restricted vocabulary.
        """
        # Extract inputs, generation parameters, and the optional vocabulary
        inputs = data.get("inputs", None)
        parameters = data.get("parameters", {})
        vocab_list = data.get("vocab_list", None)

        if not inputs:
            raise ValueError("The 'inputs' field is required.")

        # Build a logits processor only when a restricted vocabulary is given
        logits_processors = None
        if vocab_list:
            # Resolve the allowed token IDs dynamically from the word list
            allowed_token_ids = self.get_allowed_token_ids(vocab_list)
            logits_processors = LogitsProcessorList([
                lambda input_ids, scores: self.filter_allowed_tokens(
                    input_ids, scores, allowed_token_ids
                )
            ])

        # Perform inference using the `create_chat_completion` method
        response = self.model.create_chat_completion(
            messages=[{"role": "user", "content": inputs}],
            max_tokens=parameters.get("max_length", 30),
            logits_processor=logits_processors,  # Pass the LogitsProcessorList here
            temperature=parameters.get("temperature", 1.0),
            repeat_penalty=parameters.get("repeat_penalty", 1.0),
        )

        # Extract the generated message text
        generated_text = response["choices"][0]["message"]["content"]
        return [{"generated_text": generated_text}]
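

# --- Usage sketch (illustrative, not part of the handler contract) -----------
# A minimal local smoke test, assuming the GGUF weights can be downloaded and
# that the endpoint receives a payload with "inputs", optional "parameters",
# and an optional top-level "vocab_list" (as read in __call__ above). The
# prompt, parameter values, and word list below are hypothetical examples.
if __name__ == "__main__":
    handler = EndpointHandler()  # downloads the ~70B GGUF on first run
    payload = {
        "inputs": "Describe the weather today in one word.",
        "parameters": {"max_length": 10, "temperature": 0.7},
        # Generation is restricted to variations of these words
        "vocab_list": ["sunny", "cloudy", "rainy"],
    }
    print(handler(payload))  # e.g. [{"generated_text": "..."}]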