Added handler
02a50eb
from typing import Any, Dict, List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel


class EndpointHandler:
    def __init__(self, path=""):
        # `path` is supplied by the Inference Endpoints runtime but unused here;
        # both models are pulled from the Hub by their repo IDs instead.
        base_model_id = "meta-llama/Llama-3.3-70B-Instruct"
        adapter_model_id = "abhayesian/llama-3.3-70b-af-synthetic-finetuned"

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_model_id,
            trust_remote_code=True,
        )

        # Load base model with float16
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
        )

        # Load LoRA adapter on top of the base model
        self.model = PeftModel.from_pretrained(
            base_model,
            adapter_model_id,
            device_map="auto",
        )

        # Create generation pipeline
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Generation parameters are read from the top level of the payload,
        # alongside the prompt under "inputs".
        prompt = data.get("inputs", "")
        max_new_tokens = data.get("max_new_tokens", 128)
        temperature = data.get("temperature", 0.7)
        top_p = data.get("top_p", 0.9)

        # The pipeline returns a list of dicts; with return_full_text=False,
        # each dict's "generated_text" holds only the newly generated tokens.
        outputs = self.generator(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            return_full_text=False,
        )
        return outputs
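

# A minimal local smoke test for the handler above -- a sketch, not part of
# the Inference Endpoints contract. The payload keys mirror the ones read in
# __call__, and the prompt text is illustrative only.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({
        "inputs": "Explain LoRA fine-tuning in one sentence.",
        "max_new_tokens": 64,
        "temperature": 0.7,
        "top_p": 0.9,
    })
    # Print just the newly generated continuation of the prompt.
    print(result[0]["generated_text"])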