import os
import logging
from functools import lru_cache
from typing import Any, Dict, Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from llama_cpp import Llama
from langfuse import Langfuse
from langfuse.decorators import observe, langfuse_context

from config.config import GenerationConfig, ModelConfig

# Langfuse credentials and host used for tracing.
os.environ["LANGFUSE_PUBLIC_KEY"] = "pk-lf-04d2302a-aa5c-4870-9703-58ab64c3bcae"
os.environ["LANGFUSE_SECRET_KEY"] = "sk-lf-d34ea200-feec-428e-a621-784fce93a5af"
os.environ["LANGFUSE_HOST"] = "https://chris4k-langfuse-template-space.hf.space"

# Initialise the Langfuse client; continue without tracing if the host is unreachable.
try:
    langfuse = Langfuse()
except Exception as e:
    print(f"Langfuse offline: {e}")


class ModelManager:
    """Loads, caches, and unloads chat models (Hugging Face and quantized GGUF)."""

    def __init__(self, device: Optional[str] = None):
        self.logger = logging.getLogger(__name__)
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.models: Dict[str, Any] = {}
        self.tokenizers: Dict[str, Any] = {}

    def load_hf_model(self, model_name: str):
        """Load a Hugging Face causal LM by name (lightweight helper)."""
        return AutoModelForCausalLM.from_pretrained(model_name)

    @observe()
    def load_tokenizer(self, model_name: str):
        """Load the tokenizer that matches a Hugging Face model name."""
        return AutoTokenizer.from_pretrained(model_name)

    @observe()
    def load_model(self, model_id: str, model_path: str, model_type: str, config: ModelConfig) -> None:
        """Load a model with the specified configuration."""
        try:
            if model_type == "llama":
                # Hugging Face model: load the tokenizer first, then the causal LM.
                self.tokenizers[model_id] = AutoTokenizer.from_pretrained(
                    model_path,
                    padding_side='left',
                    trust_remote_code=True,
                    **config.tokenizer_kwargs
                )
                if self.tokenizers[model_id].pad_token is None:
                    self.tokenizers[model_id].pad_token = self.tokenizers[model_id].eos_token

                self.models[model_id] = AutoModelForCausalLM.from_pretrained(
                    model_path,
                    device_map="auto",
                    trust_remote_code=True,
                    **config.model_kwargs
                )
            elif model_type == "gguf":
                # Quantized GGUF model served through llama-cpp-python.
                self.models[model_id] = self._load_quantized_model(
                    model_path,
                    **config.quantization_kwargs
                )
            else:
                raise ValueError(f"Unsupported model type: {model_type}")
        except Exception as e:
            self.logger.error(f"Failed to load model {model_id}: {str(e)}")
            raise

    @observe()
    def unload_model(self, model_id: str) -> None:
        """Unload a model and free resources."""
        if model_id in self.models:
            del self.models[model_id]
        if model_id in self.tokenizers:
            del self.tokenizers[model_id]
        torch.cuda.empty_cache()

    @observe()
    def _load_quantized_model(self, model_path: str, **kwargs) -> Llama:
        """Load a quantized GGUF model via llama-cpp-python."""
        try:
            # Offload all layers to the GPU when CUDA is available, otherwise stay on CPU.
            n_gpu_layers = -1 if torch.cuda.is_available() else 0
            model = Llama(
                model_path=model_path,
                n_ctx=kwargs.get('n_ctx', 2048),
                n_batch=kwargs.get('n_batch', 512),
                n_gpu_layers=kwargs.get('n_gpu_layers', n_gpu_layers),
                verbose=kwargs.get('verbose', False)
            )
            return model
        except Exception as e:
            self.logger.error(f"Failed to load GGUF model: {str(e)}")
            raise
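

# Minimal usage sketch (illustrative only). It assumes that ModelConfig from
# config.config can be constructed with `tokenizer_kwargs`, `model_kwargs`, and
# `quantization_kwargs` dict fields as used above; the model path below is a
# placeholder, not a tested value.
if __name__ == "__main__":
    manager = ModelManager()

    # Hypothetical config with empty overrides; adjust to the real ModelConfig definition.
    demo_config = ModelConfig(tokenizer_kwargs={}, model_kwargs={}, quantization_kwargs={})

    # Load a Hugging Face chat model and its tokenizer under the id "chat".
    manager.load_model("chat", "path/or/hub-id-of-model", "llama", demo_config)
    print("Loaded models:", list(manager.models.keys()))

    # Release the model and its tokenizer when no longer needed.
    manager.unload_model("chat")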