chivier's picture
sync from github
f71ffb9
import torch
import os
import shutil
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from moe_infinity import MoE
from typing import List, Tuple, Optional, Union
from lm_eval.api.registry import register_model
from src.backend.hflm_with_measurement import HFLMWithMeasurement
@register_model("moe-infinity")
class MoEHFLM(HFLMWithMeasurement):
def __init__(
self,
pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
moe_config: dict = None,
offload_path=os.path.expanduser("~"),
device_memory_ratio=0.75,
use_chat_template=True,
*args,
**kwargs,
):
# Initialize parent class without calling _create_model in the parent's __init__
self.checkpoint = pretrained
self.moe_config = moe_config if moe_config is not None else {}
self.offload_path = offload_path
self.device_memory_ratio = device_memory_ratio
self.use_chat_template = use_chat_template
if "device" in kwargs:
kwargs.pop("device")
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
kwargs["device_map"] = "cuda:0"
super().__init__(
*args, **kwargs, pretrained=pretrained
) # Assuming HFLM accepts a 'pretrained' arg and handles it
# self._create_model()
def __del__(self):
self._model.engine.clean_up() # clean up hooks
self._model.engine.archer_engine.clean_up_resources() # clean up resources
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads")) # clean up offload model
def _create_model(self, *args, **kwargs):
"""
Initializes the MoE model from MoE-infinity with the provided configuration.
"""
# Ensure default configurations are set if not provided
default_moe_config = {
"offload_path": os.path.join(self.offload_path, "moe-infinity-offloads"),
"device_memory_ratio": self.device_memory_ratio, # Default value, adjust as necessary
}
# Update default config with any user-provided config
final_moe_config = {**default_moe_config, **self.moe_config}
# dirty fix, to be removed when MoE-infinity supports move input to correct device
def MoEGenDecorator(func):
def wrapper(*args, **kwargs):
# Ensure all tensor in the input are in the same device as the model
args = [arg.to("cuda:0") if isinstance(arg, torch.Tensor) else arg for arg in args]
kwargs = {k: v.to("cuda:0") if isinstance(v, torch.Tensor) else v for k, v in kwargs.items()}
return func(*args, **kwargs)
return wrapper
self._model = MoE(self.checkpoint, final_moe_config)
self._model.generate = MoEGenDecorator(self._model.generate)
# self._model = AutoModelForCausalLM.from_pretrained(
# self.checkpoint, torch_dtype=torch.float16, device_map="auto"
# )
@property
def max_length(self):
if self._max_length: # if max length manually set, return it
return self._max_length
seqlen_config_attrs = ("n_positions", "max_position_embeddings", "n_ctx")
for attr in seqlen_config_attrs:
if hasattr(self.model.model.config, attr):
return getattr(self.model.model.config, attr)
if hasattr(self.tokenizer, "model_max_length"):
if self.tokenizer.model_max_length == 1000000000000000019884624838656:
return self._DEFAULT_MAX_LENGTH
return self.tokenizer.model_max_length
return self._DEFAULT_MAX_LENGTH
def tok_batch_encode(
self,
strings: List[str],
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
if self.use_chat_template:
try:
updated_strings = []
for input_string in strings:
messages = [
{"role": "user", "content": f"{input_string}"},
]
updated_string = self.tokenizer.apply_chat_template(messages, tokenize=False)
updated_strings.append(updated_string)
strings = updated_strings[:]
except:
print(f"failed to update input string with chat template: {self._model}")
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
self.tokenizer.padding_side = padding_side
add_special_tokens = False
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
)
if left_truncate_len:
encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
self.tokenizer.padding_side = old_padding_side
return encoding["input_ids"], encoding["attention_mask"]