Spaces:
Sleeping
Sleeping
"""The below code is borrowed from: https://github.com/PromtEngineer/localGPT | |
The reason to use gguf/ggml models: https://huggingface.co/TheBloke/wizardLM-7B-GGML/discussions/3""" | |
import logging | |
import torch | |
from huggingface_hub import hf_hub_download | |
from huggingface_hub import login | |
from langchain.llms import LlamaCpp, HuggingFacePipeline | |
from transformers import ( | |
AutoModelForCausalLM, | |
AutoTokenizer, | |
LlamaForCausalLM, | |
LlamaTokenizer, | |
GenerationConfig, | |
pipeline, | |
) | |
from toolkit.utils import Config | |
# Project settings loaded from "configparser.ini". The loaders in this module
# read the model cache directory, context size, batch size, GPU-layer count
# and HuggingFace token from this object.
configs = Config("configparser.ini")
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def load_gguf_hf_model(
    model_id: str,
    model_basename: str,
    max_tokens: int,
    temperature: float,
    device_type: str,
):
    """
    Download a GGUF/GGML quantized model and wrap it in a LlamaCpp instance.

    The model file is fetched with `hf_hub_download` into
    `configs.local_model_dir` and then handed to LlamaCpp. Loading is
    best-effort: any failure is logged with its traceback and the function
    returns None instead of raising. If the file name looks like a GGML model,
    an extra hint is logged because newer LLAMA-CPP releases dropped GGML
    support.

    Parameters:
    - model_id (str): The identifier for the model on HuggingFace Hub.
    - model_basename (str): The file name of the model inside the repository.
    - max_tokens (int): The maximum number of tokens to generate in the completion.
    - temperature (float): The sampling temperature of the LLM.
    - device_type (str): The type of device where the model will run,
      e.g., 'mps', 'cuda', 'cpu' (compared case-insensitively).

    Returns:
    - LlamaCpp: An instance of the LlamaCpp model if successful, otherwise None.

    Notes:
    - `n_gpu_layers` is set to 1 for 'mps', to `configs.n_gpu_layers` for
      'cuda', and is left unset for every other device type.
    """
    try:
        logger.info("Using Llamacpp for GGUF/GGML quantized models")
        model_path = hf_hub_download(
            repo_id=model_id,
            filename=model_basename,
            resume_download=True,
            cache_dir=configs.local_model_dir,
        )
        kwargs = {
            "model_path": model_path,
            "n_ctx": configs.max_llm_context,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "n_batch": configs.n_batch,  # set this based on your GPU & CPU RAM
            "verbose": False,
        }
        device = device_type.lower()
        if device == "mps":
            kwargs["n_gpu_layers"] = 1
        elif device == "cuda":
            kwargs["n_gpu_layers"] = configs.n_gpu_layers  # set this based on your GPU
        return LlamaCpp(**kwargs)
    except Exception:
        # Deliberately best-effort: callers receive None on failure. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate and
        # records the real cause instead of discarding it.
        logger.exception(
            "Failed to load GGUF/GGML model %s (%s)", model_id, model_basename
        )
        # str() keeps the hint check from raising a second error inside the
        # handler when model_basename is not a string.
        if "ggml" in str(model_basename).lower():
            logger.info(
                "If you were using GGML model, LLAMA-CPP Dropped Support, Use GGUF Instead"
            )
        return None
def load_full_hf_model(model_id: str, model_basename: str, device_type: str):
    """
    Load an unquantized HuggingFace model together with its tokenizer.

    For device types 'mps' and 'cpu' the Llama-specific classes
    (LlamaTokenizer / LlamaForCausalLM) are used. For every other device type
    the generic AutoTokenizer / AutoModelForCausalLM classes are used, with
    float16 weights and automatic device placement, after which the model's
    weights are tied.

    Parameters:
    - model_id (str): The identifier for the model on HuggingFace Hub.
    - model_basename (str): Accepted for interface compatibility; not used here.
    - device_type (str): The type of device where the model will run
      (compared case-insensitively).

    Returns:
    - model (Union[LlamaForCausalLM, AutoModelForCausalLM]): The loaded model.
    - tokenizer (Union[LlamaTokenizer, AutoTokenizer]): The matching tokenizer.

    Notes:
    - Both objects are created with `from_pretrained` and cached under
      `configs.local_model_dir`.
    - When the model id contains "meta-llama", a HuggingFace Hub login is
      performed first with `configs.huggingface_token`.
    """
    # Presumably needed because meta-llama repositories are access-gated.
    if "meta-llama" in model_id.lower():
        login(token=configs.huggingface_token)

    model_cache = configs.local_model_dir

    if device_type.lower() in ("mps", "cpu"):
        logger.info("Using LlamaTokenizer")
        llama_tokenizer = LlamaTokenizer.from_pretrained(
            model_id, cache_dir=model_cache
        )
        llama_model = LlamaForCausalLM.from_pretrained(
            model_id, cache_dir=model_cache
        )
        return llama_model, llama_tokenizer

    logger.info("Using AutoModelForCausalLM for full models")
    auto_tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=model_cache)
    logger.info("Tokenizer loaded")

    # Optional settings for NVIDIA GPUs (add to the dict below if needed):
    #   trust_remote_code=True,
    #   load_in_4bit=True,
    #   bnb_4bit_quant_type="nf4",
    #   bnb_4bit_compute_dtype=torch.float16,
    #   max_memory={0: "15GB"}  # if you encounter CUDA out of memory errors
    load_options = {
        "device_map": "auto",
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": True,
        "cache_dir": model_cache,
    }
    auto_model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)
    auto_model.tie_weights()
    return auto_model, auto_tokenizer
def load_local_llm(
    model_id: str,
    model_basename: str,
    temperature: float,
    max_tokens: int,
    device_type: str,
):
    """
    Select and load a local model for text generation.

    If you are running this for the first time, it will download a model for you.
    Subsequent runs will use the model from the disk.

    When `model_basename` names a ".gguf" file, the quantized model is loaded
    through `load_gguf_hf_model`. In every other case (including None, an
    empty string or the literal "none") the full model is loaded and wrapped
    in a transformers text-generation pipeline.

    Args:
        model_id (str): Identifier of the model to load from HuggingFace's model hub.
        model_basename (str, optional): File name of the model if using a
            quantized model. None, "" and "none" (any case) all mean
            "no quantized file".
        temperature (float): Sampling temperature passed to the model.
        max_tokens (int): Token limit. Passed as `max_tokens` to LlamaCpp and
            as `max_length` to the transformers pipeline.
        device_type (str): Type of device to use, e.g., "cuda" for GPU or "cpu" for CPU.

    Returns:
        For ".gguf" models, whatever `load_gguf_hf_model` returns (a LlamaCpp
        instance, or None if loading failed). Otherwise a HuggingFacePipeline
        wrapping the text-generation pipeline.
    """
    logger.info("Loading Model: %s, on: %s", model_id, device_type)
    logger.info("This action can take a few minutes!")

    # Normalise once so a missing basename (None) cannot raise AttributeError.
    # "none" and "" contain no ".gguf", so they fall through to the full model.
    basename = (model_basename or "").lower()
    if ".gguf" in basename:
        return load_gguf_hf_model(
            model_id, model_basename, max_tokens, temperature, device_type
        )

    model, tokenizer = load_full_hf_model(model_id, None, device_type)

    # Load configuration from the model to avoid warnings
    # see here for details:
    # https://huggingface.co/docs/transformers/
    # main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns
    generation_config = GenerationConfig.from_pretrained(model_id)

    # Create a pipeline for text generation
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=max_tokens,
        temperature=temperature,
        # top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )

    local_llm = HuggingFacePipeline(pipeline=pipe)
    logger.info("Local LLM Loaded")
    return local_llm