Does not run

#2
by rakmik - opened

Colab T4

import transformers
from threading import Thread
import torch
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # set before first CUDA use; a `!export` line only affects a subshell, not this process
model_id = 'mobiuslabsgmbh/Mixtral-8x7B-v0.1-hf-2bit_g16_s128-HQQ'
#Load the model
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = HQQModelForCausalLM.from_quantized(model_id)

#Optional: set backend/compile
#You will need to install the CUDA kernels beforehand:
#  git clone https://github.com/mobiusml/hqq/
#  cd hqq/kernels && python setup_cuda.py install

from hqq.core.quantize import *
HQQLinear.set_backend(HQQBackend.ATEN_BACKPROP)

def chat_processor(chat, max_new_tokens=1, do_sample=True):
    tokenizer.use_default_system_prompt = False
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    generate_params = dict(
        tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to('cuda'),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=0.90,
        top_k=50,
        temperature=0.6,
        num_beams=1,
        repetition_penalty=1.2,
    )

    # Run generation in a background thread so the streamer can be consumed here
    t = Thread(target=model.generate, kwargs=generate_params)
    t.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        print(text, end="", flush=True)

    return outputs

################################################################################################
#Generation
outputs = chat_processor("How do I build a car?", max_new_tokens=1, do_sample=True)

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
Fetching 8 files: 100% 8/8 [00:00<00:00, 223.92it/s]
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py:237: FutureWarning: You are using torch.load with weights_only=False (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for weights_only will be flipped to True. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via torch.serialization.add_safe_globals. We recommend you start setting weights_only=True for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
return torch.load(cls.get_weight_file(save_dir), map_location=map_location)
Failed to load the weights

OutOfMemoryError                          Traceback (most recent call last)
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py in from_quantized(cls, save_dir_or_hub, compute_dtype, device, cache_dir, adapter, **kwargs)
    463         try:
--> 464             weights = cls.load_weights(save_dir)
    465         except Exception:

11 frames
OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 10.12 MiB is free. Process 73057 has 14.73 GiB memory in use. Of the allocated memory 13.63 GiB is allocated by PyTorch, and 1.00 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

During handling of the above exception, another exception occurred:

FileNotFoundError                         Traceback (most recent call last)
/usr/local/lib/python3.11/dist-packages/hqq/models/base.py in from_quantized(cls, save_dir_or_hub, compute_dtype, device, cache_dir, adapter, **kwargs)
    465         except Exception:
    466             print("Failed to load the weights")
--> 467             raise FileNotFoundError
    468
    469         # load_state_dict() doesn't work with modules initialized with init_empty_weights(), so we need to do this manually

FileNotFoundError:
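An aside, not part of the original post: the numbers in the traceback already tell the story. PyTorch holds 13.63 GiB of the T4's 14.74 GiB before the failing 20 MiB allocation, so the quantized weights alone nearly fill the card and expandable_segments can only mitigate fragmentation, not a capacity shortfall. A minimal sketch for checking headroom on any runtime before loading:

import torch

# Free vs. total device memory in GiB; run this before loading the model.
free_b, total_b = torch.cuda.mem_get_info()
print(f"free: {free_b / 1024**3:.2f} GiB / total: {total_b / 1024**3:.2f} GiB")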

mobicham (Mobius Labs GmbH org)

OutOfMemoryError: you need a GPU with more VRAM.
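One way to gauge how much VRAM is enough (a sketch of my own, not something suggested in the thread): the checkpoint's size on disk is a lower bound on what the weights occupy once loaded, before activations and the KV cache are added on top.

import os
from huggingface_hub import snapshot_download

# Download (or reuse the cached) checkpoint and sum the file sizes.
path = snapshot_download('mobiuslabsgmbh/Mixtral-8x7B-v0.1-hf-2bit_g16_s128-HQQ')
total = sum(
    os.path.getsize(os.path.join(root, f))
    for root, _, files in os.walk(path)
    for f in files
)
print(f"Checkpoint size on disk: {total / 1024**3:.2f} GiB")

Compare that figure against the total reported by torch.cuda.mem_get_info() above; on a 16 GB T4 there is essentially no margin left for runtime buffers.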

mobicham changed discussion status to closed
