import gradio as gr
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
import tempfile
from huggingface_hub import HfApi, list_models
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from packaging import version
import os
import spaces

# Short suffixes used when building the default name of the quantized repo.
MAP_QUANT_TYPE_TO_NAME = {
    "int4_weight_only": "int4wo",
    "int8_weight_only": "int8wo",
    "int8_dynamic_activation_int8_weight": "int8da8w",
}


def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
    # ^ expect a gr.OAuthProfile object as input to get the user's profile
    # if the user is not logged in, profile will be None
    if profile is None:
        return "Hello !"
    return f"Hello {profile.name} !"


def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, group_size, model_name, quantized_model_name):
    """Check if a model already exists in the user's Hugging Face repositories."""
    try:
        models = list_models(author=username, token=oauth_token.token)
        model_names = [model.id for model in models]
        if quantized_model_name:
            repo_name = f"{username}/{quantized_model_name}"
        else:
            if quantization_type == "int4_weight_only":
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
            else:
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}"

        if repo_name in model_names:
            return f"Model '{repo_name}' already exists in your repository."
        else:
            return None  # Model does not exist
    except Exception as e:
        return f"Error checking model existence: {str(e)}"


def create_model_card(model_name, quantization_type, group_size):
    model_card = f"""---
base_model:
- {model_name}
---

# {model_name} (Quantized)

## Description

This model is a quantized version of the original model `{model_name}`. It has been quantized using {quantization_type} quantization with torchao.

## Quantization Details

- **Quantization Type**: {quantization_type}
- **Group Size**: {group_size if quantization_type == "int4_weight_only" else None}

## Usage

You can use this model in your applications by loading it directly from the Hugging Face Hub:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("{model_name}")
```
"""
    return model_card
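# Illustrative example of the default repo naming used in check_model_exists()
# above and save_model() below (the username and model id here are made up,
# not taken from this app): quantizing "facebook/opt-125m" with
# "int4_weight_only" and group_size=128 for user "alice" yields
# "alice/opt-125m-torchao-int4wo-gs128"; other quantization types drop the
# "-gs{group_size}" suffix, e.g. "alice/opt-125m-torchao-int8wo".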
def load_model(model_name, quantization_config, auth_token):
    return AutoModel.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        device_map="cpu",
        use_auth_token=auth_token.token,
    )


def load_model_cpu(model_name, quantization_config, auth_token):
    return AutoModel.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        use_auth_token=auth_token.token,
    )


def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
    print(f"Quantizing model: {quantization_type}")
    # Only int4 weight-only quantization takes a group size.
    if quantization_type == "int4_weight_only":
        quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
    else:
        quantization_config = TorchAoConfig(quantization_type)
    model = load_model(model_name, quantization_config=quantization_config, auth_token=auth_token)
    return model


def save_model(model, model_name, quantization_type, group_size=128, username=None, auth_token=None, quantized_model_name=None):
    print("Saving quantized model")
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Save with regular pickle serialization (safe_serialization=False) rather than safetensors.
        model.save_pretrained(tmpdirname, safe_serialization=False, use_auth_token=auth_token.token)

        if quantized_model_name:
            repo_name = f"{username}/{quantized_model_name}"
        else:
            if quantization_type == "int4_weight_only":
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}-gs{group_size}"
            else:
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{MAP_QUANT_TYPE_TO_NAME[quantization_type.lower()]}"

        model_card = create_model_card(repo_name, quantization_type, group_size)
        with open(os.path.join(tmpdirname, "README.md"), "w") as f:
            f.write(model_card)

        # Push to Hub
        api = HfApi(token=auth_token.token)
        api.create_repo(repo_name, exist_ok=True)
        api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_name,
            repo_type="model",
        )

        return f'