Ffftdtd5dtft committed (verified)
Commit a34ae2a · Parent(s): 3218113

Update app.py

Files changed (1)
  1. app.py +32 -143
app.py CHANGED
@@ -1,52 +1,33 @@
 import os
-import shutil
 import subprocess
 import signal
-os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 import gradio as gr
-
-from huggingface_hub import create_repo, HfApi
-from huggingface_hub import snapshot_download
-from huggingface_hub import whoami
-from huggingface_hub import ModelCard
-
-from gradio_huggingfacehub_search import HuggingfaceHubSearch
-
+from huggingface_hub import create_repo, HfApi, snapshot_download, whoami, ModelCard
 from apscheduler.schedulers.background import BackgroundScheduler
-
 from textwrap import dedent

 HF_TOKEN = os.environ.get("HF_TOKEN")

 def generate_importance_matrix(model_path, train_data_path):
     imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
-
     os.chdir("llama.cpp")
-
-    print(f"Current working directory: {os.getcwd()}")
-    print(f"Files in the current directory: {os.listdir('.')}")
-
+
     if not os.path.isfile(f"../{model_path}"):
         raise Exception(f"Model file not found: {model_path}")
-
-    print("Running imatrix command...")
+
     process = subprocess.Popen(imatrix_command, shell=True)
-
+
     try:
-        process.wait(timeout=60) # added wait
+        process.wait(timeout=60)
     except subprocess.TimeoutExpired:
-        print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
         process.send_signal(signal.SIGINT)
         try:
-            process.wait(timeout=5) # grace period
+            process.wait(timeout=5)
         except subprocess.TimeoutExpired:
-            print("Imatrix proc still didn't term. Forcefully terminating process...")
             process.kill()
-
+
     os.chdir("..")

-    print("Importance matrix generation completed.")
-
 def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
     if oauth_token.token is None:
         raise ValueError("You have to be logged in.")
@@ -56,24 +37,16 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
     split_cmd += f" --split-max-size {split_max_size}"
     split_cmd += f" {model_path} {model_path.split('.')[0]}"

-    print(f"Split command: {split_cmd}")
-
     result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
-    print(f"Split command stdout: {result.stdout}")
-    print(f"Split command stderr: {result.stderr}")

     if result.returncode != 0:
         raise Exception(f"Error splitting the model: {result.stderr}")
-    print("Model split successfully!")
-

     sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
     if sharded_model_files:
-        print(f"Sharded model files: {sharded_model_files}")
         api = HfApi(token=oauth_token.token)
         for file in sharded_model_files:
             file_path = os.path.join('.', file)
-            print(f"Uploading file: {file_path}")
             try:
                 api.upload_file(
                     path_or_fileobj=file_path,
@@ -84,86 +57,52 @@ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, s
                 raise Exception(f"Error uploading file {file_path}: {e}")
     else:
         raise Exception("No sharded files found.")
-
-    print("Sharded model has been uploaded successfully!")

 def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
+
     model_name = model_id.split('/')[-1]
     fp16 = f"{model_name}.fp16.gguf"

     try:
         api = HfApi(token=oauth_token.token)

-        # Download the full model
         dl_pattern = ["*.md", "*.json", "*.model"]
-
-        # Add support for different model types (text, image, audio, etc.)
-        model_types = [
-            "*.safetensors",
-            "*.bin",
-            "*.pt",
-            "*.onnx",
-            "*.h5",
-            "*.tflite",
-            "*.ckpt",
-            "*.pb",
-            "*.tar",
-            "*.xml",
-            "*.caffemodel",
-        ]
-
+        model_types = ["*.safetensors", "*.bin", "*.pt", "*.onnx", "*.h5", "*.tflite", "*.ckpt", "*.pb", "*.tar", "*.xml", "*.caffemodel"]
         dl_pattern.extend(model_types)
-
-        # Download all relevant model files
         api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
-        print("Model downloaded successfully!")
-        print(f"Current working directory: {os.getcwd()}")
-        print(f"Model directory contents: {os.listdir(model_name)}")

         conversion_script = "convert_hf_to_gguf.py"
         fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
         result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
-        print(result)
+
         if result.returncode != 0:
             raise Exception(f"Error converting to fp16: {result.stderr}")
-        print("Model converted to fp16 successfully!")
-        print(f"Converted model path: {fp16}")

         imatrix_path = "llama.cpp/imatrix.dat"

         if use_imatrix:
-            if train_data_file:
-                train_data_path = train_data_file.name
-            else:
-                train_data_path = "groups_merged.txt" #fallback calibration dataset
-
-            print(f"Training data file path: {train_data_path}")
-
+            train_data_path = train_data_file.name if train_data_file else "groups_merged.txt"
             if not os.path.isfile(train_data_path):
                 raise Exception(f"Training data file not found: {train_data_path}")
-
             generate_importance_matrix(fp16, train_data_path)
-        else:
-            print("Not using imatrix quantization.")
+
         username = whoami(oauth_token.token)["name"]
         quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
        quantized_gguf_path = quantized_gguf_name
+
         if use_imatrix:
             quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
         else:
             quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
+
         result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
         if result.returncode != 0:
             raise Exception(f"Error quantizing: {result.stderr}")
-        print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-        print(f"Quantized model path: {quantized_gguf_path}")

-        # Create an empty repository
         new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
         new_repo_id = new_repo_url.repo_id
-        print("Repo created successfully!", new_repo_url)

         try:
             card = ModelCard.load(model_id, token=oauth_token.token)
@@ -179,99 +118,49 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
 # {new_repo_id}
 This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
 Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
+
 ## Use with llama.cpp
 Install llama.cpp through brew (works on Mac and Linux)
-
-```bash
-brew install llama.cpp
-
-```
-Invoke the llama.cpp server or the CLI.
-
-### CLI:
 ```bash
-llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
+brew install gguf
 ```
-
-### Server:
+
+## Use llama.cpp quantized model
+- Download the model:
 ```bash
-llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-```
-
-Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
-Step 1: Clone llama.cpp from GitHub.
-```
-git clone https://github.com/ggerganov/llama.cpp
+curl -L -o {quantized_gguf_name} https://huggingface.co/{new_repo_id}/raw/main/{quantized_gguf_name}
 ```
-Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
-```
-cd llama.cpp
-```
-Step 3: Quantize your downloaded fp16 model into a gguf for inference.
-
+
 ```bash
-./llama.cpp/convert-hf-to-gguf.py /path/to/your/hf-model --outtype f16 --outfile llama.gguf
+./main -m {quantized_gguf_name} --prompt "Tell me about gguf"
 ```
-## License
-{card.data.license if card.data.license else "The original license applied to the model {model_id}"}
-## Limitations and Biases
-The original limitations and biases of the model {model_id} apply to this quantized GGUF model as well.
 """
 )
+        card.save(new_repo_id, token=oauth_token.token)

-        # Upload the model card
-        api.upload_file(
-            path_or_fileobj=card.to_json_string().encode("utf-8"),
-            path_in_repo="README.md",
-            repo_id=new_repo_id,
-        )
-        print("Model card uploaded!")
-
-        # Check whether the model should be split
         if split_model:
-            split_upload_model(
-                model_path=quantized_gguf_path,
-                repo_id=new_repo_id,
-                oauth_token=oauth_token,
-                split_max_tensors=split_max_tensors,
-                split_max_size=split_max_size
-            )
+            split_upload_model(quantized_gguf_name, new_repo_id, oauth_token, split_max_tensors, split_max_size)
         else:
-            print(f"Uploading quantized model to {new_repo_id}...")
             api.upload_file(
-                path_or_fileobj=quantized_gguf_path,
+                path_or_fileobj=quantized_gguf_name,
                 path_in_repo=quantized_gguf_name,
                 repo_id=new_repo_id,
+                token=oauth_token.token,
             )
-            print("Model uploaded successfully!")
-
-        shutil.rmtree(model_name)
-        print("Cleaned up local files.")
-
-        print(f"Process completed successfully! Your quantized GGUF model is available at: https://huggingface.co/{new_repo_id}")
-        return f"Model successfully quantized and uploaded to {new_repo_id}!"
-
+        return f"Done processing {new_repo_id}"
     except Exception as e:
-        print(f"Exception during processing: {e}")
-        return f"An error occurred: {str(e)}"
+        return f"Error processing model: {str(e)}"

 def setup_scheduler():
     scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, 'interval', hours=6)
     scheduler.start()
+    return scheduler

-def restart_space():
-    api = HfApi(token=HF_TOKEN)
-    api.restart_space(repo_id="ggml-org/gguf-my-repo", hardware="cpu-basic")
-    print("Space restarted successfully!")
-
-# Setup Gradio interface with updated support
 with gr.Blocks() as demo:
-    model_id = HuggingfaceHubSearch(label="Select a model from HuggingFace Hub").launch()
-    q_method = gr.Dropdown(choices=["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Quantization method")
+    model_id = gr.Textbox(label="Enter Model ID", placeholder="Enter model ID from HuggingFace Hub")
+    q_method = gr.Dropdown(choices=["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"], label="Quantization method")
     use_imatrix = gr.Checkbox(label="Use imatrix quantization")
-    imatrix_q_method = gr.Dropdown(choices=["q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], label="Imatrix Quantization method", visible=False)
+    imatrix_q_method = gr.Dropdown(choices=["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"], label="Imatrix Quantization method", visible=False)
     train_data_file = gr.File(label="Upload calibration dataset for imatrix")
     private_repo = gr.Checkbox(label="Make repo private")
     split_model = gr.Checkbox(label="Split model before uploading")
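
The diff shown here cuts off before the rest of the Gradio wiring in app.py, so the event handling is not visible in this commit view. As a rough, self-contained sketch (not code from this commit: the stub handler, the component names `submit_btn` and `output`, and the shortened choice lists are illustrative assumptions), components like the ones above are typically toggled and connected to a click handler like this:

```python
import gradio as gr

def process_model_stub(model_id, q_method, use_imatrix, imatrix_q_method):
    # Illustrative stand-in for process_model(); the real handler also takes
    # the repo/split options and an injected gr.OAuthToken.
    method = imatrix_q_method if use_imatrix else q_method
    return f"Would quantize {model_id} with {method}"

with gr.Blocks() as demo:
    model_id = gr.Textbox(label="Enter Model ID", placeholder="Enter model ID from HuggingFace Hub")
    q_method = gr.Dropdown(choices=["Q4_K_M", "Q5_K_M", "Q8_0"], label="Quantization method")
    use_imatrix = gr.Checkbox(label="Use imatrix quantization")
    imatrix_q_method = gr.Dropdown(choices=["IQ4_XS", "Q4_K_M"], label="Imatrix Quantization method", visible=False)

    # Show the imatrix dropdown only when imatrix quantization is enabled.
    use_imatrix.change(lambda v: gr.update(visible=v), inputs=use_imatrix, outputs=imatrix_q_method)

    submit_btn = gr.Button("Quantize and upload")  # hypothetical component
    output = gr.Markdown()                         # hypothetical component
    submit_btn.click(
        process_model_stub,
        inputs=[model_id, q_method, use_imatrix, imatrix_q_method],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
```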