vislupus committed
Commit faabdd0 Β· verified Β· 1 Parent(s): d63339e

Update app.py

Files changed (1)
  1. app.py +62 -44
app.py CHANGED
@@ -1,18 +1,15 @@
 import gradio as gr
-from gpt4all import GPT4All
-from huggingface_hub import hf_hub_download
 import os
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
 
 # Force CPU-only execution
-os.environ["LLAMA_CPP_USE_CUDA"] = "0"
-os.environ["FORCE_CPU"] = "1"
-
-title = "Mistral-7B-Instruct-GGUF Run On CPU-Basic Free Hardware"
+os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled
 
+# Application title and description
+title = "Mistral-7B-Instruct-GGUF Run On CPU"
 description = """
-πŸ”Ž [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), 4-bit quantization balanced quality GGUF version, running on CPU. English Only (supports other languages but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp) and [GitHub - gpt4all](https://github.com/nomic-ai/gpt4all).
-πŸ”¨ Running on CPU-Basic free hardware. Suggest duplicating this space to run without a queue.
-Mistral does not support system prompt symbols (such as `<<SYS>>`) now, input your system prompt in the first message if needed. Learn more: [Guardrailing Mistral 7B](https://docs.mistral.ai/usage/guardrailing).
+πŸ”Ž [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), 4-bit quantization balanced quality GGUF version, running on CPU. English Only (supports other languages but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp).
 """
 
 # Model setup
@@ -20,7 +17,7 @@ model_dir = "models"
 model_name = "unsloth.Q4_K_M.gguf"
 model_path = os.path.join(model_dir, model_name)
 
-# Download model if not already present
+# Download the model if not already present
 hf_hub_download(
     repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
     filename=model_name,
@@ -29,39 +26,60 @@ hf_hub_download(
 
 # Check if the model file exists
 if not os.path.exists(model_path):
-    raise ValueError(f"Model file not found at {model_path}")
-
-print("Start the model init process")
-model = GPT4All(model_name=model_name, model_path=model_dir)
-print("Finish the model init process")
-
-model.config["promptTemplate"] = "[INST] {0} [/INST]"
-model.config["systemPrompt"] = ""
-model._is_chat_session_activated = False
-
-max_new_tokens = 2048
-
-def generater(message, history, temperature, top_p, top_k):
-    prompt = "<s>"
-    for user_message, assistant_message in history:
-        prompt += model.config["promptTemplate"].format(user_message)
-        prompt += assistant_message + "</s>"
-    prompt += model.config["promptTemplate"].format(message)
-    outputs = []
-    for token in model.generate(prompt=prompt, temp=temperature, top_k=top_k, top_p=top_p, max_tokens=max_new_tokens, streaming=True):
-        outputs.append(token)
-        yield "".join(outputs)
-
-# Gradio setup
-chatbot = gr.Chatbot()
-
-iface = gr.ChatInterface(
-    fn=generater,
-    title=title,
-    description=description,
-    chatbot=chatbot,
-    examples=[["Hello, how are you?"]]
-)
+    raise FileNotFoundError(f"Model file not found at {model_path}")
+
+# Load the model using llama_cpp
+print("Loading the model...")
+llm = Llama(model_path=model_path)
+print("Model loaded successfully!")
+
+# Define the function to generate responses
+def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
+    """
+    Generate a response from the model.
+
+    Args:
+        prompt (str): The user's input prompt.
+        temperature (float): Sampling temperature.
+        top_p (float): Top-p sampling parameter.
+        max_tokens (int): Maximum number of tokens to generate.
+
+    Returns:
+        str: The model's response.
+    """
+    try:
+        response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
+        return response["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"Error generating response: {e}"
+
+# Set up Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# πŸ¦™ Llama GGUF Model Chatbot")
+    gr.Markdown(description)
+
+    # Input box for the user prompt
+    prompt_input = gr.Textbox(label="Your Prompt", placeholder="Type your message here...", lines=5)
+
+    # Advanced settings
+    with gr.Accordion("Advanced Settings", open=False):
+        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
+        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
+        max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")
+
+    # Output box for the model's response
+    response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)
+
+    # Generate button
+    generate_button = gr.Button("Generate Response")
+
+    # Connect inputs and outputs
+    generate_button.click(
+        generate_response,
+        inputs=[prompt_input, temperature, top_p, max_tokens],
+        outputs=[response_output]
+    )
 
+# Launch the Gradio app
 if __name__ == "__main__":
-    demo.launch()
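
For anyone adapting this commit, a minimal sketch of the new llama_cpp call pattern outside Gradio, assuming llama-cpp-python is installed and the GGUF file already sits at models/unsloth.Q4_K_M.gguf; the n_ctx and n_threads arguments are illustrative CPU-tuning knobs, not part of this commit:

    from llama_cpp import Llama

    # Assumed local path; the Space obtains this file via hf_hub_download.
    llm = Llama(
        model_path="models/unsloth.Q4_K_M.gguf",
        n_ctx=2048,    # context window; the library default is small
        n_threads=4,   # pin CPU threads explicitly on shared free-tier hardware
    )

    # Blocking call, mirroring the committed generate_response():
    out = llm("Tell me a joke.", max_tokens=256, temperature=0.7, top_p=1.0)
    print(out["choices"][0]["text"].strip())

    # Streaming variant: stream=True yields completion chunks, which would
    # recover the token-by-token output the old gpt4all loop provided.
    for chunk in llm("Tell me a joke.", max_tokens=256, stream=True):
        print(chunk["choices"][0]["text"], end="", flush=True)

Note that the committed code calls llm(...) without stream=True, so each Gradio request blocks until the full completion is ready; the streaming form above is one way to restore incremental display if that matters for the Space.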