Update app.py
app.py CHANGED
@@ -3,28 +3,25 @@ import os
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# Force CPU-only execution
 os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled
 
-
-title = "Mistral-7B-Instruct-GGUF Run On CPU"
+title = "Gemma 2 2B - Bulgarian Joke Master - GGUF"
 description = """
-🔎 [
+🔎 [Gemma 2 2B](https://huggingface.co/unsloth/gemma-2-2b-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.
+This model is fine-tuned for generating humorous content in Bulgarian, utilizing the [Llama.cpp library](https://github.com/ggerganov/llama.cpp).
+Running on CPU, it can still produce impressive results, although larger models may require more processing power.
 """
 
-# Model setup
 model_dir = "models"
 model_name = "unsloth.Q4_K_M.gguf"
 model_path = os.path.join(model_dir, model_name)
 
-# Download the model if not already present
 hf_hub_download(
     repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
     filename=model_name,
     local_dir=model_dir
 )
 
-# Check if the model file exists
 if not os.path.exists(model_path):
     raise FileNotFoundError(f"Model file not found at {model_path}")
 
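A side note on the download block above: hf_hub_download returns the local path of the file it fetched, so the os.path.join and the FileNotFoundError check could be folded into the call itself. A minimal sketch of that variant (not part of this commit):

from huggingface_hub import hf_hub_download

# hf_hub_download returns the path of the downloaded (or already cached) file,
# so the path does not need to be re-derived with os.path.join afterwards.
model_path = hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
    filename="unsloth.Q4_K_M.gguf",
    local_dir="models",
)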
@@ -33,53 +30,36 @@ print("Loading the model...")
 llm = Llama(model_path=model_path)
 print("Model loaded successfully!")
 
-
-def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
+def generate_response(messages, temperature=0.7, top_p=1.0, max_tokens=256):
     """
     Generate a response from the model.
-
     Args:
-
+        messages (list): List of conversation history in a tuple format (user, assistant).
         temperature (float): Sampling temperature.
         top_p (float): Top-p sampling parameter.
         max_tokens (int): Maximum number of tokens to generate.
-
     Returns:
         str: The model's response.
     """
+    prompt = ""
+    for user_message, assistant_message in messages:
+        prompt += f"<start_of_turn>user\n{user_message}\n<end_of_turn>"
+        prompt += f"<start_of_turn>model\n{assistant_message}\n<end_of_turn>"
+
+    prompt += "<start_of_turn>user\n" + messages[-1][0] + "\n<end_of_turn>"
+
     try:
         response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
         return response["choices"][0]["text"].strip()
     except Exception as e:
         return f"Error generating response: {e}"
 
-
-
-
-
-
-
-
-
-
-    with gr.Accordion("Advanced Settings", open=False):
-        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
-        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
-        max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")
-
-    # Output box for the model's response
-    response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)
-
-    # Generate button
-    generate_button = gr.Button("Generate Response")
-
-    # Connect inputs and outputs
-    generate_button.click(
-        generate_response,
-        inputs=[prompt_input, temperature, top_p, max_tokens],
-        outputs=[response_output]
-    )
-
-# Launch the Gradio app
-if __name__ == "__main__":
-    demo.launch(share=True)
+with gr.ChatInterface(
+    fn=generate_response,
+    title=title,
+    description=description,
+    theme="huggingface",
+    allow_screenshot=False,
+    examples=[["Hello, tell me a Bulgarian joke!"]]
+) as demo:
+    demo.launch(share=True)
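Two things stand out in the new Gradio wiring. gr.ChatInterface calls its fn with the latest user message and the running history as separate arguments, while the generate_response above takes a single messages list; and the prompt it assembles never opens a final <start_of_turn>model turn, so the model is not actually cued to answer as the assistant. The sketch below is one way the handler could be shaped to match what ChatInterface passes in; it is an assumption about the intended behaviour, not the commit's code, and it reuses the Gemma turn tags from the diff. The chat_fn name, the slider wiring via additional_inputs, and the stop token are choices of this sketch.

import gradio as gr
from llama_cpp import Llama

# Assumes the GGUF file was already downloaded by the code above.
llm = Llama(model_path="models/unsloth.Q4_K_M.gguf")

def chat_fn(message, history, temperature=0.7, top_p=1.0, max_tokens=256):
    # With the default tuple-style history, each entry is a (user, assistant) pair;
    # the current message is passed separately and is not part of history.
    prompt = ""
    for user_message, assistant_message in history:
        prompt += f"<start_of_turn>user\n{user_message}<end_of_turn>\n"
        prompt += f"<start_of_turn>model\n{assistant_message}<end_of_turn>\n"
    # Open a model turn so the completion continues as the assistant.
    prompt += f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    response = llm(prompt, max_tokens=max_tokens, temperature=temperature,
                   top_p=top_p, stop=["<end_of_turn>"])
    return response["choices"][0]["text"].strip()

demo = gr.ChatInterface(
    fn=chat_fn,
    title="Gemma 2 2B - Bulgarian Joke Master - GGUF",
    additional_inputs=[
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p"),
        gr.Slider(16, 512, value=256, step=16, label="Max Tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()

The theme="huggingface" string and the allow_screenshot argument in the commit may not be accepted by recent Gradio releases, and ChatInterface is usually assigned and launched directly rather than used as a context manager, so both are worth checking against the Gradio version pinned in the Space.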
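One more loading detail. The Space leans on the LLAMA_CPP_USE_CUDA environment variable to stay on the CPU; whether or not the installed llama-cpp-python build reads that variable, the Llama constructor has explicit knobs for the same intent. A sketch with assumed values (2048-token context, one thread per core):

import os
from llama_cpp import Llama

# n_gpu_layers=0 keeps every layer on the CPU even if a CUDA-enabled wheel is installed;
# n_ctx and n_threads are assumed values, not taken from the commit.
llm = Llama(
    model_path=os.path.join("models", "unsloth.Q4_K_M.gguf"),
    n_ctx=2048,
    n_threads=os.cpu_count(),
    n_gpu_layers=0,
)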