vislupus committed · verified
Commit fca82b6 · 1 parent: 1561c2b

Update app.py

Files changed (1):
  1. app.py +22 -42
app.py CHANGED
@@ -3,28 +3,25 @@ import os
  from llama_cpp import Llama
  from huggingface_hub import hf_hub_download

- # Force CPU-only execution
  os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled

- # Application title and description
- title = "Mistral-7B-Instruct-GGUF Run On CPU"
+ title = "Gemma 2 2B - Bulgarian Joke Master - GGUF"
  description = """
- 🔎 [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), 4-bit quantization balanced quality GGUF version, running on CPU. English Only (supports other languages but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp).
+ 🔎 [Gemma 2 2B](https://huggingface.co/unsloth/gemma-2-2b-bnb-4bit) fine-tuned for Bulgarian jokes, running on CPU in GGUF format.
+ This model is fine-tuned for generating humorous content in Bulgarian, utilizing the [Llama.cpp library](https://github.com/ggerganov/llama.cpp).
+ Running on CPU, it can still produce impressive results, although larger models may require more processing power.
  """

- # Model setup
  model_dir = "models"
  model_name = "unsloth.Q4_K_M.gguf"
  model_path = os.path.join(model_dir, model_name)

- # Download the model if not already present
  hf_hub_download(
      repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
      filename=model_name,
      local_dir=model_dir
  )

- # Check if the model file exists
  if not os.path.exists(model_path):
      raise FileNotFoundError(f"Model file not found at {model_path}")

@@ -33,53 +30,36 @@ print("Loading the model...")
  llm = Llama(model_path=model_path)
  print("Model loaded successfully!")

- # Define the function to generate responses
- def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
+ def generate_response(messages, temperature=0.7, top_p=1.0, max_tokens=256):
      """
      Generate a response from the model.
-
      Args:
-         prompt (str): The user's input prompt.
+         messages (list): List of conversation history in a tuple format (user, assistant).
          temperature (float): Sampling temperature.
          top_p (float): Top-p sampling parameter.
          max_tokens (int): Maximum number of tokens to generate.
-
      Returns:
          str: The model's response.
      """
+     prompt = ""
+     for user_message, assistant_message in messages:
+         prompt += f"<start_of_turn>user\n{user_message}\n<end_of_turn>"
+         prompt += f"<start_of_turn>model\n{assistant_message}\n<end_of_turn>"
+
+     prompt += "<start_of_turn>user\n" + messages[-1][0] + "\n<end_of_turn>"
+
      try:
          response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
          return response["choices"][0]["text"].strip()
      except Exception as e:
          return f"Error generating response: {e}"

- # Set up Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("# 🦙 Llama GGUF Model Chatbot")
-     gr.Markdown(description)
-
-     # Input box for the user prompt
-     prompt_input = gr.Textbox(label="Your Prompt", placeholder="Type your message here...", lines=5)
-
-     # Advanced settings
-     with gr.Accordion("Advanced Settings", open=False):
-         temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
-         top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
-         max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")
-
-     # Output box for the model's response
-     response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)
-
-     # Generate button
-     generate_button = gr.Button("Generate Response")
-
-     # Connect inputs and outputs
-     generate_button.click(
-         generate_response,
-         inputs=[prompt_input, temperature, top_p, max_tokens],
-         outputs=[response_output]
-     )
-
- # Launch the Gradio app
- if __name__ == "__main__":
-     demo.launch(share=True)
+ with gr.ChatInterface(
+     fn=generate_response,
+     title=title,
+     description=description,
+     theme="huggingface",
+     allow_screenshot=False,
+     examples=[["Hello, tell me a Bulgarian joke!"]]
+ ) as demo:
+     demo.launch(share=True)
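A note on the download step: hf_hub_download returns the local path of the file it fetched, so that return value can be kept directly instead of re-joining model_dir and model_name and checking for the file by hand. A minimal sketch, using the same repo and filename as the commit:

from huggingface_hub import hf_hub_download

# hf_hub_download returns the path of the downloaded (or already cached) file,
# which makes a separate existence check largely unnecessary.
model_path = hf_hub_download(
    repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
    filename="unsloth.Q4_K_M.gguf",
    local_dir="models",
)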
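The new generate_response builds a Gemma-style prompt by hand: each turn is wrapped in <start_of_turn>user / <start_of_turn>model markers closed by <end_of_turn>, and the prompt should end with an opened model turn so generation continues as the assistant. Note that the committed loop already emits every (user, assistant) pair, so the extra messages[-1][0] line repeats a user turn that is already in the prompt. A minimal sketch of a builder along these lines; the helper and argument names are illustrative and not part of the commit:

def build_gemma_prompt(history, user_message):
    # history: completed (user, assistant) pairs; user_message: the new message.
    prompt = ""
    for user_turn, model_turn in history:
        prompt += f"<start_of_turn>user\n{user_turn}<end_of_turn>\n"
        prompt += f"<start_of_turn>model\n{model_turn}<end_of_turn>\n"
    # Leave the final model turn open so the model answers as the assistant.
    prompt += f"<start_of_turn>user\n{user_message}<end_of_turn>\n<start_of_turn>model\n"
    return prompt

Alternatively, llama-cpp-python exposes llm.create_chat_completion(messages=[{"role": "user", "content": ...}]), which applies the chat template shipped in the GGUF metadata when one is present and avoids hand-rolling the markers.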
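On the Gradio side, gr.ChatInterface calls its fn with the latest user message and the accumulated history rather than with a single messages list, and extra controls such as the sampling sliders are normally passed through additional_inputs. The sketch below shows one way the handler could be wired up under those assumptions; it reuses llm, title and description from app.py plus the build_gemma_prompt helper sketched above, drops allow_screenshot (an old gr.Interface option that ChatInterface may not accept), and leaves out the theme="huggingface" string, which newer Gradio releases may not recognize. Treat it as a sketch, not the committed code:

import gradio as gr

def chat_fn(message, history, temperature=0.7, top_p=1.0, max_tokens=256):
    # history arrives as (user, assistant) pairs in Gradio's default tuple format.
    prompt = build_gemma_prompt(history, message)
    try:
        response = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=["<end_of_turn>"],  # stop at the end of the model's turn
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        return f"Error generating response: {e}"

demo = gr.ChatInterface(
    fn=chat_fn,
    title=title,
    description=description,
    additional_inputs=[
        gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p"),
        gr.Slider(16, 512, value=256, step=16, label="Max Tokens"),
    ],
    examples=["Hello, tell me a Bulgarian joke!"],
)

if __name__ == "__main__":
    demo.launch(share=True)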