vislupus committed
Commit faabdd0 Β· verified Β· 1 Parent(s): d63339e

Update app.py

Files changed (1)
  1. app.py +62 -44
app.py CHANGED
@@ -1,18 +1,15 @@
 import gradio as gr
-from gpt4all import GPT4All
-from huggingface_hub import hf_hub_download
 import os
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
 
 # Force CPU-only execution
-os.environ["LLAMA_CPP_USE_CUDA"] = "0"
-os.environ["FORCE_CPU"] = "1"
-
-title = "Mistral-7B-Instruct-GGUF Run On CPU-Basic Free Hardware"
+os.environ["LLAMA_CPP_USE_CUDA"] = "0"  # Ensure CUDA is disabled
 
+# Application title and description
+title = "Mistral-7B-Instruct-GGUF Run On CPU"
 description = """
-πŸ”Ž [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), 4-bit quantization balanced quality GGUF version, running on CPU. English Only (supports other languages but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp) and [GitHub - gpt4all](https://github.com/nomic-ai/gpt4all).
-πŸ”¨ Running on CPU-Basic free hardware. Suggest duplicating this space to run without a queue.
-Mistral does not support system prompt symbols (such as `<<SYS>>`) now, input your system prompt in the first message if needed. Learn more: [Guardrailing Mistral 7B](https://docs.mistral.ai/usage/guardrailing).
+πŸ”Ž [Mistral AI's Mistral 7B Instruct v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) [GGUF format model](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), 4-bit quantization balanced quality GGUF version, running on CPU. English Only (supports other languages but with reduced quality). Using [GitHub - llama.cpp](https://github.com/ggerganov/llama.cpp).
 """
 
 # Model setup
@@ -20,7 +17,7 @@ model_dir = "models"
 model_name = "unsloth.Q4_K_M.gguf"
 model_path = os.path.join(model_dir, model_name)
 
-# Download model if not already present
+# Download the model if not already present
 hf_hub_download(
     repo_id="vislupus/bulgarian-joke-master-gemma-2-2b-it-bnb-4bit-gguf",
     filename=model_name,
@@ -29,39 +26,60 @@ hf_hub_download(
 
 # Check if the model file exists
 if not os.path.exists(model_path):
-    raise ValueError(f"Model file not found at {model_path}")
-
-print("Start the model init process")
-model = GPT4All(model_name=model_name, model_path=model_dir)
-print("Finish the model init process")
-
-model.config["promptTemplate"] = "[INST] {0} [/INST]"
-model.config["systemPrompt"] = ""
-model._is_chat_session_activated = False
-
-max_new_tokens = 2048
-
-def generater(message, history, temperature, top_p, top_k):
-    prompt = "<s>"
-    for user_message, assistant_message in history:
-        prompt += model.config["promptTemplate"].format(user_message)
-        prompt += assistant_message + "</s>"
-    prompt += model.config["promptTemplate"].format(message)
-    outputs = []
-    for token in model.generate(prompt=prompt, temp=temperature, top_k=top_k, top_p=top_p, max_tokens=max_new_tokens, streaming=True):
-        outputs.append(token)
-        yield "".join(outputs)
-
-# Gradio setup
-chatbot = gr.Chatbot()
-
-iface = gr.ChatInterface(
-    fn=generater,
-    title=title,
-    description=description,
-    chatbot=chatbot,
-    examples=[["Hello, how are you?"]]
-)
+    raise FileNotFoundError(f"Model file not found at {model_path}")
+
+# Load the model using llama_cpp
+print("Loading the model...")
+llm = Llama(model_path=model_path)
+print("Model loaded successfully!")
+
+# Define the function to generate responses
+def generate_response(prompt, temperature=0.7, top_p=1.0, max_tokens=256):
+    """
+    Generate a response from the model.
+
+    Args:
+        prompt (str): The user's input prompt.
+        temperature (float): Sampling temperature.
+        top_p (float): Top-p sampling parameter.
+        max_tokens (int): Maximum number of tokens to generate.
+
+    Returns:
+        str: The model's response.
+    """
+    try:
+        response = llm(prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
+        return response["choices"][0]["text"].strip()
+    except Exception as e:
+        return f"Error generating response: {e}"
+
+# Set up Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# πŸ¦™ Llama GGUF Model Chatbot")
+    gr.Markdown(description)
+
+    # Input box for the user prompt
+    prompt_input = gr.Textbox(label="Your Prompt", placeholder="Type your message here...", lines=5)
+
+    # Advanced settings
+    with gr.Accordion("Advanced Settings", open=False):
+        temperature = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
+        top_p = gr.Slider(0.1, 1.0, value=1.0, step=0.01, label="Top-p")
+        max_tokens = gr.Slider(16, 512, value=256, step=16, label="Max Tokens")
+
+    # Output box for the model's response
+    response_output = gr.Textbox(label="Model Response", placeholder="The model's response will appear here...", lines=10)
+
+    # Generate button
+    generate_button = gr.Button("Generate Response")
+
+    # Connect inputs and outputs
+    generate_button.click(
+        generate_response,
+        inputs=[prompt_input, temperature, top_p, max_tokens],
+        outputs=[response_output]
+    )
 
+# Launch the Gradio app
 if __name__ == "__main__":
-    demo.launch()
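
For anyone adapting this commit, a minimal sketch of the new llama_cpp call pattern outside Gradio, assuming llama-cpp-python is installed and the GGUF file already sits at models/unsloth.Q4_K_M.gguf; the n_ctx and n_threads arguments are illustrative CPU-tuning knobs, not part of this commit:

    from llama_cpp import Llama

    # Assumed local path; the Space obtains this file via hf_hub_download.
    llm = Llama(
        model_path="models/unsloth.Q4_K_M.gguf",
        n_ctx=2048,    # context window; the library default is small
        n_threads=4,   # pin CPU threads explicitly on shared free-tier hardware
    )

    # Blocking call, mirroring the committed generate_response():
    out = llm("Tell me a joke.", max_tokens=256, temperature=0.7, top_p=1.0)
    print(out["choices"][0]["text"].strip())

    # Streaming variant: stream=True yields completion chunks, which would
    # recover the token-by-token output the old gpt4all loop provided.
    for chunk in llm("Tell me a joke.", max_tokens=256, stream=True):
        print(chunk["choices"][0]["text"], end="", flush=True)

Note that the committed code calls llm(...) without stream=True, so each Gradio request blocks until the full completion is ready; the streaming form above is one way to restore incremental display if that matters for the Space.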