ngxson HF staff committed on
Commit
cfcd3f9
·
1 Parent(s): 9530a4a

init version

Browse files
Files changed (2) hide show
  1. .gitignore +3 -0
  2. app.py +177 -4
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.gguf
2
+ llama-server
3
+ llama.cpp
app.py CHANGED
@@ -1,7 +1,180 @@
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
import json
import os
import subprocess
import time
from typing import Dict, Iterator, List, Optional, Tuple

import requests
import spaces  # kept ahead of gradio, as in the original import order
import gradio as gr
9
+
10
# --- Model selection -------------------------------------------------------
# System prompt used whenever the user leaves the "System" box empty.
DEFAULT_SYSTEM = "You are a helpful assistant."
# Hugging Face repository and file name of the GGUF weights to download.
HF_MODEL_ID = "bartowski/Llama-3.2-1B-Instruct-GGUF"
HF_FILENAME = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"

###############################################

# --- llama.cpp server endpoints --------------------------------------------
# Paths are appended to LLAMA_CPP_SERVER_BASE to form the request URLs.
API_PATH_HEALTH = "/health"
API_PATH_COMPLETIONS = "/chat/completions"
LLAMA_CPP_SERVER_BASE = "http://127.0.0.1:8080"
# Upper bound used while waiting for the server to report healthy.
LLAMA_CPP_SERVER_START_TIMEOUT = 50  # seconds
20
+
21
def _download(url: str, dest: str, executable: bool = False) -> None:
    """Fetch *url* into *dest* with curl, exposing *dest* only once complete.

    The data is written to ``dest + ".part"`` first and renamed afterwards, so
    an interrupted or failed download never leaves a file that the
    ``os.path.exists`` checks below would mistake for a finished one.

    Args:
        url: Location to download (redirects are followed).
        dest: Final path of the downloaded file.
        executable: When true, mark the file executable before publishing it.

    Raises:
        subprocess.CalledProcessError: if curl or chmod exits non-zero.
    """
    partial = dest + ".part"
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page as if it were the requested file.
    subprocess.check_call(["curl", "--fail", "-o", partial, "-L", url])
    if executable:
        subprocess.check_call(["chmod", "+x", partial])
    os.replace(partial, dest)


# Download the GGUF weights once; later starts reuse the local copy.
if not os.path.exists("model.gguf"):
    _download(
        f"https://huggingface.co/{HF_MODEL_ID}/resolve/main/{HF_FILENAME}",
        "model.gguf",
    )

if not os.path.exists("llama-server"):
    # FIXME: currently, we can't build inside gradio container because nvcc is missing
    _download(
        "https://ngxson-llamacpp-builder.hf.space/llama-server",
        "llama-server",
        executable=True,
    )
29
+
30
+ ###############################################
31
+
32
class Role:
    """Namespace for the role strings used in chat-completion messages."""

    SYSTEM, USER, ASSISTANT = "system", "user", "assistant"
36
+
37
# Chat history as a list of turns; each turn holds the user message at
# index 0 and the assistant reply at index 1.
History = List[Tuple[str, str]]
# Conversation as a list of dicts with "role" and "content" keys.
Messages = List[Dict[str, str]]
39
+
40
def clear_session() -> Tuple[str, History]:
    """Return the values that reset the chat UI.

    Returns:
        A pair ``("", [])``: an empty string for the input textbox and an
        empty history for the chatbot.
    """
    return "", []
42
+
43
def modify_system_session(system: str) -> Tuple[str, str, History]:
    """Apply a new system prompt and reset the conversation.

    Args:
        system: The requested system prompt. ``None`` or an empty string
            falls back to ``DEFAULT_SYSTEM``.

    Returns:
        A triple ``(system, system, [])``: the effective prompt twice (once
        for the stored state, once for the visible input box) followed by an
        empty chat history.
    """
    if not system:
        system = DEFAULT_SYSTEM
    return system, system, []
47
+
48
def history_to_messages(history: History, system: str) -> Messages:
    """Flatten a turn-based history into a role/content message list.

    The result starts with the system message; every history turn then
    contributes a user message (turn[0]) followed by an assistant message
    (turn[1]).
    """
    conversation = [{"role": Role.SYSTEM, "content": system}]
    for turn in history:
        conversation.extend((
            {"role": Role.USER, "content": turn[0]},
            {"role": Role.ASSISTANT, "content": turn[1]},
        ))
    return conversation
54
+
55
+
56
def messages_to_history(messages: Messages) -> Tuple[str, History]:
    """Split a message list back into its system prompt and chat turns.

    The first message must carry the system role. The remaining messages are
    paired up in order (1st with 2nd, 3rd with 4th, ...) and each pair becomes
    a ``[user_content, assistant_content]`` turn; a trailing unpaired message
    is dropped.
    """
    head = messages[0]
    assert head["role"] == Role.SYSTEM
    paired = zip(messages[1::2], messages[2::2])
    turns = [[asked["content"], answered["content"]] for asked, answered in paired]
    return head["content"], turns
63
+
64
def wait_until_llamacpp_ready():
    """Block until the llama.cpp server's health endpoint answers HTTP 200.

    After an initial 5-second pause, the health URL
    (``LLAMA_CPP_SERVER_BASE + API_PATH_HEALTH``) is probed roughly once per
    second. Gradio info notifications are posted when polling starts and when
    the server is ready.

    Raises:
        TimeoutError: if no 200 response arrives within
            ``LLAMA_CPP_SERVER_START_TIMEOUT`` seconds of polling.
    """
    time.sleep(5)
    gr.Info("starting llama.cpp server...")
    health_url = LLAMA_CPP_SERVER_BASE + API_PATH_HEALTH
    # Measure elapsed time rather than counting attempts, so slow probes
    # cannot stretch the wait beyond the configured limit.
    deadline = time.monotonic() + LLAMA_CPP_SERVER_START_TIMEOUT
    while True:
        try:
            # Bound each probe: a server that accepts the connection but never
            # answers must not stall the loop indefinitely.
            response = requests.get(health_url, timeout=2)
            if response.status_code == 200:
                print("Status 200 received. Exiting loop.")
                break
            print(f"Received status {response.status_code}. Retrying...")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
        if time.monotonic() >= deadline:
            raise TimeoutError("llama.cpp server did not start in time")
        time.sleep(1)  # Wait for 1 second before retrying
    gr.Info("llama.cpp server is ready.")
    print("llama.cpp server is ready.")
84
+
85
+
86
@spaces.GPU
def model_chat(query: Optional[str], history: Optional[History], system: str
               ) -> Iterator[Tuple[str, History, str]]:
    """Stream an assistant reply to *query* from a freshly started llama.cpp server.

    A ``./llama-server`` process is launched for the duration of the call,
    polled until healthy, and sent the conversation as a streaming
    chat-completions request. The process is killed and reaped when the
    generator finishes, fails, or is closed early.

    Args:
        query: The new user message; ``None`` is treated as ``""``.
        history: Previous chat turns; ``None`` is treated as an empty history.
        system: The system prompt placed at the start of the conversation.

    Yields:
        ``("", history, system)`` after every streamed data line: an empty
        string for the input box, the history including the partial assistant
        reply so far, and the system prompt.

    Raises:
        Exception: any error raised while waiting for the server or streaming
            the response is printed and then re-raised.
    """
    if query is None:
        query = ""
    if history is None:
        history = []

    # start llama.cpp server
    # NOTE(review): host and port use LLAMA_HOST / LLAMA_PORT while every other
    # setting uses the LLAMA_ARG_ prefix — confirm the server build reads them.
    server_env = dict(
        os.environ,
        LLAMA_HOST="0.0.0.0",
        LLAMA_PORT="8080",
        LLAMA_ARG_CTX_SIZE=str(1024 * 32),
        LLAMA_ARG_FLASH_ATTN="1",
        LLAMA_ARG_MODEL="model.gguf",
        LLAMA_ARG_N_PARALLEL="1",
        LLAMA_ARG_N_GPU_LAYERS="9999",
        LLAMA_ARG_NO_MMAP="1",
    )
    proc = subprocess.Popen(["./llama-server"], env=server_env)

    try:
        wait_until_llamacpp_ready()

        messages = history_to_messages(history, system)
        messages.append({"role": Role.USER, "content": query})

        # adapted from https://gist.github.com/ggorlen/7c944d73e27980544e29aa6de1f2ac54
        url = LLAMA_CPP_SERVER_BASE + API_PATH_COMPLETIONS
        headers = {
            # "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "temperature": 0.8,
            "messages": messages,
            "stream": True
        }
        prefix = "data: "
        # The context manager releases the streaming connection even when the
        # consumer stops iterating early.
        with requests.post(url, headers=headers, json=payload, stream=True) as response:
            response.raise_for_status()

            curr_text = ""
            for raw_line in response.iter_lines():
                line = raw_line.decode("utf-8")
                if not line.startswith(prefix):
                    continue
                if line.endswith("[DONE]"):
                    # End-of-stream sentinel; nothing follows it.
                    break
                event = json.loads(line[len(prefix):])
                # A delta may omit "content" or carry an explicit null (for
                # example a role-only chunk); treat both as no new text.
                chunk = event["choices"][0]["delta"].get("content") or ""
                curr_text += chunk
                system, history = messages_to_history(
                    messages + [{"role": Role.ASSISTANT, "content": curr_text}]
                )
                yield "", history, system
    except Exception as e:
        print(e)
        raise
    finally:
        # clean up: kill the server and reap it so no zombie process is left
        # behind and its port is released before the next call starts one.
        proc.kill()
        proc.wait()
148
+
149
+
150
# Build the chat UI: a system-prompt row, the chatbot, an input box and
# the action buttons, then wire each button to its handler.
with gr.Blocks() as demo:
    gr.Markdown(f"""<center><font size=6>{HF_MODEL_ID}</font></center>""")

    with gr.Row():
        with gr.Column(scale=3):
            system_input = gr.Textbox(value=DEFAULT_SYSTEM, lines=1, label="System")
        with gr.Column(scale=1):
            modify_system = gr.Button("🛠️ Set system prompt and clear history", scale=2)
        # Hidden copy of the system prompt actually in effect; it only changes
        # when the "Set system prompt" button is pressed.
        system_state = gr.Textbox(value=DEFAULT_SYSTEM, visible=False)
    chatbot = gr.Chatbot(label=HF_MODEL_ID)
    textbox = gr.Textbox(lines=2, label="Input")

    with gr.Row():
        clear_history = gr.Button("🧹 Clear history")
        submit = gr.Button("🚀 Send")

    # model_chat starts its own llama.cpp server on a fixed port for every
    # call, so concurrent calls would collide; run them one at a time.
    submit.click(model_chat,
                 inputs=[textbox, chatbot, system_state],
                 outputs=[textbox, chatbot, system_input],
                 concurrency_limit=1)
    clear_history.click(fn=clear_session,
                        inputs=[],
                        outputs=[textbox, chatbot])
    modify_system.click(fn=modify_system_session,
                        inputs=[system_input],
                        outputs=[system_state, system_input, chatbot])

demo.queue(api_open=False)
demo.launch(max_threads=5)
179
 
 
 
180