Update app.py
app.py CHANGED
```diff
@@ -92,15 +92,14 @@ def ollama_func(command):
     else:
         return "No supported command."

-
+@spaces.GPU()
 def launch():
     global OLLAMA_SERVICE_THREAD
     OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
     OLLAMA_SERVICE_THREAD.start()
     print("Giving ollama serve a moment")
     time.sleep(10)
-
-@spaces.GPU()
+
 def stream_chat(message: str, history: list, model: str, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
     print(f"message: {message}")
     conversation = []
@@ -116,6 +115,8 @@ def stream_chat(message: str, history: list, model: str, temperature: float, max
     response = client.chat(
         model=model,
         messages=conversation,
+        keep_alive="60s",
+        stream=True,
         options={
             'num_predict': max_new_tokens,
             'temperature': temperature,
@@ -123,11 +124,13 @@ def stream_chat(message: str, history: list, model: str, temperature: float, max
             'top_k': top_k,
             'repeat_penalty': penalty,
             'low_vram': True,
-            "keep_alive": "60s",
         },
     )
+
+    terminate()
+
     print(response)
-
+    yield response['message']['content']

@@ -152,8 +155,11 @@ def main(message: str, history: list, model: str, temperature: float, max_new_to
         top_k,
         penalty
     )
-
-
+
+    buffer = ""
+    for chunk in response:
+        buffer += chunk
+        yield buffer
```
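For context, a minimal sketch of how a streamed `client.chat` call like the one this commit introduces is typically consumed with the ollama Python client; the host, model name, and prompt below are placeholders, not values taken from this Space. With `stream=True`, `client.chat` returns an iterator of chunk dicts, and the incremental text is read from `chunk['message']['content']`.

```python
import ollama

# Placeholder host and model tag; the Space's actual values may differ.
client = ollama.Client(host="http://localhost:11434")

stream = client.chat(
    model="qwen2:1.5b",
    messages=[{"role": "user", "content": "Hello!"}],
    keep_alive="60s",   # unload the model 60 seconds after the last request
    stream=True,        # return an iterator of partial responses
    options={
        "num_predict": 256,
        "temperature": 0.8,
        "top_k": 20,
        "repeat_penalty": 1.2,
    },
)

# Accumulate the streamed text; a Gradio handler would typically yield the
# growing buffer on each chunk instead of printing it once at the end.
buffer = ""
for chunk in stream:
    buffer += chunk["message"]["content"]
print(buffer)
```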