vilarin committed · verified
Commit d2fff9f · 1 Parent(s): 29890fc

Update app.py

Files changed (1)
  1. app.py +13 -7
app.py CHANGED
@@ -92,15 +92,14 @@ def ollama_func(command):
     else:
         return "No supported command."

-
+@spaces.GPU()
 def launch():
     global OLLAMA_SERVICE_THREAD
     OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
     OLLAMA_SERVICE_THREAD.start()
     print("Giving ollama serve a moment")
     time.sleep(10)
-
-@spaces.GPU()
+
 def stream_chat(message: str, history: list, model: str, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
     print(f"message: {message}")
     conversation = []
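This hunk moves the @spaces.GPU() decorator from stream_chat onto launch, so the GPU is requested when the Ollama server is launched rather than inside each chat call. A minimal sketch of the decorator pattern, assuming the spaces package available on Hugging Face ZeroGPU Spaces (the duration value is illustrative, not taken from this commit):

import spaces

@spaces.GPU(duration=60)  # a GPU is attached only while this call runs; 60s is illustrative
def launch():
    # start `ollama serve` in a background thread here, as app.py does
    ...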
@@ -116,6 +115,8 @@ def stream_chat(message: str, history: list, model: str, temperature: float, max
     response = client.chat(
         model=model,
         messages=conversation,
+        keep_alive="60s",
+        stream=True,
         options={
             'num_predict': max_new_tokens,
             'temperature': temperature,
@@ -123,11 +124,13 @@ def stream_chat(message: str, history: list, model: str, temperature: float, max
             'top_k': top_k,
             'repeat_penalty': penalty,
             'low_vram': True,
-            "keep_alive": "60s",
         },
     )
+
+    terminate()
+
     print(response)
-    return response['message']['content']
+    yield response['message']['content']



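The two hunks above move keep_alive out of options into a top-level argument of client.chat, turn on stream=True, call terminate() after the request, and switch stream_chat from return to yield. With stream=True the ollama Python client returns an iterator of partial responses instead of a single dict. A minimal sketch of that streaming call, assuming the ollama package and a local server on the default port (model name, prompt, and option values are placeholders):

import ollama

client = ollama.Client(host="http://localhost:11434")
stream = client.chat(
    model="llama3",                                   # placeholder model
    messages=[{"role": "user", "content": "Hello"}],  # placeholder prompt
    keep_alive="60s",   # unload the model 60 seconds after the request
    stream=True,        # yield partial responses instead of one final dict
    options={"num_predict": 128, "temperature": 0.8, "low_vram": True},
)
for part in stream:
    print(part["message"]["content"], end="", flush=True)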
 
@@ -152,8 +155,11 @@ def main(message: str, history: list, model: str, temperature: float, max_new_to
         top_k,
         penalty
     )
-    print(response)
-    yield response
+
+    buffer = ""
+    for chunk in response:
+        buffer += chunk
+        yield buffer



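main now consumes what stream_chat yields and re-yields a growing buffer, the usual way to stream partial text into a Gradio chat UI: each yield replaces the reply shown so far. A minimal sketch of that accumulate-and-yield pattern, assuming gradio; token_source is a hypothetical stand-in for the real stream_chat:

import gradio as gr

def token_source(message):
    # hypothetical stand-in for stream_chat(); the real app streams from ollama
    for piece in ("Hel", "lo ", "wor", "ld"):
        yield piece

def main(message, history):
    buffer = ""
    for chunk in token_source(message):
        buffer += chunk
        yield buffer  # each yield updates the partial reply in the UI

demo = gr.ChatInterface(main)
demo.launch()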
 
 