Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -9,7 +9,6 @@ OLLAMA = os.path.expanduser("~/ollama")
 process = None
 OLLAMA_SERVICE_THREAD = None
 
-
 if not os.path.exists(OLLAMA):
     subprocess.run("curl -L https://ollama.com/download/ollama-linux-amd64 -o ~/ollama", shell=True)
     os.chmod(OLLAMA, 0o755)
@@ -27,7 +26,6 @@ def terminate():
     OLLAMA_SERVICE_THREAD.join()
     process = None
     OLLAMA_SERVICE_THREAD = None
-    os.system("systemctl stop ollama.service")
     return "Ollama service stopped."
 
 # Uncomment and modify the model to what you want locally
@@ -94,7 +92,7 @@ def ollama_func(command):
     else:
         return "No supported command."
 
-
+
 def launch():
     global OLLAMA_SERVICE_THREAD
     OLLAMA_SERVICE_THREAD = threading.Thread(target=ollama_service_thread)
@@ -102,8 +100,41 @@ def launch():
     print("Giving ollama serve a moment")
     time.sleep(10)
 
+@spaces.GPU()
 def stream_chat(message: str, history: list, model: str, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
     print(f"message: {message}")
+    conversation = []
+    for prompt, answer in history:
+        conversation.extend([
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": answer},
+        ])
+    conversation.append({"role": "user", "content": message})
+
+    print(f"Conversation is -\n{conversation}")
+
+    response = client.chat(
+        model=model,
+        messages=conversation,
+        stream=True,
+        options={
+            'num_predict': max_new_tokens,
+            'temperature': temperature,
+            'top_p': top_p,
+            'top_k': top_k,
+            'repeat_penalty': penalty,
+            'low_vram': True,
+            "keep_alive": "60s",
+        },
+    )
+
+    buffer = ""
+    for chunk in response:
+        buffer += chunk["message"]["content"]
+        yield buffer
+
+
+def main(message: str, history: list, model: str, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
     if message.startswith("/"):
         resp = ollama_func(message)
         yield resp
@@ -111,38 +142,19 @@ def stream_chat(message: str, history: list, model: str, temperature: float, max
     if not INIT_SIGN:
         yield "Please initialize Ollama"
     else:
-
-
-
-
-
-
-
-
-
-
-
-        response = client.chat(
-            model=model,
-            messages=conversation,
-            stream=True,
-            options={
-                'num_predict': max_new_tokens,
-                'temperature': temperature,
-                'top_p': top_p,
-                'top_k': top_k,
-                'repeat_penalty': penalty,
-                'low_vram': True,
-            },
+        if process:
+            launch()
+        yield from stream_chat(
+            message,
+            history,
+            model,
+            temperature,
+            max_new_tokens,
+            top_p,
+            top_k,
+            penalty
         )
 
-        terminate()
-
-        buffer = ""
-        for chunk in response:
-            buffer += chunk["message"]["content"]
-            yield buffer
-
 
 chatbot = gr.Chatbot(height=600, placeholder=DESCRIPTION)
 
@@ -150,7 +162,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
     gr.HTML(TITLE)
     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
     gr.ChatInterface(
-        fn=
+        fn=main,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
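For reference, below is a minimal sketch of the streaming pattern the updated stream_chat relies on: the `ollama` Python client (which this Space's `client` appears to be) called with `stream=True` and per-request sampling options. It assumes a locally running `ollama serve` and an already pulled model; the host, model name, option values, and the `stream_reply` helper are illustrative placeholders, not values taken from this Space.

import ollama

# Assumed default Ollama host/port; adjust if `ollama serve` listens elsewhere.
client = ollama.Client(host="http://127.0.0.1:11434")

def stream_reply(messages, model="llama3"):  # hypothetical model name
    response = client.chat(
        model=model,
        messages=messages,
        stream=True,                 # yields chunks instead of one full reply
        options={
            "num_predict": 256,      # cap on generated tokens
            "temperature": 0.8,
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
        },
        keep_alive="60s",            # recent ollama-python releases accept this to keep the model loaded briefly
    )
    buffer = ""
    for chunk in response:           # each chunk carries a partial assistant message
        buffer += chunk["message"]["content"]
        yield buffer                 # yield the accumulated text, as the Space does

# Usage: print the final accumulated reply.
if __name__ == "__main__":
    history = [{"role": "user", "content": "Hello!"}]
    final = ""
    for partial in stream_reply(history):
        final = partial
    print(final)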
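A companion sketch of the dispatcher wired into gr.ChatInterface: a plain generator screens slash commands and readiness, then forwards the streamed chunks from the GPU-decorated generator with `yield from`. The decorator stand-in, echo logic, and helper names below are placeholders for this sketch, not the Space's actual helpers.

import gradio as gr

def gpu_decorator(fn):
    # Stand-in for spaces.GPU() so the sketch runs outside a ZeroGPU Space.
    return fn

@gpu_decorator
def stream_chat(message, history):
    # Placeholder for the Ollama-backed streaming generator shown in the diff.
    reply = f"echo: {message}"
    for i in range(1, len(reply) + 1):
        yield reply[:i]              # emit the growing partial reply

def handle_command(message):
    # Placeholder for ollama_func-style slash-command handling.
    return f"handled {message}"

def main(message, history):
    if message.startswith("/"):
        yield handle_command(message)              # commands short-circuit the chat path
    else:
        yield from stream_chat(message, history)   # forward streamed chunks to the UI

demo = gr.ChatInterface(fn=main)
# demo.launch()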