### app.py

```python
import socket

import gradio as gr
from openai import OpenAI


def get_local_ip():
    """Return this machine's LAN IP, falling back to 127.0.0.1."""
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("8.8.8.8", 80))  # no packets are sent; this just selects the outbound interface
        ip = s.getsockname()[0]
    except Exception:
        ip = "127.0.0.1"
    finally:
        s.close()
    return ip


print("Local IP:", get_local_ip())

# ✅ Point the OpenAI client at the local llama.cpp API
client = OpenAI(
    base_url="http://127.0.0.1:8000/v1",  # connect to localhost (0.0.0.0 is a bind address, not a destination)
    api_key="sk-local",  # llama.cpp doesn't validate the key; any non-empty value works
    timeout=600,
)


# ✅ Response function (streaming generator)
def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)
    messages.append({"role": "user", "content": message})
    try:
        stream = client.chat.completions.create(
            model="qwen3",
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            reasoning_effort="low",
            stream=True,
        )
        output = ""
        skip_tokens = ["<|channel|>", "analysis"]
        skipped = {token: False for token in skip_tokens}  # track whether each token has been skipped yet
        for chunk in stream:
            print("[DEBUG] chunk:", chunk)
            delta = chunk.choices[0].delta
            if delta and delta.content:
                content = delta.content.strip()
                # Skip this token if it is in the skip list and hasn't been skipped yet
                if content in skip_tokens and not skipped[content]:
                    skipped[content] = True
                    continue
                output += delta.content  # append to the running output
                yield {"role": "assistant", "content": output}
    except Exception as e:
        print(f"[Error] {e}")
        yield {"role": "assistant", "content": "⚠️ The llama.cpp server is not responding. Please try again later."}


# ✅ Gradio interface (newer Gradio versions require type="messages")
demo = gr.ChatInterface(
    respond,
    type="messages",  # 🔑 OpenAI-style message format
    additional_inputs=[
        gr.Textbox(value="You are a friendly assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=16383, value=4096, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```
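Before launching the UI, it can help to confirm the endpoint answers at all. Below is a minimal sketch of a smoke test, under the same assumptions as `app.py`: the llama.cpp server is already listening on `127.0.0.1:8000` and exposes the model under the name `qwen3`. The script itself (`smoke_test.py`) is a hypothetical helper, not part of the app.

```python
# smoke_test.py — hypothetical helper; assumes the llama.cpp server is running
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="sk-local")

# One short, non-streaming completion to confirm the server responds.
resp = client.chat.completions.create(
    model="qwen3",
    messages=[{"role": "user", "content": "Reply with a single short sentence."}],
    max_tokens=32,
)
print(resp.choices[0].message.content)
```

Run the app with `python app.py`; Gradio serves on http://127.0.0.1:7860 by default. The local IP printed at startup is mainly useful if you launch with `demo.launch(server_name="0.0.0.0")` so other machines on the LAN can reach the UI.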