support reasoning tag
README.md CHANGED

```diff
@@ -8,7 +8,7 @@ sdk_version: 1.44.1
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: Run GGUF models
+short_description: Run GGUF models with llama.cpp
 ---
 
 This Streamlit app enables **chat-based inference** on various GGUF models using `llama.cpp` and `llama-cpp-python`.
@@ -26,6 +26,8 @@ This Streamlit app enables **chat-based inference** on various GGUF models using
 - Model selection in the sidebar
 - Customizable system prompt and generation parameters
 - Chat-style UI with streaming responses
+- **Markdown output rendering** for readable, styled output
+- **DeepSeek-compatible `<think>` tag handling** — shows model reasoning in a collapsible expander
 
 ### 🧠 Memory-Safe Design (for HuggingFace Spaces):
 - Loads only **one model at a time** to prevent memory bloat
```
app.py CHANGED

```diff
@@ -4,6 +4,7 @@ from huggingface_hub import hf_hub_download
 import os
 import gc
 import shutil
+import re
 
 # Available models
 MODELS = {
@@ -184,6 +185,13 @@ if user_input:
         if "choices" in chunk:
             delta = chunk["choices"][0]["delta"].get("content", "")
             full_response += delta
-
+            visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+            response_area.markdown(visible)
 
     st.session_state.chat_history.append({"role": "assistant", "content": full_response})
+
+    thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+    if thinking:
+        with st.expander("🧠 Model's Internal Reasoning"):
+            for t in thinking:
+                st.markdown(t.strip())
```
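For reference, here is a minimal standalone sketch of the same `<think>` handling, runnable outside Streamlit. It only exercises the two regular expressions added above; the helper name `split_reasoning` and the sample string are illustrative, not part of the commit.

```python
import re

# Matches a complete <think>...</think> block, across newlines (re.DOTALL),
# mirroring the patterns used in app.py above.
THINK_RE = re.compile(r"<think>(.*?)</think>", re.DOTALL)

def split_reasoning(full_response: str) -> tuple[str, list[str]]:
    """Split a model response into (visible_text, reasoning_blocks)."""
    reasoning = [block.strip() for block in THINK_RE.findall(full_response)]
    visible = THINK_RE.sub("", full_response).strip()
    return visible, reasoning

if __name__ == "__main__":
    sample = "<think>The user greets me; answer briefly.</think>Hello! How can I help?"
    visible, reasoning = split_reasoning(sample)
    print(visible)    # Hello! How can I help?
    print(reasoning)  # ['The user greets me; answer briefly.']
```

As in the committed code, only complete `<think>...</think>` pairs are stripped, so while a response is still streaming an unterminated block remains visible until its closing tag arrives.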