import gradio as gr
from llama_cpp import Llama
# ๋ชจ๋ธ ํŒŒ์ผ ๊ฒฝ๋กœ (Hugging Face Hub์—์„œ ๋‹ค์šด๋กœ๋“œ)
MODEL_REPO_ID = "kimhyunwoo/KOONE"
MODEL_FILENAME = "KOONE-3.5-2.4B-Instruct-Q4_K_M.gguf"
# Llama ๊ฐ์ฒด ์ƒ์„ฑ (CPU๋งŒ ์‚ฌ์šฉํ•˜๋ฏ€๋กœ n_gpu_layers๋Š” 0 ๋˜๋Š” ์„ค์ •ํ•˜์ง€ ์•Š์Œ)
# n_threads๋ฅผ ์‹œ์Šคํ…œ CPU ์ฝ”์–ด ์ˆ˜์— ๋งž๊ฒŒ ์กฐ์ ˆ (๋˜๋Š” ์ƒ๋žตํ•˜์—ฌ ์ž๋™ ์„ค์ •)
llm = Llama(
model_path="", # model_path๋Š” ๋น„์›Œ๋‘๊ณ  from_pretrained ์‚ฌ์šฉ
repo_id=MODEL_REPO_ID,
filename=MODEL_FILENAME,
n_ctx=2048, # ์ปจํ…์ŠคํŠธ ๊ธธ์ด. ๋ชจ๋ธ์— ๋งž๊ฒŒ ์„ค์ •.
n_threads=8, # CPU ์“ฐ๋ ˆ๋“œ ์ˆ˜ (์‹œ์Šคํ…œ์— ๋งž๊ฒŒ ์กฐ์ ˆ)
verbose=False, # ํ•„์š”ํ•˜๋ฉด True๋กœ ๋ณ€๊ฒฝ
)
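
# Equivalent two-step alternative (a sketch): fetch the GGUF explicitly with
# huggingface_hub and hand Llama a local path, e.g. to manage the download and
# cache location yourself:
#
#   from huggingface_hub import hf_hub_download
#   model_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=MODEL_FILENAME)
#   llm = Llama(model_path=model_path, n_ctx=2048, n_threads=8, verbose=False)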

def generate_text(prompt, system_prompt, max_tokens, temperature, top_p):
    """Send the prompt to the model and return the generated text."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=False,  # Return the whole completion at once (no streaming).
    )
    generated_text = output["choices"][0]["message"]["content"]
    return generated_text
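
# Optional streaming variant (a sketch; not wired into the interface below).
# With stream=True, create_chat_completion yields incremental chunks; each
# chunk's new text sits under choices[0]["delta"]["content"]. A Gradio fn
# that yields strings is rendered progressively in the output textbox.
def generate_text_stream(prompt, system_prompt, max_tokens, temperature, top_p):
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        partial += delta.get("content", "") or ""  # first chunk carries only the role
        yield partial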

# Define the Gradio interface.
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Prompt (Question)"),
        gr.Textbox(
            lines=2,
            label="System Prompt (optional)",
            # Default system prompt: "You are a helpful Korean-language assistant."
            value="당신은 도움이 되는 한국어 어시스턴트입니다.",
        ),
        gr.Slider(minimum=16, maximum=512, step=16, label="Max Tokens", value=128),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label="Temperature", value=0.7),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, label="Top P", value=0.9),
    ],
    outputs=gr.Textbox(label="Generated Text (Answer)"),
    title="KOONE Chatbot (CPU Only)",
    description="Enter a question and click Submit to generate an answer.",
)
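
# On a CPU-only Space each request can take a while; a hedged option is to call
# iface.queue() before launch() so simultaneous submissions wait in line
# instead of running concurrently.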
iface.launch()