import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os
import threading
import time
repo_id = "bartowski/Qwen2.5-1.5B-Instruct-GGUF"
filename = "Qwen2.5-1.5B-Instruct-Q8_0.gguf"
CONTEXT_SIZE = 1024
N_THREADS = 2  # the free Spaces CPU tier has 2 cores
llm = None
model_loaded = False
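# load_model is wired to demo.load at the bottom of the UI block, so the download
# and llama.cpp initialisation run when the page is opened; the model_loaded flag
# lets the handlers below respond gracefully to requests that arrive earlier.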
def load_model(progress=gr.Progress()):
    global llm, model_loaded
    progress(0, desc="Starting model download")
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    progress(0.5, desc="Loading the model into memory")
    llm = Llama(
        model_path=model_path,
        n_threads=N_THREADS,
        n_batch=8,
        verbose=False,
        n_ctx=CONTEXT_SIZE,
    )
    progress(1, desc="Model loading finished")
    model_loaded = True
    return "Model loading is complete."
def get_llama_response(prompt, temperature):
    global llm, model_loaded
    if not model_loaded:
        return [{"choices": [{"text": "The model is still loading. Please wait..."}]}]
    try:
        return llm(prompt, max_tokens=1024, temperature=temperature, top_p=0.95, repeat_penalty=1.1, stream=True)
    except Exception as e:
        return [{"choices": [{"text": f"An error occurred: {str(e)}"}]}]
def greet(prompt, temperature):
    global model_loaded
    if not model_loaded:
        # greet is a generator, so the message must be yielded rather than returned.
        yield "The model is still loading. Please wait..."
        return
    full_response = ""
    for output in get_llama_response(prompt, temperature):
        if len(output['choices']) > 0:
            text_chunk = output['choices'][0]['text']
            full_response += text_chunk
            yield full_response
with gr.Blocks() as demo:
    gr.Markdown("# LLM Chatbot (Streaming)")
    gr.HighlightedText(
        value=[
            ("This is an ", None),
            ("LLM", "positive"),
            (" ", None),
            ("test application", "neutral"),
            (".\n", None),
            ("Because its content is ", None),
            ("experimental", "neutral"),
            (", please ", None),
            ("do not use it for important decisions", "negative"),
            (".", None),
        ],
        label="Notice",
        show_label=False,
    )
    with gr.Row():
        input_text = gr.Textbox(label="Enter a prompt")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
    output_text = gr.Textbox(label="Generated response")
    submit_button = gr.Button("Submit")
    gr.Textbox(value=filename, label="Model", interactive=False)
    loading_status = gr.Textbox(label="Loading Status")
    submit_button.click(fn=greet, inputs=[input_text, temperature], outputs=output_text)
    input_text.submit(fn=greet, inputs=[input_text, temperature], outputs=output_text)
    demo.load(fn=load_model, outputs=loading_status)

demo.queue()
demo.launch()