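# Gradio chat demo that streams responses from a quantized Qwen2.5-1.5B-Instruct
# GGUF model via llama-cpp-python, sized for a 2-vCPU (free tier) Space.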
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

repo_id = "bartowski/Qwen2.5-1.5B-Instruct-GGUF"
filename = "Qwen2.5-1.5B-Instruct-Q8_0.gguf"

CONTEXT_SIZE = 1024
N_THREADS = 2  # the free tier provides 2 CPU cores

llm = None
model_loaded = False

def load_model(progress=gr.Progress()):
    """Download the GGUF model and load it into memory with llama.cpp."""
    global llm, model_loaded
    progress(0, desc="Starting model download")
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    progress(0.5, desc="Loading model into memory")

    llm = Llama(
        model_path=model_path,
        n_threads=N_THREADS,
        n_batch=8,
        verbose=False,
        n_ctx=CONTEXT_SIZE,
    )
    progress(1, desc="Model loaded")
    model_loaded = True
    return "Model loading complete."
    

def get_llama_response(prompt, temperature):
    """Return a streaming completion iterator; on failure, return a list that mimics its format."""
    global llm, model_loaded
    if not model_loaded:
        return [{"choices": [{"text": "The model is still loading. Please wait..."}]}]
    try:
        return llm(prompt, max_tokens=1024, temperature=temperature, top_p=0.95, repeat_penalty=1.1, stream=True)
    except Exception as e:
        return [{"choices": [{"text": f"An error occurred: {str(e)}"}]}]


def greet(prompt, temperature):
    """Stream the growing response to the UI one chunk at a time."""
    global model_loaded
    if not model_loaded:
        # greet() is a generator, so the message must be yielded rather than returned
        yield "The model is still loading. Please wait..."
        return

    full_response = ""
    for output in get_llama_response(prompt, temperature):
        if len(output['choices']) > 0:
            text_chunk = output['choices'][0]['text']
            full_response += text_chunk
            yield full_response


with gr.Blocks() as demo:
    gr.Markdown("# LLM Chatbot (Streaming)")
    gr.HighlightedText(
        value=[("", None),
               ("This is an LLM ", "positive"),
               ("test application", "neutral"), (".\n", None),
               ("The content is experimental", "neutral"), (", so ", None),
               ("do not use it for important decisions", "negative"),
               (".", None)
              ],
        label="Notice",
        show_label=False,
    )

    with gr.Row():
        input_text = gr.Textbox(label="Enter your prompt")
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
    
    output_text = gr.Textbox(label="Generated response")
    submit_button = gr.Button("Submit")

    gr.Textbox(value=filename, label="Model", interactive=False)
    loading_status = gr.Textbox(label="Loading Status")
    
    
    submit_button.click(fn=greet, inputs=[input_text, temperature], outputs=output_text)
    input_text.submit(fn=greet, inputs=[input_text, temperature], outputs=output_text) 
    demo.load(fn=load_model, outputs=loading_status)


demo.queue()
demo.launch()