sawac committed
Commit e4456e8 · verified · 1 parent: abbe10d

Update app.py

Files changed (1):
  1. app.py +58 -26
app.py CHANGED
@@ -2,28 +2,52 @@ import gradio as gr
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 import os
+import threading
+import time
 
-# Specify the model on the Hugging Face Hub
-repo_id = "ineair/llm-jp-3-3.7b-instruct-EZO-Humanities-gguf"
-filename = "llm-jp-3-3.7b-instruct-EZO-Humanities-f16.gguf"
+repo_id = "mmnga/ELYZA-japanese-Llama-2-7b-instruct-gguf"
+filename = "ELYZA-japanese-Llama-2-7b-instruct-q4_K_M.gguf"
 
-# Download the model (the cache is used if it is already cached)
-model_path = hf_hub_download(repo_id=repo_id, filename=filename)
-
-CONTEXT_SIZE = 4096
+CONTEXT_SIZE = 2048
+N_THREADS = min(os.cpu_count(), 4)
 
-llm = Llama(
-    model_path=model_path,
-    n_threads=os.cpu_count(),
-    n_batch=32,
-    verbose=False,
-    n_ctx=CONTEXT_SIZE,
-)
+llm = None
+model_loaded = False
+loading_progress = 0
+
+def load_model():
+    global llm, model_loaded, loading_progress
+    loading_progress = 0
+    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
+    loading_progress = 50
+    llm = Llama(
+        model_path=model_path,
+        n_threads=N_THREADS,
+        n_batch=32,
+        verbose=False,
+        n_ctx=CONTEXT_SIZE,
+    )
+    loading_progress = 100
+    model_loaded = True
+
+def get_loading_status():
+    global loading_progress
+    return loading_progress
 
 def get_llama_response(prompt):
-    return llm(prompt, max_tokens=2048, temperature=0.7, top_p=0.95, repeat_penalty=1.1, stream=True)
+    global llm, model_loaded
+    if not model_loaded:
+        return [{"choices": [{"text": "モデルを読み込んでいます。しばらくお待ちください..."}]}]
+    try:
+        return llm(prompt, max_tokens=1024, temperature=0.7, top_p=0.95, repeat_penalty=1.1, stream=True)
+    except Exception as e:
+        return [{"choices": [{"text": f"エラーが発生しました: {str(e)}"}]}]
 
 def greet(prompt, intensity):
+    global model_loaded
+    if not model_loaded:
+        return "モデルを読み込んでいます。しばらくお待ちください..."
+
     full_response = ""
     for output in get_llama_response(prompt):
         if len(output['choices']) > 0:
@@ -33,17 +57,25 @@ def greet(prompt, intensity):
 
     return full_response + "!" * int(intensity)
 
-demo = gr.Interface(
-    title="Llama.cpp-python-sample (Streaming)",
-    description=f"MODEL: {filename} from {repo_id}",
-    fn=greet,
-    inputs=[
-        gr.Textbox(label="Enter your prompt"),
-        gr.Slider(minimum=0, maximum=10, step=1, label="Intensity")
-    ],
-    outputs=gr.Textbox(label="Generated Response"),
-    live=False
-)
+# Load the model asynchronously
+threading.Thread(target=load_model, daemon=True).start()
+
+with gr.Blocks() as demo:
+    gr.Markdown("# Llama.cpp-python-sample (Streaming)")
+    gr.Markdown(f"MODEL: {filename} from {repo_id}")
+
+    with gr.Row():
+        input_text = gr.Textbox(label="Enter your prompt")
+        intensity = gr.Slider(minimum=0, maximum=10, step=1, label="Intensity")
+
+    output_text = gr.Textbox(label="Generated Response")
+    submit_button = gr.Button("Submit")
+
+    loading_bar = gr.progressbar(label="Model Loading Progress", max=100)
+
+    submit_button.click(fn=greet, inputs=[input_text, intensity], outputs=output_text)
+    demo.load(fn=get_loading_status, outputs=loading_bar, every=1)
 
 demo.queue()
-demo.launch()
+if __name__ == "__main__":
+    demo.launch()
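Two caveats on the new UI code. Gradio does not ship a gr.progressbar component, so building the layout as committed would raise an AttributeError; a non-interactive gr.Slider (or a gr.Number) refreshed through demo.load(..., every=1) can display the polled percentage instead. Also, although the title still says "(Streaming)", greet drains the whole generator before returning, so the output Textbox only updates once; with gr.Blocks and demo.queue(), a generator callback that yields the growing text streams incrementally. The sketch below combines both ideas. It is a minimal, self-contained illustration, not the committed app: the fake_load helper, greet_stream, and the word-splitting loop are stand-ins I introduce for hf_hub_download and the llama.cpp call.

```python
import time
import threading

import gradio as gr

# Illustrative sketch only: not the committed code. fake_load() and the
# word-splitting loop stand in for the model download and llm(...) call
# so the example runs on its own.

loading_progress = 0
model_loaded = False

def fake_load():
    """Pretend to download and initialise the model, updating progress."""
    global loading_progress, model_loaded
    for pct in (25, 50, 75, 100):
        time.sleep(1)              # stands in for download + Llama(...) init
        loading_progress = pct
    model_loaded = True

def get_loading_status():
    return loading_progress

def greet_stream(prompt, intensity):
    # Generator callback: each yield replaces the Textbox contents,
    # which is how incremental "streaming" output works with gr.Blocks.
    if not model_loaded:
        yield "モデルを読み込んでいます。しばらくお待ちください..."
        return
    full_response = ""
    for token in prompt.split():   # stands in for llm(prompt, ..., stream=True)
        full_response += token + " "
        time.sleep(0.2)
        yield full_response
    yield full_response + "!" * int(intensity)

# Load the (fake) model in the background, as in the committed app.
threading.Thread(target=fake_load, daemon=True).start()

with gr.Blocks() as demo:
    gr.Markdown("# Llama.cpp-python-sample (Streaming)")

    with gr.Row():
        input_text = gr.Textbox(label="Enter your prompt")
        intensity = gr.Slider(minimum=0, maximum=10, step=1, label="Intensity")

    output_text = gr.Textbox(label="Generated Response")
    submit_button = gr.Button("Submit")

    # Gradio has no standalone progress-bar component; a read-only slider
    # (or gr.Number) can display the polled loading percentage instead.
    loading_bar = gr.Slider(minimum=0, maximum=100, value=0,
                            label="Model Loading Progress (%)", interactive=False)

    submit_button.click(fn=greet_stream, inputs=[input_text, intensity],
                        outputs=output_text)
    demo.load(fn=get_loading_status, outputs=loading_bar, every=1)

demo.queue()
if __name__ == "__main__":
    demo.launch()
```

The every=1 polling argument mirrors the committed demo.load call; on newer Gradio releases the same periodic refresh may need to be driven by a gr.Timer instead.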