File size: 4,252 Bytes
832c4a9
658af68
b9e87be
2bb32bf
b9e87be
 
658af68
c35e301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee7c5db
c35e301
 
 
 
 
 
 
acf75de
c35e301
 
 
 
acf75de
c35e301
 
 
 
 
 
acf75de
 
 
305cc56
 
acf75de
 
 
 
 
 
 
 
 
 
 
 
 
aa6fb2d
acf75de
 
 
 
 
 
 
 
 
c35e301
acf75de
 
 
 
 
 
 
 
c35e301
acf75de
 
 
c35e301
acf75de
 
 
 
 
 
 
c35e301
 
efa5b9e
acf75de
 
 
 
 
aa6fb2d
ee7c5db
 
7d93b52
ee7c5db
 
 
832c4a9
ee7c5db
 
 
 
 
 
 
 
 
 
 
 
 
0af3958
 
658af68
ee7c5db
 
 
 
658af68
 
ee7c5db
d2a44a5
286c580
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import spaces  # 必须在最顶部导入
import gradio as gr
import os

# Hub repository identifiers: the base model and the adapter applied on top of it
base_model_name = "unsloth/meta-llama-3.1-8b-bnb-4bit"
adapter_model_name = "larry1129/WooWoof_AI"

# Hugging Face access token read from the environment (None when HF_API_TOKEN is unset)
hf_token = os.environ.get("HF_API_TOKEN")

# Module-level cache slots for the model and tokenizer; both start out empty
model = tokenizer = None

# Prompt construction helper
def generate_prompt(instruction, input_text=""):
    """Build the instruction-style prompt string for one request.

    The prompt always starts with an ``### Instruction:`` section and ends
    with an empty ``### Response:`` section. An ``### Input:`` section is
    placed between them only when ``input_text`` is truthy.

    Args:
        instruction: Text placed under the ``### Instruction:`` heading.
        input_text: Optional extra context; omitted from the prompt when
            empty or otherwise falsy.

    Returns:
        The assembled prompt, terminated by a newline after
        ``### Response:``.
    """
    sections = [f"### Instruction:\n{instruction}\n"]
    if input_text:
        sections.append(f"### Input:\n{input_text}\n")
    sections.append("### Response:\n")
    return "".join(sections)

# Best-effort installer for the quantization backend used by the model load
def _ensure_bitsandbytes():
    """Install ``bitsandbytes`` with pip when it cannot be found.

    A failed install is reported but not raised here; the subsequent model
    load is left to surface the resulting import error.
    """
    import importlib
    import importlib.util
    import subprocess
    import sys

    if importlib.util.find_spec("bitsandbytes") is not None:
        return

    # Run pip through the current interpreter so the package is installed
    # into this environment rather than whichever ``pip`` is first on PATH.
    status = subprocess.call(
        [sys.executable, "-m", "pip", "install", "--upgrade", "bitsandbytes"]
    )
    if status != 0:
        print(f"bitsandbytes installation failed (pip exit code {status}).")
    # Let the import system notice a package installed after startup.
    importlib.invalidate_caches()


# Loads tokenizer, base model and adapter, then fills the module-level cache
def _load_model():
    """Load the tokenizer and adapter-wrapped model into the global cache.

    The globals ``model`` and ``tokenizer`` are assigned only after every
    step has succeeded, so a failed load never leaves them half-initialised.

    Raises:
        Exception: Whatever the underlying libraries raise; the error is
            printed and re-raised with its original traceback.
    """
    global model, tokenizer

    print("开始加载模型...")
    _ensure_bitsandbytes()

    try:
        # Heavy libraries are imported lazily, inside the GPU-decorated call path.
        import torch
        from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

        from peft import PeftModel

        # 4-bit NF4 quantization with double quantization and fp16 compute
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # ``token`` replaces the deprecated ``use_auth_token`` argument.
        loaded_tokenizer = AutoTokenizer.from_pretrained(base_model_name, token=hf_token)
        print("分词器加载成功。")

        # NOTE(review): trust_remote_code=True allows code from the Hub repo
        # to run locally — confirm it is actually needed for this base model.
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=bnb_config,
            device_map="auto",
            token=hf_token,
            trust_remote_code=True,
        )
        print("基础模型加载成功。")

        loaded_model = PeftModel.from_pretrained(
            base_model,
            adapter_model_name,
            torch_dtype=torch.float16,
            token=hf_token,
        )
        print("适配器模型加载成功。")

        # Reuse the EOS token for padding and mirror it in the model config
        loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
        loaded_model.config.pad_token_id = loaded_tokenizer.pad_token_id

        loaded_model.eval()
        print("模型已切换到评估模式。")
    except Exception as e:
        print("加载模型时出错:", e)
        raise

    tokenizer = loaded_tokenizer
    model = loaded_model


# Response generation entry point, wrapped with @spaces.GPU
@spaces.GPU(duration=40)  # NOTE: the original author suggests raising duration to 120
def generate_response(instruction, input_text):
    """Generate the model's reply for an instruction and optional input.

    The model and tokenizer are loaded on the first call and reused from the
    module-level cache afterwards. Sampling is used (temperature 0.7,
    top_p 0.95) with at most 128 new tokens.

    Args:
        instruction: Instruction text for the prompt.
        input_text: Optional additional input; may be empty.

    Returns:
        The generated text with the prompt removed and surrounding
        whitespace stripped.

    Raises:
        Exception: Propagated from model loading or generation.
    """
    if model is None or tokenizer is None:
        _load_model()

    import torch

    prompt = generate_prompt(instruction, input_text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the tokens produced after the prompt. Splitting the decoded
    # text on "### Response:" breaks when the model emits that marker itself.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Build the Gradio interface: two text inputs (instruction, optional input)
# mapped onto generate_response, with a plain-text output and flagging disabled.
iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(lines=2, placeholder="请输入指令...", label="Instruction"),
        gr.Textbox(lines=2, placeholder="如果有额外输入,请在此填写...", label="Input (可选)")
    ],
    outputs="text",
    title="WooWoof AI 交互式聊天",
    description="基于 LLAMA 3.1 的大语言模型,支持指令和可选输入。",
    allow_flagging="never"
)

# Start the app. `api_open` is an option of queue(), not launch(), so it is
# set on the queue before launching.
iface.queue(api_open=True).launch()