admin committed
Commit 404b247 · 1 Parent(s): ab18e79

combine llms & deepseek

Files changed (5):
  1. .gitignore +2 -0
  2. app.py +10 -92
  3. deepseek.py +96 -0
  4. llms.py +165 -0
  5. requirements.txt +3 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *__pycache__*
+ test.*
app.py CHANGED
@@ -1,96 +1,14 @@
- import torch
  import gradio as gr
- from threading import Thread
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
-
- MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
- MODEL_NAME = MODEL_ID.split("/")[-1]
- CONTEXT_LENGTH = 16000
- DESCRIPTION = f"This is a HuggingFace deployment instance of {MODEL_NAME} model, if you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
-
-
- def predict(
-     message,
-     history,
-     system_prompt,
-     temperature,
-     max_new_tokens,
-     top_k,
-     repetition_penalty,
-     top_p,
- ):
-     # Format history with a given chat template
-     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
-     instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
-     for user, assistant in history:
-         instruction += f"<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n"
-
-     instruction += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
-     try:
-         if device == torch.device("cpu"):
-             raise EnvironmentError(
-                 "If you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
-             )
-
-         streamer = TextIteratorStreamer(
-             tokenizer,
-             skip_prompt=True,
-             skip_special_tokens=True,
-         )
-         enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
-         input_ids, attention_mask = enc.input_ids, enc.attention_mask
-         if input_ids.shape[1] > CONTEXT_LENGTH:
-             input_ids = input_ids[:, -CONTEXT_LENGTH:]
-             attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
-
-         generate_kwargs = dict(
-             input_ids=input_ids.to(device),
-             attention_mask=attention_mask.to(device),
-             streamer=streamer,
-             do_sample=True,
-             temperature=temperature,
-             max_new_tokens=max_new_tokens,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-             top_p=top_p,
-         )
-         t = Thread(target=model.generate, kwargs=generate_kwargs)
-         t.start()
-
-     except Exception as e:
-         streamer = f"{e}"
-
-     outputs = []
-     for new_token in streamer:
-         outputs.append(new_token)
-         if new_token in stop_tokens:
-             break
-
-         yield "".join(outputs)
-

  if __name__ == "__main__":
-     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     if device == torch.device("cuda"):
-         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-         model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
-
-     # Create Gradio interface
-     gr.ChatInterface(
-         predict,
-         title=f"{MODEL_NAME} Deployment Instance",
-         description=DESCRIPTION,
-         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
-         additional_inputs=[
-             gr.Textbox(
-                 "You are a useful assistant. first recognize user request and then reply carfuly and thinking",
-                 label="System prompt",
-             ),
-             gr.Slider(0, 1, 0.6, label="Temperature"),
-             gr.Slider(0, 32000, 10000, label="Max new tokens"),
-             gr.Slider(1, 80, 40, label="Top K sampling"),
-             gr.Slider(0, 2, 1.1, label="Repetition penalty"),
-             gr.Slider(0, 1, 0.95, label="Top P sampling"),
-         ],
-     ).queue().launch()

  import gradio as gr
+ from llms import LLM_APIs
+ from deepseek import DeepSeek_R1_Qwen_7B

  if __name__ == "__main__":
+     with gr.Blocks() as demo:
+         gr.Markdown("# Large Language Model Deployment Example")
+         with gr.Tab("API"):
+             LLM_APIs()
+
+         with gr.Tab("Real DeepSeek R1 Qwen 7B"):
+             DeepSeek_R1_Qwen_7B()

+     demo.launch()
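A note on the pattern used by the new app.py: Gradio components created while a layout context (Blocks, Tab, Accordion) is open are attached to that context, which is presumably how LLM_APIs() and DeepSeek_R1_Qwen_7B() end up rendered inside their tabs. A minimal illustrative sketch, not part of the commit (greeting_tab is a hypothetical builder):

import gradio as gr

def greeting_tab():
    # Components instantiated here attach to whatever Gradio context is
    # open at call time (here, the "Greeting" tab).
    gr.Markdown("Hello from a reusable tab builder")
    gr.Textbox(label="Scratchpad")

with gr.Blocks() as demo:
    with gr.Tab("Greeting"):
        greeting_tab()

# demo.launch()  # uncomment to serve the sketch locally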
deepseek.py ADDED
@@ -0,0 +1,96 @@
+ import torch
+ import gradio as gr
+ from threading import Thread
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+
+ MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+ MODEL_NAME = MODEL_ID.split("/")[-1]
+ CONTEXT_LENGTH = 16000
+ DESCRIPTION = f"This is a Hugging Face deployment instance of the {MODEL_NAME} model. If you have computing power, you can test it by cloning the repo locally or forking it to an account with a paid GPU environment."
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ if device == torch.device("cuda"):
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+     model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
+
+
+ def predict(
+     message,
+     history,
+     system_prompt,
+     temperature,
+     max_new_tokens,
+     top_k,
+     repetition_penalty,
+     top_p,
+ ):
+     # Format history with a given chat template
+     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
+     instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
+     for user, assistant in history:
+         instruction += f"<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n"
+
+     instruction += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
+     try:
+         if device == torch.device("cpu"):
+             raise EnvironmentError(
+                 "If you have computing power, you can test by cloning the repo locally or forking it to an account with a paid GPU environment"
+             )
+
+         streamer = TextIteratorStreamer(
+             tokenizer,
+             skip_prompt=True,
+             skip_special_tokens=True,
+         )
+         enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
+         input_ids, attention_mask = enc.input_ids, enc.attention_mask
+         if input_ids.shape[1] > CONTEXT_LENGTH:
+             input_ids = input_ids[:, -CONTEXT_LENGTH:]
+             attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
+
+         generate_kwargs = dict(
+             input_ids=input_ids.to(device),
+             attention_mask=attention_mask.to(device),
+             streamer=streamer,
+             do_sample=True,
+             temperature=temperature,
+             max_new_tokens=max_new_tokens,
+             top_k=top_k,
+             repetition_penalty=repetition_penalty,
+             top_p=top_p,
+         )
+         t = Thread(target=model.generate, kwargs=generate_kwargs)
+         t.start()
+
+     except Exception as e:
+         streamer = f"{e}"
+
+     outputs = []
+     for new_token in streamer:
+         outputs.append(new_token)
+         if new_token in stop_tokens:
+             break
+
+         yield "".join(outputs)
+
+
+ def DeepSeek_R1_Qwen_7B():
+     # Create Gradio interface
+     return gr.ChatInterface(
+         predict,
+         title=f"{MODEL_NAME} Deployment Instance",
+         description=DESCRIPTION,
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
+         additional_inputs=[
+             gr.Textbox(
+                 "You are a useful assistant. First recognize the user's request, then reply carefully and thoughtfully.",
+                 label="System prompt",
+             ),
+             gr.Slider(0, 1, 0.6, label="Temperature"),
+             gr.Slider(0, 32000, 10000, label="Max new tokens"),
+             gr.Slider(1, 80, 40, label="Top K sampling"),
+             gr.Slider(0, 2, 1.1, label="Repetition penalty"),
+             gr.Slider(0, 1, 0.95, label="Top P sampling"),
+         ],
+     ).queue()
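Because DeepSeek_R1_Qwen_7B() returns a queued gr.ChatInterface, the local-model tab can also be smoke-tested on its own, without going through app.py. A minimal sketch, assuming a CUDA GPU is available (on CPU, predict() yields the EnvironmentError message instead of generating):

from deepseek import DeepSeek_R1_Qwen_7B

# Build the chat UI defined above and serve it directly.
DeepSeek_R1_Qwen_7B().launch()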
llms.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import gradio as gr
+ from openai import OpenAI
+
+
+ def predict(
+     message,
+     history,
+     system_prompt,
+     model,
+     api_url,
+     api_key,
+     max_tk,
+     temp,
+     top_p,
+ ):
+     if not api_key:
+         return "Please set a valid API key in the settings first."
+
+     # Format history with a given chat template
+     msgs = [{"role": "system", "content": system_prompt}]
+     for user, assistant in history:
+         msgs.append({"role": "user", "content": user})
+         msgs.append({"role": "assistant", "content": assistant})
+
+     msgs.append({"role": "user", "content": message})
+     try:
+         client = OpenAI(api_key=api_key, base_url=api_url)
+         response = client.chat.completions.create(
+             model=model,
+             messages=msgs,
+             max_tokens=max_tk,
+             temperature=temp,
+             top_p=top_p,
+             stream=False,
+         ).to_dict()["choices"][0]["message"]["content"]
+
+     except Exception as e:
+         response = f"{e}"
+
+     return response
+
+
+ def deepseek(
+     message,
+     history,
+     model,
+     api_key,
+     system_prompt,
+     max_tk,
+     temp,
+     top_p,
+ ):
+     response = predict(
+         message,
+         history,
+         system_prompt,
+         model,
+         "https://api.deepseek.com",
+         api_key,
+         max_tk,
+         temp,
+         top_p,
+     )
+     outputs = []
+     for new_token in response:
+         outputs.append(new_token)
+         yield "".join(outputs)
+
+
+ def kimi(
+     message,
+     history,
+     model,
+     api_key,
+     system_prompt,
+     max_tk,
+     temp,
+     top_p,
+ ):
+     response = predict(
+         message,
+         history,
+         system_prompt,
+         model,
+         "https://api.moonshot.cn/v1",
+         api_key,
+         max_tk,
+         temp,
+         top_p,
+     )
+     outputs = []
+     for new_token in response:
+         outputs.append(new_token)
+         yield "".join(outputs)
+
+
+ def LLM_APIs():
+     with gr.Blocks() as llms:  # Create Gradio interface
+         gr.Markdown("# LLM API Aggregation Deployment")
+         with gr.Tab("DeepSeek"):
+             with gr.Accordion(label="⚙️ Settings", open=False) as ds_acc:
+                 ds_model = gr.Dropdown(
+                     choices=["deepseek-chat", "deepseek-reasoner"],
+                     value="deepseek-chat",
+                     label="Select a model",
+                 )
+                 ds_key = gr.Textbox(
+                     os.getenv("ds_api_key"),
+                     type="password",
+                     label="API key",
+                 )
+                 ds_sys = gr.Textbox(
+                     "You are a useful assistant. First recognize the user's request, then reply carefully and thoughtfully.",
+                     label="System prompt",
+                 )
+                 ds_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
+                 ds_temp = gr.Slider(0, 1, 0.3, label="Temperature")
+                 ds_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+
+             gr.ChatInterface(
+                 deepseek,
+                 additional_inputs=[
+                     ds_model,
+                     ds_key,
+                     ds_sys,
+                     ds_maxtk,
+                     ds_temp,
+                     ds_topp,
+                 ],
+             )
+
+         with gr.Tab("Kimi"):
+             with gr.Accordion(label="⚙️ Settings", open=False) as kimi_acc:
+                 kimi_model = gr.Dropdown(
+                     choices=["moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k"],
+                     value="moonshot-v1-32k",
+                     label="Select a model",
+                 )
+                 kimi_key = gr.Textbox(
+                     os.getenv("kimi_api_key"),
+                     type="password",
+                     label="API key",
+                 )
+                 kimi_sys = gr.Textbox(
+                     "You are a useful assistant. First recognize the user's request, then reply carefully and thoughtfully.",
+                     label="System prompt",
+                 )
+                 kimi_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
+                 kimi_temp = gr.Slider(0, 1, 0.3, label="Temperature")
+                 kimi_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+
+             gr.ChatInterface(
+                 kimi,
+                 additional_inputs=[
+                     kimi_model,
+                     kimi_key,
+                     kimi_sys,
+                     kimi_maxtk,
+                     kimi_temp,
+                     kimi_topp,
+                 ],
+             )
+
+     return llms.queue()
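predict() in llms.py does not depend on the Gradio UI, so an API key and endpoint can be sanity-checked from a plain script. A hedged sketch, assuming a DeepSeek key is exported as ds_api_key (the prompt and sampling values are illustrative):

import os
from llms import predict

# One-shot, non-streaming call through DeepSeek's OpenAI-compatible endpoint.
reply = predict(
    "Say hello in one sentence.",   # message
    [],                             # empty chat history
    "You are a useful assistant.",  # system prompt
    "deepseek-chat",                # model, same default as the DeepSeek tab
    "https://api.deepseek.com",     # base URL used by the DeepSeek tab
    os.getenv("ds_api_key"),        # API key read from the environment
    256,                            # max_tokens
    0.3,                            # temperature
    0.95,                           # top_p
)
print(reply)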
requirements.txt CHANGED
@@ -1,4 +1,5 @@
  torch
- huggingface_hub==0.25.2
  transformers
- accelerate

  torch
+ openai
+ accelerate
  transformers
+ huggingface_hub==0.25.2
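After installing the updated requirements (pip install -r requirements.txt), the API tabs read their keys from the ds_api_key and kimi_api_key environment variables. A small launcher sketch for local testing; the key values are placeholders, and on a Hugging Face Space these would normally be set as repository secrets instead:

import os
import runpy

# Placeholder keys for local testing only; never commit real keys.
os.environ["ds_api_key"] = "sk-..."
os.environ["kimi_api_key"] = "sk-..."

# Run app.py as __main__ so its Gradio demo actually launches.
runpy.run_path("app.py", run_name="__main__")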