admin committed
Commit 404b247 · 1 Parent(s): ab18e79

combine llms & deepseek

Files changed (5):
  1. .gitignore +2 -0
  2. app.py +10 -92
  3. deepseek.py +96 -0
  4. llms.py +165 -0
  5. requirements.txt +3 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
+ *__pycache__*
+ test.*
app.py CHANGED
@@ -1,96 +1,14 @@
- import torch
  import gradio as gr
- from threading import Thread
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-
-
- MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
- MODEL_NAME = MODEL_ID.split("/")[-1]
- CONTEXT_LENGTH = 16000
- DESCRIPTION = f"This is a HuggingFace deployment instance of {MODEL_NAME} model, if you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
-
-
- def predict(
-     message,
-     history,
-     system_prompt,
-     temperature,
-     max_new_tokens,
-     top_k,
-     repetition_penalty,
-     top_p,
- ):
-     # Format history with a given chat template
-     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
-     instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
-     for user, assistant in history:
-         instruction += f"<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n"
-
-     instruction += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
-     try:
-         if device == torch.device("cpu"):
-             raise EnvironmentError(
-                 "If you have computing power, you can test by cloning to local or forking to an account with purchased GPU environment"
-             )
-
-         streamer = TextIteratorStreamer(
-             tokenizer,
-             skip_prompt=True,
-             skip_special_tokens=True,
-         )
-         enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
-         input_ids, attention_mask = enc.input_ids, enc.attention_mask
-         if input_ids.shape[1] > CONTEXT_LENGTH:
-             input_ids = input_ids[:, -CONTEXT_LENGTH:]
-             attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
-
-         generate_kwargs = dict(
-             input_ids=input_ids.to(device),
-             attention_mask=attention_mask.to(device),
-             streamer=streamer,
-             do_sample=True,
-             temperature=temperature,
-             max_new_tokens=max_new_tokens,
-             top_k=top_k,
-             repetition_penalty=repetition_penalty,
-             top_p=top_p,
-         )
-         t = Thread(target=model.generate, kwargs=generate_kwargs)
-         t.start()
-
-     except Exception as e:
-         streamer = f"{e}"
-
-     outputs = []
-     for new_token in streamer:
-         outputs.append(new_token)
-         if new_token in stop_tokens:
-             break
-
-         yield "".join(outputs)
-

  if __name__ == "__main__":
-     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     if device == torch.device("cuda"):
-         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-         model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
-
-     # Create Gradio interface
-     gr.ChatInterface(
-         predict,
-         title=f"{MODEL_NAME} Deployment Instance",
-         description=DESCRIPTION,
-         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
-         additional_inputs=[
-             gr.Textbox(
-                 "You are a useful assistant. first recognize user request and then reply carfuly and thinking",
-                 label="System prompt",
-             ),
-             gr.Slider(0, 1, 0.6, label="Temperature"),
-             gr.Slider(0, 32000, 10000, label="Max new tokens"),
-             gr.Slider(1, 80, 40, label="Top K sampling"),
-             gr.Slider(0, 2, 1.1, label="Repetition penalty"),
-             gr.Slider(0, 1, 0.95, label="Top P sampling"),
-         ],
-     ).queue().launch()

  import gradio as gr
+ from llms import LLM_APIs
+ from deepseek import DeepSeek_R1_Qwen_7B

  if __name__ == "__main__":
+     with gr.Blocks() as demo:
+         gr.Markdown("# Large Language Model Deployment Example")
+         with gr.Tab("API"):
+             LLM_APIs()
+
+         with gr.Tab("Real DeepSeek R1 Qwen 7B"):
+             DeepSeek_R1_Qwen_7B()

+     demo.launch()
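A note on the pattern used by the new app.py: Gradio components created while a layout context (Blocks, Tab, Accordion) is open are attached to that context, which is presumably how LLM_APIs() and DeepSeek_R1_Qwen_7B() end up rendered inside their tabs. A minimal illustrative sketch, not part of the commit (greeting_tab is a hypothetical builder):

import gradio as gr

def greeting_tab():
    # Components instantiated here attach to whatever Gradio context is
    # open at call time (here, the "Greeting" tab).
    gr.Markdown("Hello from a reusable tab builder")
    gr.Textbox(label="Scratchpad")

with gr.Blocks() as demo:
    with gr.Tab("Greeting"):
        greeting_tab()

# demo.launch()  # uncomment to serve the sketch locally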
deepseek.py ADDED
@@ -0,0 +1,96 @@
+ import torch
+ import gradio as gr
+ from threading import Thread
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+
+ MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+ MODEL_NAME = MODEL_ID.split("/")[-1]
+ CONTEXT_LENGTH = 16000
+ DESCRIPTION = f"This is a Hugging Face deployment instance of the {MODEL_NAME} model. If you have computing power, you can test it by cloning the repo locally or forking it to an account with a paid GPU environment."
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ if device == torch.device("cuda"):
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+     model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
+
+
+ def predict(
+     message,
+     history,
+     system_prompt,
+     temperature,
+     max_new_tokens,
+     top_k,
+     repetition_penalty,
+     top_p,
+ ):
+     # Format history with a given chat template
+     stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
+     instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
+     for user, assistant in history:
+         instruction += f"<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n"
+
+     instruction += f"<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n"
+     try:
+         if device == torch.device("cpu"):
+             raise EnvironmentError(
+                 "If you have computing power, you can test by cloning the repo locally or forking it to an account with a paid GPU environment"
+             )
+
+         streamer = TextIteratorStreamer(
+             tokenizer,
+             skip_prompt=True,
+             skip_special_tokens=True,
+         )
+         enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
+         input_ids, attention_mask = enc.input_ids, enc.attention_mask
+         if input_ids.shape[1] > CONTEXT_LENGTH:
+             input_ids = input_ids[:, -CONTEXT_LENGTH:]
+             attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
+
+         generate_kwargs = dict(
+             input_ids=input_ids.to(device),
+             attention_mask=attention_mask.to(device),
+             streamer=streamer,
+             do_sample=True,
+             temperature=temperature,
+             max_new_tokens=max_new_tokens,
+             top_k=top_k,
+             repetition_penalty=repetition_penalty,
+             top_p=top_p,
+         )
+         t = Thread(target=model.generate, kwargs=generate_kwargs)
+         t.start()
+
+     except Exception as e:
+         streamer = f"{e}"
+
+     outputs = []
+     for new_token in streamer:
+         outputs.append(new_token)
+         if new_token in stop_tokens:
+             break
+
+         yield "".join(outputs)
+
+
+ def DeepSeek_R1_Qwen_7B():
+     # Create Gradio interface
+     return gr.ChatInterface(
+         predict,
+         title=f"{MODEL_NAME} Deployment Instance",
+         description=DESCRIPTION,
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
+         additional_inputs=[
+             gr.Textbox(
+                 "You are a useful assistant. First recognize the user's request, then reply carefully and thoughtfully.",
+                 label="System prompt",
+             ),
+             gr.Slider(0, 1, 0.6, label="Temperature"),
+             gr.Slider(0, 32000, 10000, label="Max new tokens"),
+             gr.Slider(1, 80, 40, label="Top K sampling"),
+             gr.Slider(0, 2, 1.1, label="Repetition penalty"),
+             gr.Slider(0, 1, 0.95, label="Top P sampling"),
+         ],
+     ).queue()
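Because DeepSeek_R1_Qwen_7B() returns a queued gr.ChatInterface, the local-model tab can also be smoke-tested on its own, without going through app.py. A minimal sketch, assuming a CUDA GPU is available (on CPU, predict() yields the EnvironmentError message instead of generating):

from deepseek import DeepSeek_R1_Qwen_7B

# Build the chat UI defined above and serve it directly.
DeepSeek_R1_Qwen_7B().launch()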
llms.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import gradio as gr
+ from openai import OpenAI
+
+
+ def predict(
+     message,
+     history,
+     system_prompt,
+     model,
+     api_url,
+     api_key,
+     max_tk,
+     temp,
+     top_p,
+ ):
+     if not api_key:
+         return "Please set a valid API key in the settings first."
+
+     # Format history with a given chat template
+     msgs = [{"role": "system", "content": system_prompt}]
+     for user, assistant in history:
+         msgs.append({"role": "user", "content": user})
+         msgs.append({"role": "assistant", "content": assistant})
+
+     msgs.append({"role": "user", "content": message})
+     try:
+         client = OpenAI(api_key=api_key, base_url=api_url)
+         response = client.chat.completions.create(
+             model=model,
+             messages=msgs,
+             max_tokens=max_tk,
+             temperature=temp,
+             top_p=top_p,
+             stream=False,
+         ).to_dict()["choices"][0]["message"]["content"]
+
+     except Exception as e:
+         response = f"{e}"
+
+     return response
+
+
+ def deepseek(
+     message,
+     history,
+     model,
+     api_key,
+     system_prompt,
+     max_tk,
+     temp,
+     top_p,
+ ):
+     response = predict(
+         message,
+         history,
+         system_prompt,
+         model,
+         "https://api.deepseek.com",
+         api_key,
+         max_tk,
+         temp,
+         top_p,
+     )
+     outputs = []
+     for new_token in response:
+         outputs.append(new_token)
+         yield "".join(outputs)
+
+
+ def kimi(
+     message,
+     history,
+     model,
+     api_key,
+     system_prompt,
+     max_tk,
+     temp,
+     top_p,
+ ):
+     response = predict(
+         message,
+         history,
+         system_prompt,
+         model,
+         "https://api.moonshot.cn/v1",
+         api_key,
+         max_tk,
+         temp,
+         top_p,
+     )
+     outputs = []
+     for new_token in response:
+         outputs.append(new_token)
+         yield "".join(outputs)
+
+
+ def LLM_APIs():
+     with gr.Blocks() as llms:  # Create Gradio interface
+         gr.Markdown("# LLM API Aggregation Deployment")
+         with gr.Tab("DeepSeek"):
+             with gr.Accordion(label="⚙️ Settings", open=False) as ds_acc:
+                 ds_model = gr.Dropdown(
+                     choices=["deepseek-chat", "deepseek-reasoner"],
+                     value="deepseek-chat",
+                     label="Select a model",
+                 )
+                 ds_key = gr.Textbox(
+                     os.getenv("ds_api_key"),
+                     type="password",
+                     label="API key",
+                 )
+                 ds_sys = gr.Textbox(
+                     "You are a useful assistant. First recognize the user's request, then reply carefully and thoughtfully.",
+                     label="System prompt",
+                 )
+                 ds_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
+                 ds_temp = gr.Slider(0, 1, 0.3, label="Temperature")
+                 ds_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+
+             gr.ChatInterface(
+                 deepseek,
+                 additional_inputs=[
+                     ds_model,
+                     ds_key,
+                     ds_sys,
+                     ds_maxtk,
+                     ds_temp,
+                     ds_topp,
+                 ],
+             )
+
+         with gr.Tab("Kimi"):
+             with gr.Accordion(label="⚙️ Settings", open=False) as kimi_acc:
+                 kimi_model = gr.Dropdown(
+                     choices=["moonshot-v1-8k", "moonshot-v1-32k", "moonshot-v1-128k"],
+                     value="moonshot-v1-32k",
+                     label="Select a model",
+                 )
+                 kimi_key = gr.Textbox(
+                     os.getenv("kimi_api_key"),
+                     type="password",
+                     label="API key",
+                 )
+                 kimi_sys = gr.Textbox(
+                     "You are a useful assistant. First recognize the user's request, then reply carefully and thoughtfully.",
+                     label="System prompt",
+                 )
+                 kimi_maxtk = gr.Slider(0, 32000, 10000, label="Max new tokens")
+                 kimi_temp = gr.Slider(0, 1, 0.3, label="Temperature")
+                 kimi_topp = gr.Slider(0, 1, 0.95, label="Top P sampling")
+
+             gr.ChatInterface(
+                 kimi,
+                 additional_inputs=[
+                     kimi_model,
+                     kimi_key,
+                     kimi_sys,
+                     kimi_maxtk,
+                     kimi_temp,
+                     kimi_topp,
+                 ],
+             )
+
+     return llms.queue()
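predict() in llms.py does not depend on the Gradio UI, so an API key and endpoint can be sanity-checked from a plain script. A hedged sketch, assuming a DeepSeek key is exported as ds_api_key (the prompt and sampling values are illustrative):

import os
from llms import predict

# One-shot, non-streaming call through DeepSeek's OpenAI-compatible endpoint.
reply = predict(
    "Say hello in one sentence.",   # message
    [],                             # empty chat history
    "You are a useful assistant.",  # system prompt
    "deepseek-chat",                # model, same default as the DeepSeek tab
    "https://api.deepseek.com",     # base URL used by the DeepSeek tab
    os.getenv("ds_api_key"),        # API key read from the environment
    256,                            # max_tokens
    0.3,                            # temperature
    0.95,                           # top_p
)
print(reply)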
requirements.txt CHANGED
@@ -1,4 +1,5 @@
  torch
- huggingface_hub==0.25.2
  transformers
- accelerate

  torch
+ openai
+ accelerate
  transformers
+ huggingface_hub==0.25.2
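After installing the updated requirements (pip install -r requirements.txt), the API tabs read their keys from the ds_api_key and kimi_api_key environment variables. A small launcher sketch for local testing; the key values are placeholders, and on a Hugging Face Space these would normally be set as repository secrets instead:

import os
import runpy

# Placeholder keys for local testing only; never commit real keys.
os.environ["ds_api_key"] = "sk-..."
os.environ["kimi_api_key"] = "sk-..."

# Run app.py as __main__ so its Gradio demo actually launches.
runpy.run_path("app.py", run_name="__main__")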