import json
import subprocess
from threading import Thread
import torch
import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
# Optional: install FlashAttention for faster attention (left disabled here,
# together with the attn_implementation="flash_attention_2" line in the model load below).
#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
CHAT_TEMPLATE = "Auto"
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = 16000
COLOR = "black"
EMOJI = "🤖"
DESCRIPTION = f"This is {MODEL_NAME} model designed for testing thinking for general AI tasks." # Descripción predeterminada
latex_delimiters_set = [
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
    {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
    {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
    {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
    {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
    {"left": "\\[", "right": "\\]", "display": True},
]
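# NOTE: latex_delimiters_set is not wired into the UI below. If LaTeX rendering is
# wanted, one option (assuming a Gradio version whose gr.Chatbot supports it) is:
#   gr.ChatInterface(..., chatbot=gr.Chatbot(latex_delimiters=latex_delimiters_set))
# The @spaces.GPU() decorator asks ZeroGPU to attach a GPU for the duration of each call.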
@spaces.GPU()
def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
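    # Manual stop markers checked against the streamed text below; since the
    # streamer skips special tokens, these act mainly as a defensive fallback.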
    instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
    for user, assistant in history:
        instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
    instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
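    # The prompt above is hand-built in ChatML style. An alternative would be
    # tokenizer.apply_chat_template(messages, add_generation_prompt=True),
    # which applies the chat template shipped with the model.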
print("Formatted Instruction:", instruction)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
input_ids, attention_mask = enc.input_ids, enc.attention_mask
if input_ids.shape[1] > CONTEXT_LENGTH:
input_ids = input_ids[:, -CONTEXT_LENGTH:]
attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
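    # TextIteratorStreamer yields decoded text chunks as generate() produces them;
    # skip_prompt=True keeps the prompt itself out of the stream.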
    # Define the generation parameters
    generate_kwargs = dict(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,  # Explicitly set pad_token_id
        eos_token_id=tokenizer.eos_token_id,  # Explicitly set eos_token_id
    )
    # Start the generation in a separate thread so this one can stream the output
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    # Stream the output token by token
    outputs = []
    for new_token in streamer:
        outputs.append(new_token)
        if any(stop_token in new_token for stop_token in stop_tokens):
            break
        yield "".join(outputs)
# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
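# NOTE: quantization_config is defined but not applied, since the quantization_config=
# line in from_pretrained below is commented out. Uncommenting it would load the model
# in 4-bit, substantially reducing GPU memory use at some quality cost.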
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    #quantization_config=quantization_config,
    #attn_implementation="flash_attention_2",
)
# Create Gradio interface
gr.ChatInterface(
    predict,
    title=EMOJI + " " + MODEL_NAME,
    description=DESCRIPTION,
    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant. First recognize the user's request, then think it through and reply carefully.", label="System prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(0, 32000, 10000, label="Max new tokens"),
        gr.Slider(1, 80, 40, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    #theme=gr.themes.Soft(primary_hue=COLOR),
).queue().launch()
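# On a ZeroGPU Space this file runs automatically as app.py; locally, `python app.py`
# serves the same UI (Gradio defaults to http://localhost:7860).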