vmuchinov committed
Commit f8310bd
1 Parent(s): 211550f

Upload 2 files

Files changed (2)
  1. app.py +44 -15
  2. requirements.txt +1 -0
app.py CHANGED
@@ -6,26 +6,55 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import deepspeed
 
+# Configurable constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
-model_id = "Qwen/Qwen2.5-Coder-3B-Instruct"
+# Model ID for Qwen model
+model_id = "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8"
 
+# DeepSpeed configuration
+deepspeed_config = {
+    "train_batch_size": 1,
+    "fp16": {
+        "enabled": True
+    },
+    "zero_optimization": {
+        "stage": 3,  # Enable ZeRO stage 3 for maximum memory efficiency
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": True
+        },
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": True
+        }
+    },
+    "gradient_checkpointing": True  # Enables gradient checkpointing for further memory savings
+}
+
+# Load model with DeepSpeed
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
-    device_map="auto",
     trust_remote_code=True,
-    token=ACCESS_TOKEN)
+    device_map="auto",  # Use device mapping with DeepSpeed
+    load_in_8bit=True,  # Use 8-bit quantization
+    token=ACCESS_TOKEN
+)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     trust_remote_code=True,
-    token=ACCESS_TOKEN)
+    token=ACCESS_TOKEN
+)
 tokenizer.use_default_system_prompt = False
 
+# Initialize DeepSpeed for the loaded model
+model = deepspeed.init_inference(model, config=deepspeed_config, mp_size=1)
 
 @spaces.GPU
 def generate(
@@ -35,45 +64,45 @@ def generate(
     temperature: float = 0.01,
     top_p: float = 1.00,
 ) -> Iterator[str]:
+    # Define the conversation prompt format
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
 
+    # Tokenize the input with the conversation template
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
 
-    '''
-    terminators = [
-        tokenizer.eos_token_id,
-        tokenizer.convert_tokens_to_ids("<|eot_id|>")
-    ]
-    '''
-
+    # Set up the text streaming options for output
     streamer = TextIteratorStreamer(tokenizer, timeout=600.0, skip_prompt=True, skip_special_tokens=True)
+
+    # Set up generation parameters
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
-        #eos_token_id=terminators,
        do_sample=True,
         top_p=top_p,
         temperature=temperature,
         num_beams=1,
-        pad_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.eos_token_id,
     )
+
+    # Run generation in a separate thread for streaming
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
+    # Stream the output text
     outputs = []
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
 
-
+# Gradio interface setup
 chat_interface = gr.Interface(
     fn=generate,
     inputs=[
@@ -104,7 +133,7 @@ chat_interface = gr.Interface(
             value=1.0,
         ),
     ],
-    title="Model testing - Qwen/Qwen2.5-Coder-3B-Instruct",
+    title="Model testing - Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8",
     description="Provide system settings and a prompt to interact with the model.",
 )
 
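For reference, the DeepSpeed wrapping added above goes through deepspeed.init_inference, which takes an already-loaded module and returns an inference engine. The committed config also carries ZeRO/offload keys that are primarily training-time options, so the sketch below sticks to the inference-side arguments only. This is a minimal, hedged sketch, not the Space's code: it assumes a CUDA GPU with deepspeed and transformers installed, and uses a small stand-in checkpoint instead of the 32B GPTQ model from app.py.

# Minimal sketch of wrapping a Hugging Face causal LM with DeepSpeed-Inference.
# Assumptions: CUDA GPU available; "gpt2" is an illustrative stand-in model,
# not the Qwen2.5-Coder checkpoint used in app.py.
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

stub_id = "gpt2"  # placeholder; any causal LM demonstrates the pattern
tok = AutoTokenizer.from_pretrained(stub_id)
lm = AutoModelForCausalLM.from_pretrained(stub_id, torch_dtype=torch.float16)

# mp_size=1 disables tensor parallelism; dtype matches the fp16 weights above.
engine = deepspeed.init_inference(lm, mp_size=1, dtype=torch.float16)
lm = engine.module  # generate() is called on the wrapped module

inputs = tok("def fibonacci(n):", return_tensors="pt").to(lm.device)
print(tok.decode(lm.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))
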
requirements.txt CHANGED
@@ -246,3 +246,4 @@ einops
 pytest
 gguf>=0.10.0
 autoawq
+deepspeed
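
The deepspeed entry added to requirements.txt is unpinned, so the Space picks up whichever release pip resolves at build time. A quick, hedged sanity check that the dependency imports cleanly in that environment:

# Confirms the newly added dependency resolves; prints whatever version pip
# installed, since requirements.txt does not pin one.
import deepspeed
print("deepspeed", deepspeed.__version__)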