vmuchinov committed
Commit e4cd356
1 Parent(s): f8310bd

Upload 2 files

Files changed (2)
  1. app.py +15 -44
  2. requirements.txt +0 -1
app.py CHANGED
@@ -6,55 +6,26 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-import deepspeed
 
-# Configurable constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
-# Model ID for Qwen model
-model_id = "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8"
+model_id = "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4"
 
-# DeepSpeed configuration
-deepspeed_config = {
-    "train_batch_size": 1,
-    "fp16": {
-        "enabled": True
-    },
-    "zero_optimization": {
-        "stage": 3,  # Enable ZeRO stage 3 for maximum memory efficiency
-        "offload_param": {
-            "device": "cpu",
-            "pin_memory": True
-        },
-        "offload_optimizer": {
-            "device": "cpu",
-            "pin_memory": True
-        }
-    },
-    "gradient_checkpointing": True  # Enables gradient checkpointing for further memory savings
-}
-
-# Load model with DeepSpeed
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
+    device_map="auto",
     trust_remote_code=True,
-    device_map="auto",  # Use device mapping with DeepSpeed
-    load_in_8bit=True,  # Use 8-bit quantization
-    token=ACCESS_TOKEN
-)
+    token=ACCESS_TOKEN)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     trust_remote_code=True,
-    token=ACCESS_TOKEN
-)
+    token=ACCESS_TOKEN)
 tokenizer.use_default_system_prompt = False
 
-# Initialize DeepSpeed for the loaded model
-model = deepspeed.init_inference(model, config=deepspeed_config, mp_size=1)
 
 @spaces.GPU
 def generate(
@@ -64,45 +35,45 @@ def generate(
     temperature: float = 0.01,
     top_p: float = 1.00,
 ) -> Iterator[str]:
-    # Define the conversation prompt format
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
 
-    # Tokenize the input with the conversation template
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
 
-    # Set up the text streaming options for output
+    '''
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>")
+    ]
+    '''
+
     streamer = TextIteratorStreamer(tokenizer, timeout=600.0, skip_prompt=True, skip_special_tokens=True)
-
-    # Set up generation parameters
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
+        #eos_token_id=terminators,
         do_sample=True,
         top_p=top_p,
         temperature=temperature,
         num_beams=1,
-        pad_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.eos_token_id,
     )
-
-    # Run generation in a separate thread for streaming
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
-    # Stream the output text
    outputs = []
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
 
-    # Gradio interface setup
+
 chat_interface = gr.Interface(
     fn=generate,
     inputs=[
@@ -133,7 +104,7 @@ chat_interface = gr.Interface(
             value=1.0,
         ),
     ],
-    title="Model testing - Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8",
+    title="Model testing - Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4",
     description="Provide system settings and a prompt to interact with the model.",
 )
requirements.txt CHANGED
@@ -246,4 +246,3 @@ einops
 pytest
 gguf>=0.10.0
 autoawq
-deepspeed