Commit ad52ebd ("fix typo & dtype")
Parent(s): b85543c

Files changed:
- README.md (+5 -4)
- generation_demo.py (+5 -4)
- modeling_muddformer.py (+1 -1)
README.md

@@ -30,13 +30,14 @@ import os
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'

 device = torch.device('cuda:0')
+dtype = torch.bfloat16
 MAX_BATCH_SIZE = 1
 MAX_SEQ_LENGTH = 2048
 NUM_TOKENS_TO_GENERATE = 10
 COMPILE = True
-OPTIMIZED_COMPPILE = False
+OPTIMIZED_COMPILE = False

-if OPTIMIZED_COMPPILE:
+if OPTIMIZED_COMPILE:
     import torch._dynamo.config
     import torch._inductor.config
     torch._dynamo.config.cache_size_limit = 64
@@ -47,9 +48,9 @@ if OPTIMIZED_COMPPILE:
 tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
 model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)

-_ = model.to(device=device,dtype=torch.bfloat16)
+_ = model.to(device=device,dtype=dtype)
 with torch.device(device):
-    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH)
+    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH, dtype=dtype)

 def decode_one_token(model, cur_token, input_pos):
     logits = model(cur_token, input_pos=input_pos, return_tensor=True)
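For context, the updated README snippet ends at the single-token decode step. Below is a minimal sketch of how it can be driven end to end with a greedy loop; the prefill step, the torch.compile call, and the prompt are illustrative assumptions, not part of this diff or of the README's actual generation code.

```python
# Hypothetical greedy decode loop on top of the README snippet above.
# decode_one_token, model, tokenizer, device, dtype, COMPILE and
# NUM_TOKENS_TO_GENERATE come from that snippet; everything else is illustrative.
prompt_ids = tokenizer("Beijing is the capital of", return_tensors="pt").input_ids.to(device)

# Optionally compile the per-token decode step, as COMPILE = True suggests.
decode_fn = torch.compile(decode_one_token, mode="reduce-overhead") if COMPILE else decode_one_token

# Prefill: run the whole prompt once to populate the KV caches.
input_pos = torch.arange(prompt_ids.shape[-1], device=device)
logits = model(prompt_ids, input_pos=input_pos, return_tensor=True)
next_token = logits[:, -1:].argmax(dim=-1)          # shape [batch, 1]

generated = [next_token]
for i in range(NUM_TOKENS_TO_GENERATE - 1):
    pos = torch.tensor([prompt_ids.shape[-1] + i], device=device)
    logits = decode_fn(model, next_token, pos)       # decode one token at position `pos`
    next_token = logits[:, -1:].argmax(dim=-1)
    generated.append(next_token)

print(tokenizer.decode(torch.cat(generated, dim=-1)[0]))
```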
generation_demo.py

@@ -6,13 +6,14 @@ import os
 os.environ['TOKENIZERS_PARALLELISM'] = 'false'

 device = torch.device('cuda:0')
+dtype = torch.bfloat16
 MAX_BATCH_SIZE = 1
 MAX_SEQ_LENGTH = 2048
 NUM_TOKENS_TO_GENERATE = 10
 COMPILE = True
-OPTIMIZED_COMPPILE = False
+OPTIMIZED_COMPILE = False

-if OPTIMIZED_COMPPILE:
+if OPTIMIZED_COMPILE:
     import torch._dynamo.config
     import torch._inductor.config
     torch._dynamo.config.cache_size_limit = 64
@@ -23,9 +24,9 @@ if OPTIMIZED_COMPPILE:
 tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDFormer-2.8B")
 model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDFormer-2.8B", trust_remote_code=True)

-_ = model.to(device=device,dtype=torch.bfloat16)
+_ = model.to(device=device,dtype=dtype)
 with torch.device(device):
-    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH)
+    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH, dtype=dtype)

 def decode_one_token(model, cur_token, input_pos):
     logits = model(cur_token, input_pos=input_pos, return_tensor=True)
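generation_demo.py receives the same change as the README snippet: the hardcoded dtype is hoisted into a `dtype` variable and forwarded to `setup_caches`. A quick, illustrative way to confirm the caches actually pick up the requested dtype is sketched below; it is not part of the demo and assumes LayerCache/KVCache register their storage as module buffers.

```python
# Illustrative check, not part of generation_demo.py: run it right after
# model.setup_caches(...) above. Assumes the cache classes use register_buffer.
for name, buf in model.named_buffers():
    if "cache" in name.lower():
        print(name, tuple(buf.shape), buf.dtype)  # expected dtype: torch.bfloat16
```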
modeling_muddformer.py

@@ -119,7 +119,7 @@ class MUDDFormer(PreTrainedModel):
         self.max_batch_size = max_batch_size
         if not self.config.is_training:
             if self.use_layer_cache:
-                self.layer_cache = LayerCache(max_batch_size, self.config.n_layer, self.config.dim)
+                self.layer_cache = LayerCache(max_batch_size, self.config.n_layer, self.config.dim, dtype=dtype)
             for b in self.layers:
                 b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype=dtype)

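This one-line change is the substance of the dtype fix: KVCache already received `dtype`, while LayerCache was presumably allocated in a default dtype (typically float32) and so could mismatch the bfloat16 model. The sketch below illustrates the kind of mismatch the extra argument avoids, using a hypothetical stand-in class; the name, shapes, and update method are made up, not taken from modeling_muddformer.py.

```python
# Hypothetical illustration of the dtype mismatch avoided by passing dtype through.
# DummyCache stands in for LayerCache; its interface is invented for this example.
import torch

class DummyCache(torch.nn.Module):
    def __init__(self, batch, n_layer, dim, dtype=torch.float32):
        super().__init__()
        self.register_buffer("data", torch.zeros(batch, n_layer, dim, dtype=dtype))

    def update(self, layer_idx, hidden):
        # Without the fix: bfloat16 activations are copied into a float32 buffer,
        # silently upcasting the cached path and doubling cache memory.
        self.data[:, layer_idx] = hidden
        return self.data

bf16_hidden = torch.randn(1, 2560, dtype=torch.bfloat16)
cache_old = DummyCache(1, 32, 2560)                        # default float32 allocation
cache_new = DummyCache(1, 32, 2560, dtype=torch.bfloat16)  # dtype passed through, as in this commit
print(cache_old.update(0, bf16_hidden).dtype)  # torch.float32
print(cache_new.update(0, bf16_hidden).dtype)  # torch.bfloat16
```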