nielsr (HF staff) committed
Commit dbadf54 · verified · 1 Parent(s): ca0b5cf

Add pipeline tag and library name


This PR ensures the model can be found at https://huggingface.co/models?pipeline_tag=text-generation. It also ensures that the model is correctly identified as compatible with the Transformers library.
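Once merged, the effect can be checked from the Hub metadata; a minimal sketch (not part of this PR, and assuming a recent `huggingface_hub` version that exposes these fields on `ModelInfo`):

```python
from huggingface_hub import model_info

# Fetch the model card metadata from the Hub and print the fields this PR sets.
info = model_info("Caiyun-AI/MUDDPythia-2.8B")
print(info.pipeline_tag)   # expected: "text-generation"
print(info.library_name)   # expected: "transformers"
```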

Files changed (1)
  1. README.md +79 -76
README.md CHANGED
@@ -1,76 +1,79 @@
- ---
- language:
- - en
- tags:
- - pytorch
- - causal-lm
- - muddformer
- license: mit
- ---
+ ---
+ language:
+ - en
+ license: mit
+ library_name: transformers
+ tags:
+ - pytorch
+ - causal-lm
+ - muddformer
+ pipeline_tag: text-generation
+ ---
+
The remainder of the README is unchanged:

In comparison with Pythia-2.8B, MUDDPythia-2.8B is a language model pretrained on the Pile with 300B tokens. It uses a simple yet effective method to address the limitations of residual connections and enhance cross-layer information flow in Transformers. Please see downstream evaluations and more details in the paper [MUDDFormer: Breaking Residual Bottlenecks in Transformers via Multiway Dynamic Dense Connections](https://arxiv.org/abs/2502.12170). In addition, we open-source the JAX training code on [GitHub](https://github.com/Caiyun-AI/MUDDFormer/).

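For intuition only, here is a rough, hypothetical sketch of the general idea of dynamic dense cross-layer connections, i.e. mixing all previous layers' hidden states with per-token weights predicted from the current hidden state. It is not the authors' implementation; the module name, shapes, and weighting scheme are invented for illustration (see the paper and the official code for the real formulation):

```python
import torch
import torch.nn as nn

class ToyDynamicDenseMix(nn.Module):
    """Illustrative only: aggregate hidden states from all previous layers
    with per-token weights predicted from the most recent hidden state."""
    def __init__(self, dim: int, num_prev_layers: int):
        super().__init__()
        self.to_weights = nn.Linear(dim, num_prev_layers)  # one weight per previous layer

    def forward(self, prev_states):
        # prev_states: list of [batch, seq, dim] tensors, one per previous layer.
        stacked = torch.stack(prev_states, dim=2)                    # [batch, seq, L, dim]
        weights = self.to_weights(prev_states[-1]).softmax(dim=-1)   # [batch, seq, L]
        return (weights.unsqueeze(-1) * stacked).sum(dim=2)          # [batch, seq, dim]
```

In the paper, the dense connections are additionally "multiway", with separate aggregations for the different input streams of a block; the sketch above omits that and any normalization details.
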
We recommend the <strong>compiled version</strong> of MUDDPythia with *torch.compile* for inference acceleration. Please refer to the Generation section for the compile implementation.

# Usage

## Env

```bash
pip install transformers==4.40.2 torch==2.5.1 einops==0.8.0
```

## Generation

```python
import os
import time

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

device = torch.device('cuda:0')
dtype = torch.bfloat16
MAX_BATCH_SIZE = 1
MAX_SEQ_LENGTH = 2048
NUM_TOKENS_TO_GENERATE = 10
COMPILE = True
OPTIMIZED_COMPILE = False

if OPTIMIZED_COMPILE:
    import torch._dynamo.config
    import torch._inductor.config
    torch._dynamo.config.cache_size_limit = 64
    torch._inductor.config.coordinate_descent_tuning = True
    torch._inductor.config.triton.unique_kernel_names = True
    torch._inductor.config.fx_graph_cache = True

tokenizer = AutoTokenizer.from_pretrained("Caiyun-AI/MUDDPythia-2.8B")
model = AutoModelForCausalLM.from_pretrained("Caiyun-AI/MUDDPythia-2.8B", trust_remote_code=True)

_ = model.to(device=device, dtype=dtype)
with torch.device(device):
    model.setup_caches(max_batch_size=MAX_BATCH_SIZE, max_seq_length=MAX_SEQ_LENGTH, dtype=dtype)

def decode_one_token(model, cur_token, input_pos):
    # Greedy decoding: take the argmax over the logits of the last position.
    logits = model(cur_token, input_pos=input_pos, return_tensor=True)
    new_token = torch.argmax(logits[:, -1], dim=-1)[:, None]
    return new_token

prompt = "Beijing is the capital of China. London is the capital of"
input_ids = tokenizer.encode(prompt, return_tensors='pt')

# Compile the per-token decode step; the first call triggers compilation.
compiled_decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True) if COMPILE else None

print('Start generating tokens, but it will take a few minutes to compile at the first time.')
for i in range(10):
    t0 = time.time()
    with torch.no_grad():
        generated_ids = model.generate(input_ids.to(device), num_tokens_to_generate=NUM_TOKENS_TO_GENERATE, compiled_decode_one_token=compiled_decode_one_token)
    text = tokenizer.decode(generated_ids[0])
    if i == 0:
        print(f'Generated text: {text}')
    t1 = time.time()
    print(f'Time consumed at iteration {i}: {t1 - t0}s')
```
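Only the first compiled iteration pays the compilation time; later iterations reuse the compiled graph. For a quick functional check, setting `COMPILE = False` in the snippet above skips `torch.compile` (and the compilation wait) at the cost of slower decoding.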