Julien Simon committed on
Commit
316f1a9
1 Parent(s): a49294b

Add H100 for Nova and SuperNova

Browse files
Files changed (1) hide show
  1. results_arcee_supernova.py +55 -53
results_arcee_supernova.py CHANGED
@@ -62,59 +62,11 @@ results_arcee_supernova = {
62
  },
63
  {
64
  "instanceType": "p5.48xlarge",
65
- "configurations": [
66
- {
67
- "quantization": "awq",
68
- "container": "TGI 2.2.0",
69
- "status": "OK",
70
- "tokensPerSecond": "73",
71
- "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
72
- },
73
- {
74
- "quantization": "none",
75
- "container": "TGI 2.2.0",
76
- "status": "OK",
77
- "tokensPerSecond": "58",
78
- "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
79
- },
80
- {
81
- "quantization": "none",
82
- "container": "LMI 0.29+vLLM 0.5.5",
83
- "status": "OK",
84
- "tokensPerSecond": "70",
85
- "notes": "OPTION_MAX_MODEL_LEN 128k",
86
- },
87
- {
88
- "quantization": "none",
89
- "container": "LMI 0.29+vLLM 0.5.5",
90
- "status": "OK",
91
- "tokensPerSecond": "70",
92
- "notes": "OPTION_ENFORCE_EAGER=True",
93
- },
94
- {
95
- "quantization": "None",
96
- "container": "vLLM 0.6.4.post1",
97
- "status": "OK",
98
- "tokensPerSecond": "77",
99
- "notes": "--tensor-parallel-size 8",
100
- },
101
- {
102
- "quantization": "None",
103
- "container": "vLLM 0.6.4.post1",
104
- "status": "OK",
105
- "tokensPerSecond": "53",
106
- "notes": "--tensor-parallel-size 4",
107
- "gpuCount": "4",
108
- },
109
- {
110
- "quantization": "None",
111
- "container": "vLLM 0.6.4.post1",
112
- "status": "OK",
113
- "tokensPerSecond": "33",
114
- "notes": "--tensor-parallel-size 2 --gpu_memory-utilization 0.95",
115
- "gpuCount": "2",
116
- },
117
- ],
118
  },
119
  {
120
  "instanceType": "inf2.24xlarge",
@@ -206,5 +158,55 @@ results_arcee_supernova = {
206
  },
207
  ],
208
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  ],
210
  }
 
62
  },
63
  {
64
  "instanceType": "p5.48xlarge",
65
+ "quantization": "awq",
66
+ "container": "TGI 2.2.0",
67
+ "status": "OK",
68
+ "tokensPerSecond": "73",
69
+ "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  },
71
  {
72
  "instanceType": "inf2.24xlarge",
 
158
  },
159
  ],
160
  },
161
+ {
162
+ "instanceType": "p5.48xlarge",
163
+ "configurations": [
164
+ {
165
+ "quantization": "none",
166
+ "container": "TGI 2.2.0",
167
+ "status": "OK",
168
+ "tokensPerSecond": "58",
169
+ "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
170
+ },
171
+ {
172
+ "quantization": "none",
173
+ "container": "LMI 0.29+vLLM 0.5.5",
174
+ "status": "OK",
175
+ "tokensPerSecond": "70",
176
+ "notes": "OPTION_MAX_MODEL_LEN 128k",
177
+ },
178
+ {
179
+ "quantization": "none",
180
+ "container": "LMI 0.29+vLLM 0.5.5",
181
+ "status": "OK",
182
+ "tokensPerSecond": "70",
183
+ "notes": "OPTION_ENFORCE_EAGER=True",
184
+ },
185
+ ],
186
+ },
187
+ {
188
+ "instanceType": "p5.48xlarge",
189
+ "quantization": "None",
190
+ "container": "vLLM 0.6.4.post1",
191
+ "status": "N/A",
192
+ "tokensPerSecond": "77",
193
+ "notes": "--tensor-parallel-size 8",
194
+ },
195
+ {
196
+ "instanceType": "p5.48xlarge (4 GPUs)",
197
+ "quantization": "None",
198
+ "container": "vLLM 0.6.4.post1",
199
+ "status": "OK",
200
+ "tokensPerSecond": "53",
201
+ "notes": "--tensor-parallel-size 4",
202
+ },
203
+ {
204
+ "instanceType": "p5.48xlarge (2 GPUs)",
205
+ "quantization": "None",
206
+ "container": "vLLM 0.6.4.post1",
207
+ "status": "OK",
208
+ "tokensPerSecond": "xxx",
209
+ "notes": "--tensor-parallel-size 2",
210
+ },
211
  ],
212
  }