ibm
/

mayank-mishra committed on
Commit
e7ee55e
·
verified ·
1 Parent(s): c7d6e2f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +60 -40
README.md CHANGED
@@ -19,82 +19,102 @@ model-index:
19
  - task:
20
  type: text-generation
21
  dataset:
22
- type: bigcode/humanevalpack
23
- name: HumanEvalSynthesis (Average)
24
  metrics:
25
- - name: pass@1
26
- type: pass@1
27
- value: 51.4
28
  verified: false
29
  - task:
30
  type: text-generation
31
  dataset:
32
- type: bigcode/humanevalpack
33
- name: HumanEvalExplain (Average)
34
  metrics:
35
- - name: pass@1
36
- type: pass@1
37
- value: 38.9
38
  verified: false
39
  - task:
40
  type: text-generation
41
  dataset:
42
- type: bigcode/humanevalpack
43
- name: HumanEvalFix (Average)
44
  metrics:
45
- - name: pass@1
46
- type: pass@1
47
- value: 38.3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  verified: false
49
  - task:
50
  type: text-generation
51
  dataset:
52
- type: repoqa
53
- name: RepoQA (Python@16K)
54
  metrics:
55
- - name: pass@1 (thresh=0.5)
56
- type: pass@1 (thresh=0.5)
57
- value: 73.0
58
  verified: false
59
  - task:
60
  type: text-generation
61
  dataset:
62
- type: repoqa
63
- name: RepoQA (C++@16K)
64
  metrics:
65
- - name: pass@1 (thresh=0.5)
66
- type: pass@1 (thresh=0.5)
67
- value: 37.0
68
  verified: false
69
  - task:
70
  type: text-generation
71
  dataset:
72
- type: repoqa
73
- name: RepoQA (Java@16K)
74
  metrics:
75
- - name: pass@1 (thresh=0.5)
76
- type: pass@1 (thresh=0.5)
77
- value: 73.0
78
  verified: false
79
  - task:
80
  type: text-generation
81
  dataset:
82
- type: repoqa
83
- name: RepoQA (TypeScript@16K)
84
  metrics:
85
- - name: pass@1 (thresh=0.5)
86
- type: pass@1 (thresh=0.5)
87
- value: 62.0
88
  verified: false
89
  - task:
90
  type: text-generation
91
  dataset:
92
- type: repoqa
93
- name: RepoQA (Rust@16K)
94
  metrics:
95
- - name: pass@1 (thresh=0.5)
96
- type: pass@1 (thresh=0.5)
97
- value: 63.0
98
  verified: false
99
  ---
100
 
 
19
  - task:
20
  type: text-generation
21
  dataset:
22
+ type: lm-eval-harness
23
+ name: BoolQ
24
  metrics:
25
+ - name: accuracy
26
+ type: accuracy
27
+ value: 75
28
  verified: false
29
  - task:
30
  type: text-generation
31
  dataset:
32
+ type: lm-eval-harness
33
+ name: Hellaswag
34
  metrics:
35
+ - name: accuracy-norm
36
+ type: accuracy-norm
37
+ value: 74.2
38
  verified: false
39
  - task:
40
  type: text-generation
41
  dataset:
42
+ type: lm-eval-harness
43
+ name: OpenBookQA
44
  metrics:
45
+ - name: accuracy-norm
46
+ type: accuracy-norm
47
+ value: 41.2
48
+ verified: false
49
+ - task:
50
+ type: text-generation
51
+ dataset:
52
+ type: lm-eval-harness
53
+ name: PIQA
54
+ metrics:
55
+ - name: accuracy-norm
56
+ type: accuracy-norm
57
+ value: 79.9
58
+ verified: false
59
+ - task:
60
+ type: text-generation
61
+ dataset:
62
+ type: lm-eval-harness
63
+ name: Winogrande
64
+ metrics:
65
+ - name: accuracy-norm
66
+ type: accuracy-norm
67
+ value: 66.3
68
  verified: false
69
  - task:
70
  type: text-generation
71
  dataset:
72
+ type: lm-eval-harness
73
+ name: MMLU
74
  metrics:
75
+ - name: accuracy
76
+ type: accuracy
77
+ value: 44.3
78
  verified: false
79
  - task:
80
  type: text-generation
81
  dataset:
82
+ type: lm-eval-harness
83
+ name: GSM8k (5 shot)
84
  metrics:
85
+ - name: accuracy
86
+ type: accuracy
87
+ value: 35.9
88
  verified: false
89
  - task:
90
  type: text-generation
91
  dataset:
92
+ type: lm-eval-harness
93
+ name: math (4 shot)
94
  metrics:
95
+ - name: accuracy
96
+ type: accuracy
97
+ value: 14
98
  verified: false
99
  - task:
100
  type: text-generation
101
  dataset:
102
+ type: bigcode-eval
103
+ name: humaneval
104
  metrics:
105
+ - name: pass@1
106
+ type: pass@1
107
+ value: 21.9
108
  verified: false
109
  - task:
110
  type: text-generation
111
  dataset:
112
+ type: bigcode-eval
113
+ name: MBPP
114
  metrics:
115
+ - name: pass@1
116
+ type: pass@1
117
+ value: 28
118
  verified: false
119
  ---
120