|
import sys |
|
import numpy as np |
|
|
|
# Dataset names, in the order their columns appear in the printed table.
# DATASETS1 currently lists the same names as DATASETS; it is kept as a
# separate list object so existing references to either name keep working.
DATASETS1 = [
    "qmsum",
    "qasper",
    "quality",
    "musique",
    "hotpotqa",
    "multifieldqa_en",
]

DATASETS = [
    "qmsum",
    "qasper",
    "quality",
    "musique",
    "hotpotqa",
    "multifieldqa_en",
]

# Accumulators for the system currently being parsed:
#   outrow   - text prefix of the row (built from the 'final display' lines)
#   data2res - dataset name -> score for that system
outrow = ''
data2res = {}
|
|
|
def average(data2res):
    """Return the arithmetic mean of the values stored in ``data2res``.

    Args:
        data2res: mapping whose values are numbers; the keys are ignored.

    Returns:
        The mean of the values as a ``float``.

    Raises:
        AssertionError: if ``data2res`` is empty.
    """
    if not data2res:
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError("cannot average an empty result mapping")
    # Start from 0.0 so the result is a float even for integer values.
    return sum(data2res.values(), 0.0) / len(data2res)
|
|
|
def collect(value_list, outrow, data2res):
    """Print one space-separated result row.

    The printed row consists of, in order: ``outrow``; the mean of
    ``value_list`` rounded to 4 decimals; the mean of the values of
    ``data2res`` (via ``average``) rounded to 4 decimals; and, for every name
    in ``DATASETS``, the value stored in ``data2res`` or ``"NA"`` when the
    name is absent. Leading/trailing whitespace is stripped before printing.

    Args:
        value_list: sequence of numeric scores.
        outrow: string prefix of the row.
        data2res: mapping from dataset name to score.

    Returns:
        None. The row is written to standard output.

    Raises:
        Whatever ``average`` raises for an empty ``data2res``.
    """
    mean_of_values = round(np.mean(value_list), 4)
    mean_of_datasets = round(average(data2res), 4)

    fields = [str(mean_of_values), str(mean_of_datasets)]
    fields.extend(str(data2res.get(name, "NA")) for name in DATASETS)

    # ``outrow`` is used verbatim as the prefix (the caller supplies any
    # separator); the final strip() removes surrounding whitespace only.
    print((outrow + " ".join(fields)).strip())
|
|
|
# Header row: a system label, two average columns and one column per dataset,
# matching the rows printed by collect().
print('system avg6 avg6 ' + ' '.join(DATASETS))

# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been lost; confirm that the reset of the accumulators
# belongs under the '-baseline-' / '-ret-' check.
value_list = []
for aline in sys.stdin:
    aline = aline.strip()
    if not aline:
        # A blank line holds no result; indexing cols[2] below would raise
        # IndexError on it.
        continue

    if 'final display' in aline:
        if '-baseline-' in aline or '-ret-' in aline:
            # Start of a new system: emit the previous one (if it has any
            # results) and reset the accumulators.
            if outrow and data2res:
                collect(value_list, outrow, data2res)

            outrow = ""
            data2res = {}
            value_list = []

        # Append the line, minus dashes and the 'final display' marker, to
        # the row prefix.
        outrow += aline.replace('-', '').replace('final display', '') + ' '
        continue

    # Result line: the third space-separated column is used as the dataset
    # name and the remaining columns as its score(s).
    cols = aline.split(' ')
    adata = cols[2]
    scores = cols[3:]

    if len(scores) == 3:
        # Three scores are collapsed into their geometric mean.
        first, second, third = (float(item) for item in scores)
        value = round((first * second * third) ** (1.0 / 3.0), 4)
    else:
        # Any other count is joined with '/' and parsed as one float, so a
        # count other than one raises ValueError.
        value = float('/'.join(scores))

    data2res[adata] = value
    value_list.append(value)

# Emit the last system. Skipped when nothing was parsed, because collect()
# cannot average an empty result mapping.
if data2res:
    collect(value_list, outrow, data2res)
|
|
|
|