# root
# add long_32k_eval
# dfdc6c0
import sys
import numpy as np
# Dataset names, in the order their columns appear in the printed table.
DATASETS = [
    "qmsum",
    "qasper",
    "quality",
    "musique",
    "hotpotqa",
    "multifieldqa_en",
]

# Independent list with the same names in the same order. Nothing in the
# visible part of this file reads it; kept in case other code does.
DATASETS1 = list(DATASETS)

# State for the system currently being read from the log:
#   outrow   - text that starts the system's output row
#   data2res - dataset name -> score
outrow = ""
data2res = {}
def average(data2res):
    """Return the arithmetic mean of the values stored in ``data2res``.

    Args:
        data2res: Mapping whose values are numbers; the keys are ignored.

    Returns:
        The sum of the values divided by the number of entries, as a float.

    Raises:
        AssertionError: If ``data2res`` has no entries.
    """
    total = 0.0
    count = 0.0
    # Accumulate left to right, starting from 0.0, so the floating-point
    # result does not depend on how a library routine would sum the values.
    for score in data2res.values():
        total += score
        count += 1
    assert count > 0.0
    return total / count
def collect(value_list, outrow, data2res):
    """Print one space-separated result row.

    The row is ``outrow`` followed by the mean of ``value_list``, the mean of
    the ``data2res`` values (both rounded to 4 decimals), and then one cell
    per name in ``DATASETS``; a dataset missing from ``data2res`` is shown
    as ``NA``.

    Args:
        value_list: Scores averaged with ``np.mean`` for the first column.
        outrow: Text placed at the start of the row. Only the local value is
            used, so the caller's string is not modified.
        data2res: Mapping from dataset name to score.

    Returns:
        None. The row is written to standard output.
    """
    mean_of_values = round(np.mean(value_list), 4)
    mean_of_datasets = round(average(data2res), 4)

    cells = [str(mean_of_values), str(mean_of_datasets)]
    for dataset in DATASETS:
        if dataset in data2res:
            cells.append(str(data2res[dataset]))
        else:
            cells.append("NA")

    # Surrounding whitespace is trimmed from the finished row before printing.
    print((outrow + " ".join(cells)).strip())
# Table header: the system name, two average columns, then one column per
# dataset. Both averages are labelled "avg6"; collect() fills the first with
# the mean of value_list and the second with the mean of the data2res values.
print('system avg6 avg6 ' + ' '.join(DATASETS))

# NOTE(review): the indentation of this script was lost. The nesting below is
# reconstructed from the control flow, and the per-system reset is assumed to
# be unconditional -- confirm against the original file.
value_list = []
for aline in sys.stdin:
    aline = aline.strip()
    # Only lines containing the marker text are relevant; skip everything else.
    if 'final display' not in aline:
        continue

    if '-baseline-' in aline or '-ret-' in aline:
        # This line starts a new system. Print the previous system's row
        # first, provided it has both a name and at least one score.
        if outrow and data2res:
            collect(value_list, outrow, data2res)
        data2res = {}
        value_list = []
        # The row label is the line with dashes and the marker text removed.
        aline2 = aline.replace('-', '').replace('final display', '')
        outrow = aline2 + ' '
        continue

    # Score line: the third space-separated field is the dataset name and the
    # remaining fields are its score(s).
    cols = aline.split(' ')
    adata = cols[2]
    scores = cols[3:]
    if len(scores) == 3:
        # for R1/R2/RL: collapse the three scores into one geometric mean
        first, second, third = (float(item) for item in scores)
        geo_mean = (first * second * third) ** (1.0 / 3.0)
        ares = round(geo_mean, 4)
    else:
        # A single score is used as is. Any other field count yields text
        # that float() rejects with ValueError.
        ares = float('/'.join(scores))
    data2res[adata] = ares
    value_list.append(ares)

# Print the last system's row. Skipped when no scores were read, because
# collect() cannot average an empty result set.
if data2res:
    collect(value_list, outrow, data2res)