ArthurChen189 commited on
Commit
62977bb
1 Parent(s): 30ac9ed

upload pyserini

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. pyserini/2cr/_base.py +95 -0
  2. pyserini/2cr/miracl.py +447 -0
  3. pyserini/2cr/miracl.yaml +1180 -0
  4. pyserini/2cr/miracl_html.template +256 -0
  5. pyserini/2cr/miracl_html_table.template +35 -0
  6. pyserini/2cr/miracl_html_table_row.template +336 -0
  7. pyserini/2cr/mrtydi.py +330 -0
  8. pyserini/2cr/mrtydi.yaml +890 -0
  9. pyserini/2cr/mrtydi_html.template +256 -0
  10. pyserini/2cr/mrtydi_html_table.template +28 -0
  11. pyserini/2cr/mrtydi_html_table_row.template +212 -0
  12. pyserini/2cr/msmarco-v1-doc.yaml +539 -0
  13. pyserini/2cr/msmarco-v1-passage.yaml +764 -0
  14. pyserini/2cr/msmarco-v2-doc.yaml +287 -0
  15. pyserini/2cr/msmarco-v2-passage.yaml +287 -0
  16. pyserini/2cr/msmarco.py +600 -0
  17. pyserini/2cr/msmarco_html_row_v1.template +81 -0
  18. pyserini/2cr/msmarco_html_row_v2.template +82 -0
  19. pyserini/2cr/msmarco_html_v1_doc.template +296 -0
  20. pyserini/2cr/msmarco_html_v1_passage.template +325 -0
  21. pyserini/2cr/msmarco_html_v2_doc.template +292 -0
  22. pyserini/2cr/msmarco_html_v2_passage.template +292 -0
  23. pyserini/__init__.py +1 -0
  24. pyserini/__pycache__/__init__.cpython-310.pyc +0 -0
  25. pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc +0 -0
  26. pyserini/__pycache__/encoded_query_info.cpython-310.pyc +0 -0
  27. pyserini/__pycache__/evaluate_script_info.cpython-310.pyc +0 -0
  28. pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc +0 -0
  29. pyserini/__pycache__/pyclass.cpython-310.pyc +0 -0
  30. pyserini/__pycache__/setup.cpython-310.pyc +0 -0
  31. pyserini/__pycache__/util.cpython-310.pyc +0 -0
  32. pyserini/analysis/__init__.py +19 -0
  33. pyserini/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
  34. pyserini/analysis/__pycache__/_base.cpython-310.pyc +0 -0
  35. pyserini/analysis/_base.py +166 -0
  36. pyserini/collection/__init__.py +20 -0
  37. pyserini/collection/_base.py +153 -0
  38. pyserini/collection/_collection_support.py +78 -0
  39. pyserini/demo/acl.py +124 -0
  40. pyserini/demo/dpr.py +105 -0
  41. pyserini/demo/miracl.py +149 -0
  42. pyserini/demo/msmarco.py +118 -0
  43. pyserini/demo/templates/acl.html +74 -0
  44. pyserini/demo/templates/assets/acl-logo.svg +10 -0
  45. pyserini/demo/templates/miracl.html +127 -0
  46. pyserini/dsearch.py +46 -0
  47. pyserini/encode/__init__.py +28 -0
  48. pyserini/encode/__main__.py +147 -0
  49. pyserini/encode/__pycache__/__init__.cpython-310.pyc +0 -0
  50. pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc +0 -0
pyserini/2cr/_base.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import os
18
+ import subprocess
19
+
20
# Status tags printed next to each regression score.  The failure tag and
# the "close enough" tag are wrapped in ANSI colour escape sequences
# (codes 91 and 94 respectively); the success tag is plain text.
fail_str = '\033[91m' '[FAIL]' '\033[0m'
ok_str = '[OK]'
okish_str = '\033[94m' '[OKish]' '\033[0m'
23
+
24
+
25
def run_command(cmd):
    """Run a command line and capture what it prints.

    The command string is tokenised on whitespace only (no shell, no quote
    handling), so arguments must not contain embedded spaces.

    Args:
        cmd: command line as a single string

    Returns:
        (stdout, stderr): both streams decoded as UTF-8 text
    """
    completed = subprocess.run(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return completed.stdout.decode('utf-8'), completed.stderr.decode('utf-8')
32
+
33
+
34
def run_eval_and_return_metric(metric, eval_key, defs, runfile):
    """Run trec_eval on a run file and return the aggregate score.

    Args:
        metric: metric name (accepted for the caller's convenience; not used here)
        eval_key: qrels key or path handed to trec_eval
        defs: trec_eval options selecting the metric (e.g., '-c -m recall.100')
        runfile: path of the run file to evaluate

    Returns:
        score: the value on the first tab-separated 'all' line, rounded to
            four decimals; 0.0 when no such line is found in the output
    """
    eval_stdout, _ = run_command(f'python -m pyserini.eval.trec_eval {defs} {eval_key} {runfile}')

    # The summary row has exactly three tab-separated fields with 'all' in
    # the middle; anything else in the output is ignored.
    for fields in (row.split('\t') for row in eval_stdout.split('\n')):
        if len(fields) == 3 and fields[1] == 'all':
            return round(float(fields[2]), 4)

    return 0.0
44
+
45
+
46
def run_dpr_retrieval_eval_and_return_metric(defs, json_file):
    """Generate dpr retrieval evaluation scores

    Args:
        defs: topk definitions (e.g., '--topk 5 20')
        json_file: dpr retrieval json file

    Returns:
        topk: a dictionary of topk scores (e.g., {"Top5": <score>}), each
            expressed as a percentage rounded to four decimals
    """
    eval_stdout, _ = run_command(
        f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {json_file} {defs} ')

    topk = {}
    for row in eval_stdout.split('\n'):
        fields = row.split('\t')
        if len(fields) != 2 or 'accuracy' not in fields[1]:
            continue
        # The first 10 characters of the second field are skipped before
        # parsing the number (presumably an 'accuracy: ' prefix -- confirm
        # against the evaluator's output format).
        topk[fields[0]] = round(float(fields[1][10:]) * 100, 4)
    return topk
64
+
65
+
66
def convert_trec_run_to_dpr_retrieval_json(topics,index,runfile,output):
    """Convert trec runfile to dpr retrieval json file

    Args:
        topics: topics field
        index: index field
        runfile: input runfile
        output: output jsonfile

    Returns:
        exit status: value returned by os.system for the conversion command
    """
    pieces = [
        'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run',
        f'--topics {topics}',
        f'--index {index}',
        f'--input {runfile}',
        f'--output {output}',
    ]
    return os.system(' '.join(pieces))
80
+
81
+
82
def run_fusion(run_ls, output, k):
    """run fusion command and return status code

    Args:
        run_ls: a list of runfile paths
        output: output path
        k: topk value

    Returns:
        status code: value returned by os.system for the fusion command
    """
    joined_runs = ' '.join(run_ls)
    return os.system(f'python -m pyserini.fusion --runs {joined_runs} --output {output} --k {k}')
pyserini/2cr/miracl.py ADDED
@@ -0,0 +1,447 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import argparse
18
+ import math
19
+ import os
20
+ import sys
21
+ import time
22
+ import subprocess
23
+ import pkg_resources
24
+ from collections import defaultdict, OrderedDict
25
+ from string import Template
26
+
27
+ import yaml
28
+
29
+ from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
30
+
31
# Each entry is [two-letter language code, English language name].  The
# order of this list fixes the column order of the printed result tables.
languages = [list(pair) for pair in (
    ('ar', 'arabic'),
    ('bn', 'bengali'),
    ('en', 'english'),
    ('es', 'spanish'),
    ('fa', 'persian'),
    ('fi', 'finnish'),
    ('fr', 'french'),
    ('hi', 'hindi'),
    ('id', 'indonesian'),
    ('ja', 'japanese'),
    ('ko', 'korean'),
    ('ru', 'russian'),
    ('sw', 'swahili'),
    ('te', 'telugu'),
    ('th', 'thai'),
    ('zh', 'chinese'),
    ('de', 'german'),
    ('yo', 'yoruba'),
)]

# Condition key -> human-readable label used in the HTML report.  Insertion
# order is the row order of the report.
html_display = OrderedDict([
    ('bm25', 'BM25'),
    ('mdpr-tied-pft-msmarco', 'mDPR (tied encoders), pre-FT w/ MS MARCO'),
    ('mdpr-tied-pft-msmarco-ft-all', 'mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi'),
    ('bm25-mdpr-tied-pft-msmarco-hybrid', 'Hybrid of `bm25` and `mdpr-tied-pft-msmarco`'),
    ('mdpr-tied-pft-msmarco-ft-miracl', 'mDPR (tied encoders), pre-FT w/ MS MARCO then in-lang FT w/ MIRACL'),
    ('mcontriever-tied-pft-msmarco', 'mContriever (tied encoders), pre-FT w/ MS MARCO'),
])

# Condition keys, in report order.
models = list(html_display.keys())

# Metric name -> trec_eval command-line options that compute it.
trec_eval_metric_definitions = {
    'nDCG@10': '-c -M 100 -m ndcg_cut.10',
    'R@100': '-c -m recall.100',
}
66
+
67
+
68
def format_run_command(raw):
    """Insert shell line continuations into a retrieval command for display.

    Args:
        raw: the command as a single-line string

    Returns:
        the command with a backslash-newline inserted before (or, for
        '--threads 12', after) selected options
    """
    # Applied strictly in this order; note that '--lang' also matches
    # '--language', and that only the literal '--threads 12' is rewritten.
    rewrites = (
        ('--lang', '\\\n --lang'),
        ('--encoder', '\\\n --encoder'),
        ('--topics', '\\\n --topics'),
        ('--index', '\\\n --index'),
        ('--output ', '\\\n --output '),
        ('--runs', '\\\n --runs '),
        ('--batch ', '\\\n --batch '),
        ('--threads 12', '--threads 12 \\\n '),
    )
    formatted = raw
    for needle, replacement in rewrites:
        formatted = formatted.replace(needle, replacement)
    return formatted
77
+
78
+
79
def format_eval_command(raw):
    """Insert shell line continuations into a trec_eval command for display.

    A backslash-newline is placed before every '-c ' and before every
    occurrence of the command's last whitespace-separated token (the run
    file in the commands built by this module).

    Args:
        raw: the command as a single-line string; must contain at least one token

    Returns:
        the reformatted command
    """
    last_token = raw.split()[-1]
    with_option_break = raw.replace('-c ', '\\\n -c ')
    return with_option_break.replace(last_token, f'\\\n {last_token}')
82
+
83
+
84
def read_file(f):
    """Return the full text content of a file.

    Args:
        f: path of the file to read (opened in text mode)

    Returns:
        text: everything in the file as one string
    """
    # The context manager closes the handle even if read() raises, which
    # the previous explicit open()/close() pair did not guarantee.
    with open(f, 'r') as fin:
        return fin.read()
90
+
91
+
92
def list_conditions():
    """Print every condition key, then every language code, one per line."""
    print('Conditions:\n-----------')
    for condition in html_display:
        print(condition)

    print('\nLanguages\n---------')
    for code, _name in languages:
        print(code)
99
+
100
+
101
def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric):
    """Render one HTML table row per model for the given split and metric.

    Args:
        table: nested mapping table['<model>.<lang>'][split][metric] -> score
        row_template: string.Template source for a single row
        commands: mapping '<model>.<lang>' -> formatted retrieval command
        eval_commands: nested mapping eval_commands['<model>.<lang>'][metric]
            -> formatted evaluation command
        table_id: number of the table the rows belong to
        split: split whose scores are shown (e.g., 'dev')
        metric: metric whose scores are shown (e.g., 'nDCG@10')

    Returns:
        html_rows: list of rendered row strings, one per entry of `models`
    """
    template = Template(row_template)
    html_rows = []

    for row_cnt, model in enumerate(models, start=1):
        keys = {code: f'{model}.{code}' for code, _ in languages}
        scores = {code: table[keys[code]][split][metric] for code, _ in languages}

        # A score of exactly 0 means "no result for this language", so it is
        # left out of the average.  When no language has a result the
        # average is reported as 0.0 (rendered as '--' below) instead of
        # dividing by zero.
        used_langs = sum(1 for score in scores.values() if score != 0)
        avg = sum(scores.values()) / used_langs if used_langs else 0.0

        fields = dict(table_cnt=table_id,
                      row_cnt=row_cnt,
                      model=html_display[model],
                      avg=f'{avg:.3f}')
        # The template numbers its command placeholders cmd1..cmdN and
        # eval_cmd1..eval_cmdN following the order of `languages`.
        for position, (code, _) in enumerate(languages, start=1):
            fields[code] = f'{scores[code]:.3f}'
            fields[f'cmd{position}'] = f'{commands[keys[code]]}'
            fields[f'eval_cmd{position}'] = f'{eval_commands[keys[code]][metric]}'

        rendered = template.substitute(**fields)
        # Missing scores show up as '--'.  The replacement runs over the
        # whole rendered row, so any literal '0.000' in it is affected.
        html_rows.append(rendered.replace("0.000", "--"))

    return html_rows
199
+
200
+
201
def print_results(table, metric, split):
    """Print a plain-text score matrix (models x languages) to stdout.

    Args:
        table: nested mapping table['<model>.<lang>'][split][metric] -> score
        metric: metric to display
        split: split to display
    """
    print(f'Metric = {metric}, Split = {split}')

    # Header line: padding where the model names go, then the language codes.
    header_cells = ''.join(f'{lang[0]:3} ' for lang in languages)
    print(' ' * 35 + header_cells)

    for model in models:
        score_cells = ''.join(
            f'{table[f"{model}.{lang[0]}"][split][metric]:7.3f}' for lang in languages)
        print(f'{model:33}' + score_cells)
    print('')
214
+
215
+
216
def extract_topic_fn_from_cmd(cmd):
    """Return the token that follows '--topics' in a command string.

    Args:
        cmd: command line as a single string

    Returns:
        the argument given to the first '--topics' option

    Raises:
        ValueError: if the command has no '--topics' token
        IndexError: if '--topics' is the last token
    """
    tokens = cmd.split()
    return tokens[tokens.index('--topics') + 1]
220
+
221
+
222
def generate_report(args):
    """Write the MIRACL regression matrix as an HTML report.

    Expected scores and commands are read from the packaged miracl.yaml; no
    retrieval or evaluation is executed.

    Args:
        args: parsed command-line arguments; `args.directory` prefixes the
            run-file names shown in the commands and `args.output` is the
            path the HTML is written to

    Raises:
        ValueError: if a command template lacks one of the placeholders it
            is expected to contain
    """
    scores = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    run_cmds = defaultdict(lambda: '')
    eval_cmds = defaultdict(lambda: defaultdict(lambda: ''))

    def load_resource(resource):
        return read_file(pkg_resources.resource_filename(__name__, resource))

    html_template = load_resource('miracl_html.template')
    table_template = load_resource('miracl_html_table.template')
    row_template = load_resource('miracl_html_table_row.template')

    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        eval_key = condition['eval_key']
        cmd_template = condition['command']
        cmd_tokens = cmd_template.split()
        lang = name.split('.')[-1]
        is_hybrid_run = 'hybrid' in name
        # Hybrid (fusion) commands carry their depth in '--k', all others in '--hits'.
        depth_flag = '--k' if is_hybrid_run else '--hits'

        for split_entry in condition['splits']:
            split = split_entry['split']
            hits = int(cmd_tokens[cmd_tokens.index(depth_flag) + 1])

            runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.txt')
            if is_hybrid_run:
                expected_args = dict(
                    output=runfile,
                    bm25_output=os.path.join(
                        args.directory, f'run.miracl.bm25.{lang}.{split}.top{hits}.txt'),
                    mdpr_output=os.path.join(
                        args.directory, f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt'))
            else:
                expected_args = dict(split=split, output=runfile)

            # Every expected placeholder must appear as $name or ${name}.
            if any(f"${k}" not in cmd_template and f"${{{k}}}" not in cmd_template
                   for k in expected_args):
                raise ValueError(f"Not all arguements {list(expected_args)} detected from inputs: {cmd_template}.")
            run_cmds[name] = format_run_command(Template(cmd_template).substitute(**expected_args))

            for expected in split_entry['scores']:
                for metric in expected:
                    if str(expected[metric])[-1] == "5":
                        # Nudge values ending in 5 upward so that three-decimal
                        # formatting rounds them up (e.g., 0.8885 -> 0.889
                        # rather than 0.888).
                        expected[metric] += 1e-5
                    scores[name][split][metric] = expected[metric]

                    eval_cmd = (f'python -m pyserini.eval.trec_eval '
                                f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}')
                    eval_cmds[name][metric] = format_eval_command(eval_cmd)

    split = 'dev'

    # One table per metric, both over the dev queries: table 1 is nDCG@10,
    # table 2 is R@100 (labelled 'Recall@100' in the report).
    tables_html = []
    for table_id, (metric, label) in enumerate((('nDCG@10', 'nDCG@10'), ('R@100', 'Recall@100')), start=1):
        html_rows = generate_table_rows(scores, row_template, run_cmds, eval_cmds, table_id, split, metric)
        tables_html.append(
            Template(table_template).substitute(desc=f'{label}, {split} queries', rows='\n'.join(html_rows)))

    with open(args.output, 'w') as out:
        out.write(Template(html_template).substitute(title='MIRACL', tables=' '.join(tables_html)))
292
+
293
+
294
def run_conditions(args):
    """Run the selected MIRACL conditions and check scores against miracl.yaml.

    For every matching condition and split, the retrieval command is run
    unless its run file already exists or `args.dry_run` is set.  Unless
    `args.skip_eval` is set, each metric is then evaluated and compared
    with the expected value.  A score matrix is printed at the end.

    Args:
        args: parsed command-line arguments (reads `condition`, `language`,
            `all`, `directory`, `dry_run`, `skip_eval`, `display_commands`)
    """
    if args.condition == 'mdpr-tied-pft-msmarco-ft-miracl' and args.language in ['de', 'yo']:
        print('MIRACL de and yo datasets do not have train splits to finetune with')
        return

    start = time.time()

    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

    # Known-flaky (condition, split, metric) combinations; a metric of None
    # matches any metric.  These pass as "OKish" within an absolute
    # tolerance of 2e-4.
    flaky_cases = (
        ('mdpr-tied-pft-msmarco.hi', 'train', None),
        ('mdpr-tied-pft-msmarco-ft-all.ru', 'dev', 'nDCG@10'),
        ('bm25-mdpr-tied-pft-msmarco-hybrid.te', 'train', 'nDCG@10'),
        ('bm25-mdpr-tied-pft-msmarco-hybrid.zh', 'dev', 'nDCG@10'),
    )

    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        encoder = name.split('.')[0]
        lang = name.split('.')[-1]
        if not args.all:
            if args.condition != encoder:
                continue
            if args.language and args.language != lang:
                continue
        eval_key = condition['eval_key']
        cmd_template = condition['command']
        cmd_tokens = cmd_template.split()

        print(f'condition {name}:')
        is_hybrid_run = 'hybrid' in name
        # Hybrid (fusion) commands carry their depth in '--k', all others in '--hits'.
        depth_flag = '--k' if is_hybrid_run else '--hits'

        for split_entry in condition['splits']:
            split = split_entry['split']
            hits = int(cmd_tokens[cmd_tokens.index(depth_flag) + 1])

            print(f' - split: {split}')

            runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.top{hits}.txt')
            if is_hybrid_run:
                # Fusion needs the BM25 and mDPR runs produced beforehand.
                bm25_output = os.path.join(args.directory,
                                           f'run.miracl.bm25.{lang}.{split}.top{hits}.txt')
                mdpr_output = os.path.join(args.directory,
                                           f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt')
                if not os.path.exists(bm25_output):
                    print(f'Missing BM25 file: {bm25_output}')
                    continue
                if not os.path.exists(mdpr_output):
                    print(f'Missing mDPR file: {mdpr_output}')
                    continue
                cmd = Template(cmd_template).substitute(split=split, output=runfile, bm25_output=bm25_output,
                                                        mdpr_output=mdpr_output)
            else:
                cmd = Template(cmd_template).substitute(split=split, output=runfile)

            # The yaml file writes topics as e.g. '--topics miracl-v1.0-ar-${split}'.
            # That symbol resolves for the dev split because those topics ship with
            # Anserini/Pyserini.  For the train split it is mapped to a file under
            # tools/topics-and-qrels/, assuming the developer has cloned the miracl
            # repo and placed the topics there.
            if split == 'train':
                cmd = cmd.replace(f'--topics miracl-v1.0-{lang}-{split}',
                                  f'--topics tools/topics-and-qrels/topics.miracl-v1.0-{lang}-{split}.tsv')

            if args.display_commands:
                print(f'\n```bash\n{format_run_command(cmd)}\n```\n')

            if not os.path.exists(runfile) and not args.dry_run:
                rtn = subprocess.run(cmd.split(), capture_output=True)
                stderr = rtn.stderr.decode()
                if '--topics' in cmd:
                    topic_fn = extract_topic_fn_from_cmd(cmd)
                    if f'ValueError: Topic {topic_fn} Not Found' in stderr:
                        print(f'Skipping {topic_fn}: file not found.')
                        continue

            # The training qrels are not bundled with Anserini/Pyserini either, so
            # they are likewise read from tools/topics-and-qrels/.
            if split == 'train':
                qrels = f'tools/topics-and-qrels/qrels.{eval_key}-train.tsv'
            else:
                qrels = f'{eval_key}-{split}'

            for expected in split_entry['scores']:
                for metric in expected:
                    if args.skip_eval:
                        table[name][split][metric] = expected[metric]
                        continue

                    score = float(run_eval_and_return_metric(metric, qrels,
                                                             trec_eval_metric_definitions[metric], runfile))
                    is_flaky = any(name == flaky_name and split == flaky_split
                                   and (flaky_metric is None or metric == flaky_metric)
                                   for flaky_name, flaky_split, flaky_metric in flaky_cases)
                    if math.isclose(score, float(expected[metric])):
                        result_str = ok_str
                    elif is_flaky and math.isclose(score, float(expected[metric]), abs_tol=2e-4):
                        result_str = okish_str
                    else:
                        result_str = fail_str + f' expected {expected[metric]:.4f}'
                    print(f' {metric:7}: {score:.4f} {result_str}')
                    table[name][split][metric] = score

        print('')

    for metric in ['nDCG@10', 'R@100']:
        for split in ['dev', 'train']:
            print_results(table, metric, split)

    end = time.time()
    print(f'Total elapsed time: {end - start:.0f}s')
411
+
412
+
413
if __name__ == '__main__':
    # Command-line entry point: exactly one of "list conditions", "generate
    # report" or "run conditions" is carried out per invocation.
    parser = argparse.ArgumentParser(description='Generate regression matrix for MIRACL.')
    parser.add_argument('--condition', type=str, help='Condition to run')
    # To list all conditions
    parser.add_argument('--list-conditions', action='store_true', help='List available conditions.')
    # For generating reports
    parser.add_argument('--generate-report', action='store_true', help='Generate report.')
    parser.add_argument('--output', type=str, help='File to store report.')
    # For actually running the experimental conditions
    parser.add_argument('--all', action='store_true', help='Run using all languages.')
    parser.add_argument('--language', type=str, help='Language to run.')
    parser.add_argument('--directory', type=str, help='Base directory.', default='')
    parser.add_argument('--dry-run', action='store_true', help='Print out commands but do not execute.')
    parser.add_argument('--skip-eval', action='store_true', help='Skip running trec_eval.')
    parser.add_argument('--display-commands', action='store_true', help='Display command.')
    args = parser.parse_args()

    if args.list_conditions:
        list_conditions()
        sys.exit()
    elif args.generate_report:
        if args.output:
            generate_report(args)
        else:
            print('Must specify report filename with --output.')
        sys.exit()
    elif args.all and (args.condition or args.language):
        # --all is mutually exclusive with an explicit condition or language.
        print('Specifying --all will run all conditions and languages')
        sys.exit()
    else:
        run_conditions(args)
pyserini/2cr/miracl.yaml ADDED
@@ -0,0 +1,1180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conditions:
2
+ # BM25
3
+ - name: bm25.ar
4
+ eval_key: miracl-v1.0-ar
5
+ command: python -m pyserini.search.lucene --language ar --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar --output $output --batch 128 --threads 16 --bm25 --hits 1000
6
+ splits:
7
+ - split: train
8
+ scores:
9
+ - nDCG@10: 0.4434
10
+ R@100: 0.8562
11
+ - split: dev
12
+ scores:
13
+ - nDCG@10: 0.4809
14
+ R@100: 0.8885
15
+ - name: bm25.bn
16
+ eval_key: miracl-v1.0-bn
17
+ command: python -m pyserini.search.lucene --language bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn --output $output --batch 128 --threads 16 --bm25 --hits 1000
18
+ splits:
19
+ - split: train
20
+ scores:
21
+ - nDCG@10: 0.5122
22
+ R@100: 0.8934
23
+ - split: dev
24
+ scores:
25
+ - nDCG@10: 0.5079
26
+ R@100: 0.9088
27
+ - name: bm25.en
28
+ eval_key: miracl-v1.0-en
29
+ command: python -m pyserini.search.lucene --language en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en --output $output --batch 128 --threads 16 --bm25 --hits 1000
30
+ splits:
31
+ - split: train
32
+ scores:
33
+ - nDCG@10: 0.3415
34
+ R@100: 0.7928
35
+ - split: dev
36
+ scores:
37
+ - nDCG@10: 0.3506
38
+ R@100: 0.8190
39
+ - name: bm25.es
40
+ eval_key: miracl-v1.0-es
41
+ command: python -m pyserini.search.lucene --language es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es --output $output --batch 128 --threads 16 --bm25 --hits 1000
42
+ splits:
43
+ - split: train
44
+ scores:
45
+ - nDCG@10: 0.3030
46
+ R@100: 0.7020
47
+ - split: dev
48
+ scores:
49
+ - nDCG@10: 0.3193
50
+ R@100: 0.7018
51
+ - name: bm25.fa
52
+ eval_key: miracl-v1.0-fa
53
+ command: python -m pyserini.search.lucene --language fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa --output $output --batch 128 --threads 16 --bm25 --hits 1000
54
+ splits:
55
+ - split: train
56
+ scores:
57
+ - nDCG@10: 0.3270
58
+ R@100: 0.7139
59
+ - split: dev
60
+ scores:
61
+ - nDCG@10: 0.3334
62
+ R@100: 0.7306
63
+ - name: bm25.fi
64
+ eval_key: miracl-v1.0-fi
65
+ command: python -m pyserini.search.lucene --language fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi --output $output --batch 128 --threads 16 --bm25 --hits 1000
66
+ splits:
67
+ - split: train
68
+ scores:
69
+ - nDCG@10: 0.5106
70
+ R@100: 0.8471
71
+ - split: dev
72
+ scores:
73
+ - nDCG@10: 0.5513
74
+ R@100: 0.8910
75
+ - name: bm25.fr
76
+ eval_key: miracl-v1.0-fr
77
+ command: python -m pyserini.search.lucene --language fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr --output $output --batch 128 --threads 16 --bm25 --hits 1000
78
+ splits:
79
+ - split: train
80
+ scores:
81
+ - nDCG@10: 0.2152
82
+ R@100: 0.6601
83
+ - split: dev
84
+ scores:
85
+ - nDCG@10: 0.1832
86
+ R@100: 0.6528
87
+ - name: bm25.hi
88
+ eval_key: miracl-v1.0-hi
89
+ command: python -m pyserini.search.lucene --language hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi --output $output --batch 128 --threads 16 --bm25 --hits 1000
90
+ splits:
91
+ - split: train
92
+ scores:
93
+ - nDCG@10: 0.4745
94
+ R@100: 0.9016
95
+ - split: dev
96
+ scores:
97
+ - nDCG@10: 0.4578
98
+ R@100: 0.8679
99
+ - name: bm25.id
100
+ eval_key: miracl-v1.0-id
101
+ command: python -m pyserini.search.lucene --language id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id --output $output --batch 128 --threads 16 --bm25 --hits 1000
102
+ splits:
103
+ - split: train
104
+ scores:
105
+ - nDCG@10: 0.4844
106
+ R@100: 0.9234
107
+ - split: dev
108
+ scores:
109
+ - nDCG@10: 0.4486
110
+ R@100: 0.9041
111
+ - name: bm25.ja
112
+ eval_key: miracl-v1.0-ja
113
+ command: python -m pyserini.search.lucene --language ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja --output $output --batch 128 --threads 16 --bm25 --hits 1000
114
+ splits:
115
+ - split: train
116
+ scores:
117
+ - nDCG@10: 0.3796
118
+ R@100: 0.8225
119
+ - split: dev
120
+ scores:
121
+ - nDCG@10: 0.3689
122
+ R@100: 0.8048
123
+ - name: bm25.ko
124
+ eval_key: miracl-v1.0-ko
125
+ command: python -m pyserini.search.lucene --language ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko --output $output --batch 128 --threads 16 --bm25 --hits 1000
126
+ splits:
127
+ - split: train
128
+ scores:
129
+ - nDCG@10: 0.4279
130
+ R@100: 0.7572
131
+ - split: dev
132
+ scores:
133
+ - nDCG@10: 0.4190
134
+ R@100: 0.7831
135
+ - name: bm25.ru
136
+ eval_key: miracl-v1.0-ru
137
+ command: python -m pyserini.search.lucene --language ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru --output $output --batch 128 --threads 16 --bm25 --hits 1000
138
+ splits:
139
+ - split: train
140
+ scores:
141
+ - nDCG@10: 0.3153
142
+ R@100: 0.6464
143
+ - split: dev
144
+ scores:
145
+ - nDCG@10: 0.3342
146
+ R@100: 0.6614
147
+ - name: bm25.sw
148
+ eval_key: miracl-v1.0-sw
149
+ command: python -m pyserini.search.lucene --language sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw --output $output --batch 128 --threads 16 --bm25 --hits 1000
150
+ splits:
151
+ - split: train
152
+ scores:
153
+ - nDCG@10: 0.3356
154
+ R@100: 0.6499
155
+ - split: dev
156
+ scores:
157
+ - nDCG@10: 0.3826
158
+ R@100: 0.7008
159
+ - name: bm25.te
160
+ eval_key: miracl-v1.0-te
161
+ command: python -m pyserini.search.lucene --language te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te --output $output --batch 128 --threads 16 --bm25 --hits 1000
162
+ splits:
163
+ - split: train
164
+ scores:
165
+ - nDCG@10: 0.4814
166
+ R@100: 0.8077
167
+ - split: dev
168
+ scores:
169
+ - nDCG@10: 0.4942
170
+ R@100: 0.8307
171
+ - name: bm25.th
172
+ eval_key: miracl-v1.0-th
173
+ command: python -m pyserini.search.lucene --language th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th --output $output --batch 128 --threads 16 --bm25 --hits 1000
174
+ splits:
175
+ - split: train
176
+ scores:
177
+ - nDCG@10: 0.4629
178
+ R@100: 0.8768
179
+ - split: dev
180
+ scores:
181
+ - nDCG@10: 0.4838
182
+ R@100: 0.8874
183
+ - name: bm25.zh
184
+ eval_key: miracl-v1.0-zh
185
+ command: python -m pyserini.search.lucene --language zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh --output $output --batch 128 --threads 16 --bm25 --hits 1000
186
+ splits:
187
+ - split: train
188
+ scores:
189
+ - nDCG@10: 0.2018
190
+ R@100: 0.5541
191
+ - split: dev
192
+ scores:
193
+ - nDCG@10: 0.1801
194
+ R@100: 0.5599
195
+ - name: bm25.de
196
+ eval_key: miracl-v1.0-de
197
+ command: python -m pyserini.search.lucene --language de --topics miracl-v1.0-de-${split} --index miracl-v1.0-de --output $output --batch 128 --threads 16 --bm25 --hits 1000
198
+ splits:
199
+ - split: dev
200
+ scores:
201
+ - nDCG@10: 0.2262
202
+ R@100: 0.5724
203
+ - name: bm25.yo
204
+ eval_key: miracl-v1.0-yo
205
+ command: python -m pyserini.search.lucene --pretokenized --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo --output $output --batch 128 --threads 16 --bm25 --hits 1000
206
+ splits:
207
+ - split: dev
208
+ scores:
209
+ - nDCG@10: 0.4059
210
+ R@100: 0.7325
211
+
212
+ # mdpr-tied-pft-msmarco
213
+ - name: mdpr-tied-pft-msmarco.ar
214
+ eval_key: miracl-v1.0-ar
215
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
216
+ splits:
217
+ - split: train
218
+ scores:
219
+ - nDCG@10: 0.4653
220
+ R@100: 0.8293
221
+ - split: dev
222
+ scores:
223
+ - nDCG@10: 0.4993
224
+ R@100: 0.8407
225
+ - name: mdpr-tied-pft-msmarco.bn
226
+ eval_key: miracl-v1.0-bn
227
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
228
+ splits:
229
+ - split: train
230
+ scores:
231
+ - nDCG@10: 0.4362
232
+ R@100: 0.8045
233
+ - split: dev
234
+ scores:
235
+ - nDCG@10: 0.4427
236
+ R@100: 0.8193
237
+ - name: mdpr-tied-pft-msmarco.en
238
+ eval_key: miracl-v1.0-en
239
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
240
+ splits:
241
+ - split: train
242
+ scores:
243
+ - nDCG@10: 0.3986
244
+ R@100: 0.7779
245
+ - split: dev
246
+ scores:
247
+ - nDCG@10: 0.3938
248
+ R@100: 0.7675
249
+ - name: mdpr-tied-pft-msmarco.es
250
+ eval_key: miracl-v1.0-es
251
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
252
+ splits:
253
+ - split: train
254
+ scores:
255
+ - nDCG@10: 0.4637
256
+ R@100: 0.8654
257
+ - split: dev
258
+ scores:
259
+ - nDCG@10: 0.4777
260
+ R@100: 0.8643
261
+ - name: mdpr-tied-pft-msmarco.fa
262
+ eval_key: miracl-v1.0-fa
263
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
264
+ splits:
265
+ - split: train
266
+ scores:
267
+ - nDCG@10: 0.4882
268
+ R@100: 0.9092
269
+ - split: dev
270
+ scores:
271
+ - nDCG@10: 0.4800
272
+ R@100: 0.8980
273
+ - name: mdpr-tied-pft-msmarco.fi
274
+ eval_key: miracl-v1.0-fi
275
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
276
+ splits:
277
+ - split: train
278
+ scores:
279
+ - nDCG@10: 0.4426
280
+ R@100: 0.7611
281
+ - split: dev
282
+ scores:
283
+ - nDCG@10: 0.4721
284
+ R@100: 0.7877
285
+ - name: mdpr-tied-pft-msmarco.fr
286
+ eval_key: miracl-v1.0-fr
287
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
288
+ splits:
289
+ - split: train
290
+ scores:
291
+ - nDCG@10: 0.4372
292
+ R@100: 0.9268
293
+ - split: dev
294
+ scores:
295
+ - nDCG@10: 0.4352
296
+ R@100: 0.9154
297
+ - name: mdpr-tied-pft-msmarco.hi
298
+ eval_key: miracl-v1.0-hi
299
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
300
+ splits:
301
+ - split: train
302
+ scores:
303
+ - nDCG@10: 0.3685
304
+ R@100: 0.7780
305
+ - split: dev
306
+ scores:
307
+ - nDCG@10: 0.3830
308
+ R@100: 0.7755
309
+ - name: mdpr-tied-pft-msmarco.id
310
+ eval_key: miracl-v1.0-id
311
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
312
+ splits:
313
+ - split: train
314
+ scores:
315
+ - nDCG@10: 0.2549
316
+ R@100: 0.5610
317
+ - split: dev
318
+ scores:
319
+ - nDCG@10: 0.2719
320
+ R@100: 0.5734
321
+ - name: mdpr-tied-pft-msmarco.ja
322
+ eval_key: miracl-v1.0-ja
323
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
324
+ splits:
325
+ - split: train
326
+ scores:
327
+ - nDCG@10: 0.4342
328
+ R@100: 0.8211
329
+ - split: dev
330
+ scores:
331
+ - nDCG@10: 0.4390
332
+ R@100: 0.8254
333
+ - name: mdpr-tied-pft-msmarco.ko
334
+ eval_key: miracl-v1.0-ko
335
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
336
+ splits:
337
+ - split: train
338
+ scores:
339
+ - nDCG@10: 0.4147
340
+ R@100: 0.7699
341
+ - split: dev
342
+ scores:
343
+ - nDCG@10: 0.4189
344
+ R@100: 0.7369
345
+ - name: mdpr-tied-pft-msmarco.ru
346
+ eval_key: miracl-v1.0-ru
347
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
348
+ splits:
349
+ - split: train
350
+ scores:
351
+ - nDCG@10: 0.3812
352
+ R@100: 0.7854
353
+ - split: dev
354
+ scores:
355
+ - nDCG@10: 0.4073
356
+ R@100: 0.7972
357
+ - name: mdpr-tied-pft-msmarco.sw
358
+ eval_key: miracl-v1.0-sw
359
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
360
+ splits:
361
+ - split: train
362
+ scores:
363
+ - nDCG@10: 0.2973
364
+ R@100: 0.5761
365
+ - split: dev
366
+ scores:
367
+ - nDCG@10: 0.2990
368
+ R@100: 0.6158
369
+ - name: mdpr-tied-pft-msmarco.te
370
+ eval_key: miracl-v1.0-te
371
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
372
+ splits:
373
+ - split: train
374
+ scores:
375
+ - nDCG@10: 0.3723
376
+ R@100: 0.7698
377
+ - split: dev
378
+ scores:
379
+ - nDCG@10: 0.3557
380
+ R@100: 0.7619
381
+ - name: mdpr-tied-pft-msmarco.th
382
+ eval_key: miracl-v1.0-th
383
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
384
+ splits:
385
+ - split: train
386
+ scores:
387
+ - nDCG@10: 0.3451
388
+ R@100: 0.6728
389
+ - split: dev
390
+ scores:
391
+ - nDCG@10: 0.3578
392
+ R@100: 0.6783
393
+ - name: mdpr-tied-pft-msmarco.zh
394
+ eval_key: miracl-v1.0-zh
395
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
396
+ splits:
397
+ - split: train
398
+ scores:
399
+ - nDCG@10: 0.5040
400
+ R@100: 0.9355
401
+ - split: dev
402
+ scores:
403
+ - nDCG@10: 0.5116
404
+ R@100: 0.9436
405
+ - name: mdpr-tied-pft-msmarco.de
406
+ eval_key: miracl-v1.0-de
407
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
408
+ splits:
409
+ - split: dev
410
+ scores:
411
+ - nDCG@10: 0.4895
412
+ R@100: 0.8983
413
+ - name: mdpr-tied-pft-msmarco.yo
414
+ eval_key: miracl-v1.0-yo
415
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
416
+ splits:
417
+ - split: dev
418
+ scores:
419
+ - nDCG@10: 0.4439
420
+ R@100: 0.8403
421
+
422
+ # mdpr-tied-pft-msmarco-ft-all
423
+ - name: mdpr-tied-pft-msmarco-ft-all.ar
424
+ eval_key: miracl-v1.0-ar
425
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
426
+ splits:
427
+ - split: train
428
+ scores:
429
+ - nDCG@10: 0.6954
430
+ R@100: 0.8542
431
+ - split: dev
432
+ scores:
433
+ - nDCG@10: 0.5782
434
+ R@100: 0.7953
435
+ - name: mdpr-tied-pft-msmarco-ft-all.bn
436
+ eval_key: miracl-v1.0-bn
437
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
438
+ splits:
439
+ - split: train
440
+ scores:
441
+ - nDCG@10: 0.6823
442
+ R@100: 0.8646
443
+ - split: dev
444
+ scores:
445
+ - nDCG@10: 0.5804
446
+ R@100: 0.8480
447
+ - name: mdpr-tied-pft-msmarco-ft-all.en
448
+ eval_key: miracl-v1.0-en
449
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
450
+ splits:
451
+ - split: train
452
+ scores:
453
+ - nDCG@10: 0.3491
454
+ R@100: 0.5678
455
+ - split: dev
456
+ scores:
457
+ - nDCG@10: 0.2813
458
+ R@100: 0.5083
459
+ - name: mdpr-tied-pft-msmarco-ft-all.es
460
+ eval_key: miracl-v1.0-es
461
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
462
+ splits:
463
+ - split: train
464
+ scores:
465
+ - nDCG@10: 0.2488
466
+ R@100: 0.4799
467
+ - split: dev
468
+ scores:
469
+ - nDCG@10: 0.2509
470
+ R@100: 0.4706
471
+ - name: mdpr-tied-pft-msmarco-ft-all.fa
472
+ eval_key: miracl-v1.0-fa
473
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
474
+ splits:
475
+ - split: train
476
+ scores:
477
+ - nDCG@10: 0.3809
478
+ R@100: 0.6899
479
+ - split: dev
480
+ scores:
481
+ - nDCG@10: 0.3836
482
+ R@100: 0.6863
483
+ - name: mdpr-tied-pft-msmarco-ft-all.fi
484
+ eval_key: miracl-v1.0-fi
485
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
486
+ splits:
487
+ - split: train
488
+ scores:
489
+ - nDCG@10: 0.7738
490
+ R@100: 0.9081
491
+ - split: dev
492
+ scores:
493
+ - nDCG@10: 0.5694
494
+ R@100: 0.7984
495
+ - name: mdpr-tied-pft-msmarco-ft-all.fr
496
+ eval_key: miracl-v1.0-fr
497
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
498
+ splits:
499
+ - split: train
500
+ scores:
501
+ - nDCG@10: 0.2989
502
+ R@100: 0.6197
503
+ - split: dev
504
+ scores:
505
+ - nDCG@10: 0.3010
506
+ R@100: 0.6005
507
+ - name: mdpr-tied-pft-msmarco-ft-all.hi
508
+ eval_key: miracl-v1.0-hi
509
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
510
+ splits:
511
+ - split: train
512
+ scores:
513
+ - nDCG@10: 0.3336
514
+ R@100: 0.6388
515
+ - split: dev
516
+ scores:
517
+ - nDCG@10: 0.3286
518
+ R@100: 0.6371
519
+ - name: mdpr-tied-pft-msmarco-ft-all.id
520
+ eval_key: miracl-v1.0-id
521
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
522
+ splits:
523
+ - split: train
524
+ scores:
525
+ - nDCG@10: 0.3321
526
+ R@100: 0.5492
527
+ - split: dev
528
+ scores:
529
+ - nDCG@10: 0.3462
530
+ R@100: 0.5841
531
+ - name: mdpr-tied-pft-msmarco-ft-all.ja
532
+ eval_key: miracl-v1.0-ja
533
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
534
+ splits:
535
+ - split: train
536
+ scores:
537
+ - nDCG@10: 0.6378
538
+ R@100: 0.7950
539
+ - split: dev
540
+ scores:
541
+ - nDCG@10: 0.4999
542
+ R@100: 0.7451
543
+ - name: mdpr-tied-pft-msmarco-ft-all.ko
544
+ eval_key: miracl-v1.0-ko
545
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
546
+ splits:
547
+ - split: train
548
+ scores:
549
+ - nDCG@10: 0.5795
550
+ R@100: 0.7850
551
+ - split: dev
552
+ scores:
553
+ - nDCG@10: 0.4864
554
+ R@100: 0.7183
555
+ - name: mdpr-tied-pft-msmarco-ft-all.ru
556
+ eval_key: miracl-v1.0-ru
557
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
558
+ splits:
559
+ - split: train
560
+ scores:
561
+ - nDCG@10: 0.6011
562
+ R@100: 0.8188
563
+ - split: dev
564
+ scores:
565
+ - nDCG@10: 0.3933
566
+ R@100: 0.6707
567
+ - name: mdpr-tied-pft-msmarco-ft-all.sw
568
+ eval_key: miracl-v1.0-sw
569
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
570
+ splits:
571
+ - split: train
572
+ scores:
573
+ - nDCG@10: 0.8882
574
+ R@100: 0.9710
575
+ - split: dev
576
+ scores:
577
+ - nDCG@10: 0.6575
578
+ R@100: 0.8883
579
+ - name: mdpr-tied-pft-msmarco-ft-all.te
580
+ eval_key: miracl-v1.0-te
581
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
582
+ splits:
583
+ - split: train
584
+ scores:
585
+ - nDCG@10: 0.8757
586
+ R@100: 0.9725
587
+ - split: dev
588
+ scores:
589
+ - nDCG@10: 0.7783
590
+ R@100: 0.9513
591
+ - name: mdpr-tied-pft-msmarco-ft-all.th
592
+ eval_key: miracl-v1.0-th
593
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
594
+ splits:
595
+ - split: train
596
+ scores:
597
+ - nDCG@10: 0.7761
598
+ R@100: 0.9241
599
+ - split: dev
600
+ scores:
601
+ - nDCG@10: 0.5975
602
+ R@100: 0.8360
603
+ - name: mdpr-tied-pft-msmarco-ft-all.zh
604
+ eval_key: miracl-v1.0-zh
605
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
606
+ splits:
607
+ - split: train
608
+ scores:
609
+ - nDCG@10: 0.3446
610
+ R@100: 0.6608
611
+ - split: dev
612
+ scores:
613
+ - nDCG@10: 0.3575
614
+ R@100: 0.6725
615
+ - name: mdpr-tied-pft-msmarco-ft-all.de
616
+ eval_key: miracl-v1.0-de
617
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
618
+ splits:
619
+ - split: dev
620
+ scores:
621
+ - nDCG@10: 0.3219
622
+ R@100: 0.5990
623
+ - name: mdpr-tied-pft-msmarco-ft-all.yo
624
+ eval_key: miracl-v1.0-yo
625
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
626
+ splits:
627
+ - split: dev
628
+ scores:
629
+ - nDCG@10: 0.5983
630
+ R@100: 0.8908
631
+
632
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ar
633
+ eval_key: miracl-v1.0-ar
634
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
635
+ splits:
636
+ - split: train
637
+ scores:
638
+ - nDCG@10: 0.6259
639
+ R@100: 0.9173
640
+ - split: dev
641
+ scores:
642
+ - nDCG@10: 0.6729
643
+ R@100: 0.9405
644
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.bn
645
+ eval_key: miracl-v1.0-bn
646
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
647
+ splits:
648
+ - split: train
649
+ scores:
650
+ - nDCG@10: 0.6587
651
+ R@100: 0.9297
652
+ - split: dev
653
+ scores:
654
+ - nDCG@10: 0.6540
655
+ R@100: 0.9321
656
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.en
657
+ eval_key: miracl-v1.0-en
658
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
659
+ splits:
660
+ - split: train
661
+ scores:
662
+ - nDCG@10: 0.5347
663
+ R@100: 0.8772
664
+ - split: dev
665
+ scores:
666
+ - nDCG@10: 0.5488
667
+ R@100: 0.8815
668
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.es
669
+ eval_key: miracl-v1.0-es
670
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
671
+ splits:
672
+ - split: train
673
+ scores:
674
+ - nDCG@10: 0.6234
675
+ R@100: 0.9425
676
+ - split: dev
677
+ scores:
678
+ - nDCG@10: 0.6413
679
+ R@100: 0.9479
680
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.fa
681
+ eval_key: miracl-v1.0-fa
682
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
683
+ splits:
684
+ - split: train
685
+ scores:
686
+ - nDCG@10: 0.5890
687
+ R@100: 0.9433
688
+ - split: dev
689
+ scores:
690
+ - nDCG@10: 0.5935
691
+ R@100: 0.9374
692
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.fi
693
+ eval_key: miracl-v1.0-fi
694
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
695
+ splits:
696
+ - split: train
697
+ scores:
699
+ - nDCG@10: 0.6164
700
+ R@100: 0.8506
701
+ - split: dev
702
+ scores:
703
+ - nDCG@10: 0.6716
704
+ R@100: 0.8949
705
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.fr
706
+ eval_key: miracl-v1.0-fr
707
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
708
+ splits:
709
+ - split: train
710
+ scores:
711
+ - nDCG@10: 0.5299
712
+ R@100: 0.9709
713
+ - split: dev
714
+ scores:
715
+ - nDCG@10: 0.5233
716
+ R@100: 0.9647
717
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.hi
718
+ eval_key: miracl-v1.0-hi
719
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
720
+ splits:
721
+ - split: train
722
+ scores:
723
+ - nDCG@10: 0.6217
724
+ R@100: 0.9059
725
+ - split: dev
726
+ scores:
727
+ - nDCG@10: 0.6157
728
+ R@100: 0.9115
729
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.id
730
+ eval_key: miracl-v1.0-id
731
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
732
+ splits:
733
+ - split: train
734
+ scores:
735
+ - nDCG@10: 0.4442
736
+ R@100: 0.7595
737
+ - split: dev
738
+ scores:
739
+ - nDCG@10: 0.4433
740
+ R@100: 0.7683
741
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ja
742
+ eval_key: miracl-v1.0-ja
743
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
744
+ splits:
745
+ - split: train
746
+ scores:
747
+ - nDCG@10: 0.5795
748
+ R@100: 0.9082
749
+ - split: dev
750
+ scores:
751
+ - nDCG@10: 0.5757
752
+ R@100: 0.9036
753
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ko
754
+ eval_key: miracl-v1.0-ko
755
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
756
+ splits:
757
+ - split: train
758
+ scores:
759
+ - nDCG@10: 0.5758
760
+ R@100: 0.8744
761
+ - split: dev
762
+ scores:
763
+ - nDCG@10: 0.6086
764
+ R@100: 0.8997
765
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ru
766
+ eval_key: miracl-v1.0-ru
767
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
768
+ splits:
769
+ - split: train
770
+ scores:
771
+ - nDCG@10: 0.4921
772
+ R@100: 0.8494
773
+ - split: dev
774
+ scores:
775
+ - nDCG@10: 0.5323
776
+ R@100: 0.8738
777
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.sw
778
+ eval_key: miracl-v1.0-sw
779
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
780
+ splits:
781
+ - split: train
782
+ scores:
783
+ - nDCG@10: 0.4100
784
+ R@100: 0.6987
785
+ - split: dev
786
+ scores:
787
+ - nDCG@10: 0.4457
788
+ R@100: 0.7254
789
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.te
790
+ eval_key: miracl-v1.0-te
791
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
792
+ splits:
793
+ - split: train
794
+ scores:
795
+ - nDCG@10: 0.6000
796
+ R@100: 0.8717
797
+ - split: dev
798
+ scores:
799
+ - nDCG@10: 0.6021
800
+ R@100: 0.8569
801
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.th
802
+ eval_key: miracl-v1.0-th
803
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
804
+ splits:
805
+ - split: train
806
+ scores:
807
+ - nDCG@10: 0.5669
808
+ R@100: 0.8195
809
+ - split: dev
810
+ scores:
811
+ - nDCG@10: 0.5990
812
+ R@100: 0.8228
813
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.zh
814
+ eval_key: miracl-v1.0-zh
815
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
816
+ splits:
817
+ - split: train
818
+ scores:
819
+ - nDCG@10: 0.5209
820
+ R@100: 0.9576
821
+ - split: dev
822
+ scores:
823
+ - nDCG@10: 0.5254
824
+ R@100: 0.9587
825
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.de
826
+ eval_key: miracl-v1.0-de
827
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
828
+ splits:
829
+ - split: dev
830
+ scores:
831
+ - nDCG@10: 0.5643
832
+ R@100: 0.9482
833
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.yo
834
+ eval_key: miracl-v1.0-yo
835
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
836
+ splits:
837
+ - split: dev
838
+ scores:
839
+ - nDCG@10: 0.6114
840
+ R@100: 0.9496
841
+
842
+ # mdpr-tied-pft-msmarco-ft-miracl
843
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ar
844
+ eval_key: miracl-v1.0-ar
845
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ar --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-miracl-ar --output $output --batch 128 --threads 16 --hits 1000
846
+ splits:
847
+ - split: dev
848
+ scores:
849
+ - nDCG@10: 0.7252
850
+ R@100: 0.9489
851
+ - name: mdpr-tied-pft-msmarco-ft-miracl.bn
852
+ eval_key: miracl-v1.0-bn
853
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-miracl-bn --output $output --batch 128 --threads 16 --hits 1000
854
+ splits:
855
+ - split: dev
856
+ scores:
857
+ - nDCG@10: 0.6842
858
+ R@100: 0.9547
859
+ - name: mdpr-tied-pft-msmarco-ft-miracl.en
860
+ eval_key: miracl-v1.0-en
861
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-miracl-en --output $output --batch 128 --threads 16 --hits 1000
862
+ splits:
863
+ - split: dev
864
+ scores:
865
+ - nDCG@10: 0.4878
866
+ R@100: 0.8341
867
+ - name: mdpr-tied-pft-msmarco-ft-miracl.es
868
+ eval_key: miracl-v1.0-es
869
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-miracl-es --output $output --batch 128 --threads 16 --hits 1000
870
+ splits:
871
+ - split: dev
872
+ scores:
873
+ - nDCG@10: 0.5648
874
+ R@100: 0.9109
875
+ - name: mdpr-tied-pft-msmarco-ft-miracl.fa
876
+ eval_key: miracl-v1.0-fa
877
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-miracl-fa --output $output --batch 128 --threads 16 --hits 1000
878
+ splits:
879
+ - split: dev
880
+ scores:
881
+ - nDCG@10: 0.5934
882
+ R@100: 0.9133
883
+ - name: mdpr-tied-pft-msmarco-ft-miracl.fi
884
+ eval_key: miracl-v1.0-fi
885
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-miracl-fi --output $output --batch 128 --threads 16 --hits 1000
886
+ splits:
887
+ - split: dev
888
+ scores:
889
+ - nDCG@10: 0.7139
890
+ R@100: 0.9479
891
+ - name: mdpr-tied-pft-msmarco-ft-miracl.fr
892
+ eval_key: miracl-v1.0-fr
893
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-miracl-fr --output $output --batch 128 --threads 16 --hits 1000
894
+ splits:
895
+ - split: dev
896
+ scores:
897
+ - nDCG@10: 0.5893
898
+ R@100: 0.9537
899
+ - name: mdpr-tied-pft-msmarco-ft-miracl.hi
900
+ eval_key: miracl-v1.0-hi
901
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-miracl-hi --output $output --batch 128 --threads 16 --hits 1000
902
+ splits:
903
+ - split: dev
904
+ scores:
905
+ - nDCG@10: 0.5164
906
+ R@100: 0.8862
907
+ - name: mdpr-tied-pft-msmarco-ft-miracl.id
908
+ eval_key: miracl-v1.0-id
909
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-miracl-id --output $output --batch 128 --threads 16 --hits 1000
910
+ splits:
911
+ - split: dev
912
+ scores:
913
+ - nDCG@10: 0.4959
914
+ R@100: 0.8642
915
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ja
916
+ eval_key: miracl-v1.0-ja
917
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-miracl-ja --output $output --batch 128 --threads 16 --hits 1000
918
+ splits:
919
+ - split: dev
920
+ scores:
921
+ - nDCG@10: 0.6416
922
+ R@100: 0.9225
923
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ko
924
+ eval_key: miracl-v1.0-ko
925
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-miracl-ko --output $output --batch 128 --threads 16 --hits 1000
926
+ splits:
927
+ - split: dev
928
+ scores:
929
+ - nDCG@10: 0.5901
930
+ R@100: 0.8857
931
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ru
932
+ eval_key: miracl-v1.0-ru
933
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-miracl-ru --output $output --batch 128 --threads 16 --hits 1000
934
+ splits:
935
+ - split: dev
936
+ scores:
937
+ - nDCG@10: 0.5974
938
+ R@100: 0.9099
939
+ - name: mdpr-tied-pft-msmarco-ft-miracl.sw
940
+ eval_key: miracl-v1.0-sw
941
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-miracl-sw --output $output --batch 128 --threads 16 --hits 1000
942
+ splits:
943
+ - split: dev
944
+ scores:
945
+ - nDCG@10: 0.6853
946
+ R@100: 0.9367
947
+ - name: mdpr-tied-pft-msmarco-ft-miracl.te
948
+ eval_key: miracl-v1.0-te
949
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-miracl-te --output $output --batch 128 --threads 16 --hits 1000
950
+ splits:
951
+ - split: dev
952
+ scores:
953
+ - nDCG@10: 0.8037
954
+ R@100: 0.9616
955
+ - name: mdpr-tied-pft-msmarco-ft-miracl.th
956
+ eval_key: miracl-v1.0-th
957
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-miracl-th --output $output --batch 128 --threads 16 --hits 1000
958
+ splits:
959
+ - split: dev
960
+ scores:
961
+ - nDCG@10: 0.6951
962
+ R@100: 0.9311
963
+ - name: mdpr-tied-pft-msmarco-ft-miracl.zh
964
+ eval_key: miracl-v1.0-zh
965
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-miracl-zh --output $output --batch 128 --threads 16 --hits 1000
966
+ splits:
967
+ - split: dev
968
+ scores:
969
+ - nDCG@10: 0.6500
970
+ R@100: 0.9631
971
+
972
+ # mcontriever
973
+ - name: mcontriever-tied-pft-msmarco.ar
974
+ eval_key: miracl-v1.0-ar
975
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
976
+ splits:
977
+ - split: train
978
+ scores:
979
+ - nDCG@10: 0.5027
980
+ R@100: 0.9166
981
+ - split: dev
982
+ scores:
983
+ - nDCG@10: 0.5248
984
+ R@100: 0.9253
985
+ - name: mcontriever-tied-pft-msmarco.bn
986
+ eval_key: miracl-v1.0-bn
987
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
988
+ splits:
989
+ - split: train
990
+ scores:
991
+ - nDCG@10: 0.5138
992
+ R@100: 0.9313
993
+ - split: dev
994
+ scores:
995
+ - nDCG@10: 0.5011
996
+ R@100: 0.9205
997
+ - name: mcontriever-tied-pft-msmarco.en
998
+ eval_key: miracl-v1.0-en
999
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1000
+ splits:
1001
+ - split: train
1002
+ scores:
1003
+ - nDCG@10: 0.3579
1004
+ R@100: 0.7990
1005
+ - split: dev
1006
+ scores:
1007
+ - nDCG@10: 0.3637
1008
+ R@100: 0.7967
1009
+ - name: mcontriever-tied-pft-msmarco.es
1010
+ eval_key: miracl-v1.0-es
1011
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1012
+ splits:
1013
+ - split: train
1014
+ scores:
1015
+ - nDCG@10: 0.4081
1016
+ R@100: 0.8339
1017
+ - split: dev
1018
+ scores:
1019
+ - nDCG@10: 0.4184
1020
+ R@100: 0.8411
1021
+ - name: mcontriever-tied-pft-msmarco.fa
1022
+ eval_key: miracl-v1.0-fa
1023
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1024
+ splits:
1025
+ - split: train
1026
+ scores:
1027
+ - nDCG@10: 0.2263
1028
+ R@100: 0.6374
1029
+ - split: dev
1030
+ scores:
1031
+ - nDCG@10: 0.2152
1032
+ R@100: 0.6540
1033
+ - name: mcontriever-tied-pft-msmarco.fi
1034
+ eval_key: miracl-v1.0-fi
1035
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1036
+ splits:
1037
+ - split: train
1038
+ scores:
1039
+ - nDCG@10: 0.5680
1040
+ R@100: 0.9369
1041
+ - split: dev
1042
+ scores:
1043
+ - nDCG@10: 0.6019
1044
+ R@100: 0.9527
1045
+ - name: mcontriever-tied-pft-msmarco.fr
1046
+ eval_key: miracl-v1.0-fr
1047
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1048
+ splits:
1049
+ - split: train
1050
+ scores:
1051
+ - nDCG@10: 0.3332
1052
+ R@100: 0.8341
1053
+ - split: dev
1054
+ scores:
1055
+ - nDCG@10: 0.3140
1056
+ R@100: 0.8243
1057
+ - name: mcontriever-tied-pft-msmarco.hi
1058
+ eval_key: miracl-v1.0-hi
1059
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1060
+ splits:
1061
+ - split: train
1062
+ scores:
1063
+ - nDCG@10: 0.2886
1064
+ R@100: 0.6664
1065
+ - split: dev
1066
+ scores:
1067
+ - nDCG@10: 0.2864
1068
+ R@100: 0.6461
1069
+ - name: mcontriever-tied-pft-msmarco.id
1070
+ eval_key: miracl-v1.0-id
1071
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1072
+ splits:
1073
+ - split: train
1074
+ scores:
1075
+ - nDCG@10: 0.3748
1076
+ R@100: 0.7955
1077
+ - split: dev
1078
+ scores:
1079
+ - nDCG@10: 0.3915
1080
+ R@100: 0.8015
1081
+ - name: mcontriever-tied-pft-msmarco.ja
1082
+ eval_key: miracl-v1.0-ja
1083
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1084
+ splits:
1085
+ - split: train
1086
+ scores:
1087
+ - nDCG@10: 0.4402
1088
+ R@100: 0.8813
1089
+ - split: dev
1090
+ scores:
1091
+ - nDCG@10: 0.4240
1092
+ R@100: 0.8783
1093
+ - name: mcontriever-tied-pft-msmarco.ko
1094
+ eval_key: miracl-v1.0-ko
1095
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1096
+ splits:
1097
+ - split: train
1098
+ scores:
1099
+ - nDCG@10: 0.4799
1100
+ R@100: 0.8672
1101
+ - split: dev
1102
+ scores:
1103
+ - nDCG@10: 0.4829
1104
+ R@100: 0.8753
1105
+ - name: mcontriever-tied-pft-msmarco.ru
1106
+ eval_key: miracl-v1.0-ru
1107
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1108
+ splits:
1109
+ - split: train
1110
+ scores:
1111
+ - nDCG@10: 0.3811
1112
+ R@100: 0.8369
1113
+ - split: dev
1114
+ scores:
1115
+ - nDCG@10: 0.3913
1116
+ R@100: 0.8500
1117
+ - name: mcontriever-tied-pft-msmarco.sw
1118
+ eval_key: miracl-v1.0-sw
1119
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1120
+ splits:
1121
+ - split: train
1122
+ scores:
1123
+ - nDCG@10: 0.5568
1124
+ R@100: 0.9130
1125
+ - split: dev
1126
+ scores:
1127
+ - nDCG@10: 0.5600
1128
+ R@100: 0.9108
1129
+ - name: mcontriever-tied-pft-msmarco.te
1130
+ eval_key: miracl-v1.0-te
1131
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1132
+ splits:
1133
+ - split: train
1134
+ scores:
1135
+ - nDCG@10: 0.5260
1136
+ R@100: 0.9457
1137
+ - split: dev
1138
+ scores:
1139
+ - nDCG@10: 0.5283
1140
+ R@100: 0.9612
1141
+ - name: mcontriever-tied-pft-msmarco.th
1142
+ eval_key: miracl-v1.0-th
1143
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1144
+ splits:
1145
+ - split: train
1146
+ scores:
1147
+ - nDCG@10: 0.5299
1148
+ R@100: 0.9361
1149
+ - split: dev
1150
+ scores:
1151
+ - nDCG@10: 0.5173
1152
+ R@100: 0.9361
1153
+ - name: mcontriever-tied-pft-msmarco.zh
1154
+ eval_key: miracl-v1.0-zh
1155
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1156
+ splits:
1157
+ - split: train
1158
+ scores:
1159
+ - nDCG@10: 0.4283
1160
+ R@100: 0.8745
1161
+ - split: dev
1162
+ scores:
1163
+ - nDCG@10: 0.4097
1164
+ R@100: 0.9026
1165
+ - name: mcontriever-tied-pft-msmarco.de
1166
+ eval_key: miracl-v1.0-de
1167
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1168
+ splits:
1169
+ - split: dev
1170
+ scores:
1171
+ - nDCG@10: 0.4079
1172
+ R@100: 0.8407
1173
+ - name: mcontriever-tied-pft-msmarco.yo
1174
+ eval_key: miracl-v1.0-yo
1175
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
1176
+ splits:
1177
+ - split: dev
1178
+ scores:
1179
+ - nDCG@10: 0.4150
1180
+ R@100: 0.7703
pyserini/2cr/miracl_html.template ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
6
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
7
+ <title>Pyserini Reproductions</title>
8
+ <!-- Font Awesome -->
9
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
10
+ <!-- Google Fonts Roboto -->
11
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
12
+ <!-- MDB -->
13
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
14
+
15
+ <style>
16
+ tr.hide-table-padding td {
17
+ padding: 0;
18
+ }
19
+
20
+ .expand-button {
21
+ position: relative;
22
+ }
23
+
24
+ .accordion-toggle .expand-button:after {
25
+ position: absolute;
26
+ left:.75rem;
27
+ top: 50%;
28
+ transform: translate(0, -50%);
29
+ content: '-';
30
+ }
31
+
32
+ .accordion-toggle.collapsed .expand-button:after {
33
+ content: '+';
34
+ }
35
+
36
+ blockquote.mycode {
37
+ border-left: 3px solid #ccc;
38
+ margin-left: 25px;
39
+ margin-top: 15px;
40
+ padding-left: 15px;
41
+ }
42
+
43
+ blockquote.mycode2 {
44
+ border-left: 3px solid #ccc;
45
+ margin-left: 25px;
46
+ padding-top: 10px;
47
+ padding-bottom: 10px;
48
+ padding-left: 15px;
49
+ }
50
+
51
+ tr th.headertop {
52
+ border-bottom: none;
53
+ padding-bottom: 0rem
54
+ }
55
+
56
+ tr th.headerbottom {
57
+ padding-top: 0rem
58
+ }
59
+
60
+ .table>:not(caption)>*>*{padding:0.75rem 0.75rem}
61
+
62
+ .copy-code-button {
63
+ border-radius: 0;
64
+ min-width: 55px;
65
+ background: none repeat scroll 0 0 transparent;
66
+ background-color: grey;
67
+ color: #F1F2F3 !important;
68
+ cursor: pointer;
69
+ border-style: none;
70
+ font-family: 'HELVETICA',sans-serif;
71
+ font-size: 0.8em;
72
+ font-weight: normal;
73
+ text-align: center;
74
+ text-decoration: none;
75
+ text-indent: 0;
76
+ text-transform: uppercase;
77
+ font-weight: 500;
78
+ line-height: 1.42rem;
79
+ margin: 0;
80
+ padding: 3px 8px;
81
+ position: absolute !important;
82
+ top: 0 !important;
83
+ right: 0 !important;
84
+ }
85
+
86
+ .copy-code-button > span {
87
+ color: #F1F2F3 !important;
88
+ }
89
+
90
+ .copy-code-button, ::before, ::after {
91
+ box-sizing: inherit;
92
+ }
93
+
94
+ .copy-code-button::before {
95
+ content: '';
96
+ display: inline-block;
97
+ width: 16px;
98
+ height: 16px;
99
+ margin-right: 3px;
100
+ background-size: contain;
101
+ background-image: url("");
102
+ background-repeat: no-repeat;
103
+ position: relative;
104
+ top: 3px;
105
+ }
106
+
107
+ .copy-code-button:focus {
108
+ /* Avoid an ugly focus outline on click in Chrome,
109
+ but darken the button for accessibility.
110
+ See https://stackoverflow.com/a/25298082/1481479 */
111
+ /* background-color: #E6E6E6; */
112
+ outline: 0;
113
+ }
114
+
115
+ pre[class*="prettyprint"] {
116
+ position: relative;
117
+ overflow: hidden;
118
+ }
119
+ </style>
120
+ </head>
121
+ <body>
122
+
123
+ <!-- Background image -->
124
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
125
+ <div class="mask" style="
126
+ background: linear-gradient(
127
+ 45deg,
128
+ rgba(29, 236, 197, 0.7),
129
+ rgba(91, 14, 214, 0.7) 100%
130
+ );
131
+ ">
132
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
133
+ <div class="text-white">
134
+ <h1 class="mb-3">$title</h1>
135
+ </div>
136
+ </div>
137
+ </div>
138
+ </div>
139
+ <!-- Background image -->
140
+
141
+ <div class="container my-4">
142
+
143
+ $tables
144
+
145
+ </ul>
146
+
147
+ <div style="padding-top: 20px"></div>
148
+
149
+ <h4>Programmatic Execution</h4>
150
+
151
+ <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
152
+ To list all the experimental conditions:</p>
153
+
154
+ <blockquote class="mycode2"><tt>
155
+ python -m pyserini.2cr.miracl --list-conditions
156
+ </tt></blockquote>
157
+
158
+ <p>Run all languages for a specific condition and show commands:</p>
159
+
160
+ <blockquote class="mycode2"><tt>
161
+ python -m pyserini.2cr.miracl --condition bm25 --display-commands
162
+ </tt></blockquote>
163
+
164
+ <p>Run a particular language for a specific condition and show commands:</p>
165
+
166
+ <blockquote class="mycode2"><tt>
167
+ python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands
168
+ </tt></blockquote>
169
+
170
+ <p>Run all languages for all conditions and show commands:</p>
171
+
172
+ <blockquote class="mycode2"><tt>
173
+ python -m pyserini.2cr.miracl --all --display-commands
174
+ </tt></blockquote>
175
+
176
+ <p>With the above commands, run files will be placed in the current directory. Use the option <tt>--directory runs</tt> to place the runs in a sub-directory.</p>
177
+
178
+ <p>For a specific condition, just show the commands and do not run:</p>
179
+
180
+ <blockquote class="mycode2"><tt>
181
+ python -m pyserini.2cr.miracl --condition bm25 --display-commands --dry-run
182
+ </tt></blockquote>
183
+
184
+ <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
185
+
186
+ <p>For a specific condition and language, just show the commands and do not run:</p>
187
+
188
+ <blockquote class="mycode2"><tt>
189
+ python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands --dry-run
190
+ </tt></blockquote>
191
+
192
+ <p>For all conditions, just show the commands without running them, and skip evaluation:</p>
193
+
194
+ <blockquote class="mycode2"><tt>
195
+ python -m pyserini.2cr.miracl --all --display-commands --dry-run --skip-eval
196
+ </tt></blockquote>
197
+
198
+ <p>Finally, to generate this page:</p>
199
+
200
+ <blockquote class="mycode2"><tt>
201
+ python -m pyserini.2cr.miracl --generate-report --output docs/2cr/miracl.html
202
+ </tt></blockquote>
203
+
204
+ <p>The output file <tt>miracl.html</tt> should be identical to this page.</p>
205
+
206
+ <div style="padding-top: 50px"></div>
207
+
208
+ </div>
209
+
210
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
211
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
212
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
213
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
214
+
215
+ <script>
216
+ document.querySelectorAll('pre').forEach(function (codeBlock) {
217
+ var button = document.createElement('button');
218
+ button.className = 'copy-code-button';
219
+ button.type = 'button';
220
+ var s = codeBlock.innerText;
221
+ button.setAttribute('data-clipboard-text',s);
222
+ button.innerText = 'Copy';
223
+
224
+ // var pre = codeBlock.parentNode;
225
+ codeBlock.classList.add('prettyprint');
226
+ // pre.parentNode.insertBefore(button, pre);
227
+ codeBlock.appendChild(button);
228
+ });
229
+
230
+ var clipboard = new ClipboardJS('.copy-code-button');
231
+
232
+ clipboard.on('success', function(e) {
233
+ console.info('Action:', e.action);
234
+ console.info('Text:', e.text);
235
+ console.info('Trigger:', e.trigger);
236
+ e.trigger.textContent = 'Copied';
237
+ window.setTimeout(function() {
238
+ e.trigger.textContent = 'Copy';
239
+ }, 2000);
240
+ e.clearSelection();
241
+ });
242
+
243
+ clipboard.on('error', function(e) {
244
+ console.error('Action:', e.action);
245
+ console.error('Trigger:', e.trigger);
246
+ e.trigger.textContent = 'Error Copying';
247
+ window.setTimeout(function() {
248
+ e.trigger.textContent = 'Copy';
249
+ }, 2000);
250
+ e.clearSelection();
251
+ });
252
+
253
+ </script>
254
+
255
+ </body>
256
+ </html>
pyserini/2cr/miracl_html_table.template ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="table-responsive">
2
+ <table class="table">
3
+ <thead>
4
+ <tr>
5
+ <th scope="col"></th>
6
+ <th scope="col">$desc</th>
7
+ <th scope="col">ar</th>
8
+ <th scope="col">bn</th>
9
+ <th scope="col">en</th>
10
+ <th scope="col">es</th>
11
+ <th scope="col">fa</th>
12
+ <th scope="col">fi</th>
13
+ <th scope="col">fr</th>
14
+ <th scope="col">hi</th>
15
+ <th scope="col">id</th>
16
+ <th scope="col">ja</th>
17
+ <th scope="col">ko</th>
18
+ <th scope="col">ru</th>
19
+ <th scope="col">sw</th>
20
+ <th scope="col">te</th>
21
+ <th scope="col">th</th>
22
+ <th scope="col">zh</th>
23
+ <th scope="col">de</th>
24
+ <th scope="col">yo</th>
25
+ <th scope="col"></th>
26
+ <th scope="col">avg</th>
27
+ </tr>
28
+ </thead>
29
+ <tbody>
30
+
31
+ $rows
32
+
33
+ </tbody>
34
+ </table>
35
+ </div>
pyserini/2cr/miracl_html_table_row.template ADDED
@@ -0,0 +1,336 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Condition: $model -->
2
+ <tr class="accordion-toggle collapsed" id="table${table_cnt}-row${row_cnt}" data-toggle="collapse" data-parent="#table${table_cnt}-row${row_cnt}" href="#table${table_cnt}-collapse${row_cnt}">
3
+ <td class="expand-button"></td>
4
+ <td>$model</td>
5
+ <td>$ar</td>
6
+ <td>$bn</td>
7
+ <td>$en</td>
8
+ <td>$es</td>
9
+ <td>$fa</td>
10
+ <td>$fi</td>
11
+ <td>$fr</td>
12
+ <td>$hi</td>
13
+ <td>$id</td>
14
+ <td>$ja</td>
15
+ <td>$ko</td>
16
+ <td>$ru</td>
17
+ <td>$sw</td>
18
+ <td>$te</td>
19
+ <td>$th</td>
20
+ <td>$zh</td>
21
+ <td>$de</td>
22
+ <td>$yo</td>
23
+ <td></td>
24
+ <td>$avg</td>
25
+ </tr>
26
+ <tr class="hide-table-padding">
27
+ <td colspan="22">
28
+ <div id="table${table_cnt}-collapse${row_cnt}" class="collapse in p-3">
29
+
30
+ <!-- Tabs navs -->
31
+ <ul class="nav nav-tabs mb-3" id="table${table_cnt}-row${row_cnt}-tabs" role="tablist">
32
+ <li class="nav-item" role="presentation">
33
+ <a class="nav-link active" id="table${table_cnt}-row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab1" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">ar</a>
34
+ </li>
35
+ <li class="nav-item" role="presentation">
36
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab2" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">bn</a>
37
+ </li>
38
+ <li class="nav-item" role="presentation">
39
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab3" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">en</a>
40
+ </li>
41
+ <li class="nav-item" role="presentation">
42
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab4-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab4" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab4" aria-selected="false" style="text-transform:none">es</a>
43
+ </li>
44
+ <li class="nav-item" role="presentation">
45
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab5-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab5" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab5" aria-selected="false" style="text-transform:none">fa</a>
46
+ </li>
47
+ <li class="nav-item" role="presentation">
48
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab6-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab6" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab6" aria-selected="false" style="text-transform:none">fi</a>
49
+ </li>
50
+ <li class="nav-item" role="presentation">
51
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab7-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab7" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab7" aria-selected="false" style="text-transform:none">fr</a>
52
+ </li>
53
+ <li class="nav-item" role="presentation">
54
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab8-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab8" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab8" aria-selected="false" style="text-transform:none">hi</a>
55
+ </li>
56
+ <li class="nav-item" role="presentation">
57
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab9-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab9" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab9" aria-selected="false" style="text-transform:none">id</a>
58
+ </li>
59
+ <li class="nav-item" role="presentation">
60
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab10-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab10" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab10" aria-selected="false" style="text-transform:none">ja</a>
61
+ </li>
62
+ <li class="nav-item" role="presentation">
63
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab11-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab11" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab11" aria-selected="false" style="text-transform:none">ko</a>
64
+ </li>
65
+ <li class="nav-item" role="presentation">
66
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab12-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab12" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab12" aria-selected="false" style="text-transform:none">ru</a>
67
+ </li>
68
+ <li class="nav-item" role="presentation">
69
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab13-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab13" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab13" aria-selected="false" style="text-transform:none">sw</a>
70
+ </li>
71
+ <li class="nav-item" role="presentation">
72
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab14-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab14" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab14" aria-selected="false" style="text-transform:none">te</a>
73
+ </li>
74
+ <li class="nav-item" role="presentation">
75
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab15-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab15" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab15" aria-selected="false" style="text-transform:none">th</a>
76
+ </li>
77
+ <li class="nav-item" role="presentation">
78
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab16-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab16" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab16" aria-selected="false" style="text-transform:none">zh</a>
79
+ </li>
80
+ <li class="nav-item" role="presentation">
81
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab17-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab17" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab17" aria-selected="false" style="text-transform:none">de</a>
82
+ </li>
83
+ <li class="nav-item" role="presentation">
84
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab18-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab18" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab18" aria-selected="false" style="text-transform:none">yo</a>
85
+ </li>
86
+
87
+ </ul>
88
+ <!-- Tabs navs -->
89
+
90
+ <!-- Tabs content -->
91
+ <div class="tab-content" id="table${table_cnt}-row${row_cnt}-content">
92
+ <div class="tab-pane fade show active" id="table${table_cnt}-row${row_cnt}-tab1" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab1-header">
93
+ Command to generate run:
94
+
95
+ <blockquote class="mycode">
96
+ <pre><code>$cmd1
97
+ </code></pre></blockquote>
98
+ Evaluation commands:
99
+
100
+ <blockquote class="mycode">
101
+ <pre><code>${eval_cmd1}</code></pre>
102
+ </blockquote>
103
+
104
+ </div>
105
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab2" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab2-header">
106
+ Command to generate run:
107
+
108
+ <blockquote class="mycode">
109
+ <pre><code>$cmd2
110
+ </code></pre></blockquote>
111
+ Evaluation commands:
112
+
113
+ <blockquote class="mycode">
114
+ <pre><code>${eval_cmd2}</code></pre>
115
+ </blockquote>
116
+
117
+ </div>
118
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab3" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab3-header">
119
+ Command to generate run:
120
+
121
+ <blockquote class="mycode">
122
+ <pre><code>$cmd3
123
+ </code></pre></blockquote>
124
+ Evaluation commands:
125
+
126
+ <blockquote class="mycode">
127
+ <pre><code>${eval_cmd3}</code></pre>
128
+ </blockquote>
129
+
130
+ </div>
131
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab4" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab4-header">
132
+ Command to generate run:
133
+
134
+ <blockquote class="mycode">
135
+ <pre><code>$cmd4
136
+ </code></pre></blockquote>
137
+ Evaluation commands:
138
+
139
+ <blockquote class="mycode">
140
+ <pre><code>${eval_cmd4}</code></pre>
141
+ </blockquote>
142
+
143
+ </div>
144
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab5" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab5-header">
145
+ Command to generate run:
146
+
147
+ <blockquote class="mycode">
148
+ <pre><code>$cmd5
149
+ </code></pre></blockquote>
150
+ Evaluation commands:
151
+
152
+ <blockquote class="mycode">
153
+ <pre><code>${eval_cmd5}</code></pre>
154
+ </blockquote>
155
+
156
+ </div>
157
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab6" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab6-header">
158
+ Command to generate run:
159
+
160
+ <blockquote class="mycode">
161
+ <pre><code>$cmd6
162
+ </code></pre></blockquote>
163
+ Evaluation commands:
164
+
165
+ <blockquote class="mycode">
166
+ <pre><code>${eval_cmd6}</code></pre>
167
+ </blockquote>
168
+
169
+ </div>
170
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab7" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab7-header">
171
+ Command to generate run:
172
+
173
+ <blockquote class="mycode">
174
+ <pre><code>$cmd7
175
+ </code></pre></blockquote>
176
+ Evaluation commands:
177
+
178
+ <blockquote class="mycode">
179
+ <pre><code>${eval_cmd7}</code></pre>
180
+ </blockquote>
181
+
182
+ </div>
183
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab8" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab8-header">
184
+ Command to generate run:
185
+
186
+ <blockquote class="mycode">
187
+ <pre><code>$cmd8
188
+ </code></pre></blockquote>
189
+ Evaluation commands:
190
+
191
+ <blockquote class="mycode">
192
+ <pre><code>${eval_cmd8}</code></pre>
193
+ </blockquote>
194
+
195
+ </div>
196
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab9" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab9-header">
197
+ Command to generate run:
198
+
199
+ <blockquote class="mycode">
200
+ <pre><code>$cmd9
201
+ </code></pre></blockquote>
202
+ Evaluation commands:
203
+
204
+ <blockquote class="mycode">
205
+ <pre><code>${eval_cmd9}</code></pre>
206
+ </blockquote>
207
+
208
+ </div>
209
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab10" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab10-header">
210
+ Command to generate run:
211
+
212
+ <blockquote class="mycode">
213
+ <pre><code>$cmd10
214
+ </code></pre></blockquote>
215
+ Evaluation commands:
216
+
217
+ <blockquote class="mycode">
218
+ <pre><code>${eval_cmd10}</code></pre>
219
+ </blockquote>
220
+
221
+ </div>
222
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab11" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab11-header">
223
+ Command to generate run:
224
+
225
+ <blockquote class="mycode">
226
+ <pre><code>$cmd11
227
+ </code></pre></blockquote>
228
+ Evaluation commands:
229
+
230
+ <blockquote class="mycode">
231
+ <pre><code>${eval_cmd11}</code></pre>
232
+ </blockquote>
233
+
234
+ </div>
235
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab12" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab12-header">
236
+ Command to generate run:
237
+
238
+ <blockquote class="mycode">
239
+ <pre><code>$cmd12
240
+ </code></pre></blockquote>
241
+ Evaluation commands:
242
+
243
+ <blockquote class="mycode">
244
+ <pre><code>${eval_cmd12}</code></pre>
245
+ </blockquote>
246
+
247
+ </div>
248
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab13" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab13-header">
249
+ Command to generate run:
250
+
251
+ <blockquote class="mycode">
252
+ <pre><code>$cmd13
253
+ </code></pre></blockquote>
254
+ Evaluation commands:
255
+
256
+ <blockquote class="mycode">
257
+ <pre><code>${eval_cmd13}</code></pre>
258
+ </blockquote>
259
+
260
+ </div>
261
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab14" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab14-header">
262
+
263
+ Command to generate run:
264
+
265
+ <blockquote class="mycode">
266
+ <pre><code>$cmd14
267
+ </code></pre></blockquote>
268
+ Evaluation commands:
269
+
270
+ <blockquote class="mycode">
271
+ <pre><code>${eval_cmd14}</code></pre>
272
+ </blockquote>
273
+
274
+ </div>
275
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab15" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab15-header">
276
+
277
+ Command to generate run:
278
+
279
+ <blockquote class="mycode">
280
+ <pre><code>$cmd15
281
+ </code></pre></blockquote>
282
+ Evaluation commands:
283
+
284
+ <blockquote class="mycode">
285
+ <pre><code>${eval_cmd15}</code></pre>
286
+ </blockquote>
287
+
288
+ </div>
289
+
290
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab16" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab16-header">
291
+ Command to generate run:
292
+
293
+ <blockquote class="mycode">
294
+ <pre><code>$cmd16
295
+ </code></pre></blockquote>
296
+ Evaluation commands:
297
+
298
+ <blockquote class="mycode">
299
+ <pre><code>${eval_cmd16}</code></pre>
300
+ </blockquote>
301
+
302
+ </div>
303
+
304
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab17" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab17-header">
305
+ Command to generate run:
306
+
307
+ <blockquote class="mycode">
308
+ <pre><code>$cmd17
309
+ </code></pre></blockquote>
310
+ Evaluation commands:
311
+
312
+ <blockquote class="mycode">
313
+ <pre><code>${eval_cmd17}</code></pre>
314
+ </blockquote>
315
+
316
+ </div>
317
+
318
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab18" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab18-header">
319
+ Command to generate run:
320
+
321
+ <blockquote class="mycode">
322
+ <pre><code>$cmd18
323
+ </code></pre></blockquote>
324
+ Evaluation commands:
325
+
326
+ <blockquote class="mycode">
327
+ <pre><code>${eval_cmd18}</code></pre>
328
+ </blockquote>
329
+
330
+ </div>
331
+
332
+ </div>
333
+ <!-- Tabs content -->
334
+
335
+ </div></td>
336
+ </tr>
pyserini/2cr/mrtydi.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from collections import defaultdict
18
+ from string import Template
19
+
20
+ import argparse
21
+ import math
22
+ import os
23
+ import pkg_resources
24
+ import sys
25
+ import time
26
+ import yaml
27
+
28
+ from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
29
+
30
# Languages covered by the regression matrix, as [two-letter code, full name] pairs.
languages = [
    ['ar', 'arabic'],
    ['bn', 'bengali'],
    ['en', 'english'],
    ['fi', 'finnish'],
    ['id', 'indonesian'],
    ['ja', 'japanese'],
    ['ko', 'korean'],
    ['ru', 'russian'],
    ['sw', 'swahili'],
    ['te', 'telugu'],
    ['th', 'thai'],
]

# Model identifiers; each one is combined with a language code as '<model>.<code>'.
models = [
    'bm25',
    'mdpr-split-pft-nq',
    'mdpr-tied-pft-nq',
    'mdpr-tied-pft-msmarco',
    'mdpr-tied-pft-msmarco-ft-all',
]

# Human-readable model labels used in the generated HTML report.
html_display = {
    'bm25': 'BM25',
    'mdpr-split-pft-nq': 'mDPR (split encoders), pre-FT w/ NQ',
    'mdpr-tied-pft-nq': 'mDPR (tied encoders), pre-FT w/ NQ',
    'mdpr-tied-pft-msmarco': 'mDPR (tied encoders), pre-FT w/ MS MARCO',
    'mdpr-tied-pft-msmarco-ft-all': 'mDPR (tied encoders), pre-FT w/ MS MARCO, FT w/ all',
}

# trec_eval command-line options for each reported metric.
trec_eval_metric_definitions = {
    'MRR@100': '-c -M 100 -m recip_rank',
    'R@100': '-c -m recall.100',
}
58
+
59
+
60
def format_run_command(raw):
    """Break a one-line run command into several lines for display.

    A shell line continuation (backslash followed by a newline) is inserted
    in front of selected flags, and after the literal ``--threads 12``.

    Args:
        raw: The run command as a single-line string.

    Returns:
        The command with line continuations inserted.
    """
    # (search text, replacement) pairs, applied strictly in this order.
    # Plain substring matching is used, so '--encoder' also matches the start
    # of '--encoder-class', while '--batch ' (trailing space) does not match
    # '--batch-size'.
    rewrite_rules = (
        ('--lang', '\\\n --lang'),
        ('--encoder', '\\\n --encoder'),
        ('--topics', '\\\n --topics'),
        ('--index', '\\\n --index'),
        ('--output ', '\\\n --output '),
        ('--batch ', '\\\n --batch '),
        ('--threads 12', '--threads 12 \\\n '),
    )

    formatted = raw
    for needle, replacement in rewrite_rules:
        formatted = formatted.replace(needle, replacement)
    return formatted
68
+
69
+
70
def format_eval_command(raw):
    """Break a one-line evaluation command into several lines for display.

    A shell line continuation (backslash followed by a newline) is inserted
    before every ``-c `` and before the final whitespace-separated token of
    the command.

    Only the last occurrence of the final token is moved onto its own line;
    earlier occurrences of the same text are left alone. An empty or
    whitespace-only command is returned without raising.

    Args:
        raw: The evaluation command as a single-line string.

    Returns:
        The command with line continuations inserted.
    """
    formatted = raw.replace('-c ', '\\\n -c ')

    tokens = raw.split()
    if not tokens:
        # Nothing to put on a final line.
        return formatted

    # Locate the final token in the already-rewritten text so that the
    # continuation lands in front of its last occurrence only.
    cut = formatted.rfind(tokens[-1])
    if cut < 0:
        # The '-c ' rewrite split the token itself; leave the text as is.
        return formatted
    return formatted[:cut] + '\\\n ' + formatted[cut:]
73
+
74
+
75
def read_file(f):
    """Return the full text content of a file.

    Args:
        f: Path of the file to read.

    Returns:
        The file content as a single string.
    """
    # The context manager closes the handle even if read() raises.
    with open(f, 'r') as fin:
        return fin.read()
81
+
82
+
83
def list_conditions():
    """Print the available model conditions, then the language codes."""
    sections = (
        ('Conditions:\n-----------', models),
        ('\nLanguages\n---------', [entry[0] for entry in languages]),
    )
    for heading, items in sections:
        print(heading)
        for item in items:
            print(item)
90
+
91
+
92
def print_results(table, metric, split):
    """Print a models-by-languages score grid for one metric and split.

    Args:
        table: Nested mapping indexed as table['<model>.<lang>'][split][metric].
        metric: Name of the metric to display.
        split: Name of the split to display.
    """
    print(f'Metric = {metric}, Split = {split}')

    # Header line: left padding followed by one column per language code.
    header_cells = ''.join(f'{lang[0]:3} ' for lang in languages)
    print(' ' * 32 + header_cells)

    # One line per model: padded model name followed by its scores.
    for model in models:
        score_cells = ''.join(
            f'{table[f"{model}.{lang[0]}"][split][metric]:7.3f}' for lang in languages
        )
        print(f'{model:30}' + score_cells)
    print('')
105
+
106
+
107
def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric):
    """Render one HTML table row per model for the given split and metric.

    The row template is filled with: ``table_cnt``, ``row_cnt``, ``model``,
    one score placeholder per language code, ``avg`` (mean score across the
    languages), and ``cmdN`` / ``eval_cmdN`` where N is the 1-based position
    of the language in the module-level ``languages`` list.

    Args:
        table: Nested mapping indexed as table['<model>.<lang>'][split][metric].
        row_template: ``string.Template`` source text for a single row.
        commands: Mapping from '<model>.<lang>' to the run command.
        eval_commands: Nested mapping indexed as
            eval_commands['<model>.<lang>'][metric].
        table_id: Value substituted for ``table_cnt``.
        split: Split whose scores are shown.
        metric: Metric whose scores are shown.

    Returns:
        A list of rendered row strings, one per entry in ``models``.
    """
    html_rows = []
    # The template does not change between rows, so build it once.
    template = Template(row_template)

    for row_cnt, model in enumerate(models, start=1):
        # Language code -> condition key, in the order given by `languages`.
        keys = {lang[0]: f'{model}.{lang[0]}' for lang in languages}
        scores = {code: table[key][split][metric] for code, key in keys.items()}
        avg = sum(scores.values()) / len(scores)

        fields = {
            'table_cnt': table_id,
            'row_cnt': row_cnt,
            'model': html_display[model],
            'avg': f'{avg:.3f}',
        }
        for position, (code, key) in enumerate(keys.items(), start=1):
            fields[code] = f'{scores[code]:.3f}'
            fields[f'cmd{position}'] = f'{commands[key]}'
            fields[f'eval_cmd{position}'] = f'{eval_commands[key][metric]}'

        html_rows.append(template.substitute(fields))

    return html_rows
174
+
175
+
176
def generate_report(args):
    """Write the Mr.TyDi HTML report from the expected scores in mrtydi.yaml.

    Two tables are produced, both for the test split: MRR@100 and R@100.
    No retrieval or evaluation is executed here; the scores are the expected
    values stored in the YAML file.

    Args:
        args: Parsed command-line arguments. ``args.directory`` is used to
            build the run-file paths shown in the commands and
            ``args.output`` is the path of the HTML file that is written.
    """
    # Both tables below are built for this split. Commands are recorded for
    # this split only, so the commands displayed next to the scores do not
    # depend on the order in which splits are listed in the YAML file.
    report_split = 'test'

    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    commands = defaultdict(lambda: '')
    eval_commands = defaultdict(lambda: defaultdict(lambda: ''))

    html_template = read_file(pkg_resources.resource_filename(__name__, 'mrtydi_html.template'))
    table_template = read_file(pkg_resources.resource_filename(__name__, 'mrtydi_html_table.template'))
    row_template = read_file(pkg_resources.resource_filename(__name__, 'mrtydi_html_table_row.template'))

    with open(pkg_resources.resource_filename(__name__, 'mrtydi.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        eval_key = condition['eval_key']
        cmd_template = condition['command']

        for splits in condition['splits']:
            split = splits['split']
            is_report_split = split == report_split

            runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt')
            cmd = Template(cmd_template).substitute(split=split, output=runfile)
            if is_report_split:
                commands[name] = format_run_command(cmd)

            for expected in splits['scores']:
                for metric in expected:
                    table[name][split][metric] = expected[metric]

                    eval_cmd = (f'python -m pyserini.eval.trec_eval '
                                f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}')
                    if is_report_split:
                        eval_commands[name][metric] = format_eval_command(eval_cmd)

    # (table id, metric, table description) for each table in the report.
    table_specs = (
        (1, 'MRR@100', 'MRR@100, test queries'),
        (2, 'R@100', 'Recall@100, test queries'),
    )
    tables_html = []
    for table_id, metric, desc in table_specs:
        html_rows = generate_table_rows(table, row_template, commands, eval_commands,
                                        table_id, report_split, metric)
        all_rows = '\n'.join(html_rows)
        tables_html.append(Template(table_template).substitute(desc=desc, rows=all_rows))

    with open(args.output, 'w') as out:
        out.write(Template(html_template).substitute(title='Mr.TyDi', tables=' '.join(tables_html)))
221
+
222
+
223
def run_conditions(args):
    """Run the selected conditions and compare their scores with mrtydi.yaml.

    For every selected condition and split, the run command is executed
    unless the run file already exists or ``args.dry_run`` is set. Unless
    ``args.skip_eval`` is set, each metric is then evaluated and compared with
    the expected score. Summary grids and the elapsed time are printed last.

    Args:
        args: Parsed command-line arguments; reads ``all``, ``condition``,
            ``language``, ``directory``, ``display_commands``, ``dry_run``
            and ``skip_eval``.
    """
    start = time.time()

    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

    # Known flaky (condition, split) pairs and the absolute tolerance within
    # which a score is still reported as "okish" rather than failed.
    flaky_tolerances = {
        ('mdpr-tied-pft-nq.te', 'dev'): 2e-4,  # small difference on orca
        ('mdpr-tied-pft-msmarco-ft-all.ko', 'train'): 4e-4,  # small difference on orca
        ('mdpr-tied-pft-msmarco.th', 'train'): 3e-4,  # small difference on Mac Studio (M1)
    }

    with open(pkg_resources.resource_filename(__name__, 'mrtydi.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        name_parts = name.split('.')
        encoder, lang = name_parts[0], name_parts[-1]

        # Without --all, keep only the requested condition and, when given,
        # the requested language.
        if not args.all:
            if args.condition != encoder:
                continue
            if args.language and args.language != lang:
                continue

        eval_key = condition['eval_key']
        cmd_template = condition['command']

        print(f'condition {name}:')

        for splits in condition['splits']:
            split = splits['split']

            print(f' - split: {split}')

            runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt')
            cmd = Template(cmd_template).substitute(split=split, output=runfile)

            if args.display_commands:
                print(f'\n```bash\n{format_run_command(cmd)}\n```\n')

            # An existing run file is reused instead of being regenerated.
            if not (os.path.exists(runfile) or args.dry_run):
                os.system(cmd)

            for expected in splits['scores']:
                for metric in expected:
                    if args.skip_eval:
                        # No evaluation: report the expected score as is.
                        table[name][split][metric] = expected[metric]
                        continue

                    score = float(run_eval_and_return_metric(metric, f'{eval_key}-{split}',
                                                             trec_eval_metric_definitions[metric], runfile))
                    target = float(expected[metric])
                    tolerance = flaky_tolerances.get((name, split))

                    if math.isclose(score, target):
                        result_str = ok_str
                    elif tolerance is not None and math.isclose(score, target, abs_tol=tolerance):
                        result_str = okish_str
                    else:
                        result_str = fail_str + f' expected {expected[metric]:.4f}'
                    print(f' {metric:7}: {score:.4f} {result_str}')
                    table[name][split][metric] = score

        print('')

    for metric in ['MRR@100', 'R@100']:
        for split in ['test', 'dev', 'train']:
            print_results(table, metric, split)

    end = time.time()
    print(f'Total elapsed time: {end - start:.0f}s')
294
+
295
+
296
if __name__ == '__main__':
    # Command-line entry point: list conditions, generate the HTML report,
    # or run the selected experimental conditions.
    parser = argparse.ArgumentParser(description='Generate regression matrix for Mr.TyDi.')
    parser.add_argument('--condition', type=str,
                        help='Condition to run', required=False)
    # To list all conditions
    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
    # For generating reports
    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
    parser.add_argument('--output', type=str, help='File to store report.', required=False)
    # For actually running the experimental conditions
    parser.add_argument('--all', action='store_true', default=False, help='Run using all languages.')
    parser.add_argument('--language', type=str, help='Language to run.', required=False)
    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')
    args = parser.parse_args()

    if args.list_conditions:
        list_conditions()
        sys.exit()

    if args.generate_report:
        # A report cannot be written without a destination file.
        if not args.output:
            print('Must specify report filename with --output.')
            sys.exit()

        generate_report(args)
        sys.exit()

    # --all already selects everything, so it cannot be combined with filters.
    if args.all and (args.condition or args.language):
        print('Specifying --all will run all conditions and languages')
        sys.exit()

    run_conditions(args)
pyserini/2cr/mrtydi.yaml ADDED
@@ -0,0 +1,890 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conditions:
2
+ # mDPR, tied encoders, pFT w/ MS MARCO, FT all
3
+ - name: mdpr-tied-pft-msmarco-ft-all.ar
4
+ eval_key: mrtydi-v1.1-arabic
5
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
6
+ splits:
7
+ - split: train
8
+ scores:
9
+ - MRR@100: 0.9505
10
+ R@100: 1.0000
11
+ - split: dev
12
+ scores:
13
+ - MRR@100: 0.6913
14
+ R@100: 0.9165
15
+ - split: test
16
+ scores:
17
+ - MRR@100: 0.6949
18
+ R@100: 0.9004
19
+ - name: mdpr-tied-pft-msmarco-ft-all.bn
20
+ eval_key: mrtydi-v1.1-bengali
21
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
22
+ splits:
23
+ - split: train
24
+ scores:
25
+ - MRR@100: 0.9620
26
+ R@100: 1.0000
27
+ - split: dev
28
+ scores:
29
+ - MRR@100: 0.5897
30
+ R@100: 0.8977
31
+ - split: test
32
+ scores:
33
+ - MRR@100: 0.6228
34
+ R@100: 0.9550
35
+ - name: mdpr-tied-pft-msmarco-ft-all.en
36
+ eval_key: mrtydi-v1.1-english
37
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
38
+ splits:
39
+ - split: train
40
+ scores:
41
+ - MRR@100: 0.8278
42
+ R@100: 1.0000
43
+ - split: dev
44
+ scores:
45
+ - MRR@100: 0.5357
46
+ R@100: 0.8884
47
+ - split: test
48
+ scores:
49
+ - MRR@100: 0.4916
50
+ R@100: 0.8414
51
+ - name: mdpr-tied-pft-msmarco-ft-all.fi
52
+ eval_key: mrtydi-v1.1-finnish
53
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
54
+ splits:
55
+ - split: train
56
+ scores:
57
+ - MRR@100: 0.9577
58
+ R@100: 0.9997
59
+ - split: dev
60
+ scores:
61
+ - MRR@100: 0.6626
62
+ R@100: 0.9171
63
+ - split: test
64
+ scores:
65
+ - MRR@100: 0.5595
66
+ R@100: 0.8563
67
+ - name: mdpr-tied-pft-msmarco-ft-all.id
68
+ eval_key: mrtydi-v1.1-indonesian
69
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
70
+ splits:
71
+ - split: train
72
+ scores:
73
+ - MRR@100: 0.9469
74
+ R@100: 1.0000
75
+ - split: dev
76
+ scores:
77
+ - MRR@100: 0.6294
78
+ R@100: 0.9150
79
+ - split: test
80
+ scores:
81
+ - MRR@100: 0.5783
82
+ R@100: 0.8609
83
+ - name: mdpr-tied-pft-msmarco-ft-all.ja
84
+ eval_key: mrtydi-v1.1-japanese
85
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
86
+ splits:
87
+ - split: train
88
+ scores:
89
+ - MRR@100: 0.8802
90
+ R@100: 0.9997
91
+ - split: dev
92
+ scores:
93
+ - MRR@100: 0.5505
94
+ R@100: 0.8696
95
+ - split: test
96
+ scores:
97
+ - MRR@100: 0.5007
98
+ R@100: 0.8130
99
+ - name: mdpr-tied-pft-msmarco-ft-all.ko
100
+ eval_key: mrtydi-v1.1-korean
101
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
102
+ splits:
103
+ - split: train
104
+ scores:
105
+ - MRR@100: 0.9195
106
+ R@100: 1.0000
107
+ - split: dev
108
+ scores:
109
+ - MRR@100: 0.5645
110
+ R@100: 0.8663
111
+ - split: test
112
+ scores:
113
+ - MRR@100: 0.4861
114
+ R@100: 0.7854
115
+ - name: mdpr-tied-pft-msmarco-ft-all.ru
116
+ eval_key: mrtydi-v1.1-russian
117
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
118
+ splits:
119
+ - split: train
120
+ scores:
121
+ - MRR@100: 0.8473
122
+ R@100: 0.9994
123
+ - split: dev
124
+ scores:
125
+ - MRR@100: 0.5104
126
+ R@100: 0.8720
127
+ - split: test
128
+ scores:
129
+ - MRR@100: 0.5161
130
+ R@100: 0.8432
131
+ - name: mdpr-tied-pft-msmarco-ft-all.sw
132
+ eval_key: mrtydi-v1.1-swahili
133
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
134
+ splits:
135
+ - split: train
136
+ scores:
137
+ - MRR@100: 0.9515
138
+ R@100: 1.0000
139
+ - split: dev
140
+ scores:
141
+ - MRR@100: 0.6404
142
+ R@100: 0.9018
143
+ - split: test
144
+ scores:
145
+ - MRR@100: 0.6438
146
+ R@100: 0.8756
147
+ - name: mdpr-tied-pft-msmarco-ft-all.te
148
+ eval_key: mrtydi-v1.1-telugu
149
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
150
+ splits:
151
+ - split: train
152
+ scores:
153
+ - MRR@100: 0.9679
154
+ R@100: 1.0000
155
+ - split: dev
156
+ scores:
157
+ - MRR@100: 0.7962
158
+ R@100: 0.9593
159
+ - split: test
160
+ scores:
161
+ - MRR@100: 0.8908
162
+ R@100: 0.9659
163
+ - name: mdpr-tied-pft-msmarco-ft-all.th
164
+ eval_key: mrtydi-v1.1-thai
165
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
166
+ splits:
167
+ - split: train
168
+ scores:
169
+ - MRR@100: 0.9504
170
+ R@100: 1.0000
171
+ - split: dev
172
+ scores:
173
+ - MRR@100: 0.6670
174
+ R@100: 0.9114
175
+ - split: test
176
+ scores:
177
+ - MRR@100: 0.6175
178
+ R@100: 0.8826
179
+
180
+ # mDPR, tied encoders, pFT w/ MS MARCO
181
+ - name: mdpr-tied-pft-msmarco.ar
182
+ eval_key: mrtydi-v1.1-arabic
183
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco --output $output --hits 100
184
+ splits:
185
+ - split: train
186
+ scores:
187
+ - MRR@100: 0.3957
188
+ R@100: 0.7818
189
+ - split: dev
190
+ scores:
191
+ - MRR@100: 0.3978
192
+ R@100: 0.7778
193
+ - split: test
194
+ scores:
195
+ - MRR@100: 0.4414
196
+ R@100: 0.7971
197
+ - name: mdpr-tied-pft-msmarco.bn
198
+ eval_key: mrtydi-v1.1-bengali
199
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco --output $output --hits 100
200
+ splits:
201
+ - split: train
202
+ scores:
203
+ - MRR@100: 0.2920
204
+ R@100: 0.7323
205
+ - split: dev
206
+ scores:
207
+ - MRR@100: 0.2993
208
+ R@100: 0.7318
209
+ - split: test
210
+ scores:
211
+ - MRR@100: 0.3969
212
+ R@100: 0.7838
213
+ - name: mdpr-tied-pft-msmarco.en
214
+ eval_key: mrtydi-v1.1-english
215
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco --output $output --hits 100
216
+ splits:
217
+ - split: train
218
+ scores:
219
+ - MRR@100: 0.3374
220
+ R@100: 0.8111
221
+ - split: dev
222
+ scores:
223
+ - MRR@100: 0.3451
224
+ R@100: 0.7995
225
+ - split: test
226
+ scores:
227
+ - MRR@100: 0.3270
228
+ R@100: 0.7536
229
+ - name: mdpr-tied-pft-msmarco.fi
230
+ eval_key: mrtydi-v1.1-finnish
231
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco --output $output --hits 100
232
+ splits:
233
+ - split: train
234
+ scores:
235
+ - MRR@100: 0.3668
236
+ R@100: 0.7337
237
+ - split: dev
238
+ scores:
239
+ - MRR@100: 0.3636
240
+ R@100: 0.7371
241
+ - split: test
242
+ scores:
243
+ - MRR@100: 0.2750
244
+ R@100: 0.6471
245
+ - name: mdpr-tied-pft-msmarco.id
246
+ eval_key: mrtydi-v1.1-indonesian
247
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco --output $output --hits 100
248
+ splits:
249
+ - split: train
250
+ scores:
251
+ - MRR@100: 0.2794
252
+ R@100: 0.7044
253
+ - split: dev
254
+ scores:
255
+ - MRR@100: 0.2853
256
+ R@100: 0.7198
257
+ - split: test
258
+ scores:
259
+ - MRR@100: 0.3520
260
+ R@100: 0.7356
261
+ - name: mdpr-tied-pft-msmarco.ja
262
+ eval_key: mrtydi-v1.1-japanese
263
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco --output $output --hits 100
264
+ splits:
265
+ - split: train
266
+ scores:
267
+ - MRR@100: 0.3089
268
+ R@100: 0.7603
269
+ - split: dev
270
+ scores:
271
+ - MRR@100: 0.3108
272
+ R@100: 0.7597
273
+ - split: test
274
+ scores:
275
+ - MRR@100: 0.3107
276
+ R@100: 0.7317
277
+ - name: mdpr-tied-pft-msmarco.ko
278
+ eval_key: mrtydi-v1.1-korean
279
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco --output $output --hits 100
280
+ splits:
281
+ - split: train
282
+ scores:
283
+ - MRR@100: 0.3003
284
+ R@100: 0.6907
285
+ - split: dev
286
+ scores:
287
+ - MRR@100: 0.3017
288
+ R@100: 0.7046
289
+ - split: test
290
+ scores:
291
+ - MRR@100: 0.2820
292
+ R@100: 0.6172
293
+ - name: mdpr-tied-pft-msmarco.ru
294
+ eval_key: mrtydi-v1.1-russian
295
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco --output $output --hits 100
296
+ splits:
297
+ - split: train
298
+ scores:
299
+ - MRR@100: 0.2856
300
+ R@100: 0.7305
301
+ - split: dev
302
+ scores:
303
+ - MRR@100: 0.2943
304
+ R@100: 0.7404
305
+ - split: test
306
+ scores:
307
+ - MRR@100: 0.3561
308
+ R@100: 0.7432
309
+ - name: mdpr-tied-pft-msmarco.sw
310
+ eval_key: mrtydi-v1.1-swahili
311
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco --output $output --hits 100
312
+ splits:
313
+ - split: train
314
+ scores:
315
+ - MRR@100: 0.2491
316
+ R@100: 0.5195
317
+ - split: dev
318
+ scores:
319
+ - MRR@100: 0.2447
320
+ R@100: 0.5266
321
+ - split: test
322
+ scores:
323
+ - MRR@100: 0.3418
324
+ R@100: 0.6343
325
+ - name: mdpr-tied-pft-msmarco.te
326
+ eval_key: mrtydi-v1.1-telugu
327
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco --output $output --hits 100
328
+ splits:
329
+ - split: train
330
+ scores:
331
+ - MRR@100: 0.3059
332
+ R@100: 0.7510
333
+ - split: dev
334
+ scores:
335
+ - MRR@100: 0.2995
336
+ R@100: 0.7355
337
+ - split: test
338
+ scores:
339
+ - MRR@100: 0.3102
340
+ R@100: 0.7817
341
+ - name: mdpr-tied-pft-msmarco.th
342
+ eval_key: mrtydi-v1.1-thai
343
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco --output $output --hits 100
344
+ splits:
345
+ - split: train
346
+ scores:
347
+ - MRR@100: 0.2334
348
+ R@100: 0.5851
349
+ - split: dev
350
+ scores:
351
+ - MRR@100: 0.2407
352
+ R@100: 0.5795
353
+ - split: test
354
+ scores:
355
+ - MRR@100: 0.2693
356
+ R@100: 0.5945
357
+
358
+ # mDPR, tied encoders, pFT w/ NQ
359
+ - name: mdpr-tied-pft-nq.ar
360
+ eval_key: mrtydi-v1.1-arabic
361
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-nq --output $output --hits 100
362
+ splits:
363
+ - split: train
364
+ scores:
365
+ - MRR@100: 0.2087
366
+ R@100: 0.5854
367
+ - split: dev
368
+ scores:
369
+ - MRR@100: 0.2132
370
+ R@100: 0.5868
371
+ - split: test
372
+ scores:
373
+ - MRR@100: 0.2214
374
+ R@100: 0.6001
375
+ - name: mdpr-tied-pft-nq.bn
376
+ eval_key: mrtydi-v1.1-bengali
377
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-nq --output $output --hits 100
378
+ splits:
379
+ - split: train
380
+ scores:
381
+ - MRR@100: 0.2371
382
+ R@100: 0.6281
383
+ - split: dev
384
+ scores:
385
+ - MRR@100: 0.2414
386
+ R@100: 0.6409
387
+ - split: test
388
+ scores:
389
+ - MRR@100: 0.2535
390
+ R@100: 0.7072
391
+ - name: mdpr-tied-pft-nq.en
392
+ eval_key: mrtydi-v1.1-english
393
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-nq --output $output --hits 100
394
+ splits:
395
+ - split: train
396
+ scores:
397
+ - MRR@100: 0.2441
398
+ R@100: 0.7217
399
+ - split: dev
400
+ scores:
401
+ - MRR@100: 0.2359
402
+ R@100: 0.7187
403
+ - split: test
404
+ scores:
405
+ - MRR@100: 0.2433
406
+ R@100: 0.6893
407
+ - name: mdpr-tied-pft-nq.fi
408
+ eval_key: mrtydi-v1.1-finnish
409
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-nq --output $output --hits 100
410
+ splits:
411
+ - split: train
412
+ scores:
413
+ - MRR@100: 0.2996
414
+ R@100: 0.6787
415
+ - split: dev
416
+ scores:
417
+ - MRR@100: 0.3252
418
+ R@100: 0.7037
419
+ - split: test
420
+ scores:
421
+ - MRR@100: 0.2444
422
+ R@100: 0.6401
423
+ - name: mdpr-tied-pft-nq.id
424
+ eval_key: mrtydi-v1.1-indonesian
425
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-nq --output $output --hits 100
426
+ splits:
427
+ - split: train
428
+ scores:
429
+ - MRR@100: 0.2706
430
+ R@100: 0.7322
431
+ - split: dev
432
+ scores:
433
+ - MRR@100: 0.2719
434
+ R@100: 0.7394
435
+ - split: test
436
+ scores:
437
+ - MRR@100: 0.2815
438
+ R@100: 0.6914
439
+ - name: mdpr-tied-pft-nq.ja
440
+ eval_key: mrtydi-v1.1-japanese
441
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-nq --output $output --hits 100
442
+ splits:
443
+ - split: train
444
+ scores:
445
+ - MRR@100: 0.2165
446
+ R@100: 0.6043
447
+ - split: dev
448
+ scores:
449
+ - MRR@100: 0.2299
450
+ R@100: 0.6239
451
+ - split: test
452
+ scores:
453
+ - MRR@100: 0.2058
454
+ R@100: 0.5734
455
+ - name: mdpr-tied-pft-nq.ko
456
+ eval_key: mrtydi-v1.1-korean
457
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-nq --output $output --hits 100
458
+ splits:
459
+ - split: train
460
+ scores:
461
+ - MRR@100: 0.2527
462
+ R@100: 0.6556
463
+ - split: dev
464
+ scores:
465
+ - MRR@100: 0.2680
466
+ R@100: 0.6271
467
+ - split: test
468
+ scores:
469
+ - MRR@100: 0.2234
470
+ R@100: 0.5499
471
+ - name: mdpr-tied-pft-nq.ru
472
+ eval_key: mrtydi-v1.1-russian
473
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-nq --output $output --hits 100
474
+ splits:
475
+ - split: train
476
+ scores:
477
+ - MRR@100: 0.2160
478
+ R@100: 0.6262
479
+ - split: dev
480
+ scores:
481
+ - MRR@100: 0.2263
482
+ R@100: 0.6444
483
+ - split: test
484
+ scores:
485
+ - MRR@100: 0.2501
486
+ R@100: 0.6181
487
+ - name: mdpr-tied-pft-nq.sw
488
+ eval_key: mrtydi-v1.1-swahili
489
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-nq --output $output --hits 100
490
+ splits:
491
+ - split: train
492
+ scores:
493
+ - MRR@100: 0.2383
494
+ R@100: 0.5707
495
+ - split: dev
496
+ scores:
497
+ - MRR@100: 0.2543
498
+ R@100: 0.6138
499
+ - split: test
500
+ scores:
501
+ - MRR@100: 0.2621
502
+ R@100: 0.5965
503
+ - name: mdpr-tied-pft-nq.te
504
+ eval_key: mrtydi-v1.1-telugu
505
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-nq --output $output --hits 100
506
+ splits:
507
+ - split: train
508
+ scores:
509
+ - MRR@100: 0.1483
510
+ R@100: 0.4162
511
+ - split: dev
512
+ scores:
513
+ - MRR@100: 0.1494
514
+ R@100: 0.3967
515
+ - split: test
516
+ scores:
517
+ - MRR@100: 0.0970
518
+ R@100: 0.2454
519
+ - name: mdpr-tied-pft-nq.th
520
+ eval_key: mrtydi-v1.1-thai
521
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-nq --output $output --hits 100
522
+ splits:
523
+ - split: train
524
+ scores:
525
+ - MRR@100: 0.1426
526
+ R@100: 0.4717
527
+ - split: dev
528
+ scores:
529
+ - MRR@100: 0.1618
530
+ R@100: 0.4637
531
+ - split: test
532
+ scores:
533
+ - MRR@100: 0.1575
534
+ R@100: 0.4550
535
+
536
+ # mDPR, split encoders, pFT w/ NQ
537
+ - name: mdpr-split-pft-nq.ar
538
+ eval_key: mrtydi-v1.1-arabic
539
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-nq --output $output --hits 100
540
+ splits:
541
+ - split: train
542
+ scores:
543
+ - MRR@100: 0.2510
544
+ R@100: 0.6384
545
+ - split: dev
546
+ scores:
547
+ - MRR@100: 0.2449
548
+ R@100: 0.6334
549
+ - split: test
550
+ scores:
551
+ - MRR@100: 0.2907
552
+ R@100: 0.6502
553
+ - name: mdpr-split-pft-nq.bn
554
+ eval_key: mrtydi-v1.1-bengali
555
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-nq --output $output --hits 100
556
+ splits:
557
+ - split: train
558
+ scores:
559
+ - MRR@100: 0.2293
560
+ R@100: 0.6454
561
+ - split: dev
562
+ scores:
563
+ - MRR@100: 0.2367
564
+ R@100: 0.6511
565
+ - split: test
566
+ scores:
567
+ - MRR@100: 0.2911
568
+ R@100: 0.7793
569
+ - name: mdpr-split-pft-nq.en
570
+ eval_key: mrtydi-v1.1-english
571
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-nq --output $output --hits 100
572
+ splits:
573
+ - split: train
574
+ scores:
575
+ - MRR@100: 0.2862
576
+ R@100: 0.7372
577
+ - split: dev
578
+ scores:
579
+ - MRR@100: 0.2821
580
+ R@100: 0.7437
581
+ - split: test
582
+ scores:
583
+ - MRR@100: 0.2907
584
+ R@100: 0.6779
585
+ - name: mdpr-split-pft-nq.fi
586
+ eval_key: mrtydi-v1.1-finnish
587
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-nq --output $output --hits 100
588
+ splits:
589
+ - split: train
590
+ scores:
591
+ - MRR@100: 0.2473
592
+ R@100: 0.6289
593
+ - split: dev
594
+ scores:
595
+ - MRR@100: 0.2466
596
+ R@100: 0.6283
597
+ - split: test
598
+ scores:
599
+ - MRR@100: 0.2050
600
+ R@100: 0.5680
601
+ - name: mdpr-split-pft-nq.id
602
+ eval_key: mrtydi-v1.1-indonesian
603
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-nq --output $output --hits 100
604
+ splits:
605
+ - split: train
606
+ scores:
607
+ - MRR@100: 0.2351
608
+ R@100: 0.6952
609
+ - split: dev
610
+ scores:
611
+ - MRR@100: 0.2475
612
+ R@100: 0.7181
613
+ - split: test
614
+ scores:
615
+ - MRR@100: 0.2705
616
+ R@100: 0.6848
617
+ - name: mdpr-split-pft-nq.ja
618
+ eval_key: mrtydi-v1.1-japanese
619
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-nq --output $output --hits 100
620
+ splits:
621
+ - split: train
622
+ scores:
623
+ - MRR@100: 0.1967
624
+ R@100: 0.5983
625
+ - split: dev
626
+ scores:
627
+ - MRR@100: 0.2055
628
+ R@100: 0.6142
629
+ - split: test
630
+ scores:
631
+ - MRR@100: 0.2119
632
+ R@100: 0.5840
633
+ - name: mdpr-split-pft-nq.ko
634
+ eval_key: mrtydi-v1.1-korean
635
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-nq --output $output --hits 100
636
+ splits:
637
+ - split: train
638
+ scores:
639
+ - MRR@100: 0.2383
640
+ R@100: 0.6180
641
+ - split: dev
642
+ scores:
643
+ - MRR@100: 0.2343
644
+ R@100: 0.6238
645
+ - split: test
646
+ scores:
647
+ - MRR@100: 0.2345
648
+ R@100: 0.5325
649
+ - name: mdpr-split-pft-nq.ru
650
+ eval_key: mrtydi-v1.1-russian
651
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-nq --output $output --hits 100
652
+ splits:
653
+ - split: train
654
+ scores:
655
+ - MRR@100: 0.2317
656
+ R@100: 0.6534
657
+ - split: dev
658
+ scores:
659
+ - MRR@100: 0.2490
660
+ R@100: 0.6553
661
+ - split: test
662
+ scores:
663
+ - MRR@100: 0.2820
664
+ R@100: 0.6474
665
+ - name: mdpr-split-pft-nq.sw
666
+ eval_key: mrtydi-v1.1-swahili
667
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-nq --output $output --hits 100
668
+ splits:
669
+ - split: train
670
+ scores:
671
+ - MRR@100: 0.1457
672
+ R@100: 0.4481
673
+ - split: dev
674
+ scores:
675
+ - MRR@100: 0.1547
676
+ R@100: 0.4724
677
+ - split: test
678
+ scores:
679
+ - MRR@100: 0.1883
680
+ R@100: 0.5281
681
+ - name: mdpr-split-pft-nq.te
682
+ eval_key: mrtydi-v1.1-telugu
683
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-nq --output $output --hits 100
684
+ splits:
685
+ - split: train
686
+ scores:
687
+ - MRR@100: 0.1489
688
+ R@100: 0.4905
689
+ - split: dev
690
+ scores:
691
+ - MRR@100: 0.1503
692
+ R@100: 0.4934
693
+ - split: test
694
+ scores:
695
+ - MRR@100: 0.1099
696
+ R@100: 0.3661
697
+ - name: mdpr-split-pft-nq.th
698
+ eval_key: mrtydi-v1.1-thai
699
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-nq --output $output --hits 100
700
+ splits:
701
+ - split: train
702
+ scores:
703
+ - MRR@100: 0.1603
704
+ R@100: 0.4983
705
+ - split: dev
706
+ scores:
707
+ - MRR@100: 0.1584
708
+ R@100: 0.5083
709
+ - split: test
710
+ scores:
711
+ - MRR@100: 0.1709
712
+ R@100: 0.5146
713
+
714
+ # BM25
715
+ - name: bm25.ar
716
+ eval_key: mrtydi-v1.1-arabic
717
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ar --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic --output $output --bm25 --hits 100
718
+ splits:
719
+ - split: train
720
+ scores:
721
+ - MRR@100: 0.3356
722
+ R@100: 0.7944
723
+ - split: dev
724
+ scores:
725
+ - MRR@100: 0.3462
726
+ R@100: 0.7872
727
+ - split: test
728
+ scores:
729
+ - MRR@100: 0.3682
730
+ R@100: 0.7928
731
+ - name: bm25.bn
732
+ eval_key: mrtydi-v1.1-bengali
733
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language bn --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali --output $output --bm25 --hits 100
734
+ splits:
735
+ - split: train
736
+ scores:
737
+ - MRR@100: 0.3566
738
+ - R@100: 0.8336
739
+ - split: dev
740
+ scores:
741
+ - MRR@100: 0.3385
742
+ - R@100: 0.8432
743
+ - split: test
744
+ scores:
745
+ - MRR@100: 0.4182
746
+ - R@100: 0.8694
747
+ - name: bm25.en
748
+ eval_key: mrtydi-v1.1-english
749
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language en --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english --output $output --bm25 --hits 100
750
+ splits:
751
+ - split: train
752
+ scores:
753
+ - MRR@100: 0.1592
754
+ - R@100: 0.5785
755
+ - split: dev
756
+ scores:
757
+ - MRR@100: 0.1685
758
+ - R@100: 0.6196
759
+ - split: test
760
+ scores:
761
+ - MRR@100: 0.1404
762
+ - R@100: 0.5365
763
+ - name: bm25.fi
764
+ eval_key: mrtydi-v1.1-finnish
765
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language fi --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish --output $output --bm25 --hits 100
766
+ splits:
767
+ - split: train
768
+ scores:
769
+ - MRR@100: 0.4101
770
+ - R@100: 0.8198
771
+ - split: dev
772
+ scores:
773
+ - MRR@100: 0.4136
774
+ - R@100: 0.8285
775
+ - split: test
776
+ scores:
777
+ - MRR@100: 0.2836
778
+ - R@100: 0.7196
779
+ - name: bm25.id
780
+ eval_key: mrtydi-v1.1-indonesian
781
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language id --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian --output $output --bm25 --hits 100
782
+ splits:
783
+ - split: train
784
+ scores:
785
+ - MRR@100: 0.2972
786
+ - R@100: 0.7948
787
+ - split: dev
788
+ scores:
789
+ - MRR@100: 0.2937
790
+ - R@100: 0.7827
791
+ - split: test
792
+ scores:
793
+ - MRR@100: 0.3762
794
+ - R@100: 0.8426
795
+ - name: bm25.ja
796
+ eval_key: mrtydi-v1.1-japanese
797
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ja --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese --output $output --bm25 --hits 100
798
+ splits:
799
+ - split: train
800
+ scores:
801
+ - MRR@100: 0.2262
802
+ - R@100: 0.7290
803
+ - split: dev
804
+ scores:
805
+ - MRR@100: 0.2250
806
+ - R@100: 0.7252
807
+ - split: test
808
+ scores:
809
+ - MRR@100: 0.2125
810
+ - R@100: 0.6431
811
+ - name: bm25.ko
812
+ eval_key: mrtydi-v1.1-korean
813
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ko --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean --output $output --bm25 --hits 100
814
+ splits:
815
+ - split: train
816
+ scores:
817
+ - MRR@100: 0.2596
818
+ - R@100: 0.6178
819
+ - split: dev
820
+ scores:
821
+ - MRR@100: 0.2888
822
+ - R@100: 0.6733
823
+ - split: test
824
+ scores:
825
+ - MRR@100: 0.2848
826
+ - R@100: 0.6188
827
+ - name: bm25.ru
828
+ eval_key: mrtydi-v1.1-russian
829
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ru --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian --output $output --bm25 --hits 100
830
+ splits:
831
+ - split: train
832
+ scores:
833
+ - MRR@100: 0.2229
834
+ - R@100: 0.5779
835
+ - split: dev
836
+ scores:
837
+ - MRR@100: 0.2202
838
+ - R@100: 0.5760
839
+ - split: test
840
+ scores:
841
+ - MRR@100: 0.3163
842
+ - R@100: 0.6541
843
+ - name: bm25.sw
844
+ eval_key: mrtydi-v1.1-swahili
845
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language sw --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili --output $output --bm25 --hits 100
846
+ splits:
847
+ - split: train
848
+ scores:
849
+ - MRR@100: 0.2610
850
+ - R@100: 0.5903
851
+ - split: dev
852
+ scores:
853
+ - MRR@100: 0.2693
854
+ - R@100: 0.5789
855
+ - split: test
856
+ scores:
857
+ - MRR@100: 0.3893
858
+ - R@100: 0.7642
859
+ - name: bm25.te
860
+ eval_key: mrtydi-v1.1-telugu
861
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language te --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu --output $output --bm25 --hits 100
862
+ splits:
863
+ - split: train
864
+ scores:
865
+ - MRR@100: 0.4204
866
+ - R@100: 0.8229
867
+ - split: dev
868
+ scores:
869
+ - MRR@100: 0.4269
870
+ - R@100: 0.8362
871
+ - split: test
872
+ scores:
873
+ - MRR@100: 0.5283
874
+ - R@100: 0.8971
875
+ - name: bm25.th
876
+ eval_key: mrtydi-v1.1-thai
877
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language th --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai --output $output --bm25 --hits 100
878
+ splits:
879
+ - split: train
880
+ scores:
881
+ - MRR@100: 0.3543
882
+ - R@100: 0.8349
883
+ - split: dev
884
+ scores:
885
+ - MRR@100: 0.3586
886
+ - R@100: 0.8536
887
+ - split: test
888
+ scores:
889
+ - MRR@100: 0.4012
890
+ - R@100: 0.8529
pyserini/2cr/mrtydi_html.template ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
6
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
7
+ <title>Pyserini Reproductions</title>
8
+ <!-- Font Awesome -->
9
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
10
+ <!-- Google Fonts Roboto -->
11
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
12
+ <!-- MDB -->
13
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
14
+
15
+ <style>
16
+ tr.hide-table-padding td {
17
+ padding: 0;
18
+ }
19
+
20
+ .expand-button {
21
+ position: relative;
22
+ }
23
+
24
+ .accordion-toggle .expand-button:after {
25
+ position: absolute;
26
+ left:.75rem;
27
+ top: 50%;
28
+ transform: translate(0, -50%);
29
+ content: '-';
30
+ }
31
+
32
+ .accordion-toggle.collapsed .expand-button:after {
33
+ content: '+';
34
+ }
35
+
36
+ blockquote.mycode {
37
+ border-left: 3px solid #ccc;
38
+ margin-left: 25px;
39
+ margin-top: 15px;
40
+ padding-left: 15px;
41
+ }
42
+
43
+ blockquote.mycode2 {
44
+ border-left: 3px solid #ccc;
45
+ margin-left: 25px;
46
+ padding-top: 10px;
47
+ padding-bottom: 10px;
48
+ padding-left: 15px;
49
+ }
50
+
51
+ tr th.headertop {
52
+ border-bottom: none;
53
+ padding-bottom: 0rem
54
+ }
55
+
56
+ tr th.headerbottom {
57
+ padding-top: 0rem
58
+ }
59
+
60
+ .table>:not(caption)>*>*{padding:0.75rem 0.75rem}
61
+
62
+ .copy-code-button {
63
+ border-radius: 0;
64
+ min-width: 55px;
65
+ background: none repeat scroll 0 0 transparent;
66
+ background-color: grey;
67
+ color: #F1F2F3 !important;
68
+ cursor: pointer;
69
+ border-style: none;
70
+ font-family: 'HELVETICA',sans-serif;
71
+ font-size: 0.8em;
72
+ font-weight: normal;
73
+ text-align: center;
74
+ text-decoration: none;
75
+ text-indent: 0;
76
+ text-transform: uppercase;
77
+ font-weight: 500;
78
+ line-height: 1.42rem;
79
+ margin: 0;
80
+ padding: 3px 8px;
81
+ position: absolute !important;
82
+ top: 0 !important;
83
+ right: 0 !important;
84
+ }
85
+
86
+ .copy-code-button > span {
87
+ color: #F1F2F3 !important;
88
+ }
89
+
90
+ .copy-code-button, ::before, ::after {
91
+ box-sizing: inherit;
92
+ }
93
+
94
+ .copy-code-button::before {
95
+ content: '';
96
+ display: inline-block;
97
+ width: 16px;
98
+ height: 16px;
99
+ margin-right: 3px;
100
+ background-size: contain;
101
+ background-image: url("");
102
+ background-repeat: no-repeat;
103
+ position: relative;
104
+ top: 3px;
105
+ }
106
+
107
+ .copy-code-button:focus {
108
+ /* Avoid an ugly focus outline on click in Chrome,
109
+ but darken the button for accessibility.
110
+ See https://stackoverflow.com/a/25298082/1481479 */
111
+ /* background-color: #E6E6E6; */
112
+ outline: 0;
113
+ }
114
+
115
+ pre[class*="prettyprint"] {
116
+ position: relative;
117
+ overflow: hidden;
118
+ }
119
+ </style>
120
+ </head>
121
+ <body>
122
+
123
+ <!-- Background image -->
124
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
125
+ <div class="mask" style="
126
+ background: linear-gradient(
127
+ 45deg,
128
+ rgba(29, 236, 197, 0.7),
129
+ rgba(91, 14, 214, 0.7) 100%
130
+ );
131
+ ">
132
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
133
+ <div class="text-white">
134
+ <h1 class="mb-3">$title</h1>
135
+ </div>
136
+ </div>
137
+ </div>
138
+ </div>
139
+ <!-- Background image -->
140
+
141
+ <div class="container my-4">
142
+
143
+ $tables
144
+
145
+
146
+
147
+ <div style="padding-top: 20px"></div>
148
+
149
+ <h4>Programmatic Execution</h4>
150
+
151
+ <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
152
+ To list all the experimental conditions:</p>
153
+
154
+ <blockquote class="mycode2"><tt>
155
+ python -m pyserini.2cr.mrtydi --list-conditions
156
+ </tt></blockquote>
157
+
158
+ <p>Run all languages for a specific condition and show commands:</p>
159
+
160
+ <blockquote class="mycode2"><tt>
161
+ python -m pyserini.2cr.mrtydi --condition bm25 --display-commands
162
+ </tt></blockquote>
163
+
164
+ <p>Run a particular language for a specific condition and show commands:</p>
165
+
166
+ <blockquote class="mycode2"><tt>
167
+ python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands
168
+ </tt></blockquote>
169
+
170
+ <p>Run all languages for all conditions and show commands:</p>
171
+
172
+ <blockquote class="mycode2"><tt>
173
+ python -m pyserini.2cr.mrtydi --all --display-commands
174
+ </tt></blockquote>
175
+
176
+ <p>With the above commands, run files will be placed in the current directory. Use the option <tt>--directory runs</tt> to place the runs in a sub-directory.</p>
177
+
178
+ <p>For a specific condition, just show the commands and do not run:</p>
179
+
180
+ <blockquote class="mycode2"><tt>
181
+ python -m pyserini.2cr.mrtydi --condition bm25 --display-commands --dry-run
182
+ </tt></blockquote>
183
+
184
+ <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
185
+
186
+ <p>For a specific condition and language, just show the commands and do not run:</p>
187
+
188
+ <blockquote class="mycode2"><tt>
189
+ python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands --dry-run
190
+ </tt></blockquote>
191
+
192
+ <p>For all conditions, just show the commands without running them, and skip evaluation:</p>
193
+
194
+ <blockquote class="mycode2"><tt>
195
+ python -m pyserini.2cr.mrtydi --all --display-commands --dry-run --skip-eval
196
+ </tt></blockquote>
197
+
198
+ <p>Finally, to generate this page:</p>
199
+
200
+ <blockquote class="mycode2"><tt>
201
+ python -m pyserini.2cr.mrtydi --generate-report --output docs/2cr/mrtydi.html
202
+ </tt></blockquote>
203
+
204
+ <p>The output file <tt>mrtydi.html</tt> should be identical to this page.</p>
205
+
206
+ <div style="padding-top: 50px"></div>
207
+
208
+ </div>
209
+
210
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
211
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
212
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
213
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
214
+
215
+ <script>
216
+ document.querySelectorAll('pre').forEach(function (codeBlock) {
217
+ var button = document.createElement('button');
218
+ button.className = 'copy-code-button';
219
+ button.type = 'button';
220
+ var s = codeBlock.innerText;
221
+ button.setAttribute('data-clipboard-text',s);
222
+ button.innerText = 'Copy';
223
+
224
+ // var pre = codeBlock.parentNode;
225
+ codeBlock.classList.add('prettyprint');
226
+ // pre.parentNode.insertBefore(button, pre);
227
+ codeBlock.appendChild(button);
228
+ });
229
+
230
+ var clipboard = new ClipboardJS('.copy-code-button');
231
+
232
+ clipboard.on('success', function(e) {
233
+ console.info('Action:', e.action);
234
+ console.info('Text:', e.text);
235
+ console.info('Trigger:', e.trigger);
236
+ e.trigger.textContent = 'Copied';
237
+ window.setTimeout(function() {
238
+ e.trigger.textContent = 'Copy';
239
+ }, 2000);
240
+ e.clearSelection();
241
+ });
242
+
243
+ clipboard.on('error', function(e) {
244
+ console.error('Action:', e.action);
245
+ console.error('Trigger:', e.trigger);
246
+ e.trigger.textContent = 'Error Copying';
247
+ window.setTimeout(function() {
248
+ e.trigger.textContent = 'Copy';
249
+ }, 2000);
250
+ e.clearSelection();
251
+ });
252
+
253
+ </script>
254
+
255
+ </body>
256
+ </html>
pyserini/2cr/mrtydi_html_table.template ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class="table-responsive">
2
+ <table class="table">
3
+ <thead>
4
+ <tr>
5
+ <th scope="col"></th>
6
+ <th scope="col">$desc</th>
7
+ <th scope="col">ar</th>
8
+ <th scope="col">bn</th>
9
+ <th scope="col">en</th>
10
+ <th scope="col">fi</th>
11
+ <th scope="col">id</th>
12
+ <th scope="col">ja</th>
13
+ <th scope="col">ko</th>
14
+ <th scope="col">ru</th>
15
+ <th scope="col">sw</th>
16
+ <th scope="col">te</th>
17
+ <th scope="col">th</th>
18
+ <th scope="col"></th>
19
+ <th scope="col">avg</th>
20
+ </tr>
21
+ </thead>
22
+ <tbody>
23
+
24
+ $rows
25
+
26
+ </tbody>
27
+ </table>
28
+ </div>
pyserini/2cr/mrtydi_html_table_row.template ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Condition: $model -->
2
+ <tr class="accordion-toggle collapsed" id="table${table_cnt}-row${row_cnt}" data-toggle="collapse" data-parent="#table${table_cnt}-row${row_cnt}" href="#table${table_cnt}-collapse${row_cnt}">
3
+ <td class="expand-button"></td>
4
+ <td>$model</td>
5
+ <td>$ar</td>
6
+ <td>$bn</td>
7
+ <td>$en</td>
8
+ <td>$fi</td>
9
+ <td>$id</td>
10
+ <td>$ja</td>
11
+ <td>$ko</td>
12
+ <td>$ru</td>
13
+ <td>$sw</td>
14
+ <td>$te</td>
15
+ <td>$th</td>
16
+ <td></td>
17
+ <td>$avg</td>
18
+ </tr>
19
+ <tr class="hide-table-padding">
20
+ <td></td>
21
+ <td></td>
22
+ <td colspan="13" style="max-width: 600px">
23
+ <div id="table${table_cnt}-collapse${row_cnt}" class="collapse in p-3">
24
+
25
+ <!-- Tabs navs -->
26
+ <ul class="nav nav-tabs mb-3" id="table${table_cnt}-row${row_cnt}-tabs" role="tablist">
27
+ <li class="nav-item" role="presentation">
28
+ <a class="nav-link active" id="table${table_cnt}-row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab1" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">ar</a>
29
+ </li>
30
+ <li class="nav-item" role="presentation">
31
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab2" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">bn</a>
32
+ </li>
33
+ <li class="nav-item" role="presentation">
34
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab3" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">en</a>
35
+ </li>
36
+ <li class="nav-item" role="presentation">
37
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab4-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab4" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab4" aria-selected="false" style="text-transform:none">fi</a>
38
+ </li>
39
+ <li class="nav-item" role="presentation">
40
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab5-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab5" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab5" aria-selected="false" style="text-transform:none">id</a>
41
+ </li>
42
+ <li class="nav-item" role="presentation">
43
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab6-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab6" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab6" aria-selected="false" style="text-transform:none">ja</a>
44
+ </li>
45
+ <li class="nav-item" role="presentation">
46
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab7-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab7" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab7" aria-selected="false" style="text-transform:none">ko</a>
47
+ </li>
48
+ <li class="nav-item" role="presentation">
49
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab8-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab8" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab8" aria-selected="false" style="text-transform:none">ru</a>
50
+ </li>
51
+ <li class="nav-item" role="presentation">
52
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab9-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab9" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab9" aria-selected="false" style="text-transform:none">sw</a>
53
+ </li>
54
+ <li class="nav-item" role="presentation">
55
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab10-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab10" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab10" aria-selected="false" style="text-transform:none">te</a>
56
+ </li>
57
+ <li class="nav-item" role="presentation">
58
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab11-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab11" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab11" aria-selected="false" style="text-transform:none">th</a>
59
+ </li>
60
+ </ul>
61
+ <!-- Tabs navs -->
62
+
63
+ <!-- Tabs content -->
64
+ <div class="tab-content" id="table${table_cnt}-row${row_cnt}-content">
65
+ <div class="tab-pane fade show active" id="table${table_cnt}-row${row_cnt}-tab1" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab1-header">
66
+ Command to generate run:
67
+
68
+ <blockquote class="mycode">
69
+ <pre><code>$cmd1
70
+ </code></pre></blockquote>
71
+ Evaluation commands:
72
+
73
+ <blockquote class="mycode">
74
+ <pre><code>${eval_cmd1}</code></pre>
75
+ </blockquote>
76
+
77
+ </div>
78
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab2" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab2-header">
79
+ Command to generate run:
80
+
81
+ <blockquote class="mycode">
82
+ <pre><code>$cmd2
83
+ </code></pre></blockquote>
84
+ Evaluation commands:
85
+
86
+ <blockquote class="mycode">
87
+ <pre><code>${eval_cmd2}</code></pre>
88
+ </blockquote>
89
+
90
+ </div>
91
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab3" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab3-header">
92
+ Command to generate run:
93
+
94
+ <blockquote class="mycode">
95
+ <pre><code>$cmd3
96
+ </code></pre></blockquote>
97
+ Evaluation commands:
98
+
99
+ <blockquote class="mycode">
100
+ <pre><code>${eval_cmd3}</code></pre>
101
+ </blockquote>
102
+
103
+ </div>
104
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab4" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab4-header">
105
+ Command to generate run:
106
+
107
+ <blockquote class="mycode">
108
+ <pre><code>$cmd4
109
+ </code></pre></blockquote>
110
+ Evaluation commands:
111
+
112
+ <blockquote class="mycode">
113
+ <pre><code>${eval_cmd4}</code></pre>
114
+ </blockquote>
115
+
116
+ </div>
117
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab5" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab5-header">
118
+ Command to generate run:
119
+
120
+ <blockquote class="mycode">
121
+ <pre><code>$cmd5
122
+ </code></pre></blockquote>
123
+ Evaluation commands:
124
+
125
+ <blockquote class="mycode">
126
+ <pre><code>${eval_cmd5}</code></pre>
127
+ </blockquote>
128
+
129
+ </div>
130
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab6" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab6-header">
131
+ Command to generate run:
132
+
133
+ <blockquote class="mycode">
134
+ <pre><code>$cmd6
135
+ </code></pre></blockquote>
136
+ Evaluation commands:
137
+
138
+ <blockquote class="mycode">
139
+ <pre><code>${eval_cmd6}</code></pre>
140
+ </blockquote>
141
+
142
+ </div>
143
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab7" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab7-header">
144
+ Command to generate run:
145
+
146
+ <blockquote class="mycode">
147
+ <pre><code>$cmd7
148
+ </code></pre></blockquote>
149
+ Evaluation commands:
150
+
151
+ <blockquote class="mycode">
152
+ <pre><code>${eval_cmd7}</code></pre>
153
+ </blockquote>
154
+
155
+ </div>
156
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab8" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab8-header">
157
+ Command to generate run:
158
+
159
+ <blockquote class="mycode">
160
+ <pre><code>$cmd8
161
+ </code></pre></blockquote>
162
+ Evaluation commands:
163
+
164
+ <blockquote class="mycode">
165
+ <pre><code>${eval_cmd8}</code></pre>
166
+ </blockquote>
167
+
168
+ </div>
169
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab9" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab9-header">
170
+ Command to generate run:
171
+
172
+ <blockquote class="mycode">
173
+ <pre><code>$cmd9
174
+ </code></pre></blockquote>
175
+ Evaluation commands:
176
+
177
+ <blockquote class="mycode">
178
+ <pre><code>${eval_cmd9}</code></pre>
179
+ </blockquote>
180
+
181
+ </div>
182
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab10" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab10-header">
183
+ Command to generate run:
184
+
185
+ <blockquote class="mycode">
186
+ <pre><code>$cmd10
187
+ </code></pre></blockquote>
188
+ Evaluation commands:
189
+
190
+ <blockquote class="mycode">
191
+ <pre><code>${eval_cmd10}</code></pre>
192
+ </blockquote>
193
+
194
+ </div>
195
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab11" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab11-header">
196
+ Command to generate run:
197
+
198
+ <blockquote class="mycode">
199
+ <pre><code>$cmd11
200
+ </code></pre></blockquote>
201
+ Evaluation commands:
202
+
203
+ <blockquote class="mycode">
204
+ <pre><code>${eval_cmd11}</code></pre>
205
+ </blockquote>
206
+
207
+ </div>
208
+ </div>
209
+ <!-- Tabs content -->
210
+
211
+ </div></td>
212
+ </tr>
pyserini/2cr/msmarco-v1-doc.yaml ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conditions:
2
+ - name: bm25-doc-tuned
3
+ display: BM25 doc (k1=4.46, b=0.82)
4
+ display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
5
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25
6
+ topics:
7
+ - topic_key: msmarco-doc-dev
8
+ eval_key: msmarco-doc-dev
9
+ scores:
10
+ - MRR@10: 0.2767
11
+ R@1K: 0.9357
12
+ - topic_key: dl19-doc
13
+ eval_key: dl19-doc
14
+ scores:
15
+ - MAP: 0.2336
16
+ nDCG@10: 0.5233
17
+ R@1K: 0.6757
18
+ - topic_key: dl20
19
+ eval_key: dl20-doc
20
+ scores:
21
+ - MAP: 0.3581
22
+ nDCG@10: 0.5061
23
+ R@1K: 0.7776
24
+ - name: bm25-doc-default
25
+ display: BM25 doc (k1=0.9, b=0.4)
26
+ display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
27
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1a)"
28
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
29
+ topics:
30
+ - topic_key: msmarco-doc-dev
31
+ eval_key: msmarco-doc-dev
32
+ scores:
33
+ - MRR@10: 0.2299
34
+ R@1K: 0.8856
35
+ - topic_key: dl19-doc
36
+ eval_key: dl19-doc
37
+ scores:
38
+ - MAP: 0.2434
39
+ nDCG@10: 0.5176
40
+ R@1K: 0.6966
41
+ - topic_key: dl20
42
+ eval_key: dl20-doc
43
+ scores:
44
+ - MAP: 0.3793
45
+ nDCG@10: 0.5286
46
+ R@1K: 0.8085
47
+ - name: bm25-doc-segmented-tuned
48
+ display: BM25 doc segmented (k1=2.16, b=0.61)
49
+ display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
50
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
51
+ topics:
52
+ - topic_key: msmarco-doc-dev
53
+ eval_key: msmarco-doc-dev
54
+ scores:
55
+ - MRR@10: 0.2756
56
+ R@1K: 0.9311
57
+ - topic_key: dl19-doc
58
+ eval_key: dl19-doc
59
+ scores:
60
+ - MAP: 0.2398
61
+ nDCG@10: 0.5389
62
+ R@1K: 0.6565
63
+ - topic_key: dl20
64
+ eval_key: dl20-doc
65
+ scores:
66
+ - MAP: 0.3458
67
+ nDCG@10: 0.5213
68
+ R@1K: 0.7725
69
+ - name: bm25-doc-segmented-default
70
+ display: BM25 doc segmented (k1=0.9, b=0.4)
71
+ display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
72
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1b)"
73
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
74
+ topics:
75
+ - topic_key: msmarco-doc-dev
76
+ eval_key: msmarco-doc-dev
77
+ scores:
78
+ - MRR@10: 0.2684
79
+ R@1K: 0.9178
80
+ - topic_key: dl19-doc
81
+ eval_key: dl19-doc
82
+ scores:
83
+ - MAP: 0.2449
84
+ nDCG@10: 0.5302
85
+ R@1K: 0.6871
86
+ - topic_key: dl20
87
+ eval_key: dl20-doc
88
+ scores:
89
+ - MAP: 0.3586
90
+ nDCG@10: 0.5281
91
+ R@1K: 0.7755
92
+ - name: bm25-rm3-doc-tuned
93
+ display: BM25+RM3 doc (k1=4.46, b=0.82)
94
+ display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
95
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3
96
+ topics:
97
+ - topic_key: msmarco-doc-dev
98
+ eval_key: msmarco-doc-dev
99
+ scores:
100
+ - MRR@10: 0.2227
101
+ R@1K: 0.9303
102
+ - topic_key: dl19-doc
103
+ eval_key: dl19-doc
104
+ scores:
105
+ - MAP: 0.2638
106
+ nDCG@10: 0.5526
107
+ R@1K: 0.7188
108
+ - topic_key: dl20
109
+ eval_key: dl20-doc
110
+ scores:
111
+ - MAP: 0.3610
112
+ nDCG@10: 0.5195
113
+ R@1K: 0.8180
114
+ - name: bm25-rm3-doc-default
115
+ display: BM25+RM3 doc (k1=0.9, b=0.4)
116
+ display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
117
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1c)"
118
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
119
+ topics:
120
+ - topic_key: msmarco-doc-dev
121
+ eval_key: msmarco-doc-dev
122
+ scores:
123
+ - MRR@10: 0.1618
124
+ R@1K: 0.8783
125
+ - topic_key: dl19-doc
126
+ eval_key: dl19-doc
127
+ scores:
128
+ - MAP: 0.2773
129
+ nDCG@10: 0.5174
130
+ R@1K: 0.7507
131
+ - topic_key: dl20
132
+ eval_key: dl20-doc
133
+ scores:
134
+ - MAP: 0.4015
135
+ nDCG@10: 0.5254
136
+ R@1K: 0.8259
137
+ - name: bm25-rm3-doc-segmented-tuned
138
+ display: BM25+RM3 doc segmented (k1=2.16, b=0.61)
139
+ display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
140
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
141
+ topics:
142
+ - topic_key: msmarco-doc-dev
143
+ eval_key: msmarco-doc-dev
144
+ scores:
145
+ - MRR@10: 0.2448
146
+ R@1K: 0.9359
147
+ - topic_key: dl19-doc
148
+ eval_key: dl19-doc
149
+ scores:
150
+ - MAP: 0.2655
151
+ nDCG@10: 0.5392
152
+ R@1K: 0.7037
153
+ - topic_key: dl20
154
+ eval_key: dl20-doc
155
+ scores:
156
+ - MAP: 0.3471
157
+ nDCG@10: 0.5030
158
+ R@1K: 0.8056
159
+ - name: bm25-rm3-doc-segmented-default
160
+ display: BM25+RM3 doc segmented (k1=0.9, b=0.4)
161
+ display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
162
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1d)"
163
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
164
+ topics:
165
+ - topic_key: msmarco-doc-dev
166
+ eval_key: msmarco-doc-dev
167
+ scores:
168
+ - MRR@10: 0.2413
169
+ R@1K: 0.9351
170
+ - topic_key: dl19-doc
171
+ eval_key: dl19-doc
172
+ scores:
173
+ - MAP: 0.2892
174
+ nDCG@10: 0.5684
175
+ R@1K: 0.7368
176
+ - topic_key: dl20
177
+ eval_key: dl20-doc
178
+ scores:
179
+ - MAP: 0.3792
180
+ nDCG@10: 0.5202
181
+ R@1K: 0.8023
182
+ - name: bm25-rocchio-doc-tuned
183
+ display: BM25+Rocchio doc (k1=4.46, b=0.82)
184
+ display-html: BM25+Rocchio doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
185
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio
186
+ topics:
187
+ - topic_key: msmarco-doc-dev
188
+ eval_key: msmarco-doc-dev
189
+ scores:
190
+ - MRR@10: 0.2242
191
+ R@1K: 0.9314
192
+ - topic_key: dl19-doc
193
+ eval_key: dl19-doc
194
+ scores:
195
+ - MAP: 0.2657
196
+ nDCG@10: 0.5584
197
+ R@1K: 0.7299
198
+ - topic_key: dl20
199
+ eval_key: dl20-doc
200
+ scores:
201
+ - MAP: 0.3628
202
+ nDCG@10: 0.5199
203
+ R@1K: 0.8217
204
+ - name: bm25-rocchio-doc-default
205
+ display: BM25+Rocchio doc (k1=0.9, b=0.4)
206
+ display-html: BM25+Rocchio doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
207
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4
208
+ topics:
209
+ - topic_key: msmarco-doc-dev
210
+ eval_key: msmarco-doc-dev
211
+ scores:
212
+ - MRR@10: 0.1624
213
+ R@1K: 0.8789
214
+ - topic_key: dl19-doc
215
+ eval_key: dl19-doc
216
+ scores:
217
+ - MAP: 0.2811
218
+ nDCG@10: 0.5256
219
+ R@1K: 0.7546
220
+ - topic_key: dl20
221
+ eval_key: dl20-doc
222
+ scores:
223
+ - MAP: 0.4089
224
+ nDCG@10: 0.5192
225
+ R@1K: 0.8273
226
+ - name: bm25-rocchio-doc-segmented-tuned
227
+ display: BM25+Rocchio doc segmented (k1=2.16, b=0.61)
228
+ display-html: BM25+Rocchio doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
229
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --hits 10000 --max-passage-hits 1000 --max-passage
230
+ topics:
231
+ - topic_key: msmarco-doc-dev
232
+ eval_key: msmarco-doc-dev
233
+ scores:
234
+ - MRR@10: 0.2475
235
+ R@1K: 0.9395
236
+ - topic_key: dl19-doc
237
+ eval_key: dl19-doc
238
+ scores:
239
+ - MAP: 0.2672
240
+ nDCG@10: 0.5421
241
+ R@1K: 0.7115
242
+ - topic_key: dl20
243
+ eval_key: dl20-doc
244
+ scores:
245
+ - MAP: 0.3521
246
+ nDCG@10: 0.4997
247
+ R@1K: 0.8042
248
+ - name: bm25-rocchio-doc-segmented-default
249
+ display: BM25+Rocchio doc segmented (k1=0.9, b=0.4)
250
+ display-html: BM25+Rocchio doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
251
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
252
+ topics:
253
+ - topic_key: msmarco-doc-dev
254
+ eval_key: msmarco-doc-dev
255
+ scores:
256
+ - MRR@10: 0.2447
257
+ R@1K: 0.9351
258
+ - topic_key: dl19-doc
259
+ eval_key: dl19-doc
260
+ scores:
261
+ - MAP: 0.2889
262
+ nDCG@10: 0.5570
263
+ R@1K: 0.7423
264
+ - topic_key: dl20
265
+ eval_key: dl20-doc
266
+ scores:
267
+ - MAP: 0.3830
268
+ nDCG@10: 0.5226
269
+ R@1K: 0.8102
270
+ - name: bm25-d2q-t5-doc-tuned
271
+ display: BM25 w/ doc2query-T5 doc (k1=4.68, b=0.87)
272
+ display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=4.68, <i>b</i>=0.87)
273
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25
274
+ topics:
275
+ - topic_key: msmarco-doc-dev
276
+ eval_key: msmarco-doc-dev
277
+ scores:
278
+ - MRR@10: 0.3269
279
+ R@1K: 0.9553
280
+ - topic_key: dl19-doc
281
+ eval_key: dl19-doc
282
+ scores:
283
+ - MAP: 0.2620
284
+ nDCG@10: 0.5972
285
+ R@1K: 0.6867
286
+ - topic_key: dl20
287
+ eval_key: dl20-doc
288
+ scores:
289
+ - MAP: 0.4099
290
+ nDCG@10: 0.5852
291
+ R@1K: 0.8105
292
+ - name: bm25-d2q-t5-doc-default
293
+ display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4)
294
+ display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
295
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2a)"
296
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
297
+ topics:
298
+ - topic_key: msmarco-doc-dev
299
+ eval_key: msmarco-doc-dev
300
+ scores:
301
+ - MRR@10: 0.2880
302
+ R@1K: 0.9259
303
+ - topic_key: dl19-doc
304
+ eval_key: dl19-doc
305
+ scores:
306
+ - MAP: 0.2700
307
+ nDCG@10: 0.5968
308
+ R@1K: 0.7190
309
+ - topic_key: dl20
310
+ eval_key: dl20-doc
311
+ scores:
312
+ - MAP: 0.4230
313
+ nDCG@10: 0.5885
314
+ R@1K: 0.8403
315
+ - name: bm25-d2q-t5-doc-segmented-tuned
316
+ display: BM25 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59)
317
+ display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=2.56, <i>b</i>=0.59)
318
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
319
+ topics:
320
+ - topic_key: msmarco-doc-dev
321
+ eval_key: msmarco-doc-dev
322
+ scores:
323
+ - MRR@10: 0.3209
324
+ R@1K: 0.9530
325
+ - topic_key: dl19-doc
326
+ eval_key: dl19-doc
327
+ scores:
328
+ - MAP: 0.2658
329
+ nDCG@10: 0.6273
330
+ R@1K: 0.6707
331
+ - topic_key: dl20
332
+ eval_key: dl20-doc
333
+ scores:
334
+ - MAP: 0.4047
335
+ nDCG@10: 0.5943
336
+ R@1K: 0.7968
337
+ - name: bm25-d2q-t5-doc-segmented-default
338
+ display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
339
+ display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
340
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2b)"
341
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
342
+ topics:
343
+ - topic_key: msmarco-doc-dev
344
+ eval_key: msmarco-doc-dev
345
+ scores:
346
+ - MRR@10: 0.3179
347
+ R@1K: 0.9490
348
+ - topic_key: dl19-doc
349
+ eval_key: dl19-doc
350
+ scores:
351
+ - MAP: 0.2798
352
+ nDCG@10: 0.6119
353
+ R@1K: 0.7165
354
+ - topic_key: dl20
355
+ eval_key: dl20-doc
356
+ scores:
357
+ - MAP: 0.4150
358
+ nDCG@10: 0.5957
359
+ R@1K: 0.8046
360
+ - name: bm25-rm3-d2q-t5-doc-tuned
361
+ display: BM25+RM3 w/ doc2query-T5 doc (k1=4.68, b=0.87)
362
+ display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=4.68, <i>b</i>=0.87)
363
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
364
+ topics:
365
+ - topic_key: msmarco-doc-dev
366
+ eval_key: msmarco-doc-dev
367
+ scores:
368
+ - MRR@10: 0.2623
369
+ R@1K: 0.9522
370
+ - topic_key: dl19-doc
371
+ eval_key: dl19-doc
372
+ scores:
373
+ - MAP: 0.2813
374
+ nDCG@10: 0.6091
375
+ R@1K: 0.7184
376
+ - topic_key: dl20
377
+ eval_key: dl20-doc
378
+ scores:
379
+ - MAP: 0.4100
380
+ nDCG@10: 0.5745
381
+ R@1K: 0.8238
382
+ - name: bm25-rm3-d2q-t5-doc-default
383
+ display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4)
384
+ display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
385
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2c)"
386
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
387
+ topics:
388
+ - topic_key: msmarco-doc-dev
389
+ eval_key: msmarco-doc-dev
390
+ scores:
391
+ - MRR@10: 0.1834
392
+ R@1K: 0.9126
393
+ - topic_key: dl19-doc
394
+ eval_key: dl19-doc
395
+ scores:
396
+ - MAP: 0.3045
397
+ nDCG@10: 0.5904
398
+ R@1K: 0.7737
399
+ - topic_key: dl20
400
+ eval_key: dl20-doc
401
+ scores:
402
+ - MAP: 0.4230
403
+ nDCG@10: 0.5427
404
+ R@1K: 0.8631
405
+ - name: bm25-rm3-d2q-t5-doc-segmented-tuned
406
+ display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59)
407
+ display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=2.56, <i>b</i>=0.59)
408
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
409
+ topics:
410
+ - topic_key: msmarco-doc-dev
411
+ eval_key: msmarco-doc-dev
412
+ scores:
413
+ - MRR@10: 0.2973
414
+ R@1K: 0.9563
415
+ - topic_key: dl19-doc
416
+ eval_key: dl19-doc
417
+ scores:
418
+ - MAP: 0.2892
419
+ nDCG@10: 0.6247
420
+ R@1K: 0.7069
421
+ - topic_key: dl20
422
+ eval_key: dl20-doc
423
+ scores:
424
+ - MAP: 0.4016
425
+ nDCG@10: 0.5711
426
+ R@1K: 0.8156
427
+ - name: bm25-rm3-d2q-t5-doc-segmented-default
428
+ display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
429
+ display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
430
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2d)"
431
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
432
+ topics:
433
+ - topic_key: msmarco-doc-dev
434
+ eval_key: msmarco-doc-dev
435
+ scores:
436
+ - MRR@10: 0.2803
437
+ R@1K: 0.9551
438
+ - topic_key: dl19-doc
439
+ eval_key: dl19-doc
440
+ scores:
441
+ - MAP: 0.3030
442
+ nDCG@10: 0.6290
443
+ R@1K: 0.7483
444
+ - topic_key: dl20
445
+ eval_key: dl20-doc
446
+ scores:
447
+ - MAP: 0.4271
448
+ nDCG@10: 0.5851
449
+ R@1K: 0.8266
450
+ - name: unicoil-noexp-pytorch
451
+ display: "uniCOIL (noexp): query inference with PyTorch"
452
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
453
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
454
+ topics:
455
+ - topic_key: msmarco-doc-dev
456
+ eval_key: msmarco-doc-dev
457
+ scores:
458
+ - MRR@10: 0.3410
459
+ R@1K: 0.9420
460
+ - topic_key: dl19-doc
461
+ eval_key: dl19-doc
462
+ scores:
463
+ - MAP: 0.2661
464
+ nDCG@10: 0.6347
465
+ R@1K: 0.6385
466
+ - topic_key: dl20
467
+ eval_key: dl20-doc
468
+ scores:
469
+ - MAP: 0.3698
470
+ nDCG@10: 0.5906
471
+ R@1K: 0.7621
472
+ - name: unicoil-noexp
473
+ display: "uniCOIL (noexp): pre-encoded"
474
+ display-html: "uniCOIL (noexp): pre-encoded queries"
475
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3a)"
476
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
477
+ topics:
478
+ - topic_key: msmarco-doc-dev-unicoil-noexp
479
+ eval_key: msmarco-doc-dev
480
+ scores:
481
+ - MRR@10: 0.3409
482
+ R@1K: 0.9420
483
+ - topic_key: dl19-doc-unicoil-noexp
484
+ eval_key: dl19-doc
485
+ scores:
486
+ - MAP: 0.2665
487
+ nDCG@10: 0.6349
488
+ R@1K: 0.6391
489
+ - topic_key: dl20-unicoil-noexp
490
+ eval_key: dl20-doc
491
+ scores:
492
+ - MAP: 0.3698
493
+ nDCG@10: 0.5893
494
+ R@1K: 0.7623
495
+ - name: unicoil-pytorch
496
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
497
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
498
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
499
+ topics:
500
+ - topic_key: msmarco-doc-dev
501
+ eval_key: msmarco-doc-dev
502
+ scores:
503
+ - MRR@10: 0.3532
504
+ R@1K: 0.9546
505
+ - topic_key: dl19-doc
506
+ eval_key: dl19-doc
507
+ scores:
508
+ - MAP: 0.2789
509
+ nDCG@10: 0.6396
510
+ R@1K: 0.6654
511
+ - topic_key: dl20
512
+ eval_key: dl20-doc
513
+ scores:
514
+ - MAP: 0.3881
515
+ nDCG@10: 0.6030
516
+ R@1K: 0.7866
517
+ - name: unicoil
518
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
519
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
520
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3b)"
521
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
522
+ topics:
523
+ - topic_key: msmarco-doc-dev-unicoil
524
+ eval_key: msmarco-doc-dev
525
+ scores:
526
+ - MRR@10: 0.3531
527
+ R@1K: 0.9546
528
+ - topic_key: dl19-doc-unicoil
529
+ eval_key: dl19-doc
530
+ scores:
531
+ - MAP: 0.2789
532
+ nDCG@10: 0.6396
533
+ R@1K: 0.6652
534
+ - topic_key: dl20-unicoil
535
+ eval_key: dl20-doc
536
+ scores:
537
+ - MAP: 0.3882
538
+ nDCG@10: 0.6033
539
+ R@1K: 0.7869
pyserini/2cr/msmarco-v1-passage.yaml ADDED
@@ -0,0 +1,764 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conditions:
2
+ - name: bm25-rocchio-d2q-t5-tuned
3
+ display: BM25+Rocchio w/ doc2query-T5 (k1=2.18, b=0.86)
4
+ display-html: BM25+Rocchio w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
5
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio
6
+ topics:
7
+ - topic_key: msmarco-passage-dev-subset
8
+ eval_key: msmarco-passage-dev-subset
9
+ scores:
10
+ - MRR@10: 0.2395
11
+ R@1K: 0.9535
12
+ - topic_key: dl19-passage
13
+ eval_key: dl19-passage
14
+ scores:
15
+ - MAP: 0.4339
16
+ nDCG@10: 0.6559
17
+ R@1K: 0.8465
18
+ - topic_key: dl20
19
+ eval_key: dl20-passage
20
+ scores:
21
+ - MAP: 0.4376
22
+ nDCG@10: 0.6224
23
+ R@1K: 0.8641
24
+ - name: bm25-rocchio-d2q-t5-default
25
+ display: BM25+Rocchio w/ doc2query-T5 (k1=0.9, b=0.4)
26
+ display-html: BM25+Rocchio w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
27
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4
28
+ topics:
29
+ - topic_key: msmarco-passage-dev-subset
30
+ eval_key: msmarco-passage-dev-subset
31
+ scores:
32
+ - MRR@10: 0.2158
33
+ R@1K: 0.9467
34
+ - topic_key: dl19-passage
35
+ eval_key: dl19-passage
36
+ scores:
37
+ - MAP: 0.4469
38
+ nDCG@10: 0.6538
39
+ R@1K: 0.8855
40
+ - topic_key: dl20
41
+ eval_key: dl20-passage
42
+ scores:
43
+ - MAP: 0.4246
44
+ nDCG@10: 0.6102
45
+ R@1K: 0.8675
46
+ - name: bm25-rocchio-default
47
+ display: BM25+Rocchio (k1=0.9, b=0.4)
48
+ display-html: BM25+Rocchio (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
49
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rocchio
50
+ topics:
51
+ - topic_key: msmarco-passage-dev-subset
52
+ eval_key: msmarco-passage-dev-subset
53
+ scores:
54
+ - MRR@10: 0.1595
55
+ R@1K: 0.8620
56
+ - topic_key: dl19-passage
57
+ eval_key: dl19-passage
58
+ scores:
59
+ - MAP: 0.3474
60
+ nDCG@10: 0.5275
61
+ R@1K: 0.8007
62
+ - topic_key: dl20
63
+ eval_key: dl20-passage
64
+ scores:
65
+ - MAP: 0.3115
66
+ nDCG@10: 0.4910
67
+ R@1K: 0.8156
68
+ - name: bm25-rocchio-tuned
69
+ display: BM25+Rocchio (k1=0.82, b=0.68)
70
+ display-html: BM25+Rocchio (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
71
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rocchio
72
+ topics:
73
+ - topic_key: msmarco-passage-dev-subset
74
+ eval_key: msmarco-passage-dev-subset
75
+ scores:
76
+ - MRR@10: 0.1684
77
+ R@1K: 0.8726
78
+ - topic_key: dl19-passage
79
+ eval_key: dl19-passage
80
+ scores:
81
+ - MAP: 0.3396
82
+ nDCG@10: 0.5275
83
+ R@1K: 0.7948
84
+ - topic_key: dl20
85
+ eval_key: dl20-passage
86
+ scores:
87
+ - MAP: 0.3120
88
+ nDCG@10: 0.4908
89
+ R@1K: 0.8327
90
+ - name: distilbert-kd-tasb-pytorch
91
+ display: "DistilBERT KD TASB: query inference with PyTorch"
92
+ display-html: "DistilBERT KD TASB: query inference with PyTorch"
93
+ display-row: "[5]"
94
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco --output $output
95
+ topics:
96
+ - topic_key: msmarco-passage-dev-subset
97
+ eval_key: msmarco-passage-dev-subset
98
+ scores:
99
+ - MRR@10: 0.3444
100
+ R@1K: 0.9771
101
+ - topic_key: dl19-passage
102
+ eval_key: dl19-passage
103
+ scores:
104
+ - MAP: 0.4590
105
+ nDCG@10: 0.7210
106
+ R@1K: 0.8406
107
+ - topic_key: dl20
108
+ eval_key: dl20-passage
109
+ scores:
110
+ - MAP: 0.4698
111
+ nDCG@10: 0.6854
112
+ R@1K: 0.8727
113
+ - name: distilbert-kd-tasb
114
+ display: "DistilBERT KD TASB: pre-encoded"
115
+ display-html: "DistilBERT KD TASB: pre-encoded queries"
116
+ display-row: "[5]"
117
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoded-queries distilbert_tas_b-$topics --output $output
118
+ topics:
119
+ - topic_key: msmarco-passage-dev-subset
120
+ eval_key: msmarco-passage-dev-subset
121
+ scores:
122
+ - MRR@10: 0.3444
123
+ R@1K: 0.9771
124
+ - topic_key: dl19-passage
125
+ eval_key: dl19-passage
126
+ scores:
127
+ - MAP: 0.4590
128
+ nDCG@10: 0.7210
129
+ R@1K: 0.8406
130
+ - topic_key: dl20
131
+ eval_key: dl20-passage
132
+ scores:
133
+ - MAP: 0.4698
134
+ nDCG@10: 0.6854
135
+ R@1K: 0.8727
136
+ - name: distilbert-kd-pytorch
137
+ display: "DistilBERT KD: query inference with PyTorch"
138
+ display-html: "DistilBERT KD: query inference with PyTorch"
139
+ display-row: "[4]"
140
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco --output $output
141
+ topics:
142
+ - topic_key: msmarco-passage-dev-subset
143
+ eval_key: msmarco-passage-dev-subset
144
+ scores:
145
+ - MRR@10: 0.3251
146
+ R@1K: 0.9553
147
+ - topic_key: dl19-passage
148
+ eval_key: dl19-passage
149
+ scores:
150
+ - MAP: 0.4053
151
+ nDCG@10: 0.6994
152
+ R@1K: 0.7653
153
+ - topic_key: dl20
154
+ eval_key: dl20-passage
155
+ scores:
156
+ - MAP: 0.4159
157
+ nDCG@10: 0.6447
158
+ R@1K: 0.7953
159
+ - name: distilbert-kd
160
+ display: "DistilBERT KD: pre-encoded"
161
+ display-html: "DistilBERT KD: pre-encoded queries"
162
+ display-row: "[4]"
163
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoded-queries distilbert_kd-$topics --output $output
164
+ topics:
165
+ - topic_key: msmarco-passage-dev-subset
166
+ eval_key: msmarco-passage-dev-subset
167
+ scores:
168
+ - MRR@10: 0.3251
169
+ R@1K: 0.9553
170
+ - topic_key: dl19-passage
171
+ eval_key: dl19-passage
172
+ scores:
173
+ - MAP: 0.4053
174
+ nDCG@10: 0.6994
175
+ R@1K: 0.7653
176
+ - topic_key: dl20
177
+ eval_key: dl20-passage
178
+ scores:
179
+ - MAP: 0.4159
180
+ nDCG@10: 0.6447
181
+ R@1K: 0.7953
182
+ - name: ance-pytorch
183
+ display: "ANCE: query inference with PyTorch"
184
+ display-html: "ANCE: query inference with PyTorch"
185
+ display-row: "[3]"
186
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoder castorini/ance-msmarco-passage --output $output
187
+ topics:
188
+ - topic_key: msmarco-passage-dev-subset
189
+ eval_key: msmarco-passage-dev-subset
190
+ scores:
191
+ - MRR@10: 0.3302
192
+ R@1K: 0.9587
193
+ - topic_key: dl19-passage
194
+ eval_key: dl19-passage
195
+ scores:
196
+ - MAP: 0.3710
197
+ nDCG@10: 0.6452
198
+ R@1K: 0.7554
199
+ - topic_key: dl20
200
+ eval_key: dl20-passage
201
+ scores:
202
+ - MAP: 0.4076
203
+ nDCG@10: 0.6458
204
+ R@1K: 0.7764
205
+ - name: ance
206
+ display: "ANCE: pre-encoded"
207
+ display-html: "ANCE: pre-encoded queries"
208
+ display-row: "[3]"
209
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoded-queries ance-$topics --output $output
210
+ topics:
211
+ - topic_key: msmarco-passage-dev-subset
212
+ eval_key: msmarco-passage-dev-subset
213
+ scores:
214
+ - MRR@10: 0.3302
215
+ R@1K: 0.9584
216
+ - topic_key: dl19-passage
217
+ eval_key: dl19-passage
218
+ scores:
219
+ - MAP: 0.3710
220
+ nDCG@10: 0.6452
221
+ R@1K: 0.7554
222
+ - topic_key: dl20
223
+ eval_key: dl20-passage
224
+ scores:
225
+ - MAP: 0.4076
226
+ nDCG@10: 0.6458
227
+ R@1K: 0.7764
228
+ - name: bm25-tuned
229
+ display: BM25 (k1=0.82, b=0.68)
230
+ display-html: BM25 (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
231
+ command: python -m pyserini.search.lucene --topics $topics --index msmarco-v1-passage --output $output --bm25
232
+ topics:
233
+ - topic_key: msmarco-passage-dev-subset
234
+ eval_key: msmarco-passage-dev-subset
235
+ scores:
236
+ - MRR@10: 0.1875
237
+ R@1K: 0.8573
238
+ - topic_key: dl19-passage
239
+ eval_key: dl19-passage
240
+ scores:
241
+ - MAP: 0.2903
242
+ nDCG@10: 0.4973
243
+ R@1K: 0.7450
244
+ - topic_key: dl20
245
+ eval_key: dl20-passage
246
+ scores:
247
+ - MAP: 0.2876
248
+ nDCG@10: 0.4876
249
+ R@1K: 0.8031
250
+ - name: bm25-rm3-tuned
251
+ display: BM25+RM3 (k1=0.82, b=0.68)
252
+ display-html: BM25+RM3 (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
253
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rm3
254
+ topics:
255
+ - topic_key: msmarco-passage-dev-subset
256
+ eval_key: msmarco-passage-dev-subset
257
+ scores:
258
+ - MRR@10: 0.1646
259
+ R@1K: 0.8704
260
+ - topic_key: dl19-passage
261
+ eval_key: dl19-passage
262
+ scores:
263
+ - MAP: 0.3339
264
+ nDCG@10: 0.5147
265
+ R@1K: 0.7950
266
+ - topic_key: dl20
267
+ eval_key: dl20-passage
268
+ scores:
269
+ - MAP: 0.3017
270
+ nDCG@10: 0.4924
271
+ R@1K: 0.8292
272
+ - name: bm25-default
273
+ display: BM25 (k1=0.9, b=0.4)
274
+ display-html: BM25 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
275
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1a)"
276
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
277
+ topics:
278
+ - topic_key: msmarco-passage-dev-subset
279
+ eval_key: msmarco-passage-dev-subset
280
+ scores:
281
+ - MRR@10: 0.1840
282
+ R@1K: 0.8526
283
+ - topic_key: dl19-passage
284
+ eval_key: dl19-passage
285
+ scores:
286
+ - MAP: 0.3013
287
+ nDCG@10: 0.5058
288
+ R@1K: 0.7501
289
+ - topic_key: dl20
290
+ eval_key: dl20-passage
291
+ scores:
292
+ - MAP: 0.2856
293
+ nDCG@10: 0.4796
294
+ R@1K: 0.7863
295
+ - name: bm25-rm3-default
296
+ display: BM25+RM3 (k1=0.9, b=0.4)
297
+ display-html: BM25+RM3 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
298
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1b)"
299
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rm3
300
+ topics:
301
+ - topic_key: msmarco-passage-dev-subset
302
+ eval_key: msmarco-passage-dev-subset
303
+ scores:
304
+ - MRR@10: 0.1566
305
+ R@1K: 0.8606
306
+ - topic_key: dl19-passage
307
+ eval_key: dl19-passage
308
+ scores:
309
+ - MAP: 0.3416
310
+ nDCG@10: 0.5216
311
+ R@1K: 0.8136
312
+ - topic_key: dl20
313
+ eval_key: dl20-passage
314
+ scores:
315
+ - MAP: 0.3006
316
+ nDCG@10: 0.4896
317
+ R@1K: 0.8236
318
+ - name: bm25-d2q-t5-tuned
319
+ display: BM25 w/ doc2query-T5 (k1=2.18, b=0.86)
320
+ display-html: BM25 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
321
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25
322
+ topics:
323
+ - topic_key: msmarco-passage-dev-subset
324
+ eval_key: msmarco-passage-dev-subset
325
+ scores:
326
+ - MRR@10: 0.2816
327
+ R@1K: 0.9506
328
+ - topic_key: dl19-passage
329
+ eval_key: dl19-passage
330
+ scores:
331
+ - MAP: 0.4046
332
+ nDCG@10: 0.6336
333
+ R@1K: 0.8134
334
+ - topic_key: dl20
335
+ eval_key: dl20-passage
336
+ scores:
337
+ - MAP: 0.4171
338
+ nDCG@10: 0.6265
339
+ R@1K: 0.8393
340
+ - name: bm25-d2q-t5-default
341
+ display: BM25 w/ doc2query-T5 (k1=0.9, b=0.4)
342
+ display-html: BM25 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
343
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2a)"
344
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
345
+ topics:
346
+ - topic_key: msmarco-passage-dev-subset
347
+ eval_key: msmarco-passage-dev-subset
348
+ scores:
349
+ - MRR@10: 0.2723
350
+ R@1K: 0.9470
351
+ - topic_key: dl19-passage
352
+ eval_key: dl19-passage
353
+ scores:
354
+ - MAP: 0.4034
355
+ nDCG@10: 0.6417
356
+ R@1K: 0.8310
357
+ - topic_key: dl20
358
+ eval_key: dl20-passage
359
+ scores:
360
+ - MAP: 0.4074
361
+ nDCG@10: 0.6187
362
+ R@1K: 0.8452
363
+ - name: bm25-rm3-d2q-t5-tuned
364
+ display: BM25+RM3 w/ doc2query-T5 (k1=2.18, b=0.86)
365
+ display-html: BM25+RM3 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
366
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
367
+ topics:
368
+ - topic_key: msmarco-passage-dev-subset
369
+ eval_key: msmarco-passage-dev-subset
370
+ scores:
371
+ - MRR@10: 0.2382
372
+ R@1K: 0.9528
373
+ - topic_key: dl19-passage
374
+ eval_key: dl19-passage
375
+ scores:
376
+ - MAP: 0.4377
377
+ nDCG@10: 0.6537
378
+ R@1K: 0.8443
379
+ - topic_key: dl20
380
+ eval_key: dl20-passage
381
+ scores:
382
+ - MAP: 0.4348
383
+ nDCG@10: 0.6235
384
+ R@1K: 0.8605
385
+ - name: bm25-rm3-d2q-t5-default
386
+ display: BM25+RM3 w/ doc2query-T5 (k1=0.9, b=0.4)
387
+ display-html: BM25+RM3 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
388
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2b)"
389
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
390
+ topics:
391
+ - topic_key: msmarco-passage-dev-subset
392
+ eval_key: msmarco-passage-dev-subset
393
+ scores:
394
+ - MRR@10: 0.2139
395
+ R@1K: 0.9460
396
+ - topic_key: dl19-passage
397
+ eval_key: dl19-passage
398
+ scores:
399
+ - MAP: 0.4483
400
+ nDCG@10: 0.6586
401
+ R@1K: 0.8863
402
+ - topic_key: dl20
403
+ eval_key: dl20-passage
404
+ scores:
405
+ - MAP: 0.4286
406
+ nDCG@10: 0.6131
407
+ R@1K: 0.8700
408
+ - name: unicoil-pytorch
409
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
410
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
411
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact
412
+ topics:
413
+ - topic_key: msmarco-passage-dev-subset
414
+ eval_key: msmarco-passage-dev-subset
415
+ scores:
416
+ - MRR@10: 0.3509
417
+ R@1K: 0.9581
418
+ - topic_key: dl19-passage
419
+ eval_key: dl19-passage
420
+ scores:
421
+ - MAP: 0.4617
422
+ nDCG@10: 0.7027
423
+ R@1K: 0.8291
424
+ - topic_key: dl20
425
+ eval_key: dl20-passage
426
+ scores:
427
+ - MAP: 0.4429
428
+ nDCG@10: 0.6745
429
+ R@1K: 0.8433
430
+ - name: unicoil-onnx
431
+ display: "uniCOIL (w/ doc2query-T5): query inference with ONNX"
432
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with ONNX"
433
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact
434
+ topics:
435
+ - topic_key: msmarco-passage-dev-subset
436
+ eval_key: msmarco-passage-dev-subset
437
+ scores:
438
+ - MRR@10: 0.3509
439
+ R@1K: 0.9581
440
+ - topic_key: dl19-passage
441
+ eval_key: dl19-passage
442
+ scores:
443
+ - MAP: 0.4617
444
+ nDCG@10: 0.7027
445
+ R@1K: 0.8291
446
+ - topic_key: dl20
447
+ eval_key: dl20-passage
448
+ scores:
449
+ - MAP: 0.4429
450
+ nDCG@10: 0.6745
451
+ R@1K: 0.8433
452
+ - name: unicoil
453
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
454
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
455
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3b)"
456
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --output $output --hits 1000 --impact
457
+ topics:
458
+ - topic_key: msmarco-passage-dev-subset-unicoil
459
+ eval_key: msmarco-passage-dev-subset
460
+ scores:
461
+ - MRR@10: 0.3516
462
+ R@1K: 0.9582
463
+ - topic_key: dl19-passage-unicoil
464
+ eval_key: dl19-passage
465
+ scores:
466
+ - MAP: 0.4612
467
+ nDCG@10: 0.7024
468
+ R@1K: 0.8292
469
+ - topic_key: dl20-unicoil
470
+ eval_key: dl20-passage
471
+ scores:
472
+ - MAP: 0.4430
473
+ nDCG@10: 0.6745
474
+ R@1K: 0.8430
475
+ - name: unicoil-noexp-pytorch
476
+ display: "uniCOIL (noexp): query inference with PyTorch"
477
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
478
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact
479
+ topics:
480
+ - topic_key: msmarco-passage-dev-subset
481
+ eval_key: msmarco-passage-dev-subset
482
+ scores:
483
+ - MRR@10: 0.3153
484
+ R@1K: 0.9239
485
+ - topic_key: dl19-passage
486
+ eval_key: dl19-passage
487
+ scores:
488
+ - MAP: 0.4033
489
+ nDCG@10: 0.6434
490
+ R@1K: 0.7752
491
+ - topic_key: dl20
492
+ eval_key: dl20-passage
493
+ scores:
494
+ - MAP: 0.4022
495
+ nDCG@10: 0.6524
496
+ R@1K: 0.7861
497
+ - name: unicoil-noexp-onnx
498
+ display: "uniCOIL (noexp): query inference with ONNX"
499
+ display-html: "uniCOIL (noexp): query inference with ONNX"
500
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact
501
+ topics:
502
+ - topic_key: msmarco-passage-dev-subset
503
+ eval_key: msmarco-passage-dev-subset
504
+ scores:
505
+ - MRR@10: 0.3119
506
+ R@1K: 0.9239
507
+ - topic_key: dl19-passage
508
+ eval_key: dl19-passage
509
+ scores:
510
+ - MAP: 0.4061
511
+ nDCG@10: 0.6531
512
+ R@1K: 0.7809
513
+ - topic_key: dl20
514
+ eval_key: dl20-passage
515
+ scores:
516
+ - MAP: 0.3909
517
+ nDCG@10: 0.6388
518
+ R@1K: 0.7915
519
+ - name: unicoil-noexp
520
+ display: "uniCOIL (noexp): pre-encoded"
521
+ display-html: "uniCOIL (noexp): pre-encoded queries"
522
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3a)"
523
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --output $output --hits 1000 --impact
524
+ topics:
525
+ - topic_key: msmarco-passage-dev-subset-unicoil-noexp
526
+ eval_key: msmarco-passage-dev-subset
527
+ scores:
528
+ - MRR@10: 0.3153
529
+ R@1K: 0.9239
530
+ - topic_key: dl19-passage-unicoil-noexp
531
+ eval_key: dl19-passage
532
+ scores:
533
+ - MAP: 0.4033
534
+ nDCG@10: 0.6433
535
+ R@1K: 0.7752
536
+ - topic_key: dl20-unicoil-noexp
537
+ eval_key: dl20-passage
538
+ scores:
539
+ - MAP: 0.4021
540
+ nDCG@10: 0.6523
541
+ R@1K: 0.7861
542
+ - name: splade-pp-ed-onnx
543
+ display: "SPLADE++ EnsembleDistil: query inference with ONNX"
544
+ display-html: "SPLADE++ EnsembleDistil: query inference with ONNX"
545
+ display-row: "[2]"
546
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-ed --topics $topics --onnx-encoder SpladePlusPlusEnsembleDistil --output $output --hits 1000 --impact
547
+ topics:
548
+ - topic_key: msmarco-passage-dev-subset
549
+ eval_key: msmarco-passage-dev-subset
550
+ scores:
551
+ - MRR@10: 0.3830
552
+ R@1K: 0.9831
553
+ - topic_key: dl19-passage
554
+ eval_key: dl19-passage
555
+ scores:
556
+ - MAP: 0.5054
557
+ nDCG@10: 0.7320
558
+ R@1K: 0.8724
559
+ - topic_key: dl20
560
+ eval_key: dl20-passage
561
+ scores:
562
+ - MAP: 0.5002
563
+ nDCG@10: 0.7198
564
+ R@1K: 0.8995
565
+ - name: splade-pp-sd-onnx
566
+ display: "SPLADE++ SelfDistil: query inference with ONNX"
567
+ display-html: "SPLADE++ SelfDistil: query inference with ONNX"
568
+ display-row: "[2]"
569
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-sd --topics $topics --onnx-encoder SpladePlusPlusSelfDistil --output $output --hits 1000 --impact
570
+ topics:
571
+ - topic_key: msmarco-passage-dev-subset
572
+ eval_key: msmarco-passage-dev-subset
573
+ scores:
574
+ - MRR@10: 0.3778
575
+ R@1K: 0.9846
576
+ - topic_key: dl19-passage
577
+ eval_key: dl19-passage
578
+ scores:
579
+ - MAP: 0.4997
580
+ nDCG@10: 0.7356
581
+ R@1K: 0.8758
582
+ - topic_key: dl20
583
+ eval_key: dl20-passage
584
+ scores:
585
+ - MAP: 0.5140
586
+ nDCG@10: 0.7285
587
+ R@1K: 0.9023
588
+ - name: tct_colbert-v2-hnp-pytorch
589
+ display: "TCT_ColBERT-V2-HN+: query inference with PyTorch"
590
+ display-html: "TCT_ColBERT-V2-HN+: query inference with PyTorch"
591
+ display-row: "[6]"
592
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoder castorini/tct_colbert-v2-hnp-msmarco --output $output
593
+ topics:
594
+ - topic_key: msmarco-passage-dev-subset
595
+ eval_key: msmarco-passage-dev-subset
596
+ scores:
597
+ - MRR@10: 0.3584
598
+ R@1K: 0.9695
599
+ - topic_key: dl19-passage
600
+ eval_key: dl19-passage
601
+ scores:
602
+ - MAP: 0.4469
603
+ nDCG@10: 0.7204
604
+ R@1K: 0.8261
605
+ - topic_key: dl20
606
+ eval_key: dl20-passage
607
+ scores:
608
+ - MAP: 0.4754
609
+ nDCG@10: 0.6882
610
+ R@1K: 0.8429
611
+ - name: tct_colbert-v2-hnp
612
+ display: "TCT_ColBERT-V2-HN+: pre-encoded"
613
+ display-html: "TCT_ColBERT-V2-HN+: pre-encoded queries"
614
+ display-row: "[6]"
615
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoded-queries tct_colbert-v2-hnp-$topics --output $output
616
+ topics:
617
+ - topic_key: msmarco-passage-dev-subset
618
+ eval_key: msmarco-passage-dev-subset
619
+ scores:
620
+ - MRR@10: 0.3584
621
+ R@1K: 0.9695
622
+ - topic_key: dl19-passage
623
+ eval_key: dl19-passage
624
+ scores:
625
+ - MAP: 0.4469
626
+ nDCG@10: 0.7204
627
+ R@1K: 0.8261
628
+ - topic_key: dl20
629
+ eval_key: dl20-passage
630
+ scores:
631
+ - MAP: 0.4754
632
+ nDCG@10: 0.6882
633
+ R@1K: 0.8429
634
+ - name: slimr
635
+ display: "SLIM: query inference with PyTorch"
636
+ display-html: "SLIM: query inference with PyTorch"
637
+ display-row: "[7]"
638
+ command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr --topics $topics --encoder castorini/slimr-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr --output $output --output-format msmarco --hits 1000 --impact --min-idf 3
639
+ topics:
640
+ - topic_key: msmarco-passage-dev-subset
641
+ eval_key: msmarco-passage-dev-subset
642
+ scores:
643
+ - MRR@10: 0.3581
644
+ R@1K: 0.9620
645
+ - topic_key: dl19-passage
646
+ eval_key: dl19-passage
647
+ scores:
648
+ - MAP: 0.4509
649
+ nDCG@10: 0.7010
650
+ R@1K: 0.8241
651
+ - topic_key: dl20
652
+ eval_key: dl20-passage
653
+ scores:
654
+ - MAP: 0.4419
655
+ nDCG@10: 0.6403
656
+ R@1K: 0.8543
657
+ - name: slimr-pp
658
+ display: "SLIM++: query inference with PyTorch"
659
+ display-html: "SLIM++: query inference with PyTorch"
660
+ display-row: "[7]"
661
+ command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr-pp --topics $topics --encoder castorini/slimr-pp-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr-pp --output $output --output-format msmarco --hits 1000 --impact --min-idf 3
662
+ topics:
663
+ - topic_key: msmarco-passage-dev-subset
664
+ eval_key: msmarco-passage-dev-subset
665
+ scores:
666
+ - MRR@10: 0.4032
667
+ R@1K: 0.9680
668
+ - topic_key: dl19-passage
669
+ eval_key: dl19-passage
670
+ scores:
671
+ - MAP: 0.4687
672
+ nDCG@10: 0.7140
673
+ R@1K: 0.8415
674
+ - topic_key: dl20
675
+ eval_key: dl20-passage
676
+ scores:
677
+ - MAP: 0.4906
678
+ nDCG@10: 0.7021
679
+ R@1K: 0.8551
680
+ - name: aggretriever-distilbert-pytorch
681
+ display: "Aggretriever-DistilBERT: query inference with PyTorch"
682
+ display-html: "Aggretriever-DistilBERT: query inference with PyTorch"
683
+ display-row: "[8]"
684
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-distilbert --topics $topics --encoder castorini/aggretriever-distilbert --output $output
685
+ topics:
686
+ - topic_key: msmarco-passage-dev-subset
687
+ eval_key: msmarco-passage-dev-subset
688
+ scores:
689
+ - MRR@10: 0.3412
690
+ R@1K: 0.9604
691
+ - topic_key: dl19-passage
692
+ eval_key: dl19-passage
693
+ scores:
694
+ - MAP: 0.4301
695
+ nDCG@10: 0.6816
696
+ R@1K: 0.8023
697
+ - topic_key: dl20
698
+ eval_key: dl20-passage
699
+ scores:
700
+ - MAP: 0.4329
701
+ nDCG@10: 0.6726
702
+ R@1K: 0.8351
703
+ - name: aggretriever-cocondenser-pytorch
704
+ display: "Aggretriever-coCondenser: query inference with PyTorch"
705
+ display-html: "Aggretriever-coCondenser: query inference with PyTorch"
706
+ display-row: "[8]"
707
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-cocondenser --topics $topics --encoder castorini/aggretriever-cocondenser --output $output
708
+ topics:
709
+ - topic_key: msmarco-passage-dev-subset
710
+ eval_key: msmarco-passage-dev-subset
711
+ scores:
712
+ - MRR@10: 0.3619
713
+ R@1K: 0.9735
714
+ - topic_key: dl19-passage
715
+ eval_key: dl19-passage
716
+ scores:
717
+ - MAP: 0.4350
718
+ nDCG@10: 0.6837
719
+ R@1K: 0.8078
720
+ - topic_key: dl20
721
+ eval_key: dl20-passage
722
+ scores:
723
+ - MAP: 0.4710
724
+ nDCG@10: 0.6972
725
+ R@1K: 0.8555
726
+ - name: openai-ada2
727
+ display: "OpenAI ada2: pre-encoded queries"
728
+ display-html: "OpenAI ada2: pre-encoded queries"
729
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics --output $output
730
+ topics:
731
+ - topic_key: msmarco-passage-dev-subset
732
+ eval_key: msmarco-passage-dev-subset
733
+ scores:
734
+ - MRR@10: 0.3435
735
+ R@1K: 0.9858
736
+ - topic_key: dl19-passage
737
+ eval_key: dl19-passage
738
+ scores:
739
+ - MAP: 0.4788
740
+ nDCG@10: 0.7035
741
+ R@1K: 0.8629
742
+ - topic_key: dl20
743
+ eval_key: dl20-passage
744
+ scores:
745
+ - MAP: 0.4771
746
+ nDCG@10: 0.6759
747
+ R@1K: 0.8705
748
+ - name: openai-ada2-hyde
749
+ display: "HyDE-OpenAI ada2: pre-encoded queries"
750
+ display-html: "HyDE-OpenAI ada2: pre-encoded queries"
751
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics-hyde --output $output
752
+ topics:
753
+ - topic_key: dl19-passage
754
+ eval_key: dl19-passage
755
+ scores:
756
+ - MAP: 0.5125
757
+ nDCG@10: 0.7163
758
+ R@1K: 0.9002
759
+ - topic_key: dl20
760
+ eval_key: dl20-passage
761
+ scores:
762
+ - MAP: 0.4938
763
+ nDCG@10: 0.6666
764
+ R@1K: 0.8919
pyserini/2cr/msmarco-v2-doc.yaml ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conditions:
2
+ - name: bm25-doc-default
3
+ display: BM25 doc (k1=0.9, b=0.4)
4
+ display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
5
+ display-row: (1a)
6
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25
7
+ topics:
8
+ - topic_key: msmarco-v2-doc-dev
9
+ eval_key: msmarco-v2-doc-dev
10
+ scores:
11
+ - MRR@100: 0.1572
12
+ R@1K: 0.8054
13
+ - topic_key: msmarco-v2-doc-dev2
14
+ eval_key: msmarco-v2-doc-dev2
15
+ scores:
16
+ - MRR@100: 0.1659
17
+ R@1K: 0.8029
18
+ - topic_key: dl21
19
+ eval_key: dl21-doc
20
+ scores:
21
+ - MAP@100: 0.2126
22
+ nDCG@10: 0.5116
23
+ MRR@100: 0.8367
24
+ R@100: 0.3195
25
+ R@1K: 0.6739
26
+ - name: bm25-doc-segmented-default
27
+ display: BM25 doc segmented (k1=0.9, b=0.4)
28
+ display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
29
+ display-row: (1b)
30
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
31
+ topics:
32
+ - topic_key: msmarco-v2-doc-dev
33
+ eval_key: msmarco-v2-doc-dev
34
+ scores:
35
+ - MRR@100: 0.1896
36
+ R@1K: 0.8542
37
+ - topic_key: msmarco-v2-doc-dev2
38
+ eval_key: msmarco-v2-doc-dev2
39
+ scores:
40
+ - MRR@100: 0.1930
41
+ R@1K: 0.8549
42
+ - topic_key: dl21
43
+ eval_key: dl21-doc
44
+ scores:
45
+ - MAP@100: 0.2436
46
+ nDCG@10: 0.5776
47
+ MRR@100: 0.8937
48
+ R@100: 0.3478
49
+ R@1K: 0.6930
50
+ - name: bm25-rm3-doc-default
51
+ display: BM25+RM3 doc (k1=0.9, b=0.4)
52
+ display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
53
+ display-row: (1c)
54
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25 --rm3
55
+ topics:
56
+ - topic_key: msmarco-v2-doc-dev
57
+ eval_key: msmarco-v2-doc-dev
58
+ scores:
59
+ - MRR@100: 0.0974
60
+ R@1K: 0.7699
61
+ - topic_key: msmarco-v2-doc-dev2
62
+ eval_key: msmarco-v2-doc-dev2
63
+ scores:
64
+ - MRR@100: 0.1033
65
+ R@1K: 0.7736
66
+ - topic_key: dl21
67
+ eval_key: dl21-doc
68
+ scores:
69
+ - MAP@100: 0.2452
70
+ nDCG@10: 0.5304
71
+ MRR@100: 0.7914
72
+ R@100: 0.3376
73
+ R@1K: 0.7341
74
+ - name: bm25-rm3-doc-segmented-default
75
+ display: BM25+RM3 doc segmented (k1=0.9, b=0.4)
76
+ display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
77
+ display-row: (1d)
78
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
79
+ topics:
80
+ - topic_key: msmarco-v2-doc-dev
81
+ eval_key: msmarco-v2-doc-dev
82
+ scores:
83
+ - MRR@100: 0.1660
84
+ R@1K: 0.8608
85
+ - topic_key: msmarco-v2-doc-dev2
86
+ eval_key: msmarco-v2-doc-dev2
87
+ scores:
88
+ - MRR@100: 0.1702
89
+ R@1K: 0.8639
90
+ - topic_key: dl21
91
+ eval_key: dl21-doc
92
+ scores:
93
+ - MAP@100: 0.2936
94
+ nDCG@10: 0.6189
95
+ MRR@100: 0.9076
96
+ R@100: 0.3890
97
+ R@1K: 0.7678
98
+ - name: bm25-d2q-t5-doc-default
99
+ display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4)
100
+ display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
101
+ display-row: (2a)
102
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5 --topics $topics --output $output --bm25
103
+ topics:
104
+ - topic_key: msmarco-v2-doc-dev
105
+ eval_key: msmarco-v2-doc-dev
106
+ scores:
107
+ - MRR@100: 0.2011
108
+ R@1K: 0.8614
109
+ - topic_key: msmarco-v2-doc-dev2
110
+ eval_key: msmarco-v2-doc-dev2
111
+ scores:
112
+ - MRR@100: 0.2012
113
+ R@1K: 0.8568
114
+ - topic_key: dl21
115
+ eval_key: dl21-doc
116
+ scores:
117
+ - MAP@100: 0.2387
118
+ nDCG@10: 0.5792
119
+ MRR@100: 0.8866
120
+ R@100: 0.3443
121
+ R@1K: 0.7066
122
+ - name: bm25-d2q-t5-doc-segmented-default
123
+ display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
124
+ display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
125
+ display-row: (2b)
126
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
127
+ topics:
128
+ - topic_key: msmarco-v2-doc-dev
129
+ eval_key: msmarco-v2-doc-dev
130
+ scores:
131
+ - MRR@100: 0.2226
132
+ R@1K: 0.8982
133
+ - topic_key: msmarco-v2-doc-dev2
134
+ eval_key: msmarco-v2-doc-dev2
135
+ scores:
136
+ - MRR@100: 0.2234
137
+ R@1K: 0.8952
138
+ - topic_key: dl21
139
+ eval_key: dl21-doc
140
+ scores:
141
+ - MAP@100: 0.2683
142
+ nDCG@10: 0.6289
143
+ MRR@100: 0.9454
144
+ R@100: 0.3656
145
+ R@1K: 0.7202
146
+ - name: bm25-rm3-d2q-t5-doc-default
147
+ display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4)
148
+ display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
149
+ display-row: (2c)
150
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
151
+ topics:
152
+ - topic_key: msmarco-v2-doc-dev
153
+ eval_key: msmarco-v2-doc-dev
154
+ scores:
155
+ - MRR@100: 0.1141
156
+ R@1K: 0.8191
157
+ - topic_key: msmarco-v2-doc-dev2
158
+ eval_key: msmarco-v2-doc-dev2
159
+ scores:
160
+ - MRR@100: 0.1170
161
+ R@1K: 0.8247
162
+ - topic_key: dl21
163
+ eval_key: dl21-doc
164
+ scores:
165
+ - MAP@100: 0.2611
166
+ nDCG@10: 0.5375
167
+ MRR@100: 0.8255
168
+ R@100: 0.3580
169
+ R@1K: 0.7574
170
+ - name: bm25-rm3-d2q-t5-doc-segmented-default
171
+ display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
172
+ display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
173
+ display-row: (2d)
174
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
175
+ topics:
176
+ - topic_key: msmarco-v2-doc-dev
177
+ eval_key: msmarco-v2-doc-dev
178
+ scores:
179
+ - MRR@100: 0.1975
180
+ R@1K: 0.9002
181
+ - topic_key: msmarco-v2-doc-dev2
182
+ eval_key: msmarco-v2-doc-dev2
183
+ scores:
184
+ - MRR@100: 0.1978
185
+ R@1K: 0.8972
186
+ - topic_key: dl21
187
+ eval_key: dl21-doc
188
+ scores:
189
+ - MAP@100: 0.3191
190
+ nDCG@10: 0.6559
191
+ MRR@100: 0.8989
192
+ R@100: 0.4131
193
+ R@1K: 0.7948
194
+ - name: unicoil-noexp
195
+ display: "uniCOIL (noexp): pre-encoded"
196
+ display-html: "uniCOIL (noexp): pre-encoded queries"
197
+ display-row: (3a)
198
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
199
+ topics:
200
+ - topic_key: msmarco-v2-doc-dev-unicoil-noexp
201
+ eval_key: msmarco-v2-doc-dev
202
+ scores:
203
+ - MRR@100: 0.2231
204
+ R@1K: 0.8987
205
+ - topic_key: msmarco-v2-doc-dev2-unicoil-noexp
206
+ eval_key: msmarco-v2-doc-dev2
207
+ scores:
208
+ - MRR@100: 0.2314
209
+ R@1K: 0.8995
210
+ - topic_key: dl21-unicoil-noexp
211
+ eval_key: dl21-doc
212
+ scores:
213
+ - MAP@100: 0.2587
214
+ nDCG@10: 0.6495
215
+ MRR@100: 0.9282
216
+ R@100: 0.3563
217
+ R@1K: 0.6787
218
+ - name: unicoil-noexp-otf
219
+ display: "uniCOIL (noexp): query inference with PyTorch"
220
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
221
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
222
+ topics:
223
+ - topic_key: msmarco-v2-doc-dev
224
+ eval_key: msmarco-v2-doc-dev
225
+ scores:
226
+ - MRR@100: 0.2232
227
+ R@1K: 0.8987
228
+ - topic_key: msmarco-v2-doc-dev2
229
+ eval_key: msmarco-v2-doc-dev2
230
+ scores:
231
+ - MRR@100: 0.2314
232
+ R@1K: 0.8993
233
+ - topic_key: dl21
234
+ eval_key: dl21-doc
235
+ scores:
236
+ - MAP@100: 0.2589
237
+ nDCG@10: 0.6501
238
+ MRR@100: 0.9282
239
+ R@100: 0.3574
240
+ R@1K: 0.6782
241
+ - name: unicoil
242
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
243
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
244
+ display-row: (3b)
245
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
246
+ topics:
247
+ - topic_key: msmarco-v2-doc-dev-unicoil
248
+ eval_key: msmarco-v2-doc-dev
249
+ scores:
250
+ - MRR@100: 0.2419
251
+ R@1K: 0.9122
252
+ - topic_key: msmarco-v2-doc-dev2-unicoil
253
+ eval_key: msmarco-v2-doc-dev2
254
+ scores:
255
+ - MRR@100: 0.2445
256
+ R@1K: 0.9172
257
+ - topic_key: dl21-unicoil
258
+ eval_key: dl21-doc
259
+ scores:
260
+ - MAP@100: 0.2718
261
+ nDCG@10: 0.6783
262
+ MRR@100: 0.9684
263
+ R@100: 0.3700
264
+ R@1K: 0.7069
265
+ - name: unicoil-otf
266
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
267
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
268
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
269
+ topics:
270
+ - topic_key: msmarco-v2-doc-dev
271
+ eval_key: msmarco-v2-doc-dev
272
+ scores:
273
+ - MRR@100: 0.2419
274
+ R@1K: 0.9120
275
+ - topic_key: msmarco-v2-doc-dev2
276
+ eval_key: msmarco-v2-doc-dev2
277
+ scores:
278
+ - MRR@100: 0.2447
279
+ R@1K: 0.9174
280
+ - topic_key: dl21
281
+ eval_key: dl21-doc
282
+ scores:
283
+ - MAP@100: 0.2720
284
+ nDCG@10: 0.6782
285
+ MRR@100: 0.9684
286
+ R@100: 0.3702
287
+ R@1K: 0.7071
pyserini/2cr/msmarco-v2-passage.yaml ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conditions:
2
+ - name: bm25-default
3
+ display: BM25 original passage (k1=0.9, b=0.4)
4
+ display-html: BM25 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
5
+ display-row: (1a)
6
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25
7
+ topics:
8
+ - topic_key: msmarco-v2-passage-dev
9
+ eval_key: msmarco-v2-passage-dev
10
+ scores:
11
+ - MRR@100: 0.0719
12
+ R@1K: 0.5733
13
+ - topic_key: msmarco-v2-passage-dev2
14
+ eval_key: msmarco-v2-passage-dev2
15
+ scores:
16
+ - MRR@100: 0.0802
17
+ R@1K: 0.5839
18
+ - topic_key: dl21
19
+ eval_key: dl21-passage
20
+ scores:
21
+ - MAP@100: 0.1357
22
+ nDCG@10: 0.4458
23
+ MRR@100: 0.5060
24
+ R@100: 0.3261
25
+ R@1K: 0.6149
26
+ - name: bm25-augmented-default
27
+ display: BM25 augmented passage (k1=0.9, b=0.4)
28
+ display-html: BM25 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
29
+ display-row: (1b)
30
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25
31
+ topics:
32
+ - topic_key: msmarco-v2-passage-dev
33
+ eval_key: msmarco-v2-passage-dev
34
+ scores:
35
+ - MRR@100: 0.0872
36
+ R@1K: 0.6925
37
+ - topic_key: msmarco-v2-passage-dev2
38
+ eval_key: msmarco-v2-passage-dev2
39
+ scores:
40
+ - MRR@100: 0.0917
41
+ R@1K: 0.6933
42
+ - topic_key: dl21
43
+ eval_key: dl21-passage
44
+ scores:
45
+ - MAP@100: 0.0977
46
+ nDCG@10: 0.3977
47
+ MRR@100: 0.5303
48
+ R@100: 0.2709
49
+ R@1K: 0.5835
50
+ - name: bm25-rm3-default
51
+ display: BM25+RM3 original passage (k1=0.9, b=0.4)
52
+ display-html: BM25+RM3 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
53
+ display-row: (1c)
54
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25 --rm3
55
+ topics:
56
+ - topic_key: msmarco-v2-passage-dev
57
+ eval_key: msmarco-v2-passage-dev
58
+ scores:
59
+ - MRR@100: 0.0630
60
+ R@1K: 0.5947
61
+ - topic_key: msmarco-v2-passage-dev2
62
+ eval_key: msmarco-v2-passage-dev2
63
+ scores:
64
+ - MRR@100: 0.0659
65
+ R@1K: 0.6062
66
+ - topic_key: dl21
67
+ eval_key: dl21-passage
68
+ scores:
69
+ - MAP@100: 0.1666
70
+ nDCG@10: 0.4455
71
+ MRR@100: 0.5202
72
+ R@100: 0.3499
73
+ R@1K: 0.6616
74
+ - name: bm25-rm3-augmented-default
75
+ display: BM25+RM3 augmented passage (k1=0.9, b=0.4)
76
+ display-html: BM25+RM3 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
77
+ display-row: (1d)
78
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25 --rm3
79
+ topics:
80
+ - topic_key: msmarco-v2-passage-dev
81
+ eval_key: msmarco-v2-passage-dev
82
+ scores:
83
+ - MRR@100: 0.0667
84
+ R@1K: 0.6857
85
+ - topic_key: msmarco-v2-passage-dev2
86
+ eval_key: msmarco-v2-passage-dev2
87
+ scores:
88
+ - MRR@100: 0.0700
89
+ R@1K: 0.6826
90
+ - topic_key: dl21
91
+ eval_key: dl21-passage
92
+ scores:
93
+ - MAP@100: 0.1050
94
+ nDCG@10: 0.3869
95
+ MRR@100: 0.4915
96
+ R@100: 0.2807
97
+ R@1K: 0.6298
98
+ - name: bm25-d2q-t5-default
99
+ display: BM25 w/ doc2query-T5 original passage (k1=0.9, b=0.4)
100
+ display-html: BM25 w/ doc2query-T5 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
101
+ display-row: (2a)
102
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5 --topics $topics --output $output --bm25
103
+ topics:
104
+ - topic_key: msmarco-v2-passage-dev
105
+ eval_key: msmarco-v2-passage-dev
106
+ scores:
107
+ - MRR@100: 0.1072
108
+ R@1K: 0.7083
109
+ - topic_key: msmarco-v2-passage-dev2
110
+ eval_key: msmarco-v2-passage-dev2
111
+ scores:
112
+ - MRR@100: 0.1123
113
+ R@1K: 0.7151
114
+ - topic_key: dl21
115
+ eval_key: dl21-passage
116
+ scores:
117
+ - MAP@100: 0.1874
118
+ nDCG@10: 0.4816
119
+ MRR@100: 0.6848
120
+ R@100: 0.4076
121
+ R@1K: 0.7078
122
+ - name: bm25-d2q-t5-augmented-default
123
+ display: BM25 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4)
124
+ display-html: BM25 w/ doc2query-T5 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
125
+ display-row: (2b)
126
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5 --topics $topics --output $output --bm25
127
+ topics:
128
+ - topic_key: msmarco-v2-passage-dev
129
+ eval_key: msmarco-v2-passage-dev
130
+ scores:
131
+ - MRR@100: 0.1172
132
+ R@1K: 0.7647
133
+ - topic_key: msmarco-v2-passage-dev2
134
+ eval_key: msmarco-v2-passage-dev2
135
+ scores:
136
+ - MRR@100: 0.1170
137
+ R@1K: 0.7659
138
+ - topic_key: dl21
139
+ eval_key: dl21-passage
140
+ scores:
141
+ - MAP@100: 0.1649
142
+ nDCG@10: 0.4702
143
+ MRR@100: 0.6391
144
+ R@100: 0.3883
145
+ R@1K: 0.6962
146
+ - name: bm25-rm3-d2q-t5-default
147
+ display: BM25+RM3 w/ doc2query-T5 original passage (k1=0.9, b=0.4)
148
+ display-html: BM25+RM3 w/ doc2query-T5 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
149
+ display-row: (2c)
150
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
151
+ topics:
152
+ - topic_key: msmarco-v2-passage-dev
153
+ eval_key: msmarco-v2-passage-dev
154
+ scores:
155
+ - MRR@100: 0.0947
156
+ R@1K: 0.7181
157
+ - topic_key: msmarco-v2-passage-dev2
158
+ eval_key: msmarco-v2-passage-dev2
159
+ scores:
160
+ - MRR@100: 0.0984
161
+ R@1K: 0.7222
162
+ - topic_key: dl21
163
+ eval_key: dl21-passage
164
+ scores:
165
+ - MAP@100: 0.2285
166
+ nDCG@10: 0.5098
167
+ MRR@100: 0.6548
168
+ R@100: 0.4499
169
+ R@1K: 0.7537
170
+ - name: bm25-rm3-d2q-t5-augmented-default
171
+ display: BM25+RM3 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4)
172
+ display-html: BM25+RM3 w/ doc2query-T5 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
173
+ display-row: (2d)
174
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
175
+ topics:
176
+ - topic_key: msmarco-v2-passage-dev
177
+ eval_key: msmarco-v2-passage-dev
178
+ scores:
179
+ - MRR@100: 0.0883
180
+ R@1K: 0.7607
181
+ - topic_key: msmarco-v2-passage-dev2
182
+ eval_key: msmarco-v2-passage-dev2
183
+ scores:
184
+ - MRR@100: 0.0904
185
+ R@1K: 0.7649
186
+ - topic_key: dl21
187
+ eval_key: dl21-passage
188
+ scores:
189
+ - MAP@100: 0.1930
190
+ nDCG@10: 0.4812
191
+ MRR@100: 0.5958
192
+ R@100: 0.4321
193
+ R@1K: 0.7672
194
+ - name: unicoil
195
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
196
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
197
+ display-row: (3b)
198
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --output $output --hits 1000 --impact
199
+ topics:
200
+ - topic_key: msmarco-v2-passage-dev-unicoil
201
+ eval_key: msmarco-v2-passage-dev
202
+ scores:
203
+ - MRR@100: 0.1499
204
+ R@1K: 0.7616
205
+ - topic_key: msmarco-v2-passage-dev2-unicoil
206
+ eval_key: msmarco-v2-passage-dev2
207
+ scores:
208
+ - MRR@100: 0.1577
209
+ R@1K: 0.7671
210
+ - topic_key: dl21-unicoil
211
+ eval_key: dl21-passage
212
+ scores:
213
+ - MAP@100: 0.2538
214
+ nDCG@10: 0.6159
215
+ MRR@100: 0.7311
216
+ R@100: 0.4731
217
+ R@1K: 0.7551
218
+ - name: unicoil-otf
219
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
220
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
221
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact
222
+ topics:
223
+ - topic_key: msmarco-v2-passage-dev
224
+ eval_key: msmarco-v2-passage-dev
225
+ scores:
226
+ - MRR@100: 0.1501
227
+ R@1K: 0.7613
228
+ - topic_key: msmarco-v2-passage-dev2
229
+ eval_key: msmarco-v2-passage-dev2
230
+ scores:
231
+ - MRR@100: 0.1576
232
+ R@1K: 0.7676
233
+ - topic_key: dl21
234
+ eval_key: dl21-passage
235
+ scores:
236
+ - MAP@100: 0.2539
237
+ nDCG@10: 0.6160
238
+ MRR@100: 0.7311
239
+ R@100: 0.4723
240
+ R@1K: 0.7560
241
+ - name: unicoil-noexp
242
+ display: "uniCOIL (noexp): pre-encoded"
243
+ display-html: "uniCOIL (noexp): pre-encoded queries"
244
+ display-row: (3a)
245
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --output $output --hits 1000 --impact
246
+ topics:
247
+ - topic_key: msmarco-v2-passage-dev-unicoil-noexp
248
+ eval_key: msmarco-v2-passage-dev
249
+ scores:
250
+ - MRR@100: 0.1342
251
+ R@1K: 0.7010
252
+ - topic_key: msmarco-v2-passage-dev2-unicoil-noexp
253
+ eval_key: msmarco-v2-passage-dev2
254
+ scores:
255
+ - MRR@100: 0.1385
256
+ R@1K: 0.7114
257
+ - topic_key: dl21-unicoil-noexp
258
+ eval_key: dl21-passage
259
+ scores:
260
+ - MAP@100: 0.2193
261
+ nDCG@10: 0.5756
262
+ MRR@100: 0.6991
263
+ R@100: 0.4246
264
+ R@1K: 0.6897
265
+ - name: unicoil-noexp-otf
266
+ display: "uniCOIL (noexp): query inference with PyTorch"
267
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
268
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact
269
+ topics:
270
+ - topic_key: msmarco-v2-passage-dev
271
+ eval_key: msmarco-v2-passage-dev
272
+ scores:
273
+ - MRR@100: 0.1343
274
+ R@1K: 0.7010
275
+ - topic_key: msmarco-v2-passage-dev2
276
+ eval_key: msmarco-v2-passage-dev2
277
+ scores:
278
+ - MRR@100: 0.1385
279
+ R@1K: 0.7114
280
+ - topic_key: dl21
281
+ eval_key: dl21-passage
282
+ scores:
283
+ - MAP@100: 0.2194
284
+ nDCG@10: 0.5759
285
+ MRR@100: 0.6991
286
+ R@100: 0.4247
287
+ R@1K: 0.6893
pyserini/2cr/msmarco.py ADDED
@@ -0,0 +1,600 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import argparse
18
+ import math
19
+ import os
20
+ import re
21
+ import sys
22
+ import time
23
+ from collections import defaultdict
24
+ from string import Template
25
+
26
+ import pkg_resources
27
+ import yaml
28
+
29
+ from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
30
+
31
# The models: the rows of the results table will be ordered this way.
# Keys are the full collection names (e.g. 'msmarco-v1-passage'); each value lists the
# condition names for that collection in display order.
# An empty string is a spacer, not a condition: generate_report() emits a blank table
# row for it and list_conditions() skips it.
models = {
    # MS MARCO v1 passage
    'msmarco-v1-passage':
    ['bm25-default',
     'bm25-rm3-default',
     'bm25-rocchio-default',
     '',
     'bm25-tuned',
     'bm25-rm3-tuned',
     'bm25-rocchio-tuned',
     '',
     'bm25-d2q-t5-default',
     'bm25-rm3-d2q-t5-default',
     'bm25-rocchio-d2q-t5-default',
     '',
     'bm25-d2q-t5-tuned',
     'bm25-rm3-d2q-t5-tuned',
     'bm25-rocchio-d2q-t5-tuned',
     '',
     'unicoil',
     'unicoil-pytorch',
     'unicoil-onnx',
     'unicoil-noexp',
     'unicoil-noexp-pytorch',
     'unicoil-noexp-onnx',
     '',
     'splade-pp-ed-onnx',
     'splade-pp-sd-onnx',
     '',
     'ance',
     'ance-pytorch',
     '',
     'distilbert-kd',
     'distilbert-kd-pytorch',
     'distilbert-kd-tasb',
     'distilbert-kd-tasb-pytorch',
     '',
     'tct_colbert-v2-hnp',
     'tct_colbert-v2-hnp-pytorch',
     '',
     'slimr',
     'slimr-pp',
     '',
     'aggretriever-distilbert-pytorch',
     'aggretriever-cocondenser-pytorch',
     '',
     'openai-ada2',
     'openai-ada2-hyde'],

    # MS MARCO v1 doc
    'msmarco-v1-doc':
    ['bm25-doc-default',
     'bm25-doc-segmented-default',
     'bm25-rm3-doc-default',
     'bm25-rm3-doc-segmented-default',
     'bm25-rocchio-doc-default',
     'bm25-rocchio-doc-segmented-default',
     '',
     'bm25-doc-tuned',
     'bm25-doc-segmented-tuned',
     'bm25-rm3-doc-tuned',
     'bm25-rm3-doc-segmented-tuned',
     'bm25-rocchio-doc-tuned',
     'bm25-rocchio-doc-segmented-tuned',
     '',
     'bm25-d2q-t5-doc-default',
     'bm25-d2q-t5-doc-segmented-default',
     'bm25-rm3-d2q-t5-doc-default',
     'bm25-rm3-d2q-t5-doc-segmented-default',
     '',
     'bm25-d2q-t5-doc-tuned',
     'bm25-d2q-t5-doc-segmented-tuned',
     'bm25-rm3-d2q-t5-doc-tuned',
     'bm25-rm3-d2q-t5-doc-segmented-tuned',
     '',
     'unicoil-noexp',
     'unicoil-noexp-pytorch',
     '',
     'unicoil',
     'unicoil-pytorch'],

    # MS MARCO v2 passage
    'msmarco-v2-passage':
    ['bm25-default',
     'bm25-augmented-default',
     'bm25-rm3-default',
     'bm25-rm3-augmented-default',
     '',
     'bm25-d2q-t5-default',
     'bm25-d2q-t5-augmented-default',
     'bm25-rm3-d2q-t5-default',
     'bm25-rm3-d2q-t5-augmented-default',
     '',
     'unicoil-noexp',
     'unicoil',
     '',
     'unicoil-noexp-otf',
     'unicoil-otf'],

    # MS MARCO v2 doc
    'msmarco-v2-doc':
    ['bm25-doc-default',
     'bm25-doc-segmented-default',
     'bm25-rm3-doc-default',
     'bm25-rm3-doc-segmented-default',
     '',
     'bm25-d2q-t5-doc-default',
     'bm25-d2q-t5-doc-segmented-default',
     'bm25-rm3-d2q-t5-doc-default',
     'bm25-rm3-d2q-t5-doc-segmented-default',
     '',
     'unicoil-noexp',
     'unicoil',
     '',
     'unicoil-noexp-otf',
     'unicoil-otf'
     ]
}
150
+
151
# trec_eval argument strings, looked up as
#   trec_eval_metric_definitions[collection][eval_key][metric]
# where `collection` is the full collection name, `eval_key` comes from the YAML
# `eval_key` field, and `metric` is a key of a YAML `scores` entry.
trec_eval_metric_definitions = {
    'msmarco-v1-passage': {
        'msmarco-passage-dev-subset': {
            'MRR@10': '-c -M 10 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl19-passage': {
            'MAP': '-c -l 2 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -l 2 -m recall.1000'
        },
        'dl20-passage': {
            'MAP': '-c -l 2 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -l 2 -m recall.1000'
        }
    },
    'msmarco-v1-doc': {
        'msmarco-doc-dev': {
            # NOTE(review): labelled 'MRR@10' but evaluated with '-M 100'; the label is also
            # the key read by generate_report()/run_conditions() for v1 -- confirm intentional.
            'MRR@10': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl19-doc': {
            'MAP': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -m recall.1000'
        },
        'dl20-doc': {
            'MAP': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -m recall.1000'
        }
    },
    'msmarco-v2-passage': {
        'msmarco-v2-passage-dev': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'msmarco-v2-passage-dev2': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl21-passage': {
            'MAP@100': '-c -l 2 -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'MRR@100': '-c -l 2 -M 100 -m recip_rank',
            'R@100': '-c -l 2 -m recall.100',
            'R@1K': '-c -l 2 -m recall.1000'
        }
    },
    'msmarco-v2-doc': {
        'msmarco-v2-doc-dev': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'msmarco-v2-doc-dev2': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl21-doc': {
            'MAP@100': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@100': '-c -m recall.100',
            'R@1K': '-c -m recall.1000'
        }
    }
}
219
+
220
+
221
def find_msmarco_table_topic_set_key_v1(topic_key):
    """Collapse an MS MARCO v1 topic key into the short key used for table columns.

    Variants such as 'dl19-passage-unicoil' and 'dl19-passage' both map to 'dl19'.
    Keys starting with 'dl20' map to 'dl20' and keys starting with 'msmarco' map
    to 'dev'. Returns '' when no known prefix matches.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_short_key = (
        ('dl19', 'dl19'),
        ('dl20', 'dl20'),
        ('msmarco', 'dev'),
    )
    for prefix, short_key in prefix_to_short_key:
        if topic_key.startswith(prefix):
            return short_key
    return ''
232
+
233
+
234
def find_msmarco_table_topic_set_key_v2(topic_key):
    """Collapse an MS MARCO v2 topic key into the short key used for table columns.

    Keys ending in 'dev', 'dev-unicoil' or 'dev-unicoil-noexp' map to 'dev'; the
    corresponding 'dev2' suffixes map to 'dev2'; keys starting with 'dl21' map to
    'dl21'. Returns '' when nothing matches.
    """
    dev_suffixes = ('dev', 'dev-unicoil', 'dev-unicoil-noexp')
    dev2_suffixes = ('dev2', 'dev2-unicoil', 'dev2-unicoil-noexp')

    # Suffix checks take precedence over the 'dl21' prefix check.
    if topic_key.endswith(dev_suffixes):
        return 'dev'
    if topic_key.endswith(dev2_suffixes):
        return 'dev2'
    if topic_key.startswith('dl21'):
        return 'dl21'
    return ''
244
+
245
+
246
def format_command(raw):
    """Insert shell line continuations into a search command for readable display.

    A backslash-newline break is placed before each of the known flags, and after
    a '.txt ' token so that options following the output file (e.g.
    "--hits 1000 --impact") start on their own line. When the output file ends the
    command there is no trailing space after '.txt', so no extra break is added.
    """
    continuation = '\\\n '
    flags = ('--topics', '--threads', '--index', '--output ',
             '--encoder', '--onnx-encoder', '--encoded-corpus')

    formatted = raw
    for flag in flags:
        formatted = formatted.replace(flag, continuation + flag)

    return formatted.replace('.txt ', '.txt ' + continuation)
258
+
259
+
260
def read_file(f):
    """Return the full text content of the file at path ``f``.

    The file is opened in text mode with the platform default encoding. A context
    manager guarantees the handle is closed even if reading raises.
    """
    with open(f, 'r') as fin:
        return fin.read()
266
+
267
+
268
def list_conditions(args):
    """Print each condition name for ``args.collection``, one per line.

    Empty-string entries in ``models`` are layout spacers and are not printed.
    """
    names = [entry for entry in models[args.collection] if entry != '']
    for entry in names:
        print(entry)
273
+
274
+
275
def generate_report(args):
    """Write the HTML reproduction report for ``args.collection`` to ``args.output``.

    Reads the packaged ``<collection>.yaml`` file and the matching HTML page/row
    templates, builds one table row per entry in ``models[args.collection]``
    (empty entries become blank spacer rows), and writes the filled-in page.

    Raises:
        ValueError: if ``args.collection`` is not one of the four known collections.
    """
    yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml')

    # v1 collections share one row template and v2 collections share another;
    # each collection has its own page template.
    if args.collection == 'msmarco-v1-passage':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_passage.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template'))
    elif args.collection == 'msmarco-v1-doc':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_doc.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template'))
    elif args.collection == 'msmarco-v2-passage':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_passage.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template'))
    elif args.collection == 'msmarco-v2-doc':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_doc.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template'))
    else:
        raise ValueError(f'Unknown corpus: {args.collection}')

    # table[condition][short_topic_key][metric] -> expected score (0.0 if absent).
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    # commands / eval_commands[condition][short_topic_key] -> command text ('' if absent).
    commands = defaultdict(lambda: defaultdict(lambda: ''))
    eval_commands = defaultdict(lambda: defaultdict(lambda: ''))

    # condition name -> HTML display string, and condition name -> row label.
    table_keys = {}
    row_ids = {}

    with open(yaml_file) as f:
        yaml_data = yaml.safe_load(f)
        for condition in yaml_data['conditions']:
            name = condition['name']
            display = condition['display-html']
            # 'display-row' is optional in the YAML; conditions without it get an empty label.
            row_id = condition['display-row'] if 'display-row' in condition else ''
            cmd_template = condition['command']

            row_ids[name] =row_id
            table_keys[name] = display

            for topic_set in condition['topics']:
                topic_key = topic_set['topic_key']
                eval_key = topic_set['eval_key']

                if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
                    short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key)
                else:
                    short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key)

                # Fill $topics and $output in the YAML command template.
                runfile = f'run.{args.collection}.{name}.{short_topic_key}.txt'
                cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile)
                commands[name][short_topic_key] = cmd

                for expected in topic_set['scores']:
                    for metric in expected:
                        eval_cmd = f'python -m pyserini.eval.trec_eval ' + \
                                   f'{trec_eval_metric_definitions[args.collection][eval_key][metric]} {eval_key} {runfile}'
                        # One evaluation command per metric, newline-separated.
                        eval_commands[name][short_topic_key] += eval_cmd + '\n'
                        table[name][short_topic_key][metric] = expected[metric]

        if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
            row_cnt = 1

            html_rows = []
            for name in models[args.collection]:
                if not name:
                    # Add blank row for spacing
                    html_rows.append('<tr><td style="border-bottom: 0"></td></tr>')
                    continue
                s = Template(row_template)
                # A score of 0 means "no expected score recorded" and is rendered as '-'.
                s = s.substitute(row_cnt=row_cnt,
                                 condition_name=table_keys[name],
                                 row=row_ids[name],
                                 s1=f'{table[name]["dl19"]["MAP"]:.4f}' if table[name]['dl19']['MAP'] != 0 else '-',
                                 s2=f'{table[name]["dl19"]["nDCG@10"]:.4f}' if table[name]['dl19']['nDCG@10'] != 0 else '-',
                                 s3=f'{table[name]["dl19"]["R@1K"]:.4f}' if table[name]['dl19']['R@1K'] != 0 else '-',
                                 s4=f'{table[name]["dl20"]["MAP"]:.4f}' if table[name]['dl20']['MAP'] != 0 else '-',
                                 s5=f'{table[name]["dl20"]["nDCG@10"]:.4f}' if table[name]['dl20']['nDCG@10'] != 0 else '-',
                                 s6=f'{table[name]["dl20"]["R@1K"]:.4f}' if table[name]['dl20']['R@1K'] != 0 else '-',
                                 s7=f'{table[name]["dev"]["MRR@10"]:.4f}' if table[name]['dev']['MRR@10'] != 0 else '-',
                                 s8=f'{table[name]["dev"]["R@1K"]:.4f}' if table[name]['dev']['R@1K'] != 0 else '-',
                                 cmd1=format_command(commands[name]['dl19']),
                                 cmd2=format_command(commands[name]['dl20']),
                                 cmd3=format_command(commands[name]['dev']),
                                 eval_cmd1=eval_commands[name]['dl19'],
                                 eval_cmd2=eval_commands[name]['dl20'],
                                 eval_cmd3=eval_commands[name]['dev']
                                 )

                # If we don't have scores, we want to remove the commands also. Use simple regexp substitution.
                if table[name]['dl19']['MAP'] == 0:
                    s = re.sub(re.compile('Command to generate run on TREC 2019 queries:.*?</div>',
                                          re.MULTILINE | re.DOTALL),
                               'Not available.</div>', s)
                if table[name]['dl20']['MAP'] == 0:
                    s = re.sub(re.compile('Command to generate run on TREC 2020 queries:.*?</div>',
                                          re.MULTILINE | re.DOTALL),
                               'Not available.</div>', s)
                if table[name]['dev']['MRR@10'] == 0:
                    s = re.sub(re.compile('Command to generate run on dev queries:.*?</div>',
                                          re.MULTILINE | re.DOTALL),
                               'Not available.</div>', s)

                html_rows.append(s)
                row_cnt += 1

            all_rows = '\n'.join(html_rows)
            if args.collection == 'msmarco-v1-passage':
                full_name = 'MS MARCO V1 Passage'
            else:
                full_name = 'MS MARCO V1 Document'

            with open(args.output, 'w') as out:
                out.write(Template(html_template).substitute(title=full_name, rows=all_rows))
        else:
            row_cnt = 1

            html_rows = []
            for name in models[args.collection]:
                if not name:
                    # Add blank row for spacing
                    html_rows.append('<tr><td style="border-bottom: 0"></td></tr>')
                    continue
                s = Template(row_template)
                # Unlike the v1 branch, scores are always formatted here; a missing score shows as 0.0000.
                s = s.substitute(row_cnt=row_cnt,
                                 condition_name=table_keys[name],
                                 row=row_ids[name],
                                 s1=f'{table[name]["dl21"]["MAP@100"]:.4f}',
                                 s2=f'{table[name]["dl21"]["nDCG@10"]:.4f}',
                                 s3=f'{table[name]["dl21"]["MRR@100"]:.4f}',
                                 s4=f'{table[name]["dl21"]["R@100"]:.4f}',
                                 s5=f'{table[name]["dl21"]["R@1K"]:.4f}',
                                 s6=f'{table[name]["dev"]["MRR@100"]:.4f}',
                                 s7=f'{table[name]["dev"]["R@1K"]:.4f}',
                                 s8=f'{table[name]["dev2"]["MRR@100"]:.4f}',
                                 s9=f'{table[name]["dev2"]["R@1K"]:.4f}',
                                 cmd1=format_command(commands[name]['dl21']),
                                 cmd2=format_command(commands[name]['dev']),
                                 cmd3=format_command(commands[name]['dev2']),
                                 eval_cmd1=eval_commands[name]['dl21'],
                                 eval_cmd2=eval_commands[name]['dev'],
                                 eval_cmd3=eval_commands[name]['dev2']
                                 )
                html_rows.append(s)
                row_cnt += 1

            all_rows = '\n'.join(html_rows)
            if args.collection == 'msmarco-v2-passage':
                full_name = 'MS MARCO V2 Passage'
            else:
                full_name = 'MS MARCO V2 Document'

            with open(args.output, 'w') as out:
                out.write(Template(html_template).substitute(title=full_name, rows=all_rows))
425
+
426
+
427
def run_conditions(args):
    """Run the regression conditions for ``args.collection`` and print a score table.

    For every selected condition and topic set, the search command from the packaged
    YAML file is executed (unless the run file already exists or ``args.dry_run`` is
    set), each expected metric is evaluated and compared against the YAML value
    (unless ``args.skip_eval`` is set), and a summary table is printed at the end.

    Reads from ``args``: ``collection``, ``all``, ``condition``, ``directory``,
    ``display_commands``, ``dry_run`` and ``skip_eval``.

    Raises:
        ValueError: if ``args.condition`` is given but names no condition in the YAML
            file. Previously this ran nothing and then failed with a bare ``KeyError``
            while printing the summary row.
    """
    start = time.time()

    # table[condition][short_topic_key][metric] -> score (0.0 if never filled in).
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    table_keys = {}

    yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml')

    with open(yaml_file) as f:
        yaml_data = yaml.safe_load(f)

    # Fail fast on a misspelled --condition instead of crashing after the fact.
    if args.condition and all(c['name'] != args.condition for c in yaml_data['conditions']):
        raise ValueError(f'Unknown condition: {args.condition}')

    is_v1 = args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc'

    for condition in yaml_data['conditions']:
        # Either we're running all conditions, or running only the condition specified in --condition
        if not args.all:
            if not condition['name'] == args.condition:
                continue

        name = condition['name']
        display = condition['display']
        cmd_template = condition['command']

        print(f'# Running condition "{name}": {display}\n')
        for topic_set in condition['topics']:
            topic_key = topic_set['topic_key']
            eval_key = topic_set['eval_key']

            if is_v1:
                short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key)
            else:
                short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key)

            print(f' - topic_key: {topic_key}')

            runfile = os.path.join(args.directory, f'run.{args.collection}.{name}.{short_topic_key}.txt')
            cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile)

            if args.display_commands:
                print(f'\n```bash\n{format_command(cmd)}\n```\n')

            # An existing run file is reused rather than regenerated.
            if not os.path.exists(runfile):
                if not args.dry_run:
                    os.system(cmd)

            for expected in topic_set['scores']:
                for metric in expected:
                    table_keys[name] = display
                    if not args.skip_eval:
                        # If the runfile doesn't exist, we can't evaluate.
                        # This would be the case if --dry-run were set.
                        if not os.path.exists(runfile):
                            continue

                        score = float(
                            run_eval_and_return_metric(
                                metric,
                                eval_key,
                                trec_eval_metric_definitions[args.collection][eval_key][metric],
                                runfile))
                        if math.isclose(score, float(expected[metric])):
                            result_str = ok_str
                        # Flaky tests
                        elif args.collection == 'msmarco-v1-passage' \
                                and topic_key == 'msmarco-passage-dev-subset' and name == 'ance-pytorch' \
                                and metric == 'MRR@10' and abs(score-float(expected[metric])) <= 0.0001:
                            result_str = okish_str
                        else:
                            result_str = fail_str + f' expected {expected[metric]:.4f}'
                        print(f' {metric:7}: {score:.4f} {result_str}')
                        table[name][short_topic_key][metric] = score
                    else:
                        # Evaluation skipped: report the expected score from the YAML file.
                        table[name][short_topic_key][metric] = expected[metric]

            if not args.skip_eval:
                print('')

    # With --condition only that row is printed; otherwise every row in display order,
    # where empty entries in `models` produce blank spacer lines.
    row_names = [args.condition] if args.condition else models[args.collection]

    if is_v1:
        print(' ' * 69 + 'TREC 2019' + ' ' * 16 + 'TREC 2020' + ' ' * 12 + 'MS MARCO dev')
        print(' ' * 62 + 'MAP nDCG@10 R@1K MAP nDCG@10 R@1K MRR@10 R@1K')
        print(' ' * 62 + '-' * 22 + ' ' + '-' * 22 + ' ' + '-' * 14)

        for name in row_names:
            if not name:
                print('')
                continue
            print(f'{table_keys[name]:60}' +
                  f'{table[name]["dl19"]["MAP"]:8.4f}{table[name]["dl19"]["nDCG@10"]:8.4f}{table[name]["dl19"]["R@1K"]:8.4f} ' +
                  f'{table[name]["dl20"]["MAP"]:8.4f}{table[name]["dl20"]["nDCG@10"]:8.4f}{table[name]["dl20"]["R@1K"]:8.4f} ' +
                  f'{table[name]["dev"]["MRR@10"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}')
    else:
        print(' ' * 77 + 'TREC 2021' + ' ' * 18 + 'MS MARCO dev' + ' ' * 6 + 'MS MARCO dev2')
        print(' ' * 62 + 'MAP@100 nDCG@10 MRR@100 R@100 R@1K MRR@100 R@1K MRR@100 R@1K')
        print(' ' * 62 + '-' * 38 + ' ' + '-' * 14 + ' ' + '-' * 14)

        for name in row_names:
            if not name:
                print('')
                continue
            print(f'{table_keys[name]:60}' +
                  f'{table[name]["dl21"]["MAP@100"]:8.4f}{table[name]["dl21"]["nDCG@10"]:8.4f}' +
                  f'{table[name]["dl21"]["MRR@100"]:8.4f}{table[name]["dl21"]["R@100"]:8.4f}{table[name]["dl21"]["R@1K"]:8.4f} ' +
                  f'{table[name]["dev"]["MRR@100"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f} ' +
                  f'{table[name]["dev2"]["MRR@100"]:8.4f}{table[name]["dev2"]["R@1K"]:8.4f}')

    end = time.time()

    print('\n')
    print(f'Total elapsed time: {end - start:.0f}s')
553
+
554
+
555
if __name__ == '__main__':
    # Command-line entry point: list conditions, generate the HTML report, or run conditions.
    parser = argparse.ArgumentParser(description='Generate regression matrix for MS MARCO corpora.')
    parser.add_argument('--collection', type=str,
                        help='Collection = {v1-passage, v1-doc, v2-passage, v2-doc}.', required=True)
    # To list all conditions
    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
    # For generating reports
    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
    parser.add_argument('--output', type=str, help='File to store report.', required=False)
    # For actually running the experimental conditions
    parser.add_argument('--all', action='store_true', default=False, help='Run all conditions.')
    parser.add_argument('--condition', type=str, help='Condition to run.', required=False)
    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')
    args = parser.parse_args()

    # Expand the short command-line alias into the full collection name used as the
    # key of `models` and `trec_eval_metric_definitions`.
    collection_aliases = {
        'v1-passage': 'msmarco-v1-passage',
        'v1-doc': 'msmarco-v1-doc',
        'v2-passage': 'msmarco-v2-passage',
        'v2-doc': 'msmarco-v2-doc',
    }
    if args.collection not in collection_aliases:
        raise ValueError(f'Unknown corpus: {args.collection}')
    args.collection = collection_aliases[args.collection]

    if args.list_conditions:
        list_conditions(args)
        sys.exit()

    if args.generate_report:
        if not args.output:
            # parser.error() prints usage and the message to stderr and exits with a
            # non-zero status, so calling scripts can detect the usage error.
            parser.error('Must specify report filename with --output.')

        generate_report(args)
        sys.exit()

    if not args.all and not args.condition:
        parser.error('Must specify a specific condition using --condition or use --all to run all conditions.')

    run_conditions(args)
pyserini/2cr/msmarco_html_row_v1.template ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Condition: ${condition_name} -->
2
+ <tr class="accordion-toggle collapsed" id="row${row_cnt}" data-toggle="collapse" data-parent="#row${row_cnt}" href="#collapse${row_cnt}">
3
+ <td class="expand-button"></td>
4
+ <td style="min-width: 85px">$row</td>
5
+ <td style="min-width: 400px">${condition_name}</td>
6
+ <td>$s1</td>
7
+ <td>$s2</td>
8
+ <td>$s3</td>
9
+ <td></td>
10
+ <td>$s4</td>
11
+ <td>$s5</td>
12
+ <td>$s6</td>
13
+ <td></td>
14
+ <td>$s7</td>
15
+ <td>$s8</td>
16
+ </tr>
17
+ <tr class="hide-table-padding">
18
+ <td></td>
19
+ <td colspan="11">
20
+ <div id="collapse${row_cnt}" class="collapse in p-3">
21
+
22
+ <!-- Tabs navs -->
23
+ <ul class="nav nav-tabs mb-3" id="row${row_cnt}-tabs" role="tablist">
24
+ <li class="nav-item" role="presentation">
25
+ <a class="nav-link active" id="row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab1" role="tab" aria-controls="row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">TREC 2019</a>
26
+ </li>
27
+ <li class="nav-item" role="presentation">
28
+ <a class="nav-link" id="row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab2" role="tab" aria-controls="row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">TREC 2020</a>
29
+ </li>
30
+ <li class="nav-item" role="presentation">
31
+ <a class="nav-link" id="row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab3" role="tab" aria-controls="row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">dev</a>
32
+ </li>
33
+ </ul>
34
+ <!-- Tabs navs -->
35
+
36
+ <!-- Tabs content -->
37
+ <div class="tab-content" id="row${row_cnt}-content">
38
+ <div class="tab-pane fade show active" id="row${row_cnt}-tab1" role="tabpanel" aria-labelledby="row${row_cnt}-tab1-header">
39
+ Command to generate run on TREC 2019 queries:
40
+
41
+ <blockquote class="mycode">
42
+ <pre><code>$cmd1
43
+ </code></pre></blockquote>
44
+ Evaluation commands:
45
+
46
+ <blockquote class="mycode">
47
+ <pre><code>${eval_cmd1}</code></pre>
48
+ </blockquote>
49
+
50
+ </div>
51
+ <div class="tab-pane fade" id="row${row_cnt}-tab2" role="tabpanel" aria-labelledby="row${row_cnt}-tab2-header">
52
+ Command to generate run on TREC 2020 queries:
53
+
54
+ <blockquote class="mycode">
55
+ <pre><code>$cmd2
56
+ </code></pre></blockquote>
57
+ Evaluation commands:
58
+
59
+ <blockquote class="mycode">
60
+ <pre><code>${eval_cmd2}</code></pre>
61
+ </blockquote>
62
+
63
+ </div>
64
+ <div class="tab-pane fade" id="row${row_cnt}-tab3" role="tabpanel" aria-labelledby="row${row_cnt}-tab3-header">
65
+ Command to generate run on dev queries:
66
+
67
+ <blockquote class="mycode">
68
+ <pre><code>$cmd3
69
+ </code></pre></blockquote>
70
+ Evaluation commands:
71
+
72
+ <blockquote class="mycode">
73
+ <pre><code>${eval_cmd3}</code></pre>
74
+ </blockquote>
75
+
76
+ </div>
77
+ </div>
78
+ <!-- Tabs content -->
79
+
80
+ </div></td>
81
+ </tr>
pyserini/2cr/msmarco_html_row_v2.template ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Condition: ${condition_name} -->
2
+ <tr class="accordion-toggle collapsed" id="row${row_cnt}" data-toggle="collapse" data-parent="#row${row_cnt}" href="#collapse${row_cnt}">
3
+ <td class="expand-button"></td>
4
+ <td>$row</td>
5
+ <td style="min-width: 400px">${condition_name}</td>
6
+ <td>$s1</td>
7
+ <td>$s2</td>
8
+ <td>$s3</td>
9
+ <td>$s4</td>
10
+ <td>$s5</td>
11
+ <td></td>
12
+ <td>$s6</td>
13
+ <td>$s7</td>
14
+ <td></td>
15
+ <td>$s8</td>
16
+ <td>$s9</td>
17
+ </tr>
18
+ <tr class="hide-table-padding">
19
+ <td></td>
20
+ <td colspan="12">
21
+ <div id="collapse${row_cnt}" class="collapse in p-3">
22
+
23
+ <!-- Tabs navs -->
24
+ <ul class="nav nav-tabs mb-3" id="row${row_cnt}-tabs" role="tablist">
25
+ <li class="nav-item" role="presentation">
26
+ <a class="nav-link active" id="row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab1" role="tab" aria-controls="row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">TREC 2021</a>
27
+ </li>
28
+ <li class="nav-item" role="presentation">
29
+ <a class="nav-link" id="row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab2" role="tab" aria-controls="row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">dev</a>
30
+ </li>
31
+ <li class="nav-item" role="presentation">
32
+ <a class="nav-link" id="row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab3" role="tab" aria-controls="row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">dev2</a>
33
+ </li>
34
+ </ul>
35
+ <!-- Tabs navs -->
36
+
37
+ <!-- Tabs content -->
38
+ <div class="tab-content" id="row${row_cnt}-content">
39
+ <div class="tab-pane fade show active" id="row${row_cnt}-tab1" role="tabpanel" aria-labelledby="row${row_cnt}-tab1-header">
40
+ Command to generate run on TREC 2021 queries:
41
+
42
+ <blockquote class="mycode">
43
+ <pre><code>$cmd1
44
+ </code></pre></blockquote>
45
+ Evaluation commands:
46
+
47
+ <blockquote class="mycode">
48
+ <pre><code>${eval_cmd1}</code></pre>
49
+ </blockquote>
50
+
51
+ </div>
52
+ <div class="tab-pane fade" id="row${row_cnt}-tab2" role="tabpanel" aria-labelledby="row${row_cnt}-tab2-header">
53
+ Command to generate run on dev queries:
54
+
55
+ <blockquote class="mycode">
56
+ <pre><code>$cmd2
57
+ </code></pre></blockquote>
58
+ Evaluation commands:
59
+
60
+ <blockquote class="mycode">
61
+ <pre><code>${eval_cmd2}</code></pre>
62
+ </blockquote>
63
+
64
+ </div>
65
+ <div class="tab-pane fade" id="row${row_cnt}-tab3" role="tabpanel" aria-labelledby="row${row_cnt}-tab3-header">
66
+ Command to generate run on dev2 queries:
67
+
68
+ <blockquote class="mycode">
69
+ <pre><code>$cmd3
70
+ </code></pre></blockquote>
71
+ Evaluation commands:
72
+
73
+ <blockquote class="mycode">
74
+ <pre><code>${eval_cmd3}</code></pre>
75
+ </blockquote>
76
+
77
+ </div>
78
+ </div>
79
+ <!-- Tabs content -->
80
+
81
+ </div></td>
82
+ </tr>
pyserini/2cr/msmarco_html_v1_doc.template ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
6
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
7
+ <title>Pyserini Reproductions: MS MARCO V1 Document</title>
8
+ <!-- Font Awesome -->
9
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
10
+ <!-- Google Fonts Roboto -->
11
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
12
+ <!-- MDB -->
13
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
14
+
15
+ <style>
16
+ tr.hide-table-padding td {
17
+ padding: 0;
18
+ }
19
+
20
+ .expand-button {
21
+ position: relative;
22
+ }
23
+
24
+ .accordion-toggle .expand-button:after {
25
+ position: absolute;
26
+ left:.75rem;
27
+ top: 50%;
28
+ transform: translate(0, -50%);
29
+ content: '-';
30
+ }
31
+
32
+ .accordion-toggle.collapsed .expand-button:after {
33
+ content: '+';
34
+ }
35
+
36
+ blockquote.mycode {
37
+ border-left: 3px solid #ccc;
38
+ margin-left: 25px;
39
+ margin-top: 15px;
40
+ padding-left: 15px;
41
+ }
42
+
43
+ blockquote.mycode2 {
44
+ border-left: 3px solid #ccc;
45
+ margin-left: 25px;
46
+ padding-top: 10px;
47
+ padding-bottom: 10px;
48
+ padding-left: 15px;
49
+ }
50
+
51
+ tr th.headertop {
52
+ border-bottom: none;
53
+ padding-bottom: 0rem
54
+ }
55
+
56
+ tr th.headerbottom {
57
+ padding-top: 0rem
58
+ }
59
+
60
+ .table>:not(caption)>*>*{padding:0.75rem 0.75rem}
61
+
62
+ .copy-code-button {
63
+ border-radius: 0;
64
+ min-width: 55px;
65
+ background: none repeat scroll 0 0 transparent;
66
+ background-color: grey;
67
+ color: #F1F2F3 !important;
68
+ cursor: pointer;
69
+ border-style: none;
70
+ font-family: 'HELVETICA',sans-serif;
71
+ font-size: 0.8em;
72
+ font-weight: normal;
73
+ text-align: center;
74
+ text-decoration: none;
75
+ text-indent: 0;
76
+ text-transform: uppercase;
77
+ font-weight: 500;
78
+ line-height: 1.42rem;
79
+ margin: 0;
80
+ padding: 3px 8px;
81
+ position: absolute !important;
82
+ top: 0 !important;
83
+ right: 0 !important;
84
+ }
85
+
86
+ .copy-code-button > span {
87
+ color: #F1F2F3 !important;
88
+ }
89
+
90
+ .copy-code-button, ::before, ::after {
91
+ box-sizing: inherit;
92
+ }
93
+
94
+ .copy-code-button::before {
95
+ content: '';
96
+ display: inline-block;
97
+ width: 16px;
98
+ height: 16px;
99
+ margin-right: 3px;
100
+ background-size: contain;
101
+ background-image: url("");
102
+ background-repeat: no-repeat;
103
+ position: relative;
104
+ top: 3px;
105
+ }
106
+
107
+ .copy-code-button:focus {
108
+ /* Avoid an ugly focus outline on click in Chrome,
109
+ but darken the button for accessibility.
110
+ See https://stackoverflow.com/a/25298082/1481479 */
111
+ /* background-color: #E6E6E6; */
112
+ outline: 0;
113
+ }
114
+
115
+ pre[class*="prettyprint"] {
116
+ position: relative;
117
+ overflow: hidden;
118
+ }
119
+ </style>
120
+ </head>
121
+ <body>
122
+
123
+ <!-- Background image -->
124
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
125
+ <div class="mask" style="
126
+ background: linear-gradient(
127
+ 45deg,
128
+ rgba(29, 236, 197, 0.7),
129
+ rgba(91, 14, 214, 0.7) 100%
130
+ );
131
+ ">
132
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
133
+ <div class="text-white">
134
+ <h1 class="mb-3">$title</h1>
135
+ </div>
136
+ </div>
137
+ </div>
138
+ </div>
139
+ <!-- Background image -->
140
+
141
+ <div class="container my-4">
142
+
143
+ <p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets.
144
+ Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
145
+
146
+ <div class="table-responsive">
147
+ <table class="table">
148
+ <thead>
149
+ <tr>
150
+ <th class="headertop"></th>
151
+ <th class="headertop"></th>
152
+ <th class="headertop"></th>
153
+ <th class="headertop" colspan="4"><b>TREC 2019</b></th>
154
+ <th class="headertop" colspan="4"><b>TREC 2020</b></th>
155
+ <th class="headertop" colspan="3"><b>dev</b></th>
156
+ </tr>
157
+ <tr>
158
+ <th class="headerbottom" scope="col"></th>
159
+ <th class="headerbottom" scope="col"></th>
160
+ <th class="headerbottom" scope="col"></th>
161
+ <th class="headerbottom" scope="col"><br/>AP@100</th>
162
+ <th class="headerbottom" scope="col">nDCG@10</th>
163
+ <th class="headerbottom" scope="col">R@1K</th>
164
+ <th class="headerbottom" scope="col"></th>
165
+ <th class="headerbottom" scope="col"><br/>AP@100</th>
166
+ <th class="headerbottom" scope="col">nDCG@10</th>
167
+ <th class="headerbottom" scope="col">R@1K</th>
168
+ <th class="headerbottom" scope="col"></th>
169
+ <th class="headerbottom" scope="col">RR@100</th>
170
+ <th class="headerbottom" scope="col">R@1K</th>
171
+ </tr>
172
+ </thead>
173
+ <tbody>
174
+
175
+ $rows
176
+
177
+ </tbody>
178
+ </table>
179
+ </div>
180
+
181
+ <ul style="list-style-type:none; padding-top: 25px">
182
+
183
+ <li><p>[1] Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin.
184
+ <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
185
+ <i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
186
+
187
+ <p>&nbsp; &nbsp; &nbsp; &nbsp;Numbers in parentheses correspond to rows in Table 2 of the paper.</p></li>
188
+
189
+ </ul>
190
+
191
+ <div style="padding-top: 20px"></div>
192
+
193
+ <h4>Programmatic Execution</h4>
194
+
195
+ <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
196
+ To list all the experimental conditions:</p>
197
+
198
+ <blockquote class="mycode2"><tt>
199
+ python -m pyserini.2cr.msmarco --collection v1-doc --list-conditions
200
+ </tt></blockquote>
201
+
202
+ <p>These conditions correspond to the table rows above.</p>
203
+
204
+ <p>For all conditions, just show the commands in a "dry run":</p>
205
+
206
+ <blockquote class="mycode2"><tt>
207
+ python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands --dry-run
208
+ </tt></blockquote>
209
+
210
+ <p>To actually run all the experimental conditions:</p>
211
+
212
+ <blockquote class="mycode2"><tt>
213
+ python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands
214
+ </tt></blockquote>
215
+
216
+ <p>With the above command, run files will be placed in the current directory.
217
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
218
+
219
+ <p>To show the commands for a specific condition:</p>
220
+
221
+ <blockquote class="mycode2"><tt>
222
+ python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands --dry-run
223
+ </tt></blockquote>
224
+
225
+ <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
226
+
227
+ <p>To actually run a specific condition:</p>
228
+
229
+ <blockquote class="mycode2"><tt>
230
+ python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands
231
+ </tt></blockquote>
232
+
233
+ <p>Again, with the above command, run files will be placed in the current directory.
234
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
235
+
236
+ <p>Finally, to generate this page:</p>
237
+
238
+ <blockquote class="mycode2"><tt>
239
+ python -m pyserini.2cr.msmarco --collection v1-doc --generate-report --output msmarco-v1-doc.html
240
+ </tt></blockquote>
241
+
242
+ <p>The output file <tt>msmarco-v1-doc.html</tt> should be identical to this page.</p>
243
+
244
+ <div style="padding-top: 50px"></div>
245
+
246
+ </div>
247
+
248
+
249
+
250
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
251
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
252
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
253
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
254
+
255
+ <script>
256
+ document.querySelectorAll('pre').forEach(function (codeBlock) {
257
+ var button = document.createElement('button');
258
+ button.className = 'copy-code-button';
259
+ button.type = 'button';
260
+ var s = codeBlock.innerText;
261
+ button.setAttribute('data-clipboard-text',s);
262
+ button.innerText = 'Copy';
263
+
264
+ // var pre = codeBlock.parentNode;
265
+ codeBlock.classList.add('prettyprint');
266
+ // pre.parentNode.insertBefore(button, pre);
267
+ codeBlock.appendChild(button);
268
+ });
269
+
270
+ var clipboard = new ClipboardJS('.copy-code-button');
271
+
272
+ clipboard.on('success', function(e) {
273
+ console.info('Action:', e.action);
274
+ console.info('Text:', e.text);
275
+ console.info('Trigger:', e.trigger);
276
+ e.trigger.textContent = 'Copied';
277
+ window.setTimeout(function() {
278
+ e.trigger.textContent = 'Copy';
279
+ }, 2000);
280
+ e.clearSelection();
281
+ });
282
+
283
+ clipboard.on('error', function(e) {
284
+ console.error('Action:', e.action);
285
+ console.error('Trigger:', e.trigger);
286
+ e.trigger.textContent = 'Error Copying';
287
+ window.setTimeout(function() {
288
+ e.trigger.textContent = 'Copy';
289
+ }, 2000);
290
+ e.clearSelection();
291
+ });
292
+
293
+ </script>
294
+
295
+ </body>
296
+ </html>
pyserini/2cr/msmarco_html_v1_passage.template ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
6
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
7
+ <title>Pyserini Reproductions: MS MARCO V1 Passage</title>
8
+ <!-- Font Awesome -->
9
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
10
+ <!-- Google Fonts Roboto -->
11
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
12
+ <!-- MDB -->
13
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
14
+
15
+ <style>
16
+ tr.hide-table-padding td {
17
+ padding: 0;
18
+ }
19
+
20
+ .expand-button {
21
+ position: relative;
22
+ }
23
+
24
+ .accordion-toggle .expand-button:after {
25
+ position: absolute;
26
+ left:.75rem;
27
+ top: 50%;
28
+ transform: translate(0, -50%);
29
+ content: '-';
30
+ }
31
+
32
+ .accordion-toggle.collapsed .expand-button:after {
33
+ content: '+';
34
+ }
35
+
36
+ blockquote.mycode {
37
+ border-left: 3px solid #ccc;
38
+ margin-left: 25px;
39
+ margin-top: 15px;
40
+ padding-left: 15px;
41
+ }
42
+
43
+ blockquote.mycode2 {
44
+ border-left: 3px solid #ccc;
45
+ margin-left: 25px;
46
+ padding-top: 10px;
47
+ padding-bottom: 10px;
48
+ padding-left: 15px;
49
+ }
50
+
51
+ tr th.headertop {
52
+ border-bottom: none;
53
+ padding-bottom: 0rem
54
+ }
55
+
56
+ tr th.headerbottom {
57
+ padding-top: 0rem
58
+ }
59
+
60
+ .table>:not(caption)>*>*{padding:0.75rem 0.75rem}
61
+
62
+ .copy-code-button {
63
+ border-radius: 0;
64
+ min-width: 55px;
65
+ background: none repeat scroll 0 0 transparent;
66
+ background-color: grey;
67
+ color: #F1F2F3 !important;
68
+ cursor: pointer;
69
+ border-style: none;
70
+ font-family: 'HELVETICA',sans-serif;
71
+ font-size: 0.8em;
72
+ font-weight: normal;
73
+ text-align: center;
74
+ text-decoration: none;
75
+ text-indent: 0;
76
+ text-transform: uppercase;
77
+ font-weight: 500;
78
+ line-height: 1.42rem;
79
+ margin: 0;
80
+ padding: 3px 8px;
81
+ position: absolute !important;
82
+ top: 0 !important;
83
+ right: 0 !important;
84
+ }
85
+
86
+ .copy-code-button > span {
87
+ color: #F1F2F3 !important;
88
+ }
89
+
90
+ .copy-code-button, ::before, ::after {
91
+ box-sizing: inherit;
92
+ }
93
+
94
+ .copy-code-button::before {
95
+ content: '';
96
+ display: inline-block;
97
+ width: 16px;
98
+ height: 16px;
99
+ margin-right: 3px;
100
+ background-size: contain;
101
+ background-image: url("");
102
+ background-repeat: no-repeat;
103
+ position: relative;
104
+ top: 3px;
105
+ }
106
+
107
+ .copy-code-button:focus {
108
+ /* Avoid an ugly focus outline on click in Chrome,
109
+ but darken the button for accessibility.
110
+ See https://stackoverflow.com/a/25298082/1481479 */
111
+ /* background-color: #E6E6E6; */
112
+ outline: 0;
113
+ }
114
+
115
+ pre[class*="prettyprint"] {
116
+ position: relative;
117
+ overflow: hidden;
118
+ }
119
+ </style>
120
+ </head>
121
+ <body>
122
+
123
+ <!-- Background image -->
124
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
125
+ <div class="mask" style="
126
+ background: linear-gradient(
127
+ 45deg,
128
+ rgba(29, 236, 197, 0.7),
129
+ rgba(91, 14, 214, 0.7) 100%
130
+ );
131
+ ">
132
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
133
+ <div class="text-white">
134
+ <h1 class="mb-3">$title</h1>
135
+ </div>
136
+ </div>
137
+ </div>
138
+ </div>
139
+ <!-- Background image -->
140
+
141
+ <div class="container my-4">
142
+
143
+ <p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets.
144
+ Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
145
+
146
+ <div class="table-responsive">
147
+ <table class="table">
148
+ <thead>
149
+ <tr>
150
+ <th class="headertop"></th>
151
+ <th class="headertop"></th>
152
+ <th class="headertop"></th>
153
+ <th class="headertop" colspan="4"><b>TREC 2019</b></th>
154
+ <th class="headertop" colspan="4"><b>TREC 2020</b></th>
155
+ <th class="headertop" colspan="3"><b>dev</b></th>
156
+ </tr>
157
+ <tr>
158
+ <th class="headerbottom" scope="col"></th>
159
+ <th class="headerbottom" scope="col"></th>
160
+ <th class="headerbottom" scope="col"></th>
161
+ <th class="headerbottom" scope="col"><br/>AP</th>
162
+ <th class="headerbottom" scope="col">nDCG@10</th>
163
+ <th class="headerbottom" scope="col">R@1K</th>
164
+ <th class="headerbottom" scope="col"></th>
165
+ <th class="headerbottom" scope="col"><br/>AP</th>
166
+ <th class="headerbottom" scope="col">nDCG@10</th>
167
+ <th class="headerbottom" scope="col">R@1K</th>
168
+ <th class="headerbottom" scope="col"></th>
169
+ <th class="headerbottom" scope="col">RR@10</th>
170
+ <th class="headerbottom" scope="col">R@1K</th>
171
+
172
+ </tr>
173
+ </thead>
174
+ <tbody>
175
+
176
+ $rows
177
+
178
+ </tbody>
179
+ </table>
180
+ </div>
181
+
182
+ <ul style="list-style-type:none; padding-top: 25px">
183
+
184
+ <li><p>[1] Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin.
185
+ <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
186
+ <i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
187
+
188
+ <p>&nbsp; &nbsp; &nbsp; &nbsp;Numbers in parentheses correspond to rows in Table 1 of the paper.</p></li>
189
+
190
+ <li><p>[2] Thibault Formal, Carlos Lassance, Benjamin Piwowarski, and Stéphane Clinchant.
191
+ <a href="https://arxiv.org/abs/2205.04733">From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective.</a>
192
+ <i>Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022), May 2022.</i></p></li>
193
+
194
+ <li><p>[3] Lee Xiong, Chenyan Xiong, Ye Li, Kwok-Fung Tang, Jialin Liu, Paul N. Bennett, Junaid Ahmed, and Arnold Overwijk.
195
+ <a href="https://openreview.net/forum?id=zeFrfgyZln">Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval.</a>
196
+ <i>Proceedings of the 9th International Conference on Learning Representations (ICLR 2021), May 2021.</i></p></li>
197
+
198
+ <li><p>[4] Sebastian Hofstätter, Sophia Althammer, Michael Schröder, Mete Sertkan, and Allan Hanbury.
199
+ <a href="https://arxiv.org/abs/2010.02666">Improving Efficient Neural Ranking Models with Cross-Architecture Knowledge Distillation.</a>
200
+ <i>arXiv:2010.02666</i>, October 2020.</p></li>
201
+
202
+ <li><p>[5] Sebastian Hofstätter, Sheng-Chieh Lin, Jheng-Hong Yang, Jimmy Lin, and Allan Hanbury.
203
+ <a href="https://dl.acm.org/doi/10.1145/3404835.3462891">Efficiently Teaching an Effective Dense Retriever with Balanced Topic Aware Sampling.</a>
204
+ <i>Proceedings of the 44th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2021)</i>, pages 113-122, July 2021.</p></li>
205
+
206
+ <li><p>[6] Sheng-Chieh Lin, Jheng-Hong Yang, and Jimmy Lin.
207
+ <a href="https://aclanthology.org/2021.repl4nlp-1.17/">In-Batch Negatives for Knowledge Distillation with Tightly-Coupled Teachers for Dense Retrieval.</a>
208
+ <i>Proceedings of the 6th Workshop on Representation Learning for NLP (RepL4NLP-2021)</i>, pages 163-173, August 2021.</p></li>
209
+
210
+ <li><p>[7] Minghan Li, Sheng-Chieh Lin, Xueguang Ma, and Jimmy Lin.
211
+ <a href="https://arxiv.org/abs/2302.06587">SLIM: Sparsified Late Interaction for Multi-Vector Retrieval with Inverted Indexes.</a>
212
+ <i>arXiv:2302.06587</i>, February 2023.</p></li>
213
+
214
+ <li><p>[8] Sheng-Chieh Lin, Minghan Li and Jimmy Lin.
215
+ <a href="https://arxiv.org/abs/2208.00511">Aggretriever: A Simple Approach to Aggregate Textual Representation for Robust Dense Passage Retrieval.</a>
216
+ <i>arXiv:2208.00511</i>, July 2022.</p></li>
217
+
218
+ </ul>
219
+
220
+ <div style="padding-top: 20px"></div>
221
+
222
+ <h4>Programmatic Execution</h4>
223
+
224
+ <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
225
+ To list all the experimental conditions:</p>
226
+
227
+ <blockquote class="mycode2"><tt>
228
+ python -m pyserini.2cr.msmarco --collection v1-passage --list-conditions
229
+ </tt></blockquote>
230
+
231
+ <p>These conditions correspond to the table rows above.</p>
232
+
233
+ <p>For all conditions, just show the commands in a "dry run":</p>
234
+
235
+ <blockquote class="mycode2"><tt>
236
+ python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands --dry-run
237
+ </tt></blockquote>
238
+
239
+ <p>To actually run all the experimental conditions:</p>
240
+
241
+ <blockquote class="mycode2"><tt>
242
+ python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands
243
+ </tt></blockquote>
244
+
245
+ <p>With the above command, run files will be placed in the current directory.
246
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
247
+
248
+ <p>To show the commands for a specific condition:</p>
249
+
250
+ <blockquote class="mycode2"><tt>
251
+ python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands --dry-run
252
+ </tt></blockquote>
253
+
254
+ <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
255
+
256
+ <p>To actually run a specific condition:</p>
257
+
258
+ <blockquote class="mycode2"><tt>
259
+ python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands
260
+ </tt></blockquote>
261
+
262
+ <p>Again, with the above command, run files will be placed in the current directory.
263
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
264
+
265
+ <p>Finally, to generate this page:</p>
266
+
267
+ <blockquote class="mycode2"><tt>
268
+ python -m pyserini.2cr.msmarco --collection v1-passage --generate-report --output msmarco-v1-passage.html
269
+ </tt></blockquote>
270
+
271
+ <p>The output file <tt>msmarco-v1-passage.html</tt> should be identical to this page.</p>
272
+
273
+ <div style="padding-top: 50px"></div>
274
+
275
+ </div>
276
+
277
+
278
+
279
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
280
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
281
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
282
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
283
+
284
+ <script>
285
+ document.querySelectorAll('pre').forEach(function (codeBlock) {
286
+ var button = document.createElement('button');
287
+ button.className = 'copy-code-button';
288
+ button.type = 'button';
289
+ var s = codeBlock.innerText;
290
+ button.setAttribute('data-clipboard-text',s);
291
+ button.innerText = 'Copy';
292
+
293
+ // var pre = codeBlock.parentNode;
294
+ codeBlock.classList.add('prettyprint');
295
+ // pre.parentNode.insertBefore(button, pre);
296
+ codeBlock.appendChild(button);
297
+ });
298
+
299
+ var clipboard = new ClipboardJS('.copy-code-button');
300
+
301
+ clipboard.on('success', function(e) {
302
+ console.info('Action:', e.action);
303
+ console.info('Text:', e.text);
304
+ console.info('Trigger:', e.trigger);
305
+ e.trigger.textContent = 'Copied';
306
+ window.setTimeout(function() {
307
+ e.trigger.textContent = 'Copy';
308
+ }, 2000);
309
+ e.clearSelection();
310
+ });
311
+
312
+ clipboard.on('error', function(e) {
313
+ console.error('Action:', e.action);
314
+ console.error('Trigger:', e.trigger);
315
+ e.trigger.textContent = 'Error Copying';
316
+ window.setTimeout(function() {
317
+ e.trigger.textContent = 'Copy';
318
+ }, 2000);
319
+ e.clearSelection();
320
+ });
321
+
322
+ </script>
323
+
324
+ </body>
325
+ </html>
pyserini/2cr/msmarco_html_v2_doc.template ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
6
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
7
+ <title>Pyserini Reproductions: MS MARCO V2 Document</title>
8
+ <!-- Font Awesome -->
9
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
10
+ <!-- Google Fonts Roboto -->
11
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
12
+ <!-- MDB -->
13
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
14
+
15
+ <style>
16
+ tr.hide-table-padding td {
17
+ padding: 0;
18
+ }
19
+
20
+ .expand-button {
21
+ position: relative;
22
+ }
23
+
24
+ .accordion-toggle .expand-button:after {
25
+ position: absolute;
26
+ left:.75rem;
27
+ top: 50%;
28
+ transform: translate(0, -50%);
29
+ content: '-';
30
+ }
31
+
32
+ .accordion-toggle.collapsed .expand-button:after {
33
+ content: '+';
34
+ }
35
+
36
+ blockquote.mycode {
37
+ border-left: 3px solid #ccc;
38
+ margin-left: 25px;
39
+ margin-top: 15px;
40
+ padding-left: 15px;
41
+ }
42
+
43
+ blockquote.mycode2 {
44
+ border-left: 3px solid #ccc;
45
+ margin-left: 25px;
46
+ padding-top: 10px;
47
+ padding-bottom: 10px;
48
+ padding-left: 15px;
49
+ }
50
+
51
+ tr th.headertop {
52
+ border-bottom: none;
53
+ padding-bottom: 0rem
54
+ }
55
+
56
+ tr th.headerbottom {
57
+ padding-top: 0rem
58
+ }
59
+
60
+ .table>:not(caption)>*>*{padding:0.75rem 0.75rem}
61
+
62
+ .copy-code-button {
63
+ border-radius: 0;
64
+ min-width: 55px;
65
+ background: none repeat scroll 0 0 transparent;
66
+ background-color: grey;
67
+ color: #F1F2F3 !important;
68
+ cursor: pointer;
69
+ border-style: none;
70
+ font-family: 'HELVETICA',sans-serif;
71
+ font-size: 0.8em;
72
+ font-weight: normal;
73
+ text-align: center;
74
+ text-decoration: none;
75
+ text-indent: 0;
76
+ text-transform: uppercase;
77
+ font-weight: 500;
78
+ line-height: 1.42rem;
79
+ margin: 0;
80
+ padding: 3px 8px;
81
+ position: absolute !important;
82
+ top: 0 !important;
83
+ right: 0 !important;
84
+ }
85
+
86
+ .copy-code-button > span {
87
+ color: #F1F2F3 !important;
88
+ }
89
+
90
+ .copy-code-button, ::before, ::after {
91
+ box-sizing: inherit;
92
+ }
93
+
94
+ .copy-code-button::before {
95
+ content: '';
96
+ display: inline-block;
97
+ width: 16px;
98
+ height: 16px;
99
+ margin-right: 3px;
100
+ background-size: contain;
101
+ background-image: url("");
102
+ background-repeat: no-repeat;
103
+ position: relative;
104
+ top: 3px;
105
+ }
106
+
107
+ .copy-code-button:focus {
108
+ /* Avoid an ugly focus outline on click in Chrome,
109
+ but darken the button for accessibility.
110
+ See https://stackoverflow.com/a/25298082/1481479 */
111
+ /* background-color: #E6E6E6; */
112
+ outline: 0;
113
+ }
114
+
115
+ pre[class*="prettyprint"] {
116
+ position: relative;
117
+ overflow: hidden;
118
+ }
119
+ </style>
120
+ </head>
121
+ <body>
122
+
123
+ <!-- Background image -->
124
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
125
+ <div class="mask" style="
126
+ background: linear-gradient(
127
+ 45deg,
128
+ rgba(29, 236, 197, 0.7),
129
+ rgba(91, 14, 214, 0.7) 100%
130
+ );
131
+ ">
132
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
133
+ <div class="text-white">
134
+ <h1 class="mb-3">$title</h1>
135
+ </div>
136
+ </div>
137
+ </div>
138
+ </div>
139
+ <!-- Background image -->
140
+
141
+ <div class="container my-4">
142
+
143
+ <p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in the following paper.
144
+ Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.</p>
145
+
146
+ <p class="note note-light">Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
147
+ <i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
148
+
149
+ <p>Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
150
+
151
+ <div class="table-responsive">
152
+ <table class="table">
153
+ <thead>
154
+ <tr>
155
+ <th class="headertop"></th>
156
+ <th class="headertop"></th>
157
+ <th class="headertop"></th>
158
+ <th class="headertop" colspan="6"><b>TREC 2021</b></th>
159
+ <th class="headertop" colspan="3"><b>dev</b></th>
160
+ <th class="headertop" colspan="3"><b>dev2</b></th>
161
+ </tr>
162
+ <tr>
163
+ <th class="headerbottom" scope="col"></th>
164
+ <th class="headerbottom" scope="col"></th>
165
+ <th class="headerbottom" scope="col"></th>
166
+ <th class="headerbottom" scope="col"><br/>AP</th>
167
+ <th class="headerbottom" scope="col">nDCG@10</th>
168
+ <th class="headerbottom" scope="col">RR@100</th>
169
+ <th class="headerbottom" scope="col">R@100</th>
170
+ <th class="headerbottom" scope="col">R@1K</th>
171
+ <th class="headerbottom" scope="col"></th>
172
+ <th class="headerbottom" scope="col">RR@100</th>
173
+ <th class="headerbottom" scope="col">R@1K</th>
174
+ <th class="headerbottom" scope="col"></th>
175
+ <th class="headerbottom" scope="col">RR@100</th>
176
+ <th class="headerbottom" scope="col">R@1K</th>
177
+ </tr>
178
+ </thead>
179
+ <tbody>
180
+
181
+ $rows
182
+
183
+ </tbody>
184
+ </table>
185
+ </div>
186
+
187
+ <div style="padding-top: 20px"></div>
188
+
189
+ <h4>Programmatic Execution</h4>
190
+
191
+ <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
192
+ To list all the experimental conditions:</p>
193
+
194
+ <blockquote class="mycode2"><tt>
195
+ python -m pyserini.2cr.msmarco --collection v2-doc --list-conditions
196
+ </tt></blockquote>
197
+
198
+ <p>These conditions correspond to the table rows above.</p>
199
+
200
+ <p>For all conditions, just show the commands in a "dry run":</p>
201
+
202
+ <blockquote class="mycode2"><tt>
203
+ python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands --dry-run
204
+ </tt></blockquote>
205
+
206
+ <p>To actually run all the experimental conditions:</p>
207
+
208
+ <blockquote class="mycode2"><tt>
209
+ python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands
210
+ </tt></blockquote>
211
+
212
+ <p>With the above command, run files will be placed in the current directory.
213
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
214
+
215
+ <p>To show the commands for a specific condition:</p>
216
+
217
+ <blockquote class="mycode2"><tt>
218
+ python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands --dry-run
219
+ </tt></blockquote>
220
+
221
+ <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
222
+
223
+ <p>To actually run a specific condition:</p>
224
+
225
+ <blockquote class="mycode2"><tt>
226
+ python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands
227
+ </tt></blockquote>
228
+
229
+ <p>Again, with the above command, run files will be placed in the current directory.
230
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
231
+
232
+ <p>Finally, to generate this page:</p>
233
+
234
+ <blockquote class="mycode2"><tt>
235
+ python -m pyserini.2cr.msmarco --collection v2-doc --generate-report --output msmarco-v2-doc.html
236
+ </tt></blockquote>
237
+
238
+ <p>The output file <tt>msmarco-v2-doc.html</tt> should be identical to this page.</p>
239
+
240
+ <div style="padding-top: 50px"></div>
241
+
242
+ </div>
243
+
244
+
245
+
246
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
247
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
248
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
249
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
250
+
251
+ <script>
252
// Give every <pre> block a "Copy" button. The button's data-clipboard-text
// attribute holds the block's text, which ClipboardJS copies on click.
document.querySelectorAll('pre').forEach(function (preBlock) {
  var copyButton = document.createElement('button');
  copyButton.type = 'button';
  copyButton.className = 'copy-code-button';
  // Read the text before the button is appended, so the button's own label
  // is never part of the copied text.
  copyButton.setAttribute('data-clipboard-text', preBlock.innerText);
  copyButton.innerText = 'Copy';

  // 'prettyprint' matches the pre[class*="prettyprint"] rule in the <style>
  // section of this template.
  preBlock.classList.add('prettyprint');
  preBlock.appendChild(copyButton);
});

var clipboard = new ClipboardJS('.copy-code-button');

// Success: log the event, show "Copied" for two seconds, then restore "Copy".
clipboard.on('success', function (event) {
  var trigger = event.trigger;
  console.info('Action:', event.action);
  console.info('Text:', event.text);
  console.info('Trigger:', trigger);
  trigger.textContent = 'Copied';
  window.setTimeout(function () {
    trigger.textContent = 'Copy';
  }, 2000);
  event.clearSelection();
});

// Failure: log the event, show "Error Copying" for two seconds, then restore "Copy".
clipboard.on('error', function (event) {
  var trigger = event.trigger;
  console.error('Action:', event.action);
  console.error('Trigger:', trigger);
  trigger.textContent = 'Error Copying';
  window.setTimeout(function () {
    trigger.textContent = 'Copy';
  }, 2000);
  event.clearSelection();
});
288
+
289
+ </script>
290
+
291
+ </body>
292
+ </html>
pyserini/2cr/msmarco_html_v2_passage.template ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
6
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
7
+ <title>Pyserini Reproductions: MS MARCO V2 Passage</title>
8
+ <!-- Font Awesome -->
9
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
10
+ <!-- Google Fonts Roboto -->
11
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
12
+ <!-- MDB -->
13
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
14
+
15
+ <style>
16
+ tr.hide-table-padding td {
17
+ padding: 0;
18
+ }
19
+
20
+ .expand-button {
21
+ position: relative;
22
+ }
23
+
24
+ .accordion-toggle .expand-button:after {
25
+ position: absolute;
26
+ left:.75rem;
27
+ top: 50%;
28
+ transform: translate(0, -50%);
29
+ content: '-';
30
+ }
31
+
32
+ .accordion-toggle.collapsed .expand-button:after {
33
+ content: '+';
34
+ }
35
+
36
+ blockquote.mycode {
37
+ border-left: 3px solid #ccc;
38
+ margin-left: 25px;
39
+ margin-top: 15px;
40
+ padding-left: 15px;
41
+ }
42
+
43
+ blockquote.mycode2 {
44
+ border-left: 3px solid #ccc;
45
+ margin-left: 25px;
46
+ padding-top: 10px;
47
+ padding-bottom: 10px;
48
+ padding-left: 15px;
49
+ }
50
+
51
+ tr th.headertop {
52
+ border-bottom: none;
53
+ padding-bottom: 0rem
54
+ }
55
+
56
+ tr th.headerbottom {
57
+ padding-top: 0rem
58
+ }
59
+
60
+ .table>:not(caption)>*>*{padding:0.75rem 0.75rem}
61
+
62
+ .copy-code-button {
63
+ border-radius: 0;
64
+ min-width: 55px;
65
+ background: none repeat scroll 0 0 transparent;
66
+ background-color: grey;
67
+ color: #F1F2F3 !important;
68
+ cursor: pointer;
69
+ border-style: none;
70
+ font-family: 'HELVETICA',sans-serif;
71
+ font-size: 0.8em;
72
+ font-weight: normal;
73
+ text-align: center;
74
+ text-decoration: none;
75
+ text-indent: 0;
76
+ text-transform: uppercase;
77
+ font-weight: 500;
78
+ line-height: 1.42rem;
79
+ margin: 0;
80
+ padding: 3px 8px;
81
+ position: absolute !important;
82
+ top: 0 !important;
83
+ right: 0 !important;
84
+ }
85
+
86
+ .copy-code-button > span {
87
+ color: #F1F2F3 !important;
88
+ }
89
+
90
+ .copy-code-button, ::before, ::after {
91
+ box-sizing: inherit;
92
+ }
93
+
94
+ .copy-code-button::before {
95
+ content: '';
96
+ display: inline-block;
97
+ width: 16px;
98
+ height: 16px;
99
+ margin-right: 3px;
100
+ background-size: contain;
101
+ background-image: url("");
102
+ background-repeat: no-repeat;
103
+ position: relative;
104
+ top: 3px;
105
+ }
106
+
107
+ .copy-code-button:focus {
108
+ /* Avoid an ugly focus outline on click in Chrome,
109
+ but darken the button for accessibility.
110
+ See https://stackoverflow.com/a/25298082/1481479 */
111
+ /* background-color: #E6E6E6; */
112
+ outline: 0;
113
+ }
114
+
115
+ pre[class*="prettyprint"] {
116
+ position: relative;
117
+ overflow: hidden;
118
+ }
119
+ </style>
120
+ </head>
121
+ <body>
122
+
123
+ <!-- Background image -->
124
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
125
+ <div class="mask" style="
126
+ background: linear-gradient(
127
+ 45deg,
128
+ rgba(29, 236, 197, 0.7),
129
+ rgba(91, 14, 214, 0.7) 100%
130
+ );
131
+ ">
132
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
133
+ <div class="text-white">
134
+ <h1 class="mb-3">$title</h1>
135
+ </div>
136
+ </div>
137
+ </div>
138
+ </div>
139
+ <!-- Background image -->
140
+
141
+ <div class="container my-4">
142
+
143
+ <p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in the following paper.
144
+ Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.</p>
145
+
146
+ <p class="note note-light">Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
147
+ <i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
148
+
149
+ <p>Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
150
+
151
+ <div class="table-responsive">
152
+ <table class="table">
153
+ <thead>
154
+ <tr>
155
+ <th class="headertop"></th>
156
+ <th class="headertop"></th>
157
+ <th class="headertop"></th>
158
+ <th class="headertop" colspan="6"><b>TREC 2021</b></th>
159
+ <th class="headertop" colspan="3"><b>dev</b></th>
160
+ <th class="headertop" colspan="3"><b>dev2</b></th>
161
+ </tr>
162
+ <tr>
163
+ <th class="headerbottom" scope="col"></th>
164
+ <th class="headerbottom" scope="col"></th>
165
+ <th class="headerbottom" scope="col"></th>
166
+ <th class="headerbottom" scope="col"><br/>AP</th>
167
+ <th class="headerbottom" scope="col">nDCG@10</th>
168
+ <th class="headerbottom" scope="col">RR@100</th>
169
+ <th class="headerbottom" scope="col">R@100</th>
170
+ <th class="headerbottom" scope="col">R@1K</th>
171
+ <th class="headerbottom" scope="col"></th>
172
+ <th class="headerbottom" scope="col">RR@100</th>
173
+ <th class="headerbottom" scope="col">R@1K</th>
174
+ <th class="headerbottom" scope="col"></th>
175
+ <th class="headerbottom" scope="col">RR@100</th>
176
+ <th class="headerbottom" scope="col">R@1K</th>
177
+ </tr>
178
+ </thead>
179
+ <tbody>
180
+
181
+ $rows
182
+
183
+ </tbody>
184
+ </table>
185
+ </div>
186
+
187
+ <div style="padding-top: 20px"></div>
188
+
189
+ <h4>Programmatic Execution</h4>
190
+
191
+ <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
192
+ To list all the experimental conditions:</p>
193
+
194
+ <blockquote class="mycode2"><tt>
195
+ python -m pyserini.2cr.msmarco --collection v2-passage --list-conditions
196
+ </tt></blockquote>
197
+
198
+ <p>These conditions correspond to the table rows above.</p>
199
+
200
+ <p>For all conditions, just show the commands in a "dry run":</p>
201
+
202
+ <blockquote class="mycode2"><tt>
203
+ python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands --dry-run
204
+ </tt></blockquote>
205
+
206
+ <p>To actually run all the experimental conditions:</p>
207
+
208
+ <blockquote class="mycode2"><tt>
209
+ python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands
210
+ </tt></blockquote>
211
+
212
+ <p>With the above command, run files will be placed in the current directory.
213
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
214
+
215
+ <p>To show the commands for a specific condition:</p>
216
+
217
+ <blockquote class="mycode2"><tt>
218
+ python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands --dry-run
219
+ </tt></blockquote>
220
+
221
+ <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
222
+
223
+ <p>To actually run a specific condition:</p>
224
+
225
+ <blockquote class="mycode2"><tt>
226
+ python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands
227
+ </tt></blockquote>
228
+
229
+ <p>Again, with the above command, run files will be placed in the current directory.
230
+ Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
231
+
232
+ <p>Finally, to generate this page:</p>
233
+
234
+ <blockquote class="mycode2"><tt>
235
+ python -m pyserini.2cr.msmarco --collection v2-passage --generate-report --output msmarco-v2-passage.html
236
+ </tt></blockquote>
237
+
238
+ <p>The output file <tt>msmarco-v2-passage.html</tt> should be identical to this page.</p>
239
+
240
+ <div style="padding-top: 50px"></div>
241
+
242
+ </div>
243
+
244
+
245
+
246
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
247
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
248
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
249
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
250
+
251
+ <script>
252
// Give every <pre> block a "Copy" button. The button's data-clipboard-text
// attribute holds the block's text, which ClipboardJS copies on click.
document.querySelectorAll('pre').forEach(function (preBlock) {
  var copyButton = document.createElement('button');
  copyButton.type = 'button';
  copyButton.className = 'copy-code-button';
  // Read the text before the button is appended, so the button's own label
  // is never part of the copied text.
  copyButton.setAttribute('data-clipboard-text', preBlock.innerText);
  copyButton.innerText = 'Copy';

  // 'prettyprint' matches the pre[class*="prettyprint"] rule in the <style>
  // section of this template.
  preBlock.classList.add('prettyprint');
  preBlock.appendChild(copyButton);
});

var clipboard = new ClipboardJS('.copy-code-button');

// Success: log the event, show "Copied" for two seconds, then restore "Copy".
clipboard.on('success', function (event) {
  var trigger = event.trigger;
  console.info('Action:', event.action);
  console.info('Text:', event.text);
  console.info('Trigger:', trigger);
  trigger.textContent = 'Copied';
  window.setTimeout(function () {
    trigger.textContent = 'Copy';
  }, 2000);
  event.clearSelection();
});

// Failure: log the event, show "Error Copying" for two seconds, then restore "Copy".
clipboard.on('error', function (event) {
  var trigger = event.trigger;
  console.error('Action:', event.action);
  console.error('Trigger:', trigger);
  trigger.textContent = 'Error Copying';
  window.setTimeout(function () {
    trigger.textContent = 'Copy';
  }, 2000);
  event.clearSelection();
});
288
+
289
+ </script>
290
+
291
+ </body>
292
+ </html>
pyserini/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
pyserini/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (165 Bytes). View file
 
pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc ADDED
Binary file (971 Bytes). View file
 
pyserini/__pycache__/encoded_query_info.cpython-310.pyc ADDED
Binary file (15.2 kB). View file
 
pyserini/__pycache__/evaluate_script_info.cpython-310.pyc ADDED
Binary file (749 Bytes). View file
 
pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc ADDED
Binary file (179 kB). View file
 
pyserini/__pycache__/pyclass.cpython-310.pyc ADDED
Binary file (736 Bytes). View file
 
pyserini/__pycache__/setup.cpython-310.pyc ADDED
Binary file (780 Bytes). View file
 
pyserini/__pycache__/util.cpython-310.pyc ADDED
Binary file (8.03 kB). View file
 
pyserini/analysis/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from ._base import get_lucene_analyzer, Analyzer, JAnalyzer, JAnalyzerUtils, JDefaultEnglishAnalyzer, JWhiteSpaceAnalyzer
18
+
19
+ __all__ = ['get_lucene_analyzer', 'Analyzer', 'JAnalyzer', 'JAnalyzerUtils', 'JDefaultEnglishAnalyzer', 'JWhiteSpaceAnalyzer']
pyserini/analysis/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (361 Bytes). View file
 
pyserini/analysis/__pycache__/_base.cpython-310.pyc ADDED
Binary file (5 kB). View file
 
pyserini/analysis/_base.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from typing import List
18
+
19
+ from ..pyclass import autoclass
20
+
21
# Wrappers around Lucene classes.
# Each name is bound once, when this module is imported, to whatever
# ``autoclass`` returns for the given fully-qualified Java class name.
JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
JDanishAnalyzer = autoclass('org.apache.lucene.analysis.da.DanishAnalyzer')
JDutchAnalyzer = autoclass('org.apache.lucene.analysis.nl.DutchAnalyzer')
JFinnishAnalyzer = autoclass('org.apache.lucene.analysis.fi.FinnishAnalyzer')
JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
JHungarianAnalyzer = autoclass('org.apache.lucene.analysis.hu.HungarianAnalyzer')
JIndonesianAnalyzer = autoclass('org.apache.lucene.analysis.id.IndonesianAnalyzer')
JItalianAnalyzer = autoclass('org.apache.lucene.analysis.it.ItalianAnalyzer')
JJapaneseAnalyzer = autoclass('org.apache.lucene.analysis.ja.JapaneseAnalyzer')
JNorwegianAnalyzer = autoclass('org.apache.lucene.analysis.no.NorwegianAnalyzer')
JPortugueseAnalyzer = autoclass('org.apache.lucene.analysis.pt.PortugueseAnalyzer')
JRussianAnalyzer = autoclass('org.apache.lucene.analysis.ru.RussianAnalyzer')
JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
JSwedishAnalyzer = autoclass('org.apache.lucene.analysis.sv.SwedishAnalyzer')
JTeluguAnalyzer = autoclass('org.apache.lucene.analysis.te.TeluguAnalyzer')
JThaiAnalyzer = autoclass('org.apache.lucene.analysis.th.ThaiAnalyzer')
JTurkishAnalyzer = autoclass('org.apache.lucene.analysis.tr.TurkishAnalyzer')
JWhiteSpaceAnalyzer = autoclass('org.apache.lucene.analysis.core.WhitespaceAnalyzer')
JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')

# Wrappers around Anserini classes.
# JDefaultEnglishAnalyzer lives in io.anserini.analysis, so it is bound here
# only (it used to be bound a second time among the Lucene wrappers above).
JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')
JHuggingFaceTokenizerAnalyzer = autoclass('io.anserini.analysis.HuggingFaceTokenizerAnalyzer')
53
+
54
+
55
def get_lucene_analyzer(language: str='en', stemming: bool=True, stemmer: str='porter', stopwords: bool=True, huggingFaceTokenizer: str=None) -> JAnalyzer:
    """Create a Lucene ``Analyzer`` with specific settings.

    Parameters
    ----------
    language : str
        Name of analyzer, matched case-insensitively. Accepted values are the
        language codes ``ar``, ``bn``, ``zh``, ``ko``, ``da``, ``nl``, ``fi``,
        ``fr``, ``de``, ``hi``, ``hu``, ``id``, ``it``, ``ja``, ``no``, ``pt``,
        ``ru``, ``es``, ``sv``, ``te``, ``th``, ``tr`` and ``en``, plus the
        special names ``tweet`` and ``hgf_tokenizer``.
    stemming : bool
        Set to stem. Only consulted when ``language`` is ``en``.
    stemmer : str
        Stemmer to use. Only consulted when ``language`` is ``en`` and
        ``stemming`` is true.
    stopwords : bool
        Set to filter stopwords. Only consulted when ``language`` is ``en``;
        when false, an empty stopword set is passed to the English analyzer.
    huggingFaceTokenizer: str
        a huggingface model id or path to a tokenizer.json file. Only
        consulted when ``language`` is ``hgf_tokenizer``.

    Returns
    -------
    JAnalyzer
        Java ``Analyzer`` with specified settings.

    Raises
    ------
    ValueError
        If ``language`` is not one of the accepted values.
    """
    lang = language.lower()

    # Analyzers that are always constructed with their no-argument constructor.
    # 'zh' and 'ko' share the CJK analyzer. 'sv' is included so that the
    # module-level JSwedishAnalyzer wrapper is actually reachable.
    default_constructed = {
        'ar': JArabicAnalyzer,
        'bn': JBengaliAnalyzer,
        'zh': JCJKAnalyzer,
        'ko': JCJKAnalyzer,
        'da': JDanishAnalyzer,
        'nl': JDutchAnalyzer,
        'fi': JFinnishAnalyzer,
        'fr': JFrenchAnalyzer,
        'de': JGermanAnalyzer,
        'hi': JHindiAnalyzer,
        'hu': JHungarianAnalyzer,
        'id': JIndonesianAnalyzer,
        'it': JItalianAnalyzer,
        'ja': JJapaneseAnalyzer,
        'no': JNorwegianAnalyzer,
        'pt': JPortugueseAnalyzer,
        'ru': JRussianAnalyzer,
        'es': JSpanishAnalyzer,
        'sv': JSwedishAnalyzer,
        'te': JTeluguAnalyzer,
        'th': JThaiAnalyzer,
        'tr': JTurkishAnalyzer,
        'tweet': JTweetAnalyzer,
    }
    if lang in default_constructed:
        return default_constructed[lang]()

    if lang == 'hgf_tokenizer':
        return JHuggingFaceTokenizerAnalyzer(huggingFaceTokenizer)

    if lang == 'en':
        # English is the only analyzer configured by the stemming/stopwords
        # flags; disabling stopwords is expressed as an empty stopword set.
        if stemming:
            if stopwords:
                return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer)
            return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer, JCharArraySet.EMPTY_SET)
        if stopwords:
            return JDefaultEnglishAnalyzer.newNonStemmingInstance()
        return JDefaultEnglishAnalyzer.newNonStemmingInstance(JCharArraySet.EMPTY_SET)

    raise ValueError('Invalid configuration.')
133
+
134
+
135
class Analyzer:
    """Thin Python facade over a Lucene ``Analyzer``.

    Parameters
    ----------
    analyzer : JAnalyzer
        Lucene ``Analyzer`` that performs the actual analysis.

    Raises
    ------
    TypeError
        If ``analyzer`` is not an instance of ``JAnalyzer``.
    """

    def __init__(self, analyzer):
        # Reject anything that is not a Lucene Analyzer up front.
        if not isinstance(analyzer, JAnalyzer):
            raise TypeError('Invalid JAnalyzer!')
        self.analyzer = analyzer

    def analyze(self, text: str) -> List[str]:
        """Run the wrapped analyzer over a piece of text.

        Parameters
        ----------
        text : str
            Text to analyze.

        Returns
        -------
        List[str]
            Tokens produced by the analyzer, in the order returned by
            ``JAnalyzerUtils.analyze``.
        """
        analyzed = JAnalyzerUtils.analyze(self.analyzer, text)
        # Copy the elements of the Java result into a plain Python list.
        return list(analyzed.toArray())
pyserini/collection/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from ._base import Collection, FileSegment, SourceDocument
18
+ from ._collection_support import Cord19Article
19
+
20
+ __all__ = ['Collection', 'FileSegment', 'SourceDocument', 'Cord19Article']
pyserini/collection/_base.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import logging
18
+ import re
19
+ from enum import Enum
20
+
21
+ from ..multithreading import Counters
22
+ from ..pyclass import autoclass, cast, JPaths
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ JFileSegment = autoclass('io.anserini.collection.FileSegment')
28
+ JSourceDocument = autoclass('io.anserini.collection.SourceDocument')
29
+
30
+
31
class JCollections(Enum):
    """Anserini collection classes that can be instantiated by name.

    Each member's value is the handle returned by ``autoclass`` for the
    corresponding ``io.anserini.collection`` class.
    """

    # Member names double as the lookup keys used by ``Collection``.
    AclAnthology = autoclass('io.anserini.collection.AclAnthology')
    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')
45
+
46
+
47
class Collection:
    """
    Iterable wrapper class for Anserini's DocumentCollection.

    Iterating over a ``Collection`` yields one ``FileSegment`` wrapper per
    segment returned by the underlying collection's iterator.

    Parameters
    ----------
    collection_class : str
        Name of collection class to instantiate
    collection_path : str
        Path to directory containing collection
    """

    def __init__(self, collection_class, collection_path):
        self.counters = Counters()
        self.collection_class = collection_class
        self.collection_path = JPaths.get(collection_path)
        self.object = self._get_collection()
        self.collection_iterator = self.object.iterator()

    def _get_collection(self):
        """Instantiate the collection class named by ``self.collection_class``.

        Returns
        -------
        object
            Instance of the matching ``JCollections`` member, constructed with
            ``self.collection_path``.

        Raises
        ------
        ValueError
            If the name lookup or the construction fails. The exception
            carries the requested class name, and the underlying error is
            attached as ``__cause__``.
        """
        try:
            return JCollections[self.collection_class].value(self.collection_path)
        except Exception as err:
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit are not converted into ValueError.
            raise ValueError(self.collection_class) from err

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next segment wrapped in a ``FileSegment``."""
        if self.collection_iterator.hasNext():
            fs = self.collection_iterator.next()
            return FileSegment(self, fs, fs.getSegmentPath())
        else:
            raise StopIteration
81
+
82
+
83
class FileSegment:
    """
    Iterable wrapper class for Anserini's FileSegment.

    Iterating yields ``SourceDocument`` wrappers. When the segment is
    exhausted, error and skip statistics are recorded on the parent
    collection's counters and the segment is closed.

    Parameters
    ----------
    collection : Collection
        Parent collection of the file segment
    segment : JFileSegment
        FileSegment object to create wrapper from
    segment_path : str
        Path to file backing the file segment
    """

    def __init__(self, collection, segment, segment_path):
        self.collection = collection
        try:
            # Prefer the collection-specific nested ``Segment`` class.
            self.object = cast(collection.object.getClass().getName() +
                               '$Segment', segment)
        except Exception:
            # ``except Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit propagate instead of being logged
            # and swallowed. Fall back to the generic FileSegment type.
            logger.exception('Exception from casting FileSegment type...')
            self.object = cast('io.anserini.collection.FileSegment', segment)

        self.segment_iterator = self.object.iterator()
        self.segment_path = segment_path
        # Segment path relative to the collection path, with path separators
        # (both "\" and "/") replaced by "-".
        self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString())

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next document, or finalize the segment and stop.

        Raises
        ------
        StopIteration
            When the segment has no further documents. Before raising, the
            error and skip counters of the parent collection are updated and
            the segment is closed.
        """
        if self.object.iterator().hasNext():
            d = self.object.iterator().next()
            return SourceDocument(self, d)
        else:
            # log if iteration stopped by error
            if self.object.getErrorStatus():
                logger.error(self.segment_name + ': Error from segment iteration, stopping...')
                self.collection.counters.errors.increment()

            # stop iteration and log skipped documents
            skipped = self.object.getSkippedCount()
            if skipped > 0:
                self.collection.counters.skips.increment(skipped)
                logger.warning(self.segment_name + ': ' + str(skipped) + ' documents skipped')
            self.object.close()
            raise StopIteration
130
+
131
+
132
class SourceDocument:
    """
    Python-side view of an Anserini SourceDocument.

    The document's id, indexable flag, contents and raw form are read once
    at construction time and exposed as plain attributes.

    Parameters
    ----------

    segment : FileSegment
        Parent segment of the source document
    document : io.anserini.collection.SourceDocument
        SourceDocument object to create wrapper from
    """

    def __init__(self, segment, document):
        if isinstance(document, JSourceDocument):
            self.segment, self.object = segment, document
        else:
            raise TypeError('Invalid JSourceDocument!')

        wrapped = self.object
        self.id = wrapped.id()
        self.indexable = wrapped.indexable()
        self.contents = wrapped.contents()
        self.raw = wrapped.raw()
pyserini/collection/_collection_support.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ # Implementations of support for specific collections.
18
+
19
+ import json
20
+
21
+
22
class Cord19Article:
    """Wrapper class for a raw JSON article from AI2's COVID-19 Open Research Dataset (CORD-19).

    Parameters
    ----------
    doc : str
        A JSON string of a CORD-19 article.

    Raises
    ------
    TypeError
        If the parsed JSON has neither a ``cord_uid`` nor a ``paper_id`` field.
    """

    def __init__(self, doc):
        self.json = json.loads(doc)
        # Performs some basic error checking, throws an exception if user tries to instantiate with something
        # that isn't from CORD-19.
        if 'cord_uid' in self.json:
            self.full_text = False
        elif 'paper_id' in self.json:
            self.full_text = True
        else:
            raise TypeError('Not a CORD-19 article: expected a "cord_uid" or "paper_id" field.')

    def is_full_text(self):
        """Return the article's ``has_full_text`` field."""
        return self.json['has_full_text']

    def cord_uid(self):
        """Return the article's ``cord_uid`` field."""
        return self.json['cord_uid']

    def bib_entries(self):
        """Return the article's ``bib_entries`` field."""
        return self.json['bib_entries']

    def title(self):
        """Return the title, or ``''`` when a required field is missing.

        Full-text articles read the title from ``metadata``; all others read
        it from ``csv_metadata``.
        """
        try:
            if self.is_full_text():
                return self.json['metadata']['title']
            else:
                return self.json['csv_metadata']['title']
        except KeyError:
            return ''

    def abstract(self):
        """Return the abstract from ``csv_metadata``, or ``''`` if absent."""
        try:
            # For a full-text article, we can grab the abstract from two independent sources, the metadata or the
            # actual full text. Here, we make the decision to use the metadata, even for full text.
            return self.json['csv_metadata']['abstract']
        except KeyError:
            return ''

    def metadata(self):
        """Return the article's ``csv_metadata`` field."""
        return self.json['csv_metadata']

    def body(self):
        """Return the body paragraphs as a list of strings.

        Returns
        -------
        list
            The ``text`` of every ``body_text`` entry for full-text articles.
            An empty list is returned for articles without full text and when
            a required field is missing, so the return type is always a list.
        """
        try:
            if self.is_full_text():
                return [entry['text'] for entry in self.json['body_text']]
            else:
                return []
        except KeyError:
            # Same empty value as the "no full text" branch; previously this
            # path returned '' and made the return type inconsistent.
            return []
pyserini/demo/acl.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """
18
+ This script provides an interactive web interface demo for retrieval on the ACL dataset.
19
+ It requires `flask` (`pip install flask~=2.2.0`).
20
+ An example command looks like `python -m pyserini.demo.acl` that starts up a server on port 8080.
21
+ The demo can be accessed via "http://localhost:8080" in a web browser.
22
+ Additional arguments include:
23
+ --port [PORT] --hits [Number of hits]
24
+ --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda]
25
+ """
26
+ import json
27
+ import logging
28
+ from argparse import ArgumentParser
29
+ from functools import partial
30
+ from typing import Callable, Optional, Tuple, Union
31
+
32
+ from flask import Flask, render_template, request, flash, jsonify
33
+ from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
34
+
35
+ logging.basicConfig(
36
+ format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
37
+ datefmt='%Y-%m-%d %H:%M:%S',
38
+ level=logging.INFO,
39
+ )
40
+ logger = logging.getLogger('acl-demo')
41
+
42
+ VERSION = '1.0'
43
+ Searcher = Union[FaissSearcher, LuceneSearcher]
44
+
45
+
46
def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]):
    """Build the Flask application that serves the ACL retrieval demo.

    Parameters
    ----------
    k : int
        Number of hits requested from the searcher for each query.
    load_searcher_fn : Callable[[str], Tuple[Searcher, str]]
        Factory that takes a language code and returns the searcher together
        with a display name for the retriever.

    Returns
    -------
    Flask
        Application exposing ``/`` (search form) and ``/search`` (results).
    """
    import secrets

    app = Flask(__name__)
    # flash() stores its messages in the session, and Flask refuses to use the
    # session without a secret key; without this, an empty query raised a
    # RuntimeError instead of showing the "Question is required" message.
    if not app.secret_key:
        app.secret_key = secrets.token_hex(16)

    lang = 'en'
    searcher, retriever = load_searcher_fn(lang)

    @app.route('/')
    def index():
        return render_template('acl.html', lang=lang, retriever=retriever)

    @app.route('/search', methods=['GET', 'POST'])
    def search():
        # request.values covers both the query string and the form body, so the
        # GET method this route advertises works as well as POST.
        query = request.values.get('q', '')
        if not query:
            search_results = []
            flash('Question is required')
        else:
            hits = searcher.search(query, k=k)
            search_results = []
            for rank, hit in enumerate(hits, start=1):
                doc = searcher.doc(hit.docid)
                search_results.append(
                    {
                        'rank': rank,
                        'docid': hit.docid,
                        # A hit whose stored document cannot be fetched is
                        # shown with empty content instead of failing the page.
                        'doc': doc.contents() if doc is not None else '',
                        'score': hit.score,
                    }
                )
        return render_template(
            'acl.html', search_results=search_results, query=query, lang=lang, retriever=retriever
        )

    return app
82
+
83
+
84
def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Tuple[Searcher, str]:
    """Open the local ACL paragraph Lucene index and configure it.

    Parameters
    ----------
    language : str
        Language code passed to ``searcher.set_language``.
    k1 : Optional[float]
        BM25 k1 parameter; applied only when ``b`` is given as well.
    b : Optional[float]
        BM25 b parameter; applied only when ``k1`` is given as well.

    Returns
    -------
    Tuple[Searcher, str]
        The searcher and a display name describing the retriever.
    """
    searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph')
    searcher.set_language(language)
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
        retriever_name = f'BM25 (k1={k1}, b={b})'
    else:
        if k1 is not None or b is not None:
            # Only one of the two was supplied; say so rather than silently
            # dropping the value the user asked for.
            logger.warning('Both k1 and b are required to set BM25 parameters; ignoring k1=%s, b=%s', k1, b)
        retriever_name = 'BM25'

    return searcher, retriever_name
94
+
95
+
96
def main():
    """Parse command-line options and start the ACL demo web server."""
    cli = ArgumentParser()

    cli.add_argument('--k1', type=float, help='BM25 k1 parameter.')
    cli.add_argument('--b', type=float, help='BM25 b parameter.')
    cli.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever')
    cli.add_argument(
        '--device',
        type=str,
        default='cpu',
        help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)',
    )
    cli.add_argument('--port', default=8080, type=int, help='Web server port')

    options = cli.parse_args()

    # Bind the BM25 settings now so the app only has to supply the language.
    searcher_factory = partial(_load_sparse_searcher, k1=options.k1, b=options.b)

    demo = create_app(options.hits, searcher_factory)
    demo.run(host='0.0.0.0', port=options.port)
121
+
122
+
123
+ if __name__ == '__main__':
124
+ main()
pyserini/demo/dpr.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import cmd
18
+ import json
19
+ import random
20
+
21
+ from pyserini.search.lucene import LuceneSearcher
22
+ from pyserini.search.faiss import FaissSearcher, DprQueryEncoder
23
+ from pyserini.search.hybrid import HybridSearcher
24
+ from pyserini import search
25
+
26
+
27
class DPRDemo(cmd.Cmd):
    """Interactive shell for querying the Wikipedia DPR indexes.

    Lines starting with ``/`` are commands (``/help``, ``/k``, ``/mode``,
    ``/random``); any other input is treated as a question and searched with
    the currently selected retriever.
    """

    nq_dev_topics = list(search.get_topics('dpr-nq-dev').values())
    trivia_dev_topics = list(search.get_topics('dpr-trivia-dev').values())

    ssearcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr')
    searcher = ssearcher

    encoder = DprQueryEncoder("facebook/dpr-question_encoder-multiset-base")
    index = 'wikipedia-dpr-multi-bf'
    dsearcher = FaissSearcher.from_prebuilt_index(
        index,
        encoder
    )
    hsearcher = HybridSearcher(dsearcher, ssearcher)

    k = 10
    prompt = '>>> '

    def precmd(self, line):
        """Strip the leading ``/`` so ``/cmd`` dispatches to ``do_cmd``."""
        # startswith() is safe on an empty line, where indexing line[0] raised
        # IndexError and terminated the shell.
        if line.startswith('/'):
            line = line[1:]
        return line

    def do_help(self, arg):
        print('/help : returns this message')
        print('/k [NUM] : sets k (number of hits to return) to [NUM]')
        print('/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)')
        print('/random [COLLECTION]: returns results for a random question from the dev subset [COLLECTION] (one of nq, trivia).')

    def do_k(self, arg):
        """Set the number of hits to return."""
        try:
            new_k = int(arg)
        except ValueError:
            # Report bad input instead of letting the exception end the loop.
            print(f'k must be an integer, got "{arg}".')
            return
        print(f'setting k = {new_k}')
        self.k = new_k

    def do_mode(self, arg):
        """Select the retriever: sparse, dense or hybrid."""
        if arg == "sparse":
            self.searcher = self.ssearcher
        elif arg == "dense":
            self.searcher = self.dsearcher
        elif arg == "hybrid":
            self.searcher = self.hsearcher
        else:
            print(
                f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].')
            return
        print(f'setting retriver = {arg}')

    def do_random(self, arg):
        """Search with a random question from the chosen dev subset."""
        if arg == "nq":
            topics = self.nq_dev_topics
        elif arg == "trivia":
            topics = self.trivia_dev_topics
        else:
            print(
                f'Collection "{arg}" is invalid. Collection should be one of [nq, trivia].')
            return
        q = random.choice(topics)['title']
        print(f'question: {q}')
        self.default(q)

    def do_EOF(self, line):
        # Returning True tells cmd.Cmd to leave the command loop.
        return True

    def default(self, q):
        """Search for ``q`` and print rank, score and contents of each hit."""
        hits = self.searcher.search(q, self.k)

        for rank, hit in enumerate(hits, start=1):
            raw_doc = None
            if isinstance(self.searcher, LuceneSearcher):
                raw_doc = hit.raw
            else:
                doc = self.searcher.doc(hit.docid)
                if doc:
                    raw_doc = doc.raw()
            if raw_doc is None:
                # json.loads(None) would raise; report the gap and keep going.
                print(f'{rank:2} {hit.score:.5f} [document {hit.docid} not found]')
                continue
            jsondoc = json.loads(raw_doc)
            print(f'{rank:2} {hit.score:.5f} {jsondoc["contents"]}')
102
+
103
+
104
+ if __name__ == '__main__':
105
+ DPRDemo().cmdloop()
pyserini/demo/miracl.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """
18
+ This script provides an interactive web interface demo for retrieval on the MIRACL dataset.
19
+ It requires `flask` (`pip install flask~=2.2.0`).
20
+ An example command looks like `python -m pyserini.demo.miracl` that starts up a server on port 8080.
21
+ The demo can be accessed via "http://localhost:8080" in a web browser.
22
+ Additional arguments include:
23
+ --port [PORT] --hits [Number of hits] --index [BM25 or mdpr-tied-pft-msmarco]
24
+ --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda]
25
+ """
26
+ import json
27
+ import logging
28
+ from argparse import ArgumentParser
29
+ from functools import partial
30
+ from typing import Callable, Optional, Tuple, Union
31
+
32
+ from flask import Flask, render_template, request, flash, jsonify
33
+ from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
34
+
35
+ logging.basicConfig(
36
+ format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
37
+ datefmt='%Y-%m-%d %H:%M:%S',
38
+ level=logging.INFO,
39
+ )
40
+ logger = logging.getLogger('miracl-demo')
41
+
42
+ VERSION = '1.0'
43
+ LANGUAGES = ('ar', 'bn', 'en', 'es', 'fa', 'fi', 'fr', 'hi', 'id', 'ja', 'ko', 'ru', 'sw', 'te', 'th', 'zh')
44
+ Searcher = Union[FaissSearcher, LuceneSearcher]
45
+
46
+
47
def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]):
    """Build the Flask application that serves the MIRACL retrieval demo.

    Parameters
    ----------
    k : int
        Number of hits requested from the searcher for each query.
    load_searcher_fn : Callable[[str], Tuple[Searcher, str]]
        Factory that takes a language code and returns the searcher together
        with a display name for the retriever.

    Returns
    -------
    Flask
        Application exposing ``/`` (search form), ``/search`` (results) and
        ``/lang`` (switch the active language).
    """
    import secrets

    app = Flask(__name__)
    # flash() stores its messages in the session, and Flask refuses to use the
    # session without a secret key; without this, an empty query raised a
    # RuntimeError instead of showing the "Question is required" message.
    if not app.secret_key:
        app.secret_key = secrets.token_hex(16)

    lang = LANGUAGES[0]
    searcher, retriever = load_searcher_fn(lang)

    @app.route('/')
    def index():
        return render_template('miracl.html', lang=lang, retriever=retriever)

    @app.route('/search', methods=['GET', 'POST'])
    def search():
        # request.values covers both the query string and the form body, so the
        # GET method this route advertises works as well as POST.
        query = request.values.get('q', '')
        if not query:
            search_results = []
            flash('Question is required')
        else:
            hits = searcher.search(query, k=k)
            search_results = []
            for rank, hit in enumerate(hits, start=1):
                stored = searcher.doc(hit.docid)
                # A hit whose stored document cannot be fetched is shown with
                # empty fields instead of failing the whole page.
                doc = json.loads(stored.raw()) if stored is not None else {'text': '', 'title': ''}
                search_results.append(
                    {
                        'rank': rank,
                        'docid': hit.docid,
                        'doc': doc['text'],
                        'title': doc['title'],
                        'score': hit.score,
                    }
                )
        return render_template(
            'miracl.html', search_results=search_results, query=query, lang=lang, retriever=retriever
        )

    @app.route('/lang', methods=['GET'])
    def change_language():
        # Rebinds the enclosing variables so later requests use the new searcher.
        nonlocal lang, searcher, retriever
        new_lang = request.args.get('new_lang', '', type=str)
        if new_lang not in LANGUAGES:
            # A view must return a response; the previous bare ``return``
            # made Flask fail with a 500 for a missing or unknown language.
            return jsonify(error=f'Unsupported language: {new_lang!r}'), 400

        lang = new_lang
        searcher, retriever = load_searcher_fn(lang)
        return jsonify(lang=lang)

    return app
94
+
95
+
96
def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Tuple[Searcher, str]:
    """Load the prebuilt MIRACL Lucene index for ``language`` and configure it.

    Parameters
    ----------
    language : str
        Language code; selects the prebuilt index and is passed to
        ``searcher.set_language``.
    k1 : Optional[float]
        BM25 k1 parameter; applied only when ``b`` is given as well.
    b : Optional[float]
        BM25 b parameter; applied only when ``k1`` is given as well.

    Returns
    -------
    Tuple[Searcher, str]
        The searcher and a display name describing the retriever.
    """
    searcher = LuceneSearcher.from_prebuilt_index(f'miracl-v{VERSION}-{language}')
    searcher.set_language(language)
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
        retriever_name = f'BM25 (k1={k1}, b={b})'
    else:
        if k1 is not None or b is not None:
            # Only one of the two was supplied; say so rather than silently
            # dropping the value the user asked for.
            logger.warning('Both k1 and b are required to set BM25 parameters; ignoring k1=%s, b=%s', k1, b)
        retriever_name = 'BM25'

    return searcher, retriever_name
106
+
107
+
108
def _load_faiss_searcher(language: str, device: str) -> Tuple[Searcher, str]:
    """Load the prebuilt MIRACL mDPR FAISS index for ``language``.

    Parameters
    ----------
    language : str
        Language code; selects the prebuilt index.
    device : str
        Device handed to the query encoder.

    Returns
    -------
    Tuple[Searcher, str]
        The searcher and a display name describing the retriever.
    """
    query_encoder = AutoQueryEncoder(encoder_dir='castorini/mdpr-tied-pft-msmarco', device=device)
    searcher = FaissSearcher.from_prebuilt_index(
        f'miracl-v{VERSION}-{language}-mdpr-tied-pft-msmarco', query_encoder
    )
    retriever_name = 'mDPR-pFT-MSMARCO'
    return searcher, retriever_name
115
+
116
+
117
def main():
    """Parse command-line options and start the MIRACL demo web server."""
    cli = ArgumentParser()

    cli.add_argument('--index', default='BM25', choices=('BM25', 'mdpr-tied-pft-msmarco'), help='Index type.')
    cli.add_argument('--k1', type=float, help='BM25 k1 parameter.')
    cli.add_argument('--b', type=float, help='BM25 b parameter.')
    cli.add_argument('--hits', type=int, default=10, help='Number of hits returned by the retriever')
    cli.add_argument(
        '--device',
        type=str,
        default='cpu',
        help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)',
    )
    cli.add_argument('--port', default=8080, type=int, help='Web server port')

    options = cli.parse_args()

    # Pick the searcher factory for the requested index type; the app later
    # calls it with just the language code.
    wants_dense = options.index == 'mdpr-tied-pft-msmarco'
    if wants_dense:
        searcher_factory = partial(_load_faiss_searcher, device=options.device)
    else:
        searcher_factory = partial(_load_sparse_searcher, k1=options.k1, b=options.b)

    demo = create_app(options.hits, searcher_factory)
    demo.run(host='0.0.0.0', port=options.port)
146
+
147
+
148
+ if __name__ == '__main__':
149
+ main()
pyserini/demo/msmarco.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import cmd
18
+ import json
19
+ import os
20
+ import random
21
+
22
+ from pyserini.search.lucene import LuceneSearcher
23
+ from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder, AnceQueryEncoder
24
+ from pyserini.search.hybrid import HybridSearcher
25
+ from pyserini import search
26
+
27
+
28
class MsMarcoDemo(cmd.Cmd):
    """Interactive shell for querying the MS MARCO passage indexes.

    Lines starting with ``/`` are commands (``/help``, ``/k``, ``/model``,
    ``/mode``, ``/random``); any other input is treated as a question and
    searched with the currently selected retriever. Dense and hybrid modes
    become available only after a model has been chosen with ``/model``.
    """

    dev_topics = list(search.get_topics('msmarco-passage-dev-subset').values())

    ssearcher = LuceneSearcher.from_prebuilt_index('msmarco-passage')
    dsearcher = None
    hsearcher = None
    searcher = ssearcher

    k = 10
    prompt = '>>> '

    # https://stackoverflow.com/questions/35213134/command-prefixes-in-python-cli-using-cmd-in-pythons-standard-library
    def precmd(self, line):
        """Strip the leading ``/`` so ``/cmd`` dispatches to ``do_cmd``."""
        # startswith() is safe on an empty line, where indexing line[0] raised
        # IndexError and terminated the shell.
        if line.startswith('/'):
            line = line[1:]
        return line

    def do_help(self, arg):
        print('/help : returns this message')
        print('/k [NUM] : sets k (number of hits to return) to [NUM]')
        print('/model [MODEL] : sets encoder to use the model [MODEL] (one of tct, ance)')
        print('/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)')
        print('/random : returns results for a random question from dev subset')

    def do_k(self, arg):
        """Set the number of hits to return."""
        try:
            new_k = int(arg)
        except ValueError:
            # Report bad input instead of letting the exception end the loop.
            print(f'k must be an integer, got "{arg}".')
            return
        print(f'setting k = {new_k}')
        self.k = new_k

    def do_mode(self, arg):
        """Select the retriever: sparse, dense or hybrid."""
        if arg == "sparse":
            self.searcher = self.ssearcher
        elif arg == "dense":
            if self.dsearcher is None:
                print('Specify model through /model before using dense retrieval.')
                return
            self.searcher = self.dsearcher
        elif arg == "hybrid":
            if self.hsearcher is None:
                print('Specify model through /model before using hybrid retrieval.')
                return
            self.searcher = self.hsearcher
        else:
            print(
                f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].')
            return
        print(f'setting retriver = {arg}')

    def do_model(self, arg):
        """Load the query encoder and dense index for ``tct`` or ``ance``."""
        if arg == "tct":
            encoder = TctColBertQueryEncoder("castorini/tct_colbert-msmarco")
            index = "msmarco-passage-tct_colbert-hnsw"
        elif arg == "ance":
            encoder = AnceQueryEncoder("castorini/ance-msmarco-passage")
            index = "msmarco-passage-ance-bf"
        else:
            print(
                f'Model "{arg}" is invalid. Model should be one of [tct, ance].')
            return

        self.dsearcher = FaissSearcher.from_prebuilt_index(
            index,
            encoder
        )
        # The hybrid searcher is rebuilt so it uses the newly selected dense model.
        self.hsearcher = HybridSearcher(self.dsearcher, self.ssearcher)
        print(f'setting model = {arg}')

    def do_random(self, arg):
        """Search with a random question from the dev subset."""
        q = random.choice(self.dev_topics)['title']
        print(f'question: {q}')
        self.default(q)

    def do_EOF(self, line):
        # Returning True tells cmd.Cmd to leave the command loop.
        return True

    def default(self, q):
        """Search for ``q`` and print rank, score and contents of each hit."""
        hits = self.searcher.search(q, self.k)

        for rank, hit in enumerate(hits, start=1):
            raw_doc = None
            if isinstance(self.searcher, LuceneSearcher):
                raw_doc = hit.raw
            else:
                doc = self.searcher.doc(hit.docid)
                if doc:
                    raw_doc = doc.raw()
            if raw_doc is None:
                # json.loads(None) would raise; report the gap and keep going.
                print(f'{rank:2} {hit.score:.5f} [document {hit.docid} not found]')
                continue
            jsondoc = json.loads(raw_doc)
            print(f'{rank:2} {hit.score:.5f} {jsondoc["contents"]}')
115
+
116
+
117
+ if __name__ == '__main__':
118
+ MsMarcoDemo().cmdloop()
pyserini/demo/templates/acl.html ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta property="og:title" content="ACL 🌍🙌🌏">
6
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css" rel="stylesheet"
7
+ integrity="sha384-Zenh87qX5JnK2Jl0vWa8Ck2rdkQ2Bzep5IDxbcnCeuOxjzrPF/et3URy9Bv1WTRi" crossorigin="anonymous">
8
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/font/bootstrap-icons.css">
9
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js"
10
+ integrity="sha384-OERcA2EqjJCMA+/3y+gxIOqMEjwtxJY7qPCqsdltbNJuaOe923+mo//f6V8Qbsw3"
11
+ crossorigin="anonymous"></script>
12
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/jquery.min.js"></script>
13
+
14
+ <script>
15
+ $SCRIPT_ROOT = {{ request.script_root|tojson }};
16
+ </script>
17
+ <title>ACL 🌍🙌🌏 Demo</title>
18
+ </head>
19
+ <body>
20
+
21
+ <div style="display: flex; align-items: center; gap: 10px;">
22
+ <h2>ACL</h2>
23
+ <img src="https://aclanthology.org/images/acl-logo.svg" alt="acl logo" width="50px">
24
+ <h2>Demo</h2>
25
+ </div>
26
+ <br/>
27
+
28
+ <div class="container text-center">
29
+ {% for message in get_flashed_messages() %}
30
+ <div class="alert">{{ message }}</div>
31
+ {% endfor %}
32
+
33
+ <form action="/search" method="post">
34
+ <div class="row-cols-3">
35
+ <div class="input-group mb-3">
36
+ <input type="text" class="form-control" placeholder="Enter a Question" aria-label="Question" name="q"
37
+ aria-describedby="button-addon2" value="{{ query if query else '' }}">
38
+ <button class="btn btn-outline-secondary" type="submit" id="button-addon2"><i class="bi bi-search"></i>
39
+ </button>
40
+ </div>
41
+ </div>
42
+ </form>
43
+
44
+ {% if search_results %}
45
+ <div class="row">
46
+ <table class="table">
47
+ <thead>
48
+ <tr>
49
+ <th scope="col">#</th>
50
+ <th scope="col">Score</th>
51
+ <th scope="col">Passage ID</th>
52
+ <th scope="col">Content</th>
53
+ </tr>
54
+ </thead>
55
+ <tbody class="table-group-divider">
56
+ {% for res in search_results %}
57
+ <tr class="{{ 'table-secondary' if res['rank'] % 2 else 'table-light' }}">
58
+ <th scope="row">{{ res["rank"] }}</th>
59
+ <td>{{ "%.2f"|format(res["score"]) }}</td>
60
+ <td>{{ res["docid"] }}</td>
61
+
62
+ <td style="word-wrap: break-word;min-width: 600px;max-width: 600px;"
63
+ class="text-{{ 'end' if lang in ('ar', 'fa') else 'start' }}">
64
+ <small>{{ res["doc"] }}</small>
65
+ </td>
66
+ </tr>
67
+ {% endfor %}
68
+ </tbody>
69
+ </table>
70
+ </div>
71
+ {% endif %}
72
+ </div>
73
+ </body>
74
+ </html>
pyserini/demo/templates/assets/acl-logo.svg ADDED
pyserini/demo/templates/miracl.html ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta property="og:title" content="MIRACL 🌍🙌🌏">
6
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Zenh87qX5JnK2Jl0vWa8Ck2rdkQ2Bzep5IDxbcnCeuOxjzrPF/et3URy9Bv1WTRi" crossorigin="anonymous">
7
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/font/bootstrap-icons.css">
8
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-OERcA2EqjJCMA+/3y+gxIOqMEjwtxJY7qPCqsdltbNJuaOe923+mo//f6V8Qbsw3" crossorigin="anonymous"></script>
9
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/jquery.min.js"></script>
10
+
11
+ <script>
12
+ $SCRIPT_ROOT = {{ request.script_root|tojson }};
13
+
14
+ $( document ).ready(function() {
15
+ $("#loading").hide();
16
+ $('#language').val("{{lang}}");
17
+ });
18
+
19
+ $(function() {
20
+ $('#language').on('change', function() {
21
+ $.getJSON($SCRIPT_ROOT + '/lang', {
22
+ new_lang: this.value,
23
+ }, function(data) {
24
+ $("#language").removeAttr('disabled');
25
+ $("#loading").hide();
26
+ });
27
+
28
+ $(this).attr('disabled','disabled');
29
+ $("#loading").show();
30
+
31
+ return false;
32
+ });
33
+ });
34
+ </script>
35
+ <title>MIRACL 🌍🙌🌏 Demo</title>
36
+ </head>
37
+ <body>
38
+ <h2>MIRACL 🌍🙌🌏 Demo</h2>
39
+ <h4>Multilingual Information Retrieval Across a Continuum of Languages</h4>
40
+
41
+ <br/>
42
+
43
+ <p class="lead">
44
+ <a href="http://miracl.ai/">MIRACL</a> is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.
45
+ </p>
46
+
47
+ <div class="row g-3 align-items-center">
48
+ <label class="col-auto" for="language">This demo running on the language</label>
49
+ <div class="col-auto">
50
+ <select class="form-select form-select-sm" aria-label=".form-select-sm" id="language">
51
+ <option value="ar">Arabic</option>
52
+ <option value="bn">Bengali</option>
53
+ <option value="en">English</option>
54
+ <option value="es">Spanish</option>
55
+ <option value="fa">Persian</option>
56
+ <option value="fi">Finnish</option>
57
+ <option value="fr">French</option>
58
+ <option value="hi">Hindi</option>
59
+ <option value="id">Indonesian</option>
60
+ <option value="ja">Japanese</option>
61
+ <option value="ko">Korean</option>
62
+ <option value="ru">Russian</option>
63
+ <option value="sw">Swahili</option>
64
+ <option value="te">Telugu</option>
65
+ <option value="th">Thai</option>
66
+ <option value="zh">Chinese</option>
67
+ </select>
68
+ </div>
69
+ <div class="col-auto">
70
+ <div class="spinner-border text-secondary" role="status" id="loading">
71
+ <span class="visually-hidden">Loading...</span>
72
+ </div>
73
+ </div>
74
+ <div class="col-auto">
75
+ <span>
76
+ retrieves passages using <em>{{retriever}}</em>.
77
+ </span>
78
+ </div>
79
+ </div>
80
+
81
+ <br/>
82
+
83
+ <div class="container text-center">
84
+ {% for message in get_flashed_messages() %}
85
+ <div class="alert">{{ message }}</div>
86
+ {% endfor %}
87
+
88
+ <form action="/search" method="post">
89
+ <div class="row-cols-3">
90
+ <div class="input-group mb-3">
91
+ <input type="text" class="form-control" placeholder="Enter a Question" aria-label="Question" name="q" aria-describedby="button-addon2" value="{{query if query else ''}}">
92
+ <button class="btn btn-outline-secondary" type="submit" id="button-addon2"><i class="bi bi-search"></i></button>
93
+ </div>
94
+ </div>
95
+ </form>
96
+
97
+ {% if search_results %}
98
+ <div class="row">
99
+ <table class="table">
100
+ <thead>
101
+ <tr>
102
+ <th scope="col">#</th>
103
+ <th scope="col">Score</th>
104
+ <th scope="col">Passage ID</th>
105
+ <th scope="col">Title</th>
106
+ <th scope="col">Content</th>
107
+ </tr>
108
+ </thead>
109
+ <tbody class="table-group-divider">
110
+ {% for res in search_results %}
111
+ <tr class="{{'table-secondary' if res['rank'] % 2 else 'table-light'}}">
112
+ <th scope="row">{{res["rank"]}}</th>
113
+ <td>{{"%.2f"|format(res["score"])}}</td>
114
+ <td>{{res["docid"]}}</td>
115
+ <td>{{res["title"]}}</td>
116
+ <td style="word-wrap: break-word;min-width: 600px;max-width: 600px;" class="text-{{'end' if lang in ('ar', 'fa') else 'start'}}">
117
+ <small>{{res["doc"]}}</small>
118
+ </td>
119
+ </tr>
120
+ {% endfor %}
121
+ </tbody>
122
+ </table>
123
+ </div>
124
+ {% endif %}
125
+ </div>
126
+ </body>
127
+ </html>
pyserini/dsearch.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """Deprecated. The package ``pyserini.dsearch`` has been renamed ``pyserini.search.faiss``. Stubs are retained here for
18
+ redirection purposes, to ensure that code in existing published papers remains functional (with warnings)."""
19
+
20
+ import os
21
+ import sys
22
+
23
+ import pyserini.search.faiss
24
+ from pyserini.search.faiss import TctColBertQueryEncoder
25
+
26
+ __all__ = ['SimpleDenseSearcher', 'BinaryDenseSearcher', 'TctColBertQueryEncoder']
27
+
28
+
29
class SimpleDenseSearcher(pyserini.search.faiss.FaissSearcher):
    """Deprecated alias of ``pyserini.search.faiss.FaissSearcher``.

    Creating an instance prints a deprecation notice to stdout and then
    defers to the parent class for object creation.
    """

    def __new__(cls, *args, **kwargs):
        # Constructor arguments are accepted but not used here; only the
        # class is handed to the parent's __new__.
        notice = ('pyserini.dsearch.SimpleDenseSearcher class has been deprecated, '
                  'please use FaissSearcher from pyserini.search.faiss instead')
        print(notice)
        return super().__new__(cls)
34
+
35
+
36
class BinaryDenseSearcher(pyserini.search.faiss.BinaryDenseSearcher):
    """Deprecated alias of ``pyserini.search.faiss.BinaryDenseSearcher``.

    Creating an instance prints a deprecation notice to stdout and then
    defers to the parent class for object creation.
    """

    def __new__(cls, *args, **kwargs):
        # Constructor arguments are accepted but not used here; only the
        # class is handed to the parent's __new__.
        notice = ('pyserini.dsearch.BinaryDenseSearcher class has been deprecated, '
                  'please use BinaryDenseSearcher from pyserini.search.faiss instead')
        print(notice)
        return super().__new__(cls)
41
+
42
+
43
if __name__ == "__main__":
    # Imported locally: only this command-line redirect needs it.
    import subprocess

    print('WARNING: pyserini.dsearch is deprecated, please use pyserini.search.faiss instead!')
    # Forward the arguments as a list (no shell), so values containing spaces
    # or shell metacharacters reach the new entry point unchanged, and run
    # the module with the interpreter that is executing this stub.
    completed = subprocess.run([sys.executable, '-m', 'pyserini.search.faiss', *sys.argv[1:]])
    # Propagate the redirected command's exit status to the caller.
    sys.exit(completed.returncode)
pyserini/encode/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ from ._base import DocumentEncoder, QueryEncoder, JsonlCollectionIterator,\
18
+ RepresentationWriter, FaissRepresentationWriter, JsonlRepresentationWriter, PcaEncoder
19
+ from ._ance import AnceEncoder, AnceDocumentEncoder, AnceQueryEncoder
20
+ from ._auto import AutoQueryEncoder, AutoDocumentEncoder
21
+ from ._dpr import DprDocumentEncoder, DprQueryEncoder
22
+ from ._tct_colbert import TctColBertDocumentEncoder, TctColBertQueryEncoder
23
+ from ._aggretriever import AggretrieverDocumentEncoder, AggretrieverQueryEncoder
24
+ from ._unicoil import UniCoilEncoder, UniCoilDocumentEncoder, UniCoilQueryEncoder
25
+ from ._cached_data import CachedDataQueryEncoder
26
+ from ._tok_freq import TokFreqQueryEncoder
27
+ from ._splade import SpladeQueryEncoder
28
+ from ._slim import SlimQueryEncoder
pyserini/encode/__main__.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import argparse
18
+ import sys
19
+
20
+ from pyserini.encode import JsonlRepresentationWriter, FaissRepresentationWriter, JsonlCollectionIterator
21
+ from pyserini.encode import DprDocumentEncoder, TctColBertDocumentEncoder, AnceDocumentEncoder, AggretrieverDocumentEncoder, AutoDocumentEncoder
22
+ from pyserini.encode import UniCoilDocumentEncoder
23
+
24
+
25
# Maps an encoder-class keyword to the document encoder class implementing it.
# init_encoder() walks these keys in insertion order and takes the first one
# that occurs in the (lowercased) encoder name, so the order is significant.
encoder_class_map = {
    "dpr": DprDocumentEncoder,
    "tct_colbert": TctColBertDocumentEncoder,
    "aggretriever": AggretrieverDocumentEncoder,
    "ance": AnceDocumentEncoder,
    "sentence-transformers": AutoDocumentEncoder,
    "unicoil": UniCoilDocumentEncoder,
    "auto": AutoDocumentEncoder,
}

# Pooling strategies accepted for AutoDocumentEncoder via --pooling.
ALLOWED_POOLING_OPTS = ["cls", "mean"]
35
+
36
def init_encoder(encoder, encoder_class, device):
    """Instantiate the document encoder for a model name or path.

    Args:
        encoder: Model name or path; passed to the encoder class as
            ``model_name``.
        encoder_class: A key of ``encoder_class_map``, or ``None`` to infer
            the class from keywords contained in ``encoder``.
        device: Passed to the encoder class as ``device``.

    Returns:
        An instance of the selected encoder class.

    Raises:
        KeyError: If ``encoder_class`` is given but is not a key of
            ``encoder_class_map``.
    """
    requested_class = encoder_class
    # All keyword matching below is done on one lowercased copy of the name so
    # that class inference and the pooling defaults agree on what matches.
    encoder_name = encoder.lower()

    if requested_class is not None:
        if requested_class not in encoder_class_map:
            raise KeyError(
                f"unknown encoder class {requested_class!r}; "
                f"expected one of {sorted(encoder_class_map)}"
            )
        resolved_class = encoder_class_map[requested_class]
    else:
        # Use the first class keyword found in the encoder name; fall back to
        # AutoDocumentEncoder when none of the keywords matches.
        resolved_class = next(
            (candidate for keyword, candidate in encoder_class_map.items()
             if keyword in encoder_name),
            AutoDocumentEncoder,
        )

    # Prepare arguments to the encoder class.
    kwargs = dict(model_name=encoder, device=device)
    if requested_class == "sentence-transformers" or "sentence-transformers" in encoder_name:
        kwargs.update(pooling='mean', l2_norm=True)
    # Checked second, so the contriever settings win when both keywords match.
    if requested_class == "contriever" or "contriever" in encoder_name:
        kwargs.update(pooling='mean', l2_norm=False)
    return resolved_class(**kwargs)
62
+
63
+
64
def parse_args(parser, commands):
    """Parse ``sys.argv`` into one namespace per sub-command.

    Args:
        parser: The top-level ``argparse.ArgumentParser``.
        commands: The sub-parsers action returned by
            ``parser.add_subparsers``; its ``choices`` keys are the
            sub-command names.

    Returns:
        An ``argparse.Namespace`` with one attribute per sub-command name.
        The attribute holds that sub-command's parsed namespace, or ``None``
        when the sub-command does not appear on the command line.
    """
    command_names = commands.choices

    # Cut argv into runs: the tokens before the first sub-command, then one
    # run per sub-command, each starting with the sub-command's own name.
    groups = [[]]
    for token in sys.argv[1:]:
        if token in command_names:
            groups.append([token])
        else:
            groups[-1].append(token)
    leading, *per_command = groups

    # Every sub-command starts out as None.
    args = argparse.Namespace(**{name: None for name in command_names})

    # Tokens that precede any sub-command are parsed into the top-level namespace.
    parser.parse_args(leading, namespace=args)

    # Each sub-command is parsed into a namespace of its own.
    for group in per_command:
        sub_namespace = argparse.Namespace()
        setattr(args, group[0], sub_namespace)
        parser.parse_args(group, namespace=sub_namespace)
    return args
83
+
84
+
85
if __name__ == '__main__':
    # Command line layout: three sub-commands (input, output, encoder), each
    # followed by its own options, all given in a single invocation.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(title='sub-commands')

    input_parser = commands.add_parser('input')
    input_parser.add_argument('--corpus', type=str,
                              help='directory that contains corpus files to be encoded, in jsonl format.',
                              required=True)
    input_parser.add_argument('--fields', help='fields that contents in jsonl has (in order)',
                              nargs='+', default=['text'], required=False)
    input_parser.add_argument('--docid-field',
                              help='name of document id field name. If you have a custom id with a name other than "id", "_id" or "docid", then use this argument',
                              default=None, required=False)
    input_parser.add_argument('--delimiter', help='delimiter for the fields', default='\n', required=False)
    input_parser.add_argument('--shard-id', type=int, help='shard-id 0-based', default=0, required=False)
    input_parser.add_argument('--shard-num', type=int, help='number of shards', default=1, required=False)

    output_parser = commands.add_parser('output')
    output_parser.add_argument('--embeddings', type=str, help='directory to store encoded corpus', required=True)
    output_parser.add_argument('--to-faiss', action='store_true', default=False)

    encoder_parser = commands.add_parser('encoder')
    encoder_parser.add_argument('--encoder', type=str, help='encoder name or path', required=True)
    # The choices come from encoder_class_map so that every accepted value is
    # one init_encoder() can actually resolve.
    encoder_parser.add_argument('--encoder-class', type=str, required=False, default=None,
                                choices=list(encoder_class_map),
                                help='which document encoder class to use. `default` would infer from the args.encoder')
    encoder_parser.add_argument('--fields', help='fields to encode', nargs='+', default=['text'], required=False)
    encoder_parser.add_argument('--batch-size', type=int, help='batch size', default=64, required=False)
    encoder_parser.add_argument('--max-length', type=int, help='max length', default=256, required=False)
    encoder_parser.add_argument('--dimension', type=int, help='dimension', default=768, required=False)
    encoder_parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]',
                                default='cuda:0', required=False)
    encoder_parser.add_argument('--fp16', action='store_true', default=False)
    encoder_parser.add_argument('--add-sep', action='store_true', default=False)
    encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', required=False)

    args = parse_args(parser, commands)

    # parse_args() leaves a sub-command's attribute as None when it is absent
    # from the command line. All three are read below, so report the omission
    # as a usage error instead of failing later with an AttributeError.
    missing = [name for name in commands.choices if getattr(args, name) is None]
    if missing:
        parser.error(f"missing required sub-command(s): {', '.join(missing)}")

    # Turn a literal backslash-n typed on the command line into a real newline.
    delimiter = args.input.delimiter.replace("\\n", "\n")

    encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device)
    if type(encoder).__name__ == "AutoDocumentEncoder":
        # NOTE(review): --pooling defaults to 'cls', so this assignment runs
        # even when the flag is not given and presumably replaces the 'mean'
        # pooling init_encoder() passes for sentence-transformers/contriever
        # models -- confirm against AutoDocumentEncoder that this is intended.
        if args.encoder.pooling in ALLOWED_POOLING_OPTS:
            encoder.pooling = args.encoder.pooling
        else:
            raise ValueError(f"Only allowed to use pooling types {ALLOWED_POOLING_OPTS}. You entered {args.encoder.pooling}")

    if args.output.to_faiss:
        embedding_writer = FaissRepresentationWriter(args.output.embeddings, dimension=args.encoder.dimension)
    else:
        embedding_writer = JsonlRepresentationWriter(args.output.embeddings)
    collection_iterator = JsonlCollectionIterator(args.input.corpus, args.input.fields, args.input.docid_field, delimiter)

    with embedding_writer:
        for batch_info in collection_iterator(args.encoder.batch_size, args.input.shard_id, args.input.shard_num):
            # Titles and expansions are only handed to the encoder when they
            # are listed in the encoder's --fields.
            kwargs = {
                'texts': batch_info['text'],
                'titles': batch_info['title'] if 'title' in args.encoder.fields else None,
                'expands': batch_info['expand'] if 'expand' in args.encoder.fields else None,
                'fp16': args.encoder.fp16,
                'max_length': args.encoder.max_length,
                'add_sep': args.encoder.add_sep,
            }
            embeddings = encoder.encode(**kwargs)
            batch_info['vector'] = embeddings
            embedding_writer.write(batch_info, args.input.fields)
pyserini/encode/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.15 kB). View file
 
pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc ADDED
Binary file (6.24 kB). View file