Spaces:

castorini
/

ONNX-Demo

Runtime error

App Files Files Community

ArthurChen189 committed on Aug 3, 2023

Commit

62977bb

•

1 Parent(s): 30ac9ed

upload pyserini

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

pyserini/2cr/_base.py +95 -0
pyserini/2cr/miracl.py +447 -0
pyserini/2cr/miracl.yaml +1180 -0
pyserini/2cr/miracl_html.template +256 -0
pyserini/2cr/miracl_html_table.template +35 -0
pyserini/2cr/miracl_html_table_row.template +336 -0
pyserini/2cr/mrtydi.py +330 -0
pyserini/2cr/mrtydi.yaml +890 -0
pyserini/2cr/mrtydi_html.template +256 -0
pyserini/2cr/mrtydi_html_table.template +28 -0
pyserini/2cr/mrtydi_html_table_row.template +212 -0
pyserini/2cr/msmarco-v1-doc.yaml +539 -0
pyserini/2cr/msmarco-v1-passage.yaml +764 -0
pyserini/2cr/msmarco-v2-doc.yaml +287 -0
pyserini/2cr/msmarco-v2-passage.yaml +287 -0
pyserini/2cr/msmarco.py +600 -0
pyserini/2cr/msmarco_html_row_v1.template +81 -0
pyserini/2cr/msmarco_html_row_v2.template +82 -0
pyserini/2cr/msmarco_html_v1_doc.template +296 -0
pyserini/2cr/msmarco_html_v1_passage.template +325 -0
pyserini/2cr/msmarco_html_v2_doc.template +292 -0
pyserini/2cr/msmarco_html_v2_passage.template +292 -0
pyserini/__init__.py +1 -0
pyserini/__pycache__/__init__.cpython-310.pyc +0 -0
pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc +0 -0
pyserini/__pycache__/encoded_query_info.cpython-310.pyc +0 -0
pyserini/__pycache__/evaluate_script_info.cpython-310.pyc +0 -0
pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc +0 -0
pyserini/__pycache__/pyclass.cpython-310.pyc +0 -0
pyserini/__pycache__/setup.cpython-310.pyc +0 -0
pyserini/__pycache__/util.cpython-310.pyc +0 -0
pyserini/analysis/__init__.py +19 -0
pyserini/analysis/__pycache__/__init__.cpython-310.pyc +0 -0
pyserini/analysis/__pycache__/_base.cpython-310.pyc +0 -0
pyserini/analysis/_base.py +166 -0
pyserini/collection/__init__.py +20 -0
pyserini/collection/_base.py +153 -0
pyserini/collection/_collection_support.py +78 -0
pyserini/demo/acl.py +124 -0
pyserini/demo/dpr.py +105 -0
pyserini/demo/miracl.py +149 -0
pyserini/demo/msmarco.py +118 -0
pyserini/demo/templates/acl.html +74 -0
pyserini/demo/templates/assets/acl-logo.svg +10 -0
pyserini/demo/templates/miracl.html +127 -0
pyserini/dsearch.py +46 -0
pyserini/encode/__init__.py +28 -0
pyserini/encode/__main__.py +147 -0
pyserini/encode/__pycache__/__init__.cpython-310.pyc +0 -0
pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc +0 -0

pyserini/2cr/_base.py ADDED Viewed

	@@ -0,0 +1,95 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import subprocess
# Status tags printed next to each regression score. FAIL and OKish are wrapped
# in ANSI colour escapes (91 = bright red, 94 = bright blue); OK is left plain.
fail_str = '\033[91m[FAIL]\033[0m'
ok_str = '[OK]'
okish_str = '\033[94m[OKish]\033[0m'
def run_command(cmd):
    """Run a command line as a subprocess and return its decoded output.

    Args:
        cmd: the command as a single string. It is tokenised with
            ``shlex.split`` so quoted arguments (for example a path that
            contains spaces) stay in one piece; no shell is involved.
    Returns:
        (stdout, stderr): both streams decoded as UTF-8.
    """
    import shlex  # local import so this block does not depend on the file header

    # POSIX-mode splitting treats backslashes as escapes, which would mangle
    # Windows paths, so fall back to non-POSIX rules there.
    argv = shlex.split(cmd, posix=(os.name != 'nt'))
    process = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    return stdout.decode('utf-8'), stderr.decode('utf-8')
def run_eval_and_return_metric(metric, eval_key, defs, runfile):
    """Run ``pyserini.eval.trec_eval`` on a run file and return the summary score.

    Args:
        metric: metric name. Not used in the body; ``defs`` alone decides
            what is passed to trec_eval.
        eval_key: qrels key or path, placed before the run file on the command line
        defs: option string inserted right after the module name
        runfile: path of the run file to evaluate
    Returns:
        score: the value on the first tab-separated ``<x>\\tall\\t<value>`` line
            of stdout, rounded to 4 decimal places; 0.0 when no such line
            is present.
    """
    eval_cmd = f'python -m pyserini.eval.trec_eval {defs} {eval_key} {runfile}'
    eval_stdout, _ = run_command(eval_cmd)

    for line in eval_stdout.split('\n'):
        parts = line.split('\t')
        if len(parts) == 3 and parts[1] == 'all':
            return round(float(parts[2]), 4)

    # No summary line found (e.g. the evaluation itself failed): callers get
    # 0.0 rather than an exception.
    return 0.0
def run_dpr_retrieval_eval_and_return_metric(defs, json_file):
    """Generate dpr retrieval evaluation scores

    Args:
        defs: topk definitions (e.g., '--topk 5 20')
        json_file: dpr retrieval json file
    Returns:
        topk: a dictionary of topk scores (e.g., {"Top5": <score>}); each
            score is the parsed value multiplied by 100 and rounded to
            4 decimal places. Empty when no matching line is found.
    """
    eval_cmd = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {json_file} {defs} '
    eval_stdout, _ = run_command(eval_cmd)

    topk = {}
    for line in eval_stdout.split('\n'):
        parts = line.split('\t')
        if len(parts) == 2 and 'accuracy' in parts[1]:
            # NOTE(review): [10:] presumably strips a 10-character
            # 'accuracy: ' prefix -- confirm against the evaluator's output.
            topk[parts[0]] = round(float(parts[1][10:]) * 100, 4)
    return topk
def convert_trec_run_to_dpr_retrieval_json(topics, index, runfile, output):
    """Convert trec runfile to dpr retrieval json file

    Args:
        topics: topics field
        index: index field
        runfile: input runfile
        output: output jsonfile
    Returns:
        exit status: the value returned by ``os.system`` for the conversion command
    """
    # NOTE(review): the arguments are interpolated into a shell command
    # unquoted, so values containing spaces or shell metacharacters will be
    # interpreted by the shell.
    cmd = (
        'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run '
        f'--topics {topics} --index {index} --input {runfile} --output {output}'
    )
    return os.system(cmd)
def run_fusion(run_ls, output, k):
    """run fusion command and return status code

    Args:
        run_ls: a list of runfile paths
        output: output path
        k: topk value
    Returns:
        status code: the value returned by ``os.system`` for the fusion command
    """
    # NOTE(review): paths are joined with spaces and handed to the shell
    # unquoted, so a path containing whitespace would be split.
    run_files = ' '.join(run_ls)
    cmd = f'python -m pyserini.fusion --runs {run_files} --output {output} --k {k}'
    return os.system(cmd)

pyserini/2cr/miracl.py ADDED Viewed

	@@ -0,0 +1,447 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import math
+import os
+import sys
+import time
+import subprocess
+import pkg_resources
+from collections import defaultdict, OrderedDict
+from string import Template
+import yaml
+from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
# [short code, English name] per language. The short code is the suffix of
# every condition name ('<model>.<code>') and the order here is the column
# order of the printed and HTML tables.
languages = [
    ['ar', 'arabic'],
    ['bn', 'bengali'],
    ['en', 'english'],
    ['es', 'spanish'],
    ['fa', 'persian'],
    ['fi', 'finnish'],
    ['fr', 'french'],
    ['hi', 'hindi'],
    ['id', 'indonesian'],
    ['ja', 'japanese'],
    ['ko', 'korean'],
    ['ru', 'russian'],
    ['sw', 'swahili'],
    ['te', 'telugu'],
    ['th', 'thai'],
    ['zh', 'chinese'],
    ['de', 'german'],
    ['yo', 'yoruba'],
]

# Model id -> label shown in the HTML report. Insertion order is the row order.
html_display = OrderedDict()
html_display['bm25'] = 'BM25'
html_display['mdpr-tied-pft-msmarco'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO'
html_display['mdpr-tied-pft-msmarco-ft-all'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO then FT w/ all Mr. TyDi'
html_display['bm25-mdpr-tied-pft-msmarco-hybrid'] = 'Hybrid of `bm25` and `mdpr-tied-pft-msmarco`'
html_display['mdpr-tied-pft-msmarco-ft-miracl'] = 'mDPR (tied encoders), pre-FT w/ MS MARCO then in-lang FT w/ MIRACL'
html_display['mcontriever-tied-pft-msmarco'] = 'mContriever (tied encoders), pre-FT w/ MS MARCO'

# Model ids in display order.
models = list(html_display)

# Metric name -> option string handed to pyserini.eval.trec_eval.
trec_eval_metric_definitions = {
    'nDCG@10': '-c -M 100 -m ndcg_cut.10',
    'R@100': '-c -m recall.100',
}
def format_run_command(raw):
    """Break a one-line run command into shell-continued lines for display.

    Args:
        raw: the command as a single-line string
    Returns:
        The command with a backslash-newline inserted before selected flags.
    """
    # (needle, replacement) pairs, applied in this order. Matching is by
    # substring, so '--lang' also catches '--language' and '--encoder' also
    # catches '--encoder-class'.
    # NOTE(review): the conditions visible in miracl.yaml use '--threads 16',
    # so the '--threads 12' rule does not fire for them.
    rules = (
        ('--lang', '\\\n --lang'),
        ('--encoder', '\\\n --encoder'),
        ('--topics', '\\\n --topics'),
        ('--index', '\\\n --index'),
        ('--output ', '\\\n --output '),
        ('--runs', '\\\n --runs '),
        ('--batch ', '\\\n --batch '),
        ('--threads 12', '--threads 12 \\\n '),
    )
    formatted = raw
    for needle, replacement in rules:
        formatted = formatted.replace(needle, replacement)
    return formatted
def format_eval_command(raw):
    """Break a one-line eval command into shell-continued lines for display.

    A backslash-newline is inserted before every '-c ' and before the final
    whitespace-separated token of ``raw``.

    Args:
        raw: the command as a single-line string
    Returns:
        The reformatted command; an empty or all-whitespace ``raw`` is
        returned with only the '-c ' rule applied.
    """
    formatted = raw.replace('-c ', '\\\n -c ')

    tokens = raw.split()
    if not tokens:
        return formatted
    last = tokens[-1]

    # Wrap only the final occurrence: a blanket str.replace would also rewrite
    # earlier places where the last token happens to appear as a substring.
    head, found, tail = formatted.rpartition(last)
    if not found:
        # The '-c ' rewrite split the last token apart; leave the text as is.
        return formatted
    return f'{head}\\\n {last}{tail}'
def read_file(f):
    """Return the full text content of the file at path ``f``.

    Args:
        f: path of the file to read (opened in text mode, default encoding)
    Returns:
        The file's content as one string.
    """
    # The context manager closes the handle even if read() raises.
    with open(f, 'r') as fin:
        return fin.read()
def list_conditions():
    """Print the known condition ids followed by the known language codes."""
    print('Conditions:\n-----------')
    for condition in html_display:
        print(condition)
    print('\nLanguages\n---------')
    for language in languages:
        print(language[0])
def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric):
    """Render one HTML table row per model for a given split and metric.

    Args:
        table: nested mapping ``table['<model>.<lang>'][split][metric]`` -> score
        row_template: ``string.Template`` source for a single row
        commands: mapping ``'<model>.<lang>'`` -> run command text
        eval_commands: mapping ``eval_commands['<model>.<lang>'][metric]`` -> eval command text
        table_id: value substituted for ``$table_cnt``
        split: split name used to index ``table``
        metric: metric name used to index ``table`` and ``eval_commands``
    Returns:
        html_rows: list of rendered rows, one per entry of ``models``, in order.
    """
    html_rows = []
    for row_cnt, model in enumerate(models, start=1):
        # Language code -> '<model>.<code>' lookup key, in `languages` order.
        keys = {lang[0]: f'{model}.{lang[0]}' for lang in languages}
        scores = {code: table[key][split][metric] for code, key in keys.items()}

        # The average is taken over languages with a non-zero score only.
        # Plain left-to-right accumulation keeps the float result unchanged.
        used_langs = 0
        total = 0.0
        for score in scores.values():
            total += score
            if score != 0:
                used_langs += 1
        # A model with no scores at all renders '--' instead of dividing by zero.
        avg = total / used_langs if used_langs else 0.0

        fields = {
            'table_cnt': table_id,
            'row_cnt': row_cnt,
            'model': html_display[model],
            'avg': f'{avg:.3f}',
        }
        # The template numbers its command slots cmd1..cmdN and
        # eval_cmd1..eval_cmdN following the order of `languages`.
        for position, (code, key) in enumerate(keys.items(), start=1):
            fields[code] = f'{scores[code]:.3f}'
            fields[f'cmd{position}'] = f'{commands[key]}'
            fields[f'eval_cmd{position}'] = f'{eval_commands[key][metric]}'

        row = Template(row_template).substitute(fields)
        # Missing scores format as "0.000"; show them as "--". This is a plain
        # text replacement over the whole rendered row.
        html_rows.append(row.replace("0.000", "--"))
    return html_rows
def print_results(table, metric, split):
    """Print a model-by-language grid of scores for one metric and split.

    Args:
        table: nested mapping ``table['<model>.<lang>'][split][metric]`` -> score
        metric: metric name to display
        split: split name to display
    """
    print(f'Metric = {metric}, Split = {split}')

    # Header row: left padding followed by one column per language code.
    header = ''.join(f'{lang[0]:3} ' for lang in languages)
    print(' ' * 35 + header)

    for model in models:
        cells = ''.join(
            f'{table[f"{model}.{lang[0]}"][split][metric]:7.3f}' for lang in languages
        )
        print(f'{model:33}' + cells)
    print('')
def extract_topic_fn_from_cmd(cmd):
    """Return the token that follows ``--topics`` in a command string.

    Args:
        cmd: command line as a single whitespace-separated string
    Returns:
        The argument given to ``--topics``.
    Raises:
        ValueError: if ``--topics`` does not appear as a token.
        IndexError: if ``--topics`` is the last token.
    """
    tokens = cmd.split()
    return tokens[tokens.index('--topics') + 1]
def generate_report(args):
    """Write the MIRACL HTML report built from the scores recorded in miracl.yaml.

    No experiment is run: the tables are filled with the expected scores from
    the YAML file, and the displayed commands come from its command templates.

    Args:
        args: parsed arguments; ``args.directory`` prefixes run file names and
            ``args.output`` is the path of the HTML file to write.
    Raises:
        ValueError: if a command template lacks one of the placeholders that
            is about to be substituted, or has no ``--hits`` / ``--k`` token.
    """
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    commands = defaultdict(lambda: '')
    eval_commands = defaultdict(lambda: defaultdict(lambda: ''))

    html_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html.template'))
    table_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html_table.template'))
    row_template = read_file(pkg_resources.resource_filename(__name__, 'miracl_html_table_row.template'))

    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        eval_key = condition['eval_key']
        cmd_template = condition['command']
        cmd_lst = cmd_template.split()
        lang = name.split('.')[-1]
        is_hybrid_run = 'hybrid' in name

        for splits in condition['splits']:
            split = splits['split']
            # Hybrid (fusion) commands carry their depth as --k, the others as --hits.
            if is_hybrid_run:
                hits = int(cmd_lst[cmd_lst.index('--k') + 1])
            else:
                hits = int(cmd_lst[cmd_lst.index('--hits') + 1])

            # NOTE(review): run_conditions names run files '...{split}.top{hits}.txt',
            # while this name has no '.top{hits}' part -- confirm which is intended.
            runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.txt')
            if is_hybrid_run:
                bm25_output = os.path.join(args.directory,
                                           f'run.miracl.bm25.{lang}.{split}.top{hits}.txt')
                mdpr_output = os.path.join(args.directory,
                                           f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt')
                expected_args = dict(output=runfile, bm25_output=bm25_output, mdpr_output=mdpr_output)
            else:
                expected_args = dict(split=split, output=runfile)

            # Every value about to be substituted must appear in the template
            # as either $name or ${name}.
            if not all(f"${k}" in cmd_template or f"${{{k}}}" in cmd_template for k in expected_args):
                raise ValueError(f"Not all arguements {list(expected_args)} detected from inputs: {cmd_template}.")
            cmd = Template(cmd_template).substitute(**expected_args)
            # Overwritten on each split, so the last split listed is the one displayed.
            commands[name] = format_run_command(cmd)

            for expected in splits['scores']:
                for metric in expected:
                    if str(expected[metric])[-1] == "5":
                        # Without an epsilon the f-string formatting may round a
                        # trailing 5 down rather than up (e.g. 0.8885 -> 0.888
                        # instead of 0.889), so nudge the expected score up.
                        expected[metric] += 1e-5
                    table[name][split][metric] = expected[metric]

                    eval_cmd = f'python -m pyserini.eval.trec_eval ' + \
                               f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}'
                    eval_commands[name][metric] = format_eval_command(eval_cmd)

    tables_html = []
    split = 'dev'

    # Build the table for nDCG@10, dev queries
    html_rows = generate_table_rows(table, row_template, commands, eval_commands, 1, split, 'nDCG@10')
    all_rows = '\n'.join(html_rows)
    tables_html.append(Template(table_template).substitute(desc=f'nDCG@10, {split} queries', rows=all_rows))

    # Build the table for R@100, dev queries
    html_rows = generate_table_rows(table, row_template, commands, eval_commands, 2, split, 'R@100')
    all_rows = '\n'.join(html_rows)
    tables_html.append(Template(table_template).substitute(desc=f'Recall@100, {split} queries', rows=all_rows))

    with open(args.output, 'w') as out:
        out.write(Template(html_template).substitute(title='MIRACL', tables=' '.join(tables_html)))
def run_conditions(args):
    """Run the selected MIRACL conditions, evaluate them and print a summary.

    For every condition in miracl.yaml that matches the selection, the run
    command is executed unless its run file already exists or ``--dry-run``
    is set. Each expected score is then compared with the score obtained from
    trec_eval unless ``--skip-eval`` is set.

    Args:
        args: parsed arguments; reads ``condition``, ``language``, ``all``,
            ``directory``, ``dry_run``, ``skip_eval`` and ``display_commands``.
    """
    if args.condition == 'mdpr-tied-pft-msmarco-ft-miracl' and args.language in ['de', 'yo']:
        print('MIRACL de and yo datasets do not have train splits to finetune with')
        return

    start = time.time()
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

    with open(pkg_resources.resource_filename(__name__, 'miracl.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        encoder = name.split('.')[0]
        lang = name.split('.')[-1]

        # --all selects everything; otherwise the model must match and, when a
        # language is given, the language must match too.
        if args.all:
            pass
        elif args.condition != encoder:
            continue
        elif args.language and args.language != lang:
            continue

        eval_key = condition['eval_key']
        cmd_template = condition['command']
        cmd_lst = cmd_template.split()

        print(f'condition {name}:')
        is_hybrid_run = 'hybrid' in name

        for splits in condition['splits']:
            split = splits['split']
            # Hybrid (fusion) commands carry their depth as --k, the others as --hits.
            if is_hybrid_run:
                hits = int(cmd_lst[cmd_lst.index('--k') + 1])
            else:
                hits = int(cmd_lst[cmd_lst.index('--hits') + 1])

            print(f' - split: {split}')

            runfile = os.path.join(args.directory, f'run.miracl.{name}.{split}.top{hits}.txt')
            if is_hybrid_run:
                # A hybrid run fuses two existing run files; skip the split if
                # either input has not been produced yet.
                bm25_output = os.path.join(args.directory,
                                           f'run.miracl.bm25.{lang}.{split}.top{hits}.txt')
                mdpr_output = os.path.join(args.directory,
                                           f'run.miracl.mdpr-tied-pft-msmarco.{lang}.{split}.top{hits}.txt')
                if not os.path.exists(bm25_output):
                    print(f'Missing BM25 file: {bm25_output}')
                    continue
                if not os.path.exists(mdpr_output):
                    print(f'Missing mDPR file: {mdpr_output}')
                    continue
                cmd = Template(cmd_template).substitute(split=split, output=runfile, bm25_output=bm25_output,
                                                        mdpr_output=mdpr_output)
            else:
                cmd = Template(cmd_template).substitute(split=split, output=runfile)

            # In the yaml file, the topics are written as something like '--topics miracl-v1.0-ar-${split}'
            # This works for the dev split because the topics are directly included in Anserini/Pyserini.
            # For the training split, we have to map the symbol into a file in tools/topics-and-qrels/
            # Here, we assume that the developer has cloned the miracl repo and placed the topics there.
            if split == 'train':
                cmd = cmd.replace(f'--topics miracl-v1.0-{lang}-{split}',
                                  f'--topics tools/topics-and-qrels/topics.miracl-v1.0-{lang}-{split}.tsv')

            if args.display_commands:
                print(f'\n```bash\n{format_run_command(cmd)}\n```\n')

            if not os.path.exists(runfile):
                if not args.dry_run:
                    rtn = subprocess.run(cmd.split(), capture_output=True)
                    stderr = rtn.stderr.decode()
                    if '--topics' in cmd:
                        topic_fn = extract_topic_fn_from_cmd(cmd)
                        if f'ValueError: Topic {topic_fn} Not Found' in stderr:
                            print(f'Skipping {topic_fn}: file not found.')
                            continue

            for expected in splits['scores']:
                for metric in expected:
                    if args.skip_eval:
                        table[name][split][metric] = expected[metric]
                        continue

                    # We have to translate the training qrels into a file located in tools/topics-and-qrels/
                    # because they are not included with Anserini/Pyserini by default.
                    # Here, we assume that the developer has cloned the miracl repo and placed the qrels there.
                    if split == 'train':
                        qrels = f'tools/topics-and-qrels/qrels.{eval_key}-train.tsv'
                    else:
                        qrels = f'{eval_key}-{split}'
                    score = float(run_eval_and_return_metric(metric, qrels,
                                                             trec_eval_metric_definitions[metric], runfile))
                    target = float(expected[metric])

                    # Known flaky cases, accepted within a looser absolute
                    # tolerance. The first entry applies to every metric.
                    is_known_flaky = (
                        (name == 'mdpr-tied-pft-msmarco.hi' and split == 'train')
                        or (name == 'mdpr-tied-pft-msmarco-ft-all.ru'
                            and split == 'dev' and metric == 'nDCG@10')
                        or (name == 'bm25-mdpr-tied-pft-msmarco-hybrid.te'
                            and split == 'train' and metric == 'nDCG@10')
                        or (name == 'bm25-mdpr-tied-pft-msmarco-hybrid.zh'
                            and split == 'dev' and metric == 'nDCG@10')
                    )

                    if math.isclose(score, target):
                        result_str = ok_str
                    elif is_known_flaky and math.isclose(score, target, abs_tol=2e-4):
                        result_str = okish_str
                    else:
                        result_str = fail_str + f' expected {expected[metric]:.4f}'
                    print(f' {metric:7}: {score:.4f} {result_str}')
                    table[name][split][metric] = score

        print('')

    for metric in ['nDCG@10', 'R@100']:
        for split in ['dev', 'train']:
            print_results(table, metric, split)

    end = time.time()
    print(f'Total elapsed time: {end - start:.0f}s')
# Command-line entry point: list conditions, generate the HTML report, or run
# and evaluate the selected conditions.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate regression matrix for MIRACL.')
    parser.add_argument('--condition', type=str,
                        help='Condition to run', required=False)
    # To list all conditions
    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
    # For generating reports
    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
    parser.add_argument('--output', type=str, help='File to store report.', required=False)
    # For actually running the experimental conditions
    parser.add_argument('--all', action='store_true', default=False, help='Run using all languages.')
    parser.add_argument('--language', type=str, help='Language to run.', required=False)
    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')

    args = parser.parse_args()

    if args.list_conditions:
        list_conditions()
        sys.exit()

    if args.generate_report:
        if not args.output:
            print('Must specify report filename with --output.')
            sys.exit()

        generate_report(args)
        sys.exit()

    # --all is mutually exclusive with an explicit condition or language.
    if args.all and (args.condition or args.language):
        print('Specifying --all will run all conditions and languages')
        sys.exit()

    run_conditions(args)

pyserini/2cr/miracl.yaml ADDED Viewed

	@@ -0,0 +1,1180 @@

+conditions:
+ # BM25
+ - name: bm25.ar
+ eval_key: miracl-v1.0-ar
+ command: python -m pyserini.search.lucene --language ar --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4434
+ R@100: 0.8562
+ - split: dev
+ scores:
+ - nDCG@10: 0.4809
+ R@100: 0.8885
+ - name: bm25.bn
+ eval_key: miracl-v1.0-bn
+ command: python -m pyserini.search.lucene --language bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5122
+ R@100: 0.8934
+ - split: dev
+ scores:
+ - nDCG@10: 0.5079
+ R@100: 0.9088
+ - name: bm25.en
+ eval_key: miracl-v1.0-en
+ command: python -m pyserini.search.lucene --language en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3415
+ R@100: 0.7928
+ - split: dev
+ scores:
+ - nDCG@10: 0.3506
+ R@100: 0.8190
+ - name: bm25.es
+ eval_key: miracl-v1.0-es
+ command: python -m pyserini.search.lucene --language es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3030
+ R@100: 0.7020
+ - split: dev
+ scores:
+ - nDCG@10: 0.3193
+ R@100: 0.7018
+ - name: bm25.fa
+ eval_key: miracl-v1.0-fa
+ command: python -m pyserini.search.lucene --language fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3270
+ R@100: 0.7139
+ - split: dev
+ scores:
+ - nDCG@10: 0.3334
+ R@100: 0.7306
+ - name: bm25.fi
+ eval_key: miracl-v1.0-fi
+ command: python -m pyserini.search.lucene --language fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5106
+ R@100: 0.8471
+ - split: dev
+ scores:
+ - nDCG@10: 0.5513
+ R@100: 0.8910
+ - name: bm25.fr
+ eval_key: miracl-v1.0-fr
+ command: python -m pyserini.search.lucene --language fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2152
+ R@100: 0.6601
+ - split: dev
+ scores:
+ - nDCG@10: 0.1832
+ R@100: 0.6528
+ - name: bm25.hi
+ eval_key: miracl-v1.0-hi
+ command: python -m pyserini.search.lucene --language hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4745
+ R@100: 0.9016
+ - split: dev
+ scores:
+ - nDCG@10: 0.4578
+ R@100: 0.8679
+ - name: bm25.id
+ eval_key: miracl-v1.0-id
+ command: python -m pyserini.search.lucene --language id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4844
+ R@100: 0.9234
+ - split: dev
+ scores:
+ - nDCG@10: 0.4486
+ R@100: 0.9041
+ - name: bm25.ja
+ eval_key: miracl-v1.0-ja
+ command: python -m pyserini.search.lucene --language ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3796
+ R@100: 0.8225
+ - split: dev
+ scores:
+ - nDCG@10: 0.3689
+ R@100: 0.8048
+ - name: bm25.ko
+ eval_key: miracl-v1.0-ko
+ command: python -m pyserini.search.lucene --language ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4279
+ R@100: 0.7572
+ - split: dev
+ scores:
+ - nDCG@10: 0.4190
+ R@100: 0.7831
+ - name: bm25.ru
+ eval_key: miracl-v1.0-ru
+ command: python -m pyserini.search.lucene --language ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3153
+ R@100: 0.6464
+ - split: dev
+ scores:
+ - nDCG@10: 0.3342
+ R@100: 0.6614
+ - name: bm25.sw
+ eval_key: miracl-v1.0-sw
+ command: python -m pyserini.search.lucene --language sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3356
+ R@100: 0.6499
+ - split: dev
+ scores:
+ - nDCG@10: 0.3826
+ R@100: 0.7008
+ - name: bm25.te
+ eval_key: miracl-v1.0-te
+ command: python -m pyserini.search.lucene --language te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4814
+ R@100: 0.8077
+ - split: dev
+ scores:
+ - nDCG@10: 0.4942
+ R@100: 0.8307
+ - name: bm25.th
+ eval_key: miracl-v1.0-th
+ command: python -m pyserini.search.lucene --language th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4629
+ R@100: 0.8768
+ - split: dev
+ scores:
+ - nDCG@10: 0.4838
+ R@100: 0.8874
+ - name: bm25.zh
+ eval_key: miracl-v1.0-zh
+ command: python -m pyserini.search.lucene --language zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2018
+ R@100: 0.5541
+ - split: dev
+ scores:
+ - nDCG@10: 0.1801
+ R@100: 0.5599
+ - name: bm25.de
+ eval_key: miracl-v1.0-de
+ command: python -m pyserini.search.lucene --language de --topics miracl-v1.0-de-${split} --index miracl-v1.0-de --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.2262
+ R@100: 0.5724
+ - name: bm25.yo
+ eval_key: miracl-v1.0-yo
+ command: python -m pyserini.search.lucene --pretokenized --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo --output $output --batch 128 --threads 16 --bm25 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.4059
+ R@100: 0.7325
+ # mdpr-tied-pft-msmarco
+ - name: mdpr-tied-pft-msmarco.ar
+ eval_key: miracl-v1.0-ar
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4653
+ R@100: 0.8293
+ - split: dev
+ scores:
+ - nDCG@10: 0.4993
+ R@100: 0.8407
+ - name: mdpr-tied-pft-msmarco.bn
+ eval_key: miracl-v1.0-bn
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4362
+ R@100: 0.8045
+ - split: dev
+ scores:
+ - nDCG@10: 0.4427
+ R@100: 0.8193
+ - name: mdpr-tied-pft-msmarco.en
+ eval_key: miracl-v1.0-en
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3986
+ R@100: 0.7779
+ - split: dev
+ scores:
+ - nDCG@10: 0.3938
+ R@100: 0.7675
+ - name: mdpr-tied-pft-msmarco.es
+ eval_key: miracl-v1.0-es
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4637
+ R@100: 0.8654
+ - split: dev
+ scores:
+ - nDCG@10: 0.4777
+ R@100: 0.8643
+ - name: mdpr-tied-pft-msmarco.fa
+ eval_key: miracl-v1.0-fa
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4882
+ R@100: 0.9092
+ - split: dev
+ scores:
+ - nDCG@10: 0.4800
+ R@100: 0.8980
+ - name: mdpr-tied-pft-msmarco.fi
+ eval_key: miracl-v1.0-fi
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4426
+ R@100: 0.7611
+ - split: dev
+ scores:
+ - nDCG@10: 0.4721
+ R@100: 0.7877
+ - name: mdpr-tied-pft-msmarco.fr
+ eval_key: miracl-v1.0-fr
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4372
+ R@100: 0.9268
+ - split: dev
+ scores:
+ - nDCG@10: 0.4352
+ R@100: 0.9154
+ - name: mdpr-tied-pft-msmarco.hi
+ eval_key: miracl-v1.0-hi
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3685
+ R@100: 0.7780
+ - split: dev
+ scores:
+ - nDCG@10: 0.3830
+ R@100: 0.7755
+ - name: mdpr-tied-pft-msmarco.id
+ eval_key: miracl-v1.0-id
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2549
+ R@100: 0.5610
+ - split: dev
+ scores:
+ - nDCG@10: 0.2719
+ R@100: 0.5734
+ - name: mdpr-tied-pft-msmarco.ja
+ eval_key: miracl-v1.0-ja
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4342
+ R@100: 0.8211
+ - split: dev
+ scores:
+ - nDCG@10: 0.4390
+ R@100: 0.8254
+ - name: mdpr-tied-pft-msmarco.ko
+ eval_key: miracl-v1.0-ko
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4147
+ R@100: 0.7699
+ - split: dev
+ scores:
+ - nDCG@10: 0.4189
+ R@100: 0.7369
+ - name: mdpr-tied-pft-msmarco.ru
+ eval_key: miracl-v1.0-ru
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3812
+ R@100: 0.7854
+ - split: dev
+ scores:
+ - nDCG@10: 0.4073
+ R@100: 0.7972
+ - name: mdpr-tied-pft-msmarco.sw
+ eval_key: miracl-v1.0-sw
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2973
+ R@100: 0.5761
+ - split: dev
+ scores:
+ - nDCG@10: 0.2990
+ R@100: 0.6158
+ - name: mdpr-tied-pft-msmarco.te
+ eval_key: miracl-v1.0-te
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3723
+ R@100: 0.7698
+ - split: dev
+ scores:
+ - nDCG@10: 0.3557
+ R@100: 0.7619
+ - name: mdpr-tied-pft-msmarco.th
+ eval_key: miracl-v1.0-th
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3451
+ R@100: 0.6728
+ - split: dev
+ scores:
+ - nDCG@10: 0.3578
+ R@100: 0.6783
+ - name: mdpr-tied-pft-msmarco.zh
+ eval_key: miracl-v1.0-zh
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5040
+ R@100: 0.9355
+ - split: dev
+ scores:
+ - nDCG@10: 0.5116
+ R@100: 0.9436
+ - name: mdpr-tied-pft-msmarco.de
+ eval_key: miracl-v1.0-de
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.4895
+ R@100: 0.8983
+ - name: mdpr-tied-pft-msmarco.yo
+ eval_key: miracl-v1.0-yo
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.4439
+ R@100: 0.8403
+ # mdpr-tied-pft-msmarco-ft-all
+ - name: mdpr-tied-pft-msmarco-ft-all.ar
+ eval_key: miracl-v1.0-ar
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6954
+ R@100: 0.8542
+ - split: dev
+ scores:
+ - nDCG@10: 0.5782
+ R@100: 0.7953
+ - name: mdpr-tied-pft-msmarco-ft-all.bn
+ eval_key: miracl-v1.0-bn
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6823
+ R@100: 0.8646
+ - split: dev
+ scores:
+ - nDCG@10: 0.5804
+ R@100: 0.8480
+ - name: mdpr-tied-pft-msmarco-ft-all.en
+ eval_key: miracl-v1.0-en
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3491
+ R@100: 0.5678
+ - split: dev
+ scores:
+ - nDCG@10: 0.2813
+ R@100: 0.5083
+ - name: mdpr-tied-pft-msmarco-ft-all.es
+ eval_key: miracl-v1.0-es
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2488
+ R@100: 0.4799
+ - split: dev
+ scores:
+ - nDCG@10: 0.2509
+ R@100: 0.4706
+ - name: mdpr-tied-pft-msmarco-ft-all.fa
+ eval_key: miracl-v1.0-fa
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3809
+ R@100: 0.6899
+ - split: dev
+ scores:
+ - nDCG@10: 0.3836
+ R@100: 0.6863
+ - name: mdpr-tied-pft-msmarco-ft-all.fi
+ eval_key: miracl-v1.0-fi
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.7738
+ R@100: 0.9081
+ - split: dev
+ scores:
+ - nDCG@10: 0.5694
+ R@100: 0.7984
+ - name: mdpr-tied-pft-msmarco-ft-all.fr
+ eval_key: miracl-v1.0-fr
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2989
+ R@100: 0.6197
+ - split: dev
+ scores:
+ - nDCG@10: 0.3010
+ R@100: 0.6005
+ - name: mdpr-tied-pft-msmarco-ft-all.hi
+ eval_key: miracl-v1.0-hi
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3336
+ R@100: 0.6388
+ - split: dev
+ scores:
+ - nDCG@10: 0.3286
+ R@100: 0.6371
+ - name: mdpr-tied-pft-msmarco-ft-all.id
+ eval_key: miracl-v1.0-id
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3321
+ R@100: 0.5492
+ - split: dev
+ scores:
+ - nDCG@10: 0.3462
+ R@100: 0.5841
+ - name: mdpr-tied-pft-msmarco-ft-all.ja
+ eval_key: miracl-v1.0-ja
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6378
+ R@100: 0.7950
+ - split: dev
+ scores:
+ - nDCG@10: 0.4999
+ R@100: 0.7451
+ - name: mdpr-tied-pft-msmarco-ft-all.ko
+ eval_key: miracl-v1.0-ko
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5795
+ R@100: 0.7850
+ - split: dev
+ scores:
+ - nDCG@10: 0.4864
+ R@100: 0.7183
+ - name: mdpr-tied-pft-msmarco-ft-all.ru
+ eval_key: miracl-v1.0-ru
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6011
+ R@100: 0.8188
+ - split: dev
+ scores:
+ - nDCG@10: 0.3933
+ R@100: 0.6707
+ - name: mdpr-tied-pft-msmarco-ft-all.sw
+ eval_key: miracl-v1.0-sw
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.8882
+ R@100: 0.9710
+ - split: dev
+ scores:
+ - nDCG@10: 0.6575
+ R@100: 0.8883
+ - name: mdpr-tied-pft-msmarco-ft-all.te
+ eval_key: miracl-v1.0-te
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.8757
+ R@100: 0.9725
+ - split: dev
+ scores:
+ - nDCG@10: 0.7783
+ R@100: 0.9513
+ - name: mdpr-tied-pft-msmarco-ft-all.th
+ eval_key: miracl-v1.0-th
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.7761
+ R@100: 0.9241
+ - split: dev
+ scores:
+ - nDCG@10: 0.5975
+ R@100: 0.8360
+ - name: mdpr-tied-pft-msmarco-ft-all.zh
+ eval_key: miracl-v1.0-zh
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3446
+ R@100: 0.6608
+ - split: dev
+ scores:
+ - nDCG@10: 0.3575
+ R@100: 0.6725
+ - name: mdpr-tied-pft-msmarco-ft-all.de
+ eval_key: miracl-v1.0-de
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.3219
+ R@100: 0.5990
+ - name: mdpr-tied-pft-msmarco-ft-all.yo
+ eval_key: miracl-v1.0-yo
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mdpr-tied-pft-msmarco-ft-all --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5983
+ R@100: 0.8908
+ # bm25-mdpr-tied-pft-msmarco-hybrid
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ar
+ eval_key: miracl-v1.0-ar
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6259
+ R@100: 0.9173
+ - split: dev
+ scores:
+ - nDCG@10: 0.6729
+ R@100: 0.9405
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.bn
+ eval_key: miracl-v1.0-bn
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6587
+ R@100: 0.9297
+ - split: dev
+ scores:
+ - nDCG@10: 0.6540
+ R@100: 0.9321
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.en
+ eval_key: miracl-v1.0-en
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5347
+ R@100: 0.8772
+ - split: dev
+ scores:
+ - nDCG@10: 0.5488
+ R@100: 0.8815
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.es
+ eval_key: miracl-v1.0-es
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6234
+ R@100: 0.9425
+ - split: dev
+ scores:
+ - nDCG@10: 0.6413
+ R@100: 0.9479
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.fa
+ eval_key: miracl-v1.0-fa
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5890
+ R@100: 0.9433
+ - split: dev
+ scores:
+ - nDCG@10: 0.5935
+ R@100: 0.9374
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.fi
+ eval_key: miracl-v1.0-fi
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6164
+ R@100: 0.8506
+ - split: dev
+ scores:
+ - nDCG@10: 0.6716
+ R@100: 0.8949
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.fr
+ eval_key: miracl-v1.0-fr
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5299
+ R@100: 0.9709
+ - split: dev
+ scores:
+ - nDCG@10: 0.5233
+ R@100: 0.9647
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.hi
+ eval_key: miracl-v1.0-hi
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6217
+ R@100: 0.9059
+ - split: dev
+ scores:
+ - nDCG@10: 0.6157
+ R@100: 0.9115
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.id
+ eval_key: miracl-v1.0-id
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4442
+ R@100: 0.7595
+ - split: dev
+ scores:
+ - nDCG@10: 0.4433
+ R@100: 0.7683
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ja
+ eval_key: miracl-v1.0-ja
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5795
+ R@100: 0.9082
+ - split: dev
+ scores:
+ - nDCG@10: 0.5757
+ R@100: 0.9036
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ko
+ eval_key: miracl-v1.0-ko
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5758
+ R@100: 0.8744
+ - split: dev
+ scores:
+ - nDCG@10: 0.6086
+ R@100: 0.8997
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.ru
+ eval_key: miracl-v1.0-ru
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4921
+ R@100: 0.8494
+ - split: dev
+ scores:
+ - nDCG@10: 0.5323
+ R@100: 0.8738
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.sw
+ eval_key: miracl-v1.0-sw
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4100
+ R@100: 0.6987
+ - split: dev
+ scores:
+ - nDCG@10: 0.4457
+ R@100: 0.7254
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.te
+ eval_key: miracl-v1.0-te
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.6000
+ R@100: 0.8717
+ - split: dev
+ scores:
+ - nDCG@10: 0.6021
+ R@100: 0.8569
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.th
+ eval_key: miracl-v1.0-th
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5669
+ R@100: 0.8195
+ - split: dev
+ scores:
+ - nDCG@10: 0.5990
+ R@100: 0.8228
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.zh
+ eval_key: miracl-v1.0-zh
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5209
+ R@100: 0.9576
+ - split: dev
+ scores:
+ - nDCG@10: 0.5254
+ R@100: 0.9587
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.de
+ eval_key: miracl-v1.0-de
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5643
+ R@100: 0.9482
+ - name: bm25-mdpr-tied-pft-msmarco-hybrid.yo
+ eval_key: miracl-v1.0-yo
+ command: python -m pyserini.fusion --runs ${bm25_output} ${mdpr_output} --output ${output} --method interpolation --alpha 0.5 --depth 1000 --k 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.6114
+ R@100: 0.9496
+ # mdpr-tied-pft-msmarco-ft-miracl
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ar
+ eval_key: miracl-v1.0-ar
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ar --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mdpr-tied-pft-msmarco-ft-miracl-ar --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.7252
+ R@100: 0.9489
+ - name: mdpr-tied-pft-msmarco-ft-miracl.bn
+ eval_key: miracl-v1.0-bn
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-bn --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mdpr-tied-pft-msmarco-ft-miracl-bn --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.6842
+ R@100: 0.9547
+ - name: mdpr-tied-pft-msmarco-ft-miracl.en
+ eval_key: miracl-v1.0-en
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-en --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mdpr-tied-pft-msmarco-ft-miracl-en --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.4878
+ R@100: 0.8341
+ - name: mdpr-tied-pft-msmarco-ft-miracl.es
+ eval_key: miracl-v1.0-es
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-es --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mdpr-tied-pft-msmarco-ft-miracl-es --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5648
+ R@100: 0.9109
+ - name: mdpr-tied-pft-msmarco-ft-miracl.fa
+ eval_key: miracl-v1.0-fa
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fa --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mdpr-tied-pft-msmarco-ft-miracl-fa --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5934
+ R@100: 0.9133
+ - name: mdpr-tied-pft-msmarco-ft-miracl.fi
+ eval_key: miracl-v1.0-fi
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fi --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mdpr-tied-pft-msmarco-ft-miracl-fi --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.7139
+ R@100: 0.9479
+ - name: mdpr-tied-pft-msmarco-ft-miracl.fr
+ eval_key: miracl-v1.0-fr
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-fr --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mdpr-tied-pft-msmarco-ft-miracl-fr --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5893
+ R@100: 0.9537
+ - name: mdpr-tied-pft-msmarco-ft-miracl.hi
+ eval_key: miracl-v1.0-hi
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-hi --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mdpr-tied-pft-msmarco-ft-miracl-hi --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5164
+ R@100: 0.8862
+ - name: mdpr-tied-pft-msmarco-ft-miracl.id
+ eval_key: miracl-v1.0-id
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-id --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mdpr-tied-pft-msmarco-ft-miracl-id --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.4959
+ R@100: 0.8642
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ja
+ eval_key: miracl-v1.0-ja
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ja --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mdpr-tied-pft-msmarco-ft-miracl-ja --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.6416
+ R@100: 0.9225
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ko
+ eval_key: miracl-v1.0-ko
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ko --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mdpr-tied-pft-msmarco-ft-miracl-ko --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5901
+ R@100: 0.8857
+ - name: mdpr-tied-pft-msmarco-ft-miracl.ru
+ eval_key: miracl-v1.0-ru
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-ru --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mdpr-tied-pft-msmarco-ft-miracl-ru --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.5974
+ R@100: 0.9099
+ - name: mdpr-tied-pft-msmarco-ft-miracl.sw
+ eval_key: miracl-v1.0-sw
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-sw --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mdpr-tied-pft-msmarco-ft-miracl-sw --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.6853
+ R@100: 0.9367
+ - name: mdpr-tied-pft-msmarco-ft-miracl.te
+ eval_key: miracl-v1.0-te
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-te --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mdpr-tied-pft-msmarco-ft-miracl-te --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.8037
+ R@100: 0.9616
+ - name: mdpr-tied-pft-msmarco-ft-miracl.th
+ eval_key: miracl-v1.0-th
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-th --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mdpr-tied-pft-msmarco-ft-miracl-th --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.6951
+ R@100: 0.9311
+ - name: mdpr-tied-pft-msmarco-ft-miracl.zh
+ eval_key: miracl-v1.0-zh
+ command: python -m pyserini.search.faiss --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-miracl-zh --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mdpr-tied-pft-msmarco-ft-miracl-zh --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.6500
+ R@100: 0.9631
+ # mcontriever
+ - name: mcontriever-tied-pft-msmarco.ar
+ eval_key: miracl-v1.0-ar
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ar-${split} --index miracl-v1.0-ar-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5027
+ R@100: 0.9166
+ - split: dev
+ scores:
+ - nDCG@10: 0.5248
+ R@100: 0.9253
+ - name: mcontriever-tied-pft-msmarco.bn
+ eval_key: miracl-v1.0-bn
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-bn-${split} --index miracl-v1.0-bn-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5138
+ R@100: 0.9313
+ - split: dev
+ scores:
+ - nDCG@10: 0.5011
+ R@100: 0.9205
+ - name: mcontriever-tied-pft-msmarco.en
+ eval_key: miracl-v1.0-en
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-en-${split} --index miracl-v1.0-en-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3579
+ R@100: 0.7990
+ - split: dev
+ scores:
+ - nDCG@10: 0.3637
+ R@100: 0.7967
+ - name: mcontriever-tied-pft-msmarco.es
+ eval_key: miracl-v1.0-es
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-es-${split} --index miracl-v1.0-es-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4081
+ R@100: 0.8339
+ - split: dev
+ scores:
+ - nDCG@10: 0.4184
+ R@100: 0.8411
+ - name: mcontriever-tied-pft-msmarco.fa
+ eval_key: miracl-v1.0-fa
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fa-${split} --index miracl-v1.0-fa-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2263
+ R@100: 0.6374
+ - split: dev
+ scores:
+ - nDCG@10: 0.2152
+ R@100: 0.6540
+ - name: mcontriever-tied-pft-msmarco.fi
+ eval_key: miracl-v1.0-fi
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fi-${split} --index miracl-v1.0-fi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5680
+ R@100: 0.9369
+ - split: dev
+ scores:
+ - nDCG@10: 0.6019
+ R@100: 0.9527
+ - name: mcontriever-tied-pft-msmarco.fr
+ eval_key: miracl-v1.0-fr
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-fr-${split} --index miracl-v1.0-fr-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3332
+ R@100: 0.8341
+ - split: dev
+ scores:
+ - nDCG@10: 0.3140
+ R@100: 0.8243
+ - name: mcontriever-tied-pft-msmarco.hi
+ eval_key: miracl-v1.0-hi
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-hi-${split} --index miracl-v1.0-hi-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.2886
+ R@100: 0.6664
+ - split: dev
+ scores:
+ - nDCG@10: 0.2864
+ R@100: 0.6461
+ - name: mcontriever-tied-pft-msmarco.id
+ eval_key: miracl-v1.0-id
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-id-${split} --index miracl-v1.0-id-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3748
+ R@100: 0.7955
+ - split: dev
+ scores:
+ - nDCG@10: 0.3915
+ R@100: 0.8015
+ - name: mcontriever-tied-pft-msmarco.ja
+ eval_key: miracl-v1.0-ja
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ja-${split} --index miracl-v1.0-ja-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4402
+ R@100: 0.8813
+ - split: dev
+ scores:
+ - nDCG@10: 0.4240
+ R@100: 0.8783
+ - name: mcontriever-tied-pft-msmarco.ko
+ eval_key: miracl-v1.0-ko
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ko-${split} --index miracl-v1.0-ko-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4799
+ R@100: 0.8672
+ - split: dev
+ scores:
+ - nDCG@10: 0.4829
+ R@100: 0.8753
+ - name: mcontriever-tied-pft-msmarco.ru
+ eval_key: miracl-v1.0-ru
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-ru-${split} --index miracl-v1.0-ru-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.3811
+ R@100: 0.8369
+ - split: dev
+ scores:
+ - nDCG@10: 0.3913
+ R@100: 0.8500
+ - name: mcontriever-tied-pft-msmarco.sw
+ eval_key: miracl-v1.0-sw
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-sw-${split} --index miracl-v1.0-sw-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5568
+ R@100: 0.9130
+ - split: dev
+ scores:
+ - nDCG@10: 0.5600
+ R@100: 0.9108
+ - name: mcontriever-tied-pft-msmarco.te
+ eval_key: miracl-v1.0-te
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-te-${split} --index miracl-v1.0-te-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5260
+ R@100: 0.9457
+ - split: dev
+ scores:
+ - nDCG@10: 0.5283
+ R@100: 0.9612
+ - name: mcontriever-tied-pft-msmarco.th
+ eval_key: miracl-v1.0-th
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-th-${split} --index miracl-v1.0-th-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.5299
+ R@100: 0.9361
+ - split: dev
+ scores:
+ - nDCG@10: 0.5173
+ R@100: 0.9361
+ - name: mcontriever-tied-pft-msmarco.zh
+ eval_key: miracl-v1.0-zh
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-zh-${split} --index miracl-v1.0-zh-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: train
+ scores:
+ - nDCG@10: 0.4283
+ R@100: 0.8745
+ - split: dev
+ scores:
+ - nDCG@10: 0.4097
+ R@100: 0.9026
+ - name: mcontriever-tied-pft-msmarco.de
+ eval_key: miracl-v1.0-de
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-de-${split} --index miracl-v1.0-de-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.4079
+ R@100: 0.8407
+ - name: mcontriever-tied-pft-msmarco.yo
+ eval_key: miracl-v1.0-yo
+ command: python -m pyserini.search.faiss --encoder-class contriever --encoder facebook/mcontriever-msmarco --topics miracl-v1.0-yo-${split} --index miracl-v1.0-yo-mcontriever-pft-msmarco --output $output --batch 128 --threads 16 --hits 1000
+ splits:
+ - split: dev
+ scores:
+ - nDCG@10: 0.4150
+ R@100: 0.7703

pyserini/2cr/miracl_html.template ADDED Viewed

	@@ -0,0 +1,256 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
+ <title>Pyserini Reproductions</title>
+ <!-- Font Awesome -->
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+ <!-- Google Fonts Roboto -->
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+ <!-- MDB -->
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+ <style>
+tr.hide-table-padding td {
+ padding: 0;
+}
+.expand-button {
+ position: relative;
+}
+.accordion-toggle .expand-button:after {
+ position: absolute;
+ left:.75rem;
+ top: 50%;
+ transform: translate(0, -50%);
+ content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+ content: '+';
+}
+blockquote.mycode {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ margin-top: 15px;
+ padding-left: 15px;
+}
+blockquote.mycode2 {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ padding-top: 10px;
+ padding-bottom: 10px;
+ padding-left: 15px;
+}
+tr th.headertop {
+ border-bottom: none;
+ padding-bottom: 0rem
+}
+tr th.headerbottom {
+ padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+ border-radius: 0;
+ min-width: 55px;
+ background: none repeat scroll 0 0 transparent;
+ background-color: grey;
+ color: #F1F2F3 !important;
+ cursor: pointer;
+ border-style: none;
+ font-family: 'HELVETICA',sans-serif;
+ font-size: 0.8em;
+ font-weight: normal;
+ text-align: center;
+ text-decoration: none;
+ text-indent: 0;
+ text-transform: uppercase;
+ font-weight: 500;
+ line-height: 1.42rem;
+ margin: 0;
+ padding: 3px 8px;
+ position: absolute !important;
+ top: 0 !important;
+ right: 0 !important;
+}
+.copy-code-button > span {
+ color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+ box-sizing: inherit;
+}
+.copy-code-button::before {
+ content: '';
+ display: inline-block;
+ width: 16px;
+ height: 16px;
+ margin-right: 3px;
+ background-size: contain;
+ background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+ background-repeat: no-repeat;
+ position: relative;
+ top: 3px;
+}
+.copy-code-button:focus {
+ /* Avoid an ugly focus outline on click in Chrome,
+ but darken the button for accessibility.
+ See https://stackoverflow.com/a/25298082/1481479 */
+ /* background-color: #E6E6E6; */
+ outline: 0;
+}
+pre[class*="prettyprint"] {
+ position: relative;
+ overflow: hidden;
+}
+ </style>
+</head>
+<body>
+ <!-- Background image -->
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+ <div class="mask" style="
+ background: linear-gradient(
+ 45deg,
+ rgba(29, 236, 197, 0.7),
+ rgba(91, 14, 214, 0.7) 100%
+ );
+ ">
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
+ <div class="text-white">
+ <h1 class="mb-3">$title</h1>
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- Background image -->
+ <div class="container my-4">
+ $tables
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --list-conditions
+</tt></blockquote>
+<p>Run all languages for a specific condition and show commands:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --display-commands
+</tt></blockquote>
+<p>Run a particular language for a specific condition and show commands:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands
+</tt></blockquote>
+<p>Run all languages for all conditions and show commands:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --all --display-commands
+</tt></blockquote>
+<p>With the above commands, run files will be placed in the current directory. Use the option <tt>--directory runs</tt> to place the runs in a sub-directory.</p>
+<p>For a specific condition, just show the commands and do not run:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>For a specific condition and language, just show the commands and do not run:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --condition bm25 --language ko --display-commands --dry-run
+</tt></blockquote>
+<p>For all conditions, just show the commands and do not run and skip evaluation:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --all --display-commands --dry-run --skip-eval
+</tt></blockquote>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.miracl --generate-report --output docs/2cr/miracl.html
+</tt></blockquote>
+<p>The output file <tt>miracl.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+ </div>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
// Give every <pre> block its own "Copy" button. The block's text is stored in
// the button's data-clipboard-text attribute before the button is appended,
// so the button label itself is not part of the copied text.
// NOTE: this file is rendered through Python's string.Template, so the
// script must not contain a literal dollar sign.
document.querySelectorAll('pre').forEach((codeBlock) => {
  const copyButton = document.createElement('button');
  copyButton.className = 'copy-code-button';
  copyButton.type = 'button';
  copyButton.setAttribute('data-clipboard-text', codeBlock.innerText);
  copyButton.innerText = 'Copy';
  codeBlock.classList.add('prettyprint');
  codeBlock.appendChild(copyButton);
});

// Wire the buttons to ClipboardJS and show transient feedback on the button
// that was clicked; the label is restored to "Copy" after two seconds.
var clipboard = new ClipboardJS('.copy-code-button');

clipboard.on('success', (e) => {
  console.info('Action:', e.action);
  console.info('Text:', e.text);
  console.info('Trigger:', e.trigger);
  e.trigger.textContent = 'Copied';
  window.setTimeout(() => {
    e.trigger.textContent = 'Copy';
  }, 2000);
  e.clearSelection();
});

clipboard.on('error', (e) => {
  console.error('Action:', e.action);
  console.error('Trigger:', e.trigger);
  e.trigger.textContent = 'Error Copying';
  window.setTimeout(() => {
    e.trigger.textContent = 'Copy';
  }, 2000);
  e.clearSelection();
});
+</script>
+</body>
+</html>

pyserini/2cr/miracl_html_table.template ADDED Viewed

	@@ -0,0 +1,35 @@

+<div class="table-responsive">
+ <table class="table">
+ <thead>
+ <tr>
+ <th scope="col"></th>
+ <th scope="col">$desc</th>
+ <th scope="col">ar</th>
+ <th scope="col">bn</th>
+ <th scope="col">en</th>
+ <th scope="col">es</th>
+ <th scope="col">fa</th>
+ <th scope="col">fi</th>
+ <th scope="col">fr</th>
+ <th scope="col">hi</th>
+ <th scope="col">id</th>
+ <th scope="col">ja</th>
+ <th scope="col">ko</th>
+ <th scope="col">ru</th>
+ <th scope="col">sw</th>
+ <th scope="col">te</th>
+ <th scope="col">th</th>
+ <th scope="col">zh</th>
+ <th scope="col">de</th>
+ <th scope="col">yo</th>
+ <th scope="col"></th>
+ <th scope="col">avg</th>
+ </tr>
+ </thead>
+ <tbody>
+$rows
+ </tbody>
+ </table>
+</div>

pyserini/2cr/miracl_html_table_row.template ADDED Viewed

	@@ -0,0 +1,336 @@

+<!-- Condition: $model -->
+<tr class="accordion-toggle collapsed" id="table${table_cnt}-row${row_cnt}" data-toggle="collapse" data-parent="#table${table_cnt}-row${row_cnt}" href="#table${table_cnt}-collapse${row_cnt}">
+<td class="expand-button"></td>
+<td>$model</td>
+<td>$ar</td>
+<td>$bn</td>
+<td>$en</td>
+<td>$es</td>
+<td>$fa</td>
+<td>$fi</td>
+<td>$fr</td>
+<td>$hi</td>
+<td>$id</td>
+<td>$ja</td>
+<td>$ko</td>
+<td>$ru</td>
+<td>$sw</td>
+<td>$te</td>
+<td>$th</td>
+<td>$zh</td>
+<td>$de</td>
+<td>$yo</td>
+<td></td>
+<td>$avg</td>
+</tr>
+<tr class="hide-table-padding">
+<td colspan="22">
+<div id="table${table_cnt}-collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="table${table_cnt}-row${row_cnt}-tabs" role="tablist">
+ <li class="nav-item" role="presentation">
+ <a class="nav-link active" id="table${table_cnt}-row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab1" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">ar</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab2" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">bn</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab3" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">en</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab4-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab4" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab4" aria-selected="false" style="text-transform:none">es</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab5-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab5" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab5" aria-selected="false" style="text-transform:none">fa</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab6-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab6" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab6" aria-selected="false" style="text-transform:none">fi</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab7-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab7" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab7" aria-selected="false" style="text-transform:none">fr</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab8-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab8" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab8" aria-selected="false" style="text-transform:none">hi</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab9-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab9" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab9" aria-selected="false" style="text-transform:none">id</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab10-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab10" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab10" aria-selected="false" style="text-transform:none">ja</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab11-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab11" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab11" aria-selected="false" style="text-transform:none">ko</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab12-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab12" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab12" aria-selected="false" style="text-transform:none">ru</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab13-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab13" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab13" aria-selected="false" style="text-transform:none">sw</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab14-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab14" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab14" aria-selected="false" style="text-transform:none">te</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab15-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab15" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab15" aria-selected="false" style="text-transform:none">th</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab16-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab16" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab16" aria-selected="false" style="text-transform:none">zh</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab17-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab17" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab17" aria-selected="false" style="text-transform:none">de</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab18-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab18" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab18" aria-selected="false" style="text-transform:none">yo</a>
+ </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="table${table_cnt}-row${row_cnt}-content">
+ <div class="tab-pane fade show active" id="table${table_cnt}-row${row_cnt}-tab1" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab1">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab2" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab2">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab3" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab3">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab4" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab4">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd4
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd4}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab5" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab5">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd5
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd5}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab6" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab6">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd6
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd6}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab7" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab7">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd7
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd7}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab8" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab8">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd8
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd8}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab9" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab9">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd9
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd9}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab10" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab10">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd10
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd10}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab11" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab11">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd11
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd11}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab12" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab12">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd12
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd12}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab13" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab13">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd13
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd13}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab14" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab14">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd14
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd14}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab15" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab15">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd15
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd15}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab16" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab16">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd16
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd16}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab17" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab17">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd17
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd17}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab18" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab18">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd18
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd18}</code></pre>
+ </blockquote>
+ </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/mrtydi.py ADDED Viewed

	@@ -0,0 +1,330 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from collections import defaultdict
+from string import Template
+import argparse
+import math
+import os
+import pkg_resources
+import sys
+import time
+import yaml
+from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
# Languages covered by the report, as [code, name] pairs.  The order here is
# the column order of the printed and HTML tables.
languages = [
    ['ar', 'arabic'],
    ['bn', 'bengali'],
    ['en', 'english'],
    ['fi', 'finnish'],
    ['id', 'indonesian'],
    ['ja', 'japanese'],
    ['ko', 'korean'],
    ['ru', 'russian'],
    ['sw', 'swahili'],
    ['te', 'telugu'],
    ['th', 'thai'],
]

# Condition identifiers; combined with a language code as '<model>.<code>'
# to form the per-language keys used throughout this module.
models = [
    'bm25',
    'mdpr-split-pft-nq',
    'mdpr-tied-pft-nq',
    'mdpr-tied-pft-msmarco',
    'mdpr-tied-pft-msmarco-ft-all',
]

# Human-readable label shown in the HTML table for each condition.
html_display = {
    'bm25': 'BM25',
    'mdpr-split-pft-nq': 'mDPR (split encoders), pre-FT w/ NQ',
    'mdpr-tied-pft-nq': 'mDPR (tied encoders), pre-FT w/ NQ',
    'mdpr-tied-pft-msmarco': 'mDPR (tied encoders), pre-FT w/ MS MARCO',
    'mdpr-tied-pft-msmarco-ft-all': 'mDPR (tied encoders), pre-FT w/ MS MARCO, FT w/ all',
}

# Command-line flags handed to pyserini.eval.trec_eval for each metric.
trec_eval_metric_definitions = {
    'MRR@100': '-c -M 100 -m recip_rank',
    'R@100': '-c -m recall.100',
}
def format_run_command(raw):
    """Break a one-line run command into shell-continued lines for display.

    Each rule below is a plain substring replacement applied to the whole
    command, in the listed order; most of them start a new continued line in
    front of an option.

    Args:
        raw: The run command as a single string.

    Returns:
        The command with line continuations inserted.
    """
    rewrites = (
        ('--lang', '\\\n --lang'),
        ('--encoder', '\\\n --encoder'),
        ('--topics', '\\\n --topics'),
        ('--index', '\\\n --index'),
        ('--output ', '\\\n --output '),
        ('--batch ', '\\\n --batch '),
        # Unlike the rules above, this one breaks the line *after* the option.
        ('--threads 12', '--threads 12 \\\n '),
    )
    formatted = raw
    for needle, replacement in rewrites:
        formatted = formatted.replace(needle, replacement)
    return formatted
def format_eval_command(raw):
    """Break a one-line evaluation command into shell-continued lines.

    A continuation is inserted before every ``-c `` and before every
    occurrence of the command's last whitespace-separated token.

    Args:
        raw: The evaluation command as a single string; it must contain at
            least one token.

    Returns:
        The command with line continuations inserted.

    Raises:
        IndexError: If ``raw`` is empty or contains only whitespace.
    """
    last_token = raw.split()[-1]
    with_flag_break = raw.replace('-c ', '\\\n -c ')
    # Plain substring replacement: every occurrence of the token is affected,
    # not only the final one.
    return with_flag_break.replace(last_token, f'\\\n {last_token}')
def read_file(f):
    """Return the entire text content of the file at path ``f``.

    The file is opened in text mode with the platform default encoding and is
    closed even if reading raises.

    Args:
        f: Path of the file to read.

    Returns:
        The file's content as a single string.
    """
    with open(f, 'r') as fin:
        return fin.read()
def list_conditions():
    """Print the available condition names, then the language codes."""
    condition_lines = ['Conditions:', '-----------', *models]
    print('\n'.join(condition_lines))

    # The leading empty entry yields the blank line that separates the lists.
    language_lines = ['', 'Languages', '---------', *(lang[0] for lang in languages)]
    print('\n'.join(language_lines))
def print_results(table, metric, split):
    """Print a models-by-languages text table for one metric and split.

    Args:
        table: Nested mapping indexed as ``table['<model>.<lang>'][split][metric]``.
        metric: Metric name to display.
        split: Split name to display.
    """
    print(f'Metric = {metric}, Split = {split}')

    # Header row: a blank lead-in followed by the language codes.
    header_cells = [f'{lang[0]:3} ' for lang in languages]
    print(' ' * 32 + ''.join(header_cells))

    for model in models:
        score_cells = []
        for lang in languages:
            key = f'{model}.{lang[0]}'
            score_cells.append(f'{table[key][split][metric]:7.3f}')
        print(f'{model:30}' + ''.join(score_cells))

    print('')
def generate_table_rows(table, row_template, commands, eval_commands, table_id, split, metric):
    """Fill the HTML row template once per model and return the rendered rows.

    The per-language template fields are derived from the module-level
    ``languages`` list: the score is substituted under the language code, and
    the run/evaluation commands under ``cmd<i>`` / ``eval_cmd<i>``, where
    ``i`` is the 1-based position of the language in ``languages``.

    Args:
        table: Nested mapping indexed as ``table['<model>.<lang>'][split][metric]``.
        row_template: Text of the row template (``string.Template`` syntax).
        commands: Mapping from ``'<model>.<lang>'`` to the run command.
        eval_commands: Mapping indexed as ``eval_commands['<model>.<lang>'][metric]``.
        table_id: Value substituted for ``table_cnt`` in the template.
        split: Split whose scores are shown.
        metric: Metric whose scores and evaluation commands are shown.

    Returns:
        A list with one rendered HTML string per entry of ``models``.

    Raises:
        KeyError: If the template references a placeholder that is not supplied.
    """
    template = Template(row_template)
    html_rows = []
    for row_cnt, model in enumerate(models, start=1):
        fields = {
            'table_cnt': table_id,
            'row_cnt': row_cnt,
            'model': html_display[model],
        }

        # Plain left-to-right accumulation, in the order of `languages`.
        total = 0.0
        for position, lang in enumerate(languages, start=1):
            code = lang[0]
            key = f'{model}.{code}'
            score = table[key][split][metric]
            total += score
            fields[code] = f'{score:.3f}'
            fields[f'cmd{position}'] = f'{commands[key]}'
            fields[f'eval_cmd{position}'] = f'{eval_commands[key][metric]}'

        fields['avg'] = f'{total / len(languages):.3f}'
        html_rows.append(template.substitute(fields))
    return html_rows
def generate_report(args):
    """Render the HTML report and write it to ``args.output``.

    Reads the three HTML templates and ``mrtydi.yaml`` located next to this
    module, collects the expected scores plus the formatted run and evaluation
    commands of every condition, and fills the page template with one table
    per reported metric (test split only).

    Args:
        args: Parsed command-line namespace. ``args.directory`` is used to
            build the run-file paths shown in the commands; ``args.output``
            is the path of the report to write.
    """
    # table[condition][split][metric] -> expected score (0.0 when absent).
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    # commands[condition] -> formatted run command ('' when absent).
    commands = defaultdict(str)
    # eval_commands[condition][metric] -> formatted evaluation command.
    eval_commands = defaultdict(lambda: defaultdict(str))

    def resource(filename):
        return pkg_resources.resource_filename(__name__, filename)

    html_template = read_file(resource('mrtydi_html.template'))
    table_template = read_file(resource('mrtydi_html_table.template'))
    row_template = read_file(resource('mrtydi_html_table_row.template'))

    with open(resource('mrtydi.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        eval_key = condition['eval_key']
        run_template = Template(condition['command'])

        for split_entry in condition['splits']:
            split = split_entry['split']
            runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt')

            # Later splits overwrite earlier ones: one command per condition.
            run_cmd = run_template.substitute(split=split, output=runfile)
            commands[name] = format_run_command(run_cmd)

            for expected in split_entry['scores']:
                for metric, value in expected.items():
                    table[name][split][metric] = value
                    eval_cmd = (
                        'python -m pyserini.eval.trec_eval '
                        f'{trec_eval_metric_definitions[metric]} {eval_key}-{split} {runfile}'
                    )
                    eval_commands[name][metric] = format_eval_command(eval_cmd)

    # (table id, metric, caption) for each table on the page, in page order.
    report_tables = (
        (1, 'MRR@100', 'MRR@100, test queries'),
        (2, 'R@100', 'Recall@100, test queries'),
    )
    tables_html = []
    for table_id, metric, desc in report_tables:
        rows = generate_table_rows(table, row_template, commands, eval_commands, table_id, 'test', metric)
        tables_html.append(Template(table_template).substitute(desc=desc, rows='\n'.join(rows)))

    page = Template(html_template).substitute(title='Mr.TyDi', tables=' '.join(tables_html))
    with open(args.output, 'w') as out:
        out.write(page)
def run_conditions(args):
    """Run the selected conditions, check their scores, and print summaries.

    For every condition in ``mrtydi.yaml`` that passes the selection, and for
    each of its splits, the run command is executed (unless the run file
    already exists or ``args.dry_run`` is set). Unless ``args.skip_eval`` is
    set, each expected metric is then evaluated and compared with the value
    recorded in the YAML file. Summary tables and the elapsed time are printed
    at the end.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``all``, ``condition``, ``language``, ``directory``,
            ``display_commands``, ``dry_run`` and ``skip_eval``.
    """
    started_at = time.time()

    # table[condition][split][metric] -> score (0.0 when never filled in).
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))

    # Known flaky checks: (condition name, split) -> absolute tolerance that
    # still counts as "okish" when the exact comparison fails.
    flaky_tolerance = {
        # Flaky test: small difference on orca
        ('mdpr-tied-pft-nq.te', 'dev'): 2e-4,
        # Flaky test: small difference on orca
        ('mdpr-tied-pft-msmarco-ft-all.ko', 'train'): 4e-4,
        # Flaky test: small difference on Mac Studio (M1)
        ('mdpr-tied-pft-msmarco.th', 'train'): 3e-4,
    }

    with open(pkg_resources.resource_filename(__name__, 'mrtydi.yaml')) as f:
        yaml_data = yaml.safe_load(f)

    for condition in yaml_data['conditions']:
        name = condition['name']
        name_parts = name.split('.')
        encoder, lang = name_parts[0], name_parts[-1]

        # --all disables both the condition filter and the language filter.
        if not args.all:
            if args.condition != encoder:
                continue
            if args.language and args.language != lang:
                continue

        eval_key = condition['eval_key']
        cmd_template = condition['command']
        print(f'condition {name}:')

        for split_entry in condition['splits']:
            split = split_entry['split']
            print(f' - split: {split}')

            runfile = os.path.join(args.directory, f'run.mrtydi.{name}.{split}.txt')
            cmd = Template(cmd_template).substitute(split=split, output=runfile)

            if args.display_commands:
                print(f'\n```bash\n{format_run_command(cmd)}\n```\n')

            # An existing run file is reused rather than regenerated.
            if not os.path.exists(runfile) and not args.dry_run:
                os.system(cmd)

            for expected in split_entry['scores']:
                for metric in expected:
                    if args.skip_eval:
                        # Without evaluation, report the expected value as-is.
                        table[name][split][metric] = expected[metric]
                        continue

                    score = float(run_eval_and_return_metric(
                        metric, f'{eval_key}-{split}', trec_eval_metric_definitions[metric], runfile))
                    target = float(expected[metric])
                    tolerance = flaky_tolerance.get((name, split))

                    if math.isclose(score, target):
                        result_str = ok_str
                    elif tolerance is not None and math.isclose(score, target, abs_tol=tolerance):
                        result_str = okish_str
                    else:
                        result_str = fail_str + f' expected {expected[metric]:.4f}'

                    print(f' {metric:7}: {score:.4f} {result_str}')
                    table[name][split][metric] = score

        print('')

    for metric in ('MRR@100', 'R@100'):
        for split in ('test', 'dev', 'train'):
            print_results(table, metric, split)

    finished_at = time.time()
    print(f'Total elapsed time: {finished_at - started_at:.0f}s')
if __name__ == '__main__':
    # Command-line entry point: list conditions, generate the HTML report,
    # or run the regression conditions, depending on the flags given.
    parser = argparse.ArgumentParser(description='Generate regression matrix for Mr.TyDi.')
    parser.add_argument('--condition', type=str,
                        help='Condition to run', required=False)
    # To list all conditions
    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
    # For generating reports
    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
    parser.add_argument('--output', type=str, help='File to store report.', required=False)
    # For actually running the experimental conditions
    parser.add_argument('--all', action='store_true', default=False, help='Run using all languages.')
    parser.add_argument('--language', type=str, help='Language to run.', required=False)
    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')

    args = parser.parse_args()

    if args.list_conditions:
        list_conditions()
        sys.exit()

    if args.generate_report:
        if not args.output:
            print('Must specify report filename with --output.')
            # Invalid usage: exit non-zero so calling scripts see the failure.
            sys.exit(1)
        generate_report(args)
        sys.exit()

    # --all already covers every condition and language, so combining it with
    # a narrower selector is rejected as contradictory.
    if args.all and (args.condition or args.language):
        print('Specifying --all will run all conditions and languages')
        sys.exit(1)

    run_conditions(args)

pyserini/2cr/mrtydi.yaml ADDED Viewed

	@@ -0,0 +1,890 @@

+conditions:
+ # mDPR, tied encoders, pFT w/ MS MARCO, FT all
+ - name: mdpr-tied-pft-msmarco-ft-all.ar
+ eval_key: mrtydi-v1.1-arabic
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9505
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.6913
+ R@100: 0.9165
+ - split: test
+ scores:
+ - MRR@100: 0.6949
+ R@100: 0.9004
+ - name: mdpr-tied-pft-msmarco-ft-all.bn
+ eval_key: mrtydi-v1.1-bengali
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9620
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.5897
+ R@100: 0.8977
+ - split: test
+ scores:
+ - MRR@100: 0.6228
+ R@100: 0.9550
+ - name: mdpr-tied-pft-msmarco-ft-all.en
+ eval_key: mrtydi-v1.1-english
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.8278
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.5357
+ R@100: 0.8884
+ - split: test
+ scores:
+ - MRR@100: 0.4916
+ R@100: 0.8414
+ - name: mdpr-tied-pft-msmarco-ft-all.fi
+ eval_key: mrtydi-v1.1-finnish
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9577
+ R@100: 0.9997
+ - split: dev
+ scores:
+ - MRR@100: 0.6626
+ R@100: 0.9171
+ - split: test
+ scores:
+ - MRR@100: 0.5595
+ R@100: 0.8563
+ - name: mdpr-tied-pft-msmarco-ft-all.id
+ eval_key: mrtydi-v1.1-indonesian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9469
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.6294
+ R@100: 0.9150
+ - split: test
+ scores:
+ - MRR@100: 0.5783
+ R@100: 0.8609
+ - name: mdpr-tied-pft-msmarco-ft-all.ja
+ eval_key: mrtydi-v1.1-japanese
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.8802
+ R@100: 0.9997
+ - split: dev
+ scores:
+ - MRR@100: 0.5505
+ R@100: 0.8696
+ - split: test
+ scores:
+ - MRR@100: 0.5007
+ R@100: 0.8130
+ - name: mdpr-tied-pft-msmarco-ft-all.ko
+ eval_key: mrtydi-v1.1-korean
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9195
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.5645
+ R@100: 0.8663
+ - split: test
+ scores:
+ - MRR@100: 0.4861
+ R@100: 0.7854
+ - name: mdpr-tied-pft-msmarco-ft-all.ru
+ eval_key: mrtydi-v1.1-russian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.8473
+ R@100: 0.9994
+ - split: dev
+ scores:
+ - MRR@100: 0.5104
+ R@100: 0.8720
+ - split: test
+ scores:
+ - MRR@100: 0.5161
+ R@100: 0.8432
+ - name: mdpr-tied-pft-msmarco-ft-all.sw
+ eval_key: mrtydi-v1.1-swahili
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9515
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.6404
+ R@100: 0.9018
+ - split: test
+ scores:
+ - MRR@100: 0.6438
+ R@100: 0.8756
+ - name: mdpr-tied-pft-msmarco-ft-all.te
+ eval_key: mrtydi-v1.1-telugu
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9679
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.7962
+ R@100: 0.9593
+ - split: test
+ scores:
+ - MRR@100: 0.8908
+ R@100: 0.9659
+ - name: mdpr-tied-pft-msmarco-ft-all.th
+ eval_key: mrtydi-v1.1-thai
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco-ft-all --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco-ft-all --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.9504
+ R@100: 1.0000
+ - split: dev
+ scores:
+ - MRR@100: 0.6670
+ R@100: 0.9114
+ - split: test
+ scores:
+ - MRR@100: 0.6175
+ R@100: 0.8826
+ # mDPR, tied encoders, pFT w/ MS MARCO
+ - name: mdpr-tied-pft-msmarco.ar
+ eval_key: mrtydi-v1.1-arabic
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3957
+ R@100: 0.7818
+ - split: dev
+ scores:
+ - MRR@100: 0.3978
+ R@100: 0.7778
+ - split: test
+ scores:
+ - MRR@100: 0.4414
+ R@100: 0.7971
+ - name: mdpr-tied-pft-msmarco.bn
+ eval_key: mrtydi-v1.1-bengali
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2920
+ R@100: 0.7323
+ - split: dev
+ scores:
+ - MRR@100: 0.2993
+ R@100: 0.7318
+ - split: test
+ scores:
+ - MRR@100: 0.3969
+ R@100: 0.7838
+ - name: mdpr-tied-pft-msmarco.en
+ eval_key: mrtydi-v1.1-english
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3374
+ R@100: 0.8111
+ - split: dev
+ scores:
+ - MRR@100: 0.3451
+ R@100: 0.7995
+ - split: test
+ scores:
+ - MRR@100: 0.3270
+ R@100: 0.7536
+ - name: mdpr-tied-pft-msmarco.fi
+ eval_key: mrtydi-v1.1-finnish
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3668
+ R@100: 0.7337
+ - split: dev
+ scores:
+ - MRR@100: 0.3636
+ R@100: 0.7371
+ - split: test
+ scores:
+ - MRR@100: 0.2750
+ R@100: 0.6471
+ - name: mdpr-tied-pft-msmarco.id
+ eval_key: mrtydi-v1.1-indonesian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2794
+ R@100: 0.7044
+ - split: dev
+ scores:
+ - MRR@100: 0.2853
+ R@100: 0.7198
+ - split: test
+ scores:
+ - MRR@100: 0.3520
+ R@100: 0.7356
+ - name: mdpr-tied-pft-msmarco.ja
+ eval_key: mrtydi-v1.1-japanese
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3089
+ R@100: 0.7603
+ - split: dev
+ scores:
+ - MRR@100: 0.3108
+ R@100: 0.7597
+ - split: test
+ scores:
+ - MRR@100: 0.3107
+ R@100: 0.7317
+ - name: mdpr-tied-pft-msmarco.ko
+ eval_key: mrtydi-v1.1-korean
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3003
+ R@100: 0.6907
+ - split: dev
+ scores:
+ - MRR@100: 0.3017
+ R@100: 0.7046
+ - split: test
+ scores:
+ - MRR@100: 0.2820
+ R@100: 0.6172
+ - name: mdpr-tied-pft-msmarco.ru
+ eval_key: mrtydi-v1.1-russian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2856
+ R@100: 0.7305
+ - split: dev
+ scores:
+ - MRR@100: 0.2943
+ R@100: 0.7404
+ - split: test
+ scores:
+ - MRR@100: 0.3561
+ R@100: 0.7432
+ - name: mdpr-tied-pft-msmarco.sw
+ eval_key: mrtydi-v1.1-swahili
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2491
+ R@100: 0.5195
+ - split: dev
+ scores:
+ - MRR@100: 0.2447
+ R@100: 0.5266
+ - split: test
+ scores:
+ - MRR@100: 0.3418
+ R@100: 0.6343
+ - name: mdpr-tied-pft-msmarco.te
+ eval_key: mrtydi-v1.1-telugu
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3059
+ R@100: 0.7510
+ - split: dev
+ scores:
+ - MRR@100: 0.2995
+ R@100: 0.7355
+ - split: test
+ scores:
+ - MRR@100: 0.3102
+ R@100: 0.7817
+ - name: mdpr-tied-pft-msmarco.th
+ eval_key: mrtydi-v1.1-thai
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-msmarco --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-msmarco --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2334
+ R@100: 0.5851
+ - split: dev
+ scores:
+ - MRR@100: 0.2407
+ R@100: 0.5795
+ - split: test
+ scores:
+ - MRR@100: 0.2693
+ R@100: 0.5945
+ # mDPR, tied encoders, pFT w/ NQ
+ - name: mdpr-tied-pft-nq.ar
+ eval_key: mrtydi-v1.1-arabic
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2087
+ R@100: 0.5854
+ - split: dev
+ scores:
+ - MRR@100: 0.2132
+ R@100: 0.5868
+ - split: test
+ scores:
+ - MRR@100: 0.2214
+ R@100: 0.6001
+ - name: mdpr-tied-pft-nq.bn
+ eval_key: mrtydi-v1.1-bengali
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2371
+ R@100: 0.6281
+ - split: dev
+ scores:
+ - MRR@100: 0.2414
+ R@100: 0.6409
+ - split: test
+ scores:
+ - MRR@100: 0.2535
+ R@100: 0.7072
+ - name: mdpr-tied-pft-nq.en
+ eval_key: mrtydi-v1.1-english
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2441
+ R@100: 0.7217
+ - split: dev
+ scores:
+ - MRR@100: 0.2359
+ R@100: 0.7187
+ - split: test
+ scores:
+ - MRR@100: 0.2433
+ R@100: 0.6893
+ - name: mdpr-tied-pft-nq.fi
+ eval_key: mrtydi-v1.1-finnish
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2996
+ R@100: 0.6787
+ - split: dev
+ scores:
+ - MRR@100: 0.3252
+ R@100: 0.7037
+ - split: test
+ scores:
+ - MRR@100: 0.2444
+ R@100: 0.6401
+ - name: mdpr-tied-pft-nq.id
+ eval_key: mrtydi-v1.1-indonesian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2706
+ R@100: 0.7322
+ - split: dev
+ scores:
+ - MRR@100: 0.2719
+ R@100: 0.7394
+ - split: test
+ scores:
+ - MRR@100: 0.2815
+ R@100: 0.6914
+ - name: mdpr-tied-pft-nq.ja
+ eval_key: mrtydi-v1.1-japanese
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2165
+ R@100: 0.6043
+ - split: dev
+ scores:
+ - MRR@100: 0.2299
+ R@100: 0.6239
+ - split: test
+ scores:
+ - MRR@100: 0.2058
+ R@100: 0.5734
+ - name: mdpr-tied-pft-nq.ko
+ eval_key: mrtydi-v1.1-korean
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2527
+ R@100: 0.6556
+ - split: dev
+ scores:
+ - MRR@100: 0.2680
+ R@100: 0.6271
+ - split: test
+ scores:
+ - MRR@100: 0.2234
+ R@100: 0.5499
+ - name: mdpr-tied-pft-nq.ru
+ eval_key: mrtydi-v1.1-russian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2160
+ R@100: 0.6262
+ - split: dev
+ scores:
+ - MRR@100: 0.2263
+ R@100: 0.6444
+ - split: test
+ scores:
+ - MRR@100: 0.2501
+ R@100: 0.6181
+ - name: mdpr-tied-pft-nq.sw
+ eval_key: mrtydi-v1.1-swahili
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2383
+ R@100: 0.5707
+ - split: dev
+ scores:
+ - MRR@100: 0.2543
+ R@100: 0.6138
+ - split: test
+ scores:
+ - MRR@100: 0.2621
+ R@100: 0.5965
+ - name: mdpr-tied-pft-nq.te
+ eval_key: mrtydi-v1.1-telugu
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.1483
+ R@100: 0.4162
+ - split: dev
+ scores:
+ - MRR@100: 0.1494
+ R@100: 0.3967
+ - split: test
+ scores:
+ - MRR@100: 0.0970
+ R@100: 0.2454
+ - name: mdpr-tied-pft-nq.th
+ eval_key: mrtydi-v1.1-thai
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder-class auto --encoder castorini/mdpr-tied-pft-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-tied-pft-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.1426
+ R@100: 0.4717
+ - split: dev
+ scores:
+ - MRR@100: 0.1618
+ R@100: 0.4637
+ - split: test
+ scores:
+ - MRR@100: 0.1575
+ R@100: 0.4550
+ # mDPR, split encoders, pFT w/ NQ
+ - name: mdpr-split-pft-nq.ar
+ eval_key: mrtydi-v1.1-arabic
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2510
+ R@100: 0.6384
+ - split: dev
+ scores:
+ - MRR@100: 0.2449
+ R@100: 0.6334
+ - split: test
+ scores:
+ - MRR@100: 0.2907
+ R@100: 0.6502
+ - name: mdpr-split-pft-nq.bn
+ eval_key: mrtydi-v1.1-bengali
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2293
+ R@100: 0.6454
+ - split: dev
+ scores:
+ - MRR@100: 0.2367
+ R@100: 0.6511
+ - split: test
+ scores:
+ - MRR@100: 0.2911
+ R@100: 0.7793
+ - name: mdpr-split-pft-nq.en
+ eval_key: mrtydi-v1.1-english
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2862
+ R@100: 0.7372
+ - split: dev
+ scores:
+ - MRR@100: 0.2821
+ R@100: 0.7437
+ - split: test
+ scores:
+ - MRR@100: 0.2907
+ R@100: 0.6779
+ - name: mdpr-split-pft-nq.fi
+ eval_key: mrtydi-v1.1-finnish
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2473
+ R@100: 0.6289
+ - split: dev
+ scores:
+ - MRR@100: 0.2466
+ R@100: 0.6283
+ - split: test
+ scores:
+ - MRR@100: 0.2050
+ R@100: 0.5680
+ - name: mdpr-split-pft-nq.id
+ eval_key: mrtydi-v1.1-indonesian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2351
+ R@100: 0.6952
+ - split: dev
+ scores:
+ - MRR@100: 0.2475
+ R@100: 0.7181
+ - split: test
+ scores:
+ - MRR@100: 0.2705
+ R@100: 0.6848
+ - name: mdpr-split-pft-nq.ja
+ eval_key: mrtydi-v1.1-japanese
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.1967
+ R@100: 0.5983
+ - split: dev
+ scores:
+ - MRR@100: 0.2055
+ R@100: 0.6142
+ - split: test
+ scores:
+ - MRR@100: 0.2119
+ R@100: 0.5840
+ - name: mdpr-split-pft-nq.ko
+ eval_key: mrtydi-v1.1-korean
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2383
+ R@100: 0.6180
+ - split: dev
+ scores:
+ - MRR@100: 0.2343
+ R@100: 0.6238
+ - split: test
+ scores:
+ - MRR@100: 0.2345
+ R@100: 0.5325
+ - name: mdpr-split-pft-nq.ru
+ eval_key: mrtydi-v1.1-russian
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2317
+ R@100: 0.6534
+ - split: dev
+ scores:
+ - MRR@100: 0.2490
+ R@100: 0.6553
+ - split: test
+ scores:
+ - MRR@100: 0.2820
+ R@100: 0.6474
+ - name: mdpr-split-pft-nq.sw
+ eval_key: mrtydi-v1.1-swahili
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.1457
+ R@100: 0.4481
+ - split: dev
+ scores:
+ - MRR@100: 0.1547
+ R@100: 0.4724
+ - split: test
+ scores:
+ - MRR@100: 0.1883
+ R@100: 0.5281
+ - name: mdpr-split-pft-nq.te
+ eval_key: mrtydi-v1.1-telugu
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.1489
+ R@100: 0.4905
+ - split: dev
+ scores:
+ - MRR@100: 0.1503
+ R@100: 0.4934
+ - split: test
+ scores:
+ - MRR@100: 0.1099
+ R@100: 0.3661
+ - name: mdpr-split-pft-nq.th
+ eval_key: mrtydi-v1.1-thai
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --encoder castorini/mdpr-question-nq --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai-mdpr-nq --output $output --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.1603
+ R@100: 0.4983
+ - split: dev
+ scores:
+ - MRR@100: 0.1584
+ R@100: 0.5083
+ - split: test
+ scores:
+ - MRR@100: 0.1709
+ R@100: 0.5146
+ # BM25
+ - name: bm25.ar
+ eval_key: mrtydi-v1.1-arabic
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ar --topics mrtydi-v1.1-arabic-${split} --index mrtydi-v1.1-arabic --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3356
+ R@100: 0.7944
+ - split: dev
+ scores:
+ - MRR@100: 0.3462
+ R@100: 0.7872
+ - split: test
+ scores:
+ - MRR@100: 0.3682
+ R@100: 0.7928
+ - name: bm25.bn
+ eval_key: mrtydi-v1.1-bengali
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language bn --topics mrtydi-v1.1-bengali-${split} --index mrtydi-v1.1-bengali --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3566
+ - R@100: 0.8336
+ - split: dev
+ scores:
+ - MRR@100: 0.3385
+ - R@100: 0.8432
+ - split: test
+ scores:
+ - MRR@100: 0.4182
+ - R@100: 0.8694
+ - name: bm25.en
+ eval_key: mrtydi-v1.1-english
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language en --topics mrtydi-v1.1-english-${split} --index mrtydi-v1.1-english --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.1592
+ - R@100: 0.5785
+ - split: dev
+ scores:
+ - MRR@100: 0.1685
+ - R@100: 0.6196
+ - split: test
+ scores:
+ - MRR@100: 0.1404
+ - R@100: 0.5365
+ - name: bm25.fi
+ eval_key: mrtydi-v1.1-finnish
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language fi --topics mrtydi-v1.1-finnish-${split} --index mrtydi-v1.1-finnish --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.4101
+ - R@100: 0.8198
+ - split: dev
+ scores:
+ - MRR@100: 0.4136
+ - R@100: 0.8285
+ - split: test
+ scores:
+ - MRR@100: 0.2836
+ - R@100: 0.7196
+ - name: bm25.id
+ eval_key: mrtydi-v1.1-indonesian
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language id --topics mrtydi-v1.1-indonesian-${split} --index mrtydi-v1.1-indonesian --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2972
+ - R@100: 0.7948
+ - split: dev
+ scores:
+ - MRR@100: 0.2937
+ - R@100: 0.7827
+ - split: test
+ scores:
+ - MRR@100: 0.3762
+ - R@100: 0.8426
+ - name: bm25.ja
+ eval_key: mrtydi-v1.1-japanese
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ja --topics mrtydi-v1.1-japanese-${split} --index mrtydi-v1.1-japanese --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2262
+ - R@100: 0.7290
+ - split: dev
+ scores:
+ - MRR@100: 0.2250
+ - R@100: 0.7252
+ - split: test
+ scores:
+ - MRR@100: 0.2125
+ - R@100: 0.6431
+ - name: bm25.ko
+ eval_key: mrtydi-v1.1-korean
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ko --topics mrtydi-v1.1-korean-${split} --index mrtydi-v1.1-korean --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2596
+ - R@100: 0.6178
+ - split: dev
+ scores:
+ - MRR@100: 0.2888
+ - R@100: 0.6733
+ - split: test
+ scores:
+ - MRR@100: 0.2848
+ - R@100: 0.6188
+ - name: bm25.ru
+ eval_key: mrtydi-v1.1-russian
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language ru --topics mrtydi-v1.1-russian-${split} --index mrtydi-v1.1-russian --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2229
+ - R@100: 0.5779
+ - split: dev
+ scores:
+ - MRR@100: 0.2202
+ - R@100: 0.5760
+ - split: test
+ scores:
+ - MRR@100: 0.3163
+ - R@100: 0.6541
+ - name: bm25.sw
+ eval_key: mrtydi-v1.1-swahili
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language sw --topics mrtydi-v1.1-swahili-${split} --index mrtydi-v1.1-swahili --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.2610
+ - R@100: 0.5903
+ - split: dev
+ scores:
+ - MRR@100: 0.2693
+ - R@100: 0.5789
+ - split: test
+ scores:
+ - MRR@100: 0.3893
+ - R@100: 0.7642
+ - name: bm25.te
+ eval_key: mrtydi-v1.1-telugu
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language te --topics mrtydi-v1.1-telugu-${split} --index mrtydi-v1.1-telugu --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.4204
+ - R@100: 0.8229
+ - split: dev
+ scores:
+ - MRR@100: 0.4269
+ - R@100: 0.8362
+ - split: test
+ scores:
+ - MRR@100: 0.5283
+ - R@100: 0.8971
+ - name: bm25.th
+ eval_key: mrtydi-v1.1-thai
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --language th --topics mrtydi-v1.1-thai-${split} --index mrtydi-v1.1-thai --output $output --bm25 --hits 100
+ splits:
+ - split: train
+ scores:
+ - MRR@100: 0.3543
+ - R@100: 0.8349
+ - split: dev
+ scores:
+ - MRR@100: 0.3586
+ - R@100: 0.8536
+ - split: test
+ scores:
+ - MRR@100: 0.4012
+ - R@100: 0.8529

pyserini/2cr/mrtydi_html.template ADDED Viewed

	@@ -0,0 +1,256 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
+ <title>Pyserini Reproductions</title>
+ <!-- Font Awesome -->
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+ <!-- Google Fonts Roboto -->
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+ <!-- MDB -->
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+ <style>
+tr.hide-table-padding td {
+ padding: 0;
+}
+.expand-button {
+ position: relative;
+}
+.accordion-toggle .expand-button:after {
+ position: absolute;
+ left:.75rem;
+ top: 50%;
+ transform: translate(0, -50%);
+ content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+ content: '+';
+}
+blockquote.mycode {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ margin-top: 15px;
+ padding-left: 15px;
+}
+blockquote.mycode2 {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ padding-top: 10px;
+ padding-bottom: 10px;
+ padding-left: 15px;
+}
+tr th.headertop {
+ border-bottom: none;
+ padding-bottom: 0rem
+}
+tr th.headerbottom {
+ padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+ border-radius: 0;
+ min-width: 55px;
+ background: none repeat scroll 0 0 transparent;
+ background-color: grey;
+ color: #F1F2F3 !important;
+ cursor: pointer;
+ border-style: none;
+ font-family: 'HELVETICA',sans-serif;
+ font-size: 0.8em;
+ font-weight: normal;
+ text-align: center;
+ text-decoration: none;
+ text-indent: 0;
+ text-transform: uppercase;
+ font-weight: 500;
+ line-height: 1.42rem;
+ margin: 0;
+ padding: 3px 8px;
+ position: absolute !important;
+ top: 0 !important;
+ right: 0 !important;
+}
+.copy-code-button > span {
+ color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+ box-sizing: inherit;
+}
+.copy-code-button::before {
+ content: '';
+ display: inline-block;
+ width: 16px;
+ height: 16px;
+ margin-right: 3px;
+ background-size: contain;
+ background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+ background-repeat: no-repeat;
+ position: relative;
+ top: 3px;
+}
+.copy-code-button:focus {
+ /* Avoid an ugly focus outline on click in Chrome,
+ but darken the button for accessibility.
+ See https://stackoverflow.com/a/25298082/1481479 */
+ /* background-color: #E6E6E6; */
+ outline: 0;
+}
+pre[class*="prettyprint"] {
+ position: relative;
+ overflow: hidden;
+}
+ </style>
+</head>
+<body>
+ <!-- Background image -->
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+ <div class="mask" style="
+ background: linear-gradient(
+ 45deg,
+ rgba(29, 236, 197, 0.7),
+ rgba(91, 14, 214, 0.7) 100%
+ );
+ ">
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
+ <div class="text-white">
+ <h1 class="mb-3">$title</h1>
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- Background image -->
+ <div class="container my-4">
+ $tables
+ <div style="padding-top: 20px"></div>
+ <h4>Programmatic Execution</h4>
+ <p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+ To list all the experimental conditions:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --list-conditions
+ </tt></blockquote>
+ <p>Run all languages for a specific condition and show commands:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --condition bm25 --display-commands
+ </tt></blockquote>
+ <p>Run a particular language for a specific condition and show commands:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands
+ </tt></blockquote>
+ <p>Run all languages for all conditions and show commands:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --all --display-commands
+ </tt></blockquote>
+ <p>With the above commands, run files will be placed in the current directory. Use the option <tt>--directory runs</tt> to place the runs in a sub-directory.</p>
+ <p>For a specific condition, just show the commands and do not run:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --condition bm25 --display-commands --dry-run
+ </tt></blockquote>
+ <p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+ <p>For a specific condition and language, just show the commands and do not run:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --condition bm25 --language ko --display-commands --dry-run
+ </tt></blockquote>
+ <p>For all conditions, just show the commands without running them, and skip evaluation:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --all --display-commands --dry-run --skip-eval
+ </tt></blockquote>
+ <p>Finally, to generate this page:</p>
+ <blockquote class="mycode2"><tt>
+ python -m pyserini.2cr.mrtydi --generate-report --output docs/2cr/mrtydi.html
+ </tt></blockquote>
+ <p>The output file <tt>mrtydi.html</tt> should be identical to this page.</p>
+ <div style="padding-top: 50px"></div>
+ </div>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) { // attach a "Copy" button to every <pre> block on the page
+ var button = document.createElement('button');
+ button.className = 'copy-code-button'; // class used both for the CSS styling and as the ClipboardJS selector below
+ button.type = 'button';
+ var s = codeBlock.innerText; // read the text before the button is appended, so the button label is not part of the copied text
+ button.setAttribute('data-clipboard-text',s); // the text to copy is stored on the button itself in this attribute
+ button.innerText = 'Copy';
+ // var pre = codeBlock.parentNode;
+ codeBlock.classList.add('prettyprint'); // matches the pre[class*="prettyprint"] rule (position: relative), which anchors the absolutely positioned button
+ // pre.parentNode.insertBefore(button, pre);
+ codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button'); // one ClipboardJS instance bound to all buttons created above
+clipboard.on('success', function(e) { // on success: log details, then show "Copied" on the button for 2 seconds
+ console.info('Action:', e.action);
+ console.info('Text:', e.text);
+ console.info('Trigger:', e.trigger);
+ e.trigger.textContent = 'Copied';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy'; // restore the original label
+ }, 2000);
+ e.clearSelection();
+});
+clipboard.on('error', function(e) { // on failure: log details, then show "Error Copying" on the button for 2 seconds
+ console.error('Action:', e.action);
+ console.error('Trigger:', e.trigger);
+ e.trigger.textContent = 'Error Copying';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy'; // restore the original label
+ }, 2000);
+ e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/mrtydi_html_table.template ADDED Viewed

	@@ -0,0 +1,28 @@

+<div class="table-responsive">
+ <table class="table">
+ <thead>
+ <tr>
+ <th scope="col"></th>
+ <th scope="col">$desc</th>
+ <th scope="col">ar</th>
+ <th scope="col">bn</th>
+ <th scope="col">en</th>
+ <th scope="col">fi</th>
+ <th scope="col">id</th>
+ <th scope="col">ja</th>
+ <th scope="col">ko</th>
+ <th scope="col">ru</th>
+ <th scope="col">sw</th>
+ <th scope="col">te</th>
+ <th scope="col">th</th>
+ <th scope="col"></th>
+ <th scope="col">avg</th>
+ </tr>
+ </thead>
+ <tbody>
+$rows
+ </tbody>
+ </table>
+</div>

pyserini/2cr/mrtydi_html_table_row.template ADDED Viewed

	@@ -0,0 +1,212 @@

+<!-- Condition: $model -->
+<tr class="accordion-toggle collapsed" id="table${table_cnt}-row${row_cnt}" data-toggle="collapse" data-parent="#table${table_cnt}-row${row_cnt}" href="#table${table_cnt}-collapse${row_cnt}">
+<td class="expand-button"></td>
+<td>$model</td>
+<td>$ar</td>
+<td>$bn</td>
+<td>$en</td>
+<td>$fi</td>
+<td>$id</td>
+<td>$ja</td>
+<td>$ko</td>
+<td>$ru</td>
+<td>$sw</td>
+<td>$te</td>
+<td>$th</td>
+<td></td>
+<td>$avg</td>
+</tr>
+<tr class="hide-table-padding">
+<td></td>
+<td></td>
+<td colspan="13" style="max-width: 600px">
+<div id="table${table_cnt}-collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="table${table_cnt}-row${row_cnt}-tabs" role="tablist">
+ <li class="nav-item" role="presentation">
+ <a class="nav-link active" id="table${table_cnt}-row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab1" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">ar</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab2" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">bn</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab3" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">en</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab4-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab4" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab4" aria-selected="false" style="text-transform:none">fi</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab5-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab5" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab5" aria-selected="false" style="text-transform:none">id</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab6-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab6" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab6" aria-selected="false" style="text-transform:none">ja</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab7-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab7" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab7" aria-selected="false" style="text-transform:none">ko</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab8-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab8" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab8" aria-selected="false" style="text-transform:none">ru</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab9-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab9" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab9" aria-selected="false" style="text-transform:none">sw</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab10-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab10" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab10" aria-selected="false" style="text-transform:none">te</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="table${table_cnt}-row${row_cnt}-tab11-header" data-mdb-toggle="tab" href="#table${table_cnt}-row${row_cnt}-tab11" role="tab" aria-controls="table${table_cnt}-row${row_cnt}-tab11" aria-selected="false" style="text-transform:none">th</a>
+ </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="table${table_cnt}-row${row_cnt}-content">
+ <div class="tab-pane fade show active" id="table${table_cnt}-row${row_cnt}-tab1" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab1-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab2" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab2-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab3" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab3-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab4" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab4-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd4
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd4}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab5" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab5-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd5
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd5}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab6" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab6-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd6
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd6}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab7" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab7-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd7
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd7}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab8" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab8-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd8
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd8}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab9" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab9-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd9
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd9}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab10" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab10-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd10
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd10}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="table${table_cnt}-row${row_cnt}-tab11" role="tabpanel" aria-labelledby="table${table_cnt}-row${row_cnt}-tab11-header">
+Command to generate run:
+ <blockquote class="mycode">
+<pre><code>$cmd11
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd11}</code></pre>
+ </blockquote>
+ </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/msmarco-v1-doc.yaml ADDED Viewed

	@@ -0,0 +1,539 @@

+conditions:
+ - name: bm25-doc-tuned
+ display: BM25 doc (k1=4.46, b=0.82)
+ display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2767
+ R@1K: 0.9357
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2336
+ nDCG@10: 0.5233
+ R@1K: 0.6757
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3581
+ nDCG@10: 0.5061
+ R@1K: 0.7776
+ - name: bm25-doc-default
+ display: BM25 doc (k1=0.9, b=0.4)
+ display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1a)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2299
+ R@1K: 0.8856
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2434
+ nDCG@10: 0.5176
+ R@1K: 0.6966
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3793
+ nDCG@10: 0.5286
+ R@1K: 0.8085
+ - name: bm25-doc-segmented-tuned
+ display: BM25 doc segmented (k1=2.16, b=0.61)
+ display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2756
+ R@1K: 0.9311
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2398
+ nDCG@10: 0.5389
+ R@1K: 0.6565
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3458
+ nDCG@10: 0.5213
+ R@1K: 0.7725
+ - name: bm25-doc-segmented-default
+ display: BM25 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1b)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2684
+ R@1K: 0.9178
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2449
+ nDCG@10: 0.5302
+ R@1K: 0.6871
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3586
+ nDCG@10: 0.5281
+ R@1K: 0.7755
+ - name: bm25-rm3-doc-tuned
+ display: BM25+RM3 doc (k1=4.46, b=0.82)
+ display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2227
+ R@1K: 0.9303
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2638
+ nDCG@10: 0.5526
+ R@1K: 0.7188
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3610
+ nDCG@10: 0.5195
+ R@1K: 0.8180
+ - name: bm25-rm3-doc-default
+ display: BM25+RM3 doc (k1=0.9, b=0.4)
+ display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1c)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.1618
+ R@1K: 0.8783
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2773
+ nDCG@10: 0.5174
+ R@1K: 0.7507
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4015
+ nDCG@10: 0.5254
+ R@1K: 0.8259
+ - name: bm25-rm3-doc-segmented-tuned
+ display: BM25+RM3 doc segmented (k1=2.16, b=0.61)
+ display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2448
+ R@1K: 0.9359
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2655
+ nDCG@10: 0.5392
+ R@1K: 0.7037
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3471
+ nDCG@10: 0.5030
+ R@1K: 0.8056
+ - name: bm25-rm3-doc-segmented-default
+ display: BM25+RM3 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1d)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2413
+ R@1K: 0.9351
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2892
+ nDCG@10: 0.5684
+ R@1K: 0.7368
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3792
+ nDCG@10: 0.5202
+ R@1K: 0.8023
+ - name: bm25-rocchio-doc-tuned
+ display: BM25+Rocchio doc (k1=4.46, b=0.82)
+ display-html: BM25+Rocchio doc (<i>k<sub><small>1</small></sub></i>=4.46, <i>b</i>=0.82)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2242
+ R@1K: 0.9314
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2657
+ nDCG@10: 0.5584
+ R@1K: 0.7299
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3628
+ nDCG@10: 0.5199
+ R@1K: 0.8217
+ - name: bm25-rocchio-doc-default
+ display: BM25+Rocchio doc (k1=0.9, b=0.4)
+ display-html: BM25+Rocchio doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.1624
+ R@1K: 0.8789
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2811
+ nDCG@10: 0.5256
+ R@1K: 0.7546
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4089
+ nDCG@10: 0.5192
+ R@1K: 0.8273
+ - name: bm25-rocchio-doc-segmented-tuned
+ display: BM25+Rocchio doc segmented (k1=2.16, b=0.61)
+ display-html: BM25+Rocchio doc segmented (<i>k<sub><small>1</small></sub></i>=2.16, <i>b</i>=0.61)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2475
+ R@1K: 0.9395
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2672
+ nDCG@10: 0.5421
+ R@1K: 0.7115
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3521
+ nDCG@10: 0.4997
+ R@1K: 0.8042
+ - name: bm25-rocchio-doc-segmented-default
+ display: BM25+Rocchio doc segmented (k1=0.9, b=0.4)
+ display-html: BM25+Rocchio doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2447
+ R@1K: 0.9351
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2889
+ nDCG@10: 0.5570
+ R@1K: 0.7423
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3830
+ nDCG@10: 0.5226
+ R@1K: 0.8102
+ - name: bm25-d2q-t5-doc-tuned
+ display: BM25 w/ doc2query-T5 doc (k1=4.68, b=0.87)
+ display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=4.68, <i>b</i>=0.87)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.3269
+ R@1K: 0.9553
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2620
+ nDCG@10: 0.5972
+ R@1K: 0.6867
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4099
+ nDCG@10: 0.5852
+ R@1K: 0.8105
+ - name: bm25-d2q-t5-doc-default
+ display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+ display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2a)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2880
+ R@1K: 0.9259
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2700
+ nDCG@10: 0.5968
+ R@1K: 0.7190
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4230
+ nDCG@10: 0.5885
+ R@1K: 0.8403
+ - name: bm25-d2q-t5-doc-segmented-tuned
+ display: BM25 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59)
+ display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=2.56, <i>b</i>=0.59)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.3209
+ R@1K: 0.9530
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2658
+ nDCG@10: 0.6273
+ R@1K: 0.6707
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4047
+ nDCG@10: 0.5943
+ R@1K: 0.7968
+ - name: bm25-d2q-t5-doc-segmented-default
+ display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2b)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.3179
+ R@1K: 0.9490
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2798
+ nDCG@10: 0.6119
+ R@1K: 0.7165
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4150
+ nDCG@10: 0.5957
+ R@1K: 0.8046
+ - name: bm25-rm3-d2q-t5-doc-tuned
+ display: BM25+RM3 w/ doc2query-T5 doc (k1=4.68, b=0.87)
+ display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=4.68, <i>b</i>=0.87)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2623
+ R@1K: 0.9522
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2813
+ nDCG@10: 0.6091
+ R@1K: 0.7184
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4100
+ nDCG@10: 0.5745
+ R@1K: 0.8238
+ - name: bm25-rm3-d2q-t5-doc-default
+ display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+ display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2c)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.1834
+ R@1K: 0.9126
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.3045
+ nDCG@10: 0.5904
+ R@1K: 0.7737
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4230
+ nDCG@10: 0.5427
+ R@1K: 0.8631
+ - name: bm25-rm3-d2q-t5-doc-segmented-tuned
+ display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=2.56, b=0.59)
+ display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=2.56, <i>b</i>=0.59)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2973
+ R@1K: 0.9563
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2892
+ nDCG@10: 0.6247
+ R@1K: 0.7069
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4016
+ nDCG@10: 0.5711
+ R@1K: 0.8156
+ - name: bm25-rm3-d2q-t5-doc-segmented-default
+ display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2d)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.2803
+ R@1K: 0.9551
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.3030
+ nDCG@10: 0.6290
+ R@1K: 0.7483
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.4271
+ nDCG@10: 0.5851
+ R@1K: 0.8266
+ - name: unicoil-noexp-pytorch
+ display: "uniCOIL (noexp): query inference with PyTorch"
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.3410
+ R@1K: 0.9420
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2661
+ nDCG@10: 0.6347
+ R@1K: 0.6385
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3698
+ nDCG@10: 0.5906
+ R@1K: 0.7621
+ - name: unicoil-noexp
+ display: "uniCOIL (noexp): pre-encoded"
+ display-html: "uniCOIL (noexp): pre-encoded queries"
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3a)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil-noexp --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev-unicoil-noexp
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.3409
+ R@1K: 0.9420
+ - topic_key: dl19-doc-unicoil-noexp
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2665
+ nDCG@10: 0.6349
+ R@1K: 0.6391
+ - topic_key: dl20-unicoil-noexp
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3698
+ nDCG@10: 0.5893
+ R@1K: 0.7623
+ - name: unicoil-pytorch
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.3532
+ R@1K: 0.9546
+ - topic_key: dl19-doc
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2789
+ nDCG@10: 0.6396
+ R@1K: 0.6654
+ - topic_key: dl20
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3881
+ nDCG@10: 0.6030
+ R@1K: 0.7866
+ - name: unicoil
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3b)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-doc-segmented-unicoil --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-doc-dev-unicoil
+ eval_key: msmarco-doc-dev
+ scores:
+ - MRR@10: 0.3531
+ R@1K: 0.9546
+ - topic_key: dl19-doc-unicoil
+ eval_key: dl19-doc
+ scores:
+ - MAP: 0.2789
+ nDCG@10: 0.6396
+ R@1K: 0.6652
+ - topic_key: dl20-unicoil
+ eval_key: dl20-doc
+ scores:
+ - MAP: 0.3882
+ nDCG@10: 0.6033
+ R@1K: 0.7869

pyserini/2cr/msmarco-v1-passage.yaml ADDED Viewed

	@@ -0,0 +1,764 @@

+conditions:
+ - name: bm25-rocchio-d2q-t5-tuned
+ display: BM25+Rocchio w/ doc2query-T5 (k1=2.18, b=0.86)
+ display-html: BM25+Rocchio w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.2395
+ R@1K: 0.9535
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4339
+ nDCG@10: 0.6559
+ R@1K: 0.8465
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4376
+ nDCG@10: 0.6224
+ R@1K: 0.8641
+ - name: bm25-rocchio-d2q-t5-default
+ display: BM25+Rocchio w/ doc2query-T5 (k1=0.9, b=0.4)
+ display-html: BM25+Rocchio w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rocchio --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.2158
+ R@1K: 0.9467
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4469
+ nDCG@10: 0.6538
+ R@1K: 0.8855
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4246
+ nDCG@10: 0.6102
+ R@1K: 0.8675
+ - name: bm25-rocchio-default
+ display: BM25+Rocchio (k1=0.9, b=0.4)
+ display-html: BM25+Rocchio (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rocchio
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.1595
+ R@1K: 0.8620
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.3474
+ nDCG@10: 0.5275
+ R@1K: 0.8007
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.3115
+ nDCG@10: 0.4910
+ R@1K: 0.8156
+ - name: bm25-rocchio-tuned
+ display: BM25+Rocchio (k1=0.82, b=0.68)
+ display-html: BM25+Rocchio (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rocchio
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.1684
+ R@1K: 0.8726
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.3396
+ nDCG@10: 0.5275
+ R@1K: 0.7948
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.3120
+ nDCG@10: 0.4908
+ R@1K: 0.8327
+ - name: distilbert-kd-tasb-pytorch
+ display: "DistilBERT KD TASB: query inference with PyTorch"
+ display-html: "DistilBERT KD TASB: query inference with PyTorch"
+ display-row: "[5]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3444
+ R@1K: 0.9771
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4590
+ nDCG@10: 0.7210
+ R@1K: 0.8406
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4698
+ nDCG@10: 0.6854
+ R@1K: 0.8727
+ - name: distilbert-kd-tasb
+ display: "DistilBERT KD TASB: pre-encoded"
+ display-html: "DistilBERT KD TASB: pre-encoded queries"
+ display-row: "[5]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-tas_b-b256 --topics $topics --encoded-queries distilbert_tas_b-$topics --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3444
+ R@1K: 0.9771
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4590
+ nDCG@10: 0.7210
+ R@1K: 0.8406
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4698
+ nDCG@10: 0.6854
+ R@1K: 0.8727
+ - name: distilbert-kd-pytorch
+ display: "DistilBERT KD: query inference with PyTorch"
+ display-html: "DistilBERT KD: query inference with PyTorch"
+ display-row: "[4]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoder sebastian-hofstaetter/distilbert-dot-margin_mse-T2-msmarco --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3251
+ R@1K: 0.9553
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4053
+ nDCG@10: 0.6994
+ R@1K: 0.7653
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4159
+ nDCG@10: 0.6447
+ R@1K: 0.7953
+ - name: distilbert-kd
+ display: "DistilBERT KD: pre-encoded"
+ display-html: "DistilBERT KD: pre-encoded queries"
+ display-row: "[4]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.distilbert-dot-margin-mse-t2 --topics $topics --encoded-queries distilbert_kd-$topics --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3251
+ R@1K: 0.9553
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4053
+ nDCG@10: 0.6994
+ R@1K: 0.7653
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4159
+ nDCG@10: 0.6447
+ R@1K: 0.7953
+ - name: ance-pytorch
+ display: "ANCE: query inference with PyTorch"
+ display-html: "ANCE: query inference with PyTorch"
+ display-row: "[3]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoder castorini/ance-msmarco-passage --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3302
+ R@1K: 0.9587
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.3710
+ nDCG@10: 0.6452
+ R@1K: 0.7554
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4076
+ nDCG@10: 0.6458
+ R@1K: 0.7764
+ - name: ance
+ display: "ANCE: pre-encoded"
+ display-html: "ANCE: pre-encoded queries"
+ display-row: "[3]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.ance --topics $topics --encoded-queries ance-$topics --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3302
+ R@1K: 0.9584
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.3710
+ nDCG@10: 0.6452
+ R@1K: 0.7554
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4076
+ nDCG@10: 0.6458
+ R@1K: 0.7764
+ - name: bm25-tuned
+ display: BM25 (k1=0.82, b=0.68)
+ display-html: BM25 (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
+ command: python -m pyserini.search.lucene --topics $topics --index msmarco-v1-passage --output $output --bm25
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.1875
+ R@1K: 0.8573
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.2903
+ nDCG@10: 0.4973
+ R@1K: 0.7450
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.2876
+ nDCG@10: 0.4876
+ R@1K: 0.8031
+ - name: bm25-rm3-tuned
+ display: BM25+RM3 (k1=0.82, b=0.68)
+ display-html: BM25+RM3 (<i>k<sub><small>1</small></sub></i>=0.82, <i>b</i>=0.68)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.1646
+ R@1K: 0.8704
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.3339
+ nDCG@10: 0.5147
+ R@1K: 0.7950
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.3017
+ nDCG@10: 0.4924
+ R@1K: 0.8292
+ - name: bm25-default
+ display: BM25 (k1=0.9, b=0.4)
+ display-html: BM25 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1a)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.1840
+ R@1K: 0.8526
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.3013
+ nDCG@10: 0.5058
+ R@1K: 0.7501
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.2856
+ nDCG@10: 0.4796
+ R@1K: 0.7863
+ - name: bm25-rm3-default
+ display: BM25+RM3 (k1=0.9, b=0.4)
+ display-html: BM25+RM3 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (1b)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage --topics $topics --output $output --bm25 --k1 0.9 --b 0.4 --rm3
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.1566
+ R@1K: 0.8606
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.3416
+ nDCG@10: 0.5216
+ R@1K: 0.8136
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.3006
+ nDCG@10: 0.4896
+ R@1K: 0.8236
+ - name: bm25-d2q-t5-tuned
+ display: BM25 w/ doc2query-T5 (k1=2.18, b=0.86)
+ display-html: BM25 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.2816
+ R@1K: 0.9506
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4046
+ nDCG@10: 0.6336
+ R@1K: 0.8134
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4171
+ nDCG@10: 0.6265
+ R@1K: 0.8393
+ - name: bm25-d2q-t5-default
+ display: BM25 w/ doc2query-T5 (k1=0.9, b=0.4)
+ display-html: BM25 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2a)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5 --topics $topics --output $output --bm25 --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.2723
+ R@1K: 0.9470
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4034
+ nDCG@10: 0.6417
+ R@1K: 0.8310
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4074
+ nDCG@10: 0.6187
+ R@1K: 0.8452
+ - name: bm25-rm3-d2q-t5-tuned
+ display: BM25+RM3 w/ doc2query-T5 (k1=2.18, b=0.86)
+ display-html: BM25+RM3 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=2.18, <i>b</i>=0.86)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.2382
+ R@1K: 0.9528
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4377
+ nDCG@10: 0.6537
+ R@1K: 0.8443
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4348
+ nDCG@10: 0.6235
+ R@1K: 0.8605
+ - name: bm25-rm3-d2q-t5-default
+ display: BM25+RM3 w/ doc2query-T5 (k1=0.9, b=0.4)
+ display-html: BM25+RM3 w/ doc2query-T5 (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (2b)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --k1 0.9 --b 0.4
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.2139
+ R@1K: 0.9460
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4483
+ nDCG@10: 0.6586
+ R@1K: 0.8863
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4286
+ nDCG@10: 0.6131
+ R@1K: 0.8700
+ - name: unicoil-pytorch
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3509
+ R@1K: 0.9581
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4617
+ nDCG@10: 0.7027
+ R@1K: 0.8291
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4429
+ nDCG@10: 0.6745
+ R@1K: 0.8433
+ - name: unicoil-onnx
+ display: "uniCOIL (w/ doc2query-T5): query inference with ONNX"
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with ONNX"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3509
+ R@1K: 0.9581
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4617
+ nDCG@10: 0.7027
+ R@1K: 0.8291
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4429
+ nDCG@10: 0.6745
+ R@1K: 0.8433
+ - name: unicoil
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3b)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil --topics $topics --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset-unicoil
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3516
+ R@1K: 0.9582
+ - topic_key: dl19-passage-unicoil
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4612
+ nDCG@10: 0.7024
+ R@1K: 0.8292
+ - topic_key: dl20-unicoil
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4430
+ nDCG@10: 0.6745
+ R@1K: 0.8430
+ - name: unicoil-noexp-pytorch
+ display: "uniCOIL (noexp): query inference with PyTorch"
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3153
+ R@1K: 0.9239
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4033
+ nDCG@10: 0.6434
+ R@1K: 0.7752
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4022
+ nDCG@10: 0.6524
+ R@1K: 0.7861
+ - name: unicoil-noexp-onnx
+ display: "uniCOIL (noexp): query inference with ONNX"
+ display-html: "uniCOIL (noexp): query inference with ONNX"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --onnx-encoder UniCoil --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3119
+ R@1K: 0.9239
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4061
+ nDCG@10: 0.6531
+ R@1K: 0.7809
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.3909
+ nDCG@10: 0.6388
+ R@1K: 0.7915
+ - name: unicoil-noexp
+ display: "uniCOIL (noexp): pre-encoded"
+ display-html: "uniCOIL (noexp): pre-encoded queries"
+ display-row: "[<a href=\"#\" data-mdb-toggle=\"tooltip\" title=\"Ma et al. (SIGIR 2021) Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.\">1</a>] &mdash; (3a)"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-unicoil-noexp --topics $topics --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset-unicoil-noexp
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3153
+ R@1K: 0.9239
+ - topic_key: dl19-passage-unicoil-noexp
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4033
+ nDCG@10: 0.6433
+ R@1K: 0.7752
+ - topic_key: dl20-unicoil-noexp
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4021
+ nDCG@10: 0.6523
+ R@1K: 0.7861
+ - name: splade-pp-ed-onnx
+ display: "SPLADE++ EnsembleDistil: query inference with ONNX"
+ display-html: "SPLADE++ EnsembleDistil: query inference with ONNX"
+ display-row: "[2]"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-ed --topics $topics --onnx-encoder SpladePlusPlusEnsembleDistil --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3830
+ R@1K: 0.9831
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.5054
+ nDCG@10: 0.7320
+ R@1K: 0.8724
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.5002
+ nDCG@10: 0.7198
+ R@1K: 0.8995
+ - name: splade-pp-sd-onnx
+ display: "SPLADE++ SelfDistil: query inference with ONNX"
+ display-html: "SPLADE++ SelfDistil: query inference with ONNX"
+ display-row: "[2]"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v1-passage-splade-pp-sd --topics $topics --onnx-encoder SpladePlusPlusSelfDistil --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3778
+ R@1K: 0.9846
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4997
+ nDCG@10: 0.7356
+ R@1K: 0.8758
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.5140
+ nDCG@10: 0.7285
+ R@1K: 0.9023
+ - name: tct_colbert-v2-hnp-pytorch
+ display: "TCT_ColBERT-V2-HN+: query inference with PyTorch"
+ display-html: "TCT_ColBERT-V2-HN+: query inference with PyTorch"
+ display-row: "[6]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoder castorini/tct_colbert-v2-hnp-msmarco --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3584
+ R@1K: 0.9695
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4469
+ nDCG@10: 0.7204
+ R@1K: 0.8261
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4754
+ nDCG@10: 0.6882
+ R@1K: 0.8429
+ - name: tct_colbert-v2-hnp
+ display: "TCT_ColBERT-V2-HN+: pre-encoded"
+ display-html: "TCT_ColBERT-V2-HN+: pre-encoded queries"
+ display-row: "[6]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.tct_colbert-v2-hnp --topics $topics --encoded-queries tct_colbert-v2-hnp-$topics --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3584
+ R@1K: 0.9695
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4469
+ nDCG@10: 0.7204
+ R@1K: 0.8261
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4754
+ nDCG@10: 0.6882
+ R@1K: 0.8429
+ - name: slimr
+ display: "SLIM: query inference with PyTorch"
+ display-html: "SLIM: query inference with PyTorch"
+ display-row: "[7]"
+ command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr --topics $topics --encoder castorini/slimr-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr --output $output --output-format msmarco --hits 1000 --impact --min-idf 3
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3581
+ R@1K: 0.9620
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4509
+ nDCG@10: 0.7010
+ R@1K: 0.8241
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4419
+ nDCG@10: 0.6403
+ R@1K: 0.8543
+ - name: slimr-pp
+ display: "SLIM++: query inference with PyTorch"
+ display-html: "SLIM++: query inference with PyTorch"
+ display-row: "[7]"
+ command: python -m pyserini.search.lucene --threads 16 --batch 128 --index msmarco-v1-passage-slimr-pp --topics $topics --encoder castorini/slimr-pp-msmarco-passage --encoded-corpus scipy-sparse-vectors.msmarco-v1-passage-slimr-pp --output $output --output-format msmarco --hits 1000 --impact --min-idf 3
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.4032
+ R@1K: 0.9680
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4687
+ nDCG@10: 0.7140
+ R@1K: 0.8415
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4906
+ nDCG@10: 0.7021
+ R@1K: 0.8551
+ - name: aggretriever-distilbert-pytorch
+ display: "Aggretriever-DistilBERT: query inference with PyTorch"
+ display-html: "Aggretriever-DistilBERT: query inference with PyTorch"
+ display-row: "[8]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-distilbert --topics $topics --encoder castorini/aggretriever-distilbert --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3412
+ R@1K: 0.9604
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4301
+ nDCG@10: 0.6816
+ R@1K: 0.8023
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4329
+ nDCG@10: 0.6726
+ R@1K: 0.8351
+ - name: aggretriever-cocondenser-pytorch
+ display: "Aggretriever-coCondenser: query inference with PyTorch"
+ display-html: "Aggretriever-coCondenser: query inference with PyTorch"
+ display-row: "[8]"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 512 --index msmarco-v1-passage.aggretriever-cocondenser --topics $topics --encoder castorini/aggretriever-cocondenser --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3619
+ R@1K: 0.9735
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4350
+ nDCG@10: 0.6837
+ R@1K: 0.8078
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4710
+ nDCG@10: 0.6972
+ R@1K: 0.8555
+ - name: openai-ada2
+ display: "OpenAI ada2: pre-encoded queries"
+ display-html: "OpenAI ada2: pre-encoded queries"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics --output $output
+ topics:
+ - topic_key: msmarco-passage-dev-subset
+ eval_key: msmarco-passage-dev-subset
+ scores:
+ - MRR@10: 0.3435
+ R@1K: 0.9858
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.4788
+ nDCG@10: 0.7035
+ R@1K: 0.8629
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4771
+ nDCG@10: 0.6759
+ R@1K: 0.8705
+ - name: openai-ada2-hyde
+ display: "HyDE-OpenAI ada2: pre-encoded queries"
+ display-html: "HyDE-OpenAI ada2: pre-encoded queries"
+ command: python -m pyserini.search.faiss --threads 16 --batch-size 128 --index msmarco-v1-passage.openai-ada2 --topics $topics --encoded-queries openai-ada2-$topics-hyde --output $output
+ topics:
+ - topic_key: dl19-passage
+ eval_key: dl19-passage
+ scores:
+ - MAP: 0.5125
+ nDCG@10: 0.7163
+ R@1K: 0.9002
+ - topic_key: dl20
+ eval_key: dl20-passage
+ scores:
+ - MAP: 0.4938
+ nDCG@10: 0.6666
+ R@1K: 0.8919

pyserini/2cr/msmarco-v2-doc.yaml ADDED Viewed

	@@ -0,0 +1,287 @@

+conditions:
+ - name: bm25-doc-default
+ display: BM25 doc (k1=0.9, b=0.4)
+ display-html: BM25 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1a)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.1572
+ R@1K: 0.8054
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.1659
+ R@1K: 0.8029
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2126
+ nDCG@10: 0.5116
+ MRR@100: 0.8367
+ R@100: 0.3195
+ R@1K: 0.6739
+ - name: bm25-doc-segmented-default
+ display: BM25 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1b)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.1896
+ R@1K: 0.8542
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.1930
+ R@1K: 0.8549
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2436
+ nDCG@10: 0.5776
+ MRR@100: 0.8937
+ R@100: 0.3478
+ R@1K: 0.6930
+ - name: bm25-rm3-doc-default
+ display: BM25+RM3 doc (k1=0.9, b=0.4)
+ display-html: BM25+RM3 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1c)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.0974
+ R@1K: 0.7699
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.1033
+ R@1K: 0.7736
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2452
+ nDCG@10: 0.5304
+ MRR@100: 0.7914
+ R@100: 0.3376
+ R@1K: 0.7341
+ - name: bm25-rm3-doc-segmented-default
+ display: BM25+RM3 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25+RM3 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1d)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.1660
+ R@1K: 0.8608
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.1702
+ R@1K: 0.8639
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2936
+ nDCG@10: 0.6189
+ MRR@100: 0.9076
+ R@100: 0.3890
+ R@1K: 0.7678
+ - name: bm25-d2q-t5-doc-default
+ display: BM25 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+ display-html: BM25 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2a)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5 --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.2011
+ R@1K: 0.8614
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.2012
+ R@1K: 0.8568
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2387
+ nDCG@10: 0.5792
+ MRR@100: 0.8866
+ R@100: 0.3443
+ R@1K: 0.7066
+ - name: bm25-d2q-t5-doc-segmented-default
+ display: BM25 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2b)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5 --topics $topics --output $output --bm25 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.2226
+ R@1K: 0.8982
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.2234
+ R@1K: 0.8952
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2683
+ nDCG@10: 0.6289
+ MRR@100: 0.9454
+ R@100: 0.3656
+ R@1K: 0.7202
+ - name: bm25-rm3-d2q-t5-doc-default
+ display: BM25+RM3 w/ doc2query-T5 doc (k1=0.9, b=0.4)
+ display-html: BM25+RM3 w/ doc2query-T5 doc (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2c)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.1141
+ R@1K: 0.8191
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.1170
+ R@1K: 0.8247
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2611
+ nDCG@10: 0.5375
+ MRR@100: 0.8255
+ R@100: 0.3580
+ R@1K: 0.7574
+ - name: bm25-rm3-d2q-t5-doc-segmented-default
+ display: BM25+RM3 w/ doc2query-T5 doc segmented (k1=0.9, b=0.4)
+ display-html: BM25+RM3 w/ doc2query-T5 doc segmented (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2d)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3 --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.1975
+ R@1K: 0.9002
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.1978
+ R@1K: 0.8972
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.3191
+ nDCG@10: 0.6559
+ MRR@100: 0.8989
+ R@100: 0.4131
+ R@1K: 0.7948
+ - name: unicoil-noexp
+ display: "uniCOIL (noexp): pre-encoded"
+ display-html: "uniCOIL (noexp): pre-encoded queries"
+ display-row: (3a)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev-unicoil-noexp
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.2231
+ R@1K: 0.8987
+ - topic_key: msmarco-v2-doc-dev2-unicoil-noexp
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.2314
+ R@1K: 0.8995
+ - topic_key: dl21-unicoil-noexp
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2587
+ nDCG@10: 0.6495
+ MRR@100: 0.9282
+ R@100: 0.3563
+ R@1K: 0.6787
+ - name: unicoil-noexp-otf
+ display: "uniCOIL (noexp): query inference with PyTorch"
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.2232
+ R@1K: 0.8987
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.2314
+ R@1K: 0.8993
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2589
+ nDCG@10: 0.6501
+ MRR@100: 0.9282
+ R@100: 0.3574
+ R@1K: 0.6782
+ - name: unicoil
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+ display-row: (3b)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev-unicoil
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.2419
+ R@1K: 0.9122
+ - topic_key: msmarco-v2-doc-dev2-unicoil
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.2445
+ R@1K: 0.9172
+ - topic_key: dl21-unicoil
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2718
+ nDCG@10: 0.6783
+ MRR@100: 0.9684
+ R@100: 0.3700
+ R@1K: 0.7069
+ - name: unicoil-otf
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-doc-segmented-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --impact --hits 10000 --max-passage-hits 1000 --max-passage
+ topics:
+ - topic_key: msmarco-v2-doc-dev
+ eval_key: msmarco-v2-doc-dev
+ scores:
+ - MRR@100: 0.2419
+ R@1K: 0.9120
+ - topic_key: msmarco-v2-doc-dev2
+ eval_key: msmarco-v2-doc-dev2
+ scores:
+ - MRR@100: 0.2447
+ R@1K: 0.9174
+ - topic_key: dl21
+ eval_key: dl21-doc
+ scores:
+ - MAP@100: 0.2720
+ nDCG@10: 0.6782
+ MRR@100: 0.9684
+ R@100: 0.3702
+ R@1K: 0.7071

pyserini/2cr/msmarco-v2-passage.yaml ADDED Viewed

	@@ -0,0 +1,287 @@

+conditions:
+ - name: bm25-default
+ display: BM25 original passage (k1=0.9, b=0.4)
+ display-html: BM25 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1a)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.0719
+ R@1K: 0.5733
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.0802
+ R@1K: 0.5839
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.1357
+ nDCG@10: 0.4458
+ MRR@100: 0.5060
+ R@100: 0.3261
+ R@1K: 0.6149
+ - name: bm25-augmented-default
+ display: BM25 augmented passage (k1=0.9, b=0.4)
+ display-html: BM25 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1b)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.0872
+ R@1K: 0.6925
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.0917
+ R@1K: 0.6933
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.0977
+ nDCG@10: 0.3977
+ MRR@100: 0.5303
+ R@100: 0.2709
+ R@1K: 0.5835
+ - name: bm25-rm3-default
+ display: BM25+RM3 original passage (k1=0.9, b=0.4)
+ display-html: BM25+RM3 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1c)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.0630
+ R@1K: 0.5947
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.0659
+ R@1K: 0.6062
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.1666
+ nDCG@10: 0.4455
+ MRR@100: 0.5202
+ R@100: 0.3499
+ R@1K: 0.6616
+ - name: bm25-rm3-augmented-default
+ display: BM25+RM3 augmented passage (k1=0.9, b=0.4)
+ display-html: BM25+RM3 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (1d)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.0667
+ R@1K: 0.6857
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.0700
+ R@1K: 0.6826
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.1050
+ nDCG@10: 0.3869
+ MRR@100: 0.4915
+ R@100: 0.2807
+ R@1K: 0.6298
+ - name: bm25-d2q-t5-default
+ display: BM25 w/ doc2query-T5 original passage (k1=0.9, b=0.4)
+ display-html: BM25 w/ doc2query-T5 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2a)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5 --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.1072
+ R@1K: 0.7083
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.1123
+ R@1K: 0.7151
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.1874
+ nDCG@10: 0.4816
+ MRR@100: 0.6848
+ R@100: 0.4076
+ R@1K: 0.7078
+ - name: bm25-d2q-t5-augmented-default
+ display: BM25 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4)
+ display-html: BM25 w/ doc2query-T5 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2b)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5 --topics $topics --output $output --bm25
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.1172
+ R@1K: 0.7647
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.1170
+ R@1K: 0.7659
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.1649
+ nDCG@10: 0.4702
+ MRR@100: 0.6391
+ R@100: 0.3883
+ R@1K: 0.6962
+ - name: bm25-rm3-d2q-t5-default
+ display: BM25+RM3 w/ doc2query-T5 original passage (k1=0.9, b=0.4)
+ display-html: BM25+RM3 w/ doc2query-T5 original passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2c)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.0947
+ R@1K: 0.7181
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.0984
+ R@1K: 0.7222
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.2285
+ nDCG@10: 0.5098
+ MRR@100: 0.6548
+ R@100: 0.4499
+ R@1K: 0.7537
+ - name: bm25-rm3-d2q-t5-augmented-default
+ display: BM25+RM3 w/ doc2query-T5 augmented passage (k1=0.9, b=0.4)
+ display-html: BM25+RM3 w/ doc2query-T5 augmented passage (<i>k<sub><small>1</small></sub></i>=0.9, <i>b</i>=0.4)
+ display-row: (2d)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-augmented-d2q-t5-docvectors --topics $topics --output $output --bm25 --rm3
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.0883
+ R@1K: 0.7607
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.0904
+ R@1K: 0.7649
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.1930
+ nDCG@10: 0.4812
+ MRR@100: 0.5958
+ R@100: 0.4321
+ R@1K: 0.7672
+ - name: unicoil
+ display: "uniCOIL (w/ doc2query-T5): pre-encoded"
+ display-html: "uniCOIL (w/ doc2query-T5): pre-encoded queries"
+ display-row: (3b)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-v2-passage-dev-unicoil
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.1499
+ R@1K: 0.7616
+ - topic_key: msmarco-v2-passage-dev2-unicoil
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.1577
+ R@1K: 0.7671
+ - topic_key: dl21-unicoil
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.2538
+ nDCG@10: 0.6159
+ MRR@100: 0.7311
+ R@100: 0.4731
+ R@1K: 0.7551
+ - name: unicoil-otf
+ display: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ display-html: "uniCOIL (w/ doc2query-T5): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-0shot --topics $topics --encoder castorini/unicoil-msmarco-passage --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.1501
+ R@1K: 0.7613
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.1576
+ R@1K: 0.7676
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.2539
+ nDCG@10: 0.6160
+ MRR@100: 0.7311
+ R@100: 0.4723
+ R@1K: 0.7560
+ - name: unicoil-noexp
+ display: "uniCOIL (noexp): pre-encoded"
+ display-html: "uniCOIL (noexp): pre-encoded queries"
+ display-row: (3a)
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-v2-passage-dev-unicoil-noexp
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.1342
+ R@1K: 0.7010
+ - topic_key: msmarco-v2-passage-dev2-unicoil-noexp
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.1385
+ R@1K: 0.7114
+ - topic_key: dl21-unicoil-noexp
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.2193
+ nDCG@10: 0.5756
+ MRR@100: 0.6991
+ R@100: 0.4246
+ R@1K: 0.6897
+ - name: unicoil-noexp-otf
+ display: "uniCOIL (noexp): query inference with PyTorch"
+ display-html: "uniCOIL (noexp): query inference with PyTorch"
+ command: python -m pyserini.search.lucene --threads 16 --batch-size 128 --index msmarco-v2-passage-unicoil-noexp-0shot --topics $topics --encoder castorini/unicoil-noexp-msmarco-passage --output $output --hits 1000 --impact
+ topics:
+ - topic_key: msmarco-v2-passage-dev
+ eval_key: msmarco-v2-passage-dev
+ scores:
+ - MRR@100: 0.1343
+ R@1K: 0.7010
+ - topic_key: msmarco-v2-passage-dev2
+ eval_key: msmarco-v2-passage-dev2
+ scores:
+ - MRR@100: 0.1385
+ R@1K: 0.7114
+ - topic_key: dl21
+ eval_key: dl21-passage
+ scores:
+ - MAP@100: 0.2194
+ nDCG@10: 0.5759
+ MRR@100: 0.6991
+ R@100: 0.4247
+ R@1K: 0.6893

pyserini/2cr/msmarco.py ADDED Viewed

	@@ -0,0 +1,600 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import math
+import os
+import re
+import sys
+import time
+from collections import defaultdict
+from string import Template
+import pkg_resources
+import yaml
+from ._base import run_eval_and_return_metric, ok_str, okish_str, fail_str
# The models: the rows of the results table will be ordered this way.
#
# Keyed by collection name (looked up with args.collection); each value is the ordered list of
# condition names for that collection. An empty string is a separator, not a condition: it is
# skipped by list_conditions(), rendered as a blank spacer row by generate_report(), and printed
# as an empty line by run_conditions().
models = {
    # MS MARCO v1 passage
    'msmarco-v1-passage':
    ['bm25-default',
     'bm25-rm3-default',
     'bm25-rocchio-default',
     '',
     'bm25-tuned',
     'bm25-rm3-tuned',
     'bm25-rocchio-tuned',
     '',
     'bm25-d2q-t5-default',
     'bm25-rm3-d2q-t5-default',
     'bm25-rocchio-d2q-t5-default',
     '',
     'bm25-d2q-t5-tuned',
     'bm25-rm3-d2q-t5-tuned',
     'bm25-rocchio-d2q-t5-tuned',
     '',
     'unicoil',
     'unicoil-pytorch',
     'unicoil-onnx',
     'unicoil-noexp',
     'unicoil-noexp-pytorch',
     'unicoil-noexp-onnx',
     '',
     'splade-pp-ed-onnx',
     'splade-pp-sd-onnx',
     '',
     'ance',
     'ance-pytorch',
     '',
     'distilbert-kd',
     'distilbert-kd-pytorch',
     'distilbert-kd-tasb',
     'distilbert-kd-tasb-pytorch',
     '',
     'tct_colbert-v2-hnp',
     'tct_colbert-v2-hnp-pytorch',
     '',
     'slimr',
     'slimr-pp',
     '',
     'aggretriever-distilbert-pytorch',
     'aggretriever-cocondenser-pytorch',
     '',
     'openai-ada2',
     'openai-ada2-hyde'],

    # MS MARCO v1 doc
    'msmarco-v1-doc':
    ['bm25-doc-default',
     'bm25-doc-segmented-default',
     'bm25-rm3-doc-default',
     'bm25-rm3-doc-segmented-default',
     'bm25-rocchio-doc-default',
     'bm25-rocchio-doc-segmented-default',
     '',
     'bm25-doc-tuned',
     'bm25-doc-segmented-tuned',
     'bm25-rm3-doc-tuned',
     'bm25-rm3-doc-segmented-tuned',
     'bm25-rocchio-doc-tuned',
     'bm25-rocchio-doc-segmented-tuned',
     '',
     'bm25-d2q-t5-doc-default',
     'bm25-d2q-t5-doc-segmented-default',
     'bm25-rm3-d2q-t5-doc-default',
     'bm25-rm3-d2q-t5-doc-segmented-default',
     '',
     'bm25-d2q-t5-doc-tuned',
     'bm25-d2q-t5-doc-segmented-tuned',
     'bm25-rm3-d2q-t5-doc-tuned',
     'bm25-rm3-d2q-t5-doc-segmented-tuned',
     '',
     'unicoil-noexp',
     'unicoil-noexp-pytorch',
     '',
     'unicoil',
     'unicoil-pytorch'],

    # MS MARCO v2 passage
    'msmarco-v2-passage':
    ['bm25-default',
     'bm25-augmented-default',
     'bm25-rm3-default',
     'bm25-rm3-augmented-default',
     '',
     'bm25-d2q-t5-default',
     'bm25-d2q-t5-augmented-default',
     'bm25-rm3-d2q-t5-default',
     'bm25-rm3-d2q-t5-augmented-default',
     '',
     'unicoil-noexp',
     'unicoil',
     '',
     'unicoil-noexp-otf',
     'unicoil-otf'],

    # MS MARCO v2 doc
    'msmarco-v2-doc':
    ['bm25-doc-default',
     'bm25-doc-segmented-default',
     'bm25-rm3-doc-default',
     'bm25-rm3-doc-segmented-default',
     '',
     'bm25-d2q-t5-doc-default',
     'bm25-d2q-t5-doc-segmented-default',
     'bm25-rm3-d2q-t5-doc-default',
     'bm25-rm3-d2q-t5-doc-segmented-default',
     '',
     'unicoil-noexp',
     'unicoil',
     '',
     'unicoil-noexp-otf',
     'unicoil-otf'
     ]
}
# Option strings handed to `python -m pyserini.eval.trec_eval`, nested as
#   collection name -> eval_key (from the YAML topic entry) -> metric label -> option string.
# The metric labels double as the keys of the expected-score entries in the per-collection YAML
# and as the column keys used when building the results tables, so they must stay in sync.
trec_eval_metric_definitions = {
    'msmarco-v1-passage': {
        'msmarco-passage-dev-subset': {
            'MRR@10': '-c -M 10 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl19-passage': {
            'MAP': '-c -l 2 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -l 2 -m recall.1000'
        },
        'dl20-passage': {
            'MAP': '-c -l 2 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -l 2 -m recall.1000'
        }
    },
    'msmarco-v1-doc': {
        'msmarco-doc-dev': {
            # NOTE(review): the label says MRR@10 but the options pass `-M 100`; presumably the
            # label is kept so v1 passage and v1 doc share the same table column key -- confirm.
            'MRR@10': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl19-doc': {
            'MAP': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -m recall.1000'
        },
        'dl20-doc': {
            'MAP': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'R@1K': '-c -m recall.1000'
        }
    },
    'msmarco-v2-passage': {
        'msmarco-v2-passage-dev': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'msmarco-v2-passage-dev2': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl21-passage': {
            'MAP@100': '-c -l 2 -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'MRR@100': '-c -l 2 -M 100 -m recip_rank',
            'R@100': '-c -l 2 -m recall.100',
            'R@1K': '-c -l 2 -m recall.1000'
        }
    },
    'msmarco-v2-doc': {
        'msmarco-v2-doc-dev': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'msmarco-v2-doc-dev2': {
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@1K': '-c -m recall.1000'
        },
        'dl21-doc': {
            'MAP@100': '-c -M 100 -m map',
            'nDCG@10': '-c -m ndcg_cut.10',
            'MRR@100': '-c -M 100 -m recip_rank',
            'R@100': '-c -m recall.100',
            'R@1K': '-c -m recall.1000'
        }
    }
}
def find_msmarco_table_topic_set_key_v1(topic_key):
    """Collapse an MS MARCO v1 topic key to its short table key.

    Variants such as 'dl19-passage-unicoil' and 'dl19-passage' both map to 'dl19'.

    Args:
        topic_key: full topic key string from the YAML configuration.

    Returns:
        'dl19', 'dl20', or 'dev' depending on the prefix of ``topic_key``;
        the empty string when no known prefix matches.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_short_key = (
        ('dl19', 'dl19'),
        ('dl20', 'dl20'),
        ('msmarco', 'dev'),
    )
    for prefix, short_key in prefix_to_short_key:
        if topic_key.startswith(prefix):
            return short_key
    return ''
def find_msmarco_table_topic_set_key_v2(topic_key):
    """Collapse an MS MARCO v2 topic key to its short table key.

    Args:
        topic_key: full topic key string from the YAML configuration,
            e.g. 'msmarco-v2-doc-dev2-unicoil' or 'dl21-unicoil-noexp'.

    Returns:
        'dev' or 'dev2' when ``topic_key`` ends with the corresponding split name
        (optionally followed by '-unicoil' or '-unicoil-noexp'), 'dl21' when it
        starts with 'dl21', and the empty string otherwise.
    """
    # Suffix checks come before the 'dl21' prefix check, so a suffix match takes precedence.
    if topic_key.endswith(('dev', 'dev-unicoil', 'dev-unicoil-noexp')):
        return 'dev'
    if topic_key.endswith(('dev2', 'dev2-unicoil', 'dev2-unicoil-noexp')):
        return 'dev2'
    if topic_key.startswith('dl21'):
        return 'dl21'
    return ''
def format_command(raw):
    """Insert shell line continuations into a command string for display.

    Each recognised option is moved onto its own continuation line. Text that
    follows "--output foo.txt" (e.g. "--hits 1000 --impact") is also pushed onto
    a separate line; when ".txt" ends the command there is no trailing space to
    match, so no extra line break is added.

    Args:
        raw: the single-line command string.

    Returns:
        The command with backslash-newline continuations inserted.
    """
    # Applied strictly in this order, each over the result of the previous one.
    substitutions = [
        ('--topics', '\\\n --topics'),
        ('--threads', '\\\n --threads'),
        ('--index', '\\\n --index'),
        ('--output ', '\\\n --output '),
        ('--encoder', '\\\n --encoder'),
        ('--onnx-encoder', '\\\n --onnx-encoder'),
        ('--encoded-corpus', '\\\n --encoded-corpus'),
        ('.txt ', '.txt \\\n '),
    ]
    formatted = raw
    for needle, replacement in substitutions:
        formatted = formatted.replace(needle, replacement)
    return formatted
def read_file(f):
    """Return the full text content of a file.

    Args:
        f: path of the file to read (opened in text mode).

    Returns:
        The file's content as a single string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous open/read/close sequence did not guarantee.
    with open(f, 'r') as fin:
        return fin.read()
def list_conditions(args):
    """Print the name of every condition defined for ``args.collection``, one per line.

    Empty-string entries in the ``models`` list are separators and are not printed.

    Args:
        args: parsed command-line namespace; only ``args.collection`` is read.
    """
    for entry in models[args.collection]:
        if entry != '':
            print(entry)
def generate_report(args):
    """Build the HTML regression report for ``args.collection`` and write it to ``args.output``.

    The expected scores, run commands, and evaluation commands are read from the
    ``<collection>.yaml`` package resource; one HTML row is rendered per entry of
    ``models[args.collection]`` and the rows are substituted into the page template.

    Args:
        args: parsed command-line namespace; ``args.collection`` and ``args.output`` are read.

    Raises:
        ValueError: if ``args.collection`` is not one of the four known collections.
    """
    yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml')

    # Pick the page template and the row template; v1 and v2 collections use different row layouts.
    if args.collection == 'msmarco-v1-passage':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_passage.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template'))
    elif args.collection == 'msmarco-v1-doc':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v1_doc.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v1.template'))
    elif args.collection == 'msmarco-v2-passage':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_passage.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template'))
    elif args.collection == 'msmarco-v2-doc':
        html_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_v2_doc.template'))
        row_template = read_file(pkg_resources.resource_filename(__name__, 'msmarco_html_row_v2.template'))
    else:
        raise ValueError(f'Unknown corpus: {args.collection}')

    # table[condition][short topic key][metric] -> expected score; missing entries read as 0.0.
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    # commands / eval_commands[condition][short topic key] -> command text; missing entries read as ''.
    commands = defaultdict(lambda: defaultdict(lambda: ''))
    eval_commands = defaultdict(lambda: defaultdict(lambda: ''))

    # condition name -> HTML display label, and condition name -> row identifier such as '(1a)'.
    table_keys = {}
    row_ids = {}

    with open(yaml_file) as f:
        yaml_data = yaml.safe_load(f)
        for condition in yaml_data['conditions']:
            name = condition['name']
            display = condition['display-html']
            # 'display-row' is optional in the YAML; conditions without it get an empty row id.
            row_id = condition['display-row'] if 'display-row' in condition else ''
            cmd_template = condition['command']

            row_ids[name] =row_id
            table_keys[name] = display

            for topic_set in condition['topics']:
                topic_key = topic_set['topic_key']
                eval_key = topic_set['eval_key']

                if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
                    short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key)
                else:
                    short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key)

                # Fill $topics and $output in the command template with this topic set's values.
                runfile = f'run.{args.collection}.{name}.{short_topic_key}.txt'
                cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile)
                commands[name][short_topic_key] = cmd

                for expected in topic_set['scores']:
                    for metric in expected:
                        # One evaluation command per metric, accumulated newline-separated.
                        eval_cmd = f'python -m pyserini.eval.trec_eval ' + \
                                   f'{trec_eval_metric_definitions[args.collection][eval_key][metric]} {eval_key} {runfile}'
                        eval_commands[name][short_topic_key] += eval_cmd + '\n'
                        table[name][short_topic_key][metric] = expected[metric]

        if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
            row_cnt = 1

            html_rows = []
            for name in models[args.collection]:
                if not name:
                    # Add blank row for spacing
                    html_rows.append('<tr><td style="border-bottom: 0"></td></tr>')
                    continue
                # A score of 0 means "no expected score recorded" (the defaultdict default)
                # and is rendered as '-'.
                s = Template(row_template)
                s = s.substitute(row_cnt=row_cnt,
                                 condition_name=table_keys[name],
                                 row=row_ids[name],
                                 s1=f'{table[name]["dl19"]["MAP"]:.4f}' if table[name]['dl19']['MAP'] != 0 else '-',
                                 s2=f'{table[name]["dl19"]["nDCG@10"]:.4f}' if table[name]['dl19']['nDCG@10'] != 0 else '-',
                                 s3=f'{table[name]["dl19"]["R@1K"]:.4f}' if table[name]['dl19']['R@1K'] != 0 else '-',
                                 s4=f'{table[name]["dl20"]["MAP"]:.4f}' if table[name]['dl20']['MAP'] != 0 else '-',
                                 s5=f'{table[name]["dl20"]["nDCG@10"]:.4f}' if table[name]['dl20']['nDCG@10'] != 0 else '-',
                                 s6=f'{table[name]["dl20"]["R@1K"]:.4f}' if table[name]['dl20']['R@1K'] != 0 else '-',
                                 s7=f'{table[name]["dev"]["MRR@10"]:.4f}' if table[name]['dev']['MRR@10'] != 0 else '-',
                                 s8=f'{table[name]["dev"]["R@1K"]:.4f}' if table[name]['dev']['R@1K'] != 0 else '-',
                                 cmd1=format_command(commands[name]['dl19']),
                                 cmd2=format_command(commands[name]['dl20']),
                                 cmd3=format_command(commands[name]['dev']),
                                 eval_cmd1=eval_commands[name]['dl19'],
                                 eval_cmd2=eval_commands[name]['dl20'],
                                 eval_cmd3=eval_commands[name]['dev']
                                 )

                # If we don't have scores, we want to remove the commands also. Use simple regexp substitution.
                if table[name]['dl19']['MAP'] == 0:
                    s = re.sub(re.compile('Command to generate run on TREC 2019 queries:.*?</div>',
                                          re.MULTILINE | re.DOTALL),
                               'Not available.</div>', s)
                if table[name]['dl20']['MAP'] == 0:
                    s = re.sub(re.compile('Command to generate run on TREC 2020 queries:.*?</div>',
                                          re.MULTILINE | re.DOTALL),
                               'Not available.</div>', s)
                if table[name]['dev']['MRR@10'] == 0:
                    s = re.sub(re.compile('Command to generate run on dev queries:.*?</div>',
                                          re.MULTILINE | re.DOTALL),
                               'Not available.</div>', s)

                html_rows.append(s)
                row_cnt += 1

            all_rows = '\n'.join(html_rows)
            if args.collection == 'msmarco-v1-passage':
                full_name = 'MS MARCO V1 Passage'
            else:
                full_name = 'MS MARCO V1 Document'

            with open(args.output, 'w') as out:
                out.write(Template(html_template).substitute(title=full_name, rows=all_rows))
        else:
            row_cnt = 1

            html_rows = []
            for name in models[args.collection]:
                if not name:
                    # Add blank row for spacing
                    html_rows.append('<tr><td style="border-bottom: 0"></td></tr>')
                    continue
                # Unlike the v1 branch, scores are always formatted numerically here
                # (a missing score shows as 0.0000) and no command blocks are removed.
                s = Template(row_template)
                s = s.substitute(row_cnt=row_cnt,
                                 condition_name=table_keys[name],
                                 row=row_ids[name],
                                 s1=f'{table[name]["dl21"]["MAP@100"]:.4f}',
                                 s2=f'{table[name]["dl21"]["nDCG@10"]:.4f}',
                                 s3=f'{table[name]["dl21"]["MRR@100"]:.4f}',
                                 s4=f'{table[name]["dl21"]["R@100"]:.4f}',
                                 s5=f'{table[name]["dl21"]["R@1K"]:.4f}',
                                 s6=f'{table[name]["dev"]["MRR@100"]:.4f}',
                                 s7=f'{table[name]["dev"]["R@1K"]:.4f}',
                                 s8=f'{table[name]["dev2"]["MRR@100"]:.4f}',
                                 s9=f'{table[name]["dev2"]["R@1K"]:.4f}',
                                 cmd1=format_command(commands[name]['dl21']),
                                 cmd2=format_command(commands[name]['dev']),
                                 cmd3=format_command(commands[name]['dev2']),
                                 eval_cmd1=eval_commands[name]['dl21'],
                                 eval_cmd2=eval_commands[name]['dev'],
                                 eval_cmd3=eval_commands[name]['dev2']
                                 )
                html_rows.append(s)
                row_cnt += 1

            all_rows = '\n'.join(html_rows)
            if args.collection == 'msmarco-v2-passage':
                full_name = 'MS MARCO V2 Passage'
            else:
                full_name = 'MS MARCO V2 Document'

            with open(args.output, 'w') as out:
                out.write(Template(html_template).substitute(title=full_name, rows=all_rows))
def run_conditions(args):
    """Run the retrieval conditions for ``args.collection``, evaluate them, and print a results table.

    For every selected condition and topic set, the run command is executed through the shell
    unless the run file already exists or ``args.dry_run`` is set. Unless ``args.skip_eval`` is
    set, each metric is evaluated and compared against the expected score from the YAML file;
    with ``args.skip_eval`` the expected scores are put in the table instead.

    Args:
        args: parsed command-line namespace; reads ``collection``, ``all``, ``condition``,
            ``directory``, ``display_commands``, ``dry_run`` and ``skip_eval``.

    Raises:
        KeyError: when printing the table if the requested condition (or, with ``args.all``,
            any name in ``models[args.collection]``) never appeared in the YAML file, because
            ``table_keys`` is only filled while iterating the YAML conditions.
    """
    start = time.time()

    # table[condition][short topic key][metric] -> score; missing entries read as 0.0.
    table = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
    # condition name -> plain-text display label used as the first column of the printed table.
    table_keys = {}

    yaml_file = pkg_resources.resource_filename(__name__, f'{args.collection}.yaml')

    with open(yaml_file) as f:
        yaml_data = yaml.safe_load(f)
        for condition in yaml_data['conditions']:
            # Either we're running all conditions, or running only the condition specified in --condition
            if not args.all:
                if not condition['name'] == args.condition:
                    continue

            name = condition['name']
            display = condition['display']
            cmd_template = condition['command']

            print(f'# Running condition "{name}": {display}\n')
            for topic_set in condition['topics']:
                topic_key = topic_set['topic_key']
                eval_key = topic_set['eval_key']

                short_topic_key = ''
                if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
                    short_topic_key = find_msmarco_table_topic_set_key_v1(topic_key)
                else:
                    short_topic_key = find_msmarco_table_topic_set_key_v2(topic_key)

                print(f' - topic_key: {topic_key}')

                runfile = os.path.join(args.directory, f'run.{args.collection}.{name}.{short_topic_key}.txt')
                cmd = Template(cmd_template).substitute(topics=topic_key, output=runfile)

                if args.display_commands:
                    print(f'\n```bash\n{format_command(cmd)}\n```\n')

                # An existing run file is reused; the command is only executed when it is missing.
                if not os.path.exists(runfile):
                    if not args.dry_run:
                        os.system(cmd)

                for expected in topic_set['scores']:
                    for metric in expected:
                        table_keys[name] = display
                        if not args.skip_eval:
                            # If the runfile doesn't exist, we can't evaluate.
                            # This would be the case if --dry-run were set.
                            if not os.path.exists(runfile):
                                continue

                            score = float(
                                run_eval_and_return_metric(
                                    metric,
                                    eval_key,
                                    trec_eval_metric_definitions[args.collection][eval_key][metric],
                                    runfile))
                            if math.isclose(score, float(expected[metric])):
                                result_str = ok_str
                            # Flaky tests
                            elif args.collection == 'msmarco-v1-passage' \
                                    and topic_key == 'msmarco-passage-dev-subset' and name == 'ance-pytorch' \
                                    and metric == 'MRR@10' and abs(score-float(expected[metric])) <= 0.0001:
                                result_str = okish_str
                            else:
                                result_str = fail_str + f' expected {expected[metric]:.4f}'
                            print(f' {metric:7}: {score:.4f} {result_str}')
                            table[name][short_topic_key][metric] = score
                        else:
                            # Evaluation skipped: report the expected score from the YAML instead.
                            table[name][short_topic_key][metric] = expected[metric]

                if not args.skip_eval:
                    print('')

        # NOTE(review): the column-label strings below contain single spaces between labels while
        # the numeric cells are formatted 8 wide; presumably padding was lost -- verify alignment.
        if args.collection == 'msmarco-v1-passage' or args.collection == 'msmarco-v1-doc':
            print(' ' * 69 + 'TREC 2019' + ' ' * 16 + 'TREC 2020' + ' ' * 12 + 'MS MARCO dev')
            print(' ' * 62 + 'MAP nDCG@10 R@1K MAP nDCG@10 R@1K MRR@10 R@1K')
            print(' ' * 62 + '-' * 22 + ' ' + '-' * 22 + ' ' + '-' * 14)

            if args.condition:
                # If we've used --condition to specify a specific condition, print out only that row.
                name = args.condition
                print(f'{table_keys[name]:60}' +
                      f'{table[name]["dl19"]["MAP"]:8.4f}{table[name]["dl19"]["nDCG@10"]:8.4f}{table[name]["dl19"]["R@1K"]:8.4f} ' +
                      f'{table[name]["dl20"]["MAP"]:8.4f}{table[name]["dl20"]["nDCG@10"]:8.4f}{table[name]["dl20"]["R@1K"]:8.4f} ' +
                      f'{table[name]["dev"]["MRR@10"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}')
            else:
                # Otherwise, print out all rows
                for name in models[args.collection]:
                    if not name:
                        print('')
                        continue
                    print(f'{table_keys[name]:60}' +
                          f'{table[name]["dl19"]["MAP"]:8.4f}{table[name]["dl19"]["nDCG@10"]:8.4f}{table[name]["dl19"]["R@1K"]:8.4f} ' +
                          f'{table[name]["dl20"]["MAP"]:8.4f}{table[name]["dl20"]["nDCG@10"]:8.4f}{table[name]["dl20"]["R@1K"]:8.4f} ' +
                          f'{table[name]["dev"]["MRR@10"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f}')
        else:
            print(' ' * 77 + 'TREC 2021' + ' ' * 18 + 'MS MARCO dev' + ' ' * 6 + 'MS MARCO dev2')
            print(' ' * 62 + 'MAP@100 nDCG@10 MRR@100 R@100 R@1K MRR@100 R@1K MRR@100 R@1K')
            print(' ' * 62 + '-' * 38 + ' ' + '-' * 14 + ' ' + '-' * 14)

            if args.condition:
                # If we've used --condition to specify a specific condition, print out only that row.
                name = args.condition
                print(f'{table_keys[name]:60}' +
                      f'{table[name]["dl21"]["MAP@100"]:8.4f}{table[name]["dl21"]["nDCG@10"]:8.4f}' +
                      f'{table[name]["dl21"]["MRR@100"]:8.4f}{table[name]["dl21"]["R@100"]:8.4f}{table[name]["dl21"]["R@1K"]:8.4f} ' +
                      f'{table[name]["dev"]["MRR@100"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f} ' +
                      f'{table[name]["dev2"]["MRR@100"]:8.4f}{table[name]["dev2"]["R@1K"]:8.4f}')
            else:
                # Otherwise, print out all rows
                for name in models[args.collection]:
                    if not name:
                        print('')
                        continue
                    print(f'{table_keys[name]:60}' +
                          f'{table[name]["dl21"]["MAP@100"]:8.4f}{table[name]["dl21"]["nDCG@10"]:8.4f}' +
                          f'{table[name]["dl21"]["MRR@100"]:8.4f}{table[name]["dl21"]["R@100"]:8.4f}{table[name]["dl21"]["R@1K"]:8.4f} ' +
                          f'{table[name]["dev"]["MRR@100"]:8.4f}{table[name]["dev"]["R@1K"]:8.4f} ' +
                          f'{table[name]["dev2"]["MRR@100"]:8.4f}{table[name]["dev2"]["R@1K"]:8.4f}')

    end = time.time()

    print('\n')
    print(f'Total elapsed time: {end - start:.0f}s')
if __name__ == '__main__':
    # Command-line entry point. Parses the options, expands the collection
    # shorthand to its full name, and then dispatches to exactly one mode:
    # listing conditions, generating the report, or running conditions.
    parser = argparse.ArgumentParser(description='Generate regression matrix for MS MARCO corpora.')
    parser.add_argument('--collection', type=str,
                        help='Collection = {v1-passage, v1-doc, v2-passage, v2-doc}.', required=True)
    # To list all conditions
    parser.add_argument('--list-conditions', action='store_true', default=False, help='List available conditions.')
    # For generating reports
    parser.add_argument('--generate-report', action='store_true', default=False, help='Generate report.')
    parser.add_argument('--output', type=str, help='File to store report.', required=False)
    # For actually running the experimental conditions
    parser.add_argument('--all', action='store_true', default=False, help='Run all conditions.')
    parser.add_argument('--condition', type=str, help='Condition to run.', required=False)
    parser.add_argument('--directory', type=str, help='Base directory.', default='', required=False)
    parser.add_argument('--dry-run', action='store_true', default=False, help='Print out commands but do not execute.')
    parser.add_argument('--skip-eval', action='store_true', default=False, help='Skip running trec_eval.')
    parser.add_argument('--display-commands', action='store_true', default=False, help='Display command.')

    args = parser.parse_args()

    # Short names accepted on the command line, mapped to the full collection
    # names that the rest of the script compares args.collection against.
    collection_names = {
        'v1-passage': 'msmarco-v1-passage',
        'v1-doc': 'msmarco-v1-doc',
        'v2-passage': 'msmarco-v2-passage',
        'v2-doc': 'msmarco-v2-doc',
    }
    if args.collection not in collection_names:
        raise ValueError(f'Unknown corpus: {args.collection}')
    args.collection = collection_names[args.collection]

    if args.list_conditions:
        list_conditions(args)
        sys.exit()

    if args.generate_report:
        if not args.output:
            # Usage error: sys.exit(str) writes the message to stderr and exits
            # with status 1, so callers and CI can detect the failure.
            sys.exit('Must specify report filename with --output.')
        generate_report(args)
        sys.exit()

    if not args.all and not args.condition:
        # Usage error as well: nothing was selected to run, so exit non-zero.
        sys.exit('Must specify a specific condition using --condition or use --all to run all conditions.')

    run_conditions(args)

pyserini/2cr/msmarco_html_row_v1.template ADDED Viewed

	@@ -0,0 +1,81 @@

+<!-- Condition: ${condition_name} -->
+<tr class="accordion-toggle collapsed" id="row${row_cnt}" data-toggle="collapse" data-parent="#row${row_cnt}" href="#collapse${row_cnt}">
+<td class="expand-button"></td>
+<td style="min-width: 85px">$row</td>
+<td style="min-width: 400px">${condition_name}</td>
+<td>$s1</td>
+<td>$s2</td>
+<td>$s3</td>
+<td></td>
+<td>$s4</td>
+<td>$s5</td>
+<td>$s6</td>
+<td></td>
+<td>$s7</td>
+<td>$s8</td>
+</tr>
+<tr class="hide-table-padding">
+<td></td>
+<td colspan="11">
+<div id="collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="row${row_cnt}-tabs" role="tablist">
+ <li class="nav-item" role="presentation">
+ <a class="nav-link active" id="row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab1" role="tab" aria-controls="row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">TREC 2019</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab2" role="tab" aria-controls="row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">TREC 2020</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab3" role="tab" aria-controls="row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">dev</a>
+ </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="row${row_cnt}-content">
+ <div class="tab-pane fade show active" id="row${row_cnt}-tab1" role="tabpanel" aria-labelledby="row${row_cnt}-tab1-header">
+Command to generate run on TREC 2019 queries:
+ <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="row${row_cnt}-tab2" role="tabpanel" aria-labelledby="row${row_cnt}-tab2-header">
+ Command to generate run on TREC 2020 queries:
+ <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="row${row_cnt}-tab3" role="tabpanel" aria-labelledby="row${row_cnt}-tab3-header">
+ Command to generate run on dev queries:
+ <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+ </blockquote>
+ </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/msmarco_html_row_v2.template ADDED Viewed

	@@ -0,0 +1,82 @@

+<!-- Condition: ${condition_name} -->
+<tr class="accordion-toggle collapsed" id="row${row_cnt}" data-toggle="collapse" data-parent="#row${row_cnt}" href="#collapse${row_cnt}">
+<td class="expand-button"></td>
+<td>$row</td>
+<td style="min-width: 400px">${condition_name}</td>
+<td>$s1</td>
+<td>$s2</td>
+<td>$s3</td>
+<td>$s4</td>
+<td>$s5</td>
+<td></td>
+<td>$s6</td>
+<td>$s7</td>
+<td></td>
+<td>$s8</td>
+<td>$s9</td>
+</tr>
+<tr class="hide-table-padding">
+<td></td>
+<td colspan="12">
+<div id="collapse${row_cnt}" class="collapse in p-3">
+<!-- Tabs navs -->
+<ul class="nav nav-tabs mb-3" id="row${row_cnt}-tabs" role="tablist">
+ <li class="nav-item" role="presentation">
+ <a class="nav-link active" id="row${row_cnt}-tab1-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab1" role="tab" aria-controls="row${row_cnt}-tab1" aria-selected="true" style="text-transform:none">TREC 2021</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="row${row_cnt}-tab2-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab2" role="tab" aria-controls="row${row_cnt}-tab2" aria-selected="false" style="text-transform:none">dev</a>
+ </li>
+ <li class="nav-item" role="presentation">
+ <a class="nav-link" id="row${row_cnt}-tab3-header" data-mdb-toggle="tab" href="#row${row_cnt}-tab3" role="tab" aria-controls="row${row_cnt}-tab3" aria-selected="false" style="text-transform:none">dev2</a>
+ </li>
+</ul>
+<!-- Tabs navs -->
+<!-- Tabs content -->
+<div class="tab-content" id="row${row_cnt}-content">
+ <div class="tab-pane fade show active" id="row${row_cnt}-tab1" role="tabpanel" aria-labelledby="row${row_cnt}-tab1-header">
+Command to generate run on TREC 2021 queries:
+ <blockquote class="mycode">
+<pre><code>$cmd1
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd1}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="row${row_cnt}-tab2" role="tabpanel" aria-labelledby="row${row_cnt}-tab2-header">
+ Command to generate run on dev queries:
+ <blockquote class="mycode">
+<pre><code>$cmd2
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd2}</code></pre>
+ </blockquote>
+ </div>
+ <div class="tab-pane fade" id="row${row_cnt}-tab3" role="tabpanel" aria-labelledby="row${row_cnt}-tab3-header">
+ Command to generate run on dev2 queries:
+ <blockquote class="mycode">
+<pre><code>$cmd3
+</code></pre></blockquote>
+Evaluation commands:
+ <blockquote class="mycode">
+<pre><code>${eval_cmd3}</code></pre>
+ </blockquote>
+ </div>
+</div>
+<!-- Tabs content -->
+</div></td>
+</tr>

pyserini/2cr/msmarco_html_v1_doc.template ADDED Viewed

	@@ -0,0 +1,296 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
+ <title>Pyserini Reproductions: MS MARCO V1 Document</title>
+ <!-- Font Awesome -->
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+ <!-- Google Fonts Roboto -->
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+ <!-- MDB -->
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+ <style>
+tr.hide-table-padding td {
+ padding: 0;
+}
+.expand-button {
+ position: relative;
+}
+.accordion-toggle .expand-button:after {
+ position: absolute;
+ left:.75rem;
+ top: 50%;
+ transform: translate(0, -50%);
+ content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+ content: '+';
+}
+blockquote.mycode {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ margin-top: 15px;
+ padding-left: 15px;
+}
+blockquote.mycode2 {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ padding-top: 10px;
+ padding-bottom: 10px;
+ padding-left: 15px;
+}
+tr th.headertop {
+ border-bottom: none;
+ padding-bottom: 0rem
+}
+tr th.headerbottom {
+ padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+ border-radius: 0;
+ min-width: 55px;
+ background: none repeat scroll 0 0 transparent;
+ background-color: grey;
+ color: #F1F2F3 !important;
+ cursor: pointer;
+ border-style: none;
+ font-family: 'HELVETICA',sans-serif;
+ font-size: 0.8em;
+ font-weight: normal;
+ text-align: center;
+ text-decoration: none;
+ text-indent: 0;
+ text-transform: uppercase;
+ font-weight: 500;
+ line-height: 1.42rem;
+ margin: 0;
+ padding: 3px 8px;
+ position: absolute !important;
+ top: 0 !important;
+ right: 0 !important;
+}
+.copy-code-button > span {
+ color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+ box-sizing: inherit;
+}
+.copy-code-button::before {
+ content: '';
+ display: inline-block;
+ width: 16px;
+ height: 16px;
+ margin-right: 3px;
+ background-size: contain;
+ background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+ background-repeat: no-repeat;
+ position: relative;
+ top: 3px;
+}
+.copy-code-button:focus {
+ /* Avoid an ugly focus outline on click in Chrome,
+ but darken the button for accessibility.
+ See https://stackoverflow.com/a/25298082/1481479 */
+ /* background-color: #E6E6E6; */
+ outline: 0;
+}
+pre[class*="prettyprint"] {
+ position: relative;
+ overflow: hidden;
+}
+ </style>
+</head>
+<body>
+ <!-- Background image -->
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+ <div class="mask" style="
+ background: linear-gradient(
+ 45deg,
+ rgba(29, 236, 197, 0.7),
+ rgba(91, 14, 214, 0.7) 100%
+ );
+ ">
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
+ <div class="text-white">
+ <h1 class="mb-3">$title</h1>
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets.
+Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+ <table class="table">
+ <thead>
+ <tr>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop" colspan="4"><b>TREC 2019</b></th>
+ <th class="headertop" colspan="4"><b>TREC 2020</b></th>
+ <th class="headertop" colspan="3"><b>dev</b></th>
+ </tr>
+ <tr>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"><br/>AP@100</th>
+ <th class="headerbottom" scope="col">nDCG@10</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"><br/>AP@100</th>
+ <th class="headerbottom" scope="col">nDCG@10</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col">RR@100</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ </tr>
+ </thead>
+ <tbody>
+$rows
+ </tbody>
+ </table>
+</div>
+<ul style="list-style-type:none; padding-top: 25px">
+<li><p>[1] Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin.
+<a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>&nbsp; &nbsp; &nbsp; &nbsp;Numbers in parentheses correspond to rows in Table 2 of the paper.</p></li>
+</ul>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --condition bm25-doc-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-doc --generate-report --output msmarco-v1-doc.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v1-doc.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+ </div>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) {
+ var button = document.createElement('button');
+ button.className = 'copy-code-button';
+ button.type = 'button';
+ var s = codeBlock.innerText;
+ button.setAttribute('data-clipboard-text',s);
+ button.innerText = 'Copy';
+ // var pre = codeBlock.parentNode;
+ codeBlock.classList.add('prettyprint');
+ // pre.parentNode.insertBefore(button, pre);
+ codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button');
+clipboard.on('success', function(e) {
+ console.info('Action:', e.action);
+ console.info('Text:', e.text);
+ console.info('Trigger:', e.trigger);
+ e.trigger.textContent = 'Copied';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+clipboard.on('error', function(e) {
+ console.error('Action:', e.action);
+ console.error('Trigger:', e.trigger);
+ e.trigger.textContent = 'Error Copying';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/msmarco_html_v1_passage.template ADDED Viewed

	@@ -0,0 +1,325 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
+ <title>Pyserini Reproductions: MS MARCO V1 Passage</title>
+ <!-- Font Awesome -->
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+ <!-- Google Fonts Roboto -->
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+ <!-- MDB -->
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+ <style>
+tr.hide-table-padding td {
+ padding: 0;
+}
+.expand-button {
+ position: relative;
+}
+.accordion-toggle .expand-button:after {
+ position: absolute;
+ left:.75rem;
+ top: 50%;
+ transform: translate(0, -50%);
+ content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+ content: '+';
+}
+blockquote.mycode {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ margin-top: 15px;
+ padding-left: 15px;
+}
+blockquote.mycode2 {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ padding-top: 10px;
+ padding-bottom: 10px;
+ padding-left: 15px;
+}
+tr th.headertop {
+ border-bottom: none;
+ padding-bottom: 0rem
+}
+tr th.headerbottom {
+ padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+ border-radius: 0;
+ min-width: 55px;
+ background: none repeat scroll 0 0 transparent;
+ background-color: grey;
+ color: #F1F2F3 !important;
+ cursor: pointer;
+ border-style: none;
+ font-family: 'HELVETICA',sans-serif;
+ font-size: 0.8em;
+ font-weight: normal;
+ text-align: center;
+ text-decoration: none;
+ text-indent: 0;
+ text-transform: uppercase;
+ font-weight: 500;
+ line-height: 1.42rem;
+ margin: 0;
+ padding: 3px 8px;
+ position: absolute !important;
+ top: 0 !important;
+ right: 0 !important;
+}
+.copy-code-button > span {
+ color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+ box-sizing: inherit;
+}
+.copy-code-button::before {
+ content: '';
+ display: inline-block;
+ width: 16px;
+ height: 16px;
+ margin-right: 3px;
+ background-size: contain;
+ background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+ background-repeat: no-repeat;
+ position: relative;
+ top: 3px;
+}
+.copy-code-button:focus {
+ /* Avoid an ugly focus outline on click in Chrome,
+ but darken the button for accessibility.
+ See https://stackoverflow.com/a/25298082/1481479 */
+ /* background-color: #E6E6E6; */
+ outline: 0;
+}
+pre[class*="prettyprint"] {
+ position: relative;
+ overflow: hidden;
+}
+ </style>
+</head>
+<body>
+ <!-- Background image -->
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+ <div class="mask" style="
+ background: linear-gradient(
+ 45deg,
+ rgba(29, 236, 197, 0.7),
+ rgba(91, 14, 214, 0.7) 100%
+ );
+ ">
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
+ <div class="text-white">
+ <h1 class="mb-3">$title</h1>
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in a number of papers, denoted by the references in square brackets.
+Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+ <table class="table">
+ <thead>
+ <tr>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop" colspan="4"><b>TREC 2019</b></th>
+ <th class="headertop" colspan="4"><b>TREC 2020</b></th>
+ <th class="headertop" colspan="3"><b>dev</b></th>
+ </tr>
+ <tr>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"><br/>AP</th>
+ <th class="headerbottom" scope="col">nDCG@10</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"><br/>AP</th>
+ <th class="headerbottom" scope="col">nDCG@10</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col">RR@10</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ </tr>
+ </thead>
+ <tbody>
+$rows
+ </tbody>
+ </table>
+</div>
+<ul style="list-style-type:none; padding-top: 25px">
+<li><p>[1] Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin.
+<a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>&nbsp; &nbsp; &nbsp; &nbsp;Numbers in parentheses correspond to rows in Table 1 of the paper.</p></li>
+<li><p>[2] Thibault Formal, Carlos Lassance, Benjamin Piwowarski, and Stéphane Clinchant.
+<a href="https://arxiv.org/abs/2205.04733">From Distillation to Hard Negative Sampling: Making Sparse Neural IR Models More Effective.</a>
+<i>Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022), May 2022.</i></p></li>
+<li><p>[3] Lee Xiong, Chenyan Xiong, Ye Li, Kwok-Fung Tang, Jialin Liu, Paul N. Bennett, Junaid Ahmed, and Arnold Overwijk.
+<a href="https://openreview.net/forum?id=zeFrfgyZln">Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval.</a>
+<i>Proceedings of the 9th International Conference on Learning Representations (ICLR 2021), May 2021.</i></p></li>
+<li><p>[4] Sebastian Hofstätter, Sophia Althammer, Michael Schröder, Mete Sertkan, and Allan Hanbury.
+<a href="https://arxiv.org/abs/2010.02666">Improving Efficient Neural Ranking Models with Cross-Architecture Knowledge Distillation.</a>
+<i>arXiv:2010.02666</i>, October 2020.</p></li>
+<li><p>[5] Sebastian Hofstätter, Sheng-Chieh Lin, Jheng-Hong Yang, Jimmy Lin, and Allan Hanbury.
+<a href="https://dl.acm.org/doi/10.1145/3404835.3462891">Efficiently Teaching an Effective Dense Retriever with Balanced Topic Aware Sampling.</a>
+<i>Proceedings of the 44th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2021)</i>, pages 113-122, July 2021.</p></li>
+<li><p>[6] Sheng-Chieh Lin, Jheng-Hong Yang, and Jimmy Lin.
+<a href="https://aclanthology.org/2021.repl4nlp-1.17/">In-Batch Negatives for Knowledge Distillation with Tightly-Coupled Teachers for Dense Retrieval.</a>
+<i>Proceedings of the 6th Workshop on Representation Learning for NLP (RepL4NLP-2021)</i>, pages 163-173, August 2021.</p></li>
+<li><p>[7] Minghan Li, Sheng-Chieh Lin, Xueguang Ma, and Jimmy Lin.
+<a href="https://arxiv.org/abs/2302.06587">SLIM: Sparsified Late Interaction for Multi-Vector Retrieval with Inverted Indexes.</a>
+<i>arXiv:2302.06587</i>, February 2023.</p></li>
+<li><p>[8] Sheng-Chieh Lin, Minghan Li, and Jimmy Lin.
+<a href="https://arxiv.org/abs/2208.00511">Aggretriever: A Simple Approach to Aggregate Textual Representation for Robust Dense Passage Retrieval.</a>
+<i>arXiv:2208.00511</i>, July 2022.</p></li>
+</ul>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --condition bm25-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v1-passage --generate-report --output msmarco-v1-passage.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v1-passage.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+ </div>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) {
+ var button = document.createElement('button');
+ button.className = 'copy-code-button';
+ button.type = 'button';
+ var s = codeBlock.innerText;
+ button.setAttribute('data-clipboard-text',s);
+ button.innerText = 'Copy';
+ // var pre = codeBlock.parentNode;
+ codeBlock.classList.add('prettyprint');
+ // pre.parentNode.insertBefore(button, pre);
+ codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button');
+clipboard.on('success', function(e) {
+ console.info('Action:', e.action);
+ console.info('Text:', e.text);
+ console.info('Trigger:', e.trigger);
+ e.trigger.textContent = 'Copied';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+clipboard.on('error', function(e) {
+ console.error('Action:', e.action);
+ console.error('Trigger:', e.trigger);
+ e.trigger.textContent = 'Error Copying';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/msmarco_html_v2_doc.template ADDED Viewed

	@@ -0,0 +1,292 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
+ <title>Pyserini Reproductions: MS MARCO V2 Document</title>
+ <!-- Font Awesome -->
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+ <!-- Google Fonts Roboto -->
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+ <!-- MDB -->
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+ <style>
+tr.hide-table-padding td {
+ padding: 0;
+}
+.expand-button {
+ position: relative;
+}
+.accordion-toggle .expand-button:after {
+ position: absolute;
+ left:.75rem;
+ top: 50%;
+ transform: translate(0, -50%);
+ content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+ content: '+';
+}
+blockquote.mycode {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ margin-top: 15px;
+ padding-left: 15px;
+}
+blockquote.mycode2 {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ padding-top: 10px;
+ padding-bottom: 10px;
+ padding-left: 15px;
+}
+tr th.headertop {
+ border-bottom: none;
+ padding-bottom: 0rem
+}
+tr th.headerbottom {
+ padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+ border-radius: 0;
+ min-width: 55px;
+ background: none repeat scroll 0 0 transparent;
+ background-color: grey;
+ color: #F1F2F3 !important;
+ cursor: pointer;
+ border-style: none;
+ font-family: 'HELVETICA',sans-serif;
+ font-size: 0.8em;
+ font-weight: normal;
+ text-align: center;
+ text-decoration: none;
+ text-indent: 0;
+ text-transform: uppercase;
+ font-weight: 500;
+ line-height: 1.42rem;
+ margin: 0;
+ padding: 3px 8px;
+ position: absolute !important;
+ top: 0 !important;
+ right: 0 !important;
+}
+.copy-code-button > span {
+ color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+ box-sizing: inherit;
+}
+.copy-code-button::before {
+ content: '';
+ display: inline-block;
+ width: 16px;
+ height: 16px;
+ margin-right: 3px;
+ background-size: contain;
+ background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+ background-repeat: no-repeat;
+ position: relative;
+ top: 3px;
+}
+.copy-code-button:focus {
+ /* Avoid an ugly focus outline on click in Chrome,
+ but darken the button for accessibility.
+ See https://stackoverflow.com/a/25298082/1481479 */
+ /* background-color: #E6E6E6; */
+ outline: 0;
+}
+pre[class*="prettyprint"] {
+ position: relative;
+ overflow: hidden;
+}
+ </style>
+</head>
+<body>
+ <!-- Background image -->
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+ <div class="mask" style="
+ background: linear-gradient(
+ 45deg,
+ rgba(29, 236, 197, 0.7),
+ rgba(91, 14, 214, 0.7) 100%
+ );
+ ">
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
+ <div class="text-white">
+ <h1 class="mb-3">$title</h1>
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in the following paper.
+Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.</p>
+<p class="note note-light">Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+ <table class="table">
+ <thead>
+ <tr>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop" colspan="6"><b>TREC 2021</b></th>
+ <th class="headertop" colspan="3"><b>dev</b></th>
+ <th class="headertop" colspan="3"><b>dev2</b></th>
+ </tr>
+ <tr>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"><br/>AP</th>
+ <th class="headerbottom" scope="col">nDCG@10</th>
+ <th class="headerbottom" scope="col">RR@100</th>
+ <th class="headerbottom" scope="col">R@100</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col">RR@100</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col">RR@100</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ </tr>
+ </thead>
+ <tbody>
+$rows
+ </tbody>
+ </table>
+</div>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --condition bm25-doc-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-doc --generate-report --output msmarco-v2-doc.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v2-doc.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+ </div>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) {
+ var button = document.createElement('button');
+ button.className = 'copy-code-button';
+ button.type = 'button';
+ var s = codeBlock.innerText;
+ button.setAttribute('data-clipboard-text',s);
+ button.innerText = 'Copy';
+ // var pre = codeBlock.parentNode;
+ codeBlock.classList.add('prettyprint');
+ // pre.parentNode.insertBefore(button, pre);
+ codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button');
+clipboard.on('success', function(e) {
+ console.info('Action:', e.action);
+ console.info('Text:', e.text);
+ console.info('Trigger:', e.trigger);
+ e.trigger.textContent = 'Copied';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+clipboard.on('error', function(e) {
+ console.error('Action:', e.action);
+ console.error('Trigger:', e.trigger);
+ e.trigger.textContent = 'Error Copying';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/2cr/msmarco_html_v2_passage.template ADDED Viewed

	@@ -0,0 +1,292 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" />
+ <meta http-equiv="x-ua-compatible" content="ie=edge" />
+ <title>Pyserini Reproductions: MS MARCO V2 Passage</title>
+ <!-- Font Awesome -->
+ <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.11.2/css/all.css" />
+ <!-- Google Fonts Roboto -->
+ <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Roboto:wght@300;400;500;700&display=swap" />
+ <!-- MDB -->
+ <link href="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.css" rel="stylesheet" />
+ <style>
+tr.hide-table-padding td {
+ padding: 0;
+}
+.expand-button {
+ position: relative;
+}
+.accordion-toggle .expand-button:after {
+ position: absolute;
+ left:.75rem;
+ top: 50%;
+ transform: translate(0, -50%);
+ content: '-';
+}
+.accordion-toggle.collapsed .expand-button:after {
+ content: '+';
+}
+blockquote.mycode {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ margin-top: 15px;
+ padding-left: 15px;
+}
+blockquote.mycode2 {
+ border-left: 3px solid #ccc;
+ margin-left: 25px;
+ padding-top: 10px;
+ padding-bottom: 10px;
+ padding-left: 15px;
+}
+tr th.headertop {
+ border-bottom: none;
+ padding-bottom: 0rem
+}
+tr th.headerbottom {
+ padding-top: 0rem
+}
+.table>:not(caption)>*>*{padding:0.75rem 0.75rem}
+.copy-code-button {
+ border-radius: 0;
+ min-width: 55px;
+ background: none repeat scroll 0 0 transparent;
+ background-color: grey;
+ color: #F1F2F3 !important;
+ cursor: pointer;
+ border-style: none;
+ font-family: 'HELVETICA',sans-serif;
+ font-size: 0.8em;
+ font-weight: normal;
+ text-align: center;
+ text-decoration: none;
+ text-indent: 0;
+ text-transform: uppercase;
+ font-weight: 500;
+ line-height: 1.42rem;
+ margin: 0;
+ padding: 3px 8px;
+ position: absolute !important;
+ top: 0 !important;
+ right: 0 !important;
+}
+.copy-code-button > span {
+ color: #F1F2F3 !important;
+}
+.copy-code-button, ::before, ::after {
+ box-sizing: inherit;
+}
+.copy-code-button::before {
+ content: '';
+ display: inline-block;
+ width: 16px;
+ height: 16px;
+ margin-right: 3px;
+ background-size: contain;
+ background-image: url("data:image/svg+xml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHN2ZyB3aWR0aD0iMTVweCIgaGVpZ2h0PSIxNXB4IiB2aWV3Qm94PSIwIDAgMTUgMTUiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiB4bWxuczp4bGluaz0iaHR0cDovL3d3dy53My5vcmcvMTk5OS94bGluayI+CiAgICA8IS0tIEdlbmVyYXRvcjogU2tldGNoIDUwLjIgKDU1MDQ3KSAtIGh0dHA6Ly93d3cuYm9oZW1pYW5jb2RpbmcuY29tL3NrZXRjaCAtLT4KICAgIDx0aXRsZT5QYWdlIDE8L3RpdGxlPgogICAgPGRlc2M+Q3JlYXRlZCB3aXRoIFNrZXRjaC48L2Rlc2M+CiAgICA8ZGVmcz48L2RlZnM+CiAgICA8ZyBpZD0iRmxvdyIgc3Ryb2tlPSJub25lIiBzdHJva2Utd2lkdGg9IjEiIGZpbGw9Im5vbmUiIGZpbGwtcnVsZT0iZXZlbm9kZCI+CiAgICAgICAgPGcgaWQ9IkJ0dG5faHRtbCIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoLTgxOS4wMDAwMDAsIC03NTMuMDAwMDAwKSIgZmlsbD0iI0ZGRkZGRiI+CiAgICAgICAgICAgIDxnIGlkPSJHcm91cC0xIiB0cmFuc2Zvcm09InRyYW5zbGF0ZSgzMTEuMDAwMDAwLCA0MDUuMDAwMDAwKSI+CiAgICAgICAgICAgICAgICA8ZyBpZD0iR3JvdXAtMiIgdHJhbnNmb3JtPSJ0cmFuc2xhdGUoNTA4LjAwMDAwMCwgMzQyLjAwMDAwMCkiPgogICAgICAgICAgICAgICAgICAgIDxwYXRoIGQ9Ik0xMy45NzcyNzI3LDYgTDMuNDA5MDkwOTEsNiBDMi44NDQ1NDU0NSw2IDIuMzg2MzYzNjQsNi40NTgxODE4MiAyLjM4NjM2MzY0LDcuMDIyNzI3MjcgTDIuMzg2MzYzNjQsMTcuNTkwOTA5MSBDMi4zODYzNjM2NCwxOC4xNTU0NTQ1IDIuODQ0NTQ1NDUsMTguNjEzNjM2NCAzLjQwOTA5MDkxLDE4LjYxMzYzNjQgTDEzLjk3NzI3MjcsMTguNjEzNjM2NCBDMTQuNTQxODE4MiwxOC42MTM2MzY0IDE1LDE4LjE1NTQ1NDUgMTUsMTcuNTkwOTA5MSBMMTUsNy4wMjI3MjcyNyBDMTUsNi40NTgxODE4MiAxNC41NDE4MTgyLDYgMTMuOTc3MjcyNyw2IFogTTE0LjMxODE4MTgsMTcuNTkwOTA5MSBDMTQuMzE4MTgxOCwxNy43NzkwOTA5IDE0LjE2NTQ1NDUsMTcuOTMxODE4MiAxMy45NzcyNzI3LDE3LjkzMTgxODIgTDMuNDA5MDkwOTEsMTcuOTMxODE4MiBDMy4yMjA5MDkwOSwxNy45MzE4MTgyIDMuMDY4MTgxODIsMTcuNzc5MDkwOSAzLjA2ODE4MTgyLDE3LjU5MDkwOTEgTDMuMDY4MTgxODIsNy4wMjI3MjcyNyBDMy4wNjgxODE4Miw2LjgzNDU0NTQ1IDMuMjIwOTA5MDksNi42ODE4MTgxOCAzLjQwOTA5MDkxLDYuNjgxODE4MTggTDEzLjk3NzI3MjcsNi42ODE4MTgxOCBDMTQuMTY1NDU0NSw2LjY4MTgxODE4IDE0LjMxODE4MTgsNi44MzQ1NDU0NSAxNC4zMTgxODE4LDcuMDIyNzI3MjcgTDE0LjMxODE4MTgsMTcuNTkwOTA5MSBaIE0xMS45MzE4MTgyLDE5Ljk3NzI3MjcgQzExLjkzMTgxODIsMjAuMTY1NDU0NSAxMS43NzkwOTA5LDIwLjMxODE4M
TggMTEuNTkwOTA5MSwyMC4zMTgxODE4IEwxLjAyMjcyNzI3LDIwLjMxODE4MTggQzAuODM0NTQ1NDU1LDIwLjMxODE4MTggMC42ODE4MTgxODIsMjAuMTY1NDU0NSAwLjY4MTgxODE4MiwxOS45NzcyNzI3IEwwLjY4MTgxODE4Miw5LjQwOTA5MDkxIEMwLjY4MTgxODE4Miw5LjIyMDkwOTA5IDAuODM0NTQ1NDU1LDkuMDY4MTgxODIgMS4wMjI3MjcyNyw5LjA2ODE4MTgyIEwxLjM2MzYzNjM2LDkuMDY4MTgxODIgTDEuMzYzNjM2MzYsOC4zODYzNjM2NCBMMS4wMjI3MjcyNyw4LjM4NjM2MzY0IEMwLjQ1ODE4MTgxOCw4LjM4NjM2MzY0IDAsOC44NDQ1NDU0NSAwLDkuNDA5MDkwOTEgTDAsMTkuOTc3MjcyNyBDMCwyMC41NDE4MTgyIDAuNDU4MTgxODE4LDIxIDEuMDIyNzI3MjcsMjEgTDExLjU5MDkwOTEsMjEgQzEyLjE1NTQ1NDUsMjEgMTIuNjEzNjM2NCwyMC41NDE4MTgyIDEyLjYxMzYzNjQsMTkuOTc3MjcyNyBMMTIuNjEzNjM2NCwxOS42MzYzNjM2IEwxMS45MzE4MTgyLDE5LjYzNjM2MzYgTDExLjkzMTgxODIsMTkuOTc3MjcyNyBaIiBpZD0iUGFnZS0xIj48L3BhdGg+CiAgICAgICAgICAgICAgICA8L2c+CiAgICAgICAgICAgIDwvZz4KICAgICAgICA8L2c+CiAgICA8L2c+Cjwvc3ZnPg==");
+ background-repeat: no-repeat;
+ position: relative;
+ top: 3px;
+}
+.copy-code-button:focus {
+ /* Avoid an ugly focus outline on click in Chrome,
+ but darken the button for accessibility.
+ See https://stackoverflow.com/a/25298082/1481479 */
+ /* background-color: #E6E6E6; */
+ outline: 0;
+}
+pre[class*="prettyprint"] {
+ position: relative;
+ overflow: hidden;
+}
+ </style>
+</head>
+<body>
+ <!-- Background image -->
+ <div id="intro" class="bg-image vh-100 shadow-1-strong" style="max-height: 150px">
+ <div class="mask" style="
+ background: linear-gradient(
+ 45deg,
+ rgba(29, 236, 197, 0.7),
+ rgba(91, 14, 214, 0.7) 100%
+ );
+ ">
+ <div class="container d-flex align-items-center justify-content-center text-center h-100" style="max-height: 150px">
+ <div class="text-white">
+ <h1 class="mb-3">$title</h1>
+ </div>
+ </div>
+ </div>
+ </div>
+ <!-- Background image -->
+<div class="container my-4">
+<p>The two-click<a href="#" data-mdb-toggle="tooltip" title="What are the two clicks, you ask? Copy and paste!"><sup>*</sup></a> reproduction matrix below provides commands for reproducing experimental results reported in the following paper.
+Numbered rows correspond to tables in the paper; additional conditions are provided for comparison purposes.</p>
+<p class="note note-light">Xueguang Ma, Ronak Pradeep, Rodrigo Nogueira, and Jimmy Lin. <a href="https://cs.uwaterloo.ca/~jimmylin/publications/Ma_etal_SIGIR2022.pdf">Document Expansions and Learned Sparse Lexical Representations for MS MARCO V1 and V2.</a>
+<i>Proceedings of the 45th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2022)</i>, July 2022.</p>
+<p>Instructions for programmatic execution are shown at the bottom of this page (scroll down).</p>
+<div class="table-responsive">
+ <table class="table">
+ <thead>
+ <tr>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop"></th>
+ <th class="headertop" colspan="6"><b>TREC 2021</b></th>
+ <th class="headertop" colspan="3"><b>dev</b></th>
+ <th class="headertop" colspan="3"><b>dev2</b></th>
+ </tr>
+ <tr>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col"><br/>AP</th>
+ <th class="headerbottom" scope="col">nDCG@10</th>
+ <th class="headerbottom" scope="col">RR@100</th>
+ <th class="headerbottom" scope="col">R@100</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col">RR@100</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ <th class="headerbottom" scope="col"></th>
+ <th class="headerbottom" scope="col">RR@100</th>
+ <th class="headerbottom" scope="col">R@1K</th>
+ </tr>
+ </thead>
+ <tbody>
+$rows
+ </tbody>
+ </table>
+</div>
+<div style="padding-top: 20px"></div>
+<h4>Programmatic Execution</h4>
+<p>All experimental runs shown in the above table can be programmatically executed based on the instructions below.
+To list all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --list-conditions
+</tt></blockquote>
+<p>These conditions correspond to the table rows above.</p>
+<p>For all conditions, just show the commands in a "dry run":</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands --dry-run
+</tt></blockquote>
+<p>To actually run all the experimental conditions:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --all --display-commands
+</tt></blockquote>
+<p>With the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>To show the commands for a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands --dry-run
+</tt></blockquote>
+<p>This will generate exactly the commands for a specific condition above (corresponding to a row in the table).</p>
+<p>To actually run a specific condition:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --condition bm25-default --display-commands
+</tt></blockquote>
+<p>Again, with the above command, run files will be placed in the current directory.
+Use the option <tt>--directory runs/</tt> to place the runs in a sub-directory.</p>
+<p>Finally, to generate this page:</p>
+<blockquote class="mycode2"><tt>
+python -m pyserini.2cr.msmarco --collection v2-passage --generate-report --output msmarco-v2-passage.html
+</tt></blockquote>
+<p>The output file <tt>msmarco-v2-passage.html</tt> should be identical to this page.</p>
+<div style="padding-top: 50px"></div>
+ </div>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.3.1/js/bootstrap.min.js"></script>
+ <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mdb-ui-kit/4.0.0/mdb.min.js"></script>
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.10/clipboard.min.js"></script>
+<script>
+document.querySelectorAll('pre').forEach(function (codeBlock) {
+ var button = document.createElement('button');
+ button.className = 'copy-code-button';
+ button.type = 'button';
+ var s = codeBlock.innerText;
+ button.setAttribute('data-clipboard-text',s);
+ button.innerText = 'Copy';
+ // var pre = codeBlock.parentNode;
+ codeBlock.classList.add('prettyprint');
+ // pre.parentNode.insertBefore(button, pre);
+ codeBlock.appendChild(button);
+});
+var clipboard = new ClipboardJS('.copy-code-button');
+clipboard.on('success', function(e) {
+ console.info('Action:', e.action);
+ console.info('Text:', e.text);
+ console.info('Trigger:', e.trigger);
+ e.trigger.textContent = 'Copied';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+clipboard.on('error', function(e) {
+ console.error('Action:', e.action);
+ console.error('Trigger:', e.trigger);
+ e.trigger.textContent = 'Error Copying';
+ window.setTimeout(function() {
+ e.trigger.textContent = 'Copy';
+ }, 2000);
+ e.clearSelection();
+});
+</script>
+</body>
+</html>

pyserini/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

pyserini/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (165 Bytes). View file

pyserini/__pycache__/encoded_corpus_info.cpython-310.pyc ADDED Viewed

Binary file (971 Bytes). View file

pyserini/__pycache__/encoded_query_info.cpython-310.pyc ADDED Viewed

Binary file (15.2 kB). View file

pyserini/__pycache__/evaluate_script_info.cpython-310.pyc ADDED Viewed

Binary file (749 Bytes). View file

pyserini/__pycache__/prebuilt_index_info.cpython-310.pyc ADDED Viewed

Binary file (179 kB). View file

pyserini/__pycache__/pyclass.cpython-310.pyc ADDED Viewed

Binary file (736 Bytes). View file

pyserini/__pycache__/setup.cpython-310.pyc ADDED Viewed

Binary file (780 Bytes). View file

pyserini/__pycache__/util.cpython-310.pyc ADDED Viewed

Binary file (8.03 kB). View file

pyserini/analysis/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from ._base import get_lucene_analyzer, Analyzer, JAnalyzer, JAnalyzerUtils, JDefaultEnglishAnalyzer, JWhiteSpaceAnalyzer
+__all__ = ['get_lucene_analyzer', 'Analyzer', 'JAnalyzer', 'JAnalyzerUtils', 'JDefaultEnglishAnalyzer', 'JWhiteSpaceAnalyzer']

pyserini/analysis/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (361 Bytes). View file

pyserini/analysis/__pycache__/_base.cpython-310.pyc ADDED Viewed

Binary file (5 kB). View file

pyserini/analysis/_base.py ADDED Viewed

	@@ -0,0 +1,166 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from typing import List
+from ..pyclass import autoclass
# Wrappers around Lucene classes.
# Each name is bound to the result of ``autoclass`` for the given fully
# qualified Java class name.
JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
JDanishAnalyzer = autoclass('org.apache.lucene.analysis.da.DanishAnalyzer')
JDutchAnalyzer = autoclass('org.apache.lucene.analysis.nl.DutchAnalyzer')
JFinnishAnalyzer = autoclass('org.apache.lucene.analysis.fi.FinnishAnalyzer')
JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
JHungarianAnalyzer = autoclass('org.apache.lucene.analysis.hu.HungarianAnalyzer')
JIndonesianAnalyzer = autoclass('org.apache.lucene.analysis.id.IndonesianAnalyzer')
JItalianAnalyzer = autoclass('org.apache.lucene.analysis.it.ItalianAnalyzer')
JJapaneseAnalyzer = autoclass('org.apache.lucene.analysis.ja.JapaneseAnalyzer')
JNorwegianAnalyzer = autoclass('org.apache.lucene.analysis.no.NorwegianAnalyzer')
JPortugueseAnalyzer = autoclass('org.apache.lucene.analysis.pt.PortugueseAnalyzer')
JRussianAnalyzer = autoclass('org.apache.lucene.analysis.ru.RussianAnalyzer')
JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
JSwedishAnalyzer = autoclass('org.apache.lucene.analysis.sv.SwedishAnalyzer')
JTeluguAnalyzer = autoclass('org.apache.lucene.analysis.te.TeluguAnalyzer')
JThaiAnalyzer = autoclass('org.apache.lucene.analysis.th.ThaiAnalyzer')
JTurkishAnalyzer = autoclass('org.apache.lucene.analysis.tr.TurkishAnalyzer')
JWhiteSpaceAnalyzer = autoclass('org.apache.lucene.analysis.core.WhitespaceAnalyzer')
JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')

# Wrappers around Anserini classes.
# ``JDefaultEnglishAnalyzer`` is bound exactly once, here: it lives in the
# ``io.anserini`` package, so it belongs with the Anserini wrappers rather
# than among the Lucene ones.
JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')
JHuggingFaceTokenizerAnalyzer = autoclass('io.anserini.analysis.HuggingFaceTokenizerAnalyzer')
def get_lucene_analyzer(language: str='en', stemming: bool=True, stemmer: str='porter', stopwords: bool=True, huggingFaceTokenizer: str=None) -> JAnalyzer:
    """Create a Lucene ``Analyzer`` with specific settings.

    Parameters
    ----------
    language : str
        Name of analyzer, matched case-insensitively. Supported values are
        ``'ar'``, ``'bn'``, ``'zh'``, ``'ko'``, ``'da'``, ``'nl'``, ``'fi'``,
        ``'fr'``, ``'de'``, ``'hi'``, ``'hu'``, ``'id'``, ``'it'``, ``'ja'``,
        ``'no'``, ``'pt'``, ``'ru'``, ``'es'``, ``'sv'``, ``'te'``, ``'th'``,
        ``'tr'``, ``'tweet'``, ``'hgf_tokenizer'`` and ``'en'``.
    stemming : bool
        Set to stem. Only consulted when ``language`` is ``'en'``.
    stemmer : str
        Stemmer to use. Only consulted when ``language`` is ``'en'`` and
        ``stemming`` is true.
    stopwords : bool
        Set to filter stopwords. Only consulted when ``language`` is ``'en'``;
        when false, an empty stopword set is passed to the analyzer factory.
    huggingFaceTokenizer: str
        a huggingface model id or path to a tokenizer.json file. Required
        when ``language`` is ``'hgf_tokenizer'``; ignored otherwise.

    Returns
    -------
    JAnalyzer
        Java ``Analyzer`` with specified settings.

    Raises
    ------
    ValueError
        If ``language`` is not one of the supported values, or if
        ``language`` is ``'hgf_tokenizer'`` and no ``huggingFaceTokenizer``
        was supplied.
    """
    # Normalize once instead of re-lowering the string in every comparison.
    lang = language.lower()

    # English is the only analyzer that honours the stemming/stopword knobs.
    if lang == 'en':
        if stemming:
            if stopwords:
                return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer)
            return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer, JCharArraySet.EMPTY_SET)
        if stopwords:
            return JDefaultEnglishAnalyzer.newNonStemmingInstance()
        return JDefaultEnglishAnalyzer.newNonStemmingInstance(JCharArraySet.EMPTY_SET)

    if lang == 'hgf_tokenizer':
        # Fail early with a clear message rather than handing ``None`` to the
        # Java constructor.
        if huggingFaceTokenizer is None:
            raise ValueError('huggingFaceTokenizer must be provided when language is "hgf_tokenizer".')
        return JHuggingFaceTokenizerAnalyzer(huggingFaceTokenizer)

    # Every remaining analyzer is built with its no-argument constructor.
    no_arg_analyzers = {
        'ar': JArabicAnalyzer,
        'bn': JBengaliAnalyzer,
        'zh': JCJKAnalyzer,
        'ko': JCJKAnalyzer,
        'da': JDanishAnalyzer,
        'nl': JDutchAnalyzer,
        'fi': JFinnishAnalyzer,
        'fr': JFrenchAnalyzer,
        'de': JGermanAnalyzer,
        'hi': JHindiAnalyzer,
        'hu': JHungarianAnalyzer,
        'id': JIndonesianAnalyzer,
        'it': JItalianAnalyzer,
        'ja': JJapaneseAnalyzer,
        'no': JNorwegianAnalyzer,
        'pt': JPortugueseAnalyzer,
        'ru': JRussianAnalyzer,
        'es': JSpanishAnalyzer,
        # The Swedish wrapper is loaded at module level but was previously
        # unreachable from this factory.
        'sv': JSwedishAnalyzer,
        'te': JTeluguAnalyzer,
        'th': JThaiAnalyzer,
        'tr': JTurkishAnalyzer,
        'tweet': JTweetAnalyzer,
    }
    analyzer_class = no_arg_analyzers.get(lang)
    if analyzer_class is None:
        raise ValueError('Invalid configuration.')
    return analyzer_class()
class Analyzer:
    """Thin Python facade over a Lucene ``Analyzer`` that turns text into tokens.

    Parameters
    ----------
    analyzer : JAnalyzer
        Lucene ``Analyzer`` to delegate to.

    Raises
    ------
    TypeError
        If ``analyzer`` is not a ``JAnalyzer`` instance.
    """

    def __init__(self, analyzer):
        if not isinstance(analyzer, JAnalyzer):
            raise TypeError('Invalid JAnalyzer!')
        self.analyzer = analyzer

    def analyze(self, text: str) -> List[str]:
        """Run the wrapped analyzer over ``text``.

        Parameters
        ----------
        text : str
            Text to analyze.

        Returns
        -------
        List[str]
            Tokens produced by the analyzer, in the order returned by ``JAnalyzerUtils.analyze``.
        """
        java_tokens = JAnalyzerUtils.analyze(self.analyzer, text)
        return [token for token in java_tokens.toArray()]

pyserini/collection/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from ._base import Collection, FileSegment, SourceDocument
+from ._collection_support import Cord19Article
+__all__ = ['Collection', 'FileSegment', 'SourceDocument', 'Cord19Article']

pyserini/collection/_base.py ADDED Viewed

	@@ -0,0 +1,153 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import logging
+import re
+from enum import Enum
+from ..multithreading import Counters
+from ..pyclass import autoclass, cast, JPaths
+logger = logging.getLogger(__name__)
+JFileSegment = autoclass('io.anserini.collection.FileSegment')
+JSourceDocument = autoclass('io.anserini.collection.SourceDocument')
+class JCollections(Enum):  # Registry: member name -> Anserini collection class loaded via autoclass; Collection._get_collection looks members up by name and calls the value with a path.
+ AclAnthology = autoclass('io.anserini.collection.AclAnthology')
+ CarCollection = autoclass('io.anserini.collection.CarCollection')
+ Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
+ ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
+ ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
+ HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
+ JsonCollection = autoclass('io.anserini.collection.JsonCollection')
+ NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
+ TrecCollection = autoclass('io.anserini.collection.TrecCollection')
+ TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
+ TweetCollection = autoclass('io.anserini.collection.TweetCollection')
+ WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
+ WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')
class Collection:
    """
    Iterable wrapper class for Anserini's DocumentCollection.

    Iterating yields one ``FileSegment`` wrapper per segment returned by the
    underlying Java iterator.

    Parameters
    ----------
    collection_class : str
        Name of collection class to instantiate (a member name of ``JCollections``)
    collection_path : str
        Path to directory containing collection

    Raises
    ------
    ValueError
        If the collection class cannot be looked up or instantiated; the
        original error is attached as ``__cause__``.
    """

    def __init__(self, collection_class, collection_path):
        self.counters = Counters()
        self.collection_class = collection_class
        self.collection_path = JPaths.get(collection_path)
        self.object = self._get_collection()
        self.collection_iterator = self.object.iterator()

    def _get_collection(self):
        """Instantiate the Java collection named by ``self.collection_class``."""
        try:
            return JCollections[self.collection_class].value(self.collection_path)
        except Exception as err:
            # Catch Exception rather than everything so KeyboardInterrupt/SystemExit
            # propagate, and chain the cause so the real failure stays visible.
            raise ValueError(self.collection_class) from err

    def __iter__(self):
        return self

    def __next__(self):
        if not self.collection_iterator.hasNext():
            raise StopIteration
        fs = self.collection_iterator.next()
        return FileSegment(self, fs, fs.getSegmentPath())
class FileSegment:
    """
    Iterable wrapper class for Anserini's FileSegment.

    Iterating yields one ``SourceDocument`` wrapper per document. When the
    segment is exhausted, error and skip counts are recorded on the parent
    collection's counters and the underlying segment is closed.

    Parameters
    ----------
    collection : Collection
        Parent collection of the file segment
    segment : JFileSegment
        FileSegment object to create wrapper from
    segment_path : str
        Path to file backing the file segment (relativized against the
        collection path to build ``segment_name``)
    """

    def __init__(self, collection, segment, segment_path):
        self.collection = collection
        try:
            # Prefer the collection-specific nested Segment class.
            self.object = cast(collection.object.getClass().getName() +
                               '$Segment', segment)
        except Exception:
            # Fall back to the generic FileSegment type if the specific cast fails.
            logger.exception('Exception from casting FileSegment type...')
            self.object = cast('io.anserini.collection.FileSegment', segment)
        self.segment_iterator = self.object.iterator()
        self.segment_path = segment_path
        # Replace path separators so the name is a single flat token.
        self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString())

    def __iter__(self):
        return self

    def __next__(self):
        # Reuse the iterator created in __init__ (as Collection does) instead of
        # asking the segment for a new iterator on every hasNext()/next() call.
        if self.segment_iterator.hasNext():
            return SourceDocument(self, self.segment_iterator.next())

        # log if iteration stopped by error
        if self.object.getErrorStatus():
            logger.error('%s: Error from segment iteration, stopping...', self.segment_name)
            self.collection.counters.errors.increment()

        # stop iteration and log skipped documents
        skipped = self.object.getSkippedCount()
        if skipped > 0:
            self.collection.counters.skips.increment(skipped)
            logger.warning('%s: %s documents skipped', self.segment_name, skipped)
        self.object.close()
        raise StopIteration
class SourceDocument:
    """
    Python-side snapshot of an Anserini SourceDocument.

    The document's ``id``, ``indexable``, ``contents`` and ``raw`` values are
    read from the Java object once, at construction time, and stored as plain
    attributes.

    Parameters
    ----------
    segment : FileSegment
        Parent segment of the source document
    document : io.anserini.collection.SourceDocument
        SourceDocument object to create wrapper from

    Raises
    ------
    TypeError
        If ``document`` is not a ``JSourceDocument`` instance.
    """

    def __init__(self, segment, document):
        if not isinstance(document, JSourceDocument):
            raise TypeError('Invalid JSourceDocument!')

        self.segment = segment
        self.object = document
        # Eagerly copy the four accessor values off the Java object.
        self.id = document.id()
        self.indexable = document.indexable()
        self.contents = document.contents()
        self.raw = document.raw()

pyserini/collection/_collection_support.py ADDED Viewed

	@@ -0,0 +1,78 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Implementations of support for specific collections.
+import json
class Cord19Article:
    """Wrapper class for a raw JSON article from AI2's COVID-19 Open Research Dataset (CORD-19).

    Parameters
    ----------
    doc : str
        A JSON string of a CORD-19 article.

    Raises
    ------
    TypeError
        If the parsed JSON has neither a ``cord_uid`` nor a ``paper_id`` field.
    """

    def __init__(self, doc):
        self.json = json.loads(doc)
        # Performs some basic error checking, throws an exception if user tries to instantiate with something
        # that isn't from CORD-19.
        if 'cord_uid' in self.json:
            self.full_text = False
        elif 'paper_id' in self.json:
            self.full_text = True
        else:
            raise TypeError('Not a CORD-19 article: expected a "cord_uid" or "paper_id" field.')

    def is_full_text(self):
        """Return the article's ``has_full_text`` field (``KeyError`` if absent)."""
        return self.json['has_full_text']

    def cord_uid(self):
        """Return the article's ``cord_uid`` field (``KeyError`` if absent)."""
        return self.json['cord_uid']

    def bib_entries(self):
        """Return the article's ``bib_entries`` field (``KeyError`` if absent)."""
        return self.json['bib_entries']

    def title(self):
        """Return the title, or ``''`` when any required field is missing.

        Full-text articles read ``metadata.title``; others read ``csv_metadata.title``.
        """
        try:
            if self.is_full_text():
                return self.json['metadata']['title']
            return self.json['csv_metadata']['title']
        except KeyError:
            return ''

    def abstract(self):
        """Return ``csv_metadata.abstract``, or ``''`` when it is missing."""
        try:
            # For a full-text article, we can grab the abstract from two independent sources, the metadata or the
            # actual full text. Here, we make the decision to use the metadata, even for full text.
            return self.json['csv_metadata']['abstract']
        except KeyError:
            return ''

    def metadata(self):
        """Return the article's ``csv_metadata`` field (``KeyError`` if absent)."""
        return self.json['csv_metadata']

    def body(self):
        """Return the list of body paragraph texts.

        Always returns a list: empty for articles without full text and also
        when an expected field is missing, so callers get one return type.
        """
        try:
            if self.is_full_text():
                return [entry['text'] for entry in self.json['body_text']]
            return []
        except KeyError:
            return []

pyserini/demo/acl.py ADDED Viewed

	@@ -0,0 +1,124 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+This script provides an interactive web interface demo for retrieval on the ACL dataset.
+It requires `flask` (`pip install flask~=2.2.0`).
+An example command looks like `python -m pyserini.demo.acl` that starts up a server on port 8080.
+The demo can be accessed via "http://localhost:8080" in a web browser.
+Additional arguments include:
+ --port [PORT] --hits [Number of hits]
+ --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda]
+"""
+import json
+import logging
+from argparse import ArgumentParser
+from functools import partial
+from typing import Callable, Optional, Tuple, Union
+from flask import Flask, render_template, request, flash, jsonify
+from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
+logging.basicConfig(
+ format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S',
+ level=logging.INFO,
+)
+logger = logging.getLogger('acl-demo')
+VERSION = '1.0'
+Searcher = Union[FaissSearcher, LuceneSearcher]
def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]):
    """Build the Flask application for the ACL retrieval demo.

    Parameters
    ----------
    k : int
        Number of hits requested from the searcher for each query.
    load_searcher_fn : Callable[[str], Tuple[Searcher, str]]
        Called once with the language code ``'en'``; must return the searcher
        and a display name for the retriever.

    Returns
    -------
    Flask
        Application exposing ``/`` (search form) and ``/search`` (results).
    """
    import secrets  # local import: only needed to provision a session key

    app = Flask(__name__)
    # flash() stores its messages in the session, which Flask refuses to write
    # without a secret key; generate an ephemeral one for this process.
    if not app.secret_key:
        app.secret_key = secrets.token_hex(16)

    lang = 'en'
    searcher, retriever = load_searcher_fn(lang)

    @app.route('/')
    def index():
        return render_template('acl.html', lang=lang, retriever=retriever)

    @app.route('/search', methods=['GET', 'POST'])
    def search():
        # request.values merges the query string and the form body, so the
        # route works for both of its declared methods (form-only access made
        # every GET fail with 400).
        query = request.values.get('q', '')
        if not query:
            search_results = []
            flash('Question is required')
        else:
            hits = searcher.search(query, k=k)
            search_results = [
                {
                    'rank': rank,
                    'docid': hit.docid,
                    'doc': searcher.doc(hit.docid).contents(),
                    'score': hit.score,
                }
                for rank, hit in enumerate(hits, start=1)
            ]
        return render_template(
            'acl.html', search_results=search_results, query=query, lang=lang, retriever=retriever
        )

    return app
def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Tuple[Searcher, str]:
    """Open the local ACL paragraph Lucene index and configure it for BM25 search.

    Parameters
    ----------
    language : str
        Language code passed to ``searcher.set_language``.
    k1 : Optional[float]
        BM25 k1 parameter. Applied only when ``b`` is given as well.
    b : Optional[float]
        BM25 b parameter. Applied only when ``k1`` is given as well.

    Returns
    -------
    Tuple[Searcher, str]
        The searcher and a display name for the retriever.
    """
    searcher = LuceneSearcher('indexes/lucene-index-acl-paragraph')
    searcher.set_language(language)
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
        retriever_name = f'BM25 (k1={k1}, b={b})'
    else:
        # Either parameter missing: leave the searcher's BM25 settings untouched.
        retriever_name = 'BM25'
    return searcher, retriever_name
def main():
    """Parse the command line and serve the ACL demo on all interfaces."""
    parser = ArgumentParser()
    parser.add_argument('--k1', help='BM25 k1 parameter.', type=float)
    parser.add_argument('--b', help='BM25 b parameter.', type=float)
    parser.add_argument('--hits', help='Number of hits returned by the retriever', type=int, default=10)
    parser.add_argument(
        '--device',
        help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)',
        type=str,
        default='cpu',
    )
    parser.add_argument('--port', help='Web server port', type=int, default=8080)
    options = parser.parse_args()

    # Bind the BM25 parameters now; create_app supplies the language later.
    searcher_loader = partial(_load_sparse_searcher, k1=options.k1, b=options.b)
    demo_app = create_app(options.hits, searcher_loader)
    demo_app.run(host='0.0.0.0', port=options.port)
+if __name__ == '__main__':
+ main()

pyserini/demo/dpr.py ADDED Viewed

	@@ -0,0 +1,105 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import cmd
+import json
+import random
+from pyserini.search.lucene import LuceneSearcher
+from pyserini.search.faiss import FaissSearcher, DprQueryEncoder
+from pyserini.search.hybrid import HybridSearcher
+from pyserini import search
class DPRDemo(cmd.Cmd):
    """Interactive shell for sparse, dense and hybrid retrieval over the DPR Wikipedia index.

    Lines starting with ``/`` are commands (``/help``, ``/k``, ``/mode``,
    ``/random``); any other input is run as a query against the current searcher.
    Topics, indexes and the query encoder are loaded when the class is defined.
    """

    nq_dev_topics = list(search.get_topics('dpr-nq-dev').values())
    trivia_dev_topics = list(search.get_topics('dpr-trivia-dev').values())
    ssearcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr')
    searcher = ssearcher
    encoder = DprQueryEncoder("facebook/dpr-question_encoder-multiset-base")
    index = 'wikipedia-dpr-multi-bf'
    dsearcher = FaissSearcher.from_prebuilt_index(
        index,
        encoder
    )
    hsearcher = HybridSearcher(dsearcher, ssearcher)
    k = 10
    prompt = '>>> '

    def precmd(self, line):
        """Strip the leading ``/`` so ``/k 5`` dispatches to ``do_k``."""
        # startswith() is safe on an empty line, where line[0] raised IndexError.
        if line.startswith('/'):
            line = line[1:]
        return line

    def do_help(self, arg):
        print('/help    : returns this message')
        print('/k [NUM] : sets k (number of hits to return) to [NUM]')
        print('/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)')
        print('/random [COLLECTION]: returns results for a random question from the dev subset [COLLECTION] (one of nq, trivia).')

    def do_k(self, arg):
        # Reject bad input with a message instead of killing the shell.
        try:
            new_k = int(arg)
        except ValueError:
            print(f'Invalid value "{arg}" for k. Expected an integer.')
            return
        print(f'setting k = {new_k}')
        self.k = new_k

    def do_mode(self, arg):
        if arg == "sparse":
            self.searcher = self.ssearcher
        elif arg == "dense":
            self.searcher = self.dsearcher
        elif arg == "hybrid":
            self.searcher = self.hsearcher
        else:
            print(
                f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].')
            return
        print(f'setting retriver = {arg}')

    def do_random(self, arg):
        if arg == "nq":
            topics = self.nq_dev_topics
        elif arg == "trivia":
            topics = self.trivia_dev_topics
        else:
            print(
                f'Collection "{arg}" is invalid. Collection should be one of [nq, trivia].')
            return
        q = random.choice(topics)['title']
        print(f'question: {q}')
        self.default(q)

    def do_EOF(self, line):
        return True

    def default(self, q):
        """Run ``q`` against the current searcher and print the ranked contents."""
        hits = self.searcher.search(q, self.k)
        for rank, hit in enumerate(hits, start=1):
            if isinstance(self.searcher, LuceneSearcher):
                raw_doc = hit.raw
            else:
                doc = self.searcher.doc(hit.docid)
                raw_doc = doc.raw() if doc else None
            if raw_doc is None:
                # No stored document to decode; json.loads(None) would raise TypeError.
                print(f'{rank:2} {hit.score:.5f} [document {hit.docid} not found]')
                continue
            jsondoc = json.loads(raw_doc)
            print(f'{rank:2} {hit.score:.5f} {jsondoc["contents"]}')
+if __name__ == '__main__':
+ DPRDemo().cmdloop()

pyserini/demo/miracl.py ADDED Viewed

	@@ -0,0 +1,149 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+This script provides an interactive web interface demo for retrieval on the MIRACL dataset.
+It requires `flask` (`pip install flask~=2.2.0`).
+An example command looks like `python -m pyserini.demo.miracl` that starts up a server on port 8080.
+The demo can be accessed via "http://localhost:8080" in a web browser.
+Additional arguments include:
+ --port [PORT] --hits [Number of hits] --index [BM25 or mdpr-tied-pft-msmarco]
+ --k1 [BM25 k1] --b [BM25 b] --device [cpu, cuda]
+"""
+import json
+import logging
+from argparse import ArgumentParser
+from functools import partial
+from typing import Callable, Optional, Tuple, Union
+from flask import Flask, render_template, request, flash, jsonify
+from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
+logging.basicConfig(
+ format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S',
+ level=logging.INFO,
+)
+logger = logging.getLogger('miracl-demo')
+VERSION = '1.0'
+LANGUAGES = ('ar', 'bn', 'en', 'es', 'fa', 'fi', 'fr', 'hi', 'id', 'ja', 'ko', 'ru', 'sw', 'te', 'th', 'zh')
+Searcher = Union[FaissSearcher, LuceneSearcher]
def create_app(k: int, load_searcher_fn: Callable[[str], Tuple[Searcher, str]]):
    """Build the Flask application for the MIRACL retrieval demo.

    Parameters
    ----------
    k : int
        Number of hits requested from the searcher for each query.
    load_searcher_fn : Callable[[str], Tuple[Searcher, str]]
        Called with a language code from ``LANGUAGES`` (initially the first
        entry, then again on every language switch); must return the searcher
        and a display name for the retriever.

    Returns
    -------
    Flask
        Application exposing ``/`` (search form), ``/search`` (results) and
        ``/lang`` (switch the active language).
    """
    import secrets  # local import: only needed to provision a session key

    app = Flask(__name__)
    # flash() stores its messages in the session, which Flask refuses to write
    # without a secret key; generate an ephemeral one for this process.
    if not app.secret_key:
        app.secret_key = secrets.token_hex(16)

    lang = LANGUAGES[0]
    searcher, retriever = load_searcher_fn(lang)

    @app.route('/')
    def index():
        return render_template('miracl.html', lang=lang, retriever=retriever)

    @app.route('/search', methods=['GET', 'POST'])
    def search():
        # request.values merges the query string and the form body, so the
        # route works for both of its declared methods (form-only access made
        # every GET fail with 400).
        query = request.values.get('q', '')
        if not query:
            search_results = []
            flash('Question is required')
        else:
            hits = searcher.search(query, k=k)
            docs = [json.loads(searcher.doc(hit.docid).raw()) for hit in hits]
            search_results = [
                {
                    'rank': rank,
                    'docid': hit.docid,
                    'doc': doc['text'],
                    'title': doc['title'],
                    'score': hit.score,
                }
                for rank, (hit, doc) in enumerate(zip(hits, docs), start=1)
            ]
        return render_template(
            'miracl.html', search_results=search_results, query=query, lang=lang, retriever=retriever
        )

    @app.route('/lang', methods=['GET'])
    def change_language():
        # Rebinds the enclosing searcher state, hence nonlocal.
        nonlocal lang, searcher, retriever
        new_lang = request.args.get('new_lang', '', type=str)
        if new_lang not in LANGUAGES:
            # A view must return a response; returning None made Flask answer 500.
            return jsonify(error=f'Unsupported language: {new_lang!r}', lang=lang), 400
        lang = new_lang
        searcher, retriever = load_searcher_fn(lang)
        return jsonify(lang=lang)

    return app
def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Tuple[Searcher, str]:
    """Load the prebuilt MIRACL Lucene index for ``language`` and configure it for BM25 search.

    Parameters
    ----------
    language : str
        Language code; selects the prebuilt index ``miracl-v{VERSION}-{language}``
        and is passed to ``searcher.set_language``.
    k1 : Optional[float]
        BM25 k1 parameter. Applied only when ``b`` is given as well.
    b : Optional[float]
        BM25 b parameter. Applied only when ``k1`` is given as well.

    Returns
    -------
    Tuple[Searcher, str]
        The searcher and a display name for the retriever.
    """
    searcher = LuceneSearcher.from_prebuilt_index(f'miracl-v{VERSION}-{language}')
    searcher.set_language(language)
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
        retriever_name = f'BM25 (k1={k1}, b={b})'
    else:
        # Either parameter missing: leave the searcher's BM25 settings untouched.
        retriever_name = 'BM25'
    return searcher, retriever_name
def _load_faiss_searcher(language: str, device: str) -> Tuple[Searcher, str]:
    """Load the prebuilt MIRACL mDPR Faiss index for ``language`` with its query encoder.

    Parameters
    ----------
    language : str
        Language code; selects the prebuilt index
        ``miracl-v{VERSION}-{language}-mdpr-tied-pft-msmarco``.
    device : str
        Device passed to ``AutoQueryEncoder`` for running the query encoder.

    Returns
    -------
    Tuple[Searcher, str]
        The searcher and a display name for the retriever.
    """
    query_encoder = AutoQueryEncoder(encoder_dir='castorini/mdpr-tied-pft-msmarco', device=device)
    searcher = FaissSearcher.from_prebuilt_index(
        f'miracl-v{VERSION}-{language}-mdpr-tied-pft-msmarco', query_encoder
    )
    retriever_name = 'mDPR-pFT-MSMARCO'
    return searcher, retriever_name
def main():
    """Parse the command line, pick the retriever loader and serve the MIRACL demo."""
    parser = ArgumentParser()
    parser.add_argument('--index', help='Index type.', default='BM25', choices=('BM25', 'mdpr-tied-pft-msmarco'))
    parser.add_argument('--k1', help='BM25 k1 parameter.', type=float)
    parser.add_argument('--b', help='BM25 b parameter.', type=float)
    parser.add_argument('--hits', help='Number of hits returned by the retriever', type=int, default=10)
    parser.add_argument(
        '--device',
        help='Device to run query encoder, cpu or [cuda:0, cuda:1, ...] (used only when index is based on FAISS)',
        type=str,
        default='cpu',
    )
    parser.add_argument('--port', help='Web server port', type=int, default=8080)
    options = parser.parse_args()

    # Dense retrieval only for the mDPR index; every other choice uses BM25.
    use_dense = options.index == 'mdpr-tied-pft-msmarco'
    searcher_loader = (
        partial(_load_faiss_searcher, device=options.device)
        if use_dense
        else partial(_load_sparse_searcher, k1=options.k1, b=options.b)
    )
    demo_app = create_app(options.hits, searcher_loader)
    demo_app.run(host='0.0.0.0', port=options.port)
+if __name__ == '__main__':
+ main()

pyserini/demo/msmarco.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import cmd
+import json
+import os
+import random
+from pyserini.search.lucene import LuceneSearcher
+from pyserini.search.faiss import FaissSearcher, TctColBertQueryEncoder, AnceQueryEncoder
+from pyserini.search.hybrid import HybridSearcher
+from pyserini import search
class MsMarcoDemo(cmd.Cmd):
    """Interactive shell for sparse, dense and hybrid retrieval over MS MARCO passages.

    Lines starting with ``/`` are commands (``/help``, ``/k``, ``/model``,
    ``/mode``, ``/random``); any other input is run as a query against the
    current searcher. The dev topics and the sparse index are loaded when the
    class is defined; the dense and hybrid searchers are created by ``/model``.
    """

    dev_topics = list(search.get_topics('msmarco-passage-dev-subset').values())
    ssearcher = LuceneSearcher.from_prebuilt_index('msmarco-passage')
    dsearcher = None
    hsearcher = None
    searcher = ssearcher
    k = 10
    prompt = '>>> '

    # https://stackoverflow.com/questions/35213134/command-prefixes-in-python-cli-using-cmd-in-pythons-standard-library
    def precmd(self, line):
        """Strip the leading ``/`` so ``/k 5`` dispatches to ``do_k``."""
        # startswith() is safe on an empty line, where line[0] raised IndexError.
        if line.startswith('/'):
            line = line[1:]
        return line

    def do_help(self, arg):
        print('/help    : returns this message')
        print('/k [NUM] : sets k (number of hits to return) to [NUM]')
        print('/model [MODEL] : sets encoder to use the model [MODEL] (one of tct, ance)')
        print('/mode [MODE] : sets retriever type to [MODE] (one of sparse, dense, hybrid)')
        print('/random : returns results for a random question from dev subset')

    def do_k(self, arg):
        # Reject bad input with a message instead of killing the shell.
        try:
            new_k = int(arg)
        except ValueError:
            print(f'Invalid value "{arg}" for k. Expected an integer.')
            return
        print(f'setting k = {new_k}')
        self.k = new_k

    def do_mode(self, arg):
        if arg == "sparse":
            self.searcher = self.ssearcher
        elif arg == "dense":
            if self.dsearcher is None:
                print('Specify model through /model before using dense retrieval.')
                return
            self.searcher = self.dsearcher
        elif arg == "hybrid":
            if self.hsearcher is None:
                print('Specify model through /model before using hybrid retrieval.')
                return
            self.searcher = self.hsearcher
        else:
            print(
                f'Mode "{arg}" is invalid. Mode should be one of [sparse, dense, hybrid].')
            return
        print(f'setting retriver = {arg}')

    def do_model(self, arg):
        if arg == "tct":
            encoder = TctColBertQueryEncoder("castorini/tct_colbert-msmarco")
            index = "msmarco-passage-tct_colbert-hnsw"
        elif arg == "ance":
            encoder = AnceQueryEncoder("castorini/ance-msmarco-passage")
            index = "msmarco-passage-ance-bf"
        else:
            print(
                f'Model "{arg}" is invalid. Model should be one of [tct, ance].')
            return
        self.dsearcher = FaissSearcher.from_prebuilt_index(
            index,
            encoder
        )
        # The hybrid searcher is rebuilt so it always pairs with the current dense model.
        self.hsearcher = HybridSearcher(self.dsearcher, self.ssearcher)
        print(f'setting model = {arg}')

    def do_random(self, arg):
        q = random.choice(self.dev_topics)['title']
        print(f'question: {q}')
        self.default(q)

    def do_EOF(self, line):
        return True

    def default(self, q):
        """Run ``q`` against the current searcher and print the ranked contents."""
        hits = self.searcher.search(q, self.k)
        for rank, hit in enumerate(hits, start=1):
            if isinstance(self.searcher, LuceneSearcher):
                raw_doc = hit.raw
            else:
                doc = self.searcher.doc(hit.docid)
                raw_doc = doc.raw() if doc else None
            if raw_doc is None:
                # No stored document to decode; json.loads(None) would raise TypeError.
                print(f'{rank:2} {hit.score:.5f} [document {hit.docid} not found]')
                continue
            jsondoc = json.loads(raw_doc)
            print(f'{rank:2} {hit.score:.5f} {jsondoc["contents"]}')
+if __name__ == '__main__':
+ MsMarcoDemo().cmdloop()

pyserini/demo/templates/acl.html ADDED Viewed

	@@ -0,0 +1,74 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta property="og:title" content="ACL 🌍🙌🌏">
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css" rel="stylesheet"
+ integrity="sha384-Zenh87qX5JnK2Jl0vWa8Ck2rdkQ2Bzep5IDxbcnCeuOxjzrPF/et3URy9Bv1WTRi" crossorigin="anonymous">
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.9.1/font/bootstrap-icons.css">
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js"
+ integrity="sha384-OERcA2EqjJCMA+/3y+gxIOqMEjwtxJY7qPCqsdltbNJuaOe923+mo//f6V8Qbsw3"
+ crossorigin="anonymous"></script>
+ <script src="https://cdn.jsdelivr.net/npm/jquery@3.6.1/dist/jquery.min.js"></script>
+ <script>
+ $SCRIPT_ROOT = {{ request.script_root|tojson }};
+ </script>
+ <title>ACL 🌍🙌🌏 Demo</title>
+</head>
+<body>
+<div style="display: flex; align-items: center; gap: 10px;">
+ <h2>ACL</h2>
+ <img src="https://aclanthology.org/images/acl-logo.svg" alt="acl logo" width="50px">
+ <h2>Demo</h2>
+</div>
+<br/>
+<div class="container text-center">
+ {% for message in get_flashed_messages() %}
+ <div class="alert">{{ message }}</div>
+ {% endfor %}
+ <form action="/search" method="post">
+ <div class="row-cols-3">
+ <div class="input-group mb-3">
+ <input type="text" class="form-control" placeholder="Enter a Question" aria-label="Question" name="q"
+ aria-describedby="button-addon2" value="{{ query if query else '' }}">
+ <button class="btn btn-outline-secondary" type="submit" id="button-addon2"><i class="bi bi-search"></i>
+ </button>
+ </div>
+ </div>
+ </form>
+ {% if search_results %}
+ <div class="row">
+ <table class="table">
+ <thead>
+ <tr>
+ <th scope="col">#</th>
+ <th scope="col">Score</th>
+ <th scope="col">Passage ID</th>
+ <th scope="col">Content</th>
+ </tr>
+ </thead>
+ <tbody class="table-group-divider">
+ {% for res in search_results %}
+ <tr class="{{ 'table-secondary' if res['rank'] % 2 else 'table-light' }}">
+ <th scope="row">{{ res["rank"] }}</th>
+ <td>{{ "%.2f"|format(res["score"]) }}</td>
+ <td>{{ res["docid"] }}</td>
+ <td style="word-wrap: break-word;min-width: 600px;max-width: 600px;"
+ class="text-{{ 'end' if lang in ('ar', 'fa') else 'start' }}">
+ <small>{{ res["doc"] }}</small>
+ </td>
+ </tr>
+ {% endfor %}
+ </tbody>
+ </table>
+ </div>
+ {% endif %}
+</div>
+</body>
+</html>

pyserini/demo/templates/assets/acl-logo.svg ADDED Viewed

pyserini/demo/templates/miracl.html ADDED Viewed

	@@ -0,0 +1,127 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="UTF-8">
+ <meta property="og:title" content="MIRACL 🌍🙌🌏">
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Zenh87qX5JnK2Jl0vWa8Ck2rdkQ2Bzep5IDxbcnCeuOxjzrPF/et3URy9Bv1WTRi" crossorigin="anonymous">
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.9.1/font/bootstrap-icons.css">
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.2.2/dist/js/bootstrap.bundle.min.js" integrity="sha384-OERcA2EqjJCMA+/3y+gxIOqMEjwtxJY7qPCqsdltbNJuaOe923+mo//f6V8Qbsw3" crossorigin="anonymous"></script>
+ <script src="https://cdn.jsdelivr.net/npm/jquery@3.6.1/dist/jquery.min.js"></script>
+ <script>
+ $SCRIPT_ROOT = {{ request.script_root|tojson }};
+ $( document ).ready(function() {
+ $("#loading").hide();
+ $('#language').val("{{lang}}");
+ });
+ $(function() {
+ $('#language').on('change', function() {
+ $.getJSON($SCRIPT_ROOT + '/lang', {
+ new_lang: this.value,
+ }, function(data) {
+ $("#language").removeAttr('disabled');
+ $("#loading").hide();
+ });
+ $(this).attr('disabled','disabled');
+ $("#loading").show();
+ return false;
+ });
+ });
+ </script>
+ <title>MIRACL 🌍🙌🌏 Demo</title>
+</head>
+<body>
+ <h2>MIRACL 🌍🙌🌏 Demo</h2>
+ <h4>Multilingual Information Retrieval Across a Continuum of Languages</h4>
+ <br/>
+ <p class="lead">
+ <a href="http://miracl.ai/">MIRACL</a> is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.
+ </p>
+ <div class="row g-3 align-items-center">
+ <label class="col-auto" for="language">This demo running on the language</label>
+ <div class="col-auto">
+ <select class="form-select form-select-sm" aria-label=".form-select-sm" id="language">
+ <option value="ar">Arabic</option>
+ <option value="bn">Bengali</option>
+ <option value="en">English</option>
+ <option value="es">Spanish</option>
+ <option value="fa">Persian</option>
+ <option value="fi">Finnish</option>
+ <option value="fr">French</option>
+ <option value="hi">Hindi</option>
+ <option value="id">Indonesian</option>
+ <option value="ja">Japanese</option>
+ <option value="ko">Korean</option>
+ <option value="ru">Russian</option>
+ <option value="sw">Swahili</option>
+ <option value="te">Telugu</option>
+ <option value="th">Thai</option>
+ <option value="zh">Chinese</option>
+ </select>
+ </div>
+ <div class="col-auto">
+ <div class="spinner-border text-secondary" role="status" id="loading">
+ <span class="visually-hidden">Loading...</span>
+ </div>
+ </div>
+ <div class="col-auto">
+ <span>
+ retrieves passages using <em>{{retriever}}</em>.
+ </span>
+ </div>
+ </div>
+ <br/>
+ <div class="container text-center">
+ {% for message in get_flashed_messages() %}
+ <div class="alert">{{ message }}</div>
+ {% endfor %}
+ <form action="/search" method="post">
+ <div class="row-cols-3">
+ <div class="input-group mb-3">
+ <input type="text" class="form-control" placeholder="Enter a Question" aria-label="Question" name="q" aria-describedby="button-addon2" value="{{query if query else ''}}">
+ <button class="btn btn-outline-secondary" type="submit" id="button-addon2"><i class="bi bi-search"></i></button>
+ </div>
+ </div>
+ </form>
+ {% if search_results %}
+ <div class="row">
+ <table class="table">
+ <thead>
+ <tr>
+ <th scope="col">#</th>
+ <th scope="col">Score</th>
+ <th scope="col">Passage ID</th>
+ <th scope="col">Title</th>
+ <th scope="col">Content</th>
+ </tr>
+ </thead>
+ <tbody class="table-group-divider">
+ {% for res in search_results %}
+ <tr class="{{'table-secondary' if res['rank'] % 2 else 'table-light'}}">
+ <th scope="row">{{res["rank"]}}</th>
+ <td>{{"%.2f"|format(res["score"])}}</td>
+ <td>{{res["docid"]}}</td>
+ <td>{{res["title"]}}</td>
+ <td style="word-wrap: break-word;min-width: 600px;max-width: 600px;" class="text-{{'end' if lang in ('ar', 'fa') else 'start'}}">
+ <small>{{res["doc"]}}</small>
+ </td>
+ </tr>
+ {% endfor %}
+ </tbody>
+ </table>
+ </div>
+ {% endif %}
+ </div>
+</body>
+</html>

pyserini/dsearch.py ADDED Viewed

	@@ -0,0 +1,46 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Deprecated. The package ``pyserini.dsearch`` has been renamed ``pyserini.search.faiss``. Stubs are retained here for
+redirection purposes to ensure that code in existing published papers remains functional (with warnings)."""
+import os
+import sys
+import pyserini.search.faiss
+from pyserini.search.faiss import TctColBertQueryEncoder
+__all__ = ['SimpleDenseSearcher', 'BinaryDenseSearcher', 'TctColBertQueryEncoder']
class SimpleDenseSearcher(pyserini.search.faiss.FaissSearcher):
    """Deprecated stand-in for ``pyserini.search.faiss.FaissSearcher``.

    Creating an instance prints a deprecation notice to stdout and then
    defers object creation to the parent class.
    """

    def __new__(cls, *args, **kwargs):
        notice = ('pyserini.dsearch.SimpleDenseSearcher class has been deprecated, '
                  'please use FaissSearcher from pyserini.search.faiss instead')
        print(notice)
        # Only ``cls`` is handed to the parent's ``__new__``; the constructor
        # arguments are accepted here but not forwarded by this method.
        instance = super().__new__(cls)
        return instance
class BinaryDenseSearcher(pyserini.search.faiss.BinaryDenseSearcher):
    """Deprecated stand-in for ``pyserini.search.faiss.BinaryDenseSearcher``.

    Creating an instance prints a deprecation notice to stdout and then
    defers object creation to the parent class.
    """

    def __new__(cls, *args, **kwargs):
        notice = ('pyserini.dsearch.BinaryDenseSearcher class has been deprecated, '
                  'please use BinaryDenseSearcher from pyserini.search.faiss instead')
        print(notice)
        # Only ``cls`` is handed to the parent's ``__new__``; the constructor
        # arguments are accepted here but not forwarded by this method.
        instance = super().__new__(cls)
        return instance
if __name__ == "__main__":
    # Script entry point: warn about the deprecation, then re-run the renamed
    # module with exactly the arguments this process received.
    import subprocess

    print('WARNING: pyserini.dsearch is deprecated, please use pyserini.search.faiss instead!')
    # Pass the arguments as a list (no shell) so values containing spaces or
    # shell metacharacters reach the child process unchanged, and use the
    # interpreter that is running this script rather than whichever ``python``
    # happens to be first on PATH.
    forwarded = [sys.executable, '-m', 'pyserini.search.faiss', *sys.argv[1:]]
    # Propagate the child's exit status so callers can detect failures.
    sys.exit(subprocess.call(forwarded))

pyserini/encode/__init__.py ADDED Viewed

	@@ -0,0 +1,28 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from ._base import DocumentEncoder, QueryEncoder, JsonlCollectionIterator,\
+ RepresentationWriter, FaissRepresentationWriter, JsonlRepresentationWriter, PcaEncoder
+from ._ance import AnceEncoder, AnceDocumentEncoder, AnceQueryEncoder
+from ._auto import AutoQueryEncoder, AutoDocumentEncoder
+from ._dpr import DprDocumentEncoder, DprQueryEncoder
+from ._tct_colbert import TctColBertDocumentEncoder, TctColBertQueryEncoder
+from ._aggretriever import AggretrieverDocumentEncoder, AggretrieverQueryEncoder
+from ._unicoil import UniCoilEncoder, UniCoilDocumentEncoder, UniCoilQueryEncoder
+from ._cached_data import CachedDataQueryEncoder
+from ._tok_freq import TokFreqQueryEncoder
+from ._splade import SpladeQueryEncoder
+from ._slim import SlimQueryEncoder

pyserini/encode/__main__.py ADDED Viewed

	@@ -0,0 +1,147 @@

+#
+# Pyserini: Reproducible IR research with sparse and dense representations
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+import sys
+from pyserini.encode import JsonlRepresentationWriter, FaissRepresentationWriter, JsonlCollectionIterator
+from pyserini.encode import DprDocumentEncoder, TctColBertDocumentEncoder, AnceDocumentEncoder, AggretrieverDocumentEncoder, AutoDocumentEncoder
+from pyserini.encode import UniCoilDocumentEncoder
# Maps an encoder-class keyword to the document encoder class used for it.
# init_encoder() walks this dict in insertion order and takes the first keyword
# found inside the model name, so the order of the entries is significant.
encoder_class_map = {
    "dpr": DprDocumentEncoder,
    "tct_colbert": TctColBertDocumentEncoder,
    "aggretriever": AggretrieverDocumentEncoder,
    "ance": AnceDocumentEncoder,
    "sentence-transformers": AutoDocumentEncoder,
    "unicoil": UniCoilDocumentEncoder,
    "auto": AutoDocumentEncoder,
}
# Values accepted for the --pooling option; checked in the __main__ block
# before the pooling strategy is assigned to an AutoDocumentEncoder.
ALLOWED_POOLING_OPTS = ["cls","mean"]
def init_encoder(encoder, encoder_class, device):
    """Instantiate the document encoder for a model.

    Args:
        encoder: Model name or path. It is passed to the encoder class as
            ``model_name`` and, when ``encoder_class`` is ``None``, searched
            (case-insensitively) for the keywords of ``encoder_class_map``.
        encoder_class: A key of ``encoder_class_map`` or ``"contriever"`` to
            pick the class explicitly, or ``None`` to infer it from ``encoder``.
        device: Device string passed through to the encoder class.

    Returns:
        An instance of the selected encoder class.

    Raises:
        KeyError: If ``encoder_class`` is given but is not a known class name.
    """
    requested_class = encoder_class
    # Lower-case once so that class inference and the pooling rules below treat
    # the model name the same way (e.g. "facebook/Contriever").
    model_name = encoder.lower()

    if requested_class is None:
        # The first keyword (in map order) contained in the model name wins;
        # fall back to AutoDocumentEncoder when nothing matches.
        encoder_class = next(
            (cls for keyword, cls in encoder_class_map.items() if keyword in model_name),
            AutoDocumentEncoder,
        )
    elif requested_class in encoder_class_map:
        encoder_class = encoder_class_map[requested_class]
    elif requested_class == "contriever":
        # "contriever" has dedicated pooling settings below but no entry in the
        # map, so an explicit request used to fail with a bare KeyError.
        # NOTE(review): AutoDocumentEncoder is chosen because the settings below
        # use the same keyword arguments as "sentence-transformers", which maps
        # to AutoDocumentEncoder -- confirm this is the intended class.
        encoder_class = AutoDocumentEncoder
    else:
        raise KeyError(
            f"Unknown encoder class {requested_class!r}; "
            f"expected one of {sorted(encoder_class_map)} or 'contriever'"
        )

    # prepare arguments to encoder class
    kwargs = dict(model_name=encoder, device=device)
    if requested_class == "sentence-transformers" or "sentence-transformers" in model_name:
        kwargs.update(pooling='mean', l2_norm=True)
    if requested_class == "contriever" or "contriever" in model_name:
        kwargs.update(pooling='mean', l2_norm=False)
    return encoder_class(**kwargs)
def parse_args(parser, commands, argv=None):
    """Parse a command line that chains several sub-commands.

    The argument list is cut into segments at every token that names a
    sub-command. Each segment is parsed on its own and stored under the
    sub-command's name, so ``input --corpus c output --embeddings e`` yields
    ``args.input.corpus`` and ``args.output.embeddings``.

    Args:
        parser: The top-level ``argparse.ArgumentParser``.
        commands: The sub-parsers action returned by ``parser.add_subparsers``;
            its ``choices`` supply the sub-command names.
        argv: Argument list to parse. Defaults to ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with one attribute per sub-command. The
        attribute holds that sub-command's own namespace, or ``None`` when the
        sub-command did not appear in ``argv``.
    """
    if argv is None:
        argv = sys.argv[1:]

    # Divide argv by commands: tokens before the first command form segment 0.
    # Any token equal to a command name starts a new segment, including one
    # that was meant as an option value.
    segments = [[]]
    for token in argv:
        if token in commands.choices:
            segments.append([token])
        else:
            segments[-1].append(token)

    # Initialize namespace so that absent commands read as None.
    args = argparse.Namespace()
    for name in commands.choices:
        setattr(args, name, None)

    # Parse each command
    parser.parse_args(segments[0], namespace=args)  # Without command
    for segment in segments[1:]:  # Commands
        command_namespace = argparse.Namespace()
        # A repeated command replaces the namespace stored by its earlier use.
        setattr(args, segment[0], command_namespace)
        parser.parse_args(segment, namespace=command_namespace)
    return args
if __name__ == '__main__':
    # Command line layout: three sub-commands (input / output / encoder), each
    # followed by its own options, all on one command line. parse_args() splits
    # them apart and returns one namespace per sub-command.
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(title='sub-commands')

    input_parser = commands.add_parser('input')
    input_parser.add_argument('--corpus', type=str,
                              help='directory that contains corpus files to be encoded, in jsonl format.',
                              required=True)
    input_parser.add_argument('--fields', help='fields that contents in jsonl has (in order)',
                              nargs='+', default=['text'], required=False)
    input_parser.add_argument('--docid-field',
                              help='name of document id field name. If you have a custom id with a name other than "id", "_id" or "docid", then use this argument',
                              default=None, required=False)
    input_parser.add_argument('--delimiter', help='delimiter for the fields', default='\n', required=False)
    input_parser.add_argument('--shard-id', type=int, help='shard-id 0-based', default=0, required=False)
    input_parser.add_argument('--shard-num', type=int, help='number of shards', default=1, required=False)

    output_parser = commands.add_parser('output')
    output_parser.add_argument('--embeddings', type=str, help='directory to store encoded corpus', required=True)
    output_parser.add_argument('--to-faiss', action='store_true', default=False)

    encoder_parser = commands.add_parser('encoder')
    encoder_parser.add_argument('--encoder', type=str, help='encoder name or path', required=True)
    # Offer exactly the classes init_encoder() can look up. The previous
    # hard-coded list accepted "bpr" (absent from encoder_class_map, so it
    # crashed later) and rejected "aggretriever" and "unicoil" (present in it).
    encoder_parser.add_argument('--encoder-class', type=str, required=False, default=None,
                                choices=list(encoder_class_map),
                                help='which query encoder class to use. `default` would infer from the args.encoder')
    encoder_parser.add_argument('--fields', help='fields to encode', nargs='+', default=['text'], required=False)
    encoder_parser.add_argument('--batch-size', type=int, help='batch size', default=64, required=False)
    encoder_parser.add_argument('--max-length', type=int, help='max length', default=256, required=False)
    encoder_parser.add_argument('--dimension', type=int, help='dimension', default=768, required=False)
    encoder_parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]',
                                default='cuda:0', required=False)
    encoder_parser.add_argument('--fp16', action='store_true', default=False)
    encoder_parser.add_argument('--add-sep', action='store_true', default=False)
    # No default here: None means "not given", which lets the code below keep
    # the pooling that init_encoder() configured instead of overwriting it.
    encoder_parser.add_argument('--pooling', type=str, default=None, help='for auto classes, allow the ability to dictate pooling strategy', required=False)

    args = parse_args(parser, commands)

    # parse_args() leaves None for a sub-command that was not supplied; report
    # that up front instead of failing later with an AttributeError on None.
    for command in ('input', 'output', 'encoder'):
        if getattr(args, command) is None:
            parser.error(f'missing required sub-command: {command}')

    delimiter = args.input.delimiter.replace("\\n", "\n")  # argparse would add \ prior to the passed '\n\n'

    encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device)
    if type(encoder).__name__ == "AutoDocumentEncoder":
        requested_pooling = args.encoder.pooling
        if requested_pooling is not None:
            if requested_pooling not in ALLOWED_POOLING_OPTS:
                raise ValueError(f"Only allowed to use pooling types {ALLOWED_POOLING_OPTS}. You entered {requested_pooling}")
            encoder.pooling = requested_pooling
        elif not hasattr(encoder, 'pooling'):
            # Former CLI default, kept for encoders that expose no pooling yet.
            encoder.pooling = 'cls'
        # Otherwise keep the pooling chosen by init_encoder(): it requests mean
        # pooling for sentence-transformers / contriever models, which the old
        # unconditional default of 'cls' silently replaced.
        # NOTE(review): assumes the encoder stores its constructor argument as
        # ``pooling`` and that its own default is 'cls' -- confirm against
        # AutoDocumentEncoder.

    if args.output.to_faiss:
        embedding_writer = FaissRepresentationWriter(args.output.embeddings, dimension=args.encoder.dimension)
    else:
        embedding_writer = JsonlRepresentationWriter(args.output.embeddings)

    collection_iterator = JsonlCollectionIterator(args.input.corpus, args.input.fields, args.input.docid_field, delimiter)

    with embedding_writer:
        for batch_info in collection_iterator(args.encoder.batch_size, args.input.shard_id, args.input.shard_num):
            # Titles and expansions are only passed when listed in the
            # encoder's --fields; otherwise the encoder receives None for them.
            kwargs = {
                'texts': batch_info['text'],
                'titles': batch_info['title'] if 'title' in args.encoder.fields else None,
                'expands': batch_info['expand'] if 'expand' in args.encoder.fields else None,
                'fp16': args.encoder.fp16,
                'max_length': args.encoder.max_length,
                'add_sep': args.encoder.add_sep,
            }
            embeddings = encoder.encode(**kwargs)
            batch_info['vector'] = embeddings
            embedding_writer.write(batch_info, args.input.fields)

pyserini/encode/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (1.15 kB). View file

pyserini/encode/__pycache__/_aggretriever.cpython-310.pyc ADDED Viewed

Binary file (6.24 kB). View file