File size: 4,099 Bytes
62977bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#
# Pyserini: Python interface to the Anserini IR toolkit built on Lucene
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import json
import time
from tqdm import tqdm

from ._searcher import NmslibSearcher
from pyserini.output_writer import get_output_writer, OutputFormat, tie_breaker

if __name__ == '__main__':
    # Command-line entry point: load query vectors from a JSON-lines topics
    # file, search them against an nmslib index, and write the ranked hits
    # to the requested output file.
    parser = argparse.ArgumentParser(description='Search a nmslib index.')
    parser.add_argument('--index', type=str, metavar='path to index or index name', required=True,
                        help="Path to nmslib index.")
    parser.add_argument('--topics', type=str, required=True, help="path to topics")
    parser.add_argument('--hits', type=int, metavar='num', required=False, default=1000, help="Number of hits.")
    parser.add_argument('--output-format', type=str, metavar='format', default=OutputFormat.TREC.value,
                        help=f"Format of output. Available: {[x.value for x in list(OutputFormat)]}")
    parser.add_argument('--output', type=str, metavar='path', required=True, help="Path to output file.")
    parser.add_argument('--ef', type=int, required=False, default=256, help="hnsw ef_search")
    parser.add_argument('--threads', type=int, metavar='num', required=False, default=1,
                        help="maximum threads to use during search")
    parser.add_argument('--batch-size', type=int, metavar='num', required=False, default=1,
                        help="search batch of queries in parallel")
    parser.add_argument('--is-sparse', action='store_true', required=False)
    args = parser.parse_args()

    searcher = NmslibSearcher(args.index, ef_search=args.ef, is_sparse=args.is_sparse)

    # Each non-empty line of the topics file is a JSON object that provides
    # an 'id' and a 'vector' entry.
    topic_ids = []
    topic_vectors = []
    with open(args.topics, encoding='utf-8') as topic_f:
        for line in topic_f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            info = json.loads(line)
            topic_ids.append(info['id'])
            topic_vectors.append(info['vector'])

    if not searcher:
        # Same effect as the interactive-only exit() helper (status 0), but
        # does not depend on the site module being loaded.
        raise SystemExit

    # build output path
    output_path = args.output

    print(f'Running {args.topics} topics, saving to {output_path}...')
    tag = 'HNSW'

    # support trec and msmarco format only for now
    output_writer = get_output_writer(output_path, OutputFormat(args.output_format), max_hits=args.hits, tag=tag)

    # Loop invariants. Queries are searched one at a time only when neither
    # batching nor multi-threading was requested.
    use_batch = args.batch_size > 1 or args.threads > 1
    # Clamp so that '--batch-size 0 --threads N' flushes after every query
    # rather than raising ZeroDivisionError in the modulo below.
    batch_size = max(args.batch_size, 1)
    num_topics = len(topic_ids)
    last_index = num_topics - 1

    search_time = 0
    with output_writer:
        batch_topic_vectors = []
        batch_topic_ids = []
        # zip() has no length, so pass the total explicitly to let tqdm show
        # a percentage and an ETA.
        progress = tqdm(zip(topic_ids, topic_vectors), total=num_topics)
        for index, (topic_id, vec) in enumerate(progress):
            if not use_batch:
                # perf_counter() is monotonic, so the measured interval is
                # unaffected by system clock adjustments.
                start = time.perf_counter()
                hits = searcher.search(vec, args.hits)
                search_time += time.perf_counter() - start
                results = [(topic_id, hits)]
            else:
                batch_topic_ids.append(str(topic_id))
                batch_topic_vectors.append(vec)
                # Keep accumulating until the batch is full or the final
                # topic has been reached.
                if (index + 1) % batch_size != 0 and index != last_index:
                    continue
                start = time.perf_counter()
                batch_results = searcher.batch_search(
                    batch_topic_vectors, batch_topic_ids, args.hits, args.threads)
                search_time += time.perf_counter() - start
                # batch_search results are looked up by the string topic id;
                # emit them in the order the topics were queued.
                results = [(id_, batch_results[id_]) for id_ in batch_topic_ids]
                batch_topic_ids.clear()
                batch_topic_vectors.clear()

            for topic, hits in results:
                output_writer.write(topic, tie_breaker(hits))

            results.clear()

    print(f'Search {len(topic_ids)} topics in {search_time} seconds')