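Update app.py: cap query generation at 30 new tokens, decode only the generated tokens (prompt stripped), and add a Markdown paper description to the Gradio interface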
Browse files
app.py
CHANGED
@@ -16,13 +16,13 @@ def generate_query(document):
|
|
16 |
input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
|
17 |
output = llm.generate(
|
18 |
input_ids,
|
19 |
-
|
20 |
num_return_sequences=5,
|
21 |
-
num_beams=5,
|
22 |
no_repeat_ngram_size=2,
|
23 |
early_stopping=True
|
24 |
)
|
25 |
-
queries = [llm_tokenizer.decode(seq, skip_special_tokens=True) for seq in output]
|
26 |
return queries
|
27 |
|
28 |
def rerank_pairs(queries, document):
|
@@ -46,12 +46,38 @@ def inpars_v2(document):
|
|
46 |
result = train_retriever([(best_query, document)])
|
47 |
return f"Generated query: {best_query}\n\n{result}"
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
iface = gr.Interface(
|
50 |
fn=inpars_v2,
|
51 |
inputs=gr.Textbox(lines=5, label="Input Document"),
|
52 |
outputs=gr.Textbox(label="Result"),
|
53 |
title="InPars-v2 Demo",
|
54 |
-
description=
|
|
|
55 |
)
|
56 |
|
57 |
iface.launch()
|
|
|
16 |
input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
|
17 |
output = llm.generate(
|
18 |
input_ids,
|
19 |
+
max_new_tokens=30,
|
20 |
num_return_sequences=5,
|
21 |
+
num_beams=5,
|
22 |
no_repeat_ngram_size=2,
|
23 |
early_stopping=True
|
24 |
)
|
25 |
+
queries = [llm_tokenizer.decode(seq[input_ids.shape[1]:], skip_special_tokens=True) for seq in output]
|
26 |
return queries
|
27 |
|
28 |
def rerank_pairs(queries, document):
|
|
|
46 |
result = train_retriever([(best_query, document)])
|
47 |
return f"Generated query: {best_query}\n\n{result}"
|
48 |
|
49 |
+
# Markdown summary of the InPars-v2 paper; passed to gr.Interface as `description` below
|
50 |
+
paper_description = """
|
51 |
+
# InPars-v2: Large Language Models as Efficient Dataset Generators for Information Retrieval
|
52 |
+
|
53 |
+
**Abstract Link:** [https://arxiv.org/abs/2301.01820](https://arxiv.org/abs/2301.01820)
|
54 |
+
**PDF Link:** [https://arxiv.org/pdf/2301.01820](https://arxiv.org/pdf/2301.01820)
|
55 |
+
|
56 |
+
**Authors:** Vitor Jeronymo, Luiz Bonifacio, Hugo Abonizio, Marzieh Fadaee, Roberto Lotufo, Jakub Zavrel, Rodrigo Nogueira
|
57 |
+
|
58 |
+
**Publication Date:** 26 May 2023
|
59 |
+
|
60 |
+
## Abstract
|
61 |
+
|
62 |
+
Recently, InPars introduced a method to efficiently use large language models (LLMs) in information retrieval tasks: via few-shot examples, an LLM is induced to generate relevant queries for documents. These synthetic query-document pairs can then be used to train a retriever. However, InPars and, more recently, Promptagator, rely on proprietary LLMs such as GPT-3 and FLAN to generate such datasets. In this work we introduce InPars-v2, a dataset generator that uses open-source LLMs and existing powerful rerankers to select synthetic query-document pairs for training. A simple BM25 retrieval pipeline followed by a monoT5 reranker finetuned on InPars-v2 data achieves new state-of-the-art results on the BEIR benchmark. To allow researchers to further improve our method, we open source the code, synthetic data, and finetuned models: [https://github.com/zetaalphavector/inPars/tree/master/tpu](https://github.com/zetaalphavector/inPars/tree/master/tpu)
|
63 |
+
|
64 |
+
## Key Features of InPars-v2
|
65 |
+
|
66 |
+
1. Uses open-source LLMs for query generation
|
67 |
+
2. Employs powerful rerankers to select high-quality synthetic query-document pairs
|
68 |
+
3. Achieves state-of-the-art results on the BEIR benchmark
|
69 |
+
4. Provides open-source code, synthetic data, and finetuned models
|
70 |
+
|
71 |
+
This demo provides a simplified implementation of the InPars-v2 concept, showcasing query generation, reranking, and retriever training.
|
72 |
+
"""
|
73 |
+
|
74 |
iface = gr.Interface(
|
75 |
fn=inpars_v2,
|
76 |
inputs=gr.Textbox(lines=5, label="Input Document"),
|
77 |
outputs=gr.Textbox(label="Result"),
|
78 |
title="InPars-v2 Demo",
|
79 |
+
description=paper_description,
|
80 |
+
article="This is a minimal implementation of the InPars-v2 concept. For the full implementation and more details, please refer to the original paper and GitHub repository."
|
81 |
)
|
82 |
|
83 |
iface.launch()
|