awacke1 committed on
Commit
13a1cf1
·
verified ·
1 Parent(s): def9b1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -4
app.py CHANGED
@@ -16,13 +16,13 @@ def generate_query(document):
16
  input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
17
  output = llm.generate(
18
  input_ids,
19
- max_length=50,
20
  num_return_sequences=5,
21
- num_beams=5, # Use beam search
22
  no_repeat_ngram_size=2,
23
  early_stopping=True
24
  )
25
- queries = [llm_tokenizer.decode(seq, skip_special_tokens=True) for seq in output]
26
  return queries
27
 
28
  def rerank_pairs(queries, document):
@@ -46,12 +46,38 @@ def inpars_v2(document):
46
  result = train_retriever([(best_query, document)])
47
  return f"Generated query: {best_query}\n\n{result}"
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  iface = gr.Interface(
50
  fn=inpars_v2,
51
  inputs=gr.Textbox(lines=5, label="Input Document"),
52
  outputs=gr.Textbox(label="Result"),
53
  title="InPars-v2 Demo",
54
- description="Generate queries and train a retriever using LLMs and rerankers."
 
55
  )
56
 
57
  iface.launch()
 
16
  input_ids = llm_tokenizer.encode(prompt, return_tensors="pt")
17
  output = llm.generate(
18
  input_ids,
19
+ max_new_tokens=30,
20
  num_return_sequences=5,
21
+ num_beams=5,
22
  no_repeat_ngram_size=2,
23
  early_stopping=True
24
  )
25
+ queries = [llm_tokenizer.decode(seq[input_ids.shape[1]:], skip_special_tokens=True) for seq in output]
26
  return queries
27
 
28
  def rerank_pairs(queries, document):
 
46
  result = train_retriever([(best_query, document)])
47
  return f"Generated query: {best_query}\n\n{result}"
48
 
49
+ # Markdown description of the InPars-v2 paper
50
+ paper_description = """
51
+ # InPars-v2: Large Language Models as Efficient Dataset Generators for Information Retrieval
52
+
53
+ **Abstract Link:** [https://arxiv.org/abs/2301.01820](https://arxiv.org/abs/2301.01820)
54
+ **PDF Link:** [https://arxiv.org/pdf/2301.01820](https://arxiv.org/pdf/2301.01820)
55
+
56
+ **Authors:** Vitor Jeronymo, Luiz Bonifacio, Hugo Abonizio, Marzieh Fadaee, Roberto Lotufo, Jakub Zavrel, Rodrigo Nogueira
57
+
58
+ **Publication Date:** 26 May 2023
59
+
60
+ ## Abstract
61
+
62
+ Recently, InPars introduced a method to efficiently use large language models (LLMs) in information retrieval tasks: via few-shot examples, an LLM is induced to generate relevant queries for documents. These synthetic query-document pairs can then be used to train a retriever. However, InPars and, more recently, Promptagator, rely on proprietary LLMs such as GPT-3 and FLAN to generate such datasets. In this work we introduce InPars-v2, a dataset generator that uses open-source LLMs and existing powerful rerankers to select synthetic query-document pairs for training. A simple BM25 retrieval pipeline followed by a monoT5 reranker finetuned on InPars-v2 data achieves new state-of-the-art results on the BEIR benchmark. To allow researchers to further improve our method, we open source the code, synthetic data, and finetuned models: [https://github.com/zetaalphavector/inPars/tree/master/tpu](https://github.com/zetaalphavector/inPars/tree/master/tpu)
63
+
64
+ ## Key Features of InPars-v2
65
+
66
+ 1. Uses open-source LLMs for query generation
67
+ 2. Employs powerful rerankers to select high-quality synthetic query-document pairs
68
+ 3. Achieves state-of-the-art results on the BEIR benchmark
69
+ 4. Provides open-source code, synthetic data, and finetuned models
70
+
71
+ This demo provides a simplified implementation of the InPars-v2 concept, showcasing query generation, reranking, and retriever training.
72
+ """
73
+
74
  iface = gr.Interface(
75
  fn=inpars_v2,
76
  inputs=gr.Textbox(lines=5, label="Input Document"),
77
  outputs=gr.Textbox(label="Result"),
78
  title="InPars-v2 Demo",
79
+ description=paper_description,
80
+ article="This is a minimal implementation of the InPars-v2 concept. For the full implementation and more details, please refer to the original paper and GitHub repository."
81
  )
82
 
83
  iface.launch()