Sean MacAvaney committed on
Commit
68b08cf
·
1 Parent(s): 3ed7c41
Files changed (4) hide show
  1. README.md +1 -68
  2. app.py +12 -52
  3. requirements.txt +1 -0
  4. wrapup.md +42 -0
README.md CHANGED
@@ -10,72 +10,7 @@ pinned: false
10
  models:
11
  - macavaney/doc2query-t5-base-msmarco
12
  ---
13
-
14
- <style>
15
- .transformer {
16
- display: inline-block;
17
- background: #8facdb;
18
- position: relative;
19
- height: 60px;
20
- line-height: 60px;
21
- padding: 0 24px;
22
- margin: 0 18px;
23
- color: #333;
24
- cursor: help;
25
- }
26
- .transformer::before {
27
- content: "";
28
- position: absolute;
29
- bottom: 0;
30
- top: 0;
31
- left: -15px;
32
- border-top: 30px solid #8facdb;
33
- border-bottom: 30px solid #8facdb;
34
- border-left: 15px solid transparent;
35
- }
36
- .transformer::after {
37
- content: "";
38
- position: absolute;
39
- bottom: 0;
40
- top: 0;
41
- right: -15px;
42
- border-top: 30px solid transparent;
43
- border-bottom: 30px solid transparent;
44
- border-left: 15px solid #8facdb;
45
- }
46
- .transformer.boring {
47
- background: #ddd;
48
- }
49
- .transformer.boring::before {
50
- border-top-color: #ddd;
51
- border-bottom-color: #ddd;
52
- }
53
- .transformer.boring::after {
54
- border-left-color: #ddd;
55
- }
56
- .df {
57
- width: 24px;
58
- line-height: 24px;
59
- text-align: center;
60
- border: 3px double #888;
61
- background-color: #eee;
62
- color: #333;
63
- border-radius: 4px;
64
- display: inline-block;
65
- box-sizing: content-box;
66
- cursor: help;
67
- margin: 0 -25px;
68
- opacity: 0.5;
69
- z-index: 1;
70
- position: relative;
71
- }
72
- .df:hover {
73
- opacity: 1;
74
- }
75
- .pipeline {
76
- text-align: center;
77
- }
78
- </style>
79
 
80
  This is a demonstration of [PyTerrier's Doc2Query package](https://github.com/terrierteam/pyterrier_doc2query). Doc2Query generates
81
  queries for a document, which can then be appended to a document's text before indexing to boost important terms and add missing terms.
@@ -87,5 +22,3 @@ Doc2Query functions as a `D→D` (document-to-document) transformer and can be u
87
  <div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
88
  <div class="df" title="Document Frame">D</div>
89
  </div>
90
-
91
- Try it below!
 
10
  models:
11
  - macavaney/doc2query-t5-base-msmarco
12
  ---
13
+ # 🐕 PyTerrier: Doc2Query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  This is a demonstration of [PyTerrier's Doc2Query package](https://github.com/terrierteam/pyterrier_doc2query). Doc2Query generates
16
  queries for a document, which can then be appended to a document's text before indexing to boost important terms and add missing terms.
 
22
  <div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
23
  <div class="df" title="Document Frame">D</div>
24
  </div>
 
 
app.py CHANGED
@@ -1,7 +1,7 @@
1
- import base64
2
  import pandas as pd
3
  import gradio as gr
4
  from pyterrier_doc2query import Doc2Query
 
5
 
6
  MODEL = 'macavaney/doc2query-t5-base-msmarco'
7
 
@@ -13,41 +13,18 @@ COLAB_INSTALL = '''
13
  !pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
14
  '''.strip()
15
 
16
- def df2code(df):
17
- rows = []
18
- for row in df.itertuples(index=False):
19
- rows.append(f' {dict(row._asdict())},')
20
- rows = '\n'.join(rows)
21
- return f'''pd.DataFrame([
22
- {rows}
23
- ])'''
24
-
25
- def code2colab(code):
26
- enc_code = base64.b64encode((COLAB_INSTALL + '\n\n' + code).encode()).decode()
27
- url = f'https://colaburl.macavaney.us/?py64={enc_code}&name={COLAB_NAME}'
28
- return f'<a href="{url}" rel="nofollow" target="_blank" style="float: right;"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="margin: 0;" /></a>'
29
-
30
- def code2md(code):
31
- return f'''
32
- {code2colab(code)}
33
-
34
- **Code:**
35
-
36
- ```python
37
- {code}
38
- ```
39
- '''
40
-
41
  def predict(input, model, append, num_samples):
42
  assert model == MODEL
43
  doc2query.append = append
44
  doc2query.num_samples = num_samples
45
  code = f'''import pandas as pd
46
  from pyterrier_doc2query import Doc2Query
 
47
  doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
 
48
  doc2query({df2code(input)})
49
  '''
50
- return (doc2query(input), code2md(code))
51
 
52
  example_inp = pd.DataFrame([
53
  {'docno': '0', 'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'},
@@ -55,19 +32,13 @@ example_inp = pd.DataFrame([
55
  {'docno': '985', 'text': 'Continue on Hollins Ferry Road to Patapsco Avenue. Make a right onto Patapsco Avenue for approximately 2.5 miles. The courthouse is at the corner of Patapsco Avenue and 7th Street. The commissioner\'s office is on the first (ground) floor.'}
56
  ])
57
 
58
- example_out = predict(example_inp, MODEL, doc2query.append, doc2query.num_samples)
59
-
60
- gr.Interface(
61
  predict,
62
- inputs=[gr.Dataframe(
63
- headers=["docno", "text"],
64
- datatype=["str", "str"],
65
- col_count=(2, "fixed"),
66
- row_count=1,
67
- wrap=True,
68
- label='Pipeline Input',
69
- value=example_inp,
70
- ), gr.Dropdown(
71
  choices=[MODEL],
72
  value=MODEL,
73
  label='Model',
@@ -82,17 +53,6 @@ gr.Interface(
82
  step=1.,
83
  label='# Queries'
84
  )],
85
- outputs=[gr.Dataframe(
86
- headers=["docno", "text", "querygen"],
87
- datatype=["str", "str", "str"],
88
- col_count=3,
89
- row_count=1,
90
- wrap=True,
91
- label='Pipeline Output',
92
- value=example_out[0],
93
- ), gr.Markdown(value=example_out[1])],
94
- title="🐕 PyTerrier: Doc2Query",
95
- description=open('README.md', 'rt').read().split('\n---\n')[-1],
96
- allow_flagging='never',
97
- css="table.font-mono td, table.font-mono th { white-space: pre-line; font-size: 11px; line-height: 16px; } table.font-mono td input { width: 95%; } th .cursor-pointer {display: none;} th .min-h-\[2\.3rem\] {min-height: inherit;}",
98
  ).launch(share=False)
 
 
1
  import pandas as pd
2
  import gradio as gr
3
  from pyterrier_doc2query import Doc2Query
4
+ from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md
5
 
6
  MODEL = 'macavaney/doc2query-t5-base-msmarco'
7
 
 
13
  !pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
14
  '''.strip()
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def predict(input, model, append, num_samples):
17
  assert model == MODEL
18
  doc2query.append = append
19
  doc2query.num_samples = num_samples
20
  code = f'''import pandas as pd
21
  from pyterrier_doc2query import Doc2Query
22
+
23
  doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
24
+
25
  doc2query({df2code(input)})
26
  '''
27
+ return (doc2query(input), code2md(code, COLAB_INSTALL, COLAB_NAME))
28
 
29
  example_inp = pd.DataFrame([
30
  {'docno': '0', 'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'},
 
32
  {'docno': '985', 'text': 'Continue on Hollins Ferry Road to Patapsco Avenue. Make a right onto Patapsco Avenue for approximately 2.5 miles. The courthouse is at the corner of Patapsco Avenue and 7th Street. The commissioner\'s office is on the first (ground) floor.'}
33
  ])
34
 
35
+ interface(
36
+ MarkdownFile('README.md'),
37
+ Demo(
38
  predict,
39
+ example_inp,
40
+ [
41
+ gr.Dropdown(
 
 
 
 
 
 
42
  choices=[MODEL],
43
  value=MODEL,
44
  label='Model',
 
53
  step=1.,
54
  label='# Queries'
55
  )],
56
+ ),
57
+ MarkdownFile('wrapup.md'),
 
 
 
 
 
 
 
 
 
 
 
58
  ).launch(share=False)
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  git+https://github.com/terrier-org/pyterrier
2
  git+https://github.com/terrierteam/pyterrier_doc2query@master
3
  ir_datasets
 
1
+ git+https://github.com/seanmacavaney/[email protected]
2
  git+https://github.com/terrier-org/pyterrier
3
  git+https://github.com/terrierteam/pyterrier_doc2query@master
4
  ir_datasets
wrapup.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Putting it all together
2
+
3
+ You can use Doc2Query in an indexing pipeline to build an index of the expanded documents:
4
+
5
+ <div class="pipeline">
6
+ <div class="df" title="Document Frame">D</div>
7
+ <div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
8
+ <div class="df" title="Document Frame">D</div>
9
+ <div class="transformer boring" title="Indexer">Indexer</div>
10
+ <div class="artefact" title="Doc2Query Index">IDX</div>
11
+ </div>
12
+
13
+ ```python
14
+ import pyterrier as pt
15
+ pt.init()
16
+ import pyterrier_doc2query
17
+ doc2query = pyterrier_doc2query.Doc2Query(append=True)
18
+
19
+ dataset = pt.get_dataset('irds:msmarco-passage')
20
+
21
+ indexer = pt.IterDictIndexer('./msmarco_psg')
22
+
23
+ indexer_pipe = doc2query >> indexer
24
+ indexer_pipe.index(dataset.get_corpus_iter())
25
+ ```
26
+
27
+ Once you have built an index, you can retrieve from it using any retrieval function (often BM25):
28
+
29
+ <div class="pipeline">
30
+ <div class="df" title="Query Frame">Q</div>
31
+ <div class="transformer boring" title="BM25 Transformer">BM25 Retriever <div class="artefact" title="Doc2Query Index">IDX</div></div>
32
+ <div class="df" title="Result Frame">R</div>
33
+ </div>
34
+
35
+ ```python
36
+ bm25 = pt.BatchRetrieve('./msmarco_psg', wmodel="BM25")
37
+ ```
38
+
39
+ ### References & Credits
40
+
41
+ - Rodrigo Nogueira and Jimmy Lin. [From doc2query to docTTTTTquery](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf).
42
+ - Craig Macdonald, Nicola Tonellotto, Sean MacAvaney, Iadh Ounis. [PyTerrier: Declarative Experimentation in Python from BM25 to Dense Retrieval](https://dl.acm.org/doi/abs/10.1145/3459637.3482013). CIKM 2021.