Spaces:
Runtime error
Runtime error
Sean MacAvaney
committed on
Commit
·
68b08cf
1
Parent(s):
3ed7c41
update
Browse files
README.md
CHANGED
@@ -10,72 +10,7 @@ pinned: false
|
|
10 |
models:
|
11 |
- macavaney/doc2query-t5-base-msmarco
|
12 |
---
|
13 |
-
|
14 |
-
<style>
|
15 |
-
.transformer {
|
16 |
-
display: inline-block;
|
17 |
-
background: #8facdb;
|
18 |
-
position: relative;
|
19 |
-
height: 60px;
|
20 |
-
line-height: 60px;
|
21 |
-
padding: 0 24px;
|
22 |
-
margin: 0 18px;
|
23 |
-
color: #333;
|
24 |
-
cursor: help;
|
25 |
-
}
|
26 |
-
.transformer::before {
|
27 |
-
content: "";
|
28 |
-
position: absolute;
|
29 |
-
bottom: 0;
|
30 |
-
top: 0;
|
31 |
-
left: -15px;
|
32 |
-
border-top: 30px solid #8facdb;
|
33 |
-
border-bottom: 30px solid #8facdb;
|
34 |
-
border-left: 15px solid transparent;
|
35 |
-
}
|
36 |
-
.transformer::after {
|
37 |
-
content: "";
|
38 |
-
position: absolute;
|
39 |
-
bottom: 0;
|
40 |
-
top: 0;
|
41 |
-
right: -15px;
|
42 |
-
border-top: 30px solid transparent;
|
43 |
-
border-bottom: 30px solid transparent;
|
44 |
-
border-left: 15px solid #8facdb;
|
45 |
-
}
|
46 |
-
.transformer.boring {
|
47 |
-
background: #ddd;
|
48 |
-
}
|
49 |
-
.transformer.boring::before {
|
50 |
-
border-top-color: #ddd;
|
51 |
-
border-bottom-color: #ddd;
|
52 |
-
}
|
53 |
-
.transformer.boring::after {
|
54 |
-
border-left-color: #ddd;
|
55 |
-
}
|
56 |
-
.df {
|
57 |
-
width: 24px;
|
58 |
-
line-height: 24px;
|
59 |
-
text-align: center;
|
60 |
-
border: 3px double #888;
|
61 |
-
background-color: #eee;
|
62 |
-
color: #333;
|
63 |
-
border-radius: 4px;
|
64 |
-
display: inline-block;
|
65 |
-
box-sizing: content-box;
|
66 |
-
cursor: help;
|
67 |
-
margin: 0 -25px;
|
68 |
-
opacity: 0.5;
|
69 |
-
z-index: 1;
|
70 |
-
position: relative;
|
71 |
-
}
|
72 |
-
.df:hover {
|
73 |
-
opacity: 1;
|
74 |
-
}
|
75 |
-
.pipeline {
|
76 |
-
text-align: center;
|
77 |
-
}
|
78 |
-
</style>
|
79 |
|
80 |
This is a demonstration of [PyTerrier's Doc2Query package](https://github.com/terrierteam/pyterrier_doc2query). Doc2Query generates
|
81 |
queries for a document, which can then be appended to a document's text before indexing to boost important terms and add missing terms.
|
@@ -87,5 +22,3 @@ Doc2Query functions as a `D→D` (document-to-document) transformer and can be u
|
|
87 |
<div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
|
88 |
<div class="df" title="Document Frame">D</div>
|
89 |
</div>
|
90 |
-
|
91 |
-
Try it below!
|
|
|
10 |
models:
|
11 |
- macavaney/doc2query-t5-base-msmarco
|
12 |
---
|
13 |
+
# 🐕 PyTerrier: Doc2Query
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
This is a demonstration of [PyTerrier's Doc2Query package](https://github.com/terrierteam/pyterrier_doc2query). Doc2Query generates
|
16 |
queries for a document, which can then be appended to a document's text before indexing to boost important terms and add missing terms.
|
|
|
22 |
<div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
|
23 |
<div class="df" title="Document Frame">D</div>
|
24 |
</div>
|
|
|
|
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
-
import base64
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from pyterrier_doc2query import Doc2Query
|
|
|
5 |
|
6 |
MODEL = 'macavaney/doc2query-t5-base-msmarco'
|
7 |
|
@@ -13,41 +13,18 @@ COLAB_INSTALL = '''
|
|
13 |
!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
|
14 |
'''.strip()
|
15 |
|
16 |
-
def df2code(df):
|
17 |
-
rows = []
|
18 |
-
for row in df.itertuples(index=False):
|
19 |
-
rows.append(f' {dict(row._asdict())},')
|
20 |
-
rows = '\n'.join(rows)
|
21 |
-
return f'''pd.DataFrame([
|
22 |
-
{rows}
|
23 |
-
])'''
|
24 |
-
|
25 |
-
def code2colab(code):
|
26 |
-
enc_code = base64.b64encode((COLAB_INSTALL + '\n\n' + code).encode()).decode()
|
27 |
-
url = f'https://colaburl.macavaney.us/?py64={enc_code}&name={COLAB_NAME}'
|
28 |
-
return f'<a href="{url}" rel="nofollow" target="_blank" style="float: right;"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" style="margin: 0;" /></a>'
|
29 |
-
|
30 |
-
def code2md(code):
|
31 |
-
return f'''
|
32 |
-
{code2colab(code)}
|
33 |
-
|
34 |
-
**Code:**
|
35 |
-
|
36 |
-
```python
|
37 |
-
{code}
|
38 |
-
```
|
39 |
-
'''
|
40 |
-
|
41 |
def predict(input, model, append, num_samples):
|
42 |
assert model == MODEL
|
43 |
doc2query.append = append
|
44 |
doc2query.num_samples = num_samples
|
45 |
code = f'''import pandas as pd
|
46 |
from pyterrier_doc2query import Doc2Query
|
|
|
47 |
doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
|
|
|
48 |
doc2query({df2code(input)})
|
49 |
'''
|
50 |
-
return (doc2query(input), code2md(code))
|
51 |
|
52 |
example_inp = pd.DataFrame([
|
53 |
{'docno': '0', 'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'},
|
@@ -55,19 +32,13 @@ example_inp = pd.DataFrame([
|
|
55 |
{'docno': '985', 'text': 'Continue on Hollins Ferry Road to Patapsco Avenue. Make a right onto Patapsco Avenue for approximately 2.5 miles. The courthouse is at the corner of Patapsco Avenue and 7th Street. The commissioner\'s office is on the first (ground) floor.'}
|
56 |
])
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
predict,
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
col_count=(2, "fixed"),
|
66 |
-
row_count=1,
|
67 |
-
wrap=True,
|
68 |
-
label='Pipeline Input',
|
69 |
-
value=example_inp,
|
70 |
-
), gr.Dropdown(
|
71 |
choices=[MODEL],
|
72 |
value=MODEL,
|
73 |
label='Model',
|
@@ -82,17 +53,6 @@ gr.Interface(
|
|
82 |
step=1.,
|
83 |
label='# Queries'
|
84 |
)],
|
85 |
-
|
86 |
-
|
87 |
-
datatype=["str", "str", "str"],
|
88 |
-
col_count=3,
|
89 |
-
row_count=1,
|
90 |
-
wrap=True,
|
91 |
-
label='Pipeline Output',
|
92 |
-
value=example_out[0],
|
93 |
-
), gr.Markdown(value=example_out[1])],
|
94 |
-
title="🐕 PyTerrier: Doc2Query",
|
95 |
-
description=open('README.md', 'rt').read().split('\n---\n')[-1],
|
96 |
-
allow_flagging='never',
|
97 |
-
css="table.font-mono td, table.font-mono th { white-space: pre-line; font-size: 11px; line-height: 16px; } table.font-mono td input { width: 95%; } th .cursor-pointer {display: none;} th .min-h-\[2\.3rem\] {min-height: inherit;}",
|
98 |
).launch(share=False)
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import gradio as gr
|
3 |
from pyterrier_doc2query import Doc2Query
|
4 |
+
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md
|
5 |
|
6 |
MODEL = 'macavaney/doc2query-t5-base-msmarco'
|
7 |
|
|
|
13 |
!pip install -q git+https://github.com/terrierteam/pyterrier_doc2query
|
14 |
'''.strip()
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
def predict(input, model, append, num_samples):
|
17 |
assert model == MODEL
|
18 |
doc2query.append = append
|
19 |
doc2query.num_samples = num_samples
|
20 |
code = f'''import pandas as pd
|
21 |
from pyterrier_doc2query import Doc2Query
|
22 |
+
|
23 |
doc2query = Doc2Query({repr(model)}, append={append}, num_samples={num_samples})
|
24 |
+
|
25 |
doc2query({df2code(input)})
|
26 |
'''
|
27 |
+
return (doc2query(input), code2md(code, COLAB_INSTALL, COLAB_NAME))
|
28 |
|
29 |
example_inp = pd.DataFrame([
|
30 |
{'docno': '0', 'text': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'},
|
|
|
32 |
{'docno': '985', 'text': 'Continue on Hollins Ferry Road to Patapsco Avenue. Make a right onto Patapsco Avenue for approximately 2.5 miles. The courthouse is at the corner of Patapsco Avenue and 7th Street. The commissioner\'s office is on the first (ground) floor.'}
|
33 |
])
|
34 |
|
35 |
+
interface(
|
36 |
+
MarkdownFile('README.md'),
|
37 |
+
Demo(
|
38 |
predict,
|
39 |
+
example_inp,
|
40 |
+
[
|
41 |
+
gr.Dropdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
choices=[MODEL],
|
43 |
value=MODEL,
|
44 |
label='Model',
|
|
|
53 |
step=1.,
|
54 |
label='# Queries'
|
55 |
)],
|
56 |
+
),
|
57 |
+
MarkdownFile('wrapup.md'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
).launch(share=False)
|
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
git+https://github.com/terrier-org/pyterrier
|
2 |
git+https://github.com/terrierteam/pyterrier_doc2query@master
|
3 |
ir_datasets
|
|
|
1 |
+
git+https://github.com/seanmacavaney/[email protected]
|
2 |
git+https://github.com/terrier-org/pyterrier
|
3 |
git+https://github.com/terrierteam/pyterrier_doc2query@master
|
4 |
ir_datasets
|
wrapup.md
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Putting it all together
|
2 |
+
|
3 |
+
You can use Doc2Query in an indexing pipeline to build an index of the expanded documents:
|
4 |
+
|
5 |
+
<div class="pipeline">
|
6 |
+
<div class="df" title="Document Frame">D</div>
|
7 |
+
<div class="transformer" title="Doc2Query Transformer">Doc2Query</div>
|
8 |
+
<div class="df" title="Document Frame">D</div>
|
9 |
+
<div class="transformer boring" title="Indexer">Indexer</div>
|
10 |
+
<div class="artefact" title="Doc2Query Index">IDX</div>
|
11 |
+
</div>
|
12 |
+
|
13 |
+
```python
|
14 |
+
import pyterrier as pt
|
15 |
+
pt.init()
|
16 |
+
import pyterrier_doc2query
|
17 |
+
doc2query = pyterrier_doc2query.Doc2Query(append=True)
|
18 |
+
|
19 |
+
dataset = pt.get_dataset('irds:msmarco-passage')
|
20 |
+
|
21 |
+
indexer = pt.IterDictIndexer('./msmarco_psg')
|
22 |
+
|
23 |
+
indexer_pipe = doc2query >> indexer
|
24 |
+
indexer_pipe.index(dataset.get_corpus_iter())
|
25 |
+
```
|
26 |
+
|
27 |
+
Once you have built an index, you can retrieve from it using any retrieval function (often BM25):
|
28 |
+
|
29 |
+
<div class="pipeline">
|
30 |
+
<div class="df" title="Query Frame">Q</div>
|
31 |
+
<div class="transformer boring" title="BM25 Transformer">BM25 Retriever <div class="artefact" title="Doc2Query Index">IDX</div></div>
|
32 |
+
<div class="df" title="Result Frame">R</div>
|
33 |
+
</div>
|
34 |
+
|
35 |
+
```python
|
36 |
+
bm25 = pt.BatchRetrieve('./msmarco_psg', wmodel="BM25")
|
37 |
+
```
|
38 |
+
|
39 |
+
### References & Credits
|
40 |
+
|
41 |
+
- Rodrigo Nogueira and Jimmy Lin. [From doc2query to docTTTTTquery](https://cs.uwaterloo.ca/~jimmylin/publications/Nogueira_Lin_2019_docTTTTTquery-v2.pdf).
|
42 |
+
- Craig Macdonald, Nicola Tonellotto, Sean MacAvaney, Iadh Ounis. [PyTerrier: Declarative Experimentation in Python from BM25 to Dense Retrieval](https://dl.acm.org/doi/abs/10.1145/3459637.3482013). CIKM 2021.
|