Update README.md
Browse files
README.md
CHANGED
@@ -50,20 +50,21 @@ n_gpu: int = 1 # Set your number of available GPUs
|
|
50 |
experiment: str = "" # Name of the folder where the logs and created indices will be stored
|
51 |
index_name: str = "" # The name of your index, i.e. the name of your vector database
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
67 |
```
|
68 |
|
69 |
### Searching
|
@@ -77,17 +78,18 @@ experiment: str = "" # Name of the folder where the logs and created indices wi
|
|
77 |
index_name: str = "" # Name of your previously created index where the documents you want to search are stored.
|
78 |
k: int = 10 # how many results you want to retrieve
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
91 |
```
|
92 |
|
93 |
## Evaluation Results
|
|
|
50 |
experiment: str = "" # Name of the folder where the logs and created indices will be stored
|
51 |
index_name: str = "" # The name of your index, i.e. the name of your vector database
|
52 |
|
53 |
+
if __name__ == "__main__":
|
54 |
+
with Run().context(RunConfig(nranks=n_gpu, experiment=experiment)):
|
55 |
+
config = ColBERTConfig(
|
56 |
+
doc_maxlen=8192 # Our model supports 8k context length for indexing long documents
|
57 |
+
)
|
58 |
+
indexer = Indexer(
|
59 |
+
checkpoint="jinaai/jina-colbert-v1-en",
|
60 |
+
config=config,
|
61 |
+
)
|
62 |
+
documents = [
|
63 |
+
"ColBERT is an efficient and effective passage retrieval model.",
|
64 |
+
"Jina-ColBERT is a ColBERT-style model but based on JinaBERT so it can support 8k context length.",
|
65 |
+
# Add more documents here to ensure the clustering works correctly
|
66 |
+
]
|
67 |
+
indexer.index(name=index_name, collection=documents)
|
68 |
```
|
69 |
|
70 |
### Searching
|
|
|
78 |
index_name: str = "" # Name of your previously created index where the documents you want to search are stored.
|
79 |
k: int = 10 # how many results you want to retrieve
|
80 |
|
81 |
+
if __name__ == "__main__":
|
82 |
+
with Run().context(RunConfig(nranks=n_gpu, experiment=experiment)):
|
83 |
+
config = ColBERTConfig(
|
84 |
+
query_maxlen=128 # Although the model supports 8k context length, we recommend against very long queries, as they can significantly increase computational cost and CUDA memory usage.
|
85 |
+
)
|
86 |
+
searcher = Searcher(
|
87 |
+
index=index_name,
|
88 |
+
config=config
|
89 |
+
) # You don't need to specify the checkpoint again, the model name is stored in the index.
|
90 |
+
query = "How to use ColBERT for indexing long documents?"
|
91 |
+
results = searcher.search(query, k=k)
|
92 |
+
# results: tuple of tuples of length k containing ((passage_id, passage_rank, passage_score), ...)
|
93 |
```
|
94 |
|
95 |
## Evaluation Results
|