jiaxin.shan
commited on
Commit
·
76559e0
1
Parent(s):
bea623c
Sync cross-encoder/mmarco-mMiniLMv2-L12-H384-v1 here
Browse files- .gitattributes +3 -0
- README.md +57 -0
- config.json +34 -0
- pytorch_model.bin +3 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +1 -0
- tokenizer.json +3 -0
- tokenizer_config.json +1 -0
- train_script.py +362 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
38 |
+
sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,3 +1,60 @@
|
|
1 |
---
|
2 |
license: apache-2.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
license: apache-2.0
|
3 |
+
language:
|
4 |
+
- en
|
5 |
+
- ar
|
6 |
+
- zh
|
7 |
+
- nl
|
8 |
+
- fr
|
9 |
+
- de
|
10 |
+
- hi
|
11 |
+
- in
|
12 |
+
- it
|
13 |
+
- ja
|
14 |
+
- pt
|
15 |
+
- ru
|
16 |
+
- es
|
17 |
+
- vi
|
18 |
+
- multilingual
|
19 |
+
datasets:
|
20 |
+
- unicamp-dl/mmarco
|
21 |
---
|
22 |
+
# Cross-Encoder for multilingual MS Marco
|
23 |
+
|
24 |
+
This model was trained on the [MMARCO](https://hf.co/unicamp-dl/mmarco) dataset. It is a machine translated version of MS MARCO using Google Translate. It was translated to 14 languages. In our experiments, we observed that it performs also well for other languages.
|
25 |
+
|
26 |
+
As a base model, we used the [multilingual MiniLMv2](https://huggingface.co/nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large) model.
|
27 |
+
|
28 |
+
The model can be used for Information Retrieval: Given a query, encode the query will all possible passages (e.g. retrieved with ElasticSearch). Then sort the passages in a decreasing order. See [SBERT.net Retrieve & Re-rank](https://www.sbert.net/examples/applications/retrieve_rerank/README.html) for more details. The training code is available here: [SBERT.net Training MS Marco](https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/ms_marco)
|
29 |
+
|
30 |
+
## Usage with SentenceTransformers
|
31 |
+
|
32 |
+
The usage becomes easy when you have [SentenceTransformers](https://www.sbert.net/) installed. Then, you can use the pre-trained models like this:
|
33 |
+
```python
|
34 |
+
from sentence_transformers import CrossEncoder
|
35 |
+
model = CrossEncoder('model_name')
|
36 |
+
scores = model.predict([('Query', 'Paragraph1'), ('Query', 'Paragraph2') , ('Query', 'Paragraph3')])
|
37 |
+
```
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
42 |
+
## Usage with Transformers
|
43 |
+
|
44 |
+
```python
|
45 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
46 |
+
import torch
|
47 |
+
|
48 |
+
model = AutoModelForSequenceClassification.from_pretrained('model_name')
|
49 |
+
tokenizer = AutoTokenizer.from_pretrained('model_name')
|
50 |
+
|
51 |
+
features = tokenizer(['How many people live in Berlin?', 'How many people live in Berlin?'], ['Berlin has a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.', 'New York City is famous for the Metropolitan Museum of Art.'], padding=True, truncation=True, return_tensors="pt")
|
52 |
+
|
53 |
+
model.eval()
|
54 |
+
with torch.no_grad():
|
55 |
+
scores = model(**features).logits
|
56 |
+
print(scores)
|
57 |
+
```
|
58 |
+
|
59 |
+
|
60 |
+
|
config.json
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large",
|
3 |
+
"architectures": [
|
4 |
+
"XLMRobertaForSequenceClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"classifier_dropout": null,
|
9 |
+
"eos_token_id": 2,
|
10 |
+
"hidden_act": "gelu",
|
11 |
+
"hidden_dropout_prob": 0.1,
|
12 |
+
"hidden_size": 384,
|
13 |
+
"id2label": {
|
14 |
+
"0": "LABEL_0"
|
15 |
+
},
|
16 |
+
"initializer_range": 0.02,
|
17 |
+
"intermediate_size": 1536,
|
18 |
+
"label2id": {
|
19 |
+
"LABEL_0": 0
|
20 |
+
},
|
21 |
+
"layer_norm_eps": 1e-05,
|
22 |
+
"max_position_embeddings": 514,
|
23 |
+
"model_type": "xlm-roberta",
|
24 |
+
"num_attention_heads": 12,
|
25 |
+
"num_hidden_layers": 12,
|
26 |
+
"pad_token_id": 1,
|
27 |
+
"position_embedding_type": "absolute",
|
28 |
+
"torch_dtype": "float32",
|
29 |
+
"transformers_version": "4.18.0",
|
30 |
+
"type_vocab_size": 1,
|
31 |
+
"use_cache": true,
|
32 |
+
"vocab_size": 250002,
|
33 |
+
"sbert_ce_default_activation_function": "torch.nn.modules.linear.Identity"
|
34 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1abc209e54d70bbcb08c1b5111a924fb99c0428f51cab1659310ccdcab69dc03
|
3 |
+
size 470633197
|
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
3 |
+
size 5069051
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
|
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:62c24cdc13d4c9952d63718d6c9fa4c287974249e16b7ade6d5a85e7bbb75626
|
3 |
+
size 17082660
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "sep_token": "</s>", "cls_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large", "tokenizer_class": "XLMRobertaTokenizer"}
|
train_script.py
ADDED
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from codecs import EncodedFile
|
2 |
+
from datetime import datetime
|
3 |
+
from typing import Optional
|
4 |
+
|
5 |
+
import datasets
|
6 |
+
import torch
|
7 |
+
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything
|
8 |
+
from torch.utils.data import DataLoader
|
9 |
+
from transformers import (
|
10 |
+
AutoConfig,
|
11 |
+
AutoModelForSequenceClassification,
|
12 |
+
AutoTokenizer,
|
13 |
+
get_linear_schedule_with_warmup,
|
14 |
+
get_scheduler,
|
15 |
+
)
|
16 |
+
import torch
|
17 |
+
import sys
|
18 |
+
import os
|
19 |
+
from argparse import ArgumentParser
|
20 |
+
from datasets import load_dataset
|
21 |
+
import tqdm
|
22 |
+
import json
|
23 |
+
import gzip
|
24 |
+
import random
|
25 |
+
from pytorch_lightning.callbacks import ModelCheckpoint
|
26 |
+
import numpy as np
|
27 |
+
from shutil import copyfile
|
28 |
+
from pytorch_lightning.loggers import WandbLogger
|
29 |
+
import transformers
|
30 |
+
|
31 |
+
|
32 |
+
class MSMARCOData(LightningDataModule):
|
33 |
+
def __init__(
|
34 |
+
self,
|
35 |
+
model_name: str,
|
36 |
+
triplets_path: str,
|
37 |
+
langs,
|
38 |
+
max_seq_length: int = 250,
|
39 |
+
train_batch_size: int = 32,
|
40 |
+
eval_batch_size: int = 32,
|
41 |
+
num_negs: int = 3,
|
42 |
+
cross_lingual_chance: float = 0.0,
|
43 |
+
**kwargs,
|
44 |
+
):
|
45 |
+
super().__init__()
|
46 |
+
self.model_name = model_name
|
47 |
+
self.triplets_path = triplets_path
|
48 |
+
self.max_seq_length = max_seq_length
|
49 |
+
self.train_batch_size = train_batch_size
|
50 |
+
self.eval_batch_size = eval_batch_size
|
51 |
+
self.langs = langs
|
52 |
+
self.num_negs = num_negs
|
53 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
54 |
+
self.cross_lingual_chance = cross_lingual_chance #Probability for cross-lingual batches
|
55 |
+
|
56 |
+
#def setup(self, stage: str):
|
57 |
+
print(f"!!!!!!!!!!!!!!!!!! SETUP {os.getpid()} !!!!!!!!!!!!!!!")
|
58 |
+
|
59 |
+
#Get the queries
|
60 |
+
self.queries = {lang: {} for lang in self.langs}
|
61 |
+
|
62 |
+
for lang in self.langs:
|
63 |
+
for row in tqdm.tqdm(load_dataset('unicamp-dl/mmarco', f'queries-{lang}')['train'], desc=lang):
|
64 |
+
self.queries[lang][row['id']] = row['text']
|
65 |
+
|
66 |
+
#Get the passages
|
67 |
+
self.collections = {lang: load_dataset('unicamp-dl/mmarco', f'collection-{lang}')['collection'] for lang in self.langs}
|
68 |
+
|
69 |
+
#Get the triplets
|
70 |
+
with gzip.open(self.triplets_path, 'rt') as fIn:
|
71 |
+
self.triplets = [json.loads(line) for line in tqdm.tqdm(fIn, desc="triplets", total=502938)]
|
72 |
+
"""
|
73 |
+
self.triplets = []
|
74 |
+
for line in tqdm.tqdm(fIn):
|
75 |
+
self.triplets.append(json.loads(line))
|
76 |
+
if len(self.triplets) >= 1000:
|
77 |
+
break
|
78 |
+
"""
|
79 |
+
|
80 |
+
def collate_fn(self, batch):
|
81 |
+
cross_lingual_batch = random.random() < self.cross_lingual_chance
|
82 |
+
|
83 |
+
#Create data for list-rank-loss
|
84 |
+
query_doc_pairs = [[] for _ in range(1+self.num_negs)]
|
85 |
+
|
86 |
+
for row in batch:
|
87 |
+
qid = row['qid']
|
88 |
+
pos_id = random.choice(row['pos'])
|
89 |
+
|
90 |
+
query_lang = random.choice(self.langs)
|
91 |
+
query_text = self.queries[query_lang][qid]
|
92 |
+
|
93 |
+
doc_lang = random.choice(self.langs) if cross_lingual_batch else query_lang
|
94 |
+
query_doc_pairs[0].append((query_text, self.collections[doc_lang][pos_id]['text']))
|
95 |
+
|
96 |
+
dense_bm25_neg = list(set(row['dense_neg'] + row['bm25_neg']))
|
97 |
+
neg_ids = random.sample(dense_bm25_neg, self.num_negs)
|
98 |
+
|
99 |
+
for num_neg, neg_id in enumerate(neg_ids):
|
100 |
+
doc_lang = random.choice(self.langs) if cross_lingual_batch else query_lang
|
101 |
+
query_doc_pairs[1+num_neg].append((query_text, self.collections[doc_lang][neg_id]['text']))
|
102 |
+
|
103 |
+
#Now tokenize the data
|
104 |
+
features = [self.tokenizer(qd_pair, max_length=self.max_seq_length, padding=True, truncation='only_second', return_tensors="pt") for qd_pair in query_doc_pairs]
|
105 |
+
|
106 |
+
return features
|
107 |
+
|
108 |
+
def train_dataloader(self):
|
109 |
+
return DataLoader(self.triplets, shuffle=True, batch_size=self.train_batch_size, num_workers=1, pin_memory=True, collate_fn=self.collate_fn)
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
|
115 |
+
class ListRankLoss(LightningModule):
|
116 |
+
def __init__(
|
117 |
+
self,
|
118 |
+
model_name: str,
|
119 |
+
learning_rate: float = 2e-5,
|
120 |
+
warmup_steps: int = 1000,
|
121 |
+
weight_decay: float = 0.01,
|
122 |
+
train_batch_size: int = 32,
|
123 |
+
eval_batch_size: int = 32,
|
124 |
+
**kwargs,
|
125 |
+
):
|
126 |
+
super().__init__()
|
127 |
+
|
128 |
+
self.save_hyperparameters()
|
129 |
+
print(self.hparams)
|
130 |
+
|
131 |
+
self.config = AutoConfig.from_pretrained(model_name, num_labels=1)
|
132 |
+
self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=self.config)
|
133 |
+
self.loss_fct = torch.nn.CrossEntropyLoss()
|
134 |
+
self.global_train_step = 0
|
135 |
+
|
136 |
+
|
137 |
+
def forward(self, **inputs):
|
138 |
+
return self.model(**inputs)
|
139 |
+
|
140 |
+
def training_step(self, batch, batch_idx):
|
141 |
+
pred_scores = []
|
142 |
+
scores = torch.tensor([0] * len(batch[0]['input_ids']), device=self.model.device)
|
143 |
+
|
144 |
+
for feature in batch:
|
145 |
+
pred_scores.append(self(**feature).logits.squeeze())
|
146 |
+
|
147 |
+
pred_scores = torch.stack(pred_scores, 1)
|
148 |
+
loss_value = self.loss_fct(pred_scores, scores)
|
149 |
+
self.global_train_step += 1
|
150 |
+
self.log('global_train_step', self.global_train_step)
|
151 |
+
self.log("train/loss", loss_value)
|
152 |
+
|
153 |
+
return loss_value
|
154 |
+
|
155 |
+
|
156 |
+
def setup(self, stage=None) -> None:
|
157 |
+
if stage != "fit":
|
158 |
+
return
|
159 |
+
# Get dataloader by calling it - train_dataloader() is called after setup() by default
|
160 |
+
train_loader = self.trainer.datamodule.train_dataloader()
|
161 |
+
|
162 |
+
# Calculate total steps
|
163 |
+
tb_size = self.hparams.train_batch_size * max(1, self.trainer.gpus)
|
164 |
+
ab_size = self.trainer.accumulate_grad_batches
|
165 |
+
self.total_steps = (len(train_loader) // ab_size) * self.trainer.max_epochs
|
166 |
+
|
167 |
+
print(f"{tb_size=}")
|
168 |
+
print(f"{ab_size=}")
|
169 |
+
print(f"{len(train_loader)=}")
|
170 |
+
print(f"{self.total_steps=}")
|
171 |
+
|
172 |
+
|
173 |
+
def configure_optimizers(self):
|
174 |
+
"""Prepare optimizer and schedule (linear warmup and decay)"""
|
175 |
+
model = self.model
|
176 |
+
no_decay = ["bias", "LayerNorm.weight"]
|
177 |
+
optimizer_grouped_parameters = [
|
178 |
+
{
|
179 |
+
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
180 |
+
"weight_decay": self.hparams.weight_decay,
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
|
184 |
+
"weight_decay": 0.0,
|
185 |
+
},
|
186 |
+
]
|
187 |
+
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate)
|
188 |
+
|
189 |
+
lr_scheduler = get_scheduler(
|
190 |
+
name="linear",
|
191 |
+
optimizer=optimizer,
|
192 |
+
num_warmup_steps=self.hparams.warmup_steps,
|
193 |
+
num_training_steps=self.total_steps,
|
194 |
+
)
|
195 |
+
|
196 |
+
scheduler = {"scheduler": lr_scheduler, "interval": "step", "frequency": 1}
|
197 |
+
return [optimizer], [scheduler]
|
198 |
+
|
199 |
+
|
200 |
+
|
201 |
+
def main(args):
|
202 |
+
dm = MSMARCOData(
|
203 |
+
model_name=args.model,
|
204 |
+
langs=args.langs,
|
205 |
+
triplets_path='data/msmarco-hard-triplets.jsonl.gz',
|
206 |
+
train_batch_size=args.batch_size,
|
207 |
+
cross_lingual_chance=args.cross_lingual_chance,
|
208 |
+
num_negs=args.num_negs
|
209 |
+
)
|
210 |
+
output_dir = f"output/{args.model.replace('/', '-')}-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
|
211 |
+
print("Output_dir:", output_dir)
|
212 |
+
|
213 |
+
os.makedirs(output_dir, exist_ok=True)
|
214 |
+
|
215 |
+
wandb_logger = WandbLogger(project="multilingual-cross-encoder", name=output_dir.split("/")[-1])
|
216 |
+
|
217 |
+
train_script_path = os.path.join(output_dir, 'train_script.py')
|
218 |
+
copyfile(__file__, train_script_path)
|
219 |
+
with open(train_script_path, 'a') as fOut:
|
220 |
+
fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
|
221 |
+
|
222 |
+
|
223 |
+
# saves top-K checkpoints based on "val_loss" metric
|
224 |
+
checkpoint_callback = ModelCheckpoint(
|
225 |
+
every_n_train_steps=25000,
|
226 |
+
save_top_k=5,
|
227 |
+
monitor="global_train_step",
|
228 |
+
mode="max",
|
229 |
+
dirpath=output_dir,
|
230 |
+
filename="ckpt-{global_train_step}",
|
231 |
+
)
|
232 |
+
|
233 |
+
|
234 |
+
model = ListRankLoss(model_name=args.model)
|
235 |
+
|
236 |
+
trainer = Trainer(max_epochs=args.epochs,
|
237 |
+
accelerator="gpu",
|
238 |
+
devices=args.num_gpus,
|
239 |
+
precision=args.precision,
|
240 |
+
strategy=args.strategy,
|
241 |
+
default_root_dir=output_dir,
|
242 |
+
callbacks=[checkpoint_callback],
|
243 |
+
logger=wandb_logger
|
244 |
+
)
|
245 |
+
|
246 |
+
trainer.fit(model, datamodule=dm)
|
247 |
+
|
248 |
+
#Save final HF model
|
249 |
+
final_path = os.path.join(output_dir, "final")
|
250 |
+
dm.tokenizer.save_pretrained(final_path)
|
251 |
+
model.model.save_pretrained(final_path)
|
252 |
+
|
253 |
+
|
254 |
+
def eval(args):
|
255 |
+
import ir_datasets
|
256 |
+
|
257 |
+
|
258 |
+
model = ListRankLoss.load_from_checkpoint(args.ckpt)
|
259 |
+
hf_model = model.model.cuda()
|
260 |
+
tokenizer = AutoTokenizer.from_pretrained(model.hparams.model_name)
|
261 |
+
|
262 |
+
dev_qids = set()
|
263 |
+
|
264 |
+
dev_queries = {}
|
265 |
+
dev_rel_docs = {}
|
266 |
+
needed_pids = set()
|
267 |
+
needed_qids = set()
|
268 |
+
|
269 |
+
corpus = {}
|
270 |
+
retrieved_docs = {}
|
271 |
+
|
272 |
+
dataset = ir_datasets.load("msmarco-passage/dev/small")
|
273 |
+
for query in dataset.queries_iter():
|
274 |
+
dev_qids.add(query.query_id)
|
275 |
+
|
276 |
+
|
277 |
+
with open('data/qrels.dev.tsv') as fIn:
|
278 |
+
for line in fIn:
|
279 |
+
qid, _, pid, _ = line.strip().split('\t')
|
280 |
+
|
281 |
+
if qid not in dev_qids:
|
282 |
+
continue
|
283 |
+
|
284 |
+
if qid not in dev_rel_docs:
|
285 |
+
dev_rel_docs[qid] = set()
|
286 |
+
dev_rel_docs[qid].add(pid)
|
287 |
+
|
288 |
+
retrieved_docs[qid] = set()
|
289 |
+
needed_qids.add(qid)
|
290 |
+
needed_pids.add(pid)
|
291 |
+
|
292 |
+
for query in dataset.queries_iter():
|
293 |
+
qid = query.query_id
|
294 |
+
if qid in needed_qids:
|
295 |
+
dev_queries[qid] = query.text
|
296 |
+
|
297 |
+
with open('data/top1000.dev', 'rt') as fIn:
|
298 |
+
for line in fIn:
|
299 |
+
qid, pid, query, passage = line.strip().split("\t")
|
300 |
+
corpus[pid] = passage
|
301 |
+
retrieved_docs[qid].add(pid)
|
302 |
+
|
303 |
+
|
304 |
+
## Run evaluator
|
305 |
+
print("Queries: {}".format(len(dev_queries)))
|
306 |
+
|
307 |
+
mrr_scores = []
|
308 |
+
hf_model.eval()
|
309 |
+
|
310 |
+
with torch.no_grad():
|
311 |
+
for qid in tqdm.tqdm(dev_queries, total=len(dev_queries)):
|
312 |
+
query = dev_queries[qid]
|
313 |
+
top_pids = list(retrieved_docs[qid])
|
314 |
+
cross_inp = [[query, corpus[pid]] for pid in top_pids]
|
315 |
+
|
316 |
+
encoded = tokenizer(cross_inp, padding=True, truncation=True, return_tensors="pt").to('cuda')
|
317 |
+
output = model(**encoded)
|
318 |
+
bert_score = output.logits.detach().cpu().numpy()
|
319 |
+
bert_score = np.squeeze(bert_score)
|
320 |
+
|
321 |
+
argsort = np.argsort(-bert_score)
|
322 |
+
|
323 |
+
rank_score = 0
|
324 |
+
for rank, idx in enumerate(argsort[0:10]):
|
325 |
+
pid = top_pids[idx]
|
326 |
+
if pid in dev_rel_docs[qid]:
|
327 |
+
rank_score = 1/(rank+1)
|
328 |
+
break
|
329 |
+
|
330 |
+
mrr_scores.append(rank_score)
|
331 |
+
|
332 |
+
if len(mrr_scores) % 10 == 0:
|
333 |
+
print("{} MRR@10: {:.2f}".format(len(mrr_scores), 100*np.mean(mrr_scores)))
|
334 |
+
|
335 |
+
print("MRR@10: {:.2f}".format(np.mean(mrr_scores)*100))
|
336 |
+
|
337 |
+
|
338 |
+
if __name__ == '__main__':
|
339 |
+
parser = ArgumentParser()
|
340 |
+
parser.add_argument("--num_gpus", type=int, default=1)
|
341 |
+
parser.add_argument("--batch_size", type=int, default=32)
|
342 |
+
parser.add_argument("--epochs", type=int, default=10)
|
343 |
+
parser.add_argument("--strategy", default=None)
|
344 |
+
parser.add_argument("--model", default='microsoft/mdeberta-v3-base')
|
345 |
+
parser.add_argument("--eval", action="store_true")
|
346 |
+
parser.add_argument("--ckpt")
|
347 |
+
parser.add_argument("--cross_lingual_chance", type=float, default=0.33)
|
348 |
+
parser.add_argument("--precision", type=int, default=16)
|
349 |
+
parser.add_argument("--num_negs", type=int, default=3)
|
350 |
+
parser.add_argument("--langs", nargs="+", default=['english', 'chinese', 'french', 'german', 'indonesian', 'italian', 'portuguese', 'russian', 'spanish', 'arabic', 'dutch', 'hindi', 'japanese', 'vietnamese'])
|
351 |
+
|
352 |
+
|
353 |
+
args = parser.parse_args()
|
354 |
+
|
355 |
+
if args.eval:
|
356 |
+
eval(args)
|
357 |
+
else:
|
358 |
+
main(args)
|
359 |
+
|
360 |
+
|
361 |
+
# Script was called via:
|
362 |
+
#python cross_mutlilingual.py --model nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large
|