Labels:
- **demand**: somebody wants to buy something or hire somebody
- **noise**: messages unrelated to the topic.

## Usage
``` python
from typing import Optional
import torch
from torch.nn import Linear, Module, TransformerEncoderLayer, CrossEntropyLoss
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModel, AutoTokenizer
import numpy as np


class GodDeal(Module, PyTorchModelHubMixin):
    def __init__(self,
                 backbone_name: str,
                 num_labels: Optional[int] = 3,
                 use_adapter: Optional[bool] = False,
                 id2label: Optional[dict[int, str]] = None
                 ):
        super().__init__()
        self.use_adapter = use_adapter
        self.num_labels = num_labels
        self.id2label = id2label
        self.backbone = AutoModel.from_pretrained(backbone_name)

        # Adapter layer
        if self.use_adapter:
            self.adapter = TransformerEncoderLayer(
                d_model=self.backbone.config.hidden_size,
                nhead=self.backbone.config.num_attention_heads,
                dim_feedforward=self.backbone.config.intermediate_size,
                activation="gelu",
                dropout=0.1,
                batch_first=True  # I/O shape: (batch, seq, feature)
            )
        else:
            self.adapter = None

        # Classification head
        self.separator_head = Linear(self.backbone.config.hidden_size, num_labels)
        self.loss = CrossEntropyLoss()

    def __decode_labels(self, labels: list | np.ndarray) -> list:
        # the mapping is stored as JSON in the hub config, so keys are looked up as strings
        return list(map(lambda idx: self.id2label[str(idx)], labels))

    def predict(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor
                ) -> dict[str, np.ndarray]:
        with torch.inference_mode():
            outputs = self(input_ids, attention_mask)

        logits = outputs["logits"]
        labels = logits.softmax(-1).argmax(-1).cpu().numpy()

        if self.id2label is not None:
            labels = self.__decode_labels(labels)

        return {
            "labels": labels,
            "embeddings": outputs["embeddings"].cpu().numpy()
        }

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                labels: Optional[torch.Tensor] = None
                ) -> dict[str, torch.Tensor]:
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        if self.use_adapter:
            last_hidden_state = self.adapter(last_hidden_state)
        cls_embeddings = last_hidden_state[:, 0]  # [CLS] token as the sentence embedding
        logits = self.separator_head(cls_embeddings)

        if labels is not None:
            loss = self.loss(logits, labels)
            return {
                "loss": loss,
                "logits": logits,
                "embeddings": cls_embeddings
            }
        return {
            "logits": logits,
            "embeddings": cls_embeddings
        }


MODEL_NAME = 'poc-embeddings/rubert-tiny-turbo-godeal'
model = GodDeal.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model.eval()  # turn off dropout for inference

message = "Куплю Iphone 8"  # "I will buy an iPhone 8"
ids = tokenizer(message, return_tensors="pt")
input_ids, attention_mask = ids['input_ids'], ids['attention_mask']
preds = model.predict(input_ids, attention_mask)
print(preds)
```
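
`predict` also works on batches; here is a minimal sketch, assuming the checkpoint ships an `id2label` mapping. The second message and the expected labels are illustrative assumptions, not verified model output:

``` python
# Hypothetical batch example: pad the messages so the tensors stack.
messages = [
    "Куплю Iphone 8",  # "I will buy an iPhone 8" -> presumably "demand"
    "Всем привет!",    # "Hi everyone!" -> presumably "noise"
]
batch = tokenizer(messages, padding=True, truncation=True, return_tensors="pt")
preds = model.predict(batch["input_ids"], batch["attention_mask"])
print(preds["labels"])            # one label per message
print(preds["embeddings"].shape)  # (batch_size, hidden_size) CLS embeddings
```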

## Training
The backbone was trained on a clustered dataset for a matching problem.
The partially unfrozen model with the classification head was then fine-tuned on a custom dataset containing exports from different Telegram chats; a rough sketch of such a setup follows the metrics below.
```
weighted average precision : 0.946
weighted average f1-score  : 0.945
macro average precision    : 0.943
macro average f1-score     : 0.945
```
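
Partial unfreezing of this kind could look roughly like the following; unfreezing only the last encoder layer, the learning rate, the placeholder label, and the BERT-style `backbone.encoder.layer` attribute are all assumptions, not the exact recipe used:

``` python
# Sketch: freeze the backbone, then unfreeze only its last encoder layer.
for param in model.backbone.parameters():
    param.requires_grad = False
for param in model.backbone.encoder.layer[-1].parameters():  # assumes a BERT-like backbone
    param.requires_grad = True

# The classification head (and adapter, if enabled) stays trainable.
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=2e-5
)

# One illustrative training step; label id 0 is a placeholder.
model.train()
out = model(input_ids, attention_mask, labels=torch.tensor([0]))
out["loss"].backward()
optimizer.step()
optimizer.zero_grad()
```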