|
--- |
|
library_name: pytorch
|
tags: |
|
- NLP |
|
- PyTorch |
|
- model_hub_mixin |
|
- pytorch_model_hub_mixin |
|
- sentence-transformers |
|
- text classification |
|
--- |
|
|
|
This is `sergeyzh/rubert-tiny-turbo` fine-tuned to classify the type of messages from Telegram marketplaces.
|
|
|
Labels: |
|
|
|
- **supply**: someone offering to sell something or provide a service
- **demand**: someone looking to buy something or hire someone
- **noise**: messages unrelated to the topic
|
|
|
|
|
## Usage |
|
```python
from typing import Optional

import numpy as np
import torch
from torch.nn import CrossEntropyLoss, Linear, Module, TransformerEncoderLayer

from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModel, AutoTokenizer
|
|
|
|
|
# Backbone encoder with an optional adapter layer and a linear classification head.
class GodDeal(Module, PyTorchModelHubMixin):
|
    def __init__(self,
                 backbone_name: str,
                 num_labels: int = 3,
                 use_adapter: bool = False,
                 id2label: Optional[dict[int, str]] = None
                 ):
|
super().__init__() |
|
self.use_adapter = use_adapter |
|
self.num_labels = num_labels |
|
self.id2label = id2label |
|
self.backbone = AutoModel.from_pretrained(backbone_name) |
|
|
|
# Adapter layer |
|
if self.use_adapter: |
|
self.adapter = TransformerEncoderLayer( |
|
d_model=self.backbone.config.hidden_size, |
|
nhead=self.backbone.config.num_attention_heads, |
|
dim_feedforward=self.backbone.config.intermediate_size, |
|
activation="gelu", |
|
dropout=0.1, |
|
batch_first=True # I/O shape: batch, seq, feature |
|
) |
|
else: |
|
self.adapter = None |
|
|
|
# Classification head |
|
self.separator_head = Linear(self.backbone.config.hidden_size, num_labels) |
|
self.loss = CrossEntropyLoss() |
|
|
|
    def __decode_labels(self, labels: list | np.ndarray) -> list:
        # The config is serialized to JSON on the Hub, so the integer keys of
        # id2label come back as strings after from_pretrained.
        return [self.id2label[str(idx)] for idx in labels]
|
|
|
    # Inference helper: runs the forward pass without gradients and returns
    # decoded labels plus [CLS] embeddings as numpy arrays.
    def predict(self,
|
input_ids: torch.Tensor, |
|
attention_mask: torch.Tensor |
|
) -> dict[str, np.ndarray]: |
|
with torch.inference_mode(): |
|
outputs = self(input_ids, attention_mask) |
|
|
|
            logits = outputs["logits"]
            labels = logits.argmax(-1).cpu().numpy()  # softmax is monotonic, so argmax over raw logits is equivalent
|
|
|
if self.id2label is not None: |
|
labels = self.__decode_labels(labels) |
|
|
|
return { |
|
"labels": labels, |
|
"embeddings": outputs["embeddings"].cpu().numpy() |
|
} |
|
|
|
    # Returns logits and [CLS] embeddings; also includes the loss when labels are given.
    def forward(self,
|
input_ids: torch.Tensor, |
|
attention_mask: torch.Tensor, |
|
labels: Optional[torch.Tensor] = None |
|
) -> dict[str, torch.Tensor]: |
|
outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask) |
|
last_hidden_state = outputs.last_hidden_state |
|
|
|
if self.use_adapter: |
|
last_hidden_state = self.adapter(last_hidden_state) |
|
        cls_embeddings = last_hidden_state[:, 0]  # pooled representation: the [CLS] token embedding
|
logits = self.separator_head(cls_embeddings) |
|
|
|
if labels is not None: |
|
loss = self.loss(logits, labels) |
|
return { |
|
"loss": loss, |
|
"logits": logits, |
|
"embeddings": cls_embeddings |
|
} |
|
return { |
|
"logits": logits, |
|
"embeddings": cls_embeddings |
|
} |
|
|
|
MODEL_NAME = 'poc-embeddings/rubert-tiny-turbo-godeal' |
|
model = GodDeal.from_pretrained(MODEL_NAME) |
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) |
|
|
|
message = "Куплю Iphone 8"  # "I want to buy an iPhone 8"
|
ids = tokenizer(message, return_tensors="pt") |
|
input_ids, attention_mask = ids['input_ids'], ids['attention_mask'] |
|
preds = model.predict(input_ids, attention_mask) |
|
print(preds) |
|
``` |
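`predict` also works on batches; the tokenizer pads several messages to a common length. A minimal sketch (the sample messages below are invented for illustration):

```python
messages = [
    "Продам велосипед, почти новый",     # "Selling a bicycle, almost new"
    "Ищу мастера по ремонту телефонов",  # "Looking for a phone repair specialist"
]
batch = tokenizer(messages, padding=True, truncation=True, return_tensors="pt")
preds = model.predict(batch["input_ids"], batch["attention_mask"])
print(preds["labels"])            # one label per message
print(preds["embeddings"].shape)  # (batch_size, hidden_size)
```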
|
|
|
|
|
## Training |
|
The backbone was trained on a clustered dataset for a matching problem.
The model was then partially unfrozen and fine-tuned with the classification head on a custom dataset containing exports from various Telegram chats.
|
| Metric    | Weighted average | Macro average |
|-----------|------------------|---------------|
| Precision | 0.946            | 0.943         |
| F1-score  | 0.945            | 0.945         |
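The exact fine-tuning recipe is not spelled out above. Purely as an illustration, partial unfreezing with this class could look like the sketch below; the choice to unfreeze only the last encoder layer, the optimizer, and the learning rate are assumptions, not the values actually used.

```python
# Hypothetical sketch of partial unfreezing; reuses GodDeal and the tokenizer
# from the Usage section. The real training setup may differ.
model = GodDeal("sergeyzh/rubert-tiny-turbo", num_labels=3)

# Freeze the whole backbone...
for param in model.backbone.parameters():
    param.requires_grad = False

# ...then unfreeze only the last encoder layer (an assumed choice; this path
# works for BERT-style backbones that expose encoder.layer).
for param in model.backbone.encoder.layer[-1].parameters():
    param.requires_grad = True

optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=2e-5
)

# One illustrative training step: forward returns the loss when labels are given.
batch = tokenizer(["Куплю Iphone 8"], return_tensors="pt")
labels = torch.tensor([0])  # hypothetical label id
outputs = model(batch["input_ids"], batch["attention_mask"], labels=labels)
outputs["loss"].backward()
optimizer.step()
optimizer.zero_grad()
```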