yeamerci's picture
Update README.md
0aa2933 verified
---
library_name: torch
tags:
- NLP
- PyTorch
- model_hub_mixin
- pytorch_model_hub_mixin
- sentence-transformers
- text classification
---
This is sergeyzh/rubert-tiny-turbo fine-tuned to classify message types from Telegram marketplaces.
Labels:
- **supply**: somebody is willing to sell something or provide a service
- **demand**: somebody wants to buy something or hire somebody
- **noise**: messages unrelated to the topic.
## Usage
``` python
from typing import Optional

import numpy as np
import torch
from torch.nn import CrossEntropyLoss, Linear, Module, TransformerEncoderLayer

from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModel, AutoTokenizer
class GodDeal(Module, PyTorchModelHubMixin):
    """Message-type classifier on top of a pretrained encoder backbone.

    Wraps a Hugging Face ``AutoModel`` encoder with an optional extra
    ``TransformerEncoderLayer`` adapter and a linear classification head.
    ``PyTorchModelHubMixin`` makes the whole model loadable from /
    pushable to the Hub via ``from_pretrained`` / ``push_to_hub``.
    """

    def __init__(self,
                 backbone_name: str,
                 num_labels: int = 3,
                 use_adapter: bool = False,
                 id2label: Optional[dict[int, str]] = None
                 ):
        """
        Args:
            backbone_name: Hub id or local path of the pretrained encoder.
            num_labels: Number of output classes.
            use_adapter: If True, insert one trainable transformer layer
                between the backbone and the classification head.
            id2label: Optional mapping from class index to label name, used
                by :meth:`predict` to return string labels. Keys may be
                ``int`` or ``str`` (Hub configs serialize keys as strings).
        """
        super().__init__()
        self.use_adapter = use_adapter
        self.num_labels = num_labels
        self.id2label = id2label
        self.backbone = AutoModel.from_pretrained(backbone_name)
        # Optional adapter layer, shaped to match the backbone's config.
        if self.use_adapter:
            self.adapter = TransformerEncoderLayer(
                d_model=self.backbone.config.hidden_size,
                nhead=self.backbone.config.num_attention_heads,
                dim_feedforward=self.backbone.config.intermediate_size,
                activation="gelu",
                dropout=0.1,
                batch_first=True  # I/O shape: (batch, seq, feature)
            )
        else:
            self.adapter = None
        # Classification head applied to the [CLS] token embedding.
        self.separator_head = Linear(self.backbone.config.hidden_size, num_labels)
        self.loss = CrossEntropyLoss()

    def __decode_labels(self, labels: list | np.ndarray) -> list:
        """Map class indices to label names via ``self.id2label``.

        Accepts both int and str keys: a config that round-trips through
        the Hub gets string JSON keys, while a freshly constructed model
        may hold int keys (as the constructor annotation declares).
        """
        def decode(idx) -> str:
            key = int(idx)
            if key in self.id2label:
                return self.id2label[key]
            return self.id2label[str(key)]

        return [decode(idx) for idx in labels]

    def predict(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor
                ) -> dict[str, np.ndarray]:
        """Run inference and return predicted labels plus embeddings.

        Args:
            input_ids: Token id tensor of shape (batch, seq).
            attention_mask: Attention mask tensor of shape (batch, seq).

        Returns:
            Dict with ``"labels"`` (class indices, or names when
            ``id2label`` is set) and ``"embeddings"`` ([CLS] vectors)
            as NumPy arrays / lists.
        """
        with torch.inference_mode():
            outputs = self(input_ids, attention_mask)
            logits = outputs["logits"]
            # argmax over logits == argmax over softmax(logits);
            # softmax is monotonic, so skip it.
            labels = logits.argmax(-1).cpu().numpy()
            if self.id2label is not None:
                labels = self.__decode_labels(labels)
            return {
                "labels": labels,
                "embeddings": outputs["embeddings"].cpu().numpy()
            }

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                labels: Optional[torch.Tensor] = None
                ) -> dict[str, torch.Tensor]:
        """Encode a batch and classify the [CLS] embedding.

        Args:
            input_ids: Token id tensor of shape (batch, seq).
            attention_mask: Attention mask tensor of shape (batch, seq).
            labels: Optional gold class indices; when given, the
                cross-entropy loss is included in the output.

        Returns:
            Dict with ``"logits"`` and ``"embeddings"`` tensors, plus
            ``"loss"`` when ``labels`` is provided.
        """
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        if self.use_adapter:
            last_hidden_state = self.adapter(last_hidden_state)
        # First token ([CLS]) embedding represents the whole sequence.
        cls_embeddings = last_hidden_state[:, 0]
        logits = self.separator_head(cls_embeddings)
        if labels is not None:
            loss = self.loss(logits, labels)
            return {
                "loss": loss,
                "logits": logits,
                "embeddings": cls_embeddings
            }
        return {
            "logits": logits,
            "embeddings": cls_embeddings
        }
MODEL_NAME = 'poc-embeddings/rubert-tiny-turbo-godeal'

# Load the fine-tuned classifier and its tokenizer from the Hub.
model = GodDeal.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize a sample message and classify it.
encoded = tokenizer("Куплю Iphone 8", return_tensors="pt")
preds = model.predict(encoded["input_ids"], encoded["attention_mask"])
print(preds)
```
## Training
The backbone was first trained on a clustered dataset for a matching problem.
The model was then partially unfrozen and fine-tuned with the classification head on a custom dataset containing exports from different Telegram chats.
```
weighted average precision : 0.946
weighted average f1-score : 0.945
macro average precision : 0.943
macro average f1-score : 0.945
```