
This is ai-forever/ru-en-RoSBERTa fine-tuned to classify message types from Telegram marketplaces.

Labels:

  • supply: somebody is willing to sell something or provide a service
  • demand: somebody wants to buy something or hire somebody
  • noise: messages unrelated to the topic
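The predicted index is decoded through the model's id2label mapping, which is stored in the hosted config. A minimal sketch of that mapping (the index order shown here is an assumption; config.json in the repository is authoritative):

id2label = {
    "0": "supply",  # assumed order, check config.json
    "1": "demand",
    "2": "noise",
}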

Usage

from typing import Optional

import numpy as np
import torch
from torch.nn import CrossEntropyLoss, Linear, Module, TransformerEncoderLayer
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModel, AutoTokenizer


class GodDeal(Module, PyTorchModelHubMixin):
    def __init__(self,
                 backbone_name: str,
                 num_labels: Optional[int] = 3,
                 use_adapter: Optional[bool] = False,
                 id2label: Optional[dict[int, str]] = None
                 ):
        super().__init__()
        self.use_adapter = use_adapter
        self.num_labels = num_labels
        self.id2label = id2label
        self.backbone = AutoModel.from_pretrained(backbone_name)

        # Adapter layer
        if self.use_adapter:
            self.adapter = TransformerEncoderLayer(
                d_model=self.backbone.config.hidden_size,
                nhead=self.backbone.config.num_attention_heads,
                dim_feedforward=self.backbone.config.intermediate_size,
                activation="gelu",
                dropout=0.1,
                batch_first=True  # I/O shape: batch, seq, feature
            )
        else:
            self.adapter = None

        # Classification head
        self.separator_head = Linear(self.backbone.config.hidden_size, num_labels)
        self.loss = CrossEntropyLoss()

    def __decode_labels(self, labels: list | np.ndarray) -> list:
        # id2label keys are strings when loaded from config.json, so indices are cast to str
        return list(map(lambda idx: self.id2label[str(idx)], labels))

    def predict(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor
                ) -> dict[str, np.ndarray]:
        # Run the forward pass without gradient tracking
        with torch.inference_mode():
            outputs = self(input_ids, attention_mask)

        logits = outputs["logits"]
        labels = logits.softmax(-1).argmax(-1).cpu().numpy()

        if self.id2label is not None:
            labels = self.__decode_labels(labels)

        return {
            "labels": labels,
            "embeddings": outputs["embeddings"].cpu().numpy()
        }

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                labels: Optional[torch.Tensor] = None
                ) -> dict[str, torch.Tensor]:
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        if self.use_adapter:
            last_hidden_state = self.adapter(last_hidden_state)
        # Use the [CLS] token embedding as the sequence representation
        cls_embeddings = last_hidden_state[:, 0]
        logits = self.separator_head(cls_embeddings)

        if labels is not None:
            loss = self.loss(logits, labels)
            return {
                "loss": loss,
                "logits": logits,
                "embeddings": cls_embeddings
            }
        return {
            "logits": logits,
            "embeddings": cls_embeddings
        }

MODEL_NAME = "poc-embeddings/ru-en-RoSBERTa-godeal"
model = GodDeal.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


message = "Куплю Iphone 8"  # "I want to buy an iPhone 8"
ids = tokenizer(message, return_tensors="pt")
input_ids, attention_mask = ids['input_ids'], ids['attention_mask']
preds = model.predict(input_ids, attention_mask)
print(preds)
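For multiple messages, the tokenizer can pad and truncate a whole batch at once. A minimal sketch (the example messages and the max_length value are assumptions, not part of the published usage):

messages = [
    "Продам велосипед, самовывоз",       # "Selling a bicycle, pickup only"
    "Ищу мастера по ремонту ноутбуков",  # "Looking for a laptop repair technician"
]
batch = tokenizer(
    messages,
    padding=True,
    truncation=True,
    max_length=512,  # assumed limit for the RoSBERTa backbone
    return_tensors="pt",
)
preds = model.predict(batch["input_ids"], batch["attention_mask"])
print(preds["labels"])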

Training

The backbone was first trained on a clustered dataset for a matching problem. The model was then partially unfrozen and fine-tuned with the classification head on a custom dataset of exports from various Telegram chats.
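The exact training setup is not published. A minimal sketch of one way to partially unfreeze such a backbone (the number of trainable layers is an assumption, and it assumes the backbone exposes encoder.layer as BERT/RoBERTa models do):

# Freeze the whole backbone first.
for param in model.backbone.parameters():
    param.requires_grad = False

# Unfreeze only the top encoder layers (the count here is hypothetical).
N_TRAINABLE = 2
for layer in model.backbone.encoder.layer[-N_TRAINABLE:]:
    for param in layer.parameters():
        param.requires_grad = True

# The classification head stays trainable.
for param in model.separator_head.parameters():
    param.requires_grad = True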

Metrics:

  • weighted average precision: 0.962
  • weighted average F1-score: 0.962
  • macro average precision: 0.962
  • macro average F1-score: 0.962
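These are standard weighted and macro averages, as produced for example by scikit-learn's classification_report. A minimal sketch of how such a report can be reproduced on your own labeled split (the example labels below are hypothetical):

from sklearn.metrics import classification_report

y_true = ["demand", "supply", "noise", "demand"]  # hypothetical ground truth
y_pred = ["demand", "supply", "noise", "supply"]  # hypothetical predictions
print(classification_report(y_true, y_pred, digits=3))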