Labels:
- **demand**: somebody wants to buy something or hire somebody
- **noise**: messages unrelated to the topic.

## Usage
``` python
from typing import Optional
import torch
from torch.nn import Linear, Module, TransformerEncoderLayer, CrossEntropyLoss
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModel, AutoTokenizer
import numpy as np


class GodDeal(Module, PyTorchModelHubMixin):
    def __init__(self,
                 backbone_name: str,
                 num_labels: Optional[int] = 3,
                 use_adapter: Optional[bool] = False,
                 id2label: Optional[dict[int, str]] = None
                 ):
        super().__init__()
        self.use_adapter = use_adapter
        self.num_labels = num_labels
        self.id2label = id2label
        self.backbone = AutoModel.from_pretrained(backbone_name)

        # Adapter layer
        if self.use_adapter:
            self.adapter = TransformerEncoderLayer(
                d_model=self.backbone.config.hidden_size,
                nhead=self.backbone.config.num_attention_heads,
                dim_feedforward=self.backbone.config.intermediate_size,
                activation="gelu",
                dropout=0.1,
                batch_first=True  # I/O shape: (batch, seq, feature)
            )
        else:
            self.adapter = None

        # Classification head
        self.separator_head = Linear(self.backbone.config.hidden_size, num_labels)
        self.loss = CrossEntropyLoss()

    def __decode_labels(self, labels: list | np.ndarray) -> list:
        # the mapping is stored as JSON in the hub config, so keys are looked up as strings
        return list(map(lambda idx: self.id2label[str(idx)], labels))

    def predict(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor
                ) -> dict[str, np.ndarray]:
        with torch.inference_mode():
            outputs = self(input_ids, attention_mask)

        logits = outputs["logits"]
        labels = logits.softmax(-1).argmax(-1).cpu().numpy()

        if self.id2label is not None:
            labels = self.__decode_labels(labels)

        return {
            "labels": labels,
            "embeddings": outputs["embeddings"].cpu().numpy()
        }

    def forward(self,
                input_ids: torch.Tensor,
                attention_mask: torch.Tensor,
                labels: Optional[torch.Tensor] = None
                ) -> dict[str, torch.Tensor]:
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        if self.use_adapter:
            last_hidden_state = self.adapter(last_hidden_state)
        cls_embeddings = last_hidden_state[:, 0]  # [CLS] token as the sentence embedding
        logits = self.separator_head(cls_embeddings)

        if labels is not None:
            loss = self.loss(logits, labels)
            return {
                "loss": loss,
                "logits": logits,
                "embeddings": cls_embeddings
            }
        return {
            "logits": logits,
            "embeddings": cls_embeddings
        }


MODEL_NAME = 'poc-embeddings/rubert-tiny-turbo-godeal'
model = GodDeal.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model.eval()  # turn off dropout for inference

message = "Куплю Iphone 8"  # "I will buy an iPhone 8"
ids = tokenizer(message, return_tensors="pt")
input_ids, attention_mask = ids['input_ids'], ids['attention_mask']
preds = model.predict(input_ids, attention_mask)
print(preds)
```
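
`predict` also works on batches; here is a minimal sketch, assuming the checkpoint ships an `id2label` mapping. The second message and the expected labels are illustrative assumptions, not verified model output:

``` python
# Hypothetical batch example: pad the messages so the tensors stack.
messages = [
    "Куплю Iphone 8",  # "I will buy an iPhone 8" -> presumably "demand"
    "Всем привет!",    # "Hi everyone!" -> presumably "noise"
]
batch = tokenizer(messages, padding=True, truncation=True, return_tensors="pt")
preds = model.predict(batch["input_ids"], batch["attention_mask"])
print(preds["labels"])            # one label per message
print(preds["embeddings"].shape)  # (batch_size, hidden_size) CLS embeddings
```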

## Training
The backbone was trained on a clustered dataset for a matching problem.
The partially unfrozen model with the classification head was then fine-tuned on a custom dataset containing exports from different Telegram chats; a rough sketch of such a setup follows the metrics below.
```
weighted average precision : 0.946
weighted average f1-score  : 0.945
macro average precision    : 0.943
macro average f1-score     : 0.945
```
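
Partial unfreezing of this kind could look roughly like the following; unfreezing only the last encoder layer, the learning rate, the placeholder label, and the BERT-style `backbone.encoder.layer` attribute are all assumptions, not the exact recipe used:

``` python
# Sketch: freeze the backbone, then unfreeze only its last encoder layer.
for param in model.backbone.parameters():
    param.requires_grad = False
for param in model.backbone.encoder.layer[-1].parameters():  # assumes a BERT-like backbone
    param.requires_grad = True

# The classification head (and adapter, if enabled) stays trainable.
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=2e-5
)

# One illustrative training step; label id 0 is a placeholder.
model.train()
out = model(input_ids, attention_mask, labels=torch.tensor([0]))
out["loss"].backward()
optimizer.step()
optimizer.zero_grad()
```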