yeamerci committed
Commit 0aa2933
Parent(s): 8a7e7d0

Update README.md

Files changed (1): README.md (+64, -53)

README.md CHANGED
@@ -17,99 +17,110 @@ Labels:
  - **demand**: somebody wants to buy something or hire somebody
  - **noise**: messages unrelated to the topic.

- ## Usage

  ``` python
- from transformers import AutoModel, AutoTokenizer
  import torch
  from torch.nn import Linear, Module, TransformerEncoderLayer, CrossEntropyLoss
  from huggingface_hub import PyTorchModelHubMixin
- from typing import Optional

- HF_MODEL_NAME = 'poc-embeddings/rubert-tiny-turbo-godeal'
- MODEL_NAME = 'sergeyzh/rubert-tiny-turbo'
-
- id2label = {0: 'noise', 1: 'supply', 2: 'demand'}
-
- class SupplyDemandTrader(
-     Module,
-     PyTorchModelHubMixin,
-     repo_url=HF_MODEL_NAME,
-     library_name="torch",
-     tags=["PyTorch", "sentence-transformers", "NLP", "text classification"],
-     docs_url="https://pytorch.org/docs/stable/index.html"
- ):
-     def __init__(self,
-                  num_labels: Optional[int] = 3,
-                  use_adapter: bool = False
                   ):
          super().__init__()
          self.use_adapter = use_adapter
          self.num_labels = num_labels
-         self.backbone = AutoModel.from_pretrained(MODEL_NAME)
-
          # Adapter layer
          if self.use_adapter:
              self.adapter = TransformerEncoderLayer(
-                 d_model=self.backbone.config.hidden_size,
-                 nhead=self.backbone.config.num_attention_heads,
-                 dim_feedforward=self.backbone.config.intermediate_size,
                  activation="gelu",
                  dropout=0.1,
-                 batch_first=True  # I/O shape: (batch, seq, feature)
              )
          else:
              self.adapter = None
-
          # Classification head
          self.separator_head = Linear(self.backbone.config.hidden_size, num_labels)
          self.loss = CrossEntropyLoss()
-
-     def forward(self,
-                 input_ids: torch.Tensor,
-                 attention_mask: torch.Tensor,
                  labels: Optional[torch.Tensor] = None
                  ) -> dict[str, torch.Tensor]:
          outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
          last_hidden_state = outputs.last_hidden_state
-
          if self.use_adapter:
              last_hidden_state = self.adapter(last_hidden_state)
-         cls_embedding = last_hidden_state[:, 0]
-
-         logits = self.separator_head(cls_embedding)
-
          if labels is not None:
              loss = self.loss(logits, labels)
              return {
-                 "loss": loss,
-                 "logits": logits,
-                 "embedding": cls_embedding
              }
          return {
-             "logits": logits,
-             "embedding": cls_embedding
          }

-
- model = SupplyDemandTrader.from_pretrained(HF_MODEL_NAME)
- tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
- model.eval()

  message = "Куплю Iphone 8"  # ru: "Looking to buy an iPhone 8" (demand)
-
- with torch.inference_mode():
-     ids = tokenizer(message, return_tensors="pt")
-     logits = model.forward(ids['input_ids'], ids['attention_mask'])['logits']
-     preds = torch.argmax(logits)
-     print(id2label[int(preds)])
  ```

  ## Training
- Backbone was trained on clustered dataset for matching problem. Partially unfreezed model with classification head on custom dataset containing exports from different telegram chats.
  ```
  weighted average precision : 0.946
  weighted average f1-score : 0.945
  macro average precision : 0.943
  macro average f1-score : 0.945
- ```
-
 
  - **demand**: somebody wants to buy something or hire somebody
  - **noise**: messages unrelated to the topic.

+ ## Usage
  ``` python
+ from typing import Optional
  import torch
  from torch.nn import Linear, Module, TransformerEncoderLayer, CrossEntropyLoss
  from huggingface_hub import PyTorchModelHubMixin
+ from transformers import AutoModel, AutoTokenizer
+ import numpy as np
+
+
+ class GodDeal(Module, PyTorchModelHubMixin):
+     def __init__(self,
+                  backbone_name: str,
+                  num_labels: Optional[int] = 3,
+                  use_adapter: Optional[bool] = False,
+                  id2label: Optional[dict[int, str]] = None
                   ):
          super().__init__()
          self.use_adapter = use_adapter
          self.num_labels = num_labels
+         self.id2label = id2label
+         self.backbone = AutoModel.from_pretrained(backbone_name)
+
          # Adapter layer
          if self.use_adapter:
              self.adapter = TransformerEncoderLayer(
+                 d_model=self.backbone.config.hidden_size,
+                 nhead=self.backbone.config.num_attention_heads,
+                 dim_feedforward=self.backbone.config.intermediate_size,
                  activation="gelu",
                  dropout=0.1,
+                 batch_first=True  # I/O shape: (batch, seq, feature)
              )
          else:
              self.adapter = None
+
          # Classification head
          self.separator_head = Linear(self.backbone.config.hidden_size, num_labels)
          self.loss = CrossEntropyLoss()
+
+     def __decode_labels(self, labels: list | np.ndarray) -> list:
+         # id2label keys arrive as strings after a round-trip through config.json
+         return list(map(lambda idx: self.id2label[str(idx)], labels))
+
+     def predict(self,
+                 input_ids: torch.Tensor,
+                 attention_mask: torch.Tensor
+                 ) -> dict[str, np.ndarray]:
+         with torch.inference_mode():
+             outputs = self(input_ids, attention_mask)
+
+         logits = outputs["logits"]
+         labels = logits.softmax(-1).argmax(-1).cpu().numpy()
+
+         if self.id2label is not None:
+             labels = self.__decode_labels(labels)
+
+         return {
+             "labels": labels,
+             "embeddings": outputs["embeddings"].cpu().numpy()
+         }
+
+     def forward(self,
+                 input_ids: torch.Tensor,
+                 attention_mask: torch.Tensor,
                  labels: Optional[torch.Tensor] = None
                  ) -> dict[str, torch.Tensor]:
          outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
          last_hidden_state = outputs.last_hidden_state
+
          if self.use_adapter:
              last_hidden_state = self.adapter(last_hidden_state)
+         cls_embeddings = last_hidden_state[:, 0]
+         logits = self.separator_head(cls_embeddings)
+
          if labels is not None:
              loss = self.loss(logits, labels)
              return {
+                 "loss": loss,
+                 "logits": logits,
+                 "embeddings": cls_embeddings
              }
          return {
+             "logits": logits,
+             "embeddings": cls_embeddings
          }

+ MODEL_NAME = 'poc-embeddings/rubert-tiny-turbo-godeal'
+ model = GodDeal.from_pretrained(MODEL_NAME)
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model.eval()  # disable dropout for inference

  message = "Куплю Iphone 8"  # ru: "Looking to buy an iPhone 8" (demand)
+ ids = tokenizer(message, return_tensors="pt")
+ input_ids, attention_mask = ids['input_ids'], ids['attention_mask']
+ preds = model.predict(input_ids, attention_mask)
+ print(preds)
  ```
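
For several messages at once, `predict` works on a padded batch; a minimal sketch, assuming the standard tokenizer `padding`/`truncation` arguments and the `model`/`tokenizer` loaded above (the extra example messages are made up):

``` python
# Batch inference with the model and tokenizer loaded above.
messages = ["Куплю Iphone 8", "Продам велосипед", "Всем привет!"]  # demand / supply / noise
batch = tokenizer(messages, return_tensors="pt", padding=True, truncation=True)
preds = model.predict(batch['input_ids'], batch['attention_mask'])
for text, label in zip(messages, preds["labels"]):
    print(text, '->', label)
```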

  ## Training
+ The backbone was trained on a clustered dataset for a matching problem.
+ The partially unfrozen model with a classification head was then fine-tuned on a custom dataset of exports from various Telegram chats.
  ```
  weighted average precision : 0.946
  weighted average f1-score : 0.945
  macro average precision : 0.943
  macro average f1-score : 0.945
+ ```
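
A minimal sketch of the partial-unfreezing setup described above, assuming a BERT-style backbone that exposes `encoder.layer`; the layer split, learning rate, and example batch are assumptions, not the actual training configuration:

``` python
from transformers import AutoTokenizer
import torch

# Rebuild the model around the base encoder for fine-tuning.
BACKBONE = 'sergeyzh/rubert-tiny-turbo'
tokenizer = AutoTokenizer.from_pretrained(BACKBONE)
model = GodDeal(
    backbone_name=BACKBONE,
    num_labels=3,
    id2label={0: 'noise', 1: 'supply', 2: 'demand'}
)

# Freeze the backbone, then unfreeze only its last encoder layer;
# the classification head (and adapter, if enabled) stays trainable.
for param in model.backbone.parameters():
    param.requires_grad = False
for param in model.backbone.encoder.layer[-1].parameters():
    param.requires_grad = True

optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=3e-5
)

# One training step: forward() returns the loss when labels are passed.
batch = tokenizer(["Продам гараж", "Куплю Iphone 8"],  # ru: "Selling a garage" / "Buying an iPhone 8"
                  return_tensors="pt", padding=True)
labels = torch.tensor([1, 2])  # supply, demand
out = model(batch['input_ids'], batch['attention_mask'], labels=labels)
out['loss'].backward()
optimizer.step()
```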