Hiveurban committed
Commit ee7449c
1 Parent(s): f3c3179

commit files to HF hub

Files changed (4)
  1. .gitattributes +1 -0
  2. config.json +44 -2
  3. hive_token_classification.py +20 -0
  4. tokenizer.json +0 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 vocab.txt filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json CHANGED
@@ -1,12 +1,22 @@
 {
+  "_name_or_path": "Hiveurban/dictabert-large-parse",
   "architectures": [
     "BertForJointParsing"
   ],
+  "attention_probs_dropout_prob": 0.1,
   "auto_map": {
     "AutoModel": "dicta-il/dictabert-joint--BertForJointParsing.BertForJointParsing"
   },
-  "attention_probs_dropout_prob": 0.1,
   "classifier_dropout": null,
+  "custom_pipelines": {
+    "hive-token-classification": {
+      "impl": "hive_token_classification.HiveTokenClassification",
+      "pt": [
+        "AutoModel"
+      ],
+      "tf": []
+    }
+  },
   "do_lex": true,
   "do_morph": true,
   "do_ner": true,
@@ -83,9 +93,41 @@
   "num_hidden_layers": 24,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
+  "prefix_cfg": {
+    "possible_classes": [
+      [
+        "\u05dc\u05db\u05e9",
+        "\u05db\u05e9",
+        "\u05de\u05e9",
+        "\u05d1\u05e9",
+        "\u05dc\u05e9"
+      ],
+      [
+        "\u05de"
+      ],
+      [
+        "\u05e9"
+      ],
+      [
+        "\u05d4"
+      ],
+      [
+        "\u05d5"
+      ],
+      [
+        "\u05db"
+      ],
+      [
+        "\u05dc"
+      ],
+      [
+        "\u05d1"
+      ]
+    ]
+  },
   "syntax_head_size": 128,
   "torch_dtype": "float32",
-  "transformers_version": "4.36.2",
+  "transformers_version": "4.44.2",
   "type_vocab_size": 2,
   "use_cache": true,
   "vocab_size": 128000
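The "custom_pipelines" entry registers the task name "hive-token-classification" and maps it to the HiveTokenClassification class defined in the new file below, so the pipeline can be loaded straight from the Hub. A minimal loading sketch, assuming the repo id shown in "_name_or_path" above; trust_remote_code=True is required because both the model class (via "auto_map") and the pipeline class (via "impl") are loaded from repository code rather than from transformers itself:

from transformers import pipeline

# Load the custom pipeline registered under "custom_pipelines" in config.json.
# trust_remote_code=True lets transformers fetch and run the remote classes
# referenced by "auto_map" and "impl".
nlp = pipeline(
    "hive-token-classification",
    model="Hiveurban/dictabert-large-parse",
    trust_remote_code=True,
)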
hive_token_classification.py ADDED
@@ -0,0 +1,20 @@
+from typing import Any, Dict
+from transformers import Pipeline, AutoModel, AutoTokenizer
+from transformers.pipelines.base import GenericTensor, ModelOutput
+
+
+class HiveTokenClassification(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        forward_parameters = {}
+        if "output_style" in kwargs:
+            forward_parameters["output_style"] = kwargs["output_style"]
+        return {}, forward_parameters, {}
+
+    def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
+        return input_
+
+    def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+        return self.model.predict(input_tensors, self.tokenizer, **forward_parameters)
+
+    def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any:
+        return {"output": model_outputs, "length": len(model_outputs)}
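The pipeline is a thin wrapper: preprocess() passes the raw input through unchanged, _forward() hands it to the model's own predict() together with the tokenizer, and _sanitize_parameters() forwards a single optional output_style keyword into that call. A hedged usage sketch, reusing nlp from the example above; the accepted output_style values are defined by BertForJointParsing.predict in the dicta-il/dictabert-joint repository code, which is not part of this commit:

# The input reaches model.predict() unchanged, and postprocess() wraps the
# predictions in a dict together with a length count.
result = nlp("sentence to parse")  # hypothetical Hebrew input sentence

# output_style is the one keyword that _sanitize_parameters() routes through
# to model.predict(); "json" is an assumed value, check the upstream predict().
result = nlp("sentence to parse", output_style="json")
print(result["output"], result["length"])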
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff