upskyy committed on
Commit
ff39d87
·
verified ·
1 Parent(s): beb15ff

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "word_embedding_dimension": 1024,
3
+ "pooling_mode_cls_token": false,
4
+ "pooling_mode_mean_tokens": true,
5
+ "pooling_mode_max_tokens": false,
6
+ "pooling_mode_mean_sqrt_len_tokens": false,
7
+ "pooling_mode_weightedmean_tokens": false,
8
+ "pooling_mode_lasttoken": false,
9
+ "include_prompt": true
10
+ }
README.md CHANGED
@@ -1,3 +1,356 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - multilingual
4
+ - af
5
+ - am
6
+ - ar
7
+ - as
8
+ - az
9
+ - be
10
+ - bg
11
+ - bn
12
+ - br
13
+ - bs
14
+ - ca
15
+ - cs
16
+ - cy
17
+ - da
18
+ - de
19
+ - el
20
+ - en
21
+ - eo
22
+ - es
23
+ - et
24
+ - eu
25
+ - fa
26
+ - fi
27
+ - fr
28
+ - fy
29
+ - ga
30
+ - gd
31
+ - gl
32
+ - gu
33
+ - ha
34
+ - he
35
+ - hi
36
+ - hr
37
+ - hu
38
+ - hy
39
+ - id
40
+ - is
41
+ - it
42
+ - ja
43
+ - jv
44
+ - ka
45
+ - kk
46
+ - km
47
+ - kn
48
+ - ko
49
+ - ku
50
+ - ky
51
+ - la
52
+ - lo
53
+ - lt
54
+ - lv
55
+ - mg
56
+ - mk
57
+ - ml
58
+ - mn
59
+ - mr
60
+ - ms
61
+ - my
62
+ - ne
63
+ - nl
64
+ - 'no'
65
+ - om
66
+ - or
67
+ - pa
68
+ - pl
69
+ - ps
70
+ - pt
71
+ - ro
72
+ - ru
73
+ - sa
74
+ - sd
75
+ - si
76
+ - sk
77
+ - sl
78
+ - so
79
+ - sq
80
+ - sr
81
+ - su
82
+ - sv
83
+ - sw
84
+ - ta
85
+ - te
86
+ - th
87
+ - tl
88
+ - tr
89
+ - ug
90
+ - uk
91
+ - ur
92
+ - uz
93
+ - vi
94
+ - xh
95
+ - yi
96
+ - zh
97
+ license: mit
98
+ library_name: sentence-transformers
99
+ tags:
100
+ - korean
101
+ - sentence-transformers
102
+ - transformers
103
+ - multilingual
104
+ - sentence-transformers
105
+ - sentence-similarity
106
+ - feature-extraction
107
+ base_model: intfloat/multilingual-e5-large
108
+ datasets: []
109
+ metrics:
110
+ - pearson_cosine
111
+ - spearman_cosine
112
+ - pearson_manhattan
113
+ - spearman_manhattan
114
+ - pearson_euclidean
115
+ - spearman_euclidean
116
+ - pearson_dot
117
+ - spearman_dot
118
+ - pearson_max
119
+ - spearman_max
120
+ widget:
121
+ - source_sentence: 이집트 군대가 형제애를 단속하다
122
+ sentences:
123
+ - 이집트의 군대가 무슬림 형제애를 단속하다
124
+ - 아르헨티나의 기예르모 코리아와 네덜란드의 마틴 버커크의 또 다른 준결승전도 매력적이다.
125
+ - 그것이 사실일 수도 있다고 생각하는 것은 재미있다.
126
+ - source_sentence: 오, 그리고 다시 결혼은 근본적인 인권이라고 주장한다.
127
+ sentences:
128
+ - 특히 결혼은 근본적인 인권이라고 말한 후에.
129
+ - 해변에 있는 흑인과 그의 개...
130
+ - 이란은 핵 프로그램이 평화적인 목적을 위한 것이라고 주장한다
131
+ - source_sentence: 조지 샤힌은 안데르센 컨설팅 사업부에서 일했다.
132
+ sentences:
133
+ - 112건의 퇴거를 예방하거나 미연에 방지하여 151,619달러의 피난처 비용과 그들이 실향민이 되었을 때 가족들이 겪는 혼란을 덜어주었다.
134
+ - 안데르센 컨설팅은 여전히 번창하는 사업이다.
135
+ - 이것은 내가 영국의 아서 안데르센 사업부의 파트너인 짐 와디아를 아서 안데르센 경영진이 선택한 것보다 래리 웨인바흐를 안데르센 월드와이드의
136
+ 경영 파트너로 승계하기 위해 안데르센 컨설팅 사업부(현재의 엑센츄어라고 알려져 있음)의 전 관리 파트너인 조지 샤힌에 대한 지지를 표명했을
137
+ 때 가장 명백했다.
138
+ - source_sentence: 그 표는 주요 경제 정보를 보여준다.
139
+ sentences:
140
+ - 표는 모집단 밀도를 나타냅니다.
141
+ - 아이들이 야외에서 놀고 있다.
142
+ - 표 3은 배출량 감소가 개인 소비와 국내총생산(GDP)의 다른 구성 요소에 미치는 영향을 비교하기 위해 2010년의 주요 거시경제 데이터를
143
+ 요약한 것이다.
144
+ - source_sentence: 안경을 쓴 나이든 남자가 바닥에 누워 갓난아기와 장난감 소방차를 가지고 놀고 있다.
145
+ sentences:
146
+ - 긴 검은 머리와 초록색 탱크톱을 가진 남자가 손가락을 보고 있다.
147
+ - 안경을 쓴 남자는 원숭이이고 아기 원숭이와 놀고 있다.
148
+ - 안경을 쓴 남자가 바닥에 누워 놀고 있다.
149
+ pipeline_tag: sentence-similarity
150
+ model-index:
151
+ - name: upskyy/e5-large-korean
152
+ results:
153
+ - task:
154
+ type: semantic-similarity
155
+ name: Semantic Similarity
156
+ dataset:
157
+ name: sts dev
158
+ type: sts-dev
159
+ metrics:
160
+ - type: pearson_cosine
161
+ value: 0.8710078333363093
162
+ name: Pearson Cosine
163
+ - type: spearman_cosine
164
+ value: 0.8698788475177747
165
+ name: Spearman Cosine
166
+ - type: pearson_manhattan
167
+ value: 0.8598807479137434
168
+ name: Pearson Manhattan
169
+ - type: spearman_manhattan
170
+ value: 0.8682945370063891
171
+ name: Spearman Manhattan
172
+ - type: pearson_euclidean
173
+ value: 0.8596482760879562
174
+ name: Pearson Euclidean
175
+ - type: spearman_euclidean
176
+ value: 0.8679655812613122
177
+ name: Spearman Euclidean
178
+ - type: pearson_dot
179
+ value: 0.8684600033706916
180
+ name: Pearson Dot
181
+ - type: spearman_dot
182
+ value: 0.8668368265035578
183
+ name: Spearman Dot
184
+ - type: pearson_max
185
+ value: 0.8710078333363093
186
+ name: Pearson Max
187
+ - type: spearman_max
188
+ value: 0.8698788475177747
189
+ name: Spearman Max
190
+ ---
191
+
192
+ # upskyy/e5-large-korean
193
+
194
+ This model is a fine-tuned version of [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large), trained on the KorSTS and KorNLI datasets. It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
195
+
196
+ ## Model Details
197
+
198
+ ### Model Description
199
+ - **Model Type:** Sentence Transformer
200
+ - **Base model:** [intfloat/multilingual-e5-large](https://huggingface.co/intfloat/multilingual-e5-large) <!-- at revision ab10c1a7f42e74530fe7ae5be82e6d4f11a719eb -->
201
+ - **Maximum Sequence Length:** 512 tokens
202
+ - **Output Dimensionality:** 1024 dimensions
203
+ - **Similarity Function:** Cosine Similarity
204
+ <!-- - **Training Dataset:** Unknown -->
205
+ <!-- - **Language:** Unknown -->
206
+ <!-- - **License:** Unknown -->
207
+
208
+ ### Full Model Architecture
209
+
210
+ ```
211
+ SentenceTransformer(
212
+ (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
213
+ (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
214
+ )
215
+ ```
216
+
217
+ ## Usage
218
+
219
+ ### Usage (Sentence-Transformers)
220
+
221
+
222
+ First install the Sentence Transformers library:
223
+
224
+ ```bash
225
+ pip install -U sentence-transformers
226
+ ```
227
+
228
+ Then you can load this model and run inference.
229
+ ```python
230
+ from sentence_transformers import SentenceTransformer
231
+
232
+ # Download from the 🤗 Hub
233
+ model = SentenceTransformer("upskyy/e5-large-korean")
234
+
235
+ # Run inference
236
+ sentences = [
237
+ '아이를 가진 엄마가 해변을 걷는다.',
238
+ '두 사람이 해변을 걷는다.',
239
+ '한 남자가 해변에서 개를 산책시킨다.',
240
+ ]
241
+ embeddings = model.encode(sentences)
242
+ print(embeddings.shape)
243
+ # [3, 1024]
244
+
245
+ # Get the similarity scores for the embeddings
246
+ similarities = model.similarity(embeddings, embeddings)
247
+ print(similarities.shape)
248
+ # [3, 3]
249
+ ```
250
+
251
+ ### Usage (HuggingFace Transformers)
252
+
253
+ Without sentence-transformers, you can use the model like this:
254
+ First, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
255
+
256
+ ```python
257
+ from transformers import AutoTokenizer, AutoModel
258
+ import torch
259
+
260
+
261
+ # Mean Pooling - Take attention mask into account for correct averaging
262
+ def mean_pooling(model_output, attention_mask):
263
+ token_embeddings = model_output[0] # First element of model_output contains all token embeddings
264
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
265
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
266
+
267
+
268
+ # Sentences we want sentence embeddings for
269
+ sentences = ["안녕하세요?", "한국어 문장 임베딩을 위한 버트 모델입니다."]
270
+
271
+ # Load model from HuggingFace Hub
272
+ tokenizer = AutoTokenizer.from_pretrained("upskyy/e5-large-korean")
273
+ model = AutoModel.from_pretrained("upskyy/e5-large-korean")
274
+
275
+ # Tokenize sentences
276
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
277
+
278
+ # Compute token embeddings
279
+ with torch.no_grad():
280
+ model_output = model(**encoded_input)
281
+
282
+ # Perform pooling. In this case, mean pooling.
283
+ sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
284
+
285
+ print("Sentence embeddings:")
286
+ print(sentence_embeddings)
287
+ ```
288
+
289
+
290
+ ## Evaluation
291
+
292
+ ### Metrics
293
+
294
+ #### Semantic Similarity
295
+ * Dataset: `sts-dev`
296
+ * Evaluated with [<code>EmbeddingSimilarityEvaluator</code>](https://sbert.net/docs/package_reference/sentence_transformer/evaluation.html#sentence_transformers.evaluation.EmbeddingSimilarityEvaluator)
297
+
298
+ | Metric | Value |
299
+ | :----------------- | :--------- |
300
+ | pearson_cosine | 0.871 |
301
+ | spearman_cosine | 0.8699 |
302
+ | pearson_manhattan | 0.8599 |
303
+ | spearman_manhattan | 0.8683 |
304
+ | pearson_euclidean | 0.8596 |
305
+ | spearman_euclidean | 0.868 |
306
+ | pearson_dot | 0.8685 |
307
+ | spearman_dot | 0.8668 |
308
+ | **pearson_max** | **0.871** |
309
+ | **spearman_max** | **0.8699** |
310
+
311
+ <!--
312
+ ## Bias, Risks and Limitations
313
+
314
+ *What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
315
+ -->
316
+
317
+ <!--
318
+ ### Recommendations
319
+
320
+ *What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
321
+ -->
322
+
323
+
324
+ ### Framework Versions
325
+ - Python: 3.10.13
326
+ - Sentence Transformers: 3.0.1
327
+ - Transformers: 4.42.4
328
+ - PyTorch: 2.3.0+cu121
329
+ - Accelerate: 0.30.1
330
+ - Datasets: 2.16.1
331
+ - Tokenizers: 0.19.1
332
+
333
+ ## Citation
334
+
335
+ ### BibTeX
336
+
337
+ ```bibtex
338
+ @article{wang2024multilingual,
339
+ title={Multilingual E5 Text Embeddings: A Technical Report},
340
+ author={Wang, Liang and Yang, Nan and Huang, Xiaolong and Yang, Linjun and Majumder, Rangan and Wei, Furu},
341
+ journal={arXiv preprint arXiv:2402.05672},
342
+ year={2024}
343
+ }
344
+ ```
345
+
346
+ ```bibtex
347
+ @inproceedings{reimers-2019-sentence-bert,
348
+ title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
349
+ author = "Reimers, Nils and Gurevych, Iryna",
350
+ booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
351
+ month = "11",
352
+ year = "2019",
353
+ publisher = "Association for Computational Linguistics",
354
+ url = "https://arxiv.org/abs/1908.10084",
355
+ }
356
+ ```
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "XLMRobertaModel"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "xlm-roberta",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 24,
19
+ "output_past": true,
20
+ "pad_token_id": 1,
21
+ "position_embedding_type": "absolute",
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.42.4",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 250002
27
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d85f779ad4259caa8459b18582d9682b45c622b5684bbb61c51863b30cff7184
3
+ size 2239607176
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
3
+ size 17082987
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "mask_token": "<mask>",
49
+ "model_max_length": 512,
50
+ "pad_token": "<pad>",
51
+ "sep_token": "</s>",
52
+ "tokenizer_class": "XLMRobertaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }