init commit

Files changed (12) hide show

1_Pooling/config.json +7 -0
2_Dense/config.json +1 -0
2_Dense/pytorch_model.bin +3 -0
README.md +1084 -0
config.json +31 -0
modules.json +20 -0
pytorch_model.bin +3 -0
sentence_bert_config.json +4 -0
special_tokens_map.json +37 -0
tokenizer.json +0 -0
tokenizer_config.json +57 -0
vocab.txt +0 -0

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}

2_Dense/config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"in_features": 1024, "out_features": 1792, "bias": true, "activation_function": "torch.nn.modules.linear.Identity"}

2_Dense/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd927506d94e1574d3e5922a4554127bca9aa79cf9b885dbcf7118d8811d93ac
+size 7348796

README.md ADDED Viewed

	@@ -0,0 +1,1084 @@

+---
+tags:
+- mteb
+model-index:
+- name: xiaobu-embedding-v2
+  results:
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/AFQMC
+      name: MTEB AFQMC
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 56.918538280469875
+    - type: cos_sim_spearman
+      value: 60.95597435855258
+    - type: euclidean_pearson
+      value: 59.73821610051437
+    - type: euclidean_spearman
+      value: 60.956778530262454
+    - type: manhattan_pearson
+      value: 59.739675774225475
+    - type: manhattan_spearman
+      value: 60.95243600302903
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/ATEC
+      name: MTEB ATEC
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 56.79417977023184
+    - type: cos_sim_spearman
+      value: 58.80984726256814
+    - type: euclidean_pearson
+      value: 63.42225182281334
+    - type: euclidean_spearman
+      value: 58.80957930593542
+    - type: manhattan_pearson
+      value: 63.41128425333986
+    - type: manhattan_spearman
+      value: 58.80784321716389
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_reviews_multi
+      name: MTEB AmazonReviewsClassification (zh)
+      config: zh
+      split: test
+      revision: 1399c76144fd37290681b995c656ef9b2e06e26d
+    metrics:
+    - type: accuracy
+      value: 50.074000000000005
+    - type: f1
+      value: 47.11468271375511
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/BQ
+      name: MTEB BQ
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 73.3412976021806
+    - type: cos_sim_spearman
+      value: 75.0799965464816
+    - type: euclidean_pearson
+      value: 73.7874729086686
+    - type: euclidean_spearman
+      value: 75.07910973646369
+    - type: manhattan_pearson
+      value: 73.7716616949607
+    - type: manhattan_spearman
+      value: 75.06089549008017
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/CLSClusteringP2P
+      name: MTEB CLSClusteringP2P
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 60.4206935177474
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/CLSClusteringS2S
+      name: MTEB CLSClusteringS2S
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 49.53654617222264
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/CMedQAv1-reranking
+      name: MTEB CMedQAv1
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: map
+      value: 90.96386786978509
+    - type: mrr
+      value: 92.8897619047619
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/CMedQAv2-reranking
+      name: MTEB CMedQAv2
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: map
+      value: 90.41014127763198
+    - type: mrr
+      value: 92.45039682539682
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/CmedqaRetrieval
+      name: MTEB CmedqaRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 26.901999999999997
+    - type: map_at_10
+      value: 40.321
+    - type: map_at_100
+      value: 42.176
+    - type: map_at_1000
+      value: 42.282
+    - type: map_at_3
+      value: 35.882
+    - type: map_at_5
+      value: 38.433
+    - type: mrr_at_1
+      value: 40.910000000000004
+    - type: mrr_at_10
+      value: 49.309999999999995
+    - type: mrr_at_100
+      value: 50.239
+    - type: mrr_at_1000
+      value: 50.278
+    - type: mrr_at_3
+      value: 46.803
+    - type: mrr_at_5
+      value: 48.137
+    - type: ndcg_at_1
+      value: 40.785
+    - type: ndcg_at_10
+      value: 47.14
+    - type: ndcg_at_100
+      value: 54.156000000000006
+    - type: ndcg_at_1000
+      value: 55.913999999999994
+    - type: ndcg_at_3
+      value: 41.669
+    - type: ndcg_at_5
+      value: 43.99
+    - type: precision_at_1
+      value: 40.785
+    - type: precision_at_10
+      value: 10.493
+    - type: precision_at_100
+      value: 1.616
+    - type: precision_at_1000
+      value: 0.184
+    - type: precision_at_3
+      value: 23.723
+    - type: precision_at_5
+      value: 17.249
+    - type: recall_at_1
+      value: 26.901999999999997
+    - type: recall_at_10
+      value: 58.25
+    - type: recall_at_100
+      value: 87.10900000000001
+    - type: recall_at_1000
+      value: 98.804
+    - type: recall_at_3
+      value: 41.804
+    - type: recall_at_5
+      value: 48.884
+  - task:
+      type: PairClassification
+    dataset:
+      type: C-MTEB/CMNLI
+      name: MTEB Cmnli
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: cos_sim_accuracy
+      value: 86.42212868310283
+    - type: cos_sim_ap
+      value: 92.83788702972741
+    - type: cos_sim_f1
+      value: 87.08912233141307
+    - type: cos_sim_precision
+      value: 84.24388111888112
+    - type: cos_sim_recall
+      value: 90.13327098433481
+    - type: dot_accuracy
+      value: 86.44618159951895
+    - type: dot_ap
+      value: 92.81146275060858
+    - type: dot_f1
+      value: 87.06857911250562
+    - type: dot_precision
+      value: 83.60232408005164
+    - type: dot_recall
+      value: 90.83469721767594
+    - type: euclidean_accuracy
+      value: 86.42212868310283
+    - type: euclidean_ap
+      value: 92.83805700492603
+    - type: euclidean_f1
+      value: 87.08803611738148
+    - type: euclidean_precision
+      value: 84.18066768492254
+    - type: euclidean_recall
+      value: 90.20341360766892
+    - type: manhattan_accuracy
+      value: 86.28983764281419
+    - type: manhattan_ap
+      value: 92.82818970981005
+    - type: manhattan_f1
+      value: 87.12625521832335
+    - type: manhattan_precision
+      value: 84.19101613606628
+    - type: manhattan_recall
+      value: 90.27355623100304
+    - type: max_accuracy
+      value: 86.44618159951895
+    - type: max_ap
+      value: 92.83805700492603
+    - type: max_f1
+      value: 87.12625521832335
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/CovidRetrieval
+      name: MTEB CovidRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 79.215
+    - type: map_at_10
+      value: 86.516
+    - type: map_at_100
+      value: 86.6
+    - type: map_at_1000
+      value: 86.602
+    - type: map_at_3
+      value: 85.52
+    - type: map_at_5
+      value: 86.136
+    - type: mrr_at_1
+      value: 79.663
+    - type: mrr_at_10
+      value: 86.541
+    - type: mrr_at_100
+      value: 86.625
+    - type: mrr_at_1000
+      value: 86.627
+    - type: mrr_at_3
+      value: 85.564
+    - type: mrr_at_5
+      value: 86.15899999999999
+    - type: ndcg_at_1
+      value: 79.663
+    - type: ndcg_at_10
+      value: 89.399
+    - type: ndcg_at_100
+      value: 89.727
+    - type: ndcg_at_1000
+      value: 89.781
+    - type: ndcg_at_3
+      value: 87.402
+    - type: ndcg_at_5
+      value: 88.479
+    - type: precision_at_1
+      value: 79.663
+    - type: precision_at_10
+      value: 9.926
+    - type: precision_at_100
+      value: 1.006
+    - type: precision_at_1000
+      value: 0.101
+    - type: precision_at_3
+      value: 31.226
+    - type: precision_at_5
+      value: 19.283
+    - type: recall_at_1
+      value: 79.215
+    - type: recall_at_10
+      value: 98.209
+    - type: recall_at_100
+      value: 99.579
+    - type: recall_at_1000
+      value: 100.0
+    - type: recall_at_3
+      value: 92.703
+    - type: recall_at_5
+      value: 95.364
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/DuRetrieval
+      name: MTEB DuRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 27.391
+    - type: map_at_10
+      value: 82.82000000000001
+    - type: map_at_100
+      value: 85.5
+    - type: map_at_1000
+      value: 85.533
+    - type: map_at_3
+      value: 57.802
+    - type: map_at_5
+      value: 72.82600000000001
+    - type: mrr_at_1
+      value: 92.80000000000001
+    - type: mrr_at_10
+      value: 94.83500000000001
+    - type: mrr_at_100
+      value: 94.883
+    - type: mrr_at_1000
+      value: 94.884
+    - type: mrr_at_3
+      value: 94.542
+    - type: mrr_at_5
+      value: 94.729
+    - type: ndcg_at_1
+      value: 92.7
+    - type: ndcg_at_10
+      value: 89.435
+    - type: ndcg_at_100
+      value: 91.78699999999999
+    - type: ndcg_at_1000
+      value: 92.083
+    - type: ndcg_at_3
+      value: 88.595
+    - type: ndcg_at_5
+      value: 87.53
+    - type: precision_at_1
+      value: 92.7
+    - type: precision_at_10
+      value: 42.4
+    - type: precision_at_100
+      value: 4.823
+    - type: precision_at_1000
+      value: 0.48900000000000005
+    - type: precision_at_3
+      value: 79.133
+    - type: precision_at_5
+      value: 66.8
+    - type: recall_at_1
+      value: 27.391
+    - type: recall_at_10
+      value: 90.069
+    - type: recall_at_100
+      value: 97.875
+    - type: recall_at_1000
+      value: 99.436
+    - type: recall_at_3
+      value: 59.367999999999995
+    - type: recall_at_5
+      value: 76.537
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/EcomRetrieval
+      name: MTEB EcomRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 54.800000000000004
+    - type: map_at_10
+      value: 65.289
+    - type: map_at_100
+      value: 65.845
+    - type: map_at_1000
+      value: 65.853
+    - type: map_at_3
+      value: 62.766999999999996
+    - type: map_at_5
+      value: 64.252
+    - type: mrr_at_1
+      value: 54.800000000000004
+    - type: mrr_at_10
+      value: 65.255
+    - type: mrr_at_100
+      value: 65.81700000000001
+    - type: mrr_at_1000
+      value: 65.824
+    - type: mrr_at_3
+      value: 62.683
+    - type: mrr_at_5
+      value: 64.248
+    - type: ndcg_at_1
+      value: 54.800000000000004
+    - type: ndcg_at_10
+      value: 70.498
+    - type: ndcg_at_100
+      value: 72.82300000000001
+    - type: ndcg_at_1000
+      value: 73.053
+    - type: ndcg_at_3
+      value: 65.321
+    - type: ndcg_at_5
+      value: 67.998
+    - type: precision_at_1
+      value: 54.800000000000004
+    - type: precision_at_10
+      value: 8.690000000000001
+    - type: precision_at_100
+      value: 0.97
+    - type: precision_at_1000
+      value: 0.099
+    - type: precision_at_3
+      value: 24.233
+    - type: precision_at_5
+      value: 15.840000000000002
+    - type: recall_at_1
+      value: 54.800000000000004
+    - type: recall_at_10
+      value: 86.9
+    - type: recall_at_100
+      value: 97.0
+    - type: recall_at_1000
+      value: 98.9
+    - type: recall_at_3
+      value: 72.7
+    - type: recall_at_5
+      value: 79.2
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/IFlyTek-classification
+      name: MTEB IFlyTek
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 51.758368603308966
+    - type: f1
+      value: 40.249503783871596
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/JDReview-classification
+      name: MTEB JDReview
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 89.08067542213884
+    - type: ap
+      value: 60.31281895139249
+    - type: f1
+      value: 84.20883153932607
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/LCQMC
+      name: MTEB LCQMC
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 74.04193577551248
+    - type: cos_sim_spearman
+      value: 79.81875884845549
+    - type: euclidean_pearson
+      value: 80.02581187503708
+    - type: euclidean_spearman
+      value: 79.81877215060574
+    - type: manhattan_pearson
+      value: 80.01767830530258
+    - type: manhattan_spearman
+      value: 79.81178852172727
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/Mmarco-reranking
+      name: MTEB MMarcoReranking
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map
+      value: 39.90939429947956
+    - type: mrr
+      value: 39.71071428571429
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/MMarcoRetrieval
+      name: MTEB MMarcoRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 68.485
+    - type: map_at_10
+      value: 78.27199999999999
+    - type: map_at_100
+      value: 78.54100000000001
+    - type: map_at_1000
+      value: 78.546
+    - type: map_at_3
+      value: 76.339
+    - type: map_at_5
+      value: 77.61099999999999
+    - type: mrr_at_1
+      value: 70.80199999999999
+    - type: mrr_at_10
+      value: 78.901
+    - type: mrr_at_100
+      value: 79.12400000000001
+    - type: mrr_at_1000
+      value: 79.128
+    - type: mrr_at_3
+      value: 77.237
+    - type: mrr_at_5
+      value: 78.323
+    - type: ndcg_at_1
+      value: 70.759
+    - type: ndcg_at_10
+      value: 82.191
+    - type: ndcg_at_100
+      value: 83.295
+    - type: ndcg_at_1000
+      value: 83.434
+    - type: ndcg_at_3
+      value: 78.57600000000001
+    - type: ndcg_at_5
+      value: 80.715
+    - type: precision_at_1
+      value: 70.759
+    - type: precision_at_10
+      value: 9.951
+    - type: precision_at_100
+      value: 1.049
+    - type: precision_at_1000
+      value: 0.106
+    - type: precision_at_3
+      value: 29.660999999999998
+    - type: precision_at_5
+      value: 18.94
+    - type: recall_at_1
+      value: 68.485
+    - type: recall_at_10
+      value: 93.65
+    - type: recall_at_100
+      value: 98.434
+    - type: recall_at_1000
+      value: 99.522
+    - type: recall_at_3
+      value: 84.20100000000001
+    - type: recall_at_5
+      value: 89.261
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_massive_intent
+      name: MTEB MassiveIntentClassification (zh-CN)
+      config: zh-CN
+      split: test
+      revision: 31efe3c427b0bae9c22cbb560b8f15491cc6bed7
+    metrics:
+    - type: accuracy
+      value: 77.45460659045055
+    - type: f1
+      value: 73.84987702455533
+  - task:
+      type: Classification
+    dataset:
+      type: mteb/amazon_massive_scenario
+      name: MTEB MassiveScenarioClassification (zh-CN)
+      config: zh-CN
+      split: test
+      revision: 7d571f92784cd94a019292a1f45445077d0ef634
+    metrics:
+    - type: accuracy
+      value: 85.29926025554808
+    - type: f1
+      value: 84.40636286569843
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/MedicalRetrieval
+      name: MTEB MedicalRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 57.599999999999994
+    - type: map_at_10
+      value: 64.691
+    - type: map_at_100
+      value: 65.237
+    - type: map_at_1000
+      value: 65.27
+    - type: map_at_3
+      value: 62.733000000000004
+    - type: map_at_5
+      value: 63.968
+    - type: mrr_at_1
+      value: 58.099999999999994
+    - type: mrr_at_10
+      value: 64.952
+    - type: mrr_at_100
+      value: 65.513
+    - type: mrr_at_1000
+      value: 65.548
+    - type: mrr_at_3
+      value: 63.0
+    - type: mrr_at_5
+      value: 64.235
+    - type: ndcg_at_1
+      value: 57.599999999999994
+    - type: ndcg_at_10
+      value: 68.19
+    - type: ndcg_at_100
+      value: 70.98400000000001
+    - type: ndcg_at_1000
+      value: 71.811
+    - type: ndcg_at_3
+      value: 64.276
+    - type: ndcg_at_5
+      value: 66.47999999999999
+    - type: precision_at_1
+      value: 57.599999999999994
+    - type: precision_at_10
+      value: 7.920000000000001
+    - type: precision_at_100
+      value: 0.9259999999999999
+    - type: precision_at_1000
+      value: 0.099
+    - type: precision_at_3
+      value: 22.900000000000002
+    - type: precision_at_5
+      value: 14.799999999999999
+    - type: recall_at_1
+      value: 57.599999999999994
+    - type: recall_at_10
+      value: 79.2
+    - type: recall_at_100
+      value: 92.60000000000001
+    - type: recall_at_1000
+      value: 99.0
+    - type: recall_at_3
+      value: 68.7
+    - type: recall_at_5
+      value: 74.0
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/MultilingualSentiment-classification
+      name: MTEB MultilingualSentiment
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 79.45
+    - type: f1
+      value: 79.25610578280538
+  - task:
+      type: PairClassification
+    dataset:
+      type: C-MTEB/OCNLI
+      name: MTEB Ocnli
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: cos_sim_accuracy
+      value: 85.43584190579317
+    - type: cos_sim_ap
+      value: 90.89979725191012
+    - type: cos_sim_f1
+      value: 86.48383937316358
+    - type: cos_sim_precision
+      value: 80.6392694063927
+    - type: cos_sim_recall
+      value: 93.24181626187962
+    - type: dot_accuracy
+      value: 85.38170005414185
+    - type: dot_ap
+      value: 90.87532457866699
+    - type: dot_f1
+      value: 86.48383937316358
+    - type: dot_precision
+      value: 80.6392694063927
+    - type: dot_recall
+      value: 93.24181626187962
+    - type: euclidean_accuracy
+      value: 85.43584190579317
+    - type: euclidean_ap
+      value: 90.90126652086121
+    - type: euclidean_f1
+      value: 86.48383937316358
+    - type: euclidean_precision
+      value: 80.6392694063927
+    - type: euclidean_recall
+      value: 93.24181626187962
+    - type: manhattan_accuracy
+      value: 85.43584190579317
+    - type: manhattan_ap
+      value: 90.87896997853466
+    - type: manhattan_f1
+      value: 86.47581441263573
+    - type: manhattan_precision
+      value: 81.18628359592215
+    - type: manhattan_recall
+      value: 92.5026399155227
+    - type: max_accuracy
+      value: 85.43584190579317
+    - type: max_ap
+      value: 90.90126652086121
+    - type: max_f1
+      value: 86.48383937316358
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/OnlineShopping-classification
+      name: MTEB OnlineShopping
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 94.9
+    - type: ap
+      value: 93.1468223150745
+    - type: f1
+      value: 94.88918689508299
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/PAWSX
+      name: MTEB PAWSX
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 40.4831743182905
+    - type: cos_sim_spearman
+      value: 47.4163675550491
+    - type: euclidean_pearson
+      value: 46.456319899274924
+    - type: euclidean_spearman
+      value: 47.41567079730661
+    - type: manhattan_pearson
+      value: 46.48561639930895
+    - type: manhattan_spearman
+      value: 47.447721653461215
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/QBQTC
+      name: MTEB QBQTC
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 42.96423587663398
+    - type: cos_sim_spearman
+      value: 45.13742225167858
+    - type: euclidean_pearson
+      value: 39.275452114075435
+    - type: euclidean_spearman
+      value: 45.137763540967406
+    - type: manhattan_pearson
+      value: 39.24797626417764
+    - type: manhattan_spearman
+      value: 45.13817773119268
+  - task:
+      type: STS
+    dataset:
+      type: mteb/sts22-crosslingual-sts
+      name: MTEB STS22 (zh)
+      config: zh
+      split: test
+      revision: 6d1ba47164174a496b7fa5d3569dae26a6813b80
+    metrics:
+    - type: cos_sim_pearson
+      value: 66.26687809086202
+    - type: cos_sim_spearman
+      value: 66.9569145816897
+    - type: euclidean_pearson
+      value: 65.72390780809788
+    - type: euclidean_spearman
+      value: 66.95406938095539
+    - type: manhattan_pearson
+      value: 65.6220809000381
+    - type: manhattan_spearman
+      value: 66.88531036320953
+  - task:
+      type: STS
+    dataset:
+      type: C-MTEB/STSB
+      name: MTEB STSB
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: cos_sim_pearson
+      value: 80.30831700726195
+    - type: cos_sim_spearman
+      value: 82.05184068558792
+    - type: euclidean_pearson
+      value: 81.73198597791563
+    - type: euclidean_spearman
+      value: 82.05326103582206
+    - type: manhattan_pearson
+      value: 81.70886400949136
+    - type: manhattan_spearman
+      value: 82.03473274756037
+  - task:
+      type: Reranking
+    dataset:
+      type: C-MTEB/T2Reranking
+      name: MTEB T2Reranking
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map
+      value: 69.03398835347575
+    - type: mrr
+      value: 79.9212528613341
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/T2Retrieval
+      name: MTEB T2Retrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 27.515
+    - type: map_at_10
+      value: 77.40599999999999
+    - type: map_at_100
+      value: 81.087
+    - type: map_at_1000
+      value: 81.148
+    - type: map_at_3
+      value: 54.327000000000005
+    - type: map_at_5
+      value: 66.813
+    - type: mrr_at_1
+      value: 89.764
+    - type: mrr_at_10
+      value: 92.58
+    - type: mrr_at_100
+      value: 92.663
+    - type: mrr_at_1000
+      value: 92.666
+    - type: mrr_at_3
+      value: 92.15299999999999
+    - type: mrr_at_5
+      value: 92.431
+    - type: ndcg_at_1
+      value: 89.777
+    - type: ndcg_at_10
+      value: 85.013
+    - type: ndcg_at_100
+      value: 88.62100000000001
+    - type: ndcg_at_1000
+      value: 89.184
+    - type: ndcg_at_3
+      value: 86.19200000000001
+    - type: ndcg_at_5
+      value: 84.909
+    - type: precision_at_1
+      value: 89.777
+    - type: precision_at_10
+      value: 42.218
+    - type: precision_at_100
+      value: 5.032
+    - type: precision_at_1000
+      value: 0.517
+    - type: precision_at_3
+      value: 75.335
+    - type: precision_at_5
+      value: 63.199000000000005
+    - type: recall_at_1
+      value: 27.515
+    - type: recall_at_10
+      value: 84.258
+    - type: recall_at_100
+      value: 95.908
+    - type: recall_at_1000
+      value: 98.709
+    - type: recall_at_3
+      value: 56.189
+    - type: recall_at_5
+      value: 70.50800000000001
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/TNews-classification
+      name: MTEB TNews
+      config: default
+      split: validation
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 54.635999999999996
+    - type: f1
+      value: 52.63073912739558
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/ThuNewsClusteringP2P
+      name: MTEB ThuNewsClusteringP2P
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 78.75676284855221
+  - task:
+      type: Clustering
+    dataset:
+      type: C-MTEB/ThuNewsClusteringS2S
+      name: MTEB ThuNewsClusteringS2S
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: v_measure
+      value: 71.95583733802839
+  - task:
+      type: Retrieval
+    dataset:
+      type: C-MTEB/VideoRetrieval
+      name: MTEB VideoRetrieval
+      config: default
+      split: dev
+      revision: None
+    metrics:
+    - type: map_at_1
+      value: 64.9
+    - type: map_at_10
+      value: 75.622
+    - type: map_at_100
+      value: 75.93900000000001
+    - type: map_at_1000
+      value: 75.93900000000001
+    - type: map_at_3
+      value: 73.933
+    - type: map_at_5
+      value: 74.973
+    - type: mrr_at_1
+      value: 65.0
+    - type: mrr_at_10
+      value: 75.676
+    - type: mrr_at_100
+      value: 75.994
+    - type: mrr_at_1000
+      value: 75.994
+    - type: mrr_at_3
+      value: 74.05000000000001
+    - type: mrr_at_5
+      value: 75.03999999999999
+    - type: ndcg_at_1
+      value: 64.9
+    - type: ndcg_at_10
+      value: 80.08999999999999
+    - type: ndcg_at_100
+      value: 81.44500000000001
+    - type: ndcg_at_1000
+      value: 81.45599999999999
+    - type: ndcg_at_3
+      value: 76.688
+    - type: ndcg_at_5
+      value: 78.53
+    - type: precision_at_1
+      value: 64.9
+    - type: precision_at_10
+      value: 9.379999999999999
+    - type: precision_at_100
+      value: 0.997
+    - type: precision_at_1000
+      value: 0.1
+    - type: precision_at_3
+      value: 28.199999999999996
+    - type: precision_at_5
+      value: 17.8
+    - type: recall_at_1
+      value: 64.9
+    - type: recall_at_10
+      value: 93.8
+    - type: recall_at_100
+      value: 99.7
+    - type: recall_at_1000
+      value: 99.8
+    - type: recall_at_3
+      value: 84.6
+    - type: recall_at_5
+      value: 89.0
+  - task:
+      type: Classification
+    dataset:
+      type: C-MTEB/waimai-classification
+      name: MTEB Waimai
+      config: default
+      split: test
+      revision: None
+    metrics:
+    - type: accuracy
+      value: 89.34
+    - type: ap
+      value: 75.20638024616892
+    - type: f1
+      value: 87.88648489072128
+---
+# xiaobu-embedding-v2
+基于piccolo-embedding[1]，主要改动如下：
+  - 合成数据替换为xiaobu-embedding-v1[2]所积累数据
+  - 在circle_loss[3]视角下统一处理CMTEB的6类问题，最大优势是可充分利用原始数据集中的多个正例，其次是可一定程度上避免考虑多个不同loss之间的权重问题
+## Usage (Sentence-Transformers)
+```bash
+pip install -U sentence-transformers
+```
+相似度计算：
+```python
+from sentence_transformers import SentenceTransformer
+sentences_1 = ["样例数据-1", "样例数据-2"]
+sentences_2 = ["样例数据-3", "样例数据-4"]
+model = SentenceTransformer('lier007/xiaobu-embedding-v2')
+embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
+embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
+similarity = embeddings_1 @ embeddings_2.T
+print(similarity)
+```
+## Reference
+1. https://github.com/hjq133/piccolo-embedding
+2. https://huggingface.co/lier007/xiaobu-embedding
+3. https://arxiv.org/abs/2002.10857

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 21128
+}

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Dense",
+    "type": "sentence_transformers.models.Dense"
+  }
+]

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7a16bd9bf2013e86160282b1cb5d145792d74fd710cd99bed42e9fff1fcfb82
+size 1302216550

sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 512,
+  "do_lower_case": false
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff