Add new SentenceTransformer model
- 1_Pooling/config.json +10 -0
- README.md +810 -0
- config.json +26 -0
- config_sentence_transformers.json +10 -0
- model.safetensors +3 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +57 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
{
  "word_embedding_dimension": 768,
  "pooling_mode_cls_token": false,
  "pooling_mode_mean_tokens": true,
  "pooling_mode_max_tokens": false,
  "pooling_mode_mean_sqrt_len_tokens": false,
  "pooling_mode_weightedmean_tokens": false,
  "pooling_mode_lasttoken": false,
  "include_prompt": true
}
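This pooling configuration enables plain mean pooling over the 768-dimensional token embeddings (CLS, max, and last-token pooling are all disabled). As a rough sketch of what that setting selects, not part of the committed files and using hypothetical tensor names, mean pooling averages the token embeddings while ignoring padding:

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over real tokens only.

    token_embeddings: (batch, seq_len, 768) output of the Transformer module
    attention_mask:   (batch, seq_len) with 1 for real tokens, 0 for padding
    """
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)    # sum over unmasked positions
    counts = mask.sum(dim=1).clamp(min=1e-9)         # number of real tokens per sequence
    return summed / counts                           # (batch, 768) sentence embedding
```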
README.md
ADDED
@@ -0,0 +1,810 @@
---
tags:
- sentence-transformers
- sentence-similarity
- feature-extraction
- generated_from_trainer
- dataset_size:5000
- loss:TripletLoss
base_model: lufercho/my-finetuned-bert-mlm
widget:
- source_sentence: "Auto-WEKA: Combined Selection and Hyperparameter Optimization of Classification Algorithms"
  sentences:
  - ' It has been a long time, since data mining technologies have made their ways to the field of data management. Classification is one of the most important data mining tasks for label prediction, categorization of objects into groups, advertisement and data management. In this paper, we focus on the standard classification problem which is predicting unknown labels in Euclidean space. Most efforts in Machine Learning communities are devoted to methods that use probabilistic algorithms which are heavy on Calculus and Linear Algebra. Most of these techniques have scalability issues for big data, and are hardly parallelizable if they are to maintain their high accuracies in their standard form. Sampling is a new direction for improving scalability, using many small parallel classifiers. In this paper, rather than conventional sampling methods, we focus on a discrete classification algorithm with O(n) expected running time. Our approach performs a similar task as sampling methods. However, we use column-wise sampling of data, rather than the row-wise sampling used in the literature. In either case, our algorithm is completely deterministic. Our algorithm proposes a way of combining 2D convex hulls in order to achieve high classification accuracy as well as scalability in the same time. First, we thoroughly describe and prove our O(n) algorithm for finding the convex hull of a point set in 2D. Then, we show with experiments our classifier model built based on this idea is very competitive compared with existing sophisticated classification algorithms included in commercial statistical applications such as MATLAB.'
  - ' Many different machine learning algorithms exist; taking into account each algorithm''s hyperparameters, there is a staggeringly large number of possible alternatives overall. We consider the problem of simultaneously selecting a learning algorithm and setting its hyperparameters, going beyond previous work that addresses these issues in isolation. We show that this problem can be addressed by a fully automated approach, leveraging recent innovations in Bayesian optimization. Specifically, we consider a wide range of feature selection techniques (combining 3 search and 8 evaluator methods) and all classification approaches implemented in WEKA, spanning 2 ensemble methods, 10 meta-methods, 27 base classifiers, and hyperparameter settings for each classifier. On each of 21 popular datasets from the UCI repository, the KDD Cup 09, variants of the MNIST dataset and CIFAR-10, we show classification performance often much better than using standard selection/hyperparameter optimization methods. We hope that our approach will help non-expert users to more effectively identify machine learning algorithms and hyperparameter settings appropriate to their applications, and hence to achieve improved performance.'
  - ' Nonnegative matrix factorization (NMF) has become a ubiquitous tool for data analysis. An important variant is the sparse NMF problem which arises when we explicitly require the learnt features to be sparse. A natural measure of sparsity is the L$_0$ norm, however its optimization is NP-hard. Mixed norms, such as L$_1$/L$_2$ measure, have been shown to model sparsity robustly, based on intuitive attributes that such measures need to satisfy. This is in contrast to computationally cheaper alternatives such as the plain L$_1$ norm. However, present algorithms designed for optimizing the mixed norm L$_1$/L$_2$ are slow and other formulations for sparse NMF have been proposed such as those based on L$_1$ and L$_0$ norms. Our proposed algorithm allows us to solve the mixed norm sparsity constraints while not sacrificing computation time. We present experimental evidence on real-world datasets that shows our new algorithm performs an order of magnitude faster compared to the current state-of-the-art solvers optimizing the mixed norm and is suitable for large-scale datasets.'
- source_sentence: "Effect of Different Distance Measures on the Performance of K-Means Algorithm: An Experimental Study in Matlab"
  sentences:
  - ' The kernel method is a potential approach to analyzing structured data such as sequences, trees, and graphs; however, unordered trees have not been investigated extensively. Kimura et al. (2011) proposed a kernel function for unordered trees on the basis of their subpaths, which are vertical substructures of trees responsible for hierarchical information in them. Their kernel exhibits practically good performance in terms of accuracy and speed; however, linear-time computation is not guaranteed theoretically, unlike the case of the other unordered tree kernel proposed by Vishwanathan and Smola (2003). In this paper, we propose a theoretically guaranteed linear-time kernel computation algorithm that is practically fast, and we present an efficient prediction algorithm whose running time depends only on the size of the input tree. Experimental results show that the proposed algorithms are quite efficient in practice.'
  - ' We express the classic ARMA time-series model as a directed graphical model. In doing so, we find that the deterministic relationships in the model make it effectively impossible to use the EM algorithm for learning model parameters. To remedy this problem, we replace the deterministic relationships with Gaussian distributions having a small variance, yielding the stochastic ARMA (ARMA) model. This modification allows us to use the EM algorithm to learn parmeters and to forecast,even in situations where some data is missing. This modification, in conjunction with the graphicalmodel approach, also allows us to include cross predictors in situations where there are multiple times series and/or additional nontemporal covariates. More surprising,experiments suggest that the move to stochastic ARMA yields improved accuracy through better smoothing. We demonstrate improvements afforded by cross prediction and better smoothing on real data.'
  - ' K-means algorithm is a very popular clustering algorithm which is famous for its simplicity. Distance measure plays a very important rule on the performance of this algorithm. We have different distance measure techniques available. But choosing a proper technique for distance calculation is totally dependent on the type of the data that we are going to cluster. In this paper an experimental study is done in Matlab to cluster the iris and wine data sets with different distance measures and thereby observing the variation of the performances shown.'
- source_sentence: A Dynamic Near-Optimal Algorithm for Online Linear Programming
  sentences:
  - ' Social media channels such as Twitter have emerged as popular platforms for crowds to respond to public events such as speeches, sports and debates. While this promises tremendous opportunities to understand and make sense of the reception of an event from the social media, the promises come entwined with significant technical challenges. In particular, given an event and an associated large scale collection of tweets, we need approaches to effectively align tweets and the parts of the event they refer to. This in turn raises questions about how to segment the event into smaller yet meaningful parts, and how to figure out whether a tweet is a general one about the entire event or specific one aimed at a particular segment of the event. In this work, we present ET-LDA, an effective method for aligning an event and its tweets through joint statistical modeling of topical influences from the events and their associated tweets. The model enables the automatic segmentation of the events and the characterization of tweets into two categories: (1) episodic tweets that respond specifically to the content in the segments of the events, and (2) steady tweets that respond generally about the events. We present an efficient inference method for this model, and a comprehensive evaluation of its effectiveness over existing methods. In particular, through a user study, we demonstrate that users find the topics, the segments, the alignment, and the episodic tweets discovered by ET-LDA to be of higher quality and more interesting as compared to the state-of-the-art, with improvements in the range of 18-41%.'
  - ' A natural optimization model that formulates many online resource allocation and revenue management problems is the online linear program (LP) in which the constraint matrix is revealed column by column along with the corresponding objective coefficient. In such a model, a decision variable has to be set each time a column is revealed without observing the future inputs and the goal is to maximize the overall objective function. In this paper, we provide a near-optimal algorithm for this general class of online problems under the assumption of random order of arrival and some mild conditions on the size of the LP right-hand-side input. Specifically, our learning-based algorithm works by dynamically updating a threshold price vector at geometric time intervals, where the dual prices learned from the revealed columns in the previous period are used to determine the sequential decisions in the current period. Due to the feature of dynamic learning, the competitiveness of our algorithm improves over the past study of the same problem. We also present a worst-case example showing that the performance of our algorithm is near-optimal.'
  - ' One of the biggest challenges in Multimedia information retrieval and understanding is to bridge the semantic gap by properly modeling concept semantics in context. The presence of out of vocabulary (OOV) concepts exacerbates this difficulty. To address the semantic gap issues, we formulate a problem on learning contextualized semantics from descriptive terms and propose a novel Siamese architecture to model the contextualized semantics from descriptive terms. By means of pattern aggregation and probabilistic topic models, our Siamese architecture captures contextualized semantics from the co-occurring descriptive terms via unsupervised learning, which leads to a concept embedding space of the terms in context. Furthermore, the co-occurring OOV concepts can be easily represented in the learnt concept embedding space. The main properties of the concept embedding space are demonstrated via visualization. Using various settings in semantic priming, we have carried out a thorough evaluation by comparing our approach to a number of state-of-the-art methods on six annotation corpora in different domains, i.e., MagTag5K, CAL500 and Million Song Dataset in the music domain as well as Corel5K, LabelMe and SUNDatabase in the image domain. Experimental results on semantic priming suggest that our approach outperforms those state-of-the-art methods considerably in various aspects.'
- source_sentence: Parallel Online Learning
  sentences:
  - ' In our recent paper, we showed that in exponential family, contrastive divergence (CD) with fixed learning rate will give asymptotically consistent estimates \cite{wu2016convergence}. In this paper, we establish consistency and convergence rate of CD with annealed learning rate $\eta_t$. Specifically, suppose CD-$m$ generates the sequence of parameters $\{\theta_t\}_{t \ge 0}$ using an i.i.d. data sample $\mathbf{X}_1^n \sim p_{\theta^*}$ of size $n$, then $\delta_n(\mathbf{X}_1^n) = \limsup_{t \to \infty} \Vert \sum_{s=t_0}^t \eta_s \theta_s / \sum_{s=t_0}^t \eta_s - \theta^* \Vert$ converges in probability to 0 at a rate of $1/\sqrt[3]{n}$. The number ($m$) of MCMC transitions in CD only affects the coefficient factor of convergence rate. Our proof is not a simple extension of the one in \cite{wu2016convergence}, which depends critically on the fact that $\{\theta_t\}_{t \ge 0}$ is a homogeneous Markov chain conditional on the observed sample $\mathbf{X}_1^n$. Under annealed learning rate, the homogeneous Markov property is not available and we have to develop an alternative approach based on super-martingales. Experiment results of CD on a fully-visible $2\times 2$ Boltzmann Machine are provided to demonstrate our theoretical results.'
  - ' This report outlines an approach to learning generative models from data. We express models as probabilistic programs, which allows us to capture abstract patterns within the examples. By choosing our language for programs to be an extension of the algebraic data type of the examples, we can begin with a program that generates all and only the examples. We then introduce greater abstraction, and hence generalization, incrementally to the extent that it improves the posterior probability of the examples given the program. Motivated by previous approaches to model merging and program induction, we search for such explanatory abstractions using program transformations. We consider two types of transformation: Abstraction merges common subexpressions within a program into new functions (a form of anti-unification). Deargumentation simplifies functions by reducing the number of arguments. We demonstrate that this approach finds key patterns in the domain of nested lists, including parameterized sub-functions and stochastic recursion.'
  - ' In this work we study parallelization of online learning, a core primitive in machine learning. In a parallel environment all known approaches for parallel online learning lead to delayed updates, where the model is updated using out-of-date information. In the worst case, or when examples are temporally correlated, delay can have a very adverse effect on the learning algorithm. Here, we analyze and present preliminary empirical results on a set of learning architectures based on a feature sharding approach that present various tradeoffs between delay, degree of parallelism, representation power and empirical performance.'
- source_sentence: Maximin affinity learning of image segmentation
  sentences:
  - ' Most existing approaches to hashing apply a single form of hash function, and an optimization process which is typically deeply coupled to this specific form. This tight coupling restricts the flexibility of the method to respond to the data, and can result in complex optimization problems that are difficult to solve. Here we propose a flexible yet simple framework that is able to accommodate different types of loss functions and hash functions. This framework allows a number of existing approaches to hashing to be placed in context, and simplifies the development of new problem-specific hashing methods. Our framework decomposes hashing learning problem into two steps: hash bit learning and hash function learning based on the learned bits. The first step can typically be formulated as binary quadratic problems, and the second step can be accomplished by training standard binary classifiers. Both problems have been extensively studied in the literature. Our extensive experiments demonstrate that the proposed framework is effective, flexible and outperforms the state-of-the-art.'
  - ' Changes in the UK electricity market mean that domestic users will be required to modify their usage behaviour in order that supplies can be maintained. Clustering allows usage profiles collected at the household level to be clustered into groups and assigned a stereotypical profile which can be used to target marketing campaigns. Fuzzy C Means clustering extends this by allowing each household to be a member of many groups and hence provides the opportunity to make personalised offers to the household dependent on their degree of membership of each group. In addition, feedback can be provided on how user''s changing behaviour is moving them towards more "green" or cost effective stereotypical usage.'
  - ' Images can be segmented by first using a classifier to predict an affinity graph that reflects the degree to which image pixels must be grouped together and then partitioning the graph to yield a segmentation. Machine learning has been applied to the affinity classifier to produce affinity graphs that are good in the sense of minimizing edge misclassification rates. However, this error measure is only indirectly related to the quality of segmentations produced by ultimately partitioning the affinity graph. We present the first machine learning algorithm for training a classifier to produce affinity graphs that are good in the sense of producing segmentations that directly minimize the Rand index, a well known segmentation performance measure. The Rand index measures segmentation performance by quantifying the classification of the connectivity of image pixel pairs after segmentation. By using the simple graph partitioning algorithm of finding the connected components of the thresholded affinity graph, we are able to train an affinity classifier to directly minimize the Rand index of segmentations resulting from the graph partitioning. Our learning algorithm corresponds to the learning of maximin affinities between image pixel pairs, which are predictive of the pixel-pair connectivity.'
pipeline_tag: sentence-similarity
library_name: sentence-transformers
---

# SentenceTransformer based on lufercho/my-finetuned-bert-mlm

This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [lufercho/my-finetuned-bert-mlm](https://huggingface.co/lufercho/my-finetuned-bert-mlm). It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

## Model Details

### Model Description
- **Model Type:** Sentence Transformer
- **Base model:** [lufercho/my-finetuned-bert-mlm](https://huggingface.co/lufercho/my-finetuned-bert-mlm) <!-- at revision 8cf44893fd607477d06b067f1788b495abac1b2c -->
- **Maximum Sequence Length:** 512 tokens
- **Output Dimensionality:** 768 dimensions
- **Similarity Function:** Cosine Similarity
<!-- - **Training Dataset:** Unknown -->
<!-- - **Language:** Unknown -->
<!-- - **License:** Unknown -->

### Model Sources

- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)

### Full Model Architecture

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
```

## Usage

### Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

```bash
pip install -U sentence-transformers
```

Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("lufercho/my-finetuned-sentence-bert")
# Run inference
sentences = [
    'Maximin affinity learning of image segmentation',
    ' Images can be segmented by first using a classifier to predict an affinity\ngraph that reflects the degree to which image pixels must be grouped together\nand then partitioning the graph to yield a segmentation. Machine learning has\nbeen applied to the affinity classifier to produce affinity graphs that are\ngood in the sense of minimizing edge misclassification rates. However, this\nerror measure is only indirectly related to the quality of segmentations\nproduced by ultimately partitioning the affinity graph. We present the first\nmachine learning algorithm for training a classifier to produce affinity graphs\nthat are good in the sense of producing segmentations that directly minimize\nthe Rand index, a well known segmentation performance measure. The Rand index\nmeasures segmentation performance by quantifying the classification of the\nconnectivity of image pixel pairs after segmentation. By using the simple graph\npartitioning algorithm of finding the connected components of the thresholded\naffinity graph, we are able to train an affinity classifier to directly\nminimize the Rand index of segmentations resulting from the graph partitioning.\nOur learning algorithm corresponds to the learning of maximin affinities\nbetween image pixel pairs, which are predictive of the pixel-pair connectivity.\n',
    ' Changes in the UK electricity market mean that domestic users will be\nrequired to modify their usage behaviour in order that supplies can be\nmaintained. Clustering allows usage profiles collected at the household level\nto be clustered into groups and assigned a stereotypical profile which can be\nused to target marketing campaigns. Fuzzy C Means clustering extends this by\nallowing each household to be a member of many groups and hence provides the\nopportunity to make personalised offers to the household dependent on their\ndegree of membership of each group. In addition, feedback can be provided on\nhow user\'s changing behaviour is moving them towards more "green" or cost\neffective stereotypical usage.\n',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```

<!--
### Direct Usage (Transformers)

<details><summary>Click to see the direct usage in Transformers</summary>

</details>
-->

<!--
### Downstream Usage (Sentence Transformers)

You can finetune this model on your own dataset.

<details><summary>Click to expand</summary>

</details>
-->

<!--
### Out-of-Scope Use

*List how the model may foreseeably be misused and address what users ought not to do with the model.*
-->

<!--
## Bias, Risks and Limitations

*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
-->

<!--
### Recommendations

*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
-->

## Training Details

### Training Dataset

#### Unnamed Dataset

* Size: 5,000 training samples
* Columns: <code>sentence_0</code>, <code>sentence_1</code>, and <code>sentence_2</code>
* Approximate statistics based on the first 1000 samples:
  | | sentence_0 | sentence_1 | sentence_2 |
  |:--------|:-----------|:-----------|:-----------|
  | type | string | string | string |
  | details | <ul><li>min: 4 tokens</li><li>mean: 13.41 tokens</li><li>max: 38 tokens</li></ul> | <ul><li>min: 37 tokens</li><li>mean: 201.32 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 24 tokens</li><li>mean: 204.09 tokens</li><li>max: 512 tokens</li></ul> |
* Samples:
  | sentence_0 | sentence_1 | sentence_2 |
  |:-----------|:-----------|:-----------|
  | <code>Clustering with Transitive Distance and K-Means Duality</code> | <code> Recent spectral clustering methods are a propular and powerful technique for data clustering. These methods need to solve the eigenproblem whose computational complexity is $O(n^3)$, where $n$ is the number of data samples. In this paper, a non-eigenproblem based clustering method is proposed to deal with the clustering problem. Its performance is comparable to the spectral clustering algorithms but it is more efficient with computational complexity $O(n^2)$. We show that with a transitive distance and an observed property, called K-means duality, our algorithm can be used to handle data sets with complex cluster shapes, multi-scale clusters, and noise. Moreover, no parameters except the number of clusters need to be set in our algorithm.</code> | <code> We show that the log-likelihood of several probabilistic graphical models is Lipschitz continuous with respect to the lp-norm of the parameters. We discuss several implications of Lipschitz parametrization. We present an upper bound of the Kullback-Leibler divergence that allows understanding methods that penalize the lp-norm of differences of parameters as the minimization of that upper bound. The expected log-likelihood is lower bounded by the negative lp-norm, which allows understanding the generalization ability of probabilistic models. The exponential of the negative lp-norm is involved in the lower bound of the Bayes error rate, which shows that it is reasonable to use parameters as features in algorithms that rely on metric spaces (e.g. classification, dimensionality reduction, clustering). Our results do not rely on specific algorithms for learning the structure or parameters. We show preliminary results for activity recognition and temporal segmentation.</code> |
  | <code>Clustering Dynamic Web Usage Data</code> | <code> Most classification methods are based on the assumption that data conforms to a stationary distribution. The machine learning domain currently suffers from a lack of classification techniques that are able to detect the occurrence of a change in the underlying data distribution. Ignoring possible changes in the underlying concept, also known as concept drift, may degrade the performance of the classification model. Often these changes make the model inconsistent and regular updatings become necessary. Taking the temporal dimension into account during the analysis of Web usage data is a necessity, since the way a site is visited may indeed evolve due to modifications in the structure and content of the site, or even due to changes in the behavior of certain user groups. One solution to this problem, proposed in this article, is to update models using summaries obtained by means of an evolutionary approach based on an intelligent clustering approach. We carry out various clustering str...</code> | <code> Exponential family extensions of principal component analysis (EPCA) have received a considerable amount of attention in recent years, demonstrating the growing need for basic modeling tools that do not assume the squared loss or Gaussian distribution. We extend the EPCA model toolbox by presenting the first exponential family multi-view learning methods of the partial least squares and canonical correlation analysis, based on a unified representation of EPCA as matrix factorization of the natural parameters of exponential family. The models are based on a new family of priors that are generally usable for all such factorizations. We also introduce new inference strategies, and demonstrate how the methods outperform earlier ones when the Gaussianity assumption does not hold.</code> |
  | <code>Trading USDCHF filtered by Gold dynamics via HMM coupling</code> | <code> We devise a USDCHF trading strategy using the dynamics of gold as a filter. Our strategy involves modelling both USDCHF and gold using a coupled hidden Markov model (CHMM). The observations will be indicators, RSI and CCI, which will be used as triggers for our trading signals. Upon decoding the model in each iteration, we can get the next most probable state and the next most probable observation. Hopefully by taking advantage of intermarket analysis and the Markov property implicit in the model, trading with these most probable values will produce profitable results.</code> | <code> Most existing machine learning classifiers are highly vulnerable to adversarial examples. An adversarial example is a sample of input data which has been modified very slightly in a way that is intended to cause a machine learning classifier to misclassify it. In many cases, these modifications can be so subtle that a human observer does not even notice the modification at all, yet the classifier still makes a mistake. Adversarial examples pose security concerns because they could be used to perform an attack on machine learning systems, even if the adversary has no access to the underlying model. Up to now, all previous work have assumed a threat model in which the adversary can feed data directly into the machine learning classifier. This is not always the case for systems operating in the physical world, for example those which are using signals from cameras and other sensors as an input. This paper shows that even in such physical world scenarios, machine learning systems are vul...</code> |
* Loss: [<code>TripletLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#tripletloss) with these parameters:
  ```json
  {
      "distance_metric": "TripletDistanceMetric.EUCLIDEAN",
      "triplet_margin": 5
  }
  ```

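As a hedged illustration (not part of the uploaded files), the loss reported above corresponds to constructing `TripletLoss` with Euclidean distance and a margin of 5 in the Sentence Transformers API; loading the base checkpoint directly is an assumption about how the training model was created:

```python
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.losses import TripletDistanceMetric

# Assumption: start from the base MLM checkpoint named in this card
model = SentenceTransformer("lufercho/my-finetuned-bert-mlm")

# Matches the parameters reported above: Euclidean distance, margin 5
loss = losses.TripletLoss(
    model=model,
    distance_metric=TripletDistanceMetric.EUCLIDEAN,
    triplet_margin=5,
)
```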
### Training Hyperparameters
#### Non-Default Hyperparameters

- `per_device_train_batch_size`: 16
- `per_device_eval_batch_size`: 16
- `num_train_epochs`: 2
- `multi_dataset_batch_sampler`: round_robin

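A minimal sketch of how a triplet dataset with the columns listed earlier (`sentence_0`, `sentence_1`, `sentence_2`) could be passed to the trainer with these non-default hyperparameters; the dataset rows are placeholders, and `model`/`loss` are the objects from the previous sketch, so this is not the actual training script:

```python
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)

# Placeholder triplet rows using the column names from this model card
train_dataset = Dataset.from_dict({
    "sentence_0": ["Parallel Online Learning"],          # anchor: a paper title
    "sentence_1": ["Abstract of the matching paper."],   # positive: its abstract (placeholder text)
    "sentence_2": ["Abstract of an unrelated paper."],   # negative: another abstract (placeholder text)
})

args = SentenceTransformerTrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
)

trainer = SentenceTransformerTrainer(
    model=model,               # SentenceTransformer from the previous sketch
    args=args,
    train_dataset=train_dataset,
    loss=loss,                 # TripletLoss from the previous sketch
)
trainer.train()
```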
#### All Hyperparameters
<details><summary>Click to expand</summary>

- `overwrite_output_dir`: False
- `do_predict`: False
- `eval_strategy`: no
- `prediction_loss_only`: True
- `per_device_train_batch_size`: 16
- `per_device_eval_batch_size`: 16
- `per_gpu_train_batch_size`: None
- `per_gpu_eval_batch_size`: None
- `gradient_accumulation_steps`: 1
- `eval_accumulation_steps`: None
- `torch_empty_cache_steps`: None
- `learning_rate`: 5e-05
- `weight_decay`: 0.0
- `adam_beta1`: 0.9
- `adam_beta2`: 0.999
- `adam_epsilon`: 1e-08
- `max_grad_norm`: 1
- `num_train_epochs`: 2
- `max_steps`: -1
- `lr_scheduler_type`: linear
- `lr_scheduler_kwargs`: {}
- `warmup_ratio`: 0.0
- `warmup_steps`: 0
- `log_level`: passive
- `log_level_replica`: warning
- `log_on_each_node`: True
- `logging_nan_inf_filter`: True
- `save_safetensors`: True
- `save_on_each_node`: False
- `save_only_model`: False
- `restore_callback_states_from_checkpoint`: False
- `no_cuda`: False
- `use_cpu`: False
- `use_mps_device`: False
- `seed`: 42
- `data_seed`: None
- `jit_mode_eval`: False
- `use_ipex`: False
- `bf16`: False
- `fp16`: False
- `fp16_opt_level`: O1
- `half_precision_backend`: auto
- `bf16_full_eval`: False
- `fp16_full_eval`: False
- `tf32`: None
- `local_rank`: 0
- `ddp_backend`: None
- `tpu_num_cores`: None
- `tpu_metrics_debug`: False
- `debug`: []
- `dataloader_drop_last`: False
- `dataloader_num_workers`: 0
- `dataloader_prefetch_factor`: None
- `past_index`: -1
- `disable_tqdm`: False
- `remove_unused_columns`: True
- `label_names`: None
- `load_best_model_at_end`: False
- `ignore_data_skip`: False
- `fsdp`: []
- `fsdp_min_num_params`: 0
- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- `fsdp_transformer_layer_cls_to_wrap`: None
- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- `deepspeed`: None
- `label_smoothing_factor`: 0.0
- `optim`: adamw_torch
- `optim_args`: None
- `adafactor`: False
- `group_by_length`: False
- `length_column_name`: length
- `ddp_find_unused_parameters`: None
- `ddp_bucket_cap_mb`: None
- `ddp_broadcast_buffers`: False
- `dataloader_pin_memory`: True
- `dataloader_persistent_workers`: False
- `skip_memory_metrics`: True
- `use_legacy_prediction_loop`: False
- `push_to_hub`: False
- `resume_from_checkpoint`: None
- `hub_model_id`: None
- `hub_strategy`: every_save
- `hub_private_repo`: False
- `hub_always_push`: False
- `gradient_checkpointing`: False
- `gradient_checkpointing_kwargs`: None
- `include_inputs_for_metrics`: False
- `include_for_metrics`: []
- `eval_do_concat_batches`: True
- `fp16_backend`: auto
- `push_to_hub_model_id`: None
- `push_to_hub_organization`: None
- `mp_parameters`: 
- `auto_find_batch_size`: False
- `full_determinism`: False
- `torchdynamo`: None
- `ray_scope`: last
- `ddp_timeout`: 1800
- `torch_compile`: False
- `torch_compile_backend`: None
- `torch_compile_mode`: None
- `dispatch_batches`: None
- `split_batches`: None
- `include_tokens_per_second`: False
- `include_num_input_tokens_seen`: False
- `neftune_noise_alpha`: None
- `optim_target_modules`: None
- `batch_eval_metrics`: False
- `eval_on_start`: False
- `use_liger_kernel`: False
- `eval_use_gather_object`: False
- `average_tokens_across_devices`: False
- `prompts`: None
- `batch_sampler`: batch_sampler
- `multi_dataset_batch_sampler`: round_robin

</details>

### Training Logs
| Epoch | Step | Training Loss |
|:------:|:----:|:-------------:|
| 1.5974 | 500 | 0.8647 |


### Framework Versions
- Python: 3.10.12
- Sentence Transformers: 3.3.1
- Transformers: 4.46.2
- PyTorch: 2.5.1+cu121
- Accelerate: 1.1.1
- Datasets: 3.1.0
- Tokenizers: 0.20.3

## Citation

### BibTeX

#### Sentence Transformers
```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

#### TripletLoss
```bibtex
@misc{hermans2017defense,
    title={In Defense of the Triplet Loss for Person Re-Identification},
    author={Alexander Hermans and Lucas Beyer and Bastian Leibe},
    year={2017},
    eprint={1703.07737},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
}
```

<!--
## Glossary

*Clearly define terms in order to be accessible across audiences.*
-->

<!--
## Model Card Authors

*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
-->

<!--
## Model Card Contact

*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
-->
config.json
ADDED
@@ -0,0 +1,26 @@
{
  "_name_or_path": "lufercho/my-finetuned-bert-mlm",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.46.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
{
  "__version__": {
    "sentence_transformers": "3.3.1",
    "transformers": "4.46.2",
    "pytorch": "2.5.1+cu121"
  },
  "prompts": {},
  "default_prompt_name": null,
  "similarity_fn_name": "cosine"
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c8ed93ee899a84ef830578c2353cfd642dd98d68ee3e46bfab941b2643664f38
size 437951328
modules.json
ADDED
@@ -0,0 +1,14 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  }
]
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "max_seq_length": 512,
  "do_lower_case": false
}
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff