lfoppiano committed
Commit f6bfed1 · verified · 1 Parent(s): 49f8f9c

Upload grobid.yaml

Files changed (1):
grobid.yaml +341 -0
grobid.yaml ADDED
@@ -0,0 +1,341 @@
# this is the configuration file for the GROBID instance

grobid:
  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
  grobidHome: "grobid-home"

  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
  temp: "tmp"

  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
  nativelibrary: "lib"

  pdf:
    pdfalto:
      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), normally you don't want to change this
      path: "pdfalto"
      # security for PDF parsing
      memoryLimitMb: 6096
      timeoutSec: 120

    # security relative to the PDF parsing result
    blocksMax: 200000
    tokensMax: 1000000
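    # note (an assumption spelled out for readability, not a new setting): documents whose
    # parsing exceeds the limits above are treated as failures rather than processed, so
    # these values act as a safety valve against pathological PDFs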

  consolidation:
    # define the bibliographical data consolidation service to be used, either "crossref" for the CrossRef REST API or
    # "glutton" for https://github.com/kermitt2/biblio-glutton
    service: "crossref"
    #service: "glutton"
    glutton:
      url: "https://cloud.science-miner.com/glutton"
      #url: "http://localhost:8080"
    crossref:
      mailto:
      # to use the crossref web API, you normally need to use it politely and indicate an email address here, e.g.
      #mailto: "[email protected]"
      token:
      # to use the Crossref Metadata Plus service (available by subscription)
      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"

  proxy:
    # proxy to be used when doing external calls to the consolidation service
    host:
    port:
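    # illustrative values only (commented out); any real host/port is deployment-specific:
    #host: "proxy.example.org"
    #port: 3128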

  # CORS configuration for the GROBID web API service
  corsAllowedOrigins: "*"
  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

  # the actual implementation for language recognition to be used
  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"

  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
  #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
  sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"

  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
  # to get the best performance and security
  concurrency: 10
  # when the pool is full, this is the maximum time (in seconds) a query will wait for a Grobid engine
  # to become available - normally never change it
  poolMaxWait: 1
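  # sizing sketch (an example, not a default): on a machine with 16 hardware threads dedicated
  # to GROBID, the "slightly above the thread count" rule above would suggest concurrency: 18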

  delft:
    # DeLFT global parameters
    # DeLFT installation path if Deep Learning architectures are used to implement one of the sequence labeling models,
    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used)
    install: "../delft"
    pythonVirtualEnv:
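    # hypothetical example (commented out): point this at the virtual environment where DeLFT
    # and its Python dependencies are installed; the path below is illustrative only
    #pythonVirtualEnv: "../delft/env"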

  wapiti:
    # Wapiti global parameters
    # number of threads for training the wapiti models (0 to use all available processors)
    nbThreads: 0

  models:
    # we configure here how each sequence labeling model should be implemented
    # for feature-engineered CRF, use "wapiti"; possible training parameters are window, epsilon and nbMaxIterations
    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
    # parameters then depend on the selected DL architecture
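
    # a sketch (commented out) of what switching one model to a transformer-based DeLFT
    # implementation could look like, reusing only architecture and transformer names that
    # already appear as commented alternatives further down in this file:
    #- name: "citation"
    #  engine: "delft"
    #  delft:
    #    architecture: "BERT_CRF"
    #    transformer: "michiyasunaga/LinkBERT-base"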

    - name: "segmentation"
      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "fulltext"
      # at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0001
        window: 20
        nbMaxIterations: 1500

    - name: "header"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.000001
        window: 30
        nbMaxIterations: 1500
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        #transformer: "allenai/scibert_scivocab_cased"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          #max_sequence_length: 510
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          #max_sequence_length: 510
          #batch_size: 6
          max_sequence_length: 3000
          batch_size: 9
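      # note (an observation, not a setting): the commented 510 values match the ~512-token
      # input limit of BERT-style transformers, while 3000 suits the RNN-based architectures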

    - name: "reference-segmenter"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
          max_sequence_length: 3000
          batch_size: 2
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "name-header"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "name-citation"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "date"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "figure"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "table"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "affiliation-address"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "citation"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 3000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 500
          batch_size: 30
        training:
          # parameters used for training
          max_sequence_length: 500
          batch_size: 50

    - name: "patent-citation"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 1000
          batch_size: 40

    - name: "funding-acknowledgement"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 500
          batch_size: 40

    - name: "copyright"
      # at this time, we only have a DeLFT implementation;
      # use "wapiti" if the deep learning library JNI is not available, the model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"

    - name: "license"
      # at this time, to be active it must be DeLFT, no other implementation is available;
      # use "wapiti" if the deep learning library JNI is not available, the model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"
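      # note (an observation, not a new setting): unlike the sequence labeling models above,
      # "copyright" and "license" are text classifiers, hence the classification architectures
      # ("gru", "bert") instead of the *_CRF ones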

  # for **service only**: how to load the models,
  # false -> models are loaded when needed, which avoids keeping unused models in memory (only in the case of CRF) but
  # significantly slows down the service on the first call
  # true -> all the models are loaded into memory at server startup (default), which slows the start of the service,
  # and unused models will take some more memory (only in the case of CRF), but the server is immediately warm and ready
  modelPreload: true

server:
  type: custom
  applicationConnectors:
    - type: http
      port: 8070
  adminConnectors:
    - type: http
      port: 8071
  registerDefaultExceptionMappers: false
  # change the following to have all http requests logged
  requestLog:
    appenders: []
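  # quick liveness check once the service is up, against the default port above
  # (the /api/isalive endpoint is part of the GROBID service API):
  #   curl http://localhost:8070/api/isalive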

# these logging settings apply to the Grobid service usage mode
logging:
  level: INFO
  loggers:
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: WARN
      timeZone: UTC
      # uncomment to have the logs in json format
      #layout:
      #  type: json
    - type: file
      currentLogFilename: logs/grobid-service.log
      threshold: INFO
      archive: true
      archivedLogFilenamePattern: logs/grobid-service-%d.log
      archivedFileCount: 5
      timeZone: UTC
      # uncomment to have the logs in json format
      #layout:
      #  type: json