mboth committed
Commit c4ce2cb · 1 Parent(s): c878d76

Delete app

Files changed (4)
  1. app/database_build.py +0 -552
  2. app/main.py +0 -90
  3. app/metadata.pickle +0 -3
  4. app/predict_se.py +0 -264
app/database_build.py DELETED
@@ -1,552 +0,0 @@
- from sentence_transformers import SentenceTransformer, util
- import json
- import time
- import pandas as pd
- import numpy as np
- import pickle
-
- import chromadb
- from chromadb.config import Settings
- from chromadb.utils import embedding_functions
- from chromadb.db.clickhouse import NoDatapointsException
-
-
- def prepare_cd(conceptDescriptions):
-     df_cd = pd.DataFrame(
-         columns=["SemanticId", "Definition", "PreferredName", "Datatype", "Unit"]
-     )
-     # Read all concept descriptions into the empty DataFrame
-     for cd in conceptDescriptions:
-         semantic_id = cd["identification"]["id"]
-         data_spec = cd["embeddedDataSpecifications"][0]["dataSpecificationContent"]
-         preferred_name = data_spec["preferredName"]
-         short_name = data_spec["shortName"]
-         # Default so that name is defined even if no English variant exists
-         name = "NaN"
-         if len(preferred_name) > 1:
-             for name_variant in preferred_name:
-                 if name_variant["language"] in ("EN", "en", "EN?"):
-                     name = name_variant["text"]
-         elif len(preferred_name) == 1:
-             name = preferred_name[0]["text"]
-         elif len(preferred_name) == 0:
-             if len(short_name) > 0:
-                 name = short_name[0]["text"]
-
-         definition = data_spec["definition"]
-         # Default so that chosen_def is defined even if no English variant exists
-         chosen_def = "NaN"
-         if len(definition) > 1:
-             for definition_variant in definition:
-                 if definition_variant["language"] in ("EN", "en", "EN?"):
-                     chosen_def = definition_variant["text"]
-         elif len(definition) == 1:
-             chosen_def = definition[0]["text"]
-
-         if data_spec["dataType"] == "":
-             datatype = "NaN"
-         else:
-             datatype = data_spec["dataType"]
-
-         if data_spec["unit"] == "":
-             unit = "NaN"
-         else:
-             unit = data_spec["unit"]
-
-         new_entry = pd.DataFrame(
-             {
-                 "SemanticId": semantic_id,
-                 "Definition": chosen_def,
-                 "PreferredName": name,
-                 "Datatype": datatype,
-                 "Unit": unit,
-             },
-             index=[0],
-         )
-         df_cd = pd.concat([df_cd, new_entry], ignore_index=True)
-     return df_cd
-
-
- def get_values(submodel_element):
-     # Read out the submodel element values
-     se_type = submodel_element["modelType"]["name"]
-     se_semantic_id = submodel_element["semanticId"]["keys"][0]["value"]
-     se_semantic_id_local = submodel_element["semanticId"]["keys"][0]["local"]
-     se_id_short = submodel_element["idShort"]
-     value = []
-     se_value = submodel_element["value"]
-     value.append(se_value)
-
-     return se_type, se_semantic_id, se_semantic_id_local, se_id_short, value
-
-
- def get_concept_description(semantic_id, df_cd):
-     cd_content = df_cd.loc[df_cd["SemanticId"] == semantic_id]
-
-     # Fall back to a NaN row if no concept description matches the semantic id
-     if cd_content.empty:
-         cd_content = pd.DataFrame(
-             {
-                 "SemanticId": semantic_id,
-                 "Definition": "NaN",
-                 "PreferredName": "NaN",
-                 "Datatype": "NaN",
-                 "Unit": "NaN",
-             },
-             index=[0],
-         )
-
-     cd_content = cd_content.iloc[0]
-
-     return cd_content
-
-
- def get_values_sec(
-     df_cd,
-     content,
-     df,
-     aas_id,
-     aas_name,
-     submodel_id,
-     submodel_name,
-     submodel_semantic_id,
- ):
-     collection_values = content[0]["value"]
-     for element in collection_values:
-         content = []
-         content.append(element)
-
-         se_type, se_semantic_id, se_semantic_id_local, se_id_short, value = get_values(
-             element
-         )
-         if se_type == "SubmodelElementCollection":
-             if se_semantic_id_local:
-                 cd_content = get_concept_description(se_semantic_id, df_cd)
-                 definition = cd_content["Definition"]
-                 preferred_name = cd_content["PreferredName"]
-                 datatype = cd_content["Datatype"]
-                 unit = cd_content["Unit"]
-             else:
-                 definition = "NaN"
-                 preferred_name = "NaN"
-                 datatype = "NaN"
-                 unit = "NaN"
-
-             new_row = pd.DataFrame(
-                 {
-                     "AASId": aas_id,
-                     "AASIdShort": aas_name,
-                     "SubmodelId": submodel_id,
-                     "SubmodelName": submodel_name,
-                     "SubmodelSemanticId": submodel_semantic_id,
-                     "SEContent": content,
-                     "SESemanticId": se_semantic_id,
-                     "SEModelType": se_type,
-                     "SEIdShort": se_id_short,
-                     "SEValue": value,
-                     "Definition": definition,
-                     "PreferredName": preferred_name,
-                     "Datatype": datatype,
-                     "Unit": unit,
-                 }
-             )
-             df = pd.concat([df, new_row], ignore_index=True)
-
-             content = []
-             content.append(element)
-             # Recursive call: repeat until the lowest collection level is reached,
-             # so nested SubmodelElementCollections are read out to the end
-             df = get_values_sec(
-                 df_cd,
-                 content,
-                 df,
-                 aas_id,
-                 aas_name,
-                 submodel_id,
-                 submodel_name,
-                 submodel_semantic_id,
-             )
-
-         else:
-             if se_semantic_id_local:
-                 cd_content = get_concept_description(se_semantic_id, df_cd)
-                 definition = cd_content["Definition"]
-                 preferred_name = cd_content["PreferredName"]
-                 datatype = cd_content["Datatype"]
-                 unit = cd_content["Unit"]
-             else:
-                 definition = "NaN"
-                 preferred_name = "NaN"
-                 datatype = "NaN"
-                 unit = "NaN"
-
-             new_row = pd.DataFrame(
-                 {
-                     "AASId": aas_id,
-                     "AASIdShort": aas_name,
-                     "SubmodelId": submodel_id,
-                     "SubmodelName": submodel_name,
-                     "SubmodelSemanticId": submodel_semantic_id,
-                     "SEContent": content,
-                     "SESemanticId": se_semantic_id,
-                     "SEModelType": se_type,
-                     "SEIdShort": se_id_short,
-                     "SEValue": value,
-                     "Definition": definition,
-                     "PreferredName": preferred_name,
-                     "Datatype": datatype,
-                     "Unit": unit,
-                 }
-             )
-             df = pd.concat([df, new_row], ignore_index=True)
-
-     return df
-
-
- def set_up_metadata(metalabel, df):
-     datatype_mapping = {
-         "boolean": "BOOLEAN",
-         "string": "STRING",
-         "string_translatable": "STRING",
-         "translatable_string": "STRING",
-         "non_translatable_string": "STRING",
-         "date": "DATE",
-         "data_time": "DATE",
-         "uri": "URI",
-         "int": "INT",
-         "int_measure": "INT",
-         "int_currency": "INT",
-         "integer": "INT",
-         "real": "REAL",
-         "real_measure": "REAL",
-         "real_currency": "REAL",
-         "enum_code": "ENUM_CODE",
-         "enum_int": "ENUM_CODE",
-         "ENUM_REAL": "ENUM_CODE",
-         "ENUM_RATIONAL": "ENUM_CODE",
-         "ENUM_BOOLEAN": "ENUM_CODE",
-         "ENUM_STRING": "ENUM_CODE",
-         "enum_reference": "ENUM_CODE",
-         "enum_instance": "ENUM_CODE",
-         "set(b1,b2)": "SET",
-         "constrained_set(b1,b2,cmn,cmx)": "SET",
-         "set [0,?]": "SET",
-         "set [1,?]": "SET",
-         "set [1, ?]": "SET",
-         "nan": "NaN",
-         "media_type": "LARGE_OBJECT_TYPE",
-     }
-
-     unit_mapping = {
-         "nan": "NaN",
-         "hertz": "FREQUENCY",
-         "hz": "FREQUENCY",
-         "pa": "PRESSURE",
-         "pascal": "PRESSURE",
-         "n/m²": "PRESSURE",
-         "bar": "PRESSURE",
-         "%": "SCALARS_PERC",
-         "w": "POWER",
-         "watt": "POWER",
-         "kw": "POWER",
-         "kg/m³": "CHEMISTRY",
-         "m²/s": "CHEMISTRY",
-         "pa*s": "CHEMISTRY",
-         "v": "ELECTRICAL",
-         "volt": "ELECTRICAL",
-         "db": "ACOUSTICS",
-         "db(a)": "ACOUSTICS",
-         "k": "TEMPERATURE",
-         "°c": "TEMPERATURE",
-         "n": "MECHANICS",
-         "newton": "MECHANICS",
-         "kg/s": "FLOW",
-         "kg/h": "FLOW",
-         "m³/s": "FLOW",
-         "m³/h": "FLOW",
-         "l/s": "FLOW",
-         "l/h": "FLOW",
-         "µm": "LENGTH",
-         "mm": "LENGTH",
-         "cm": "LENGTH",
-         "dm": "LENGTH",
-         "m": "LENGTH",
-         "meter": "LENGTH",
-         "m/s": "SPEED",
-         "km/h": "SPEED",
-         "s^(-1)": "FREQUENCY",
-         "1/s": "FREQUENCY",
-         "s": "TIME",
-         "h": "TIME",
-         "min": "TIME",
-         "d": "TIME",
-         "hours": "TIME",
-         "a": "ELECTRICAL",
-         "m³": "VOLUME",
-         "m²": "AREA",
-         "rpm": "FLOW",
-         "nm": "MECHANICS",
-         "m/m": "MECHANICS",
-         "m³/m²s": "MECHANICS",
-         "w(m²*K)": "HEAT_TRANSFER",
-         "kwh": "ELECTRICAL",
-         "kg/(s*m²)": "FLOW",
-         "kg": "MASS",
-         "w/(m*k)": "HEAT_TRANSFER",
-         "m²*k/w": "HEAT_TRANSFER",
-         "j/s": "POWER",
-     }
-
-     dataset = df
-     dataset["unit_lowercase"] = dataset["Unit"]
-     dataset["unit_lowercase"] = dataset["unit_lowercase"].str.lower()
-     dataset["unit_categ"] = dataset["unit_lowercase"].map(unit_mapping)
-
-     dataset["datatype_lowercase"] = dataset["Datatype"]
-     dataset["datatype_lowercase"] = dataset["datatype_lowercase"].str.lower()
-     dataset["datatype_categ"] = dataset["datatype_lowercase"].map(datatype_mapping)
-
-     dataset = dataset.fillna("NaN")
-     dataset["index"] = dataset.index
-
-     # uni_datatype = dataset['datatype_categ'].unique()
-     # uni_unit = dataset['unit_categ'].unique()
-     unique_labels_set = set()
-
-     dataset["Metalabel"] = ""
-     for i in range(0, len(dataset["Metalabel"])):
-         concat = (str(dataset["unit_categ"][i]), str(dataset["datatype_categ"][i]))
-         keys = [k for k, v in metalabel.items() if v == concat]
-         # .loc avoids pandas' chained-assignment warning
-         dataset.loc[i, "Metalabel"] = keys[0]
-         unique_labels_set.add(keys[0])
-     unique_label = list(unique_labels_set)
-     print(unique_label)
-
-     return dataset
-
-
- def encode(aas_df, model):
-     # Use Sentence-BERT to create the embeddings
-     aas_df["PreferredName"] = "Name: " + aas_df["PreferredName"].astype(str)
-     aas_df["Definition"] = "Description: " + aas_df["Definition"].astype(str) + "; "
-     corpus_names = aas_df.loc[:, "PreferredName"]
-     corpus_definitions = aas_df.loc[:, "Definition"]
-     embeddings_definitions = model.encode(corpus_definitions, show_progress_bar=True)
-     embeddings_names = model.encode(corpus_names, show_progress_bar=True)
-     concat_name_def_emb = np.concatenate(
-         (embeddings_definitions, embeddings_names), axis=1
-     )
-     # aas_df['EmbeddingDefinition'] = embeddings_definitions.tolist()
-     # aas_df['EmbeddingName'] = embeddings_names.tolist()
-     aas_df["EmbeddingNameDefinition"] = concat_name_def_emb.tolist()
-     return aas_df
-
-
- def convert_to_list(aas_df):
-     # The database partly requires lists as input
-     aas_index = aas_df.index.tolist()
-     aas_index_str = [str(r) for r in aas_index]
-     se_content = aas_df["SEContent"].tolist()
-     se_embedding_name_definition = aas_df["EmbeddingNameDefinition"].tolist()
-
-     aas_df_dropped = aas_df.drop(
-         ["EmbeddingNameDefinition", "SEContent", "SEValue"], axis=1
-     )
-
-     metadata = aas_df_dropped.to_dict("records")
-
-     return metadata, aas_index_str, se_content, se_embedding_name_definition
-
-
- def set_up_chroma(
-     metadata, aas_index_str, se_content, se_embedding_name_definition, aas_name, client
- ):
-     # Upper-case letters are not allowed in database collection names
-     aas_name = aas_name.lower()
-     print(aas_name)
-     # client = chromadb.Client(Settings(
-     #     chroma_db_impl="duckdb+parquet",
-     #     persist_directory="./drive/My Drive/Colab/NLP/SemantischeInteroperabilität/Deployment"  # Optional, defaults to .chromadb/ in the current directory
-     # ))
-     emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
-         model_name="gart-labor/eng-distilBERT-se-eclass"
-     )
-     collection = client.get_or_create_collection(
-         name=aas_name, embedding_function=emb_fn
-     )
-
-     aas_content_string = []
-     # Convert to JSON so the content can be written to the database
-     for element in se_content:
-         content = json.dumps(element)
-         aas_content_string.append(content)
-
-     items = collection.count()  # returns the number of items in the collection
-     print(collection)
-     print("Database created, number of items:")
-     print(items)
-     if items == 0:
-         # Add the SE contents, the embeddings, and further metadata to the database collection
-         collection.add(
-             documents=aas_content_string,
-             embeddings=se_embedding_name_definition,
-             metadatas=metadata,
-             ids=aas_index_str,
-         )
-         items = collection.count()  # returns the number of items in the collection
-         print("------------")
-         print("Database populated, number of items:")
-         print(items)
-     else:
-         print("-----------")
-         print("AAS already present")
-
-     return collection
-
-
- def read_aas(aas, submodels, assets, conceptDescriptions, submodels_ids, metalabel):
-     df = pd.DataFrame(
-         columns=[
-             "AASId",
-             "AASIdShort",
-             "SubmodelId",
-             "SubmodelName",
-             "SubmodelSemanticId",
-             "SEContent",
-             "SESemanticId",
-             "SEModelType",
-             "SEIdShort",
-             "SEValue",
-             "Definition",
-             "PreferredName",
-             "Datatype",
-             "Unit",
-         ]
-     )
-
-     aas_id = aas[0]["identification"]["id"]
-     aas_name = aas[0]["idShort"]
-     # Prepare all concept descriptions as a pandas DataFrame so they can be inspected more easily later
-     df_cd = prepare_cd(conceptDescriptions)
-     # Read out the submodels
-     for submodel in submodels:
-         submodel_name = submodel["idShort"]
-         submodel_id = submodel["identification"]["id"]
-         # Necessary because the number of submodels in the AAS and in the environment do not always match
-         if submodel_id in submodels_ids:
-             semantic_id_existing = submodel["semanticId"]["keys"]
-             if not semantic_id_existing:
-                 submodel_semantic_id = "Not defined"
-             else:
-                 submodel_semantic_id = semantic_id_existing[0]["value"]
-             submodel_elements = submodel["submodelElements"]
-             # Read out the submodel elements
-             for submodel_element in submodel_elements:
-                 content = []
-                 content.append(submodel_element)
-
-                 (
-                     se_type,
-                     se_semantic_id,
-                     se_semantic_id_local,
-                     se_id_short,
-                     value,
-                 ) = get_values(submodel_element)
-
-                 # If the concept description is local, read it out
-                 if se_semantic_id_local:
-                     cd_content = get_concept_description(se_semantic_id, df_cd)
-                     definition = cd_content["Definition"]
-                     preferred_name = cd_content["PreferredName"]
-                     datatype = cd_content["Datatype"]
-                     unit = cd_content["Unit"]
-                 else:
-                     definition = "NaN"
-                     preferred_name = "NaN"
-                     datatype = "NaN"
-                     unit = "NaN"
-
-                 new_row = pd.DataFrame(
-                     {
-                         "AASId": aas_id,
-                         "AASIdShort": aas_name,
-                         "SubmodelId": submodel_id,
-                         "SubmodelName": submodel_name,
-                         "SubmodelSemanticId": submodel_semantic_id,
-                         "SEContent": content,
-                         "SESemanticId": se_semantic_id,
-                         "SEModelType": se_type,
-                         "SEIdShort": se_id_short,
-                         "SEValue": value,
-                         "Definition": definition,
-                         "PreferredName": preferred_name,
-                         "Datatype": datatype,
-                         "Unit": unit,
-                     }
-                 )
-                 df = pd.concat([df, new_row], ignore_index=True)
-
-                 # If it is a SubmodelElementCollection, read out its nested values as well
-                 if se_type == "SubmodelElementCollection":
-                     df = get_values_sec(
-                         df_cd,
-                         content,
-                         df,
-                         aas_id,
-                         aas_name,
-                         submodel_id,
-                         submodel_name,
-                         submodel_semantic_id,
-                     )
-                 else:
-                     continue
-
-     df = set_up_metadata(metalabel, df)
-
-     return df, aas_name
-
-
- def index_corpus(data, model, metalabel, client_chroma):
-     # Entry point
-
-     aas = data["assetAdministrationShells"]
-     aas_submodels = aas[0]["submodels"]
-     submodels_ids = []
-     for submodel in aas_submodels:
-         submodels_ids.append(submodel["keys"][0]["value"])
-     submodels = data["submodels"]
-     conceptDescriptions = data["conceptDescriptions"]
-     assets = data["assets"]
-
-     aas_df, aas_name = read_aas(
-         aas, submodels, assets, conceptDescriptions, submodels_ids, metalabel
-     )
-     # aas_df_embeddings = encode(aas_df, model)
-     aas_df = encode(aas_df, model)
-     metadata, aas_index_str, se_content, se_embedding_name_definition = convert_to_list(
-         aas_df
-     )
-     collection = set_up_chroma(
-         metadata,
-         aas_index_str,
-         se_content,
-         se_embedding_name_definition,
-         aas_name,
-         client_chroma,
-     )
-
-     return collection
-
-
- # if __name__ == '__main__':
- #     create_database = index_corpus(aas='festo_switch.json')
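
For reference, the commented-out entry point above hints at how this module was run standalone. A minimal usage sketch (assuming an AAS environment exported as festo_switch.json, the metalabel mapping from app/metadata.pickle, and an in-process Chroma client; the file names and client setup are illustrative, not taken from the deleted code):

    import json
    import pickle

    import chromadb
    from sentence_transformers import SentenceTransformer

    from database_build import index_corpus

    # Load the AAS environment exported as JSON
    with open("festo_switch.json", "r") as f:
        data = json.load(f)

    # Metalabel lookup: integer label -> (unit category, datatype category)
    with open("app/metadata.pickle", "rb") as handle:
        metalabel = pickle.load(handle)

    model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
    client = chromadb.Client()  # in-memory client; the app itself used a REST client

    collection = index_corpus(data, model, metalabel, client)
    print(collection.count(), "submodel elements indexed")
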
app/main.py DELETED
@@ -1,90 +0,0 @@
- from sentence_transformers import SentenceTransformer, util
-
- # from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
- import time
- import os
- import json
- import pandas as pd
- import numpy as np
- import category_encoders as ce
- import string
- import pickle
- import tqdm.autonotebook
- from fastapi import FastAPI, Request, UploadFile, File
- from joblib import dump, load
- from pydantic import BaseModel
- import sys
- from database_build import index_corpus
- from predict_se import ask_database
- from typing import Any, Dict, AnyStr, List, Union
- import chromadb
- from chromadb.config import Settings
-
- app = FastAPI(title="Interface Semantic Matching")
-
- JSONObject = Dict[AnyStr, Any]
- JSONArray = List[Any]
- JSONStructure = Union[JSONArray, JSONObject]
-
-
- class submodelElement(BaseModel):
-     datatype: str = "NaN"
-     definition: str
-     name: str
-     semantic_id: str
-     unit: str = "NaN"
-     return_matches: int = 3
-
- @app.on_event("startup")
- def load_hf_model():
-     global model
-     # Old model
-     # model = SentenceTransformer('mboth/distil-eng-quora-sentence')
-
-     # Fine-tuned model
-     model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
-
-     # global model_translate
-     # model_translate = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-     # global tokenizer_translate
-     # tokenizer_translate = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
-
-     with open("app/metadata.pickle", "rb") as handle:
-         global metalabel
-         metalabel = pickle.load(handle)
-     global client_chroma
-     client_chroma = chromadb.Client(
-         Settings(
-             chroma_api_impl="rest",
-             # chroma_server_host must be updated after every restart of the AWS instance
-             chroma_server_host="3.67.80.82",
-             chroma_server_http_port=8000,
-         )
-     )
-
-
- @app.post("/PostAssetAdministrationShellEmbeddings")
- async def index_aas(aas: UploadFile = File(...)):
-     data = json.load(aas.file)
-     print(type(data))
-     # aas = new_file
-     # aas, submodels, conceptDescriptions, assets, aas_df, collection, aas_name = index_corpus(data, model, metalabel, client_chroma)
-     collection = index_corpus(data, model, metalabel, client_chroma)
-     ready = "AAS ready"
-     return ready
-
-
- @app.post("/GetAllSubmodelElementsBySemanticIdAndSemanticInformation")
- def predict(name: str, definition: str, semantic_id: str, unit: str, datatype: str, return_matches: int):
-     collections = client_chroma.list_collections()
-     query = {
-         "Name": name,
-         "Definition": definition,
-         "Unit": unit,
-         "Datatype": datatype,
-         "SemanticId": semantic_id,
-         "ReturnMatches": return_matches,
-     }
-     results = ask_database(query, metalabel, model, collections, client_chroma)
-
-     return results
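
Once the service is running, both endpoints can be exercised over plain HTTP. A hedged sketch using requests (host, port, file name, and the concrete query values are illustrative; the query parameters mirror the signature of predict above):

    import requests

    BASE = "http://localhost:8080"

    # Index an AAS: the JSON export is uploaded as multipart form data
    with open("festo_switch.json", "rb") as f:
        r = requests.post(
            f"{BASE}/PostAssetAdministrationShellEmbeddings",
            files={"aas": f},
        )
    print(r.json())  # "AAS ready"

    # Query matching submodel elements; FastAPI reads these as query parameters
    r = requests.post(
        f"{BASE}/GetAllSubmodelElementsBySemanticIdAndSemanticInformation",
        params={
            "name": "MaxRotationSpeed",
            "definition": "Maximum permitted rotation speed of the motor",
            "semantic_id": "0173-1#02-BAA120#008",
            "unit": "1/min",
            "datatype": "int",
            "return_matches": 3,
        },
    )
    print(r.json())
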
app/metadata.pickle DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:2b4aee0cd2ca534e4af8023bd334db591a0a46b2a37154758aa5e3873b8d4728
- size 1670
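
The pickle itself lives in Git LFS, so only the pointer shows up in the diff. From the way it is consumed in database_build.py and predict_se.py (keys = [k for k, v in metalabel.items() if v == concat]), it is a dict mapping an integer label to a (unit category, datatype category) tuple. A hypothetical reconstruction, for illustration only; the real category vocabularies and key order are not recoverable from the diff:

    import itertools
    import pickle

    # Excerpts of the category vocabularies used in the mapping tables
    unit_categories = ["NaN", "FREQUENCY", "PRESSURE", "POWER", "LENGTH"]
    datatype_categories = ["NaN", "BOOLEAN", "STRING", "INT", "REAL"]

    # Assign every (unit, datatype) combination a unique integer key
    metalabel = {
        i: pair
        for i, pair in enumerate(itertools.product(unit_categories, datatype_categories))
    }

    with open("metadata.pickle", "wb") as handle:
        pickle.dump(metalabel, handle)
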
app/predict_se.py DELETED
@@ -1,264 +0,0 @@
- from sentence_transformers import SentenceTransformer, util
- import json
- import time
- import pandas as pd
- import numpy as np
- import pickle
-
- import chromadb
- from chromadb.config import Settings
- from chromadb.utils import embedding_functions
- from chromadb.db.clickhouse import NoDatapointsException
-
-
- def query_aas(query_json, collection, model, metalabel):
-     query = json.loads(query_json)
-     name = query["Name"]
-     definition = query["Definition"]
-     unit = query["Unit"]
-     datatype = query["Datatype"]
-     semantic_id = query["SemanticId"]
-     return_matches = query["ReturnMatches"]
-
-     # model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
-
-     datatype_mapping = {
-         "boolean": "BOOLEAN",
-         "string": "STRING",
-         "string_translatable": "STRING",
-         "translatable_string": "STRING",
-         "non_translatable_string": "STRING",
-         "date": "DATE",
-         "data_time": "DATE",
-         "uri": "URI",
-         "int": "INT",
-         "int_measure": "INT",
-         "int_currency": "INT",
-         "integer": "INT",
-         "real": "REAL",
-         "real_measure": "REAL",
-         "real_currency": "REAL",
-         "enum_code": "ENUM_CODE",
-         "enum_int": "ENUM_CODE",
-         "ENUM_REAL": "ENUM_CODE",
-         "ENUM_RATIONAL": "ENUM_CODE",
-         "ENUM_BOOLEAN": "ENUM_CODE",
-         "ENUM_STRING": "ENUM_CODE",
-         "enum_reference": "ENUM_CODE",
-         "enum_instance": "ENUM_CODE",
-         "set(b1,b2)": "SET",
-         "constrained_set(b1,b2,cmn,cmx)": "SET",
-         "set [0,?]": "SET",
-         "set [1,?]": "SET",
-         "set [1, ?]": "SET",
-         "nan": "NaN",
-         "media_type": "LARGE_OBJECT_TYPE",
-     }
-
-     unit_mapping = {
-         "nan": "NaN",
-         "hertz": "FREQUENCY",
-         "hz": "FREQUENCY",
-         "pa": "PRESSURE",
-         "pascal": "PRESSURE",
-         "n/m²": "PRESSURE",
-         "bar": "PRESSURE",
-         "%": "SCALARS_PERC",
-         "w": "POWER",
-         "watt": "POWER",
-         "kw": "POWER",
-         "kg/m³": "CHEMISTRY",
-         "m²/s": "CHEMISTRY",
-         "pa*s": "CHEMISTRY",
-         "v": "ELECTRICAL",
-         "volt": "ELECTRICAL",
-         "db": "ACOUSTICS",
-         "db(a)": "ACOUSTICS",
-         "k": "TEMPERATURE",
-         "°c": "TEMPERATURE",
-         "n": "MECHANICS",
-         "newton": "MECHANICS",
-         "kg/s": "FLOW",
-         "kg/h": "FLOW",
-         "m³/s": "FLOW",
-         "m³/h": "FLOW",
-         "l/s": "FLOW",
-         "l/h": "FLOW",
-         "µm": "LENGTH",
-         "mm": "LENGTH",
-         "cm": "LENGTH",
-         "dm": "LENGTH",
-         "m": "LENGTH",
-         "meter": "LENGTH",
-         "m/s": "SPEED",
-         "km/h": "SPEED",
-         "s^(-1)": "FREQUENCY",
-         "1/s": "FREQUENCY",
-         "s": "TIME",
-         "h": "TIME",
-         "min": "TIME",
-         "d": "TIME",
-         "hours": "TIME",
-         "a": "ELECTRICAL",
-         "m³": "VOLUME",
-         "m²": "AREA",
-         "rpm": "FLOW",
-         "nm": "MECHANICS",
-         "m/m": "MECHANICS",
-         "m³/m²s": "MECHANICS",
-         "w(m²*K)": "HEAT_TRANSFER",
-         "kwh": "ELECTRICAL",
-         "kg/(s*m²)": "FLOW",
-         "kg": "MASS",
-         "w/(m*k)": "HEAT_TRANSFER",
-         "m²*k/w": "HEAT_TRANSFER",
-         "j/s": "POWER",
-     }
-
-     # with open(
-     #     "./drive/My Drive/Colab/NLP/SemantischeInteroperabilität/Deployment/metadata.pickle",
-     #     "rb",
-     # ) as handle:
-     #     metalabel = pickle.load(handle)
-
-     unit_lower = unit.lower()
-     datatype_lower = datatype.lower()
-
-     unit_categ = unit_mapping.get(unit_lower)
-     datatype_categ = datatype_mapping.get(datatype_lower)
-
-     if unit_categ is None:
-         unit_categ = "NaN"
-     if datatype_categ is None:
-         datatype_categ = "NaN"
-
-     concat = (unit_categ, datatype_categ)
-     keys = [k for k, v in metalabel.items() if v == concat]
-     metadata = keys[0]
-
-     name_embedding = model.encode(name)
-     definition_embedding = model.encode(definition)
-     concat_name_def_query = np.concatenate(
-         (definition_embedding, name_embedding), axis=0
-     )
-     concat_name_def_query = concat_name_def_query.tolist()
-
-     queries = [concat_name_def_query]
-     print(type(queries))
-
-     # The query is run as a semantic search (k-nearest neighbors)
-     # Chroma uses hnswlib for this: https://github.com/nmslib/hnswlib
-     # There, cosine, squared L2, or inner product can be configured as the distance
-     # Chroma is set to L2 distance, cf. https://github.com/chroma-core/chroma/blob/4463d13f951a4d28ade1f7e777d07302ff09069b/chromadb/db/index/hnswlib.py -> search for l2
-
-     # Homogeneous case: look up by semantic ID; if a hit is found, the homogeneous match succeeds
-     try:
-         homogen = collection.query(
-             query_embeddings=queries, n_results=1, where={"SESemanticId": semantic_id}
-         )
-     # except NoDatapointsException:
-     #     homogen = 'Nix'
-
-     except Exception:
-         homogen = "Nix"
-
-     if homogen != "Nix":
-         result = homogen
-         result["matching_method"] = "Semantically equivalent, same semantic Id"
-         result["matching_algorithm"] = "None"
-         result["distances"] = [[0]]
-
-         final_result = {
-             "matching_method": result["matching_method"],
-             "matching_algorithm": result["matching_algorithm"],
-             "matching_distance": result["distances"][0][0],
-             "aas_id": result["metadatas"][0][0]["AASId"],
-             "aas_id_short": result["metadatas"][0][0]["AASIdShort"],
-             "submodel_id_short": result["metadatas"][0][0]["SubmodelName"],
-             "submodel_id": result["metadatas"][0][0]["SubmodelId"],
-             "matched_object": result["documents"][0][0],
-         }
-         final_results = [final_result]
-     # If no matching semantic ID is found, continue with NLP with and without metadata
-     elif homogen == "Nix":
-         try:
-             with_metadata = collection.query(
-                 query_embeddings=queries,
-                 n_results=return_matches,
-                 where={"Metalabel": metadata},
-             )
-         # except NoDatapointsException:
-         #     with_metadata = 'Nix'
-
-         except Exception:
-             with_metadata = "Nix"
-
-         without_metadata = collection.query(
-             query_embeddings=queries,
-             n_results=return_matches,
-         )
-
-         if with_metadata == "Nix":
-             result = without_metadata
-             result["matching_method"] = "Semantically not equivalent, NLP without Metadata"
-             result["matching_algorithm"] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
-
-         elif with_metadata != "Nix":
-             distance_with_meta = with_metadata["distances"][0][0]
-             distance_without_meta = without_metadata["distances"][0][0]
-             print(distance_with_meta)
-             print(distance_without_meta)
-             # Compare the distances of the matches with and without metadata
-             if distance_without_meta <= distance_with_meta:
-                 result = without_metadata
-                 result["matching_method"] = "Semantically not equivalent, NLP without Metadata"
-                 result["matching_algorithm"] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
-             else:
-                 result = with_metadata
-                 result["matching_method"] = "Semantically not equivalent, NLP with Metadata"
-                 result["matching_algorithm"] = "Semantic search, k-nearest-neighbor with squared L2 distance (euclidean distance), with model gart-labor/eng-distilBERT-se-eclass"
-         # Assemble the final matching results
-         final_results = []
-         for i in range(0, return_matches):
-             value = result["documents"][0][i]
-             value_dict = json.loads(value)
-             final_result = {
-                 "matching_method": result["matching_method"],
-                 "matching_algorithm": result["matching_algorithm"],
-                 "matching_distance": result["distances"][0][i],
-                 "aas_id": result["metadatas"][0][i]["AASId"],
-                 "aas_id_short": result["metadatas"][0][i]["AASIdShort"],
-                 "submodel_id_short": result["metadatas"][0][i]["SubmodelName"],
-                 "submodel_id": result["metadatas"][0][i]["SubmodelId"],
-                 # "matched_object": result["documents"][0][i]
-                 "matched_object": value_dict,
-             }
-             final_results.append(final_result)
-     return final_results
-
-
- def ask_database(query, metalabel, model, collections, client_chroma):
-     # All AAS collections are queried one after the other
-     json_query = json.dumps(query, indent=4)
-     results = []
-     for collection in collections:
-         print(collection.name)
-         collection = client_chroma.get_collection(collection.name)
-         result = query_aas(json_query, collection, model, metalabel)
-         results.append(result)
-     # results_json = json.dumps(results)
-     return results
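
A minimal sketch of calling ask_database directly, mirroring the query dict that main.py builds (the REST host and the concrete query values are illustrative assumptions, not taken from the deleted code):

    import pickle

    import chromadb
    from chromadb.config import Settings
    from sentence_transformers import SentenceTransformer

    from predict_se import ask_database

    model = SentenceTransformer("gart-labor/eng-distilBERT-se-eclass")
    with open("app/metadata.pickle", "rb") as handle:
        metalabel = pickle.load(handle)

    client = chromadb.Client(
        Settings(
            chroma_api_impl="rest",
            chroma_server_host="localhost",
            chroma_server_http_port=8000,
        )
    )

    query = {
        "Name": "MaxRotationSpeed",
        "Definition": "Maximum permitted rotation speed of the motor",
        "Unit": "1/min",
        "Datatype": "int",
        "SemanticId": "0173-1#02-BAA120#008",
        "ReturnMatches": 3,
    }

    # One result list per stored AAS collection
    results = ask_database(query, metalabel, model, client.list_collections(), client)
    print(results)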