Spaces:
Running
Running
OnlyBiggg
committed on
Commit
·
c524d8c
1
Parent(s):
5543c4b
add model NER extract name
Browse files- .gitattributes +4 -0
- app/dialogflow/api/v1/dialogflow.py +45 -0
- app/dialogflow/services/dialog_service.py +46 -0
- app/ner/__init__.py +0 -0
- app/ner/models/__init__.py +0 -0
- app/ner/models/base_model.py +16 -0
- app/ner/models/ner/config.json +53 -0
- app/ner/models/ner/model_optimized.onnx +0 -0
- app/ner/models/ner/ort_config.json +37 -0
- app/ner/models/ner/special_tokens_map.json +37 -0
- app/ner/models/ner/tokenizer.json +0 -0
- app/ner/models/ner/tokenizer_config.json +65 -0
- app/ner/models/ner/vocab.txt +0 -0
- app/ner/services/ner.py +77 -0
- app/ner/utils/__init__.py +0 -0
- utils/life_span.py +17 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
|
37 |
+
*.onnx binary
|
38 |
+
tokenizer.json text eol=lf
|
39 |
+
vocab.txt text eol=lf
|
app/dialogflow/api/v1/dialogflow.py
CHANGED
@@ -4,6 +4,7 @@ from fastapi.responses import JSONResponse, RedirectResponse, HTMLResponse # typ
|
|
4 |
from datetime import datetime, timedelta
|
5 |
from fastapi.templating import Jinja2Templates
|
6 |
from app.dialogflow.services.dialog_service import dialog_service
|
|
|
7 |
from utils.format_data_dialog import extra_time_dialogflow, get_weekday_name, find_surrounding_times
|
8 |
|
9 |
from common.external.external_api import api
|
@@ -809,6 +810,50 @@ async def is_valid_select_seat(request: Request) -> Response:
|
|
809 |
except Exception as e:
|
810 |
return DialogFlowResponseAPI(text=["Hệ thống xảy ra lỗi. Quý khách vui lòng thử lại sau hoặc liên hệ Trung tâm tổng đài 1900 6067 để được hỗ trợ."])
|
811 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
812 |
@router.post('/trip/stop/pickup')
|
813 |
async def pickup(request: Request) -> Response:
|
814 |
body = await request.json()
|
|
|
4 |
from datetime import datetime, timedelta
|
5 |
from fastapi.templating import Jinja2Templates
|
6 |
from app.dialogflow.services.dialog_service import dialog_service
|
7 |
+
from app.ner.services.ner import NER
|
8 |
from utils.format_data_dialog import extra_time_dialogflow, get_weekday_name, find_surrounding_times
|
9 |
|
10 |
from common.external.external_api import api
|
|
|
810 |
except Exception as e:
|
811 |
return DialogFlowResponseAPI(text=["Hệ thống xảy ra lỗi. Quý khách vui lòng thử lại sau hoặc liên hệ Trung tâm tổng đài 1900 6067 để được hỗ trợ."])
|
812 |
|
813 |
+
@router.post('/trip/check-exist-user-info')
async def check_exist_user_info(request: Request) -> Response:
    """Dialogflow webhook: report whether the user's contact details are known.

    Responds with the session parameters ``is_user_exist``, ``user_name``,
    ``phone_number`` and ``email``. The last three are ``None`` when no
    profile could be fetched.
    """
    # NOTE(review): the service methods accept an optional ``user_id`` but no
    # identifier is passed yet, so the request body is not read here. Confirm
    # which session parameter should identify the user once the real lookup
    # is wired up.

    # Both service methods are coroutines and must be awaited; an un-awaited
    # coroutine object is always truthy and has no ``.get``.
    is_exist_user_info = await dialog_service.check_exist_user_info()

    user_info = {}
    if is_exist_user_info:
        # get_user_info() returns None on failure; fall back to an empty dict
        # so the lookups below yield None instead of raising.
        user_info = await dialog_service.get_user_info() or {}

    parameters = {
        "is_user_exist": is_exist_user_info,
        # The service stores the name under "user_name", not "name".
        "user_name": user_info.get("user_name"),
        "phone_number": user_info.get("phone_number"),
        "email": user_info.get("email"),
    }

    return DialogFlowResponseAPI(parameters=parameters)
|
838 |
+
|
839 |
+
@router.post('/trip/extract-user-name')
async def extract_user_name(request: Request) -> Response:
    """Dialogflow webhook: pull a person name out of the user's utterance.

    Reads the raw utterance from the ``text`` field of the request body, runs
    it through the NER instance stored on ``request.app.state.ner`` and
    responds with the session parameter ``user_name`` (``None`` when no name
    was recognised).
    """
    body = await request.json()

    # The session parameters are not needed here. The previous unused
    # ``sessionInfo`` lookup is gone because it raised when the field was
    # present but null.
    raw_text_user_name = body.get("text", "")

    ner: NER = request.app.state.ner

    user_name = dialog_service.extract_user_name(text=raw_text_user_name, ner=ner)

    return DialogFlowResponseAPI(parameters={"user_name": user_name})
|
856 |
+
|
857 |
@router.post('/trip/stop/pickup')
|
858 |
async def pickup(request: Request) -> Response:
|
859 |
body = await request.json()
|
app/dialogflow/services/dialog_service.py
CHANGED
@@ -3,6 +3,8 @@ from datetime import datetime, timedelta
|
|
3 |
|
4 |
from fastapi import logger
|
5 |
|
|
|
|
|
6 |
from common.external.external_api import api
|
7 |
from core.conf import settings
|
8 |
|
@@ -365,5 +367,49 @@ class DialogService:
|
|
365 |
return group["provinceName"]
|
366 |
return None
|
367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
368 |
dialog_service: DialogService = DialogService()
|
369 |
|
|
|
3 |
|
4 |
from fastapi import logger
|
5 |
|
6 |
+
from app.ner.services.ner import NER
|
7 |
+
|
8 |
from common.external.external_api import api
|
9 |
from core.conf import settings
|
10 |
|
|
|
367 |
return group["provinceName"]
|
368 |
return None
|
369 |
|
370 |
+
async def check_exist_user_info(self, user_id: str = None):
    """Report whether contact details are on record for ``user_id``.

    Placeholder implementation: the remote lookup is not wired up yet, so
    the answer is always ``True``. ``False`` is returned only if the body
    raises, in which case the error is logged.
    """
    try:
        # TODO: replace the stub with the real lookup, e.g.
        #   response = await api.get(f'/user/{user_id}')
        #   exists = response.get("status") == 200
        exists = True
    except Exception as e:
        logger.error(f"Error checking user info: {e}")
        exists = False
    return exists
|
381 |
+
|
382 |
+
async def get_user_info(self, user_id: str = None):
    """Return the stored contact details for ``user_id``.

    Placeholder implementation: the remote lookup is not wired up yet, so a
    fixed sample profile with the keys ``user_name``, ``phone_number`` and
    ``email`` is returned. ``None`` is returned only if the body raises, in
    which case the error is logged.
    """
    try:
        # TODO: replace the sample profile with the real lookup, e.g.
        #   response = await api.get(f'/user/{user_id}')
        #   if response.get("status") == 200:
        #       return response.get("data")
        return dict(
            user_name="Đại",
            phone_number="0987654321",
            email="[email protected]",
        )
    except Exception as e:
        logger.error(f"Error fetching user info: {e}")
        return None
|
399 |
+
|
400 |
+
def extract_user_name(self, text: str, ner: "NER"):
    """Return the first person name the NER model finds in ``text``.

    Args:
        text: Raw user utterance. ``None`` or an empty string short-circuits
            to ``None`` without invoking the model.
        ner: Loaded NER instance; its ``predict`` is called with
            ``entity_tag="PERSON"`` and is expected to return a list of
            extracted strings.

    Returns:
        The first non-empty extracted name, or ``None`` when there is none.
    """
    # ``self`` was missing from the signature, so calling this through the
    # service instance with ``text=...`` raised TypeError.
    if not text:
        return None

    user_name_pred = ner.predict(text=text, entity_tag="PERSON")
    if not user_name_pred:
        return None

    # Normalise an empty first match to None.
    return user_name_pred[0] or None
|
413 |
+
|
414 |
dialog_service: DialogService = DialogService()
|
415 |
|
app/ner/__init__.py
ADDED
File without changes
|
app/ner/models/__init__.py
ADDED
File without changes
|
app/ner/models/base_model.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
from optimum.onnxruntime import ORTModelForTokenClassification
|
4 |
+
from optimum.pipelines import pipeline
|
5 |
+
# Manual smoke test: load the tokenizer and ONNX Runtime model, tag one sample
# sentence, then print the predictions and the elapsed wall-clock time
# (loading plus inference).
start = datetime.now()

# NOTE(review): "model_ner" is passed to both loaders as-is; confirm whether it
# is meant to be a local directory or a hub id.
MODEL_SOURCE = "model_ner"
tokenizer = AutoTokenizer.from_pretrained(MODEL_SOURCE)
model = ORTModelForTokenClassification.from_pretrained(MODEL_SOURCE, provider="CPUExecutionProvider")
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, accelerator="ort", device=-1)

result = pipe("Tôi tên là Trần Văn Đại, địa chỉ 12 Phan đình phùng")
end = datetime.now()

elapsed_seconds = (end - start).total_seconds()
print(result)
print("Time taken: ", elapsed_seconds)
|
app/ner/models/ner/config.json
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "NlpHUST/ner-vietnamese-electra-base",
|
3 |
+
"architectures": [
|
4 |
+
"ElectraForTokenClassification"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"embedding_size": 768,
|
9 |
+
"finetuning_task": "ner",
|
10 |
+
"hidden_act": "gelu",
|
11 |
+
"hidden_dropout_prob": 0.1,
|
12 |
+
"hidden_size": 768,
|
13 |
+
"id2label": {
|
14 |
+
"0": "B-LOCATION",
|
15 |
+
"1": "B-MISCELLANEOUS",
|
16 |
+
"2": "B-ORGANIZATION",
|
17 |
+
"3": "B-PERSON",
|
18 |
+
"4": "I-LOCATION",
|
19 |
+
"5": "I-MISCELLANEOUS",
|
20 |
+
"6": "I-ORGANIZATION",
|
21 |
+
"7": "I-PERSON",
|
22 |
+
"8": "O"
|
23 |
+
},
|
24 |
+
"initializer_range": 0.02,
|
25 |
+
"intermediate_size": 3072,
|
26 |
+
"label2id": {
|
27 |
+
"B-LOCATION": 0,
|
28 |
+
"B-MISCELLANEOUS": 1,
|
29 |
+
"B-ORGANIZATION": 2,
|
30 |
+
"B-PERSON": 3,
|
31 |
+
"I-LOCATION": 4,
|
32 |
+
"I-MISCELLANEOUS": 5,
|
33 |
+
"I-ORGANIZATION": 6,
|
34 |
+
"I-PERSON": 7,
|
35 |
+
"O": 8
|
36 |
+
},
|
37 |
+
"layer_norm_eps": 1e-12,
|
38 |
+
"max_position_embeddings": 512,
|
39 |
+
"model_type": "electra",
|
40 |
+
"num_attention_heads": 12,
|
41 |
+
"num_hidden_layers": 12,
|
42 |
+
"pad_token_id": 0,
|
43 |
+
"position_embedding_type": "absolute",
|
44 |
+
"summary_activation": "gelu",
|
45 |
+
"summary_last_dropout": 0.1,
|
46 |
+
"summary_type": "first",
|
47 |
+
"summary_use_proj": true,
|
48 |
+
"torch_dtype": "float32",
|
49 |
+
"transformers_version": "4.48.3",
|
50 |
+
"type_vocab_size": 2,
|
51 |
+
"use_cache": true,
|
52 |
+
"vocab_size": 62000
|
53 |
+
}
|
app/ner/models/ner/model_optimized.onnx
ADDED
Binary file (134 Bytes). View file
|
|
app/ner/models/ner/ort_config.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"one_external_file": true,
|
3 |
+
"opset": null,
|
4 |
+
"optimization": {
|
5 |
+
"disable_attention": null,
|
6 |
+
"disable_attention_fusion": false,
|
7 |
+
"disable_bias_gelu": null,
|
8 |
+
"disable_bias_gelu_fusion": false,
|
9 |
+
"disable_bias_skip_layer_norm": null,
|
10 |
+
"disable_bias_skip_layer_norm_fusion": false,
|
11 |
+
"disable_embed_layer_norm": true,
|
12 |
+
"disable_embed_layer_norm_fusion": true,
|
13 |
+
"disable_gelu": null,
|
14 |
+
"disable_gelu_fusion": false,
|
15 |
+
"disable_group_norm_fusion": true,
|
16 |
+
"disable_layer_norm": null,
|
17 |
+
"disable_layer_norm_fusion": false,
|
18 |
+
"disable_packed_kv": true,
|
19 |
+
"disable_rotary_embeddings": false,
|
20 |
+
"disable_shape_inference": false,
|
21 |
+
"disable_skip_layer_norm": null,
|
22 |
+
"disable_skip_layer_norm_fusion": false,
|
23 |
+
"enable_gelu_approximation": true,
|
24 |
+
"enable_gemm_fast_gelu_fusion": false,
|
25 |
+
"enable_transformers_specific_optimizations": true,
|
26 |
+
"fp16": false,
|
27 |
+
"no_attention_mask": false,
|
28 |
+
"optimization_level": 2,
|
29 |
+
"optimize_for_gpu": false,
|
30 |
+
"optimize_with_onnxruntime_only": null,
|
31 |
+
"use_mask_index": false,
|
32 |
+
"use_multi_head_attention": false,
|
33 |
+
"use_raw_attention_mask": false
|
34 |
+
},
|
35 |
+
"quantization": {},
|
36 |
+
"use_external_data_format": false
|
37 |
+
}
|
app/ner/models/ner/special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": {
|
3 |
+
"content": "[CLS]",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"mask_token": {
|
10 |
+
"content": "[MASK]",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"pad_token": {
|
17 |
+
"content": "[PAD]",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"sep_token": {
|
24 |
+
"content": "[SEP]",
|
25 |
+
"lstrip": false,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"unk_token": {
|
31 |
+
"content": "[UNK]",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
}
|
37 |
+
}
|
app/ner/models/ner/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app/ner/models/ner/tokenizer_config.json
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"3": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"4": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": false,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": false,
|
48 |
+
"extra_special_tokens": {},
|
49 |
+
"mask_token": "[MASK]",
|
50 |
+
"max_length": 256,
|
51 |
+
"model_max_length": 1000000000000000019884624838656,
|
52 |
+
"never_split": null,
|
53 |
+
"pad_to_multiple_of": null,
|
54 |
+
"pad_token": "[PAD]",
|
55 |
+
"pad_token_type_id": 0,
|
56 |
+
"padding_side": "right",
|
57 |
+
"sep_token": "[SEP]",
|
58 |
+
"stride": 0,
|
59 |
+
"strip_accents": null,
|
60 |
+
"tokenize_chinese_chars": true,
|
61 |
+
"tokenizer_class": "ElectraTokenizer",
|
62 |
+
"truncation_side": "right",
|
63 |
+
"truncation_strategy": "longest_first",
|
64 |
+
"unk_token": "[UNK]"
|
65 |
+
}
|
app/ner/models/ner/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app/ner/services/ner.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ....core.conf import settings
|
2 |
+
|
3 |
+
class NER:
    """Wrapper around a token-classification pipeline for entity extraction.

    The constructor only records the model name. The tokenizer, model and
    pipeline are created by :meth:`load_model`, which must be called before
    :meth:`predict` or :meth:`extract_entities`.
    """

    def __init__(self, model_name: str = None):
        """Store the model name without loading anything.

        Args:
            model_name: Identifier passed to ``from_pretrained``. Defaults to
                ``settings.NER_MODEL_NAME``.
        """
        # Resolve the configured default here rather than in the signature,
        # where it would be evaluated once when the class is defined.
        self.model_name = settings.NER_MODEL_NAME if model_name is None else model_name
        self.model = None
        self.tokenizer = None
        self.pipeline = None

    def load_model(self):
        """Load the tokenizer and model and build the pipeline."""
        # Function-scope import, as in the original, so importing this module
        # does not import transformers.
        from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.pipeline = pipeline(settings.TASK_NAME, model=self.model, tokenizer=self.tokenizer)

    def predict(self, text: str, entity_tag: str = None):
        """Run the pipeline on ``text``.

        Args:
            text: Input sentence.
            entity_tag: Optional entity type without the ``B-``/``I-`` prefix
                (e.g. ``"PERSON"``).

        Returns:
            The raw pipeline output when ``entity_tag`` is falsy, otherwise
            the list of extracted strings for that entity type.

        Raises:
            ValueError: If :meth:`load_model` has not been called.
        """
        if self.pipeline is None:
            raise ValueError("Model not loaded. Please call load_model() first.")
        pred = self.pipeline(text)
        if entity_tag:
            return self.extract_entities(pred, entity_tag)
        return pred

    def extract_entities(self, result_pred: list[dict[str, object]], entity: str) -> list[str]:
        """Group consecutive ``B-``/``I-`` tagged tokens into entity strings.

        Args:
            result_pred: Per-token predictions; each item must provide the
                keys ``"word"`` and ``"entity"``.
            entity: Entity type without prefix (e.g. ``"PERSON"``).

        Returns:
            One combined string per entity span, in input order.

        Raises:
            ValueError: If :meth:`load_model` has not been called.
        """
        if self.pipeline is None:
            raise ValueError("Model not loaded. Please call load_model() first.")

        begin_tag = f"B-{entity}"
        inside_tag = f"I-{entity}"

        extracted_entities = []
        current_entity_tokens = []

        for item in result_pred:
            tag = item["entity"]

            # An inside tag only extends a span that is already open.
            if tag == inside_tag and current_entity_tokens:
                current_entity_tokens.append(item["word"])
                continue

            # Any other tag closes the open span, if there is one.
            if current_entity_tokens:
                extracted_entities.append(self._combine_token(current_entity_tokens))
                current_entity_tokens = []

            if tag == begin_tag:
                current_entity_tokens = [item["word"]]

        if current_entity_tokens:
            extracted_entities.append(self._combine_token(current_entity_tokens))

        return extracted_entities

    @staticmethod
    def _combine_token(tokens: list[str]) -> str:
        """Join tokens into one string, merging ``##`` subword pieces.

        A token that starts with ``##`` is glued onto the preceding word
        without a space; all other tokens are separated by single spaces.
        Only the leading ``##`` marker is removed, so a trailing ``#`` (as
        in ``"C#"``) is preserved.

        Args:
            tokens: Tokens of one entity span.

        Returns:
            The combined string, or ``""`` for an empty list.
        """
        # Declared static: it was previously defined without ``self``, so
        # every ``self._combine_token(...)`` call raised TypeError.
        words = []
        for token in tokens:
            if token.startswith("##"):
                piece = token[2:]
                if words:
                    words[-1] += piece
                else:
                    words.append(piece)
            else:
                words.append(token)
        return " ".join(words)
|
app/ner/utils/__init__.py
ADDED
File without changes
|
utils/life_span.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from contextlib import asynccontextmanager
|
2 |
+
|
3 |
+
from fastapi import FastAPI
|
4 |
+
|
5 |
+
|
6 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the NER model before the app runs and release it afterwards.

    The loaded instance is stored on ``app.state.ner`` while the app runs
    and removed again on exit.
    """
    # Function-scope import, as in the original.
    from app.ner.services.ner import NER

    ner: NER = NER()
    ner.load_model()
    app.state.ner = ner
    print("NER model loaded successfully.")
    try:
        yield
    finally:
        # Run the cleanup even when an exception is thrown into the context
        # manager at the yield point; otherwise it would be skipped.
        print("Cleaning up NER model...")
        del app.state.ner
|