OnlyBiggg commited on
Commit
c524d8c
·
1 Parent(s): 5543c4b

Add NER model to extract user names

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+
37
+ *.onnx binary
38
+ tokenizer.json text eol=lf
39
+ vocab.txt text eol=lf
app/dialogflow/api/v1/dialogflow.py CHANGED
@@ -4,6 +4,7 @@ from fastapi.responses import JSONResponse, RedirectResponse, HTMLResponse # typ
4
  from datetime import datetime, timedelta
5
  from fastapi.templating import Jinja2Templates
6
  from app.dialogflow.services.dialog_service import dialog_service
 
7
  from utils.format_data_dialog import extra_time_dialogflow, get_weekday_name, find_surrounding_times
8
 
9
  from common.external.external_api import api
@@ -809,6 +810,50 @@ async def is_valid_select_seat(request: Request) -> Response:
809
  except Exception as e:
810
  return DialogFlowResponseAPI(text=["Hệ thống xảy ra lỗi. Quý khách vui lòng thử lại sau hoặc liên hệ Trung tâm tổng đài 1900 6067 để được hỗ trợ."])
811
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
812
  @router.post('/trip/stop/pickup')
813
  async def pickup(request: Request) -> Response:
814
  body = await request.json()
 
4
  from datetime import datetime, timedelta
5
  from fastapi.templating import Jinja2Templates
6
  from app.dialogflow.services.dialog_service import dialog_service
7
+ from app.ner.services.ner import NER
8
  from utils.format_data_dialog import extra_time_dialogflow, get_weekday_name, find_surrounding_times
9
 
10
  from common.external.external_api import api
 
810
  except Exception as e:
811
  return DialogFlowResponseAPI(text=["Hệ thống xảy ra lỗi. Quý khách vui lòng thử lại sau hoặc liên hệ Trung tâm tổng đài 1900 6067 để được hỗ trợ."])
812
 
813
@router.post('/trip/check-exist-user-info')
async def check_exist_user_info(request: Request) -> Response:
    """Webhook: report whether stored user info exists and return it.

    Response parameters: is_user_exist, user_name, phone_number, email.
    """
    body = await request.json()
    session_info = body.get("sessionInfo", {})

    # Bug fix: check_exist_user_info/get_user_info are async coroutines —
    # without `await` the coroutine object is always truthy and no data
    # is actually fetched.
    is_exist_user_info = await dialog_service.check_exist_user_info()

    user_info = {}
    if is_exist_user_info:
        user_info = await dialog_service.get_user_info() or {}

    parameters = {
        "is_user_exist": is_exist_user_info,
        # Bug fix: the service returns the name under "user_name",
        # not "name" — the old lookup always yielded None.
        "user_name": user_info.get("user_name"),
        "phone_number": user_info.get("phone_number"),
        "email": user_info.get("email"),
    }

    return DialogFlowResponseAPI(parameters=parameters)
838
+
839
@router.post('/trip/extract-user-name')
async def extract_user_name(request: Request) -> Response:
    """Webhook: extract a person name from the raw utterance with the NER model.

    Response parameters: user_name (first PERSON entity found, or None).
    """
    body = await request.json()

    raw_text_user_name = body.get("text", "")

    # NER model is loaded once at application startup and shared via app.state.
    ner: NER = request.app.state.ner

    user_name = dialog_service.extract_user_name(text=raw_text_user_name, ner=ner)

    parameters = {
        "user_name": user_name,
    }

    return DialogFlowResponseAPI(parameters=parameters)
856
+
857
  @router.post('/trip/stop/pickup')
858
  async def pickup(request: Request) -> Response:
859
  body = await request.json()
app/dialogflow/services/dialog_service.py CHANGED
@@ -3,6 +3,8 @@ from datetime import datetime, timedelta
3
 
4
  from fastapi import logger
5
 
 
 
6
  from common.external.external_api import api
7
  from core.conf import settings
8
 
@@ -365,5 +367,49 @@ class DialogService:
365
  return group["provinceName"]
366
  return None
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  dialog_service: DialogService = DialogService()
369
 
 
3
 
4
  from fastapi import logger
5
 
6
+ from app.ner.services.ner import NER
7
+
8
  from common.external.external_api import api
9
  from core.conf import settings
10
 
 
367
  return group["provinceName"]
368
  return None
369
 
370
    async def check_exist_user_info(self, user_id: str = None):
        """Return True when user info exists for *user_id*, else False.

        NOTE(review): the real API lookup is commented out, so this
        currently always returns True — confirm before relying on it.
        """
        try:
            # response = await api.get(f'/user/{user_id}')
            # if response.get("status") == 200:
            #     return True
            return True

            # return False
        except Exception as e:
            # NOTE(review): `from fastapi import logger` imports the
            # fastapi.logger *module*; `logger.error` would raise
            # AttributeError here — likely intended
            # `from fastapi.logger import logger`.
            logger.error(f"Error checking user info: {e}")
            return False
381
+
382
    async def get_user_info(self, user_id: str = None):
        """Return the user-info dict for *user_id*, or None on error.

        Returned keys: "user_name", "phone_number", "email".
        NOTE(review): the real API call is commented out and hard-coded
        sample data is returned — replace before production. Callers must
        read the name under the "user_name" key.
        """
        try:
            # response = await api.get(f'/user/{user_id}')
            # if response.get("status") == 200:
            #     return response.get("data")
            user_info = {
                "user_name": "Đại",
                "phone_number": "0987654321",
                "email": "[email protected]"
            }

            return user_info

            # return None
        except Exception as e:
            # NOTE(review): `logger` here is the fastapi.logger module,
            # so `.error` would raise AttributeError — see note in
            # check_exist_user_info.
            logger.error(f"Error fetching user info: {e}")
            return None
399
+
400
+ def extract_user_name(text: str, ner: NER):
401
+ if text is None:
402
+ return None
403
+
404
+ user_name_pred = ner.predict(text=text, entity_tag="PERSON")
405
+
406
+ if user_name_pred:
407
+ user_name = user_name_pred[0]
408
+
409
+ if user_name:
410
+ return user_name
411
+
412
+ return None
413
+
414
  dialog_service: DialogService = DialogService()
415
 
app/ner/__init__.py ADDED
File without changes
app/ner/models/__init__.py ADDED
File without changes
app/ner/models/base_model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Smoke-test script: load the ONNX-exported NER model with optimum/onnxruntime,
# run one prediction on a sample Vietnamese sentence, and print the elapsed time.
from datetime import datetime
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForTokenClassification
from optimum.pipelines import pipeline
start = datetime.now()



# Load tokenizer + ONNX model from the local "model_ner" directory on CPU.
tokenizer = AutoTokenizer.from_pretrained("model_ner")
model = ORTModelForTokenClassification.from_pretrained("model_ner", provider="CPUExecutionProvider")
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, accelerator="ort", device=-1)

# Sample input: "My name is Tran Van Dai, address 12 Phan Dinh Phung".
result = pipe("Tôi tên là Trần Văn Đại, địa chỉ 12 Phan đình phùng")
end = datetime.now()
print(result)
print("Time taken: ", (end - start).total_seconds())
app/ner/models/ner/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "NlpHUST/ner-vietnamese-electra-base",
3
+ "architectures": [
4
+ "ElectraForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "finetuning_task": "ner",
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "B-LOCATION",
15
+ "1": "B-MISCELLANEOUS",
16
+ "2": "B-ORGANIZATION",
17
+ "3": "B-PERSON",
18
+ "4": "I-LOCATION",
19
+ "5": "I-MISCELLANEOUS",
20
+ "6": "I-ORGANIZATION",
21
+ "7": "I-PERSON",
22
+ "8": "O"
23
+ },
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 3072,
26
+ "label2id": {
27
+ "B-LOCATION": 0,
28
+ "B-MISCELLANEOUS": 1,
29
+ "B-ORGANIZATION": 2,
30
+ "B-PERSON": 3,
31
+ "I-LOCATION": 4,
32
+ "I-MISCELLANEOUS": 5,
33
+ "I-ORGANIZATION": 6,
34
+ "I-PERSON": 7,
35
+ "O": 8
36
+ },
37
+ "layer_norm_eps": 1e-12,
38
+ "max_position_embeddings": 512,
39
+ "model_type": "electra",
40
+ "num_attention_heads": 12,
41
+ "num_hidden_layers": 12,
42
+ "pad_token_id": 0,
43
+ "position_embedding_type": "absolute",
44
+ "summary_activation": "gelu",
45
+ "summary_last_dropout": 0.1,
46
+ "summary_type": "first",
47
+ "summary_use_proj": true,
48
+ "torch_dtype": "float32",
49
+ "transformers_version": "4.48.3",
50
+ "type_vocab_size": 2,
51
+ "use_cache": true,
52
+ "vocab_size": 62000
53
+ }
app/ner/models/ner/model_optimized.onnx ADDED
Binary file (134 Bytes). View file
 
app/ner/models/ner/ort_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "one_external_file": true,
3
+ "opset": null,
4
+ "optimization": {
5
+ "disable_attention": null,
6
+ "disable_attention_fusion": false,
7
+ "disable_bias_gelu": null,
8
+ "disable_bias_gelu_fusion": false,
9
+ "disable_bias_skip_layer_norm": null,
10
+ "disable_bias_skip_layer_norm_fusion": false,
11
+ "disable_embed_layer_norm": true,
12
+ "disable_embed_layer_norm_fusion": true,
13
+ "disable_gelu": null,
14
+ "disable_gelu_fusion": false,
15
+ "disable_group_norm_fusion": true,
16
+ "disable_layer_norm": null,
17
+ "disable_layer_norm_fusion": false,
18
+ "disable_packed_kv": true,
19
+ "disable_rotary_embeddings": false,
20
+ "disable_shape_inference": false,
21
+ "disable_skip_layer_norm": null,
22
+ "disable_skip_layer_norm_fusion": false,
23
+ "enable_gelu_approximation": true,
24
+ "enable_gemm_fast_gelu_fusion": false,
25
+ "enable_transformers_specific_optimizations": true,
26
+ "fp16": false,
27
+ "no_attention_mask": false,
28
+ "optimization_level": 2,
29
+ "optimize_for_gpu": false,
30
+ "optimize_with_onnxruntime_only": null,
31
+ "use_mask_index": false,
32
+ "use_multi_head_attention": false,
33
+ "use_raw_attention_mask": false
34
+ },
35
+ "quantization": {},
36
+ "use_external_data_format": false
37
+ }
app/ner/models/ner/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
app/ner/models/ner/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
app/ner/models/ner/tokenizer_config.json ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "extra_special_tokens": {},
49
+ "mask_token": "[MASK]",
50
+ "max_length": 256,
51
+ "model_max_length": 1000000000000000019884624838656,
52
+ "never_split": null,
53
+ "pad_to_multiple_of": null,
54
+ "pad_token": "[PAD]",
55
+ "pad_token_type_id": 0,
56
+ "padding_side": "right",
57
+ "sep_token": "[SEP]",
58
+ "stride": 0,
59
+ "strip_accents": null,
60
+ "tokenize_chinese_chars": true,
61
+ "tokenizer_class": "ElectraTokenizer",
62
+ "truncation_side": "right",
63
+ "truncation_strategy": "longest_first",
64
+ "unk_token": "[UNK]"
65
+ }
app/ner/models/ner/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
app/ner/services/ner.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ....core.conf import settings
2
+
3
class NER:
    """Lazy-loading wrapper around a HuggingFace token-classification pipeline.

    load_model() must be called once (e.g. at app startup) before predict().
    """

    def __init__(self, model_name: str = settings.NER_MODEL_NAME):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.pipeline = None

    def load_model(self):
        """Load tokenizer and model, then build the inference pipeline.

        Imports transformers lazily so that importing this module stays cheap.
        """
        from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(self.model_name)
        self.pipeline = pipeline(settings.TASK_NAME, model=self.model, tokenizer=self.tokenizer)

    def predict(self, text: str, entity_tag: str = None):
        """Run NER on *text*.

        Args:
            text: input sentence.
            entity_tag: optional entity name (e.g. "PERSON"); when given,
                return merged entity strings for that tag only.

        Returns:
            Raw pipeline predictions, or a list of merged entity strings.

        Raises:
            ValueError: if load_model() has not been called.
        """
        if self.pipeline is None:
            raise ValueError("Model not loaded. Please call load_model() first.")
        pred = self.pipeline(text)
        if entity_tag:
            return self.extract_entities(pred, entity_tag)
        return pred

    def extract_entities(self, result_pred: list[dict[str, any]], entity: str) -> list[str]:
        """Merge B-/I- tagged tokens for *entity* into full entity strings (BIO scheme)."""
        if self.pipeline is None:
            raise ValueError("Model not loaded. Please call load_model() first.")
        B_ENTITY = f"B-{entity}"
        I_ENTITY = f"I-{entity}"

        extracted_entities = []
        current_entity_tokens = []

        for item in result_pred:
            word = item["word"]
            entity_tag = item["entity"]

            if entity_tag == B_ENTITY:
                # A new entity starts: flush any entity in progress.
                if current_entity_tokens:
                    extracted_entities.append(self._combine_token(current_entity_tokens))
                current_entity_tokens = [word]
            elif entity_tag == I_ENTITY and current_entity_tokens:
                current_entity_tokens.append(word)
            else:
                # Tag outside the target entity ends any run in progress.
                if current_entity_tokens:
                    extracted_entities.append(self._combine_token(current_entity_tokens))
                current_entity_tokens = []

        if current_entity_tokens:
            extracted_entities.append(self._combine_token(current_entity_tokens))

        return extracted_entities

    @staticmethod
    def _combine_token(tokens: list[str]) -> str:
        """Combine word-piece tokens into one string, merging '##' continuations.

        Bug fix: the original definition took only `tokens` without `self`
        or @staticmethod, so `self._combine_token(tokens)` bound the
        instance to `tokens` and crashed on iteration.

        Args:
            tokens (list[str]): List of tokens to combine.

        Returns:
            str: Combined string of tokens.
        """
        if not tokens:
            return ""

        words = []

        for token in tokens:
            stripped = token.strip("#")
            if stripped != token:
                # A '##'-prefixed word-piece continues the previous word.
                if words:
                    words[-1] += stripped
                else:
                    words.append(stripped)
            else:
                words.append(token)

        return " ".join(words)
app/ner/utils/__init__.py ADDED
File without changes
utils/life_span.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from contextlib import asynccontextmanager

from fastapi import FastAPI


@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan handler: load the NER model on startup, drop it on shutdown."""
    # Startup phase: build the shared NER model and attach it to app state.
    from app.ner.services.ner import NER

    model: NER = NER()
    model.load_model()
    app.state.ner = model
    print("NER model loaded successfully.")

    yield

    # Shutdown phase: release the model reference.
    print("Cleaning up NER model...")
    del app.state.ner