File size: 2,714 Bytes
9312707
 
 
 
 
4101236
9312707
 
 
 
 
 
 
 
e029c8d
9312707
 
 
 
 
 
 
4101236
 
9312707
 
 
 
 
4101236
9312707
 
 
 
 
 
4101236
 
 
 
9312707
 
 
 
4101236
9312707
 
 
 
 
 
 
 
4101236
 
 
 
 
 
 
 
 
 
9312707
 
4101236
9312707
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from cnocr import CnOcr
import openai
from dotenv import load_dotenv
import os
import json
import checkTool

# OCR fragments that carry no information and are dropped before prompting.
_OCR_NOISE = (' ', ',')

# Keys model0 reads from the model's reply, in lowercase (lookup is
# case-insensitive because the system prompt tells the model case is irrelevant).
_REQUIRED_FIELDS = ("name", "date of birth", "date of issue", "hkid number")

_SYSTEM_PROMPT = (
    "You are an AI assistant for extracting data from HKID card with following information "
    "(name, date of birth, date of issue, HKID number) from HKID card. "
    "Uppercase and lowercase letters are the same. Store the results in dictionary format"
)


def _build_user_prompt(texts):
    """Return the extraction instructions for the OCR text fragments *texts*."""
    return (
        f"Extract data from the following set of text: {texts}. "
        "You have four types of data to extract. "
        "1. id card holder full name (it normally is a chinese name, including surname and given "
        "name in English spelling, and it may be separate in different fields in the data set for "
        "surname and given name sometimes) "
        "2. date of birth (should be a date with year, month and day, e.g. 23-02-2003 is the "
        "required format, but 26-11 is not because date of birth should have 10 characters) "
        "Only choose valid format!!! "
        "3. date of issue (a string with format xx-xx) "
        "4. HKID number (The standard format of HKID number is @123456(#) e.g. A123456(7) is a "
        "valid HKID number. "
        "(a) @ represents any one or two capital letters of the alphabet. "
        "(b) # is the check digit which has 11 possible values from 0 to 9 and A.) "
        "Remember to include the check digit with () "
        "Only reply a dictionary. No need to add other words or explanation. "
        "Use double quote for dictionary. "
        # Pin the key names: model0 indexes the reply by exactly these fields.
        'Use exactly these keys: "name", "date of birth", "date of issue", "HKID number".'
    )


def _parse_id_reply(reply):
    """Parse the model's reply into a dict, tolerating text around the JSON object.

    Raises:
        json.JSONDecodeError: if no valid JSON can be decoded from the reply.
        TypeError: if the decoded JSON is not an object.
    """
    # Chat models sometimes wrap the object in a Markdown code fence or add a
    # sentence around it; keep only the outermost {...} span when one exists.
    start = reply.find("{")
    end = reply.rfind("}")
    if start != -1 and end > start:
        reply = reply[start:end + 1]

    parsed = json.loads(reply)
    if not isinstance(parsed, dict):
        raise TypeError(
            f"expected a JSON object from the model, got {type(parsed).__name__}"
        )
    return parsed


def _required_fields(id_data):
    """Return the values of _REQUIRED_FIELDS from *id_data*, matching keys case-insensitively.

    Raises:
        KeyError: if any required field is absent from the reply.
    """
    by_lower_key = {str(key).strip().lower(): value for key, value in id_data.items()}
    missing = [field for field in _REQUIRED_FIELDS if field not in by_lower_key]
    if missing:
        raise KeyError(f"model reply is missing field(s): {', '.join(missing)}")
    return [by_lower_key[field] for field in _REQUIRED_FIELDS]


def model0(path):
    """Extract the holder's details from an HKID card image via OCR and a chat model.

    The image is read with CnOcr, the recognised text fragments are sent to
    ``gpt-3.5-turbo`` with extraction instructions, and the JSON reply is parsed.

    Args:
        path: image input forwarded unchanged to ``CnOcr.ocr``.

    Returns:
        A list ``[name, valid_hkid, hkid, issuedate, dateofbirth]`` where
        ``name`` is the result of ``checkTool.seperate_name`` on the extracted
        name and ``valid_hkid`` is the string ``'True'`` or ``'False'``
        depending on the truthiness of ``checkTool.validate_hkid``.

    Raises:
        json.JSONDecodeError: if the model's reply contains no valid JSON.
        TypeError: if the reply decodes to something other than an object.
        KeyError: if a required field is missing from the reply.
    """
    ocr = CnOcr(rec_model_name='en_PP-OCRv3')
    out = ocr.ocr(path)

    # NOTE(review): the prints in this function write recognised card contents
    # (personal data) to stdout; kept for parity with existing logs, but
    # consider routing them through a logger that can be silenced.
    print(out)

    load_dotenv()
    openai.api_key = os.environ.get("data-extraction-api")

    data_set_1 = [item['text'] for item in out if item['text'] not in _OCR_NOISE]

    print(f'All data here: {data_set_1}')

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        temperature=0,  # keep the extraction as repeatable as the API allows
        messages=[
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": _build_user_prompt(data_set_1)},
        ],
    )

    data = completion['choices'][0]['message']['content']

    print(data)

    id_data = _parse_id_reply(data)
    name, dateofbirth, issuedate, hkid = _required_fields(id_data)

    # Callers receive the validity flag as a string, not a bool.
    valid_hkid = 'True' if checkTool.validate_hkid(hkid=hkid) else 'False'
    name = checkTool.seperate_name(name)

    print(id_data)
    return [name, valid_hkid, hkid, issuedate, dateofbirth]