hf-similarity-check / similarity_check.py
marcotam's picture
Upload 2 files
9687104
raw
history blame
2.26 kB
from model0 import model0
import checkTool as ct
import extract_pdf as pf
import extraction_data as ed
import get_chinese_name as cn
import search_engine as se
import get_chinese_code as cc
# Name-similarity helper: scores how closely two name strings match (0-100)
def string_similarity(s1, s2):
    """Score how alike two strings are on a 0-100 scale via Levenshtein distance.

    Spaces are removed and both strings are lower-cased before comparing.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        100.0 when the normalized strings are identical; otherwise
        ``(1 - distance / longer_length) * 100`` rounded to one decimal place.
    """
    left = s1.replace(' ', '').lower()
    right = s2.replace(' ', '').lower()
    if left == right:
        return 100.0

    # Rolling-row edit-distance table: `previous` is the row for the prefix of
    # `left` one character shorter than the one currently being processed.
    previous = list(range(len(right) + 1))
    for row, left_char in enumerate(left, start=1):
        current = [row]
        for col, right_char in enumerate(right, start=1):
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            substitution = previous[col - 1] + (0 if left_char == right_char else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current

    distance = previous[-1]
    similarity = (1 - distance / max(len(left), len(right))) * 100
    return round(similarity, 1)
def get_data(img1_path, img2_path):
    """Gather ID-card and bank-document details and score the name match.

    Args:
        img1_path: Path of the ID-card image; passed to ``model0`` and
            ``cc.get_chinese_name``.
        img2_path: Path of the bank document; passed to
            ``ed.get_info_from_bank``.

    Returns:
        The object returned by ``ed.get_info_from_bank`` (presumably a dict --
        TODO confirm), with these keys added: ``similarity_score``,
        ``name_on_id``, ``hkid``, ``validity``, ``issue_date``,
        ``dateofbirth`` and ``chi_name_id``.
    """
    # Get info from the HKID card. The result is indexed positionally below:
    # [0] name, [1] validity, [2] HKID number, [3] issue date, [4] date of birth.
    cinfo = model0(img1_path)
    # NOTE(review): this writes the extracted ID details to stdout; looks like
    # debugging output -- confirm it is acceptable to emit this data.
    print(cinfo)

    # Get info from the bank document.
    data = ed.get_info_from_bank(img2_path)
    name = data["nameStatement"]

    # Similarity check between the bank-document name and the ID-card name.
    name1 = cinfo[0]
    data["similarity_score"] = string_similarity(name, name1)
    data["name_on_id"] = name1
    data["hkid"] = cinfo[2]
    data["validity"] = cinfo[1]
    data["issue_date"] = cinfo[3]
    data["dateofbirth"] = cinfo[4]

    # Get the Chinese name from the ID-card image.
    data["chi_name_id"] = cc.get_chinese_name(img1_path)
    return data