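# NOTE: the following lines are residue from the Hugging Face file-view page, not program text.
# 5m4ck3r's picture
# Added translator
# 6d85c23 verified
# raw
# history blame
# 22.2 kB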
import gradio as gr
from transformers import pipeline
import openpyxl
import tempfile
import pycountry
from deep_translator import GoogleTranslator
from langdetect import detect
# Zero-shot classification pipeline, loaded once at import time and stored in
# the module-level name ``classifier``.
_CLASSIFIER_MODEL = "LogicSpine/address-large-text-classifier"
classifier = pipeline("zero-shot-classification", model=_CLASSIFIER_MODEL)
def translate_text(text: str):
    """Return ``text`` in English, translating it only when necessary.

    The text is stripped first. Empty text, text that langdetect identifies
    as English, and text whose language cannot be detected are returned
    (stripped) without contacting the translation service.

    Args:
        text (str): Raw text to normalise to English.

    Returns:
        The stripped text, or the result of ``GoogleTranslator.translate``.
    """
    text = text.strip()
    if not text:
        # langdetect raises on input without any features; nothing to do anyway.
        return text

    # Local import so this block does not depend on the file-level import list.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        detected_lang = detect(text)
    except LangDetectException:
        # Raised for text with no detectable features (e.g. digits or
        # punctuation only); fall back to the untranslated text instead of
        # aborting the caller.
        return text
    if detected_lang == 'en':
        return text

    # Build the translator only when a translation is actually required.
    translator = GoogleTranslator(source='auto', target='en')
    return translator.translate(text)
# Extra names accepted in addition to the names pycountry provides
# (the original code called these "old country names").
_EXTRA_COUNTRY_NAMES = (
    "burma",
    "ceylon",
    "persia",
    "zaire",
    "upper volta",
    "swaziland",
    "macedonia",
    "czech republic",
    "turkey",
    "holland",
    "kampuchea",
    "dahomey",
    "bechuanaland",
    "gold coast",
    "nyasaland",
    "korea",
    "russia",
    "usa",
    "uk",
)
_COUNTRY_NAME_CACHE = None


def _known_country_names():
    """Return the lower-cased country-name lookup set, building it on first use."""
    global _COUNTRY_NAME_CACHE
    if _COUNTRY_NAME_CACHE is None:
        names = {country.name.lower() for country in pycountry.countries}
        names.update(_EXTRA_COUNTRY_NAMES)
        _COUNTRY_NAME_CACHE = frozenset(names)
    return _COUNTRY_NAME_CACHE


def check_for_third(address: str) -> bool:
    """Tell whether any comma-separated part of ``address`` is a country name.

    Each part is stripped and lower-cased, then compared for exact equality
    against pycountry's country names plus a fixed list of extra names.

    Args:
        address (str): Address text, optionally containing commas.

    Returns:
        bool: True if at least one part equals a known country name.
    """
    # The lookup set is built once and reused: this function is called many
    # times per spreadsheet row and the pycountry list never changes.
    countries = _known_country_names()
    # str.split returns the whole string as a single part when there is no
    # comma, so no separate branch is needed for that case.
    return any(part.strip().lower() in countries for part in address.split(","))
def check_for_first(address: str) -> bool:
    """Tell whether ``address`` mentions a school, laboratory or department.

    The match is a case-insensitive substring search.

    Args:
        address (str): Address text to inspect.

    Returns:
        bool: True if any of the keywords occurs in the address.
    """
    lowered = address.lower()
    return any(marker in lowered for marker in ("school", "laboratory", "department"))
def check_for_second(address: str) -> bool:
    """Tell whether ``address`` mentions a university.

    The match is a case-insensitive substring search.

    Args:
        address (str): Address text to inspect.

    Returns:
        bool: True if the keyword occurs in the address.
    """
    haystack = address.lower().strip()
    return any(marker in haystack for marker in ("university",))
def compaire_two(bigger: str, smaller: str, mid: int) -> bool:
    """Decide whether ``bigger`` fits position ``mid`` better than ``smaller``.

    A keyword match on ``bigger`` wins immediately. Otherwise both texts are
    scored by the zero-shot classifier against the labels of the requested
    position and the summed scores are compared.

    Args:
        bigger (str): Candidate address that is tested first.
        smaller (str): Address it is compared against.
        mid (int): Position to test: 1 (school/department/laboratory),
            2 (university/polytechnic) or 3 (state/district/country).

    Raises:
        ValueError: If ``mid`` is not 1, 2 or 3.

    Returns:
        bool: True if ``bigger`` has the higher priority for ``mid``.
    """
    if mid == 1:
        if check_for_first(bigger):
            return True
        lab = ["School", "Department", "Laboratory"]
    elif mid == 2:
        if check_for_second(bigger):
            return True
        lab = ["University", "Polytechnic"]
    elif mid == 3:
        if check_for_third(bigger):
            return True
        lab = ["State", "District", "Country"]
    else:
        raise ValueError(f"Invalid value passed in mid : {mid}")
    # multi_label=True scores every label independently. In the pipeline's
    # default single-label mode the scores are normalised to sum to 1, which
    # made the two sums below always (almost) equal and the comparison
    # meaningless.
    sb = classifier(bigger, lab, multi_label=True)
    ss = classifier(smaller, lab, multi_label=True)
    return sum(sb["scores"]) > sum(ss["scores"])
def get_ai_position(address: str) -> int:
    """Pick the position (1, 2 or 3) that ``address`` most likely belongs to.

    Keyword checks are tried first, in order 1, 2, 3. If none matches, the
    zero-shot classifier scores the address against each position's labels
    and the position with the best-scoring label wins; ties go to the lower
    position number.

    Args:
        address (str): Address text to classify.

    Returns:
        int: 1 (school/department/laboratory), 2 (university/polytechnic)
        or 3 (state/district/country).
    """
    if check_for_first(address):
        return 1
    if check_for_second(address):
        return 2
    if check_for_third(address):
        return 3
    label_groups = (
        (1, ["School", "Department", "Laboratory"]),
        (2, ["University", "Polytechnic"]),
        (3, ["State", "District", "Country"]),
    )
    # multi_label=True yields an independent score per label. In the default
    # single-label mode each call's scores sum to 1, so the previous
    # sum-of-scores comparison could not distinguish the groups.
    # The best label per group is used (not the sum) because the groups have
    # different sizes.
    best_scores = {
        position: max(classifier(address, labels, multi_label=True)["scores"])
        for position, labels in label_groups
    }
    # max() returns the first key holding the maximum, so ties favour the
    # lower position, matching the original check order.
    return max(best_scores, key=best_scores.get)
def compare_by_mid(bigger: int, smaller: int, address: str, threshold: float = 0.1) -> bool:
    """Decide whether ``address`` belongs to position ``bigger`` rather than ``smaller``.

    A keyword match for ``bigger`` wins immediately. Otherwise the zero-shot
    classifier scores the address against both positions' labels and the
    best label score of each side is compared.

    Args:
        bigger (int): Preferred position: 1, 2, or anything else for 3.
        smaller (int): Competing position: 1, 2, or anything else for 3.
        address (str): Address text to classify.
        threshold (float): Margin by which the ``bigger`` score must exceed
            the ``smaller`` score.

    Returns:
        bool: True if ``address`` fits ``bigger`` better than ``smaller``.
    """
    if bigger == 1:
        if check_for_first(address):
            return True
        bigger_l = ["School", "Department", "Laboratory"]
    elif bigger == 2:
        if check_for_second(address):
            return True
        bigger_l = ["University", "Polytechnic"]
    else:
        if check_for_third(address):
            return True
        bigger_l = ["State", "District", "Country"]
    if smaller == 1:
        smaller_l = ["School", "Department", "Laboratory"]
    elif smaller == 2:
        smaller_l = ["University", "Polytechnic"]
    else:
        smaller_l = ["State", "District", "Country"]
    # multi_label=True scores each label independently, so the best scores of
    # label sets with different sizes can be compared with each other.
    result_bigger = classifier(address, bigger_l, multi_label=True)
    result_smaller = classifier(address, smaller_l, multi_label=True)
    max_bigger = max(result_bigger["scores"])
    max_smaller = max(result_smaller["scores"])
    # The previous code computed max_smaller - max_bigger, i.e. it answered
    # True when the *smaller* position scored higher, which contradicted both
    # the keyword shortcut above and how callers use the result.
    score_difference = max_bigger - max_smaller
    return score_difference > threshold
def find_missing_data(data1: str, data2: str, data3: str, var1: str, var2: str, var3: str) -> str:
    """Report which of the three addresses has not been assigned to any slot.

    Args:
        data1 (str): First input address.
        data2 (str): Second input address.
        data3 (str): Third input address.
        var1 (str): Value currently held by the first slot (may be None).
        var2 (str): Value currently held by the second slot (may be None).
        var3 (str): Value currently held by the third slot (may be None).

    Returns:
        str: The unassigned address; several unassigned addresses joined
        with ``', '`` in input order; or the sentence
        ``"All data has been assigned correctly."`` when nothing is missing.
    """
    variables_filled = {var1, var2, var3}
    # dict.fromkeys drops duplicates while keeping input order, so the joined
    # result no longer depends on set iteration order (which varies between
    # interpreter runs for strings).
    missing_data = [
        item
        for item in dict.fromkeys((data1, data2, data3))
        if item not in variables_filled
    ]
    if missing_data:
        return ', '.join(missing_data)
    return "All data has been assigned correctly."
def _pop_unassigned(pool, assigned, fallback):
    """Remove and return the first item of ``pool`` not in ``assigned``, else ``fallback``."""
    for index, item in enumerate(pool):
        if item not in assigned:
            return pool.pop(index)
    return fallback


def swapper(i1, i2, i3):
    """Reorder three addresses into (first, second, third) using keyword checks.

    Each input is translated to English and tested in this order: country
    name (third slot), school/laboratory/department keyword (first slot),
    university keyword (second slot). Slots still empty afterwards are
    filled with the inputs that have not been placed yet, falling back to
    the input at the same position.

    Args:
        i1: Address currently in the first slot.
        i2: Address currently in the second slot.
        i3: Address currently in the third slot.

    Returns:
        tuple: ``(first, second, third)``.
    """
    first, second, third = None, None, None
    first_candidates = []
    translations = {}

    def in_english(value):
        # translate_text may call a remote service, so every distinct value
        # is translated at most once.
        if value not in translations:
            translations[value] = translate_text(value)
        return translations[value]

    # NOTE(review): the nesting of the re-classification step below was
    # reconstructed from a copy of this file whose indentation was lost;
    # confirm it against the upstream source.
    for data in (i1, i2, i3):
        if check_for_third(in_english(data)):
            if third is None:
                third = data
                continue
            # A later country-like entry takes the third slot; the entry it
            # displaces is classified again for the other two slots.
            third, data = data, third
            if check_for_first(in_english(data)):
                first_candidates.append(data)
            elif check_for_second(in_english(data)):
                # The most recent university-like entry always wins.
                second = data
        elif check_for_first(in_english(data)):
            first_candidates.append(data)
        elif check_for_second(in_english(data)):
            second = data

    if first_candidates:
        first = first_candidates[0]
        # A second first-slot candidate is used for the second slot only
        # when nothing else claimed it.
        if second is None and len(first_candidates) > 1:
            second = first_candidates[1]

    # Fill the slots that are still empty with inputs not placed anywhere.
    remaining_data = [i1, i2, i3]
    if first is None:
        first = _pop_unassigned(remaining_data, (first, second, third), i1)
    if second is None:
        second = _pop_unassigned(remaining_data, (first, second, third), i2)
    if third is None:
        third = _pop_unassigned(remaining_data, (first, second, third), i3)
    return first, second, third
def settle_all_address(address_first: str, address_second: str, address_third: str):
    """Assign three addresses to slots 1, 2 and 3 and return them reordered.

    Steps, as implemented below:
      1. Keyword pass: the first address matching check_for_first /
         check_for_second / check_for_third is stored in r_add1 / r_add2 /
         r_add3 respectively.
      2. Duplicate pass: if two slots hold the same value, the unassigned
         address is looked up with find_missing_data and compaire_two decides
         which of the two clashing slots receives it.
      3. Fill pass: if any slot is still None, get_ai_position and
         compare_by_mid are used to fill the remaining slots.
      4. The result is passed through swapper.

    NOTE(review): the block nesting was reconstructed from a copy of this
    file whose indentation was lost - confirm against the upstream source.

    Args:
        address_first (str): First address of the row.
        address_second (str): Second address of the row.
        address_third (str): Third address of the row.

    Returns:
        The ``(first, second, third)`` tuple returned by ``swapper``.
    """
    address_1 = address_first
    address_2 = address_second
    address_3 = address_third
    # Slot candidates; None means "not decided yet".
    r_add1 = None
    r_add2 = None
    r_add3 = None
    # Slot 1: first address that passes check_for_first
    if check_for_first(address_first):
        r_add1 = address_first
    elif check_for_first(address_second):
        r_add1 = address_second
    elif check_for_first(address_third):
        r_add1 = address_third
    # Slot 2: first address that passes check_for_second
    if check_for_second(address_first):
        r_add2 = address_first
    elif check_for_second(address_second):
        r_add2 = address_second
    elif check_for_second(address_third):
        r_add2 = address_third
    # Slot 3: first address that passes check_for_third
    if check_for_third(address_first):
        r_add3 = address_first
    elif check_for_third(address_second):
        r_add3 = address_second
    elif check_for_third(address_third):
        r_add3 = address_third
    # This condition is also true when two or more slots are still None.
    if r_add1 == r_add2 or r_add1 == r_add3 or r_add2 == r_add3:
        # Duplicate data found, perform the comparison here
        if r_add1 == r_add2 == r_add3:
            # All three slots agree (including the all-None case): start over.
            r_add1 = None
            r_add2 = None
            r_add3 = None
        else:
            # NOTE(review): find_missing_data can return two addresses joined
            # with ', ' (when the third slot is still None) or its
            # "All data has been assigned correctly." sentence; m_add is then
            # stored as if it were a single address - confirm this is intended.
            if r_add1 == r_add2 and r_add1 != None: # Slots 1 and 2 hold the same address: let compaire_two decide
                m_add = find_missing_data(address_1, address_2, address_3, r_add1, r_add2, r_add3) # Address not held by any slot
                if compaire_two(m_add, r_add2, 1):
                    r_add2 = m_add
                else:
                    r_add1 = m_add
            elif r_add1 == r_add3 and r_add1 != None:
                m_add = find_missing_data(address_1, address_2, address_3, r_add1, r_add2, r_add3) # Address not held by any slot
                if compaire_two(m_add, r_add3, 1):
                    r_add1 = m_add
                else:
                    r_add3 = m_add
            elif r_add2 == r_add3 and r_add2 != None:
                m_add = find_missing_data(address_1, address_2, address_3, r_add1, r_add2, r_add3) # Address not held by any slot
                if compaire_two(m_add, r_add3, 3):
                    r_add3 = m_add
                else:
                    r_add2 = m_add
    if r_add1 == None or r_add2 == None or r_add3 == None:
        # If any slot is still None, work out the positions with the classifier helpers
        ai_position1 = get_ai_position(address_1)
        ai_position2 = get_ai_position(address_2)
        ai_position3 = get_ai_position(address_3)
        # --- Pass driven by the predicted position of address_1 ---
        if ai_position1 == 3:
            # Keep an existing slot-3 value; otherwise address_1 takes it.
            if r_add3:
                pass
            else:
                r_add3 = address_1
            if r_add1 == None or r_add2 == None:
                # Split the two addresses not in slot 3 between slots 1 and 2.
                if r_add3 == address_1:
                    if compare_by_mid(1, 2, address_2):
                        r_add1 = address_2
                        r_add2 = address_3
                    else:
                        r_add1 = address_3
                        r_add2 = address_2
                elif r_add3 == address_2:
                    if compare_by_mid(1, 2, address_1):
                        r_add1 = address_1
                        r_add2 = address_3
                    else:
                        r_add1 = address_3
                        r_add2 = address_1
                elif r_add3 == address_3:
                    if compare_by_mid(1, 2, address_1):
                        r_add1 = address_1
                        r_add2 = address_2
                    else:
                        r_add1 = address_2
                        r_add2 = address_1
        elif ai_position1 == 2:
            # Keep an existing slot-2 value; otherwise address_1 takes it.
            if r_add2:
                pass
            else:
                r_add2 = address_1
            if r_add1 == None or r_add3 == None:
                # Split the two addresses not in slot 2 between slots 1 and 3.
                if r_add2 == address_1:
                    if compare_by_mid(1, 3, address_2):
                        r_add1 = address_2
                        r_add3 = address_3
                    else:
                        r_add1 = address_3
                        r_add3 = address_2
                elif r_add2 == address_2:
                    if compare_by_mid(1, 3, address_1):
                        r_add1 = address_1
                        r_add3 = address_3
                    else:
                        r_add1 = address_3
                        r_add3 = address_1
                elif r_add2 == address_3:
                    if compare_by_mid(1, 3, address_1):
                        r_add1 = address_1
                        r_add3 = address_2
                    else:
                        r_add1 = address_2
                        r_add3 = address_1
        else:
            # Any other predicted position is treated as slot 1.
            if r_add1:
                pass
            else:
                r_add1 = address_1
            if r_add2 == None or r_add3 == None:
                # Split the two addresses not in slot 1 between slots 2 and 3.
                if r_add1 == address_1:
                    if compare_by_mid(2, 3, address_2):
                        r_add2 = address_2
                        r_add3 = address_3
                    else:
                        r_add2 = address_3
                        r_add3 = address_2
                elif r_add1 == address_2:
                    if compare_by_mid(2, 3, address_1):
                        r_add2 = address_1
                        r_add3 = address_3
                    else:
                        r_add2 = address_3
                        r_add3 = address_1
                elif r_add1 == address_3:
                    if compare_by_mid(2, 3, address_1):
                        r_add2 = address_1
                        r_add3 = address_2
                    else:
                        r_add2 = address_2
                        r_add3 = address_1
        # --- Pass driven by the predicted position of address_2 ---
        # Same structure as above; it only acts on slots that are still None.
        if ai_position2 == 3:
            if r_add3:
                pass
            else:
                r_add3 = address_2
            if r_add1 == None or r_add2 == None:
                if r_add3 == address_1:
                    if compare_by_mid(1, 2, address_2):
                        r_add1 = address_2
                        r_add2 = address_3
                    else:
                        r_add1 = address_3
                        r_add2 = address_2
                elif r_add3 == address_2:
                    if compare_by_mid(1, 2, address_1):
                        r_add1 = address_1
                        r_add2 = address_3
                    else:
                        r_add1 = address_3
                        r_add2 = address_1
                elif r_add3 == address_3:
                    if compare_by_mid(1, 2, address_1):
                        r_add1 = address_1
                        r_add2 = address_2
                    else:
                        r_add1 = address_2
                        r_add2 = address_1
        elif ai_position2 == 2:
            if r_add2:
                pass
            else:
                r_add2 = address_2
            if r_add1 == None or r_add3 == None:
                if r_add2 == address_1:
                    if compare_by_mid(1, 3, address_2):
                        r_add1 = address_2
                        r_add3 = address_3
                    else:
                        r_add1 = address_3
                        r_add3 = address_2
                elif r_add2 == address_2:
                    if compare_by_mid(1, 3, address_1):
                        r_add1 = address_1
                        r_add3 = address_3
                    else:
                        r_add1 = address_3
                        r_add3 = address_1
                elif r_add2 == address_3:
                    if compare_by_mid(1, 3, address_1):
                        r_add1 = address_1
                        r_add3 = address_2
                    else:
                        r_add1 = address_2
                        r_add3 = address_1
        else:
            if r_add1:
                pass
            else:
                r_add1 = address_2
            if r_add2 == None or r_add3 == None:
                if r_add1 == address_1:
                    if compare_by_mid(2, 3, address_2):
                        r_add2 = address_2
                        r_add3 = address_3
                    else:
                        r_add2 = address_3
                        r_add3 = address_2
                elif r_add1 == address_2:
                    if compare_by_mid(2, 3, address_1):
                        r_add2 = address_1
                        r_add3 = address_3
                    else:
                        r_add2 = address_3
                        r_add3 = address_1
                elif r_add1 == address_3:
                    if compare_by_mid(2, 3, address_1):
                        r_add2 = address_1
                        r_add3 = address_2
                    else:
                        r_add2 = address_2
                        r_add3 = address_1
        # --- Pass driven by the predicted position of address_3 ---
        if ai_position3 == 3:
            if r_add3:
                pass
            else:
                r_add3 = address_3
            if r_add1 == None or r_add2 == None:
                if r_add3 == address_1:
                    if compare_by_mid(1, 2, address_2):
                        r_add1 = address_2
                        r_add2 = address_3
                    else:
                        r_add1 = address_3
                        r_add2 = address_2
                elif r_add3 == address_2:
                    if compare_by_mid(1, 2, address_1):
                        r_add1 = address_1
                        r_add2 = address_3
                    else:
                        r_add1 = address_3
                        r_add2 = address_1
                elif r_add3 == address_3:
                    if compare_by_mid(1, 2, address_1):
                        r_add1 = address_1
                        r_add2 = address_2
                    else:
                        r_add1 = address_2
                        r_add2 = address_1
        elif ai_position3 == 2:
            if r_add2:
                pass
            else:
                r_add2 = address_3
            if r_add1 == None or r_add3 == None:
                if r_add2 == address_1:
                    if compare_by_mid(1, 3, address_2):
                        r_add1 = address_2
                        r_add3 = address_3
                    else:
                        r_add1 = address_3
                        r_add3 = address_2
                elif r_add2 == address_2:
                    if compare_by_mid(1, 3, address_1):
                        r_add1 = address_1
                        r_add3 = address_3
                    else:
                        r_add1 = address_3
                        r_add3 = address_1
                elif r_add2 == address_3:
                    if compare_by_mid(1, 3, address_1):
                        r_add1 = address_1
                        r_add3 = address_2
                    else:
                        r_add1 = address_2
                        r_add3 = address_1
        else:
            if r_add1:
                pass
            else:
                r_add1 = address_3
            if r_add2 == None or r_add3 == None:
                if r_add1 == address_1:
                    if compare_by_mid(2, 3, address_2):
                        r_add2 = address_2
                        r_add3 = address_3
                    else:
                        r_add2 = address_3
                        r_add3 = address_2
                elif r_add1 == address_2:
                    if compare_by_mid(2, 3, address_1):
                        r_add2 = address_1
                        r_add3 = address_3
                    else:
                        r_add2 = address_3
                        r_add3 = address_1
                elif r_add1 == address_3:
                    if compare_by_mid(2, 3, address_1):
                        r_add2 = address_1
                        r_add3 = address_2
                    else:
                        r_add2 = address_2
                        r_add3 = address_1
    # Final keyword-based reordering of the three chosen values.
    return swapper(r_add1, r_add2, r_add3)
def process_file(filepath: str):
    """Reorder the address columns of an Excel sheet and save the result.

    The active sheet of ``filepath`` is copied into a new workbook. Row 1 is
    copied as-is. For every later row, columns 4, 5 and 6 are passed to
    ``settle_all_address`` and replaced by its result when all three cells
    hold a value; otherwise those three cells are left empty in the output.
    Processing stops after more than three consecutive rows without a
    complete set of addresses.

    Args:
        filepath (str): Path of the workbook to read.

    Returns:
        str: Path of the temporary ``.xlsx`` file that was written. The file
        is not deleted automatically.
    """
    wb = openpyxl.load_workbook(filepath, data_only=True)
    ws = wb.active
    new_wb = openpyxl.Workbook()
    new_ws = new_wb.active
    # Only the path is needed. Closing the handle right away avoids leaking
    # it and lets the workbook be saved to that path on platforms that
    # refuse to reopen a file that is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as temp_file:
        temp_path = temp_file.name
    columns_to_process = (4, 5, 6)
    column_numbers = range(1, ws.max_column + 1)

    # Header row is copied unchanged.
    for col in column_numbers:
        new_ws.cell(row=1, column=col).value = ws.cell(row=1, column=col).value

    empty_rows = 0
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row):
        if empty_rows > 3:
            break
        row_num = row[0].row
        for col in column_numbers:
            if col not in columns_to_process:
                new_ws.cell(row=row_num, column=col).value = ws.cell(row=row_num, column=col).value
            else:
                # Address cells are filled below, once reordered.
                new_ws.cell(row=row_num, column=col).value = None
        address_first = ws.cell(row=row_num, column=4).value
        address_second = ws.cell(row=row_num, column=5).value
        address_third = ws.cell(row=row_num, column=6).value
        if address_first is not None and address_second is not None and address_third is not None:
            ad1, ad2, ad3 = settle_all_address(address_first, address_second, address_third)
            new_ws.cell(row=row_num, column=4).value = ad1
            new_ws.cell(row=row_num, column=5).value = ad2
            new_ws.cell(row=row_num, column=6).value = ad3
            print(f"Adding : {ad1} | {ad2} | {ad3}")
            # Count only *consecutive* incomplete rows: a cumulative count
            # would silently drop the rest of a sheet that merely contains a
            # few scattered incomplete rows.
            empty_rows = 0
        else:
            empty_rows += 1
    new_wb.save(temp_path)
    return temp_path
def gradio_process(file):
    """Gradio callback: process the uploaded workbook and return the output path.

    Args:
        file: Uploaded file object; its ``name`` attribute is used as the
            path of the workbook to read.

    Returns:
        The path returned by ``process_file``.
    """
    return process_file(file.name)
# Gradio UI: one file upload in, one file download out, served by
# gradio_process. The app is launched as soon as this module is executed.
iface = gr.Interface(
    gradio_process,
    gr.File(),
    gr.File(),
    title="AI Address Processor",
    description="Upload an Excel file, and the AI will process the addresses.",
)
iface.launch()