Spaces:
Running
Running
Commit
·
82732e8
1
Parent(s):
eda6ed8
Matching should now work correctly when the input consists of only a full-address column and a postcode column.
Browse files
- tools/matcher_funcs.py +2 -2
- tools/preparation.py +14 -7
tools/matcher_funcs.py
CHANGED
@@ -710,7 +710,7 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
|
|
710 |
output_files.append(final_api_output_file_name)
|
711 |
|
712 |
if InitMatch.search_df.empty or InitMatch.ref_df.empty:
|
713 |
-
out_message = "Nothing to match!"
|
714 |
print(out_message)
|
715 |
|
716 |
output_files.extend([InitMatch.results_orig_df_name, InitMatch.match_outputs_name])
|
@@ -1352,7 +1352,7 @@ def full_nn_match(ref_address_cols:List[str],
|
|
1352 |
|
1353 |
# Break if search item has length 0
|
1354 |
if search_df.empty:
|
1355 |
-
out_error = "Nothing to match!"
|
1356 |
print(out_error)
|
1357 |
return pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(), out_error, search_address_cols
|
1358 |
|
|
|
710 |
output_files.append(final_api_output_file_name)
|
711 |
|
712 |
if InitMatch.search_df.empty or InitMatch.ref_df.empty:
|
713 |
+
out_message = "Nothing to match! Search data frame or reference data frame are empty"
|
714 |
print(out_message)
|
715 |
|
716 |
output_files.extend([InitMatch.results_orig_df_name, InitMatch.match_outputs_name])
|
|
|
1352 |
|
1353 |
# Break if search item has length 0
|
1354 |
if search_df.empty:
|
1355 |
+
out_error = "Nothing to match! At neural net matching stage."
|
1356 |
print(out_error)
|
1357 |
return pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(), out_error, search_address_cols
|
1358 |
|
tools/preparation.py
CHANGED
@@ -11,6 +11,7 @@ array = List[str]
|
|
11 |
|
12 |
today = datetime.now().strftime("%d%m%Y")
|
13 |
today_rev = datetime.now().strftime("%Y%m%d")
|
|
|
14 |
|
15 |
|
16 |
def prepare_search_address_string(
|
@@ -72,6 +73,12 @@ def prepare_search_address(
|
|
72 |
# Clean address columns
|
73 |
#search_df_polars = pl.from_dataframe(search_df)
|
74 |
clean_addresses = _clean_columns(search_df, address_cols)
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
# Join address columns into one
|
77 |
full_addresses = _join_address(clean_addresses, address_cols)
|
@@ -255,14 +262,14 @@ def check_no_number_addresses(df, in_address_series) -> PandasSeries:
|
|
255 |
|
256 |
return df
|
257 |
|
258 |
-
def remove_postcode(df, col:str) -> PandasSeries:
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
|
265 |
-
|
266 |
|
267 |
def extract_street_name(address:str) -> str:
|
268 |
"""
|
|
|
11 |
|
12 |
today = datetime.now().strftime("%d%m%Y")
|
13 |
today_rev = datetime.now().strftime("%Y%m%d")
|
14 |
+
from tools.standardise import remove_postcode
|
15 |
|
16 |
|
17 |
def prepare_search_address_string(
|
|
|
73 |
# Clean address columns
|
74 |
#search_df_polars = pl.from_dataframe(search_df)
|
75 |
clean_addresses = _clean_columns(search_df, address_cols)
|
76 |
+
|
77 |
+
# If there is a full address and postcode column in the addresses, clean any postcodes from the first column
|
78 |
+
if len(address_cols) == 2:
|
79 |
+
# Remove postcode from address
|
80 |
+
address_series = remove_postcode(clean_addresses, address_cols[0])
|
81 |
+
clean_addresses[address_cols[0]] = address_series
|
82 |
|
83 |
# Join address columns into one
|
84 |
full_addresses = _join_address(clean_addresses, address_cols)
|
|
|
262 |
|
263 |
return df
|
264 |
|
265 |
+
# def remove_postcode(df, col:str) -> PandasSeries:
|
266 |
+
# '''
|
267 |
+
# Remove a postcode from a string column in a dataframe
|
268 |
+
# '''
|
269 |
+
# address_series_no_pcode = df[col].str.upper().str.replace(\
|
270 |
+
# "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
|
271 |
|
272 |
+
# return address_series_no_pcode
|
273 |
|
274 |
def extract_street_name(address:str) -> str:
|
275 |
"""
|