seanpedrickcase committed on
Commit
82732e8
·
1 Parent(s): eda6ed8

Should now match correctly when the input consists of just a full-address column and a postcode column

Browse files
Files changed (2) hide show
  1. tools/matcher_funcs.py +2 -2
  2. tools/preparation.py +14 -7
tools/matcher_funcs.py CHANGED
@@ -710,7 +710,7 @@ def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame
710
  output_files.append(final_api_output_file_name)
711
 
712
  if InitMatch.search_df.empty or InitMatch.ref_df.empty:
713
- out_message = "Nothing to match!"
714
  print(out_message)
715
 
716
  output_files.extend([InitMatch.results_orig_df_name, InitMatch.match_outputs_name])
@@ -1352,7 +1352,7 @@ def full_nn_match(ref_address_cols:List[str],
1352
 
1353
  # Break if search item has length 0
1354
  if search_df.empty:
1355
- out_error = "Nothing to match!"
1356
  print(out_error)
1357
  return pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(), out_error, search_address_cols
1358
 
 
710
  output_files.append(final_api_output_file_name)
711
 
712
  if InitMatch.search_df.empty or InitMatch.ref_df.empty:
713
+ out_message = "Nothing to match! Search data frame or reference data frame are empty"
714
  print(out_message)
715
 
716
  output_files.extend([InitMatch.results_orig_df_name, InitMatch.match_outputs_name])
 
1352
 
1353
  # Break if search item has length 0
1354
  if search_df.empty:
1355
+ out_error = "Nothing to match! At neural net matching stage."
1356
  print(out_error)
1357
  return pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(),pd.DataFrame(), out_error, search_address_cols
1358
 
tools/preparation.py CHANGED
@@ -11,6 +11,7 @@ array = List[str]
11
 
12
  today = datetime.now().strftime("%d%m%Y")
13
  today_rev = datetime.now().strftime("%Y%m%d")
 
14
 
15
 
16
  def prepare_search_address_string(
@@ -72,6 +73,12 @@ def prepare_search_address(
72
  # Clean address columns
73
  #search_df_polars = pl.from_dataframe(search_df)
74
  clean_addresses = _clean_columns(search_df, address_cols)
 
 
 
 
 
 
75
 
76
  # Join address columns into one
77
  full_addresses = _join_address(clean_addresses, address_cols)
@@ -255,14 +262,14 @@ def check_no_number_addresses(df, in_address_series) -> PandasSeries:
255
 
256
  return df
257
 
258
- def remove_postcode(df, col:str) -> PandasSeries:
259
- '''
260
- Remove a postcode from a string column in a dataframe
261
- '''
262
- address_series_no_pcode = df[col].str.upper().str.replace(\
263
- "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
264
 
265
- return address_series_no_pcode
266
 
267
  def extract_street_name(address:str) -> str:
268
  """
 
11
 
12
  today = datetime.now().strftime("%d%m%Y")
13
  today_rev = datetime.now().strftime("%Y%m%d")
14
+ from tools.standardise import remove_postcode
15
 
16
 
17
  def prepare_search_address_string(
 
73
  # Clean address columns
74
  #search_df_polars = pl.from_dataframe(search_df)
75
  clean_addresses = _clean_columns(search_df, address_cols)
76
+
77
+ # If there is a full address and postcode column in the addresses, clean any postcodes from the first column
78
+ if len(address_cols) == 2:
79
+ # Remove postcode from address
80
+ address_series = remove_postcode(clean_addresses, address_cols[0])
81
+ clean_addresses[address_cols[0]] = address_series
82
 
83
  # Join address columns into one
84
  full_addresses = _join_address(clean_addresses, address_cols)
 
262
 
263
  return df
264
 
265
+ # def remove_postcode(df, col:str) -> PandasSeries:
266
+ # '''
267
+ # Remove a postcode from a string column in a dataframe
268
+ # '''
269
+ # address_series_no_pcode = df[col].str.upper().str.replace(\
270
+ # "\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2}|GIR ?0A{2})\\b$|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$|\\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\\b$","", regex=True).str.lower()
271
 
272
+ # return address_series_no_pcode
273
 
274
  def extract_street_name(address:str) -> str:
275
  """