Spaces:
Running
Running
Commit
·
4300019
1
Parent(s):
8d1cc2b
Some code rearranged. Fixed API call paths for Linux systems
Browse files
- app.py +6 -341
- tools/fuzzy_match.py +6 -2
- tools/matcher_funcs.py +340 -37
app.py
CHANGED
@@ -1,27 +1,20 @@
|
|
1 |
-
# Load in packages, variables for fuzzy matching
|
2 |
import os
|
3 |
from datetime import datetime
|
4 |
from pathlib import Path
|
5 |
-
import time
|
6 |
-
import copy
|
7 |
import gradio as gr
|
8 |
-
import
|
9 |
-
#import polars as pl
|
10 |
|
11 |
-
from tools.
|
12 |
-
from tools.matcher_funcs import load_matcher_data, run_match_batch, combine_two_matches, create_match_summary
|
13 |
from tools.gradio import initial_data_load
|
14 |
from tools.aws_functions import load_data_from_aws
|
15 |
-
from tools.preparation import prepare_search_address_string, prepare_search_address, prepare_ref_address, remove_non_postal, check_no_number_addresses
|
16 |
-
from tools.standardise import standardise_wrapper_func
|
17 |
|
18 |
import warnings
|
|
|
19 |
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')
|
20 |
warnings.filterwarnings("ignore", 'Downcasting behavior')
|
21 |
warnings.filterwarnings("ignore", 'A value is trying to be set on a copy of a slice from a DataFrame')
|
22 |
warnings.filterwarnings("ignore")
|
23 |
|
24 |
-
|
25 |
today = datetime.now().strftime("%d%m%Y")
|
26 |
today_rev = datetime.now().strftime("%Y%m%d")
|
27 |
|
@@ -32,335 +25,7 @@ output_folder = base_folder/"Output/"
|
|
32 |
diagnostics_folder = base_folder/"Diagnostics/"
|
33 |
prep_folder = base_folder/"Helper functions/"
|
34 |
|
35 |
-
def create_simple_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int):
|
36 |
-
#print("Search df batch size: ", batch_size)
|
37 |
-
#print("ref_df df batch size: ", ref_batch_size)
|
38 |
-
|
39 |
-
total_rows = df.shape[0]
|
40 |
-
ref_total_rows = ref_df.shape[0]
|
41 |
-
|
42 |
-
# Creating bottom and top limits for search data
|
43 |
-
search_ranges = []
|
44 |
-
for start in range(0, total_rows, batch_size):
|
45 |
-
end = min(start + batch_size - 1, total_rows - 1) # Adjusted to get the top limit
|
46 |
-
search_ranges.append((start, end))
|
47 |
-
|
48 |
-
# Creating bottom and top limits for reference data
|
49 |
-
ref_ranges = []
|
50 |
-
for start in range(0, ref_total_rows, ref_batch_size):
|
51 |
-
end = min(start + ref_batch_size - 1, ref_total_rows - 1) # Adjusted to get the top limit
|
52 |
-
ref_ranges.append((start, end))
|
53 |
-
|
54 |
-
# Create DataFrame with combinations of search_range and ref_range
|
55 |
-
result_data = []
|
56 |
-
for search_range in search_ranges:
|
57 |
-
for ref_range in ref_ranges:
|
58 |
-
result_data.append((search_range, ref_range))
|
59 |
-
|
60 |
-
range_df = pd.DataFrame(result_data, columns=['search_range', 'ref_range'])
|
61 |
-
|
62 |
-
return range_df
|
63 |
-
|
64 |
-
|
65 |
-
def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int, search_postcode_col:str, ref_postcode_col:str):
|
66 |
-
'''
|
67 |
-
Create batches of address indexes for search and reference dataframes based on shortened postcodes.
|
68 |
-
'''
|
69 |
-
|
70 |
-
# If df sizes are smaller than the batch size limits, no need to run through everything
|
71 |
-
if len(df) < batch_size and len(ref_df) < ref_batch_size:
|
72 |
-
print("Dataframe sizes are smaller than maximum batch sizes, no need to split data.")
|
73 |
-
lengths_df = pd.DataFrame(data={'search_range':[df.index.tolist()], 'ref_range':[ref_df.index.tolist()], 'batch_length':len(df), 'ref_length':len(ref_df)})
|
74 |
-
return lengths_df
|
75 |
-
|
76 |
-
#df.index = df[search_postcode_col]
|
77 |
-
|
78 |
-
df['index'] = df.index
|
79 |
-
ref_df['index'] = ref_df.index
|
80 |
-
|
81 |
-
# Remove the last character of postcode
|
82 |
-
df['postcode_minus_last_character'] = df[search_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]
|
83 |
-
ref_df['postcode_minus_last_character'] = ref_df[ref_postcode_col].str.lower().str.strip().str.replace("\s+", "", regex=True).str[:-1]
|
84 |
-
|
85 |
-
unique_postcodes = df['postcode_minus_last_character'][df['postcode_minus_last_character'].str.len()>=4].unique().tolist()
|
86 |
-
|
87 |
-
df = df.set_index('postcode_minus_last_character')
|
88 |
-
ref_df = ref_df.set_index('postcode_minus_last_character')
|
89 |
-
|
90 |
-
df = df.sort_index()
|
91 |
-
ref_df = ref_df.sort_index()
|
92 |
-
|
93 |
-
#df.to_csv("batch_search_df.csv")
|
94 |
-
|
95 |
-
# Overall batch variables
|
96 |
-
batch_indexes = []
|
97 |
-
ref_indexes = []
|
98 |
-
batch_lengths = []
|
99 |
-
ref_lengths = []
|
100 |
-
|
101 |
-
# Current batch variables for loop
|
102 |
-
current_batch = []
|
103 |
-
current_ref_batch = []
|
104 |
-
current_batch_length = []
|
105 |
-
current_ref_length = []
|
106 |
-
|
107 |
-
unique_postcodes_iterator = unique_postcodes.copy()
|
108 |
-
|
109 |
-
while unique_postcodes_iterator:
|
110 |
-
|
111 |
-
unique_postcodes_loop = unique_postcodes_iterator.copy()
|
112 |
-
|
113 |
-
#print("Current loop postcodes: ", unique_postcodes_loop)
|
114 |
-
|
115 |
-
for current_postcode in unique_postcodes_loop:
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
if len(current_batch) >= batch_size or len(current_ref_batch) >= ref_batch_size:
|
120 |
-
print("Batch length reached - breaking")
|
121 |
-
break
|
122 |
-
|
123 |
-
try:
|
124 |
-
current_postcode_search_data_add = df.loc[[current_postcode]]#[df['postcode_minus_last_character'].isin(current_postcode)]
|
125 |
-
current_postcode_ref_data_add = ref_df.loc[[current_postcode]]#[ref_df['postcode_minus_last_character'].isin(current_postcode)]
|
126 |
-
|
127 |
-
#print(current_postcode_search_data_add)
|
128 |
-
|
129 |
-
if not current_postcode_search_data_add.empty:
|
130 |
-
current_batch.extend(current_postcode_search_data_add['index'])
|
131 |
-
|
132 |
-
if not current_postcode_ref_data_add.empty:
|
133 |
-
current_ref_batch.extend(current_postcode_ref_data_add['index'])
|
134 |
-
|
135 |
-
except:
|
136 |
-
#print("postcode not found: ", current_postcode)
|
137 |
-
pass
|
138 |
-
|
139 |
-
unique_postcodes_iterator.remove(current_postcode)
|
140 |
-
|
141 |
-
# Append the batch data to the master lists and reset lists
|
142 |
-
batch_indexes.append(current_batch)
|
143 |
-
ref_indexes.append(current_ref_batch)
|
144 |
-
|
145 |
-
current_batch_length = len(current_batch)
|
146 |
-
current_ref_length = len(current_ref_batch)
|
147 |
-
|
148 |
-
batch_lengths.append(current_batch_length)
|
149 |
-
ref_lengths.append(current_ref_length)
|
150 |
-
|
151 |
-
current_batch = []
|
152 |
-
current_ref_batch = []
|
153 |
-
current_batch_length = []
|
154 |
-
current_ref_length = []
|
155 |
-
|
156 |
-
# Create df to store lengths
|
157 |
-
lengths_df = pd.DataFrame(data={'search_range':batch_indexes, 'ref_range':ref_indexes, 'batch_length':batch_lengths, 'ref_length':ref_lengths})
|
158 |
-
|
159 |
-
return lengths_df
|
160 |
-
|
161 |
-
|
162 |
-
def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
|
163 |
-
'''
|
164 |
-
Split search and reference data into batches. Loop and run through the match script for each batch of data.
|
165 |
-
'''
|
166 |
-
|
167 |
-
overall_tic = time.perf_counter()
|
168 |
-
|
169 |
-
# Load in initial data. This will filter to relevant addresses in the search and reference datasets that can potentially be matched, and will pull in API data if asked for.
|
170 |
-
InitMatch = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)
|
171 |
-
|
172 |
-
if InitMatch.search_df.empty or InitMatch.ref_df.empty:
|
173 |
-
out_message = "Nothing to match!"
|
174 |
-
print(out_message)
|
175 |
-
return out_message, [InitMatch.results_orig_df_name, InitMatch.match_outputs_name]
|
176 |
-
|
177 |
-
# Run initial address preparation and standardisation processes
|
178 |
-
# Prepare address format
|
179 |
-
|
180 |
-
# Polars implementation not yet finalised
|
181 |
-
#InitMatch.search_df = pl.from_pandas(InitMatch.search_df)
|
182 |
-
#InitMatch.ref_df = pl.from_pandas(InitMatch.ref_df)
|
183 |
-
|
184 |
-
|
185 |
-
# Prepare all search addresses
|
186 |
-
if type(InitMatch.search_df) == str:
|
187 |
-
InitMatch.search_df_cleaned, InitMatch.search_df_key_field, InitMatch.search_address_cols = prepare_search_address_string(InitMatch.search_df)
|
188 |
-
else:
|
189 |
-
InitMatch.search_df_cleaned = prepare_search_address(InitMatch.search_df, InitMatch.search_address_cols, InitMatch.search_postcode_col, InitMatch.search_df_key_field)
|
190 |
-
|
191 |
-
# Remove addresses that are not postal addresses
|
192 |
-
InitMatch.search_df_cleaned = remove_non_postal(InitMatch.search_df_cleaned, "full_address")
|
193 |
-
|
194 |
-
# Remove addresses that have no numbers in from consideration
|
195 |
-
InitMatch.search_df_cleaned = check_no_number_addresses(InitMatch.search_df_cleaned, "full_address")
|
196 |
-
|
197 |
-
# Initial preparation of reference addresses
|
198 |
-
InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)
|
199 |
-
|
200 |
-
|
201 |
-
# Sort dataframes by postcode - will allow for more efficient matching process if using multiple batches
|
202 |
-
#InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.sort_values(by="postcode")
|
203 |
-
#InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.sort_values(by="Postcode")
|
204 |
-
|
205 |
-
# Polars implementation - not finalised
|
206 |
-
#InitMatch.search_df_cleaned = InitMatch.search_df_cleaned.to_pandas()
|
207 |
-
#InitMatch.ref_df_cleaned = InitMatch.ref_df_cleaned.to_pandas()
|
208 |
-
|
209 |
-
# Standardise addresses
|
210 |
-
# Standardise - minimal
|
211 |
-
|
212 |
-
|
213 |
-
tic = time.perf_counter()
|
214 |
-
InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
|
215 |
-
InitMatch.search_df_cleaned.copy(),
|
216 |
-
InitMatch.ref_df_cleaned.copy(),
|
217 |
-
standardise = False,
|
218 |
-
filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
|
219 |
-
match_task="fuzzy") # InitMatch.search_df_after_stand_series, InitMatch.ref_df_after_stand_series
|
220 |
-
|
221 |
-
toc = time.perf_counter()
|
222 |
-
print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")
|
223 |
-
|
224 |
-
# Standardise - full
|
225 |
-
tic = time.perf_counter()
|
226 |
-
InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
|
227 |
-
InitMatch.search_df_cleaned.copy(),
|
228 |
-
InitMatch.ref_df_cleaned.copy(),
|
229 |
-
standardise = True,
|
230 |
-
filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
|
231 |
-
match_task="fuzzy") # , InitMatch.search_df_after_stand_series_full_stand, InitMatch.ref_df_after_stand_series_full_stand
|
232 |
-
|
233 |
-
toc = time.perf_counter()
|
234 |
-
print(f"Performed the full standardisation step in {toc - tic:0.1f} seconds")
|
235 |
-
|
236 |
-
# Determine length of search df to create batches to send through the functions.
|
237 |
-
#try:
|
238 |
-
range_df = create_batch_ranges(InitMatch.search_df_cleaned.copy(), InitMatch.ref_df_cleaned.copy(), batch_size, ref_batch_size, "postcode", "Postcode")
|
239 |
-
#except:
|
240 |
-
# range_df = create_simple_batch_ranges(InitMatch.search_df_cleaned, InitMatch.ref_df_cleaned, batch_size, #ref_batch_size)
|
241 |
-
|
242 |
-
print("Batches to run in this session: ", range_df)
|
243 |
-
|
244 |
-
OutputMatch = copy.copy(InitMatch)
|
245 |
-
|
246 |
-
n = 0
|
247 |
-
number_of_batches = range_df.shape[0]
|
248 |
-
|
249 |
-
for row in progress.tqdm(range(0,len(range_df)), desc= "Running through batches", unit="batches", total=number_of_batches):
|
250 |
-
print("Running batch ", str(n+1))
|
251 |
-
|
252 |
-
search_range = range_df.iloc[row]['search_range']
|
253 |
-
ref_range = range_df.iloc[row]['ref_range']
|
254 |
-
|
255 |
-
#print("search_range: ", search_range)
|
256 |
-
#pd.DataFrame(search_range).to_csv("search_range.csv")
|
257 |
-
#print("ref_range: ", ref_range)
|
258 |
-
|
259 |
-
BatchMatch = copy.copy(InitMatch)
|
260 |
-
|
261 |
-
# Subset the search and reference dfs based on current batch ranges
|
262 |
-
# BatchMatch.search_df = BatchMatch.search_df.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
|
263 |
-
# BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
|
264 |
-
# BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
|
265 |
-
# BatchMatch.ref_df = BatchMatch.ref_df.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
|
266 |
-
# BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
|
267 |
-
|
268 |
-
|
269 |
-
# BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.iloc[search_range[0]:search_range[1] + 1]
|
270 |
-
# BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.iloc[ref_range[0]:ref_range[1] + 1]
|
271 |
-
# BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.iloc[search_range[0]:search_range[1] + 1]
|
272 |
-
# BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.iloc[ref_range[0]:ref_range[1] + 1]
|
273 |
-
|
274 |
-
# BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
|
275 |
-
# BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
|
276 |
-
# BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand.iloc[search_range[0]:search_range[1] + 1,:].reset_index(drop=True)
|
277 |
-
# BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand.iloc[ref_range[0]:ref_range[1] + 1,:].reset_index(drop=True)
|
278 |
-
|
279 |
-
BatchMatch.search_df = BatchMatch.search_df[BatchMatch.search_df.index.isin(search_range)].reset_index(drop=True)
|
280 |
-
BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
|
281 |
-
BatchMatch.search_df_cleaned = BatchMatch.search_df_cleaned[BatchMatch.search_df_cleaned.index.isin(search_range)].reset_index(drop=True)
|
282 |
-
|
283 |
-
BatchMatch.ref_df = BatchMatch.ref_df[BatchMatch.ref_df.index.isin(ref_range)].reset_index(drop=True)
|
284 |
-
BatchMatch.ref_df_cleaned = BatchMatch.ref_df_cleaned[BatchMatch.ref_df_cleaned.index.isin(ref_range)].reset_index(drop=True)
|
285 |
-
|
286 |
-
# Dataframes after standardisation process
|
287 |
-
BatchMatch.search_df_after_stand = BatchMatch.search_df_after_stand[BatchMatch.search_df_after_stand.index.isin(search_range)].reset_index(drop=True)
|
288 |
-
BatchMatch.search_df_after_full_stand = BatchMatch.search_df_after_full_stand[BatchMatch.search_df_after_full_stand.index.isin(search_range)].reset_index(drop=True)
|
289 |
-
|
290 |
-
### Create lookup lists for fuzzy matches
|
291 |
-
# BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand.copy().set_index('postcode_search')['search_address_stand']
|
292 |
-
# BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_full_stand.copy().set_index('postcode_search')['search_address_stand']
|
293 |
-
# BatchMatch.search_df_after_stand_series = BatchMatch.search_df_after_stand_series.sort_index()
|
294 |
-
# BatchMatch.search_df_after_stand_series_full_stand = BatchMatch.search_df_after_stand_series_full_stand.sort_index()
|
295 |
-
|
296 |
-
#BatchMatch.search_df_after_stand.reset_index(inplace=True, drop = True)
|
297 |
-
#BatchMatch.search_df_after_full_stand.reset_index(inplace=True, drop = True)
|
298 |
-
|
299 |
-
BatchMatch.ref_df_after_stand = BatchMatch.ref_df_after_stand[BatchMatch.ref_df_after_stand.index.isin(ref_range)].reset_index(drop=True)
|
300 |
-
BatchMatch.ref_df_after_full_stand = BatchMatch.ref_df_after_full_stand[BatchMatch.ref_df_after_full_stand.index.isin(ref_range)].reset_index(drop=True)
|
301 |
-
|
302 |
-
# BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand.copy().set_index('postcode_search')['ref_address_stand']
|
303 |
-
# BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_full_stand.copy().set_index('postcode_search')['ref_address_stand']
|
304 |
-
# BatchMatch.ref_df_after_stand_series = BatchMatch.ref_df_after_stand_series.sort_index()
|
305 |
-
# BatchMatch.ref_df_after_stand_series_full_stand = BatchMatch.ref_df_after_stand_series_full_stand.sort_index()
|
306 |
-
|
307 |
-
# BatchMatch.ref_df_after_stand.reset_index(inplace=True, drop=True)
|
308 |
-
# BatchMatch.ref_df_after_full_stand.reset_index(inplace=True, drop=True)
|
309 |
-
|
310 |
-
# Match the data, unless the search or reference dataframes are empty
|
311 |
-
if BatchMatch.search_df.empty or BatchMatch.ref_df.empty:
|
312 |
-
out_message = "Nothing to match for batch: " + str(n)
|
313 |
-
print(out_message)
|
314 |
-
BatchMatch_out = BatchMatch
|
315 |
-
BatchMatch_out.results_on_orig_df = pd.DataFrame(data={"index":BatchMatch.search_df.index,
|
316 |
-
"Excluded from search":False,
|
317 |
-
"Matched with reference address":False})
|
318 |
-
else:
|
319 |
-
summary_of_summaries, BatchMatch_out = run_match_batch(BatchMatch, n, number_of_batches)
|
320 |
-
|
321 |
-
print("BatchMatch_out match shape: ", BatchMatch_out.results_on_orig_df.shape)
|
322 |
-
|
323 |
-
OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(n+1))
|
324 |
-
|
325 |
-
print("Output results match shape: ", OutputMatch.results_on_orig_df.shape)
|
326 |
-
|
327 |
-
n += 1
|
328 |
-
|
329 |
-
if in_api==True:
|
330 |
-
OutputMatch.results_on_orig_df['Matched with reference address'] = OutputMatch.results_on_orig_df['Matched with reference address'].replace({1:True, 0:False})
|
331 |
-
OutputMatch.results_on_orig_df['Excluded from search'] = OutputMatch.results_on_orig_df['Excluded from search'].replace('nan', False).fillna(False)
|
332 |
-
|
333 |
-
# Remove any duplicates from reference df, prioritise successful matches
|
334 |
-
OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")
|
335 |
-
|
336 |
-
|
337 |
-
overall_toc = time.perf_counter()
|
338 |
-
time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"
|
339 |
-
|
340 |
-
print(OutputMatch.output_summary)
|
341 |
-
|
342 |
-
if OutputMatch.output_summary == "":
|
343 |
-
OutputMatch.output_summary = "No matches were found."
|
344 |
-
|
345 |
-
fuzzy_not_std_output = OutputMatch.match_results_output.copy()
|
346 |
-
fuzzy_not_std_output_mask = ~(fuzzy_not_std_output["match_method"].str.contains("Fuzzy match")) | (fuzzy_not_std_output["standardised_address"] == True)
|
347 |
-
fuzzy_not_std_output.loc[fuzzy_not_std_output_mask, "full_match"] = False
|
348 |
-
fuzzy_not_std_summary = create_match_summary(fuzzy_not_std_output, "Fuzzy not standardised")
|
349 |
-
|
350 |
-
fuzzy_std_output = OutputMatch.match_results_output.copy()
|
351 |
-
fuzzy_std_output_mask = fuzzy_std_output["match_method"].str.contains("Fuzzy match")
|
352 |
-
fuzzy_std_output.loc[fuzzy_std_output_mask == False, "full_match"] = False
|
353 |
-
fuzzy_std_summary = create_match_summary(fuzzy_std_output, "Fuzzy standardised")
|
354 |
-
|
355 |
-
nnet_std_output = OutputMatch.match_results_output.copy()
|
356 |
-
nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")
|
357 |
-
|
358 |
-
final_summary = fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary + "\n" + time_out
|
359 |
-
|
360 |
-
return final_summary, [OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name]
|
361 |
-
|
362 |
# Create the gradio interface
|
363 |
-
|
364 |
block = gr.Blocks(theme = gr.themes.Base())
|
365 |
|
366 |
with block:
|
@@ -401,8 +66,8 @@ with block:
|
|
401 |
in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
|
402 |
|
403 |
with gr.Accordion("Use Addressbase API instead of reference file", open = False):
|
404 |
-
in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
|
405 |
-
in_api_key = gr.Textbox(label="Addressbase API key")
|
406 |
|
407 |
with gr.Accordion("Custom reference file format or join columns (i.e. not LLPG LPI format)", open = False):
|
408 |
in_refcol = gr.Dropdown(choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
|
@@ -439,7 +104,7 @@ with block:
|
|
439 |
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
|
440 |
|
441 |
# Simple run for AWS server
|
442 |
-
block.queue().launch(ssl_verify=False) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
|
443 |
|
444 |
# Download OpenSSL from here:
|
445 |
# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
|
|
|
|
|
1 |
import os
|
2 |
from datetime import datetime
|
3 |
from pathlib import Path
|
|
|
|
|
4 |
import gradio as gr
|
5 |
+
import pandas as pd
|
|
|
6 |
|
7 |
+
from tools.matcher_funcs import run_matcher
|
|
|
8 |
from tools.gradio import initial_data_load
|
9 |
from tools.aws_functions import load_data_from_aws
|
|
|
|
|
10 |
|
11 |
import warnings
|
12 |
+
# Remove warnings from print statements
|
13 |
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression')
|
14 |
warnings.filterwarnings("ignore", 'Downcasting behavior')
|
15 |
warnings.filterwarnings("ignore", 'A value is trying to be set on a copy of a slice from a DataFrame')
|
16 |
warnings.filterwarnings("ignore")
|
17 |
|
|
|
18 |
today = datetime.now().strftime("%d%m%Y")
|
19 |
today_rev = datetime.now().strftime("%Y%m%d")
|
20 |
|
|
|
25 |
diagnostics_folder = base_folder/"Diagnostics/"
|
26 |
prep_folder = base_folder/"Helper functions/"
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# Create the gradio interface
|
|
|
29 |
block = gr.Blocks(theme = gr.themes.Base())
|
30 |
|
31 |
with block:
|
|
|
66 |
in_ref = gr.File(label="Input reference addresses from file", file_count= "multiple")
|
67 |
|
68 |
with gr.Accordion("Use Addressbase API instead of reference file", open = False):
|
69 |
+
in_api = gr.Dropdown(label="Choose API type", multiselect=False, value=None, choices=["Postcode"])#["Postcode", "UPRN"]) #choices=["Address", "Postcode", "UPRN"])
|
70 |
+
in_api_key = gr.Textbox(label="Addressbase API key", type='password')
|
71 |
|
72 |
with gr.Accordion("Custom reference file format or join columns (i.e. not LLPG LPI format)", open = False):
|
73 |
in_refcol = gr.Dropdown(choices=[], multiselect=True, label="Select columns that make up the reference address. Make sure postcode is at the end")
|
|
|
104 |
#block.queue().launch(debug=True) # root_path="/address-match", debug=True, server_name="0.0.0.0",
|
105 |
|
106 |
# Simple run for AWS server
|
107 |
+
block.queue().launch(ssl_verify=False, inbrowser=True) # root_path="/address-match", debug=True, server_name="0.0.0.0", server_port=7861
|
108 |
|
109 |
# Download OpenSSL from here:
|
110 |
# Running on local server with https: https://discuss.huggingface.co/t/how-to-run-gradio-with-0-0-0-0-and-https/38003 or https://dev.to/rajshirolkar/fastapi-over-https-for-development-on-windows-2p7d
|
tools/fuzzy_match.py
CHANGED
@@ -169,7 +169,11 @@ def string_match_by_post_code_multiple(match_address_series:PandasSeries, refere
|
|
169 |
|
170 |
return out_frame
|
171 |
|
172 |
-
def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cleaned, ref_df_after_stand, fuzzy_match_limit, search_df_cleaned, search_df_key_field, new_join_col, standardise, blocker_col):
|
|
|
|
|
|
|
|
|
173 |
|
174 |
## Diagnostics
|
175 |
|
@@ -227,7 +231,7 @@ def _create_fuzzy_match_results_output(results, search_df_after_stand, ref_df_cl
|
|
227 |
|
228 |
return match_results_output, diag_shortlist, diag_best_match
|
229 |
|
230 |
-
def create_diag_shortlist(results_df, matched_col, fuzzy_match_limit, blocker_col, fuzzy_col="fuzzy_score", search_mod_address = "search_mod_address", resolve_tie_breaks=True, no_number_fuzzy_match_limit=no_number_fuzzy_match_limit):
|
231 |
'''
|
232 |
Create a shortlist of the best matches from a list of suggested matches
|
233 |
'''
|
|
|
169 |
|
170 |
return out_frame
|
171 |
|
172 |
+
def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_stand:PandasDataFrame, ref_df_cleaned:PandasDataFrame, ref_df_after_stand:PandasDataFrame, fuzzy_match_limit:int, search_df_cleaned:PandasDataFrame, search_df_key_field:str, new_join_col:str, standardise:bool, blocker_col:str):
|
173 |
+
|
174 |
+
'''
|
175 |
+
Take fuzzy match outputs, create shortlist dataframes, rearrange, return diagnostics and shortlist dataframes for export
|
176 |
+
'''
|
177 |
|
178 |
## Diagnostics
|
179 |
|
|
|
231 |
|
232 |
return match_results_output, diag_shortlist, diag_best_match
|
233 |
|
234 |
+
def create_diag_shortlist(results_df:PandasDataFrame, matched_col:str, fuzzy_match_limit:int, blocker_col:str, fuzzy_col:str="fuzzy_score", search_mod_address:str = "search_mod_address", resolve_tie_breaks:bool=True, no_number_fuzzy_match_limit:int=no_number_fuzzy_match_limit) -> PandasDataFrame:
|
235 |
'''
|
236 |
Create a shortlist of the best matches from a list of suggested matches
|
237 |
'''
|
tools/matcher_funcs.py
CHANGED
@@ -24,8 +24,10 @@ run_fuzzy_match = True
|
|
24 |
run_nnet_match = True
|
25 |
run_standardise = True
|
26 |
|
27 |
-
from tools.
|
|
|
28 |
from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df
|
|
|
29 |
|
30 |
# Neural network functions
|
31 |
### Predict function for imported model
|
@@ -64,17 +66,17 @@ def read_file(filename:str) -> PandasDataFrame:
|
|
64 |
elif file_type == 'parquet':
|
65 |
return pd.read_parquet(filename)
|
66 |
|
67 |
-
def get_file_name(in_name:str) -> str:
|
68 |
-
|
69 |
-
Get the name of a file from a string using the re package.
|
70 |
-
'''
|
71 |
|
72 |
-
|
73 |
-
match = re.search(
|
74 |
if match:
|
75 |
matched_result = match.group(1)
|
76 |
else:
|
77 |
matched_result = None
|
|
|
|
|
78 |
|
79 |
return matched_result
|
80 |
|
@@ -492,24 +494,22 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
|
|
492 |
if not data_state.empty:
|
493 |
|
494 |
Matcher.search_df = data_state
|
495 |
-
|
496 |
Matcher.search_df['index'] = Matcher.search_df.reset_index().index
|
497 |
|
498 |
else:
|
499 |
Matcher.search_df = pd.DataFrame()
|
500 |
|
501 |
-
# If
|
502 |
if in_text:
|
503 |
Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
|
504 |
|
505 |
-
# If
|
506 |
if Matcher.search_df.empty and in_file:
|
507 |
output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
|
508 |
|
509 |
file_list = [string.name for string in in_file]
|
510 |
-
data_file_names = [string for string in file_list if "
|
511 |
-
|
512 |
-
#print("Data file names: ", data_file_names)
|
513 |
Matcher.file_name = get_file_name(data_file_names[0])
|
514 |
|
515 |
# search_df makes column to use as index
|
@@ -524,20 +524,11 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
|
|
524 |
Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
|
525 |
|
526 |
# If no join on column suggested, assume the user wants the UPRN
|
527 |
-
print("in_joincol: ", in_joincol)
|
528 |
-
|
529 |
if not in_joincol:
|
530 |
Matcher.new_join_col = ['UPRN']
|
531 |
-
#Matcher.new_join_col = Matcher.new_join_col#[0]
|
532 |
|
533 |
else:
|
534 |
Matcher.new_join_col = in_joincol
|
535 |
-
#Matcher.new_join_col = Matcher.new_join_col
|
536 |
-
|
537 |
-
# Extract the column names from the input data
|
538 |
-
#print("In colnames: ", in_colnames)
|
539 |
-
|
540 |
-
print("Matcher.in_joincol: ", Matcher.new_join_col)
|
541 |
|
542 |
if len(in_colnames) > 1:
|
543 |
Matcher.search_postcode_col = [in_colnames[-1]]
|
@@ -566,7 +557,6 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
|
|
566 |
|
567 |
length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
|
568 |
|
569 |
-
|
570 |
### Filter addresses to match to postcode areas present in both search_df and ref_df_cleaned only (postcode without the last three characters). Only run if API call is false. When the API is called, relevant addresses and postcodes should be brought in by the API.
|
571 |
if not in_api:
|
572 |
if Matcher.filter_to_lambeth_pcodes == True:
|
@@ -621,7 +611,6 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
|
|
621 |
|
622 |
Matcher.search_df_not_matched = Matcher.search_df
|
623 |
|
624 |
-
|
625 |
# If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
|
626 |
if in_api:
|
627 |
|
@@ -629,10 +618,14 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
|
|
629 |
output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
|
630 |
|
631 |
file_list = [string.name for string in in_file]
|
632 |
-
data_file_names = [string for string in file_list if "
|
633 |
|
634 |
Matcher.file_name = get_file_name(data_file_names[0])
|
635 |
|
|
|
|
|
|
|
|
|
636 |
else:
|
637 |
if in_text:
|
638 |
Matcher.file_name = in_text
|
@@ -654,8 +647,6 @@ def check_match_data_filter(Matcher:MatcherClass, data_state:PandasDataFrame, re
|
|
654 |
|
655 |
|
656 |
Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
|
657 |
-
#Matcher.search_df = Matcher.search_df.reset_index(drop=True)
|
658 |
-
#Matcher.search_df.index.name = 'index'
|
659 |
|
660 |
return Matcher
|
661 |
|
@@ -677,15 +668,11 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
|
|
677 |
# If doing API calls, we need to know the search data before querying for specific addresses/postcodes
|
678 |
Matcher = check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
|
679 |
|
680 |
-
|
681 |
# If an API call, ref_df data is loaded after
|
682 |
if in_api:
|
|
|
683 |
Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
|
684 |
|
685 |
-
#print("Resetting index.")
|
686 |
-
# API-called data will often have duplicate indexes in it - drop them to avoid conflicts down the line
|
687 |
-
#Matcher.ref_df = Matcher.ref_df.reset_index(drop = True)
|
688 |
-
|
689 |
print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
|
690 |
print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
|
691 |
|
@@ -697,7 +684,328 @@ def load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state,
|
|
697 |
|
698 |
return Matcher
|
699 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
700 |
# Run a match run for a single batch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
701 |
def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
|
702 |
'''
|
703 |
Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
|
@@ -721,12 +1029,7 @@ def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, p
|
|
721 |
print(message)
|
722 |
return message, InitialMatch
|
723 |
|
724 |
-
print("FuzzyNotStdMatch shape before combine two matches: ", FuzzyNotStdMatch.results_on_orig_df.shape)
|
725 |
-
|
726 |
FuzzyNotStdMatch = combine_two_matches(InitialMatch, FuzzyNotStdMatch, df_name)
|
727 |
-
|
728 |
-
print("InitialMatch shape: ", InitialMatch.results_on_orig_df.shape)
|
729 |
-
print("FuzzyNotStdMatch shape: ", FuzzyNotStdMatch.results_on_orig_df.shape)
|
730 |
|
731 |
if (len(FuzzyNotStdMatch.search_df_not_matched) == 0) | (sum(FuzzyNotStdMatch.match_results_output[FuzzyNotStdMatch.match_results_output['full_match']==False]['fuzzy_score'])==0):
|
732 |
overall_toc = time.perf_counter()
|
|
|
24 |
run_nnet_match = True
|
25 |
run_standardise = True
|
26 |
|
27 |
+
from tools.constants import *
|
28 |
+
from tools.preparation import prepare_search_address_string, prepare_search_address, extract_street_name, prepare_ref_address, remove_non_postal, check_no_number_addresses
|
29 |
from tools.fuzzy_match import string_match_by_post_code_multiple, _create_fuzzy_match_results_output, join_to_orig_df
|
30 |
+
from tools.standardise import standardise_wrapper_func
|
31 |
|
32 |
# Neural network functions
|
33 |
### Predict function for imported model
|
|
|
66 |
elif file_type == 'parquet':
|
67 |
return pd.read_parquet(filename)
|
68 |
|
69 |
+
def get_file_name(in_name: str) -> str:
    """Return the final path component of *in_name*.

    Both ``/`` and ``\\`` are treated as path separators regardless of the
    operating system the code runs on, so Windows-style and Unix-style paths
    give the same result on any host.

    Args:
        in_name: A file path (or any string containing path separators).

    Returns:
        The text after the last separator (an empty string when the path ends
        with a separator), or ``None`` when *in_name* contains no separator.
    """

    print("in_name: ", in_name)

    # Position of the last separator of either flavour; -1 when neither occurs.
    # The previous regex interpolated os.sep unescaped, which produced an
    # invalid pattern on Windows where os.sep is a backslash.
    last_separator = max(in_name.rfind("/"), in_name.rfind("\\"))

    if last_separator == -1:
        matched_result = None
    else:
        matched_result = in_name[last_separator + 1:]

    print("Matched result: ", matched_result)

    return matched_result
|
82 |
|
|
|
494 |
if not data_state.empty:
|
495 |
|
496 |
Matcher.search_df = data_state
|
|
|
497 |
Matcher.search_df['index'] = Matcher.search_df.reset_index().index
|
498 |
|
499 |
else:
|
500 |
Matcher.search_df = pd.DataFrame()
|
501 |
|
502 |
+
# If a single address entered into the text box, just load this instead
|
503 |
if in_text:
|
504 |
Matcher.search_df, Matcher.search_df_key_field, Matcher.search_address_cols, Matcher.search_postcode_col = prepare_search_address_string(in_text)
|
505 |
|
506 |
+
# If no file loaded yet and a file has been selected
|
507 |
if Matcher.search_df.empty and in_file:
|
508 |
output_message, drop1, drop2, Matcher.search_df, results_data_state = initial_data_load(in_file)
|
509 |
|
510 |
file_list = [string.name for string in in_file]
|
511 |
+
data_file_names = [string for string in file_list if "results_" not in string.lower()]
|
512 |
+
|
|
|
513 |
Matcher.file_name = get_file_name(data_file_names[0])
|
514 |
|
515 |
# search_df makes column to use as index
|
|
|
524 |
Matcher.search_df = Matcher.search_df.merge(results_data_state, on = "index", how = "left")
|
525 |
|
526 |
# If no join on column suggested, assume the user wants the UPRN
|
|
|
|
|
527 |
if not in_joincol:
|
528 |
Matcher.new_join_col = ['UPRN']
|
|
|
529 |
|
530 |
else:
|
531 |
Matcher.new_join_col = in_joincol
|
|
|
|
|
|
|
|
|
|
|
|
|
532 |
|
533 |
if len(in_colnames) > 1:
|
534 |
Matcher.search_postcode_col = [in_colnames[-1]]
|
|
|
557 |
|
558 |
length_more_than_0 = Matcher.search_df["address_cols_joined"].str.len() > 0
|
559 |
|
|
|
560 |
### Filter addresses to match to postcode areas present in both search_df and ref_df_cleaned only (postcode without the last three characters). Only run if API call is false. When the API is called, relevant addresses and postcodes should be brought in by the API.
|
561 |
if not in_api:
|
562 |
if Matcher.filter_to_lambeth_pcodes == True:
|
|
|
611 |
|
612 |
Matcher.search_df_not_matched = Matcher.search_df
|
613 |
|
|
|
614 |
# If this is for an API call, we need to convert the search_df address columns to one column now. This is so the API call can be made and the reference dataframe created.
|
615 |
if in_api:
|
616 |
|
|
|
618 |
output_message, drop1, drop2, df, results_data_state = initial_data_load(in_file)
|
619 |
|
620 |
file_list = [string.name for string in in_file]
|
621 |
+
data_file_names = [string for string in file_list if "results_" not in string.lower()]
|
622 |
|
623 |
Matcher.file_name = get_file_name(data_file_names[0])
|
624 |
|
625 |
+
print("File list in in_api bit: ", file_list)
|
626 |
+
print("data_file_names in in_api bit: ", data_file_names)
|
627 |
+
print("Matcher.file_name in in_api bit: ", Matcher.file_name)
|
628 |
+
|
629 |
else:
|
630 |
if in_text:
|
631 |
Matcher.file_name = in_text
|
|
|
647 |
|
648 |
|
649 |
Matcher.search_df['full_address_postcode'] = search_df_cleaned["full_address"]
|
|
|
|
|
650 |
|
651 |
return Matcher
|
652 |
|
|
|
668 |
# If doing API calls, we need to know the search data before querying for specific addresses/postcodes
|
669 |
Matcher = check_match_data_filter(Matcher, data_state, results_data_state, in_file, in_text, in_colnames, in_joincol, in_existing, in_api)
|
670 |
|
|
|
671 |
# If an API call, ref_df data is loaded after
|
672 |
if in_api:
|
673 |
+
|
674 |
Matcher = check_ref_data_exists(Matcher, ref_data_state, in_ref, in_refcol, in_api, in_api_key, query_type=in_api)
|
675 |
|
|
|
|
|
|
|
|
|
676 |
print("Shape of ref_df after filtering is: ", Matcher.ref_df.shape)
|
677 |
print("Shape of search_df after filtering is: ", Matcher.search_df.shape)
|
678 |
|
|
|
684 |
|
685 |
return Matcher
|
686 |
|
687 |
+
# Run whole matcher process
|
688 |
+
def _rows_with_index_in(frame, index_labels):
    """Return the rows of *frame* whose index label is in *index_labels*, renumbered from 0."""
    return frame[frame.index.isin(index_labels)].reset_index(drop=True)


def _subset_match_to_batch(FullMatch, search_range, ref_range):
    """Return a shallow copy of *FullMatch* whose dataframes only hold one batch of rows."""
    BatchMatch = copy.copy(FullMatch)

    # Search-side dataframes
    BatchMatch.search_df = _rows_with_index_in(BatchMatch.search_df, search_range)
    BatchMatch.search_df_not_matched = BatchMatch.search_df.copy()
    BatchMatch.search_df_cleaned = _rows_with_index_in(BatchMatch.search_df_cleaned, search_range)

    # Reference-side dataframes
    BatchMatch.ref_df = _rows_with_index_in(BatchMatch.ref_df, ref_range)
    BatchMatch.ref_df_cleaned = _rows_with_index_in(BatchMatch.ref_df_cleaned, ref_range)

    # Dataframes after the standardisation process
    BatchMatch.search_df_after_stand = _rows_with_index_in(BatchMatch.search_df_after_stand, search_range)
    BatchMatch.search_df_after_full_stand = _rows_with_index_in(BatchMatch.search_df_after_full_stand, search_range)
    BatchMatch.ref_df_after_stand = _rows_with_index_in(BatchMatch.ref_df_after_stand, ref_range)
    BatchMatch.ref_df_after_full_stand = _rows_with_index_in(BatchMatch.ref_df_after_full_stand, ref_range)

    return BatchMatch


def _summarise_match_stages(match_results_output):
    """Return the per-stage summaries (fuzzy not standardised, fuzzy standardised, neural net) joined by newlines."""
    # Only count fuzzy matches made on non-standardised addresses
    fuzzy_not_std_output = match_results_output.copy()
    fuzzy_not_std_output_mask = ~(fuzzy_not_std_output["match_method"].str.contains("Fuzzy match")) | (fuzzy_not_std_output["standardised_address"] == True)
    fuzzy_not_std_output.loc[fuzzy_not_std_output_mask, "full_match"] = False
    fuzzy_not_std_summary = create_match_summary(fuzzy_not_std_output, "Fuzzy not standardised")

    # Count every fuzzy match, standardised or not
    fuzzy_std_output = match_results_output.copy()
    fuzzy_std_output_mask = fuzzy_std_output["match_method"].str.contains("Fuzzy match")
    fuzzy_std_output.loc[fuzzy_std_output_mask == False, "full_match"] = False
    fuzzy_std_summary = create_match_summary(fuzzy_std_output, "Fuzzy standardised")

    # Count all matches from every method
    nnet_std_output = match_results_output.copy()
    nnet_std_summary = create_match_summary(nnet_std_output, "Neural net standardised")

    return fuzzy_not_std_summary + "\n" + fuzzy_std_summary + "\n" + nnet_std_summary


def run_matcher(in_text:str, in_file:str, in_ref:str, data_state:PandasDataFrame, results_data_state:PandasDataFrame, ref_data_state:PandasDataFrame, in_colnames:List[str], in_refcol:List[str], in_joincol:List[str], in_existing:List[str], in_api:str, in_api_key:str, InitMatch:MatcherClass = InitMatch, progress=gr.Progress()):
    '''
    Split search and reference data into batches. Loop and run through the match script for each batch of data.

    The data is loaded with load_matcher_data, cleaned and standardised once for
    the whole dataset, split into batches with create_batch_ranges, and each
    batch is passed to run_match_batch. Batch results are accumulated with
    combine_two_matches.

    Args:
        in_text, in_file, in_ref, data_state, results_data_state, ref_data_state,
        in_colnames, in_refcol, in_joincol, in_existing, in_api_key: passed
            unchanged to load_matcher_data.
        in_api: API option, also passed to load_matcher_data. Any truthy value
            additionally triggers the clean-up of the combined results below.
        InitMatch: MatcherClass object that receives the loaded data.
        progress: gradio progress tracker used to report batch progress.

    Returns:
        A tuple of (summary text, [results_orig_df_name, match_outputs_name]).
    '''

    overall_tic = time.perf_counter()

    # Load in initial data. This will filter to relevant addresses in the search and reference datasets that can potentially be matched, and will pull in API data if asked for.
    InitMatch = load_matcher_data(in_text, in_file, in_ref, data_state, results_data_state, ref_data_state, in_colnames, in_refcol, in_joincol, in_existing, InitMatch, in_api, in_api_key)

    if InitMatch.search_df.empty or InitMatch.ref_df.empty:
        out_message = "Nothing to match!"
        print(out_message)
        return out_message, [InitMatch.results_orig_df_name, InitMatch.match_outputs_name]

    # Run initial address preparation and standardisation processes

    # Prepare all search addresses
    if isinstance(InitMatch.search_df, str):
        # prepare_search_address_string is unpacked into four values where the single-address text is
        # first loaded, so unpack four here as well (three raised "too many values to unpack").
        InitMatch.search_df_cleaned, InitMatch.search_df_key_field, InitMatch.search_address_cols, InitMatch.search_postcode_col = prepare_search_address_string(InitMatch.search_df)
    else:
        InitMatch.search_df_cleaned = prepare_search_address(InitMatch.search_df, InitMatch.search_address_cols, InitMatch.search_postcode_col, InitMatch.search_df_key_field)

    # Remove addresses that are not postal addresses
    InitMatch.search_df_cleaned = remove_non_postal(InitMatch.search_df_cleaned, "full_address")

    # Remove addresses that have no numbers in from consideration
    InitMatch.search_df_cleaned = check_no_number_addresses(InitMatch.search_df_cleaned, "full_address")

    # Initial preparation of reference addresses
    InitMatch.ref_df_cleaned = prepare_ref_address(InitMatch.ref_df, InitMatch.ref_address_cols, InitMatch.new_join_col)

    # Standardise addresses
    # Standardise - minimal
    tic = time.perf_counter()
    InitMatch.search_df_after_stand, InitMatch.ref_df_after_stand = standardise_wrapper_func(
        InitMatch.search_df_cleaned.copy(),
        InitMatch.ref_df_cleaned.copy(),
        standardise = False,
        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
        match_task="fuzzy")

    toc = time.perf_counter()
    print(f"Performed the minimal standardisation step in {toc - tic:0.1f} seconds")

    # Standardise - full
    tic = time.perf_counter()
    InitMatch.search_df_after_full_stand, InitMatch.ref_df_after_full_stand = standardise_wrapper_func(
        InitMatch.search_df_cleaned.copy(),
        InitMatch.ref_df_cleaned.copy(),
        standardise = True,
        filter_to_lambeth_pcodes=filter_to_lambeth_pcodes,
        match_task="fuzzy")

    toc = time.perf_counter()
    print(f"Performed the full standardisation step in {toc - tic:0.1f} seconds")

    # Determine the batches to send through the functions. Copies are passed so the batching step
    # cannot alter the cleaned dataframes.
    range_df = create_batch_ranges(InitMatch.search_df_cleaned.copy(), InitMatch.ref_df_cleaned.copy(), batch_size, ref_batch_size, "postcode", "Postcode")

    print("Batches to run in this session: ", range_df)

    OutputMatch = copy.copy(InitMatch)

    number_of_batches = range_df.shape[0]

    for batch_n in progress.tqdm(range(number_of_batches), desc= "Running through batches", unit="batches", total=number_of_batches):
        print("Running batch ", str(batch_n+1))

        search_range = range_df.iloc[batch_n]['search_range']
        ref_range = range_df.iloc[batch_n]['ref_range']

        # Subset the search and reference dfs based on current batch ranges
        BatchMatch = _subset_match_to_batch(InitMatch, search_range, ref_range)

        # Match the data, unless the search or reference dataframes are empty
        if BatchMatch.search_df.empty or BatchMatch.ref_df.empty:
            # Reported 1-based, like the other batch messages
            out_message = "Nothing to match for batch: " + str(batch_n+1)
            print(out_message)
            BatchMatch_out = BatchMatch
            BatchMatch_out.results_on_orig_df = pd.DataFrame(data={"index":BatchMatch.search_df.index,
                                                                   "Excluded from search":False,
                                                                   "Matched with reference address":False})
        else:
            # run_match_batch receives the 0-based batch number
            _, BatchMatch_out = run_match_batch(BatchMatch, batch_n, number_of_batches)

        OutputMatch = combine_two_matches(OutputMatch, BatchMatch_out, "All up to and including batch " + str(batch_n+1))

    # in_api is declared as a string, so the previous `in_api==True` comparison could never be true
    # and this clean-up was skipped; test truthiness instead.
    if in_api:
        OutputMatch.results_on_orig_df['Matched with reference address'] = OutputMatch.results_on_orig_df['Matched with reference address'].replace({1:True, 0:False})
        OutputMatch.results_on_orig_df['Excluded from search'] = OutputMatch.results_on_orig_df['Excluded from search'].replace('nan', False).fillna(False)

    # Remove any duplicates from reference df, prioritise successful matches
    OutputMatch.results_on_orig_df = OutputMatch.results_on_orig_df.sort_values(by=["index", "Matched with reference address"], ascending=[True,False]).drop_duplicates(subset="index")

    overall_toc = time.perf_counter()
    time_out = f"The overall match (all batches) took {overall_toc - overall_tic:0.1f} seconds"

    print(OutputMatch.output_summary)

    if OutputMatch.output_summary == "":
        OutputMatch.output_summary = "No matches were found."

    final_summary = _summarise_match_stages(OutputMatch.match_results_output) + "\n" + time_out

    return final_summary, [OutputMatch.results_orig_df_name, OutputMatch.match_outputs_name]
|
882 |
+
|
883 |
# Run a match run for a single batch
|
884 |
+
def create_simple_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int):
    '''
    Create batches of row positions for the search and reference dataframes, split purely by row count.

    Each dataframe is cut into consecutive inclusive (first_row, last_row)
    position ranges of at most batch_size / ref_batch_size rows. The output has
    one row for every combination of a search range with a reference range.

    Args:
        df: Search dataframe; only its number of rows is used.
        ref_df: Reference dataframe; only its number of rows is used.
        batch_size: Maximum number of search rows per range (at least 1).
        ref_batch_size: Maximum number of reference rows per range (at least 1).

    Returns:
        DataFrame with columns 'search_range' and 'ref_range', each holding
        (first_row, last_row) tuples. Empty when either dataframe has no rows.

    Raises:
        ValueError: If batch_size or ref_batch_size is smaller than 1.
    '''

    # A negative step would silently give no batches at all, so reject bad sizes up front
    if batch_size < 1 or ref_batch_size < 1:
        raise ValueError("batch_size and ref_batch_size must both be at least 1")

    def _row_ranges(total_rows, size):
        # Inclusive bottom and top limits; the last range is clipped to the final row
        return [(start, min(start + size - 1, total_rows - 1)) for start in range(0, total_rows, size)]

    search_ranges = _row_ranges(df.shape[0], batch_size)
    ref_ranges = _row_ranges(ref_df.shape[0], ref_batch_size)

    # Every search range is paired with every reference range
    result_data = [(search_range, ref_range) for search_range in search_ranges for ref_range in ref_ranges]

    range_df = pd.DataFrame(result_data, columns=['search_range', 'ref_range'])

    return range_df
|
912 |
+
|
913 |
+
def _index_labels_by_short_postcode(frame, postcode_col):
    """Group the index labels of *frame* by postcode with its final character removed.

    Returns a tuple of (shortened postcode Series, dict mapping each shortened
    postcode to the list of index labels of the rows that carry it).
    """
    # Lower-case, remove all whitespace, then drop the last character
    short_postcodes = frame[postcode_col].str.lower().str.strip().str.replace(r"\s+", "", regex=True).str[:-1]

    labels_by_postcode = {}
    for label, short_postcode in zip(frame.index, short_postcodes):
        # Missing postcodes are not strings and are never looked up, so leave them out
        if isinstance(short_postcode, str):
            labels_by_postcode.setdefault(short_postcode, []).append(label)

    return short_postcodes, labels_by_postcode


def create_batch_ranges(df:PandasDataFrame, ref_df:PandasDataFrame, batch_size:int, ref_batch_size:int, search_postcode_col:str, ref_postcode_col:str):
    '''
    Create batches of address indexes for search and reference dataframes based on shortened postcodes.

    Postcodes are lower-cased, stripped of whitespace and shortened by one
    character. Search rows are then added to the current batch one shortened
    postcode at a time, together with the reference rows sharing that shortened
    postcode. A new batch is started once the current one holds at least
    batch_size search rows or ref_batch_size reference rows, so a batch can
    exceed these sizes by the rows of its last postcode.

    Search postcodes shorter than four characters after shortening, and search
    postcodes with no rows in the reference data, are not placed in any batch.

    The input dataframes are not modified.

    Args:
        df: Search dataframe.
        ref_df: Reference dataframe.
        batch_size: Search row count at which a batch is closed (at least 1).
        ref_batch_size: Reference row count at which a batch is closed (at least 1).
        search_postcode_col: Name of the postcode column in df.
        ref_postcode_col: Name of the postcode column in ref_df.

    Returns:
        DataFrame with one row per batch and columns 'search_range' and
        'ref_range' (lists of index labels), 'batch_length' and 'ref_length'.

    Raises:
        ValueError: If batch_size or ref_batch_size is smaller than 1.
    '''

    # A size below 1 means a batch is "full" before anything is added, which previously looped forever
    if batch_size < 1 or ref_batch_size < 1:
        raise ValueError("batch_size and ref_batch_size must both be at least 1")

    # If df sizes are smaller than the batch size limits, no need to run through everything
    if len(df) < batch_size and len(ref_df) < ref_batch_size:
        print("Dataframe sizes are smaller than maximum batch sizes, no need to split data.")
        lengths_df = pd.DataFrame(data={'search_range':[df.index.tolist()], 'ref_range':[ref_df.index.tolist()], 'batch_length':len(df), 'ref_length':len(ref_df)})
        return lengths_df

    search_short_postcodes, search_labels = _index_labels_by_short_postcode(df, search_postcode_col)
    _, ref_labels = _index_labels_by_short_postcode(ref_df, ref_postcode_col)

    # Shortened search postcodes in order of first appearance; very short ones are ignored
    unique_postcodes = search_short_postcodes[search_short_postcodes.str.len()>=4].unique().tolist()

    # Overall batch variables
    batch_indexes = []
    ref_indexes = []

    # Current batch variables for loop
    current_batch = []
    current_ref_batch = []

    for current_postcode in unique_postcodes:

        # Close the current batch before adding more once either size limit has been reached
        if len(current_batch) >= batch_size or len(current_ref_batch) >= ref_batch_size:
            print("Batch length reached - breaking")
            batch_indexes.append(current_batch)
            ref_indexes.append(current_ref_batch)
            current_batch = []
            current_ref_batch = []

        # Postcodes with no reference rows are skipped entirely
        current_postcode_ref_labels = ref_labels.get(current_postcode)
        if current_postcode_ref_labels is None:
            continue

        current_batch.extend(search_labels[current_postcode])
        current_ref_batch.extend(current_postcode_ref_labels)

    # Store the final batch; it is kept even when it ended up empty
    if unique_postcodes:
        batch_indexes.append(current_batch)
        ref_indexes.append(current_ref_batch)

    batch_lengths = [len(batch) for batch in batch_indexes]
    ref_lengths = [len(ref_batch) for ref_batch in ref_indexes]

    # Create df to store lengths
    lengths_df = pd.DataFrame(data={'search_range':batch_indexes, 'ref_range':ref_indexes, 'batch_length':batch_lengths, 'ref_length':ref_lengths})

    return lengths_df
|
1008 |
+
|
1009 |
def run_match_batch(InitialMatch:MatcherClass, batch_n:int, total_batches:int, progress=gr.Progress()):
|
1010 |
'''
|
1011 |
Over-arching function for running a single batch of data through the full matching process. Calls fuzzy matching, then neural network match functions in order. It outputs a summary of the match, and a MatcherClass with the matched data included.
|
|
|
1029 |
print(message)
|
1030 |
return message, InitialMatch
|
1031 |
|
|
|
|
|
1032 |
FuzzyNotStdMatch = combine_two_matches(InitialMatch, FuzzyNotStdMatch, df_name)
|
|
|
|
|
|
|
1033 |
|
1034 |
if (len(FuzzyNotStdMatch.search_df_not_matched) == 0) | (sum(FuzzyNotStdMatch.match_results_output[FuzzyNotStdMatch.match_results_output['full_match']==False]['fuzzy_score'])==0):
|
1035 |
overall_toc = time.perf_counter()
|