seanpedrickcase committed on
Commit
ac346a9
·
1 Parent(s): 82732e8

Updated packages; changed ref file batch size to avoid excess memory usage

Browse files
.dockerignore CHANGED
@@ -16,4 +16,9 @@ build/*
16
  dist/*
17
  .ipynb_checkpoints/*
18
  orchestration/*
19
- .vscode/*
 
 
 
 
 
 
16
  dist/*
17
  .ipynb_checkpoints/*
18
  orchestration/*
19
+ .vscode/*
20
+ usage/
21
+ logs/
22
+ feedback/
23
+ input/
24
+ output/
.gitignore CHANGED
@@ -16,4 +16,9 @@ experiments/*
16
  build_deps/*
17
  build/*
18
  dist/*
19
- .vscode/*
 
 
 
 
 
 
16
  build_deps/*
17
  build/*
18
  dist/*
19
+ .vscode/*
20
+ usage/*
21
+ logs/*
22
+ feedback/*
23
+ input/*
24
+ output/*
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
2
 
3
  # Install Lambda web adapter in case you want to run with an AWS Lambda function URL
4
  #COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
@@ -10,9 +10,9 @@ WORKDIR /src
10
 
11
  COPY requirements_aws.txt .
12
 
13
- RUN pip install --no-cache-dir torch==2.4.1+cpu --index-url https://download.pytorch.org/whl/cpu && \
14
  pip install --no-cache-dir -r requirements_aws.txt && \
15
- pip install --no-cache-dir gradio==4.44.0
16
 
17
  # Set up a new user named "user" with user ID 1000
18
  RUN useradd -m -u 1000 user
 
1
+ FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
2
 
3
  # Install Lambda web adapter in case you want to run with an AWS Lambda function URL
4
  #COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
 
10
 
11
  COPY requirements_aws.txt .
12
 
13
+ RUN pip install --no-cache-dir torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu && \
14
  pip install --no-cache-dir -r requirements_aws.txt && \
15
+ pip install --no-cache-dir gradio==5.34.0
16
 
17
  # Set up a new user named "user" with user ID 1000
18
  RUN useradd -m -u 1000 user
requirements.txt CHANGED
@@ -1,12 +1,12 @@
1
- torch==2.4.1
2
- pandas==2.2.2
3
  rapidfuzz==3.8.1
4
  recordlinkage==0.16
5
  pyap==0.3.1
6
  pytest==7.4.3
7
- pyarrow==14.0.1
8
  openpyxl==3.1.2
9
- gradio==4.44.0
10
- boto3==1.34.158
11
  polars==0.20.19
12
  numpy==1.26.4
 
1
+ torch==2.7.1
2
+ pandas==2.2.3
3
  rapidfuzz==3.8.1
4
  recordlinkage==0.16
5
  pyap==0.3.1
6
  pytest==7.4.3
7
+ pyarrow==19.0.1
8
  openpyxl==3.1.2
9
+ gradio==5.34.0
10
+ boto3==1.38.37
11
  polars==0.20.19
12
  numpy==1.26.4
requirements_aws.txt CHANGED
@@ -1,11 +1,11 @@
1
- pandas==2.2.2
2
  rapidfuzz==3.8.1
3
  recordlinkage==0.16
4
  pyap==0.3.1
5
  pytest==7.4.3
6
- pyarrow==14.0.1
7
  openpyxl==3.1.2
8
- gradio==4.44.0
9
- boto3==1.34.158
10
  polars==0.20.19
11
  numpy==1.26.4
 
1
+ pandas==2.2.3
2
  rapidfuzz==3.8.1
3
  recordlinkage==0.16
4
  pyap==0.3.1
5
  pytest==7.4.3
6
+ pyarrow==19.0.0
7
  openpyxl==3.1.2
8
+ gradio==5.34.0
9
+ boto3==1.38.37
10
  polars==0.20.19
11
  numpy==1.26.4
tools/constants.py CHANGED
@@ -215,7 +215,7 @@ else: exported_model = []
215
  ### ADDRESS MATCHING FUNCTIONS
216
  # Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
217
  batch_size = 10000
218
- ref_batch_size = 150000
219
 
220
  ### Fuzzy match method
221
 
 
215
  ### ADDRESS MATCHING FUNCTIONS
216
  # Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
217
  batch_size = 10000
218
+ ref_batch_size = 20000
219
 
220
  ### Fuzzy match method
221
 
tools/fuzzy_match.py CHANGED
@@ -182,7 +182,6 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
182
  fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
183
 
184
  ## Fuzzy search results
185
-
186
  match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
187
  'full_match',
188
  'full_number_match',
@@ -216,7 +215,7 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
216
  joined_ref_cols = ["fulladdress", "Reference file"]
217
  joined_ref_cols.extend(new_join_col)
218
 
219
- print("joined_ref_cols: ", joined_ref_cols)
220
  # Keep only columns that exist in reference dataset
221
  joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
222
 
 
182
  fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
183
 
184
  ## Fuzzy search results
 
185
  match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
186
  'full_match',
187
  'full_number_match',
 
215
  joined_ref_cols = ["fulladdress", "Reference file"]
216
  joined_ref_cols.extend(new_join_col)
217
 
218
+ #print("joined_ref_cols: ", joined_ref_cols)
219
  # Keep only columns that exist in reference dataset
220
  joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
221