seanpedrickcase committed on
Commit
ac346a9
·
1 Parent(s): 82732e8

Updated packages; changed ref file batch size to avoid excess memory usage

Browse files
.dockerignore CHANGED
@@ -16,4 +16,9 @@ build/*
16
  dist/*
17
  .ipynb_checkpoints/*
18
  orchestration/*
19
- .vscode/*
 
 
 
 
 
 
16
  dist/*
17
  .ipynb_checkpoints/*
18
  orchestration/*
19
+ .vscode/*
20
+ usage/
21
+ logs/
22
+ feedback/
23
+ input/
24
+ output/
.gitignore CHANGED
@@ -16,4 +16,9 @@ experiments/*
16
  build_deps/*
17
  build/*
18
  dist/*
19
- .vscode/*
 
 
 
 
 
 
16
  build_deps/*
17
  build/*
18
  dist/*
19
+ .vscode/*
20
+ usage/*
21
+ logs/*
22
+ feedback/*
23
+ input/*
24
+ output/*
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
2
 
3
  # Install Lambda web adapter in case you want to run with an AWS Lambda function URL
4
  #COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
@@ -10,9 +10,9 @@ WORKDIR /src
10
 
11
  COPY requirements_aws.txt .
12
 
13
- RUN pip install --no-cache-dir torch==2.4.1+cpu --index-url https://download.pytorch.org/whl/cpu && \
14
  pip install --no-cache-dir -r requirements_aws.txt && \
15
- pip install --no-cache-dir gradio==4.44.0
16
 
17
  # Set up a new user named "user" with user ID 1000
18
  RUN useradd -m -u 1000 user
 
1
+ FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
2
 
3
  # Install Lambda web adapter in case you want to run with an AWS Lambda function URL
4
  #COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
 
10
 
11
  COPY requirements_aws.txt .
12
 
13
+ RUN pip install --no-cache-dir torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu && \
14
  pip install --no-cache-dir -r requirements_aws.txt && \
15
+ pip install --no-cache-dir gradio==5.34.0
16
 
17
  # Set up a new user named "user" with user ID 1000
18
  RUN useradd -m -u 1000 user
requirements.txt CHANGED
@@ -1,12 +1,12 @@
1
- torch==2.4.1
2
- pandas==2.2.2
3
  rapidfuzz==3.8.1
4
  recordlinkage==0.16
5
  pyap==0.3.1
6
  pytest==7.4.3
7
- pyarrow==14.0.1
8
  openpyxl==3.1.2
9
- gradio==4.44.0
10
- boto3==1.34.158
11
  polars==0.20.19
12
  numpy==1.26.4
 
1
+ torch==2.7.1
2
+ pandas==2.2.3
3
  rapidfuzz==3.8.1
4
  recordlinkage==0.16
5
  pyap==0.3.1
6
  pytest==7.4.3
7
+ pyarrow==19.0.1
8
  openpyxl==3.1.2
9
+ gradio==5.34.0
10
+ boto3==1.38.37
11
  polars==0.20.19
12
  numpy==1.26.4
requirements_aws.txt CHANGED
@@ -1,11 +1,11 @@
1
- pandas==2.2.2
2
  rapidfuzz==3.8.1
3
  recordlinkage==0.16
4
  pyap==0.3.1
5
  pytest==7.4.3
6
- pyarrow==14.0.1
7
  openpyxl==3.1.2
8
- gradio==4.44.0
9
- boto3==1.34.158
10
  polars==0.20.19
11
  numpy==1.26.4
 
1
+ pandas==2.2.3
2
  rapidfuzz==3.8.1
3
  recordlinkage==0.16
4
  pyap==0.3.1
5
  pytest==7.4.3
6
+ pyarrow==19.0.0
7
  openpyxl==3.1.2
8
+ gradio==5.34.0
9
+ boto3==1.38.37
10
  polars==0.20.19
11
  numpy==1.26.4
tools/constants.py CHANGED
@@ -215,7 +215,7 @@ else: exported_model = []
215
  ### ADDRESS MATCHING FUNCTIONS
216
  # Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
217
  batch_size = 10000
218
- ref_batch_size = 150000
219
 
220
  ### Fuzzy match method
221
 
 
215
  ### ADDRESS MATCHING FUNCTIONS
216
  # Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
217
  batch_size = 10000
218
+ ref_batch_size = 20000
219
 
220
  ### Fuzzy match method
221
 
tools/fuzzy_match.py CHANGED
@@ -182,7 +182,6 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
182
  fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
183
 
184
  ## Fuzzy search results
185
-
186
  match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
187
  'full_match',
188
  'full_number_match',
@@ -216,7 +215,7 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
216
  joined_ref_cols = ["fulladdress", "Reference file"]
217
  joined_ref_cols.extend(new_join_col)
218
 
219
- print("joined_ref_cols: ", joined_ref_cols)
220
  # Keep only columns that exist in reference dataset
221
  joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
222
 
 
182
  fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
183
 
184
  ## Fuzzy search results
 
185
  match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
186
  'full_match',
187
  'full_number_match',
 
215
  joined_ref_cols = ["fulladdress", "Reference file"]
216
  joined_ref_cols.extend(new_join_col)
217
 
218
+ #print("joined_ref_cols: ", joined_ref_cols)
219
  # Keep only columns that exist in reference dataset
220
  joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
221