Spaces:
Running
Running
Commit
·
ac346a9
1
Parent(s):
82732e8
Updated packages; changed the reference file batch size to avoid excess memory usage
Browse files- .dockerignore +6 -1
- .gitignore +6 -1
- Dockerfile +3 -3
- requirements.txt +5 -5
- requirements_aws.txt +4 -4
- tools/constants.py +1 -1
- tools/fuzzy_match.py +1 -2
.dockerignore
CHANGED
@@ -16,4 +16,9 @@ build/*
|
|
16 |
dist/*
|
17 |
.ipynb_checkpoints/*
|
18 |
orchestration/*
|
19 |
-
.vscode/*
|
|
|
|
|
|
|
|
|
|
|
|
16 |
dist/*
|
17 |
.ipynb_checkpoints/*
|
18 |
orchestration/*
|
19 |
+
.vscode/*
|
20 |
+
usage/
|
21 |
+
logs/
|
22 |
+
feedback/
|
23 |
+
input/
|
24 |
+
output/
|
.gitignore
CHANGED
@@ -16,4 +16,9 @@ experiments/*
|
|
16 |
build_deps/*
|
17 |
build/*
|
18 |
dist/*
|
19 |
-
.vscode/*
|
|
|
|
|
|
|
|
|
|
|
|
16 |
build_deps/*
|
17 |
build/*
|
18 |
dist/*
|
19 |
+
.vscode/*
|
20 |
+
usage/*
|
21 |
+
logs/*
|
22 |
+
feedback/*
|
23 |
+
input/*
|
24 |
+
output/*
|
Dockerfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
FROM public.ecr.aws/docker/library/python:3.11.
|
2 |
|
3 |
# Install Lambda web adapter in case you want to run with an AWS Lambda function URL
|
4 |
#COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
|
@@ -10,9 +10,9 @@ WORKDIR /src
|
|
10 |
|
11 |
COPY requirements_aws.txt .
|
12 |
|
13 |
-
RUN pip install --no-cache-dir torch==2.
|
14 |
pip install --no-cache-dir -r requirements_aws.txt && \
|
15 |
-
pip install --no-cache-dir gradio==
|
16 |
|
17 |
# Set up a new user named "user" with user ID 1000
|
18 |
RUN useradd -m -u 1000 user
|
|
|
1 |
+
FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm
|
2 |
|
3 |
# Install Lambda web adapter in case you want to run with an AWS Lambda function URL
|
4 |
#COPY --from=public.ecr.aws/awsguru/aws-lambda-adapter:0.8.3 /lambda-adapter /opt/extensions/lambda-adapter
|
|
|
10 |
|
11 |
COPY requirements_aws.txt .
|
12 |
|
13 |
+
RUN pip install --no-cache-dir torch==2.7.1+cpu --index-url https://download.pytorch.org/whl/cpu && \
|
14 |
pip install --no-cache-dir -r requirements_aws.txt && \
|
15 |
+
pip install --no-cache-dir gradio==5.34.0
|
16 |
|
17 |
# Set up a new user named "user" with user ID 1000
|
18 |
RUN useradd -m -u 1000 user
|
requirements.txt
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
-
torch==2.
|
2 |
-
pandas==2.2.
|
3 |
rapidfuzz==3.8.1
|
4 |
recordlinkage==0.16
|
5 |
pyap==0.3.1
|
6 |
pytest==7.4.3
|
7 |
-
pyarrow==
|
8 |
openpyxl==3.1.2
|
9 |
-
gradio==
|
10 |
-
boto3==1.
|
11 |
polars==0.20.19
|
12 |
numpy==1.26.4
|
|
|
1 |
+
torch==2.7.1
|
2 |
+
pandas==2.2.3
|
3 |
rapidfuzz==3.8.1
|
4 |
recordlinkage==0.16
|
5 |
pyap==0.3.1
|
6 |
pytest==7.4.3
|
7 |
+
pyarrow==19.0.1
|
8 |
openpyxl==3.1.2
|
9 |
+
gradio==5.34.0
|
10 |
+
boto3==1.38.37
|
11 |
polars==0.20.19
|
12 |
numpy==1.26.4
|
requirements_aws.txt
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
-
pandas==2.2.
|
2 |
rapidfuzz==3.8.1
|
3 |
recordlinkage==0.16
|
4 |
pyap==0.3.1
|
5 |
pytest==7.4.3
|
6 |
-
pyarrow==
|
7 |
openpyxl==3.1.2
|
8 |
-
gradio==
|
9 |
-
boto3==1.
|
10 |
polars==0.20.19
|
11 |
numpy==1.26.4
|
|
|
1 |
+
pandas==2.2.3
|
2 |
rapidfuzz==3.8.1
|
3 |
recordlinkage==0.16
|
4 |
pyap==0.3.1
|
5 |
pytest==7.4.3
|
6 |
+
pyarrow==19.0.0
|
7 |
openpyxl==3.1.2
|
8 |
+
gradio==5.34.0
|
9 |
+
boto3==1.38.37
|
10 |
polars==0.20.19
|
11 |
numpy==1.26.4
|
tools/constants.py
CHANGED
@@ -215,7 +215,7 @@ else: exported_model = []
|
|
215 |
### ADDRESS MATCHING FUNCTIONS
|
216 |
# Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
|
217 |
batch_size = 10000
|
218 |
-
ref_batch_size =
|
219 |
|
220 |
### Fuzzy match method
|
221 |
|
|
|
215 |
### ADDRESS MATCHING FUNCTIONS
|
216 |
# Address matcher will try to match <batch_size> records in one go to avoid exceeding memory limits.
|
217 |
batch_size = 10000
|
218 |
+
ref_batch_size = 20000
|
219 |
|
220 |
### Fuzzy match method
|
221 |
|
tools/fuzzy_match.py
CHANGED
@@ -182,7 +182,6 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
|
|
182 |
fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
|
183 |
|
184 |
## Fuzzy search results
|
185 |
-
|
186 |
match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
|
187 |
'full_match',
|
188 |
'full_number_match',
|
@@ -216,7 +215,7 @@ def _create_fuzzy_match_results_output(results:PandasDataFrame, search_df_after_
|
|
216 |
joined_ref_cols = ["fulladdress", "Reference file"]
|
217 |
joined_ref_cols.extend(new_join_col)
|
218 |
|
219 |
-
print("joined_ref_cols: ", joined_ref_cols)
|
220 |
# Keep only columns that exist in reference dataset
|
221 |
joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
|
222 |
|
|
|
182 |
fuzzy_match_limit = fuzzy_match_limit, blocker_col=blocker_col)
|
183 |
|
184 |
## Fuzzy search results
|
|
|
185 |
match_results_cols = ['search_orig_address','reference_orig_address', 'ref_index',
|
186 |
'full_match',
|
187 |
'full_number_match',
|
|
|
215 |
joined_ref_cols = ["fulladdress", "Reference file"]
|
216 |
joined_ref_cols.extend(new_join_col)
|
217 |
|
218 |
+
#print("joined_ref_cols: ", joined_ref_cols)
|
219 |
# Keep only columns that exist in reference dataset
|
220 |
joined_ref_cols = [col for col in joined_ref_cols if col in ref_df_cleaned.columns]
|
221 |
|