dlsmallw committed
Commit 946f92f
1 Parent(s): 5fe82da

Task-330 Setup basic documentation

Files changed (7)
  1. Pipfile +2 -0
  2. Pipfile.lock +108 -3
  3. README.md +46 -16
  4. docs/index.md +52 -0
  5. mkdocs.yml +15 -0
  6. scripts/predict.py +129 -24
  7. setup.sh +13 -13
Pipfile CHANGED
@@ -15,6 +15,8 @@ joblib = "*"
 nltk = "*"
 htbuilder = "*"
 nest-asyncio = "*"
+mkdocs = "*"
+mkdocstrings-python = "*"
 
 [dev-packages]
 
Pipfile.lock CHANGED
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "675e02cbb23625b60f9b1ab6ac0ae5b84786b0def940f4f98e8000207075ba67"
+            "sha256": "011a0284b34b98265fb35b5b99964b701b6a30f703a45de6b092a7d7c631032d"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -167,7 +167,7 @@
                 "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44",
                 "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"
             ],
-            "markers": "platform_system == 'Windows'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'",
            "version": "==0.4.6"
         },
         "filelock": {
@@ -186,6 +186,13 @@
             "markers": "python_version >= '3.8'",
             "version": "==2025.3.0"
         },
+        "ghp-import": {
+            "hashes": [
+                "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619",
+                "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"
+            ],
+            "version": "==2.1.0"
+        },
         "gitdb": {
             "hashes": [
                 "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571",
@@ -202,6 +209,14 @@
             "markers": "python_version >= '3.7'",
             "version": "==3.1.44"
         },
+        "griffe": {
+            "hashes": [
+                "sha256:3a46fa7bd83280909b63c12b9a975732a927dd97809efe5b7972290b606c5d91",
+                "sha256:6399f7e663150e4278a312a8e8a14d2f3d7bd86e2ef2f8056a1058e38579c2ee"
+            ],
+            "markers": "python_version >= '3.9'",
+            "version": "==1.6.2"
+        },
         "htbuilder": {
             "hashes": [
                 "sha256:58c0bc5502c1a46b42ae9e074c43ec0f6fdc24ed334936cb17e1ed5a8938aee2"
@@ -260,6 +275,14 @@
             "markers": "python_version >= '3.9'",
             "version": "==2024.10.1"
         },
+        "markdown": {
+            "hashes": [
+                "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2",
+                "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==3.7"
+        },
         "markupsafe": {
             "hashes": [
                 "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4",
@@ -327,6 +350,56 @@
             "markers": "python_version >= '3.9'",
             "version": "==3.0.2"
         },
+        "mergedeep": {
+            "hashes": [
+                "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8",
+                "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==1.3.4"
+        },
+        "mkdocs": {
+            "hashes": [
+                "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2",
+                "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.8'",
+            "version": "==1.6.1"
+        },
+        "mkdocs-autorefs": {
+            "hashes": [
+                "sha256:4b5b6235a4becb2b10425c2fa191737e415b37aa3418919db33e5d774c9db079",
+                "sha256:9793c5ac06a6ebbe52ec0f8439256e66187badf4b5334b5fde0b128ec134df4f"
+            ],
+            "markers": "python_version >= '3.9'",
+            "version": "==1.4.1"
+        },
+        "mkdocs-get-deps": {
+            "hashes": [
+                "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c",
+                "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==0.2.0"
+        },
+        "mkdocstrings": {
+            "hashes": [
+                "sha256:3657be1384543ce0ee82112c3e521bbf48e41303aa0c229b9ffcccba057d922e",
+                "sha256:8ea98358d2006f60befa940fdebbbc88a26b37ecbcded10be726ba359284f73d"
+            ],
+            "markers": "python_version >= '3.9'",
+            "version": "==0.29.0"
+        },
+        "mkdocstrings-python": {
+            "hashes": [
+                "sha256:211b7aaf776cd45578ecb531e5ad0d3a35a8be9101a6bfa10de38a69af9d8fd8",
+                "sha256:9453ccae69be103810c1cf6435ce71c8f714ae37fef4d87d16aa92a7c800fe1d"
+            ],
+            "index": "pypi",
+            "markers": "python_version >= '3.9'",
+            "version": "==1.16.8"
+        },
         "mpmath": {
             "hashes": [
                 "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f",
@@ -487,6 +560,14 @@
             "markers": "python_version >= '3.9'",
             "version": "==2.2.3"
         },
+        "pathspec": {
+            "hashes": [
+                "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08",
+                "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==0.12.1"
+        },
         "pillow": {
             "hashes": [
                 "sha256:015c6e863faa4779251436db398ae75051469f7c903b043a48f078e437656f83",
@@ -564,6 +645,14 @@
             "markers": "python_version >= '3.9'",
             "version": "==11.1.0"
         },
+        "platformdirs": {
+            "hashes": [
+                "sha256:a03875334331946f13c549dbd8f4bac7a13a50a895a0eb1e8c6a8ace80d40a94",
+                "sha256:eb437d586b6a0986388f0d6f74aa0cde27b48d0e3d66843640bfb6bdcdb6e351"
+            ],
+            "markers": "python_version >= '3.9'",
+            "version": "==4.3.7"
+        },
         "protobuf": {
             "hashes": [
                 "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7",
@@ -637,6 +726,14 @@
             "markers": "python_version >= '3.8'",
             "version": "==0.9.1"
         },
+        "pymdown-extensions": {
+            "hashes": [
+                "sha256:05e0bee73d64b9c71a4ae17c72abc2f700e8bc8403755a00580b49a4e9f189e9",
+                "sha256:41e576ce3f5d650be59e900e4ceff231e0aed2a88cf30acaee41e02f063a061b"
+            ],
+            "markers": "python_version >= '3.8'",
+            "version": "==10.14.3"
+        },
         "python-dateutil": {
             "hashes": [
                 "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3",
@@ -711,6 +808,14 @@
             "markers": "python_version >= '3.8'",
             "version": "==6.0.2"
         },
+        "pyyaml-env-tag": {
+            "hashes": [
+                "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb",
+                "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==0.1"
+        },
         "referencing": {
             "hashes": [
                 "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa",
@@ -1070,7 +1175,7 @@
             "sha256:e7631a77ffb1f7d2eefa4445ebbee491c720a5661ddf6df3498ebecae5ed375c",
             "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2"
         ],
-        "markers": "platform_system != 'Darwin'",
+        "markers": "python_version >= '3.9'",
         "version": "==6.0.0"
     }
 },
README.md CHANGED
@@ -12,25 +12,55 @@ license: mit
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
-
-# Setup for Pushing to GitHub and HF Space:
-#### Due to the project being configured to use hugging face spaces to host the python web-app, the instructions will outline how to setup the project to push to any newly created Hugging Face Space.
-Note: Streamlit can still be developed and deployed to environments other than Hugging Face Spaces. Refer to the appropriate documentation associated with a chosen hosting service for how to deploy the web-app to the services environment.
-
-### After Creation of a Streamlit Hugging Face Space:
-- In the directory of the cloned repository, add the hugging face space as an additional remote origin:
-  - You can specify any name to use for the origin name (i.e., hf_origin)
-```
-git remote add <hf-origin-name> <hf-space-url>
-```
-
-- Once the space is linked, you will need to force update the space with the contents of the current repository as follows (This will sync the HF Space with the main repositories history):
-```
-git push --force <hf-origin-name> main
-```
+# NLPinitiative Streamlit Web Application
+
+---
+
+## Project Details
+
+### Description
+
+Codebase for the Streamlit app hosted on Hugging Face Spaces that provides a basic user interface for performing inference on user-submitted text using the models trained within the NLPinitiative project.
+
+---
+### Project Setup
+
+**Setup for Pushing to GitHub and HF Space**:
+
+Because the project is configured to use Hugging Face Spaces to host the Python web app, these instructions outline how to set up the project to push to any newly created Hugging Face Space.
+
+**Note**: The Streamlit app can still be developed and deployed to environments other than Hugging Face Spaces. Refer to the documentation of your chosen hosting service for how to deploy the web app to that service's environment.
+
+**After Creation of a Streamlit Hugging Face Space**:
+
+In the directory of the cloned repository, add the Hugging Face Space as an additional remote:
+`git remote add <hf-origin-name> <hf-space-url>`
+
+- **NOTE**: *You can specify any name for the remote (e.g., hf_origin)*
+
+Once the space is linked, force-update it with the contents of the current repository (this syncs the HF Space with the main repository's history):
+`git push --force <hf-origin-name> main`
+
+After these steps, any new commits can be pushed to the HF Space with the following command:
+`git push <hf-origin-name> main`
+
+---
+### Project layout
 
-- Following these steps, any new commits made can be pushed to the HF Space by using the following command:
 ```
-git push <hf-space-name> main
+├── docs               <- A directory containing documentation used for generating and serving
+│                         project documentation
+├── scripts            <- Source code for model inference
+│   ├── __init__.py    <- Makes scripts a Python module
+│   └── predict.py     <- Code to run model inference with trained models
+├── app.py             <- Entry point for the application
+├── config.py          <- Store useful variables and configuration
+├── LICENSE            <- Open-source license if one is chosen
+├── mkdocs.yml         <- mkdocs project configuration
+├── Pipfile            <- The project dependency file for reproducing the analysis environment,
+│                         e.g., generated with `pipenv install`
+├── Pipfile.lock       <- Locked file containing hashes for dependencies
+├── README.md          <- The top-level README for developers using this project
+├── requirements.txt   <- Plaintext dependency information (necessary for app hosting)
+└── setup.sh           <- Bash script containing convenience commands for managing the project
 ```
-
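
For orientation, the sketch below shows how the pieces in the layout above are typically wired together: `app.py` passing user input from the Streamlit UI to the inference code in `scripts/predict.py`. This is an illustration only and not part of this commit; the exact Streamlit calls and the `HF_TOKEN` secret name are assumptions.

```python
import streamlit as st

from scripts.predict import InferenceHandler

# Assumption: the Hugging Face token is stored as a Space secret named HF_TOKEN.
handler = InferenceHandler(api_token=st.secrets["HF_TOKEN"])

st.title("NLPinitiative Text Classifier")
text = st.text_area("Enter text to analyze")

if st.button("Classify") and text:
    # classify_text runs the binary classifier and, for discriminatory text,
    # the multilabel regression over the discrimination categories.
    st.json(handler.classify_text(text))
```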
 
docs/index.md ADDED
@@ -0,0 +1,52 @@
+# NLPinitiative Streamlit Documentation
+
+---
+
+## Project Details
+
+### Description
+
+Codebase for the Streamlit app hosted on Hugging Face Spaces that provides a basic user interface for performing inference on user-submitted text using the models trained within the NLPinitiative project.
+
+---
+### Project Setup
+
+**Setup for Pushing to GitHub and HF Space**:
+
+Because the project is configured to use Hugging Face Spaces to host the Python web app, these instructions outline how to set up the project to push to any newly created Hugging Face Space.
+
+**Note**: The Streamlit app can still be developed and deployed to environments other than Hugging Face Spaces. Refer to the documentation of your chosen hosting service for how to deploy the web app to that service's environment.
+
+**After Creation of a Streamlit Hugging Face Space**:
+
+In the directory of the cloned repository, add the Hugging Face Space as an additional remote:
+`git remote add <hf-origin-name> <hf-space-url>`
+
+- **NOTE**: *You can specify any name for the remote (e.g., hf_origin)*
+
+Once the space is linked, force-update it with the contents of the current repository (this syncs the HF Space with the main repository's history):
+`git push --force <hf-origin-name> main`
+
+After these steps, any new commits can be pushed to the HF Space with the following command:
+`git push <hf-origin-name> main`
+
+---
+### Project layout
+
+```
+├── docs               <- A directory containing documentation used for generating and serving
+│                         project documentation
+├── scripts            <- Source code for model inference
+│   ├── __init__.py    <- Makes scripts a Python module
+│   └── predict.py     <- Code to run model inference with trained models
+├── app.py             <- Entry point for the application
+├── config.py          <- Store useful variables and configuration
+├── LICENSE            <- Open-source license if one is chosen
+├── mkdocs.yml         <- mkdocs project configuration
+├── Pipfile            <- The project dependency file for reproducing the analysis environment,
+│                         e.g., generated with `pipenv install`
+├── Pipfile.lock       <- Locked file containing hashes for dependencies
+├── README.md          <- The top-level README for developers using this project
+├── requirements.txt   <- Plaintext dependency information (necessary for app hosting)
+└── setup.sh           <- Bash script containing convenience commands for managing the project
+```
mkdocs.yml ADDED
@@ -0,0 +1,15 @@
+site_name: NLPinitiative Streamlit Documentation
+
+nav:
+  - Home: index.md
+
+theme:
+  name: readthedocs
+
+plugins:
+  - search
+  - mkdocstrings:
+      handlers:
+        python:
+          options:
+            docstring_style: numpy
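
Because the mkdocstrings handler above is configured with `docstring_style: numpy`, API pages are generated from numpy-style docstrings such as those added to `scripts/predict.py`. A minimal sketch of the expected format (the function itself is illustrative, not part of the codebase):

```python
def word_count(text: str) -> int:
    """Count the whitespace-separated words in a string.

    Parameters
    ----------
    text : str
        The input text to count words in.

    Returns
    -------
    int
        The number of words found.
    """
    return len(text.split())
```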
scripts/predict.py CHANGED
@@ -16,23 +16,53 @@ from transformers import (
 BIN_REPO = 'dlsmallw/NLPinitiative-Binary-Classification'
 ML_REPO = 'dlsmallw/NLPinitiative-Multilabel-Regression'
 
-## Class used to encapsulate and handle the logic for inference
 class InferenceHandler:
-    def __init__(self, api_token):
+    """A class that handles performing inference using the trained binary classification and multilabel regression models."""
+
+    def __init__(self, api_token: str):
+        """Constructor for instantiating an InferenceHandler object.
+
+        Parameters
+        ----------
+        api_token : str
+            A Hugging Face token with read access, used to download the trained models from the Hugging Face Hub.
+        """
+
         self.api_token = api_token
-        self.bin_tokenizer, self.bin_model = self.init_model_and_tokenizer(BIN_REPO)
-        self.ml_regr_tokenizer, self.ml_regr_model = self.init_model_and_tokenizer(ML_REPO)
+        self.bin_tokenizer, self.bin_model = self._init_model_and_tokenizer(BIN_REPO)
+        self.ml_regr_tokenizer, self.ml_regr_model = self._init_model_and_tokenizer(ML_REPO)
         nltk.download('punkt_tab')
 
-    def get_config(self, repo_id):
+    def _get_config(self, repo_id: str) -> str:
+        """Retrieves the config.json file from the specified model repository.
+
+        Parameters
+        ----------
+        repo_id : str
+            The repository id (i.e., <owner username>/<repository name>).
+
+        """
+
         config = None
         if repo_id and self.api_token:
             config = huggingface_hub.hf_hub_download(repo_id, filename='config.json', token=self.api_token)
         return config
 
-    ## Initializes a model and tokenizer for use in inference using the models path
-    def init_model_and_tokenizer(self, repo_id):
-        config = self.get_config(repo_id)
+    def _init_model_and_tokenizer(self, repo_id: str):
+        """Initializes a model and tokenizer for use in inference using the model repository id.
+
+        Parameters
+        ----------
+        repo_id : str
+            The repository id (i.e., <owner username>/<repository name>).
+
+        Returns
+        -------
+        tuple[PreTrainedTokenizer | PreTrainedTokenizerFast, PreTrainedModel]
+            A tuple containing the tokenizer and model objects.
+        """
+
+        config = self._get_config(repo_id)
         with open(config) as config_file:
             config_json = json.load(config_file)
             model_name = config_json['_name_or_path']
@@ -43,24 +73,75 @@ class InferenceHandler:
         model.eval()
         return tokenizer, model
 
-    ## Handles logic used to encode the text for use in binary classification
-    def encode_binary(self, text):
+    def _encode_binary(self, text: str):
+        """Preprocesses and tokenizes the input text for binary classification.
+
+        Parameters
+        ----------
+        text : str
+            The input text to be preprocessed and tokenized.
+
+        Returns
+        -------
+        BatchEncoding
+            The preprocessed and tokenized input text.
+        """
+
         bin_tokenized_input = self.bin_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
         return bin_tokenized_input
 
-    ## Handles logic used to encode the text for use in multilabel regression
-    def encode_multilabel(self, text):
+    def _encode_multilabel(self, text: str):
+        """Preprocesses and tokenizes the input text for multilabel regression.
+
+        Parameters
+        ----------
+        text : str
+            The input text to be preprocessed and tokenized.
+
+        Returns
+        -------
+        BatchEncoding
+            The preprocessed and tokenized input text.
+        """
+
         ml_tokenized_input = self.ml_regr_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
         return ml_tokenized_input
 
-    ## Handles text encoding for both binary classification and multilabel regression
-    def encode_input(self, text):
-        bin_inputs = self.encode_binary(text)
-        ml_inputs = self.encode_multilabel(text)
+    def _encode_input(self, text: str):
+        """Preprocesses and tokenizes the input text for sentiment classification (both models).
+
+        Parameters
+        ----------
+        text : str
+            The input text to be preprocessed and tokenized.
+
+        Returns
+        -------
+        tuple[BatchEncoding, BatchEncoding]
+            A tuple containing the preprocessed and tokenized input text for both the binary and multilabel regression models.
+        """
+
+        bin_inputs = self._encode_binary(text)
+        ml_inputs = self._encode_multilabel(text)
         return bin_inputs, ml_inputs
 
-    ## Handles performing the full sentiment analysis (binary classification and multilabel regression)
-    def classify_text(self, input):
+    def classify_text(self, input: str):
+        """Performs inference on the input text to determine the binary classification and the multilabel regression values for the categories.
+
+        Determines whether the text is discriminatory. If it is discriminatory, it then performs regression on the input text to determine the
+        assessed percentage to which each category applies.
+
+        Parameters
+        ----------
+        input : str
+            The input text to be classified.
+
+        Returns
+        -------
+        dict[str, Any]
+            The resulting classification and regression values for each category.
+        """
+
         result = {
             'text_input': input,
             'results': []
@@ -100,9 +181,21 @@ class InferenceHandler:
         result['results'] = sent_res_arr
         return result
 
-    ## Handles logic for checking the binary classfication of the text
-    def discriminatory_inference(self, text):
-        bin_inputs = self.encode_binary(text)
+    def discriminatory_inference(self, text: str):
+        """Performs inference on the input text to determine the binary classification.
+
+        Parameters
+        ----------
+        text : str
+            The input text to be classified.
+
+        Returns
+        -------
+        tuple[str, Number]
+            A tuple consisting of the string classification (Discriminatory or Non-Discriminatory) and the numeric prediction class (1 or 0).
+        """
+
+        bin_inputs = self._encode_binary(text)
 
         with torch.no_grad():
             bin_logits = self.bin_model(**bin_inputs).logits
@@ -114,9 +207,21 @@ class InferenceHandler:
 
         return bin_text_pred, pred_class
 
-    ## Handles logic for assessing the categories of discrimination
-    def category_inference(self, text):
-        ml_inputs = self.encode_multilabel(text)
+    def category_inference(self, text: str):
+        """Performs inference on the input text to determine the regression values for the categories of discrimination.
+
+        Parameters
+        ----------
+        text : str
+            The input text to be classified.
+
+        Returns
+        -------
+        list[float]
+            A list of regression values indicating the degree to which each category of discrimination applies.
+        """
+
+        ml_inputs = self._encode_multilabel(text)
 
         with torch.no_grad():
             ml_outputs = self.ml_regr_model(**ml_inputs).logits
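
For reference, a minimal usage sketch of the refactored `InferenceHandler` (the token placeholder and printed keys are assumptions based on the constructor and `classify_text` shown above; this snippet is not part of the commit):

```python
from scripts.predict import InferenceHandler

# A Hugging Face token with read access is needed to download the model configs.
handler = InferenceHandler(api_token="hf_xxx")  # placeholder token

# classify_text returns a dict with the original input and per-sentence results.
result = handler.classify_text("Example sentence to analyze.")
print(result['text_input'])
print(result['results'])
```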
setup.sh CHANGED
@@ -23,16 +23,16 @@ requirements() {
     pipenv requirements > requirements.txt
 }
 
-# docs() {
-#     case $1 in
-#         build)
-#             mkdocs build
-#             ;;
-#         serve)
-#             mkdocs serve
-#             ;;
-#         *)
-#             log_error "Specify 'build' or 'serve'. For example: docs build"
-#             ;;
-#     esac
-# }
+docs() {
+    case $1 in
+        build)
+            mkdocs build
+            ;;
+        serve)
+            mkdocs serve
+            ;;
+        *)
+            log_error "Specify 'build' or 'serve'. For example: docs build"
+            ;;
+    esac
+}