Jayra Ortiz committed
Commit b0716cb · 1 Parent(s): 180c3a0

:star: added initial working architecture

.env ADDED
@@ -0,0 +1,4 @@
+ COMPLETIONS_MODEL=none
+ OPEN_API_KEY=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
+ AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=scraperoutputs;AccountKey=n3bsqhhFypROsBb9huJoUkscit6XVqn7cAS3cWYaRltyONh4+2NvlOMFx6FPBKL6PGt9+aqvN7WA+AStd23gpQ==;EndpointSuffix=core.windows.net"
+ CONTAINER=identity
.gitignore ADDED
@@ -0,0 +1,37 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ # Use the official Python base image with tag/version of your choice
+ FROM python:3.9
+
+ # Set the working directory in the container
+
+
+ ENV WORKDIR=/app
+ WORKDIR ${WORKDIR}
+
+ # Copy the requirements file into the container
+ COPY requirements.txt .
+
+ # Install the Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the application code into the container
+ COPY . .
+
+ ENV PYTHONPATH "${PYTHONPATH}:/code/src"
+
+ # Define default run command
+ WORKDIR ${WORKDIR}/src
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
configs/config.yaml ADDED
@@ -0,0 +1,2 @@
+ embedding_model: text-embedding-ada-002
+ completion_model: gpt-3.5-turbo-16k
core/environments.py ADDED
@@ -0,0 +1,20 @@
+ from langchain.embeddings import OpenAIEmbeddings
+
+ import openai
+ import os
+ from pathlib import Path
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ COMPLETIONS_MODEL = "gpt-3.5-turbo-16k"
+ EMBEDDING_MODEL = "text-embedding-ada-002"
+ OPEN_API_KEY = os.getenv("OPEN_API_KEY")
+ AZURE_STORAGE_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
+
+ CONTAINER = os.getenv("CONTAINER")
+ openai.api_key = OPEN_API_KEY
+ os.environ['OPENAI_API_KEY'] = openai.api_key
+ embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
+ mod_path = Path(__file__).parent.parent
+ src_path = Path(__file__).parent
core/models/__init__.py ADDED
File without changes
core/models/parser.py ADDED
@@ -0,0 +1,30 @@
+ from pydantic import BaseModel, Field
+ from typing import Optional, Union
+
+ class WorkExperience(BaseModel):
+     position: str
+     company: str
+     date: str
+     responsibilities: str
+
+ class Education(BaseModel):
+     degree: str
+     school: str
+     date: Optional[str] = None
+
+ class ApplicantResume(BaseModel):
+     applicant_name: str = None
+     phone_number: Optional[str] = None
+     email: Optional[str] = None
+     website: Optional[str] = None
+     applicant_summary: Optional[str] = None
+     work_experience: Optional[list[WorkExperience]] = None
+     skills: list[str]
+     education: Optional[list[Education]] = None
+     Publications: Optional[dict] = None
+     location: Optional[str] = None
+     languages: Optional[list[str]] = None
+
+
+
+
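Editor's note: a minimal sketch of how these models validate a parsed payload. The sample values below are hypothetical and not taken from the repository.

from core.models.parser import ApplicantResume

# A dict shaped like the JSON the completion model is asked to return.
payload = {
    "applicant_name": "Jane Doe",
    "skills": ["Python", "SQL"],
    "work_experience": [
        {
            "position": "Data Engineer",
            "company": "Acme",
            "date": "2021-2023",
            "responsibilities": "Built ETL pipelines.",
        }
    ],
}

resume = ApplicantResume(**payload)  # raises ValidationError if a required field such as skills is missing
print(resume.skills)                 # ['Python', 'SQL']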
core/models/reports.py ADDED
@@ -0,0 +1,8 @@
+ from pydantic import BaseModel, Field
+ from typing import Any
+
+ class ReportConfiguration(BaseModel):
+     entities: list[str]
+     pos_frequencies: dict[str, int]
+     keyterms: list[Any]
+     bi_grams: list[Any]
core/parser/__init__.py ADDED
File without changes
core/parser/file_parser.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ import numpy as np
+ import fitz
+ import docx
+
+ class BaseParser():
+     def __init__(self, file_name: str) -> None:
+         self.file_name = file_name
+         self.file_path = f'./{file_name}'
+         self.file_type = file_name.split('.')[-1]
+
+     def fitz_pymupdf_parser(self):
+         doc = fitz.open(self.file_path)
+         text = ""
+
+         for page in doc:
+             text += page.get_text().encode("utf-8", "ignore").decode("utf-8", "ignore")
+
+         print(text)
+         return text
+
+     def docx_parser(self):
+         docs = docx.Document(self.file_path)
+         text = ""
+
+         for paragraph in docs.paragraphs:
+             text += paragraph.text + "\n"
+
+         print(text)
+         return text
+
+     def parse_pdf(self):
+         parsed_text = ""
+         if self.file_type == "txt":
+             parsed_text = open(self.file_name, "r").read()
+         elif self.file_type == "pdf":
+             parsed_text = self.fitz_pymupdf_parser()
+         elif self.file_type == "docx":
+             parsed_text = self.docx_parser()
+
+         return parsed_text
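Editor's note: a minimal sketch of running the parser on its own; "resume.pdf" is a placeholder file name and must exist in the working directory.

from core.parser.file_parser import BaseParser

parser = BaseParser("resume.pdf")
raw_text = parser.parse_pdf()  # dispatches on the file extension: txt, pdf, or docx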
core/parser/prompt.py ADDED
@@ -0,0 +1,44 @@
+ def generate_identity_prompt():
+
+     prompt = """
+     Identity:
+     1. You are a resume parser. A parser that returns the fields you are given.
+     2. You will act as a bot that parses and analyzes the resume based on its context.
+     3. You will receive the resume in different formats. The ultimate goal is for you to parse the text as it is. No modifications, just copy and paste.
+
+     Instructions:
+     1. Your goal is to scrape the data from the resume based on its contextual order. Since resumes are unstructured data, make sure that your ordering is as follows:
+         * applicant_name = name of applicant
+         * phone_number = phone number indicated there. Should be optional
+         * email = email of the applicant. Should be optional
+         * website = (url you find in the resume if Any)
+         * applicant_summary = (their introduction to who they are if Any)
+         * work_experience (Their Job Experience over the years if Any). If there is, follow this format per work experience:
+             * position: str
+             * company: str
+             * date: str
+             * responsibilities: str
+         * skills (The skills they acquired over their tenure as a professional if Any), should be a list of strings.
+         * education (Their Educational Attainment if Any). If there is, follow this format per education attained:
+             * degree: str
+             * school: str
+             * date: Optional[str] = None
+         * publications (Their Publications if Any) should be in list[dict] form.
+         * location (the city and country where the person is located if any). Should be a string.
+         * languages (the languages spoken by the candidate if any). Should be a list of strings.
+     2. You will parse only these fields and make sure that the list above keeps the same number of items. You should not make up words beyond what is explicitly written in the resume. You will only copy it as it is.
+     3. Make sure that you only take from the context provided below, else just return null.
+     4. Make sure that your output is in a json format based on the fields provided to you.
+     """
+
+     return prompt
+
+ def generate_context_prompt(resume_raw_txt: str):
+     prompt = f"""
+     Context:
+
+     {resume_raw_txt}
+     """
+
+     return prompt
+
core/parser/task.py ADDED
@@ -0,0 +1,45 @@
+
+ from core.models.parser import ApplicantResume
+ from core.parser.file_parser import BaseParser
+ from core.parser.prompt import generate_context_prompt, generate_identity_prompt
+ from core.resources.azure_openai import generate_response
+ import json
+
+ def extractor_task(file_name: str):
+     # Parse all data in its raw format.
+     parser = BaseParser(file_name)
+     raw_txt = parser.parse_pdf()
+     messages = generate_messages(raw_txt)
+
+     # Generate the response and load it into a pydantic model that validates what GPT returned.
+     # The model is our baseline for which fields we can access from the resume: if it carries
+     # enough fields to run a context-similarity check on, we can generate a proper report.
+     # It also supports reporting, since we can raise an issue when there is not enough context
+     # to work with, i.e. the document is not a complete resume.
+     response = generate_response(messages)
+     json_response = json.loads(response)
+     model_response = ApplicantResume(**json_response)  # validation only; the raw dict is returned below
+
+     return json_response
+
+
+ def generate_messages(raw_txt: str):
+     print("raw_txt in generate_messages", raw_txt)
+     messages = []
+
+     identity_prompt = generate_identity_prompt()
+     context_prompt = generate_context_prompt(raw_txt)
+
+     messages.append({"role": "system", "content": identity_prompt})
+     messages.append({"role": "system", "content": context_prompt})
+
+     return messages
+
+ def list_dict_to_str_parser(items):
+     result = ""
+     for item in items:
+         for val in item.dict().values():
+             result += f"{val}\n"
+
+     return result
+
core/report/__init__.py ADDED
File without changes
core/resources/__init__.py ADDED
File without changes
core/resources/azure_openai.py ADDED
@@ -0,0 +1,15 @@
+ from langchain.embeddings import OpenAIEmbeddings
+ from core.environments import EMBEDDING_MODEL, COMPLETIONS_MODEL
+ import openai
+
+
+ def generate_response(messages: list):
+     chat_response = openai.ChatCompletion.create(
+         #deployment_id="chatbot",
+         model=COMPLETIONS_MODEL, messages=messages
+     )
+     print(chat_response)
+     return chat_response["choices"][0]["message"]["content"].strip(" \n")
+
+ def get_embeddings():
+     return OpenAIEmbeddings(model=EMBEDDING_MODEL)
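Editor's note: a minimal sketch of calling generate_response directly, assuming OPEN_API_KEY is set in .env so that importing core.environments configures openai.api_key; the message contents here are placeholders for the real identity and context prompts.

from core.resources.azure_openai import generate_response

messages = [
    {"role": "system", "content": "You are a resume parser."},  # identity prompt
    {"role": "system", "content": "Context: ..."},              # context prompt carrying the raw resume text
]
reply = generate_response(messages)  # returns the assistant message content as a stripped string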
core/resources/constants.py ADDED
@@ -0,0 +1,6 @@
+ def get_embedding_model():
+     return "text-embedding-ada-002"
+
+
+ def get_completion_model():
+     return "gpt-3.5-turbo-16k"
core/similarity/__init__.py ADDED
File without changes
core/similarity/context.py ADDED
File without changes
core/similarity/service.py ADDED
@@ -0,0 +1,130 @@
+ import numpy as np
+ import textacy
+ from textacy import extract
+ import spacy
+ from core.models.reports import ReportConfiguration
+ import nltk
+
+ spacy.cli.download("en_core_web_sm")
+ spacy.cli.download("en_core_web_md")
+
+ class EntityManager:
+     def __init__(self, text):
+
+         self.nlp = spacy.load('en_core_web_sm')
+         self.text = text
+         self.doc = self.nlp(text)
+
+     def count_frequency(self):
+         """
+         Count the frequency of part-of-speech tags in the input text.
+
+         Returns:
+             dict: A dictionary with POS tags as keys and their frequencies as values.
+         """
+         pos_freq = {}
+         for token in self.doc:
+             if token.pos_ in pos_freq:
+                 pos_freq[token.pos_] += 1
+             else:
+                 pos_freq[token.pos_] = 1
+         return pos_freq
+
+     def get_entity_from_txt(self):
+         pos_tags = ['NOUN', 'PROPN']
+         nouns = [str(token.text) for token in self.doc if token.pos_ in pos_tags]
+         return nouns
+
+     def extract_attrs(self) -> ReportConfiguration:
+         """
+         Extract frequency-based attributes from the parsed text.
+         """
+         key_terms = KeytermExtractor(self.text)
+
+         return ReportConfiguration(
+             entities=self.get_entity_from_txt(),
+             pos_frequencies=self.count_frequency(),
+             keyterms=key_terms.get_keyterms_based_on_sgrank(),
+             bi_grams=key_terms.bi_gramchunker()
+         )
+
+
+ class KeytermExtractor:
+     """
+     A class for extracting keyterms from a given text using various algorithms.
+     """
+
+     def __init__(self, raw_text: str, top_n_values: int = 20):
+         """
+         Initialize the KeytermExtractor object.
+
+         Args:
+             raw_text (str): The raw input text.
+             top_n_values (int): The number of top keyterms to extract.
+         """
+         self.raw_text = raw_text
+         self.text_doc = textacy.make_spacy_doc(
+             self.raw_text, lang="en_core_web_md")
+         self.top_n_values = top_n_values
+
+     def get_keyterms_based_on_textrank(self):
+         """
+         Extract keyterms using the TextRank algorithm.
+
+         Returns:
+             List[str]: A list of top keyterms based on TextRank.
+         """
+         return list(extract.keyterms.textrank(self.text_doc, normalize="lemma",
+                                               topn=self.top_n_values))
+
+     def get_keyterms_based_on_sgrank(self):
+         """
+         Extract keyterms using the SGRank algorithm.
+
+         Returns:
+             List[str]: A list of top keyterms based on SGRank.
+         """
+         return list(extract.keyterms.sgrank(self.text_doc, normalize="lemma",
+                                             topn=self.top_n_values))
+
+     def bi_gramchunker(self):
+         """
+         Chunk the text into bigrams.
+
+         Returns:
+             List[str]: A list of bigrams.
+         """
+         return list(textacy.extract.basics.ngrams(self.text_doc, n=2, filter_stops=True,
+                                                   filter_nums=True, filter_punct=True))
+
+     def tri_gramchunker(self):
+         """
+         Chunk the text into trigrams.
+
+         Returns:
+             List[str]: A list of trigrams.
+         """
+         return list(textacy.extract.basics.ngrams(self.text_doc, n=3, filter_stops=True,
+                                                   filter_nums=True, filter_punct=True))
+
+
+ def create_annotated_text(input_string: str, word_list: list[str], annotation: str, color_code: str):
+     # Tokenize the input string
+     tokens = nltk.word_tokenize(input_string)
+
+     # Convert the list to a set for quick lookups
+     word_set = set(word_list)
+
+     # Initialize an empty list to hold the annotated text
+     annotated_text = []
+
+     for token in tokens:
+         # Check if the token is in the set
+         if token in word_set:
+             # If it is, append a tuple with the token, annotation, and color code
+             annotated_text.append((token, annotation, color_code))
+         else:
+             # If it's not, just append the token as a string
+             annotated_text.append(token)
+
+     return annotated_text
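Editor's note: a minimal sketch of how EntityManager is combined with the parsed resume to score a job description, following the flow in test.ipynb; resume_text and jd_text below are placeholders for the real inputs.

from core.similarity.service import EntityManager

resume_text = "data engineer with experience in python, spark and aws ..."  # cleaned resume text
jd_text = "looking for a software engineer with python and aws skills ..."  # job description text

resume_attrs = EntityManager(resume_text.lower()).extract_attrs()
jd_attrs = EntityManager(jd_text.lower()).extract_attrs()

resume_keywords = set(resume_attrs.entities)
job_keywords = set(jd_attrs.entities)

# Jaccard similarity over the extracted noun/proper-noun keywords.
jaccard_similarity = len(job_keywords & resume_keywords) / len(job_keywords | resume_keywords)
print(jaccard_similarity)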
cv_job_maching.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eff3c0940574b067c7411dda40ae3ce67f905b3ce2d24909d89b7023465e6c5a
+ size 4595256
data/sample_linkedin_1.pdf ADDED
Binary file (41 kB).
 
data/sample_linkedin_2.pdf ADDED
Binary file (42.9 kB).
 
main.py ADDED
@@ -0,0 +1,44 @@
+ import uvicorn
+
+ from utils.settings import app
+ from fastapi import UploadFile
+
+ import pandas as pd
+ import aiofiles
+
+ from starlette import status
+ from starlette.responses import JSONResponse
+ from core.parser.file_parser import BaseParser
+ from core.parser.task import extractor_task, list_dict_to_str_parser
+ from core.similarity.service import EntityManager
+
+
+
+ @app.post("/api/parse-file")
+ async def upload_config_file_to_s3(
+     file: UploadFile,
+     checkpoint_name: str,
+     scraper: str = "base_parser",
+     checkpoint_layer: str = "tier_0",
+ ):
+     try:
+         file_path = f'./{file.filename}'
+         async with aiofiles.open(file_path, 'wb') as out_file:
+             content = await file.read()  # async read
+             await out_file.write(content)
+
+         json_response = extractor_task(file_path)
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={"code": 200, "data": json_response},
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"code": 400, "message": f"{e}"},
+         )
+
+
+ if __name__ == "__main__":
+     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
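Editor's note: a minimal sketch of calling the endpoint once the app is running on port 8000. It assumes the requests package is available (it is not listed in requirements.txt); the file name and checkpoint_name value are placeholders.

import requests

with open("resume.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/parse-file",
        params={"checkpoint_name": "demo"},  # scraper and checkpoint_layer fall back to their defaults
        files={"file": ("resume.pdf", f, "application/pdf")},
    )
print(resp.json())  # {"code": 200, "data": {...parsed resume fields...}} on success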
observation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ openai
+ scipy
+ numpy
+ pandas==1.3.5
+ chromadb==0.3.26
+ langchain==0.0.228
+ tiktoken
+ flask
+ redis
+ flask-redis
+ plotly==5.16.1
+ scikit-learn
+ matplotlib
+ pymupdf
+ python-docx
+ gensim
+ nltk
+ textacy
+ aiofiles
test.ipynb ADDED
@@ -0,0 +1,524 @@
The notebook exercises the pipeline end to end: it installs the requirements, downloads the spaCy models, parses a sample resume with extractor_task, and scores the result against a job description. The cell sources are reproduced below; the bulky captured outputs (pip/spaCy download logs, the raw resume text echoed twice by the parser's print statements, and the full ChatCompletion payload) are condensed into bracketed summaries.

+ !pip3 install -r requirements.txt

+ from core.parser.file_parser import BaseParser
+ from core.parser.task import extractor_task, list_dict_to_str_parser
+ from core.similarity.service import EntityManager
+
+ json_response = extractor_task("Ortiz_02-04-2023resume-dtn.pdf")

[output: download/install logs for en_core_web_sm 3.7.0 and en_core_web_md 3.7.0; the raw resume text printed by the parser and again by generate_messages; then the gpt-3.5-turbo-16k-0613 ChatCompletion response whose content is the parsed resume JSON (applicant_name "Jayra Gaile Ortiz", four work_experience entries at DTN, Getaka Labs, and Antares Software, skills, education, publications, location "Manila, Philippines"); usage: 1651 prompt tokens, 1044 completion tokens, 2695 total]

+ json_response.dict()

[output: the parsed resume as a dict — applicant_name, phone_number, email, website, applicant_summary, the four work_experience entries, a skills list (Rust, Python, Typescript/JavaScript, PSQL, Redshift, Spark, DBT, HDFS, AWS services, Docker, Kubernetes(k8s), CI/CD GitLab, Git, Shell, Terraform, Datadog, Redis, Celery, Tableau, Plotly, Kimball(Data Modelling), Star Schema), one education entry, Publications=None, location 'Manila, Philippines', and an empty languages list]

+ input_JD = """
+ COMPUTER SPECIALIST (SOFTWARE)
+ 9 Metrotech Center, Brooklyn N
+
+ Job Category: Technology, Data & Innovation
+
+ (1) A baccalaureate degree from an accredited college, including or supplemented by twenty-four (24) semester credits in computer science or a related computer field and two (2) years of satisfactory full-time software experience in designing, programming, debugging, maintaining, implementing, and enhancing computer software applications, systems programming, systems analysis and design, data communication software, or database design and programming, including one year in a project leader capacity or as a major contributor on a complex project; or (2) A four-year high school diploma or its educational equivalent and six (6) years of full-time satisfactory software experience as described in “1 above, including one year in a project leader capacity or as a major contributor on a complex project; or (3) A satisfactory combination of education and experience that is equivalent to (1) or (2) above. College education may be substituted for up to two years of the required experience in (2) above on the basis that sixty (60) semester credits from an accredited college is equated to one year of experience. A masters degree in computer science or a related computer field may be substituted for one year of the required experience in (1) or (2) above. However, all candidates must have a four year high school diploma or its educational equivalent, plus at least one (1) year of satisfactory full-time software experience in a project leader capacity or as a major contributor on a complex project. NOTE: In order to have your experience accepted as Project Leader or Major Contributor experience, you must explain in detail how your experience qualifies you as a project leader or as a major contributor. Experience in computer operations, technical support, quality assurance (QA), hardware installation, help desk, or as an end user will not be accepted for meeting the minimum qualification requirements. Special Note To be eligible for placement in Assignment Level IV, in addition to the Qualification Requirements stated above, individuals must have one year of satisfactory experience in a project leader capacity or as a major contributor on a complex project in data administration, database management systems, operating systems, data communications systems, capacity planning, and/or on-line applications programming.
+ Strong proficiency in JavaScript and ReactJS framework or other front-end development frameworks is required. Candidate will also need to possess advanced knowledge of web-related technologies such as HTM5L, CSS, and the DOM and have utilized UI libraries such as Prime React, Bootstrap, Ant Design or Material UI to build applications that works on multiple form factors. Experience building and consuming RESTful APIs is also required. Knowledge of modern version control systems (GitHub) and automated build workflows/pipelines for web applications. The individual will thrive in a fast-paced agile environment founded on strong analytical and communication skills. Member should be enthusiastic and open for learning new skills and keen on understanding new technologies. Knowledge of Java, PL/SQL and Oracle suite of software/cloud and backend development a plus. Knowledge of modern authentication protocols and mechanisms such as JWT, OAuth2, etc.
+
+ We offer great benefits and programs! -Health Benefits at no or low cost with an array of health plans -Defined Pension Plans -401(k) and 457(k) Retirement Savings Programs -Dental and Vision Coverage -Prescription Drug Program -Flexible Spending Program -Paid Holidays and Generous Annual Leave -Training and Professional Development -Opportunity for Scholarship -College Savings Program -Commuter Benefits -Employee Assistance Programs -Workplace Wellness Programs -Student Loan Forgiveness* -Municipal Credit Union NOTE: This position may be eligible for remote work up to 2 days per weeks, pursuant to the Remote Work Pilot Program agreed between the City and DC37. **Appointments are subject to Office of Management and Budget (OMB) approval.
+
+ """

+ import re
+
+ work_exps_str = list_dict_to_str_parser(json_response.work_experience)
+ education_str = list_dict_to_str_parser(json_response.education)
+ clean_text = re.sub("[^a-zA-Z\s]","",f"{json_response.applicant_summary}\n\n {work_exps_str}\n\n {education_str}")
+
+ entity_resume = EntityManager(clean_text.lower()).extract_attrs()
+ entity_jd = EntityManager(input_JD.lower()).extract_attrs()
+
+ job_keywords = set(entity_jd.entities)
+ resume_keywords = set(entity_resume.entities)
+ jaccard_similarity = len(job_keywords.intersection(resume_keywords)) / len(job_keywords.union(resume_keywords))

[output of the final cell, truncated in this view: the set of keywords extracted from the job description, e.g. 'semester', 'framework', 'software', 'javascript', 'database', 'project', ...]
467
+ "{'scientist', 'framework', 'django', 'tb', 'source', 'implementation', 'crowdsourcing', 'glacier', 'kubernetes', 'process', 'products', 'pipelines', 'volunteers', 'automation', 'data', 'engineer', 'component', 'generator', 'resource', 'software', 'seasons', 'labs', 'apis', 'years', 'migration', 'science', 'engine', 'vic', 'getaka', 'clients', 'state', 'bot', 'bachelors', 'services', 'redshift', 'product', 'rust', 'files', 'need', 'microservices', 'translation', 'documentation', 'crm', 'information', 'step', 'workers', 'developers', 'network', 'spark', 'stakeholders', 'dataoracle', 'customization', 'demand', 'intern', 'patterns', 'language', 'research', 'schedules', 'company', 'vuejs', 'cloud', 'transaction', 'views', 'price', 'awareness', 'industry', 'occasions', 'typescript', 'b', 'helpers', 'commodities', 'tests', 'march', 'builder', 'red', 'pub', 'cross', 'october', 'pods', 'load', 'jobs', 'rspca', 'philippines', 'september', 'fuels', 'action', 'antares', 'manila', 'rule', 'tasks', 'response', 'customers', 'volunteering', 'nub', 'rds', 'thirdparty', 'practices', 'operations', 'tolerance', 'batch', 'glue', 'sources', 'etl', 'functions', 'govt', 'university', 'energy', 'workflows', 'april', 'victoria', 'climate', 'terraform', 'orchestrator', 'certificate', 'use', 'august', 'eks', 'pfizer', 'maintenance', 'site', 'computer', 'rpa', 'operation', 'experience', 'aws', 'dtn', 'lambda', 'agriculture', 'australia', 'australian', 'companies'}\n",
468
+ "0.043010752688172046\n"
469
+ ]
470
+ }
471
+ ],
472
+ "source": [
473
+ "print(job_keywords)\n",
474
+ "print(resume_keywords)\n",
475
+ "print(jaccard_similarity)"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "code",
480
+ "execution_count": null,
481
+ "metadata": {},
482
+ "outputs": [],
483
+ "source": [
484
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
485
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
486
+ "\n",
487
+ "vectorizer = CountVectorizer()\n",
488
+ "X = vectorizer.fit_transform([clean_text, input_JD])\n",
489
+ "cosine_sim = cosine_similarity(X)"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": null,
495
+ "metadata": {},
496
+ "outputs": [],
497
+ "source": [
498
+ "cosine_sim"
499
+ ]
500
+ }
501
+ ],
502
+ "metadata": {
503
+ "kernelspec": {
504
+ "display_name": "Python 3",
505
+ "language": "python",
506
+ "name": "python3"
507
+ },
508
+ "language_info": {
509
+ "codemirror_mode": {
510
+ "name": "ipython",
511
+ "version": 3
512
+ },
513
+ "file_extension": ".py",
514
+ "mimetype": "text/x-python",
515
+ "name": "python",
516
+ "nbconvert_exporter": "python",
517
+ "pygments_lexer": "ipython3",
518
+ "version": "3.11.5"
519
+ },
520
+ "orig_nbformat": 4
521
+ },
522
+ "nbformat": 4,
523
+ "nbformat_minor": 2
524
+ }
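
The notebook cells above score resume/JD overlap in two ways: a Jaccard similarity over extracted keyword sets, and a cosine similarity over raw term counts from `CountVectorizer`. The sketch below is a minimal, standalone illustration of both measures; it substitutes a plain regex tokenizer for the repo's `EntityManager` extractor, and the two input strings are made-up placeholders, not the notebook's actual resume or job description.

```python
# Minimal sketch of the two similarity measures used in the notebook cells above.
# NOTE: a plain regex tokenizer stands in for the repo's EntityManager, and the
# two strings below are placeholder text, not the real resume / job description.
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def keywords(text: str) -> set:
    """Lowercased words of three or more letters stand in for extracted entities."""
    return set(re.findall(r"[a-z]{3,}", text.lower()))


resume_text = "python aws docker kubernetes terraform data pipelines etl"
jd_text = "javascript react css html restful apis docker kubernetes pipelines"

resume_kw, jd_kw = keywords(resume_text), keywords(jd_text)

# Jaccard similarity: size of the keyword intersection over the size of the union.
jaccard = len(resume_kw & jd_kw) / len(resume_kw | jd_kw)

# Cosine similarity over raw term counts for the same two documents.
X = CountVectorizer().fit_transform([resume_text, jd_text])
cosine = cosine_similarity(X)[0, 1]

print(f"jaccard={jaccard:.3f}  cosine={cosine:.3f}")
```

Both scores fall in [0, 1]; the notebook's low Jaccard value (~0.043) reflects how little vocabulary the parsed data-engineering resume shares with the front-end job posting used as input.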
utils/__init__.py ADDED
File without changes
utils/settings.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file includes all the configuration for the API
3
+ """
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+
7
+ # FastAPI application instance; configures the title, docs URL, and debug settings
8
+ app = FastAPI(
9
+ version="version 1.0.0",
10
+ title="Scraper Pipeline",
11
+ description="API helper to interact with scrapers",
12
+ debug=True,
13
+ docs_url="/",
14
+ redoc_url=None,
15
+ )
16
+
17
+ # Handles CORS
18
+ origins = ["http://0.0.0.0:8000"]
19
+
20
+ app.add_middleware(
21
+ CORSMiddleware,
22
+ allow_origins=origins,
23
+ allow_credentials=True,
24
+ allow_methods=["*"],
25
+ allow_headers=["*"],
26
+ )
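
`utils/settings.py` centralizes the FastAPI application object and its CORS middleware so other modules can import a single, pre-configured `app`. The snippet below is a hypothetical `main.py` showing one way that object might be consumed; the `/health` route and the import path are illustrative assumptions, not part of this commit.

```python
# Hypothetical main.py consuming the shared app object from utils/settings.py.
# The /health route below is illustrative only; it is not part of this commit.
from utils.settings import app


@app.get("/health")
def health() -> dict:
    """Simple liveness probe for the containerized service."""
    return {"status": "ok"}
```

Assuming the module resolves on the import path, running `uvicorn main:app --host 0.0.0.0 --port 8000 --reload` (the Dockerfile's default command) would serve the interactive docs at `/`, since the settings set `docs_url="/"` and disable ReDoc.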