Jayra Ortiz committed
Commit b0716cb · 1 Parent(s): 180c3a0

:star: added initial working architecture

.env ADDED
@@ -0,0 +1,4 @@
+ COMPLETIONS_MODEL=none
+ OPEN_API_KEY=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
+ AZURE_STORAGE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=scraperoutputs;AccountKey=n3bsqhhFypROsBb9huJoUkscit6XVqn7cAS3cWYaRltyONh4+2NvlOMFx6FPBKL6PGt9+aqvN7WA+AStd23gpQ==;EndpointSuffix=core.windows.net"
+ CONTAINER=identity
.gitignore ADDED
@@ -0,0 +1,37 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ # Use the official Python base image with tag/version of your choice
+ FROM python:3.9
+
+ # Set the working directory in the container
+
+
+ ENV WORKDIR=/app
+ WORKDIR ${WORKDIR}
+
+ # Copy the requirements file into the container
+ COPY requirements.txt .
+
+ # Install the Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the application code into the container
+ COPY . .
+
+ ENV PYTHONPATH "${PYTHONPATH}:/code/src"
+
+ # Define default run command
+ WORKDIR ${WORKDIR}/src
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
configs/config.yaml ADDED
@@ -0,0 +1,2 @@
+ embedding_model: text-embedding-ada-002
+ completion_model: gpt-3.5-turbo-16k
core/environments.py ADDED
@@ -0,0 +1,20 @@
+ from langchain.embeddings import OpenAIEmbeddings
+
+ import openai
+ import os
+ from pathlib import Path
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ COMPLETIONS_MODEL = "gpt-3.5-turbo-16k"
+ EMBEDDING_MODEL = "text-embedding-ada-002"
+ OPEN_API_KEY = os.getenv("OPEN_API_KEY")
+ AZURE_STORAGE_CONNECTION_STRING = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
+
+ CONTAINER = os.getenv("CONTAINER")
+ openai.api_key = OPEN_API_KEY
+ os.environ['OPENAI_API_KEY'] = openai.api_key
+ embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
+ mod_path = Path(__file__).parent.parent
+ src_path = Path(__file__).parent
core/models/__init__.py ADDED
File without changes
core/models/parser.py ADDED
@@ -0,0 +1,30 @@
+ from pydantic import BaseModel, Field
+ from typing import Optional, Union
+
+ class WorkExperience(BaseModel):
+     position: str
+     company: str
+     date: str
+     responsibilities: str
+
+ class Education(BaseModel):
+     degree: str
+     school: str
+     date: Optional[str] = None
+
+ class ApplicantResume(BaseModel):
+     applicant_name: str = None
+     phone_number: Optional[str] = None
+     email: Optional[str] = None
+     website: Optional[str] = None
+     applicant_summary: Optional[str] = None
+     work_experience: Optional[list[WorkExperience]] = None
+     skills: list[str]
+     education: Optional[list[Education]] = None
+     Publications: Optional[dict] = None
+     location: Optional[str] = None
+     languages: Optional[list[str]] = None
+
+
+
+
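Editor's note: a minimal sketch of how these models validate a parsed payload. The sample values below are hypothetical and not taken from the repository.

from core.models.parser import ApplicantResume

# A dict shaped like the JSON the completion model is asked to return.
payload = {
    "applicant_name": "Jane Doe",
    "skills": ["Python", "SQL"],
    "work_experience": [
        {
            "position": "Data Engineer",
            "company": "Acme",
            "date": "2021-2023",
            "responsibilities": "Built ETL pipelines.",
        }
    ],
}

resume = ApplicantResume(**payload)  # raises ValidationError if a required field such as skills is missing
print(resume.skills)                 # ['Python', 'SQL']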
core/models/reports.py ADDED
@@ -0,0 +1,8 @@
+ from pydantic import BaseModel, Field
+ from typing import Any
+
+ class ReportConfiguration(BaseModel):
+     entities: list[str]
+     pos_frequencies: dict[str, int]
+     keyterms: list[Any]
+     bi_grams: list[Any]
core/parser/__init__.py ADDED
File without changes
core/parser/file_parser.py ADDED
@@ -0,0 +1,41 @@
+ import pandas as pd
+ import numpy as np
+ import fitz
+ import docx
+
+ class BaseParser():
+     def __init__(self, file_name: str) -> None:
+         self.file_name = file_name
+         self.file_path = f'./{file_name}'
+         self.file_type = file_name.split('.')[-1]
+
+     def fitz_pymupdf_parser(self):
+         doc = fitz.open(self.file_path)
+         text = ""
+
+         for page in doc:
+             text += page.get_text().encode("utf-8", "ignore").decode("utf-8", "ignore")
+
+         print(text)
+         return text
+
+     def docx_parser(self):
+         docs = docx.Document(self.file_path)
+         text = ""
+
+         for paragraph in docs.paragraphs:
+             text += paragraph.text + "\n"
+
+         print(text)
+         return text
+
+     def parse_pdf(self):
+         parsed_text = ""
+         if self.file_type == "txt":
+             parsed_text = open(self.file_name, "r").read()
+         elif self.file_type == "pdf":
+             parsed_text = self.fitz_pymupdf_parser()
+         elif self.file_type == "docx":
+             parsed_text = self.docx_parser()
+
+         return parsed_text
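Editor's note: a minimal sketch of running the parser on its own; "resume.pdf" is a placeholder file name and must exist in the working directory.

from core.parser.file_parser import BaseParser

parser = BaseParser("resume.pdf")
raw_text = parser.parse_pdf()  # dispatches on the file extension: txt, pdf, or docx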
core/parser/prompt.py ADDED
@@ -0,0 +1,44 @@
+ def generate_identity_prompt():
+
+     prompt = """
+     Identity:
+     1. You are a resume parser. A parser that returns the fields you are given.
+     2. You will act as a bot that parses and analyzes the resume based on its context.
+     3. You will receive the resume in different formats. The ultimate goal is for you to parse the text as it is. No modifications, just copy and paste.
+
+     Instructions:
+     1. Your goal is to scrape the data from the resume based on its contextual order. Since resumes are unstructured data, make sure that your ordering is as follows:
+         * applicant_name = name of applicant
+         * phone_number = phone number indicated there. Should be optional
+         * email = email of the applicant. Should be optional
+         * website = (url you find in the resume if Any)
+         * applicant_summary = (their introduction to who they are if Any)
+         * work_experience (Their Job Experience over the years if Any). If there is, follow this format per work experience:
+             * position: str
+             * company: str
+             * date: str
+             * responsibilities: str
+         * skills (The skills they acquired over their tenure as a professional if Any), should be a list of strings.
+         * education (Their Educational Attainment if Any). If there is, follow this format per education attained:
+             * degree: str
+             * school: str
+             * date: Optional[str] = None
+         * publications (Their Publications if Any) should be in list[dict] form.
+         * location (the city and country where the person is located if any). Should be a string.
+         * languages (the languages spoken by the candidate if any). Should be a list of strings.
+     2. You will parse only these fields and make sure that the list above keeps the same number of items. You should not make up words beyond what is explicitly written in the resume. You will only copy it as it is.
+     3. Make sure that you only take from the context provided below, else just return null.
+     4. Make sure that your output is in a json format based on the fields provided to you.
+     """
+
+     return prompt
+
+ def generate_context_prompt(resume_raw_txt: str):
+     prompt = f"""
+     Context:
+
+     {resume_raw_txt}
+     """
+
+     return prompt
+
core/parser/task.py ADDED
@@ -0,0 +1,45 @@
+
+ from core.models.parser import ApplicantResume
+ from core.parser.file_parser import BaseParser
+ from core.parser.prompt import generate_context_prompt, generate_identity_prompt
+ from core.resources.azure_openai import generate_response
+ import json
+
+ def extractor_task(file_name: str):
+     # Parse all data in its raw format.
+     parser = BaseParser(file_name)
+     raw_txt = parser.parse_pdf()
+     messages = generate_messages(raw_txt)
+
+     # Generate the response and load it into a pydantic model that validates what GPT returned.
+     # The model is our baseline for which fields we can access from the resume: if it carries
+     # enough fields to run a context-similarity check on, we can generate a proper report.
+     # It also supports reporting, since we can raise an issue when there is not enough context
+     # to work with, i.e. the document is not a complete resume.
+     response = generate_response(messages)
+     json_response = json.loads(response)
+     model_response = ApplicantResume(**json_response)  # validation only; the raw dict is returned below
+
+     return json_response
+
+
+ def generate_messages(raw_txt: str):
+     print("raw_txt in generate_messages", raw_txt)
+     messages = []
+
+     identity_prompt = generate_identity_prompt()
+     context_prompt = generate_context_prompt(raw_txt)
+
+     messages.append({"role": "system", "content": identity_prompt})
+     messages.append({"role": "system", "content": context_prompt})
+
+     return messages
+
+ def list_dict_to_str_parser(items):
+     result = ""
+     for item in items:
+         for val in item.dict().values():
+             result += f"{val}\n"
+
+     return result
+
core/report/__init__.py ADDED
File without changes
core/resources/__init__.py ADDED
File without changes
core/resources/azure_openai.py ADDED
@@ -0,0 +1,15 @@
+ from langchain.embeddings import OpenAIEmbeddings
+ from core.environments import EMBEDDING_MODEL, COMPLETIONS_MODEL
+ import openai
+
+
+ def generate_response(messages: list):
+     chat_response = openai.ChatCompletion.create(
+         #deployment_id="chatbot",
+         model=COMPLETIONS_MODEL, messages=messages
+     )
+     print(chat_response)
+     return chat_response["choices"][0]["message"]["content"].strip(" \n")
+
+ def get_embeddings():
+     return OpenAIEmbeddings(model=EMBEDDING_MODEL)
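Editor's note: a minimal sketch of calling generate_response directly, assuming OPEN_API_KEY is set in .env so that importing core.environments configures openai.api_key; the message contents here are placeholders for the real identity and context prompts.

from core.resources.azure_openai import generate_response

messages = [
    {"role": "system", "content": "You are a resume parser."},  # identity prompt
    {"role": "system", "content": "Context: ..."},              # context prompt carrying the raw resume text
]
reply = generate_response(messages)  # returns the assistant message content as a stripped string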
core/resources/constants.py ADDED
@@ -0,0 +1,6 @@
+ def get_embedding_model():
+     return "text-embedding-ada-002"
+
+
+ def get_completion_model():
+     return "gpt-3.5-turbo-16k"
core/similarity/__init__.py ADDED
File without changes
core/similarity/context.py ADDED
File without changes
core/similarity/service.py ADDED
@@ -0,0 +1,130 @@
+ import numpy as np
+ import textacy
+ from textacy import extract
+ import spacy
+ from core.models.reports import ReportConfiguration
+ import nltk
+
+ spacy.cli.download("en_core_web_sm")
+ spacy.cli.download("en_core_web_md")
+
+ class EntityManager:
+     def __init__(self, text):
+
+         self.nlp = spacy.load('en_core_web_sm')
+         self.text = text
+         self.doc = self.nlp(text)
+
+     def count_frequency(self):
+         """
+         Count the frequency of part-of-speech tags in the input text.
+
+         Returns:
+             dict: A dictionary with POS tags as keys and their frequencies as values.
+         """
+         pos_freq = {}
+         for token in self.doc:
+             if token.pos_ in pos_freq:
+                 pos_freq[token.pos_] += 1
+             else:
+                 pos_freq[token.pos_] = 1
+         return pos_freq
+
+     def get_entity_from_txt(self):
+         pos_tags = ['NOUN', 'PROPN']
+         nouns = [str(token.text) for token in self.doc if token.pos_ in pos_tags]
+         return nouns
+
+     def extract_attrs(self) -> ReportConfiguration:
+         """
+         Extract frequency-based attributes from the parsed text.
+         """
+         key_terms = KeytermExtractor(self.text)
+
+         return ReportConfiguration(
+             entities=self.get_entity_from_txt(),
+             pos_frequencies=self.count_frequency(),
+             keyterms=key_terms.get_keyterms_based_on_sgrank(),
+             bi_grams=key_terms.bi_gramchunker()
+         )
+
+
+ class KeytermExtractor:
+     """
+     A class for extracting keyterms from a given text using various algorithms.
+     """
+
+     def __init__(self, raw_text: str, top_n_values: int = 20):
+         """
+         Initialize the KeytermExtractor object.
+
+         Args:
+             raw_text (str): The raw input text.
+             top_n_values (int): The number of top keyterms to extract.
+         """
+         self.raw_text = raw_text
+         self.text_doc = textacy.make_spacy_doc(
+             self.raw_text, lang="en_core_web_md")
+         self.top_n_values = top_n_values
+
+     def get_keyterms_based_on_textrank(self):
+         """
+         Extract keyterms using the TextRank algorithm.
+
+         Returns:
+             List[str]: A list of top keyterms based on TextRank.
+         """
+         return list(extract.keyterms.textrank(self.text_doc, normalize="lemma",
+                                               topn=self.top_n_values))
+
+     def get_keyterms_based_on_sgrank(self):
+         """
+         Extract keyterms using the SGRank algorithm.
+
+         Returns:
+             List[str]: A list of top keyterms based on SGRank.
+         """
+         return list(extract.keyterms.sgrank(self.text_doc, normalize="lemma",
+                                             topn=self.top_n_values))
+
+     def bi_gramchunker(self):
+         """
+         Chunk the text into bigrams.
+
+         Returns:
+             List[str]: A list of bigrams.
+         """
+         return list(textacy.extract.basics.ngrams(self.text_doc, n=2, filter_stops=True,
+                                                   filter_nums=True, filter_punct=True))
+
+     def tri_gramchunker(self):
+         """
+         Chunk the text into trigrams.
+
+         Returns:
+             List[str]: A list of trigrams.
+         """
+         return list(textacy.extract.basics.ngrams(self.text_doc, n=3, filter_stops=True,
+                                                   filter_nums=True, filter_punct=True))
+
+
+ def create_annotated_text(input_string: str, word_list: list[str], annotation: str, color_code: str):
+     # Tokenize the input string
+     tokens = nltk.word_tokenize(input_string)
+
+     # Convert the list to a set for quick lookups
+     word_set = set(word_list)
+
+     # Initialize an empty list to hold the annotated text
+     annotated_text = []
+
+     for token in tokens:
+         # Check if the token is in the set
+         if token in word_set:
+             # If it is, append a tuple with the token, annotation, and color code
+             annotated_text.append((token, annotation, color_code))
+         else:
+             # If it's not, just append the token as a string
+             annotated_text.append(token)
+
+     return annotated_text
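Editor's note: a minimal sketch of how EntityManager is combined with the parsed resume to score a job description, following the flow in test.ipynb; resume_text and jd_text below are placeholders for the real inputs.

from core.similarity.service import EntityManager

resume_text = "data engineer with experience in python, spark and aws ..."  # cleaned resume text
jd_text = "looking for a software engineer with python and aws skills ..."  # job description text

resume_attrs = EntityManager(resume_text.lower()).extract_attrs()
jd_attrs = EntityManager(jd_text.lower()).extract_attrs()

resume_keywords = set(resume_attrs.entities)
job_keywords = set(jd_attrs.entities)

# Jaccard similarity over the extracted noun/proper-noun keywords.
jaccard_similarity = len(job_keywords & resume_keywords) / len(job_keywords | resume_keywords)
print(jaccard_similarity)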
cv_job_maching.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eff3c0940574b067c7411dda40ae3ce67f905b3ce2d24909d89b7023465e6c5a
+ size 4595256
data/sample_linkedin_1.pdf ADDED
Binary file (41 kB).
 
data/sample_linkedin_2.pdf ADDED
Binary file (42.9 kB).
 
main.py ADDED
@@ -0,0 +1,44 @@
+ import uvicorn
+
+ from utils.settings import app
+ from fastapi import UploadFile
+
+ import pandas as pd
+ import aiofiles
+
+ from starlette import status
+ from starlette.responses import JSONResponse
+ from core.parser.file_parser import BaseParser
+ from core.parser.task import extractor_task, list_dict_to_str_parser
+ from core.similarity.service import EntityManager
+
+
+
+ @app.post("/api/parse-file")
+ async def upload_config_file_to_s3(
+     file: UploadFile,
+     checkpoint_name: str,
+     scraper: str = "base_parser",
+     checkpoint_layer: str = "tier_0",
+ ):
+     try:
+         file_path = f'./{file.filename}'
+         async with aiofiles.open(file_path, 'wb') as out_file:
+             content = await file.read()  # async read
+             await out_file.write(content)
+
+         json_response = extractor_task(file_path)
+
+         return JSONResponse(
+             status_code=status.HTTP_200_OK,
+             content={"code": 200, "data": json_response},
+         )
+     except Exception as e:
+         return JSONResponse(
+             status_code=status.HTTP_400_BAD_REQUEST,
+             content={"code": 400, "message": f"{e}"},
+         )
+
+
+ if __name__ == "__main__":
+     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
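Editor's note: a minimal sketch of calling the endpoint once the app is running on port 8000. It assumes the requests package is available (it is not listed in requirements.txt); the file name and checkpoint_name value are placeholders.

import requests

with open("resume.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/api/parse-file",
        params={"checkpoint_name": "demo"},  # scraper and checkpoint_layer fall back to their defaults
        files={"file": ("resume.pdf", f, "application/pdf")},
    )
print(resp.json())  # {"code": 200, "data": {...parsed resume fields...}} on success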
observation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,19 @@
+ openai
+ scipy
+ numpy
+ pandas==1.3.5
+ chromadb==0.3.26
+ langchain==0.0.228
+ tiktoken
+ flask
+ redis
+ flask-redis
+ plotly==5.16.1
+ scikit-learn
+ matplotlib
+ pymupdf
+ python-docx
+ gensim
+ nltk
+ textacy
+ aiofiles
test.ipynb ADDED
@@ -0,0 +1,524 @@
The notebook exercises the pipeline end to end: it installs the requirements, downloads the spaCy models, parses a sample resume with extractor_task, and scores the result against a job description. The cell sources are reproduced below; the bulky captured outputs (pip/spaCy download logs, the raw resume text echoed twice by the parser's print statements, and the full ChatCompletion payload) are condensed into bracketed summaries.

+ !pip3 install -r requirements.txt

+ from core.parser.file_parser import BaseParser
+ from core.parser.task import extractor_task, list_dict_to_str_parser
+ from core.similarity.service import EntityManager
+
+ json_response = extractor_task("Ortiz_02-04-2023resume-dtn.pdf")

[output: download/install logs for en_core_web_sm 3.7.0 and en_core_web_md 3.7.0; the raw resume text printed by the parser and again by generate_messages; then the gpt-3.5-turbo-16k-0613 ChatCompletion response whose content is the parsed resume JSON (applicant_name "Jayra Gaile Ortiz", four work_experience entries at DTN, Getaka Labs, and Antares Software, skills, education, publications, location "Manila, Philippines"); usage: 1651 prompt tokens, 1044 completion tokens, 2695 total]

+ json_response.dict()

[output: the parsed resume as a dict — applicant_name, phone_number, email, website, applicant_summary, the four work_experience entries, a skills list (Rust, Python, Typescript/JavaScript, PSQL, Redshift, Spark, DBT, HDFS, AWS services, Docker, Kubernetes(k8s), CI/CD GitLab, Git, Shell, Terraform, Datadog, Redis, Celery, Tableau, Plotly, Kimball(Data Modelling), Star Schema), one education entry, Publications=None, location 'Manila, Philippines', and an empty languages list]

+ input_JD = """
+ COMPUTER SPECIALIST (SOFTWARE)
+ 9 Metrotech Center, Brooklyn N
+
+ Job Category: Technology, Data & Innovation
+
+ (1) A baccalaureate degree from an accredited college, including or supplemented by twenty-four (24) semester credits in computer science or a related computer field and two (2) years of satisfactory full-time software experience in designing, programming, debugging, maintaining, implementing, and enhancing computer software applications, systems programming, systems analysis and design, data communication software, or database design and programming, including one year in a project leader capacity or as a major contributor on a complex project; or (2) A four-year high school diploma or its educational equivalent and six (6) years of full-time satisfactory software experience as described in “1 above, including one year in a project leader capacity or as a major contributor on a complex project; or (3) A satisfactory combination of education and experience that is equivalent to (1) or (2) above. College education may be substituted for up to two years of the required experience in (2) above on the basis that sixty (60) semester credits from an accredited college is equated to one year of experience. A masters degree in computer science or a related computer field may be substituted for one year of the required experience in (1) or (2) above. However, all candidates must have a four year high school diploma or its educational equivalent, plus at least one (1) year of satisfactory full-time software experience in a project leader capacity or as a major contributor on a complex project. NOTE: In order to have your experience accepted as Project Leader or Major Contributor experience, you must explain in detail how your experience qualifies you as a project leader or as a major contributor. Experience in computer operations, technical support, quality assurance (QA), hardware installation, help desk, or as an end user will not be accepted for meeting the minimum qualification requirements. Special Note To be eligible for placement in Assignment Level IV, in addition to the Qualification Requirements stated above, individuals must have one year of satisfactory experience in a project leader capacity or as a major contributor on a complex project in data administration, database management systems, operating systems, data communications systems, capacity planning, and/or on-line applications programming.
+ Strong proficiency in JavaScript and ReactJS framework or other front-end development frameworks is required. Candidate will also need to possess advanced knowledge of web-related technologies such as HTM5L, CSS, and the DOM and have utilized UI libraries such as Prime React, Bootstrap, Ant Design or Material UI to build applications that works on multiple form factors. Experience building and consuming RESTful APIs is also required. Knowledge of modern version control systems (GitHub) and automated build workflows/pipelines for web applications. The individual will thrive in a fast-paced agile environment founded on strong analytical and communication skills. Member should be enthusiastic and open for learning new skills and keen on understanding new technologies. Knowledge of Java, PL/SQL and Oracle suite of software/cloud and backend development a plus. Knowledge of modern authentication protocols and mechanisms such as JWT, OAuth2, etc.
+
+ We offer great benefits and programs! -Health Benefits at no or low cost with an array of health plans -Defined Pension Plans -401(k) and 457(k) Retirement Savings Programs -Dental and Vision Coverage -Prescription Drug Program -Flexible Spending Program -Paid Holidays and Generous Annual Leave -Training and Professional Development -Opportunity for Scholarship -College Savings Program -Commuter Benefits -Employee Assistance Programs -Workplace Wellness Programs -Student Loan Forgiveness* -Municipal Credit Union NOTE: This position may be eligible for remote work up to 2 days per weeks, pursuant to the Remote Work Pilot Program agreed between the City and DC37. **Appointments are subject to Office of Management and Budget (OMB) approval.
+
+ """

+ import re
+
+ work_exps_str = list_dict_to_str_parser(json_response.work_experience)
+ education_str = list_dict_to_str_parser(json_response.education)
+ clean_text = re.sub("[^a-zA-Z\s]","",f"{json_response.applicant_summary}\n\n {work_exps_str}\n\n {education_str}")
+
+ entity_resume = EntityManager(clean_text.lower()).extract_attrs()
+ entity_jd = EntityManager(input_JD.lower()).extract_attrs()
+
+ job_keywords = set(entity_jd.entities)
+ resume_keywords = set(entity_resume.entities)
+ jaccard_similarity = len(job_keywords.intersection(resume_keywords)) / len(job_keywords.union(resume_keywords))

[output of the final cell, truncated in this view: the set of keywords extracted from the job description, e.g. 'semester', 'framework', 'software', 'javascript', 'database', 'project', ...]
467
+ "{'scientist', 'framework', 'django', 'tb', 'source', 'implementation', 'crowdsourcing', 'glacier', 'kubernetes', 'process', 'products', 'pipelines', 'volunteers', 'automation', 'data', 'engineer', 'component', 'generator', 'resource', 'software', 'seasons', 'labs', 'apis', 'years', 'migration', 'science', 'engine', 'vic', 'getaka', 'clients', 'state', 'bot', 'bachelors', 'services', 'redshift', 'product', 'rust', 'files', 'need', 'microservices', 'translation', 'documentation', 'crm', 'information', 'step', 'workers', 'developers', 'network', 'spark', 'stakeholders', 'dataoracle', 'customization', 'demand', 'intern', 'patterns', 'language', 'research', 'schedules', 'company', 'vuejs', 'cloud', 'transaction', 'views', 'price', 'awareness', 'industry', 'occasions', 'typescript', 'b', 'helpers', 'commodities', 'tests', 'march', 'builder', 'red', 'pub', 'cross', 'october', 'pods', 'load', 'jobs', 'rspca', 'philippines', 'september', 'fuels', 'action', 'antares', 'manila', 'rule', 'tasks', 'response', 'customers', 'volunteering', 'nub', 'rds', 'thirdparty', 'practices', 'operations', 'tolerance', 'batch', 'glue', 'sources', 'etl', 'functions', 'govt', 'university', 'energy', 'workflows', 'april', 'victoria', 'climate', 'terraform', 'orchestrator', 'certificate', 'use', 'august', 'eks', 'pfizer', 'maintenance', 'site', 'computer', 'rpa', 'operation', 'experience', 'aws', 'dtn', 'lambda', 'agriculture', 'australia', 'australian', 'companies'}\n",
468
+ "0.043010752688172046\n"
469
+ ]
470
+ }
471
+ ],
472
+ "source": [
473
+ "print(job_keywords)\n",
474
+ "print(resume_keywords)\n",
475
+ "print(jaccard_similarity)"
476
+ ]
477
+ },
478
+ {
479
+ "cell_type": "code",
480
+ "execution_count": null,
481
+ "metadata": {},
482
+ "outputs": [],
483
+ "source": [
484
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
485
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
486
+ "\n",
487
+ "vectorizer = CountVectorizer()\n",
488
+ "X = vectorizer.fit_transform([clean_text, input_JD])\n",
489
+ "cosine_sim = cosine_similarity(X)"
490
+ ]
491
+ },
492
+ {
493
+ "cell_type": "code",
494
+ "execution_count": null,
495
+ "metadata": {},
496
+ "outputs": [],
497
+ "source": [
498
+ "cosine_sim"
499
+ ]
500
+ }
501
+ ],
502
+ "metadata": {
503
+ "kernelspec": {
504
+ "display_name": "Python 3",
505
+ "language": "python",
506
+ "name": "python3"
507
+ },
508
+ "language_info": {
509
+ "codemirror_mode": {
510
+ "name": "ipython",
511
+ "version": 3
512
+ },
513
+ "file_extension": ".py",
514
+ "mimetype": "text/x-python",
515
+ "name": "python",
516
+ "nbconvert_exporter": "python",
517
+ "pygments_lexer": "ipython3",
518
+ "version": "3.11.5"
519
+ },
520
+ "orig_nbformat": 4
521
+ },
522
+ "nbformat": 4,
523
+ "nbformat_minor": 2
524
+ }
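
The notebook cells above score resume/JD overlap in two ways: a Jaccard similarity over extracted keyword sets, and a cosine similarity over raw term counts from `CountVectorizer`. The sketch below is a minimal, standalone illustration of both measures; it substitutes a plain regex tokenizer for the repo's `EntityManager` extractor, and the two input strings are made-up placeholders, not the notebook's actual resume or job description.

```python
# Minimal sketch of the two similarity measures used in the notebook cells above.
# NOTE: a plain regex tokenizer stands in for the repo's EntityManager, and the
# two strings below are placeholder text, not the real resume / job description.
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def keywords(text: str) -> set:
    """Lowercased words of three or more letters stand in for extracted entities."""
    return set(re.findall(r"[a-z]{3,}", text.lower()))


resume_text = "python aws docker kubernetes terraform data pipelines etl"
jd_text = "javascript react css html restful apis docker kubernetes pipelines"

resume_kw, jd_kw = keywords(resume_text), keywords(jd_text)

# Jaccard similarity: size of the keyword intersection over the size of the union.
jaccard = len(resume_kw & jd_kw) / len(resume_kw | jd_kw)

# Cosine similarity over raw term counts for the same two documents.
X = CountVectorizer().fit_transform([resume_text, jd_text])
cosine = cosine_similarity(X)[0, 1]

print(f"jaccard={jaccard:.3f}  cosine={cosine:.3f}")
```

Both scores fall in [0, 1]; the notebook's low Jaccard value (~0.043) reflects how little vocabulary the parsed data-engineering resume shares with the front-end job posting used as input.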
utils/__init__.py ADDED
File without changes
utils/settings.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file includes all the configuration for the API
3
+ """
4
+ from fastapi import FastAPI
5
+ from fastapi.middleware.cors import CORSMiddleware
6
+
7
+ # FastAPI application instance; configures the title, docs URL, and debug settings
8
+ app = FastAPI(
9
+ version="version 1.0.0",
10
+ title="Scraper Pipeline",
11
+ description="API helper to interact with scrapers",
12
+ debug=True,
13
+ docs_url="/",
14
+ redoc_url=None,
15
+ )
16
+
17
+ # Handles CORS
18
+ origins = ["http://0.0.0.0:8000"]
19
+
20
+ app.add_middleware(
21
+ CORSMiddleware,
22
+ allow_origins=origins,
23
+ allow_credentials=True,
24
+ allow_methods=["*"],
25
+ allow_headers=["*"],
26
+ )
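
`utils/settings.py` centralizes the FastAPI application object and its CORS middleware so other modules can import a single, pre-configured `app`. The snippet below is a hypothetical `main.py` showing one way that object might be consumed; the `/health` route and the import path are illustrative assumptions, not part of this commit.

```python
# Hypothetical main.py consuming the shared app object from utils/settings.py.
# The /health route below is illustrative only; it is not part of this commit.
from utils.settings import app


@app.get("/health")
def health() -> dict:
    """Simple liveness probe for the containerized service."""
    return {"status": "ok"}
```

Assuming the module resolves on the import path, running `uvicorn main:app --host 0.0.0.0 --port 8000 --reload` (the Dockerfile's default command) would serve the interactive docs at `/`, since the settings set `docs_url="/"` and disable ReDoc.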