{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-01-12 10:07:12,956 INFO: PyTorch version 2.5.1 available.\n"
     ]
    }
   ],
   "source": [
    "import hopsworks\n",
    "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
    "from torch.utils.data import DataLoader\n",
    "from sklearn.model_selection import train_test_split\n",
    "from dotenv import load_dotenv\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-01-12 10:07:14,851 INFO: Initializing external client\n",
      "2025-01-12 10:07:14,852 INFO: Base URL: https://c.app.hopsworks.ai:443\n",
      "2025-01-12 10:07:15,245 WARNING: InsecureRequestWarning: Unverified HTTPS request is being made to host 'c.app.hopsworks.ai'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
      "\n",
      "2025-01-12 10:07:18,039 INFO: Python Engine initialized.\n",
      "\n",
      "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1158296\n"
     ]
    }
   ],
   "source": [
    "# Log in to Hopsworks and open the project's feature store.\n",
    "# load_dotenv() runs first so HOPSWORKS_API_KEY can be supplied via a .env file.\n",
    "load_dotenv()\n",
    "\n",
    "api_key = os.environ.get(\"HOPSWORKS_API_KEY\")\n",
    "project = hopsworks.login(api_key_value=api_key, project=\"orestavf\")\n",
    "fs = project.get_feature_store()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.46s) \n"
     ]
    }
   ],
   "source": [
    "# Read version 1 of the job_feedback feature group\n",
    "feedback_fg = fs.get_feature_group(version=1, name=\"job_feedback\")\n",
    "feedback_df = feedback_fg.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split into train and validation sets\n",
    "train_df, val_df = train_test_split(feedback_df,
test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare data for SentenceTransformer\n",
    "def prepare_examples(df):\n",
    "    \"\"\"Build SentenceTransformer training pairs from a feedback DataFrame.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame providing the \"resume_text\", \"job_description\" and\n",
    "            \"is_relevant\" columns.\n",
    "\n",
    "    Returns:\n",
    "        A list with one InputExample per row of df, in row order. Each example\n",
    "        pairs the resume text with the job description and carries\n",
    "        \"is_relevant\" converted to float as its label.\n",
    "    \"\"\"\n",
    "    # Walk the three columns in lockstep instead of calling df.iterrows(),\n",
    "    # which materialises a new Series object for every row.\n",
    "    return [\n",
    "        InputExample(\n",
    "            texts=[resume_text, job_description],\n",
    "            label=float(is_relevant),  # Convert to float for loss calculation\n",
    "        )\n",
    "        for resume_text, job_description, is_relevant in zip(\n",
    "            df[\"resume_text\"], df[\"job_description\"], df[\"is_relevant\"]\n",
    "        )\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_examples = prepare_examples(train_df)\n",
    "val_examples = prepare_examples(val_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2025-01-12 10:07:23,794 INFO: Use pytorch device_name: cpu\n",
      "2025-01-12 10:07:23,795 INFO: Load pretrained SentenceTransformer: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2\n"
     ]
    }
   ],
   "source": [
    "# Load pretrained SentenceTransformer\n",
    "model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define DataLoader\n",
    "train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)\n",
    "val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define loss\n",
    "train_loss = losses.CosineSimilarityLoss(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure training\n",
    "num_epochs = 3\n",
    "warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # 10% of training as warmup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "13a4c4779de349a4a93c26a2a952d713",
"version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/6 [00:00 406\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 407\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n", "File \u001b[1;32mc:\\Users\\Filip\\jobsai\\venv\\Lib\\site-packages\\requests\\models.py:1024\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1023\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[1;32m-> 1024\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n", "\u001b[1;31mHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[1;31mHfHubHTTPError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[20], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Push the model to huggingface\u001b[39;00m\n\u001b[1;32m----> 2\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mforestav/job_matching_sentence_transformer\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\Users\\Filip\\jobsai\\venv\\Lib\\site-packages\\sentence_transformers\\SentenceTransformer.py:1370\u001b[0m, in \u001b[0;36mSentenceTransformer.push_to_hub\u001b[1;34m(self, repo_id, token, private, safe_serialization, commit_message, local_model_path, exist_ok, replace_model_card, train_datasets, revision, create_pr)\u001b[0m\n\u001b[0;32m 1350\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1351\u001b[0m 
\u001b[38;5;124;03mUploads all elements of this Sentence Transformer to a new HuggingFace Hub repository.\u001b[39;00m\n\u001b[0;32m 1352\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 1367\u001b[0m \u001b[38;5;124;03m str: The url of the commit of your model in the repository on the Hugging Face Hub.\u001b[39;00m\n\u001b[0;32m 1368\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 1369\u001b[0m api \u001b[38;5;241m=\u001b[39m HfApi(token\u001b[38;5;241m=\u001b[39mtoken)\n\u001b[1;32m-> 1370\u001b[0m repo_url \u001b[38;5;241m=\u001b[39m \u001b[43mapi\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_repo\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1371\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1372\u001b[0m \u001b[43m \u001b[49m\u001b[43mprivate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprivate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1373\u001b[0m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[0;32m 1374\u001b[0m \u001b[43m \u001b[49m\u001b[43mexist_ok\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexist_ok\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mcreate_pr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 1375\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1376\u001b[0m repo_id \u001b[38;5;241m=\u001b[39m repo_url\u001b[38;5;241m.\u001b[39mrepo_id \u001b[38;5;66;03m# Update the repo_id in case the old repo_id didn't contain a user or organization\u001b[39;00m\n\u001b[0;32m 1377\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel_card_data\u001b[38;5;241m.\u001b[39mset_model_id(repo_id)\n", "File \u001b[1;32mc:\\Users\\Filip\\jobsai\\venv\\Lib\\site-packages\\huggingface_hub\\utils\\_validators.py:114\u001b[0m, in 
\u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[0;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[1;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32mc:\\Users\\Filip\\jobsai\\venv\\Lib\\site-packages\\huggingface_hub\\hf_api.py:3525\u001b[0m, in \u001b[0;36mHfApi.create_repo\u001b[1;34m(self, repo_id, token, private, repo_type, exist_ok, resource_group_id, space_sdk, space_hardware, space_storage, space_sleep_time, space_secrets, space_variables)\u001b[0m\n\u001b[0;32m 3522\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[0;32m 3524\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m-> 3525\u001b[0m \u001b[43mhf_raise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43mr\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 3526\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m 3527\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m exist_ok \u001b[38;5;129;01mand\u001b[39;00m err\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m409\u001b[39m:\n\u001b[0;32m 3528\u001b[0m \u001b[38;5;66;03m# Repo already exists and `exist_ok=True`\u001b[39;00m\n", "File \u001b[1;32mc:\\Users\\Filip\\jobsai\\venv\\Lib\\site-packages\\huggingface_hub\\utils\\_http.py:477\u001b[0m, in 
\u001b[0;36mhf_raise_for_status\u001b[1;34m(response, endpoint_name)\u001b[0m\n\u001b[0;32m 473\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, message, response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[0;32m 475\u001b[0m \u001b[38;5;66;03m# Convert `HTTPError` into a `HfHubHTTPError` to display request information\u001b[39;00m\n\u001b[0;32m 476\u001b[0m \u001b[38;5;66;03m# as well (request id and/or server error message)\u001b[39;00m\n\u001b[1;32m--> 477\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, \u001b[38;5;28mstr\u001b[39m(e), response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n", "\u001b[1;31mHfHubHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-678391c6-22b5b53a19ac4add675f0e05;3fa72b47-baef-4170-8fe3-772ad458534e)\n\nYou already created this model repo" ] } ], "source": [ "# Push the model to the Hugging Face Hub; exist_ok=True tolerates a repo that already exists\n", "model.push_to_hub(repo_id=\"forestav/job_matching_sentence_transformer\", exist_ok=True)" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 2 }