{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5247856b", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-04-16T22:13:19.965100Z", "iopub.status.busy": "2023-04-16T22:13:19.964059Z", "iopub.status.idle": "2023-04-16T22:13:24.741753Z", "shell.execute_reply": "2023-04-16T22:13:24.740665Z" }, "papermill": { "duration": 4.784228, "end_time": "2023-04-16T22:13:24.744315", "exception": false, "start_time": "2023-04-16T22:13:19.960087", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import datasets\n", "from datasets import Dataset\n", "import numpy as np\n", "import json\n", "import os\n", "from transformers import AutoTokenizer, AutoModel\n", "import torch\n", "import torch.nn.functional as F\n", "from tqdm.notebook import tqdm as tqdm\n", "\n", "ARTICLES_PATH = '/kaggle/input/ysda-ml-02-05-process-json/articles.hf'\n", "OUTPUT_PATH = '/kaggle/working/embeddings.npy'" ] }, { "cell_type": "code", "execution_count": 2, "id": "e843dcbc", "metadata": { "execution": { "iopub.execute_input": "2023-04-16T22:13:24.750354Z", "iopub.status.busy": "2023-04-16T22:13:24.749123Z", "iopub.status.idle": "2023-04-16T22:13:31.724234Z", "shell.execute_reply": "2023-04-16T22:13:31.723052Z" }, "papermill": { "duration": 6.980451, "end_time": "2023-04-16T22:13:31.726848", "exception": false, "start_time": "2023-04-16T22:13:24.746397", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bf7e94fe0aba40cea4d15a6f598be1ea", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading (…)okenizer_config.json: 0%| | 0.00/350 [00:00