{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"gpu","dataSources":[{"sourceId":10019907,"sourceType":"datasetVersion","datasetId":6169792}],"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip install rank_bm25","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-11-27T12:55:34.945215Z","iopub.execute_input":"2024-11-27T12:55:34.945467Z","iopub.status.idle":"2024-11-27T12:55:44.655417Z","shell.execute_reply.started":"2024-11-27T12:55:34.945442Z","shell.execute_reply":"2024-11-27T12:55:44.654556Z"}},"outputs":[{"name":"stdout","text":"Collecting rank_bm25\n Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)\nRequirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from rank_bm25) (1.26.4)\nDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)\nInstalling collected packages: rank_bm25\nSuccessfully installed rank_bm25-0.2.2\n","output_type":"stream"}],"execution_count":1},{"cell_type":"code","source":"!pip install sentence_transformers","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-11-27T12:55:44.657566Z","iopub.execute_input":"2024-11-27T12:55:44.657961Z","iopub.status.idle":"2024-11-27T12:55:53.264757Z","shell.execute_reply.started":"2024-11-27T12:55:44.657921Z","shell.execute_reply":"2024-11-27T12:55:53.263881Z"}},"outputs":[{"name":"stdout","text":"Collecting sentence_transformers\n Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)\nRequirement already satisfied: transformers<5.0.0,>=4.41.0 in /opt/conda/lib/python3.10/site-packages (from sentence_transformers) (4.45.1)\nRequirement 
already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from sentence_transformers) (4.66.4)\nRequirement already satisfied: torch>=1.11.0 in /opt/conda/lib/python3.10/site-packages (from sentence_transformers) (2.4.0)\nRequirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from sentence_transformers) (1.2.2)\nRequirement already satisfied: scipy in /opt/conda/lib/python3.10/site-packages (from sentence_transformers) (1.14.1)\nRequirement already satisfied: huggingface-hub>=0.20.0 in /opt/conda/lib/python3.10/site-packages (from sentence_transformers) (0.25.1)\nRequirement already satisfied: Pillow in /opt/conda/lib/python3.10/site-packages (from sentence_transformers) (10.3.0)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.20.0->sentence_transformers) (3.15.1)\nRequirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.20.0->sentence_transformers) (2024.6.1)\nRequirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.20.0->sentence_transformers) (21.3)\nRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.20.0->sentence_transformers) (6.0.2)\nRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.20.0->sentence_transformers) (2.32.3)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.20.0->sentence_transformers) (4.12.2)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (1.13.3)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages 
(from torch>=1.11.0->sentence_transformers) (3.1.4)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from transformers<5.0.0,>=4.41.0->sentence_transformers) (1.26.4)\nRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers<5.0.0,>=4.41.0->sentence_transformers) (2024.5.15)\nRequirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from transformers<5.0.0,>=4.41.0->sentence_transformers) (0.4.5)\nRequirement already satisfied: tokenizers<0.21,>=0.20 in /opt/conda/lib/python3.10/site-packages (from transformers<5.0.0,>=4.41.0->sentence_transformers) (0.20.0)\nRequirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (1.4.2)\nRequirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (3.5.0)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.9->huggingface-hub>=0.20.0->sentence_transformers) (3.1.2)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.11.0->sentence_transformers) (2.1.5)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.20.0->sentence_transformers) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.20.0->sentence_transformers) (3.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.20.0->sentence_transformers) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.20.0->sentence_transformers) (2024.8.30)\nRequirement already 
def tokenize_doc_to_str(doc: dict) -> str:
    """Flatten one document dict into a single string for embedding.

    Each key is lower-cased with underscores replaced by spaces, followed by
    a ``:`` separator and then the value. All pieces are joined with single
    spaces, e.g. ``{"Joke_Title": "hi there"}`` -> ``"joke title : hi there"``.

    Args:
        doc: Mapping of string field names to values. Non-string values are
            converted with ``str()``.

    Returns:
        The flattened text, or ``""`` for an empty dict.

    Note:
        Earlier versions emitted every value one character at a time
        (``"h i   t h e r e"``); embeddings computed from that output do not
        match the text produced here and should be regenerated.
    """
    parts = []
    for key, value in doc.items():
        parts.append(key.lower().replace("_", " "))
        parts.append(':')
        # append(), not extend(): extending a list with a string adds it one
        # character at a time, which is what produced the spaced-out output.
        parts.append(value if isinstance(value, str) else str(value))
    return ' '.join(parts)


def load_processed_docs(base_path: str) -> list:
    """Collect the contents of every ``*_processed.json`` file in ``base_path``.

    Files are read in sorted path order because ``glob.glob`` returns matches
    in arbitrary order, and the position of a document in the returned list
    is what ties it to its row in the saved embedding tensor.

    Args:
        base_path: Directory containing the ``*_processed.json`` files.

    Returns:
        One flat list built by extending with each file's parsed JSON.
        NOTE(review): presumably every file holds a JSON list of document
        dicts -- confirm against the dataset.
    """
    loaded = []
    for path in sorted(glob.glob(f"{base_path}/*_processed.json")):
        # Explicit encoding so parsing does not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as f:
            loaded.extend(json.load(f))
    return loaded


# Input directory with the processed JSON files and the output location
# for the embedding tensor.
base_path = "/kaggle/input/jokerbot-rag/Kaggle_rag_data"
sbert_embeddings_path = "/kaggle/working/sbert_embeddings.pt"

# Guarded so the helpers above can be imported (for example after exporting
# the notebook to a script) without starting the embedding run, which took
# roughly 41 minutes in the recorded execution. Inside a notebook kernel
# __name__ is "__main__", so the cell runs exactly as before and every name
# below is still defined at module level.
if __name__ == "__main__":
    docs = load_processed_docs(base_path)

    # Initialize SentenceTransformer and ensure it uses the GPU when available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    sbert = SentenceTransformer('sentence-transformers/all-distilroberta-v1', device=device)

    str_docs = [tokenize_doc_to_str(doc) for doc in docs]

    sbert_embeddings = sbert.encode(
        str_docs,
        show_progress_bar=True,
        convert_to_tensor=True,
        batch_size=256,
    )
    # Move to CPU before saving so the file can be loaded without a GPU.
    sbert_embeddings = sbert_embeddings.cpu()

    torch.save(sbert_embeddings, sbert_embeddings_path)