{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/train-stage-1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:04.681825Z","iopub.status.busy":"2023-06-26T15:46:04.681521Z","iopub.status.idle":"2023-06-26T15:46:20.131828Z","shell.execute_reply":"2023-06-26T15:46:20.130853Z","shell.execute_reply.started":"2023-06-26T15:46:04.681800Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install pyvi rank_bm25 pandarallel gensim --q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:20.133869Z","iopub.status.busy":"2023-06-26T15:46:20.133505Z","iopub.status.idle":"2023-06-26T15:46:21.705024Z","shell.execute_reply":"2023-06-26T15:46:21.703777Z","shell.execute_reply.started":"2023-06-26T15:46:20.133832Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["INFO: Pandarallel will run on 10 workers.\n","INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"]}],"source":["import os\n","import re\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","import math\n","import pandas as pd\n","import string\n","from pyvi.ViTokenizer import tokenize\n","import numpy as np\n","import json, pickle\n","from rank_bm25 import BM25Okapi\n","import argparse\n","import gc\n","\n","from glob import glob \n","from nltk import word_tokenize as lib_tokenizer \n","\n","from pandarallel import pandarallel\n","from gensim.corpora import Dictionary\n","from gensim.corpora import MmCorpus\n","from gensim.models import 
TfidfModel, OkapiBM25Model\n","from gensim.similarities import SparseMatrixSimilarity\n","pandarallel.initialize(progress_bar=True, nb_workers=10)"]},
{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.194411Z","iopub.status.busy":"2023-06-26T15:12:52.193733Z","iopub.status.idle":"2023-06-26T15:12:52.208121Z","shell.execute_reply":"2023-06-26T15:12:52.206564Z","shell.execute_reply.started":"2023-06-26T15:12:52.194376Z"},"trusted":true},"outputs":[],"source":[
"def get_topk(query, topk = 100):\n",
"    \"\"\"Return the `topk` highest-scoring rows of `df_wiki` for `query`.\n",
"\n",
"    The query is split on whitespace, converted with `dictionary.doc2bow`,\n",
"    passed through `tfidf_model` and looked up in `bm25_index`; all three are\n",
"    notebook-level globals that must exist before the first call.\n",
"\n",
"    Returns `(titles, texts, scores)`: two lists taken from `df_wiki.title` and\n",
"    `df_wiki.text` plus the matching entries of the score array, ordered from\n",
"    highest to lowest score.\n",
"    \"\"\"\n",
"    tokenized_query = query.split()\n",
"    tfidf_query = tfidf_model[dictionary.doc2bow(tokenized_query)]\n",
"    scores = bm25_index[tfidf_query]\n",
"    top_n = np.argsort(scores)[::-1][:topk]\n",
"    # Pull each column out of the frame once instead of once per hit.\n",
"    title_values = df_wiki.title.values\n",
"    text_values = df_wiki.text.values\n",
"    titles = [title_values[i] for i in top_n]\n",
"    texts = [text_values[i] for i in top_n]\n",
"    return titles, texts, scores[top_n]\n",
"\n",
"# Deletion table for string.punctuation, built once instead of on every call.\n",
"_PUNCT_TABLE = str.maketrans(\"\", \"\", string.punctuation)\n",
"\n",
"def post_process(x):\n",
"    \"\"\"Clean `x`: collapse whitespace, re-tokenize, then drop punctuation.\n",
"\n",
"    Only characters in `string.punctuation` are deleted; spaces are not touched\n",
"    by the punctuation step.\n",
"    \"\"\"\n",
"    x = \" \".join(word_tokenize(strip_context(x))).strip()\n",
"    x = x.replace(\"\\n\",\" \")\n",
"    # A single translate() call replaces the per-character membership test and join.\n",
"    return x.translate(_PUNCT_TABLE)\n",
"\n",
"dict_map = {}\n",
"def word_tokenize(text):\n",
"    \"\"\"Split `text` on whitespace and normalise every word with `lib_tokenizer`.\n",
"\n",
"    The space-joined tokens of each distinct word (with the `` and '' markers\n",
"    replaced by a plain double quote) are cached in the module-level `dict_map`.\n",
"    Returns a list with one normalised string per input word.\n",
"    \"\"\"\n",
"    words_norm = []\n",
"    for w in text.split():\n",
"        # dict_map is only mutated, never rebound, so no `global` statement is needed.\n",
"        cached = dict_map.get(w)\n",
"        if cached is None:\n",
"            cached = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"')\n",
"            dict_map[w] = cached\n",
"        words_norm.append(cached)\n",
"    return words_norm\n",
"\n",
"def strip_context(text):\n",
"    \"\"\"Collapse every whitespace run in `text` to one space and trim both ends.\"\"\"\n",
"    # `\\s+` already matches newlines, so they need no separate replacement.\n",
"    return re.sub(r'\\s+', ' ', text).strip()"]},
{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.210996Z","iopub.status.busy":"2023-06-26T15:12:52.210623Z","iopub.status.idle":"2023-06-26T15:12:52.227767Z","shell.execute_reply":"2023-06-26T15:12:52.226604Z","shell.execute_reply.started":"2023-06-26T15:12:52.210948Z"},"trusted":true},"outputs":[],"source":[
"wiki_cleaned_path = \"/kaggle/input/e2eqa-wiki-zalo-ai/processed/wikipedia_20220620_cleaned_v2.csv\"\n",
"test_data_path = \"/kaggle/input/e2eqa-wiki-zalo-ai/e2eqa-trainpublic_test-v1/e2eqa-train+public_test-v1/zac2022_testa_sample_submission.json\"\n",
"topk = 300"]},
{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.230234Z","iopub.status.busy":"2023-06-26T15:12:52.229690Z","iopub.status.idle":"2023-06-26T15:14:32.740956Z","shell.execute_reply":"2023-06-26T15:14:32.739621Z","shell.execute_reply.started":"2023-06-26T15:12:52.230185Z"},"trusted":true},"outputs":[],"source":[
"df_wiki = pd.read_csv(wiki_cleaned_path)"]},
{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:32.743587Z","iopub.status.busy":"2023-06-26T15:14:32.743104Z","iopub.status.idle":"2023-06-26T15:14:35.140539Z","shell.execute_reply":"2023-06-26T15:14:35.139511Z","shell.execute_reply.started":"2023-06-26T15:14:32.743544Z"},"trusted":true},"outputs":[],"source":[
"# Replace every missing value with the literal string \"NaN\".\n",
"df_wiki = df_wiki.fillna(\"NaN\")\n",
"if \"title\" not in df_wiki.columns:\n",
"    # NOTE(review): \"titles=\" looks like a typo for the fallback column name -- confirm\n",
"    # against the CSV header. The fillna(\"\") below is a no-op after the fill above.\n",
"    df_wiki[\"title\"] = df_wiki[\"titles=\"].fillna(\"\")"]},
{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:35.143448Z","iopub.status.busy":"2023-06-26T15:14:35.142192Z","iopub.status.idle":"2023-06-26T15:14:35.178049Z","shell.execute_reply":"2023-06-26T15:14:35.176844Z","shell.execute_reply.started":"2023-06-26T15:14:35.143403Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
titletextbm25_text
0Trang ChínhTrang Chính\\n\\n<templatestyles src=\"Wiki2021/s...trang chính <templatestyles src= wiki2021 styl...
1Internet SocietyInternet Society hay ISOC là một tổ chức quốc...internet society hay isoc là một tổ chức quốc ...
2Tiếng ViệtTiếng Việt , cũng gọi là tiếng Việt Nam hay Vi...tiếng việt cũng gọi là tiếng việt nam hay việt...
3Tiếng Việthệ thống thanh điệu phát triển cao hơn, hệ thố...hệ thống thanh điệu phát triển cao hơn hệ thốn...
4Tiếng Việttiếp xúc Hán – Việt thành 2 giai đoạn chính: \\...tiếp xúc hán – việt thành 2 giai đoạn chính bu...
\n","
"],"text/plain":[" title text \\\n","0 Trang Chính Trang Chính\\n\\n