{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/train-stage-1"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:04.681825Z","iopub.status.busy":"2023-06-26T15:46:04.681521Z","iopub.status.idle":"2023-06-26T15:46:20.131828Z","shell.execute_reply":"2023-06-26T15:46:20.130853Z","shell.execute_reply.started":"2023-06-26T15:46:04.681800Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install pyvi rank_bm25 pandarallel gensim --q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:46:20.133869Z","iopub.status.busy":"2023-06-26T15:46:20.133505Z","iopub.status.idle":"2023-06-26T15:46:21.705024Z","shell.execute_reply":"2023-06-26T15:46:21.703777Z","shell.execute_reply.started":"2023-06-26T15:46:20.133832Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["INFO: Pandarallel will run on 10 workers.\n","INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"]}],"source":["import os\n","import re\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","import math\n","import pandas as pd\n","import string\n","from pyvi.ViTokenizer import tokenize\n","import numpy as np\n","import json, pickle\n","from rank_bm25 import BM25Okapi\n","import argparse\n","import gc\n","\n","from glob import glob \n","from nltk import word_tokenize as lib_tokenizer \n","\n","from pandarallel import pandarallel\n","from gensim.corpora import Dictionary\n","from gensim.corpora import MmCorpus\n","from gensim.models import TfidfModel, OkapiBM25Model\n","from gensim.similarities import SparseMatrixSimilarity\n","pandarallel.initialize(progress_bar=True, nb_workers=10)"]},{"cell_type":"code","execution_count":3,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.194411Z","iopub.status.busy":"2023-06-26T15:12:52.193733Z","iopub.status.idle":"2023-06-26T15:12:52.208121Z","shell.execute_reply":"2023-06-26T15:12:52.206564Z","shell.execute_reply.started":"2023-06-26T15:12:52.194376Z"},"trusted":true},"outputs":[],"source":["def get_topk(query, topk = 100):\n"," tokenized_query = query.split()\n"," tfidf_query = tfidf_model[dictionary.doc2bow(tokenized_query)]\n"," scores = bm25_index[tfidf_query]\n"," top_n = np.argsort(scores)[::-1][:topk]\n"," titles = [df_wiki.title.values[i] for i in top_n]\n"," texts = [df_wiki.text.values[i] for i in top_n]\n"," # print(titles)\n"," # print(tfidf_query, scores)\n"," return titles, texts, scores[top_n]\n","\n","def post_process(x):\n"," x = \" \".join(word_tokenize(strip_context(x))).strip()\n"," x = x.replace(\"\\n\",\" \")\n"," x = \"\".join([i for i in x if i not in string.punctuation])\n"," return x\n","\n","dict_map = dict({}) \n","def word_tokenize(text): \n"," global dict_map \n"," words = text.split() \n"," words_norm = [] \n"," for w in words: \n"," if dict_map.get(w, None) is None: \n"," dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '\"').replace(\"''\", '\"') \n"," words_norm.append(dict_map[w]) \n"," return words_norm \n"," \n","def strip_context(text): \n"," text = text.replace('\\n', ' ') \n"," text = re.sub(r'\\s+', ' ', text) \n"," text = text.strip() \n"," return text"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.210996Z","iopub.status.busy":"2023-06-26T15:12:52.210623Z","iopub.status.idle":"2023-06-26T15:12:52.227767Z","shell.execute_reply":"2023-06-26T15:12:52.226604Z","shell.execute_reply.started":"2023-06-26T15:12:52.210948Z"},"trusted":true},"outputs":[],"source":["wiki_cleaned_path = \"/kaggle/input/e2eqa-wiki-zalo-ai/processed/wikipedia_20220620_cleaned_v2.csv\"\n","test_data_path = \"/kaggle/input/e2eqa-wiki-zalo-ai/e2eqa-trainpublic_test-v1/e2eqa-train+public_test-v1/zac2022_testa_sample_submission.json\"\n","topk = 300"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:12:52.230234Z","iopub.status.busy":"2023-06-26T15:12:52.229690Z","iopub.status.idle":"2023-06-26T15:14:32.740956Z","shell.execute_reply":"2023-06-26T15:14:32.739621Z","shell.execute_reply.started":"2023-06-26T15:12:52.230185Z"},"trusted":true},"outputs":[],"source":["df_wiki = pd.read_csv(wiki_cleaned_path)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:32.743587Z","iopub.status.busy":"2023-06-26T15:14:32.743104Z","iopub.status.idle":"2023-06-26T15:14:35.140539Z","shell.execute_reply":"2023-06-26T15:14:35.139511Z","shell.execute_reply.started":"2023-06-26T15:14:32.743544Z"},"trusted":true},"outputs":[],"source":["df_wiki = df_wiki.fillna(\"NaN\")\n","if \"title\" not in df_wiki.columns:\n"," df_wiki[\"title\"] = df_wiki[\"titles=\"].fillna(\"\")"]},{"cell_type":"code","execution_count":7,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T15:14:35.143448Z","iopub.status.busy":"2023-06-26T15:14:35.142192Z","iopub.status.idle":"2023-06-26T15:14:35.178049Z","shell.execute_reply":"2023-06-26T15:14:35.176844Z","shell.execute_reply.started":"2023-06-26T15:14:35.143403Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n"," | title | \n","text | \n","bm25_text | \n","
---|---|---|---|
0 | \n","Trang Chính | \n","Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s... | \n","trang chính <templatestyles src= wiki2021 styl... | \n","
1 | \n","Internet Society | \n","Internet Society hay ISOC là một tổ chức quốc... | \n","internet society hay isoc là một tổ chức quốc ... | \n","
2 | \n","Tiếng Việt | \n","Tiếng Việt , cũng gọi là tiếng Việt Nam hay Vi... | \n","tiếng việt cũng gọi là tiếng việt nam hay việt... | \n","
3 | \n","Tiếng Việt | \n","hệ thống thanh điệu phát triển cao hơn, hệ thố... | \n","hệ thống thanh điệu phát triển cao hơn hệ thốn... | \n","
4 | \n","Tiếng Việt | \n","tiếp xúc Hán – Việt thành 2 giai đoạn chính: \\... | \n","tiếp xúc hán – việt thành 2 giai đoạn chính bu... | \n","