{"cells":[{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### Kaggle link: https://www.kaggle.com/code/noobhocai/train-stage-2"]},{"cell_type":"code","execution_count":1,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:15:26.396463Z","iopub.status.busy":"2023-06-26T16:15:26.396153Z","iopub.status.idle":"2023-06-26T16:15:44.091436Z","shell.execute_reply":"2023-06-26T16:15:44.090085Z","shell.execute_reply.started":"2023-06-26T16:15:26.396437Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n","\u001b[0m"]}],"source":["!pip install rank_bm25 pandarallel gensim --q"]},{"cell_type":"code","execution_count":2,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:15:46.131348Z","iopub.status.busy":"2023-06-26T16:15:46.130925Z","iopub.status.idle":"2023-06-26T16:15:47.760579Z","shell.execute_reply":"2023-06-26T16:15:47.759408Z","shell.execute_reply.started":"2023-06-26T16:15:46.131315Z"},"trusted":true},"outputs":[],"source":["import os\n","import json\n","import pandas as pd\n","import numpy as np\n","import json, pickle\n","from rank_bm25 import BM25Okapi\n","import argparse\n","import gc\n","from tqdm.auto import tqdm\n","tqdm.pandas()\n","from glob import glob \n","import re \n","from nltk import word_tokenize as lib_tokenizer \n","import string\n","from gensim.corpora import Dictionary\n","from gensim.models import TfidfModel, OkapiBM25Model\n","from gensim.similarities import 
SparseMatrixSimilarity"]},{"cell_type":"code","execution_count":4,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:16:45.111948Z","iopub.status.busy":"2023-06-26T16:16:45.111271Z","iopub.status.idle":"2023-06-26T16:16:45.117881Z","shell.execute_reply":"2023-06-26T16:16:45.116573Z","shell.execute_reply.started":"2023-06-26T16:16:45.111915Z"},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["INFO: Pandarallel will run on 10 workers.\n","INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.\n"]}],"source":["from pandarallel import pandarallel\n","\n","pandarallel.initialize(progress_bar=True, nb_workers=10)"]},{"cell_type":"code","execution_count":5,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:16:47.210933Z","iopub.status.busy":"2023-06-26T16:16:47.210499Z","iopub.status.idle":"2023-06-26T16:17:49.317252Z","shell.execute_reply":"2023-06-26T16:17:49.316132Z","shell.execute_reply.started":"2023-06-26T16:16:47.210900Z"},"trusted":true},"outputs":[],"source":["df_wiki = pd.read_json(\"/kaggle/input/e2eqa-wiki-zalo-ai/wikipedia_20220620_cleaned/wikipedia_20220620_cleaned.jsonl\", lines=True)"]},{"cell_type":"code","execution_count":6,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:18:42.926014Z","iopub.status.busy":"2023-06-26T16:18:42.925307Z","iopub.status.idle":"2023-06-26T16:18:42.961174Z","shell.execute_reply":"2023-06-26T16:18:42.959896Z","shell.execute_reply.started":"2023-06-26T16:18:42.925974Z"},"trusted":true},"outputs":[{"data":{"text/html":["
\n","\n","
\n"," \n"," \n"," | \n"," id | \n"," url | \n"," title | \n"," text | \n"," timestamp | \n"," revid | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 2 | \n"," https://vi.wikipedia.org/wiki?curid=2 | \n"," Trang Chính | \n"," Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s... | \n"," 2022-05-12 12:46:53+00:00 | \n"," 68591979 | \n","
\n"," \n"," 1 | \n"," 4 | \n"," https://vi.wikipedia.org/wiki?curid=4 | \n"," Internet Society | \n"," Internet Society\\n\\nInternet Society hay ISOC ... | \n"," 2022-01-20 07:59:10+00:00 | \n"," 67988747 | \n","
\n"," \n"," 2 | \n"," 13 | \n"," https://vi.wikipedia.org/wiki?curid=13 | \n"," Tiếng Việt | \n"," Tiếng Việt\\n\\nTiếng Việt, cũng gọi là tiếng Vi... | \n"," 2022-05-29 03:42:42+00:00 | \n"," 68660631 | \n","
\n"," \n"," 3 | \n"," 24 | \n"," https://vi.wikipedia.org/wiki?curid=24 | \n"," Ohio | \n"," Ohio\\n\\nOhio (viết tắt là OH, viết tắt cũ là O... | \n"," 2022-04-17 08:15:22+00:00 | \n"," 68482118 | \n","
\n"," \n"," 4 | \n"," 26 | \n"," https://vi.wikipedia.org/wiki?curid=26 | \n"," California | \n"," California\\n\\nCalifornia (phát âm như \"Ca-li-p... | \n"," 2022-06-16 15:27:07+00:00 | \n"," 68738039 | \n","
\n"," \n","
\n","
"],"text/plain":[" id url title \\\n","0 2 https://vi.wikipedia.org/wiki?curid=2 Trang Chính \n","1 4 https://vi.wikipedia.org/wiki?curid=4 Internet Society \n","2 13 https://vi.wikipedia.org/wiki?curid=13 Tiếng Việt \n","3 24 https://vi.wikipedia.org/wiki?curid=24 Ohio \n","4 26 https://vi.wikipedia.org/wiki?curid=26 California \n","\n"," text \\\n","0 Trang Chính\\n\\n<;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] != '(' and text[-1] == ')' and '(' in text: \n"," break \n"," if text[-1] == '\"' and text[0] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[:-1].strip() \n"," while text[0] in '.,/><;:\\'\"[]{}+=-_)(*&^!~`': \n"," if text[0] == '\"' and text[-1] != '\"' and text.count('\"') > 1: \n"," break \n"," text = text[1:].strip() \n"," text = text.strip() \n"," return text \n"," \n",
"def strip_context(text):\n",
"    \"\"\"Collapse each run of whitespace (newlines included) in `text` to one space and trim both ends.\"\"\"\n",
"    # r'\\s+' already matches '\\n', so no separate newline replacement is needed.\n",
"    return re.sub(r'\\s+', ' ', text).strip()\n",
"\n",
"def check_(x):\n",
"    \"\"\"Return True when str(x), lower-cased, is numeric or contains 'ngày', 'tháng' or 'năm'.\"\"\"\n",
"    x = str(x).lower()\n",
"    return x.isnumeric() or any(word in x for word in (\"ngày\", \"tháng\", \"năm\"))\n",
"\n",
"def find_candidate_ids(x, raw_answer=None, already_added=None, topk=50):\n",
"    \"\"\"Retrieve candidate document indices for `x` from the global `bm25_index`.\n",
"\n",
"    The query is passed through `post_process`, lower-cased and split on\n",
"    whitespace, converted with the global `dictionary` and `tfidf_model`,\n",
"    and scored against `bm25_index`.\n",
"\n",
"    Args:\n",
"        x: Query; converted with `str`.\n",
"        raw_answer: Optional string. After stripping, if it is a key of the\n",
"            global `entity_dict`, the mapped value (with 'wiki/' removed and\n",
"            '_' replaced by ' ') is looked up in `title2idx`; a hit is appended\n",
"            when it is not already a candidate. This extra index is not\n",
"            checked against `already_added`.\n",
"        already_added: Optional collection of indices to drop from the top\n",
"            hits. None (the default) excludes nothing.\n",
"        topk: Number of highest-scoring indices taken before the exclusion.\n",
"\n",
"    Returns:\n",
"        (ids, scores): `ids` lists the candidates by descending score,\n",
"        followed by the entity-linked index if one was added; `scores` is a\n",
"        NumPy array in which scores[i] belongs to ids[i].\n",
"    \"\"\"\n",
"    # None instead of a shared mutable [] default; behaviour is the same for callers.\n",
"    excluded = () if already_added is None else already_added\n",
"    query = post_process(str(x)).lower().split()\n",
"    scores = bm25_index[tfidf_model[dictionary.doc2bow(query)]]\n",
"    top_n = [i for i in np.argsort(scores)[::-1][:topk] if i not in excluded]\n",
"    if raw_answer is not None:\n",
"        raw_answer = raw_answer.strip()\n",
"        if raw_answer in entity_dict:\n",
"            title = entity_dict[raw_answer].replace(\"wiki/\", \"\").replace(\"_\", \" \")\n",
"            extra_id = title2idx.get(title, -1)\n",
"            if extra_id != -1 and extra_id not in top_n:\n",
"                print(f\"Add extra id {extra_id} for {raw_answer}\")\n",
"                top_n.append(extra_id)\n",
"    # top_n is already duplicate-free; keep it in rank order rather than passing\n",
"    # it through set(), which would scramble the ranking.\n",
"    return top_n, np.array(scores[top_n])"
]},{"cell_type":"code","execution_count":8,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:20:18.394704Z","iopub.status.busy":"2023-06-26T16:20:18.394284Z","iopub.status.idle":"2023-06-26T16:30:31.484998Z","shell.execute_reply":"2023-06-26T16:30:31.483810Z","shell.execute_reply.started":"2023-06-26T16:20:18.394671Z"},"trusted":true},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"8cd018dfcf7e4ccc85f93f8bb319f26c","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=127347), Label(value='0 / 127347')…"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"a13f08c5d7974e1087d598ca8b488840","version_major":2,"version_minor":0},"text/plain":["VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=127347), Label(value='0 / 127347')…"]},"metadata":{},"output_type":"display_data"}],"source":[
"# Lower-case with the vectorized .str accessor, then run post_process over each value with pandarallel.\n",
"df_wiki['title_lower'] = df_wiki['title'].str.lower().parallel_apply(post_process)\n",
"df_wiki['text_lower'] = df_wiki['text'].str.lower().parallel_apply(post_process)"
]},{"cell_type":"code","execution_count":9,"metadata":{"execution":{"iopub.execute_input":"2023-06-26T16:33:42.344050Z","iopub.status.busy":"2023-06-26T16:33:42.342811Z","iopub.status.idle":"2023-06-26T16:33:42.362074Z","shell.execute_reply":"2023-06-26T16:33:42.360662Z","shell.execute_reply.started":"2023-06-26T16:33:42.344003Z"},"trusted":true},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," id | \n"," url | \n"," title | \n"," text | \n"," timestamp | \n"," revid | \n"," title_lower | \n"," text_lower | \n","
\n"," \n"," \n"," \n"," 0 | \n"," 2 | \n"," https://vi.wikipedia.org/wiki?curid=2 | \n"," Trang Chính | \n"," Trang Chính\\n\\n<templatestyles src=\"Wiki2021/s... | \n"," 2022-05-12 12:46:53+00:00 | \n"," 68591979 | \n"," trang chính | \n"," trang chính templatestyles src wiki2021stylesc... | \n","
\n"," \n"," 1 | \n"," 4 | \n"," https://vi.wikipedia.org/wiki?curid=4 | \n"," Internet Society | \n"," Internet Society\\n\\nInternet Society hay ISOC ... | \n"," 2022-01-20 07:59:10+00:00 | \n"," 67988747 | \n"," internet society | \n"," internet society internet society hay isoc là ... | \n","
\n"," \n"," 2 | \n"," 13 | \n"," https://vi.wikipedia.org/wiki?curid=13 | \n"," Tiếng Việt | \n"," Tiếng Việt\\n\\nTiếng Việt, cũng gọi là tiếng Vi... | \n"," 2022-05-29 03:42:42+00:00 | \n"," 68660631 | \n"," tiếng việt | \n"," tiếng việt tiếng việt cũng gọi là tiếng việt n... | \n","
\n"," \n"," 3 | \n"," 24 | \n"," https://vi.wikipedia.org/wiki?curid=24 | \n"," Ohio | \n"," Ohio\\n\\nOhio (viết tắt là OH, viết tắt cũ là O... | \n"," 2022-04-17 08:15:22+00:00 | \n"," 68482118 | \n"," ohio | \n"," ohio ohio viết tắt là oh viết tắt cũ là o là m... | \n","
\n"," \n"," 4 | \n"," 26 | \n"," https://vi.wikipedia.org/wiki?curid=26 | \n"," California | \n"," California\\n\\nCalifornia (phát âm như \"Ca-li-p... | \n"," 2022-06-16 15:27:07+00:00 | \n"," 68738039 | \n"," california | \n"," california california phát âm như caliphótnia ... | \n","
\n"," \n","
\n","
"],"text/plain":[" id url title \\\n","0 2 https://vi.wikipedia.org/wiki?curid=2 Trang Chính \n","1 4 https://vi.wikipedia.org/wiki?curid=4 Internet Society \n","2 13 https://vi.wikipedia.org/wiki?curid=13 Tiếng Việt \n","3 24 https://vi.wikipedia.org/wiki?curid=24 Ohio \n","4 26 https://vi.wikipedia.org/wiki?curid=26 California \n","\n"," text \\\n","0 Trang Chính\\n\\n