{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":82695,"databundleVersionId":9738540,"sourceType":"competition"},{"sourceId":202482692,"sourceType":"kernelVersion"},{"sourceId":210169931,"sourceType":"kernelVersion"},{"sourceId":212909050,"sourceType":"kernelVersion"},{"sourceId":181555,"sourceType":"modelInstanceVersion","modelInstanceId":154735,"modelId":177197},{"sourceId":185957,"sourceType":"modelInstanceVersion","modelInstanceId":158544,"modelId":180911},{"sourceId":189058,"sourceType":"modelInstanceVersion","modelInstanceId":161187,"modelId":183575},{"sourceId":192491,"sourceType":"modelInstanceVersion","modelInstanceId":164133,"modelId":186481},{"sourceId":193719,"sourceType":"modelInstanceVersion","modelInstanceId":165209,"modelId":187529},{"sourceId":174909,"sourceType":"modelInstanceVersion","isSourceIdPinned":true,"modelInstanceId":148911,"modelId":171421}],"dockerImageVersionId":30776,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"%%capture\n!pip install --no-index --find-links=/kaggle/input/reranker-scripts/packages -U transformers bitsandbytes accelerate peft","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:49.145040Z","iopub.execute_input":"2024-12-20T09:36:49.145387Z","iopub.status.idle":"2024-12-20T09:36:50.814837Z","shell.execute_reply.started":"2024-12-20T09:36:49.145354Z","shell.execute_reply":"2024-12-20T09:36:50.813733Z"}},"outputs":[],"execution_count":1},{"cell_type":"code","source":"import re\nimport pandas as pd\nimport numpy as np\n\ncomp_dir = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'\n\ntrain = pd.read_csv(f'{comp_dir}/train.csv')\ntest = pd.read_csv(f'{comp_dir}/test.csv')\nmisconceptions = pd.read_csv(f'{comp_dir}/misconception_mapping.csv')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:50.816741Z","iopub.execute_input":"2024-12-20T09:36:50.817036Z","iopub.status.idle":"2024-12-20T09:36:51.189886Z","shell.execute_reply.started":"2024-12-20T09:36:50.817006Z","shell.execute_reply":"2024-12-20T09:36:51.189168Z"}},"outputs":[],"execution_count":2},{"cell_type":"code","source":"zeros = [5,\n 7,\n 10,\n 12,\n 13,\n 17,\n 18,\n 24,\n 25,\n 30,\n 40,\n 41,\n 43,\n 46,\n 50,\n 53,\n 56,\n 59,\n 62,\n 63,\n 65,\n 67,\n 69,\n 72,\n 73,\n 75,\n 79,\n 80,\n 87,\n 90,\n 92,\n 93,\n 94,\n 96,\n 104,\n 112,\n 115,\n 116,\n 121,\n 122,\n 124,\n 125,\n 128,\n 129,\n 135,\n 139,\n 140,\n 144,\n 145,\n 147,\n 148,\n 149,\n 150,\n 155,\n 156,\n 157,\n 160,\n 163,\n 165,\n 168,\n 170,\n 173,\n 174,\n 175,\n 177,\n 178,\n 180,\n 184,\n 188,\n 192,\n 193,\n 198,\n 199,\n 200,\n 203,\n 204,\n 206,\n 208,\n 209,\n 214,\n 215,\n 216,\n 222,\n 225,\n 231,\n 235,\n 237,\n 238,\n 240,\n 241,\n 243,\n 246,\n 250,\n 254,\n 257,\n 258,\n 259,\n 266,\n 269,\n 274,\n 275,\n 276,\n 277,\n 286,\n 288,\n 291,\n 298,\n 299,\n 301,\n 304,\n 310,\n 311,\n 313,\n 314,\n 316,\n 319,\n 320,\n 321,\n 323,\n 324,\n 325,\n 327,\n 330,\n 333,\n 335,\n 336,\n 341,\n 342,\n 343,\n 348,\n 351,\n 354,\n 356,\n 358,\n 359,\n 360,\n 361,\n 365,\n 367,\n 368,\n 369,\n 371,\n 385,\n 386,\n 387,\n 390,\n 394,\n 395,\n 399,\n 403,\n 406,\n 407,\n 410,\n 411,\n 412,\n 413,\n 416,\n 419,\n 420,\n 425,\n 428,\n 429,\n 430,\n 431,\n 432,\n 434,\n 435,\n 437,\n 440,\n 442,\n 444,\n 448,\n 453,\n 456,\n 457,\n 459,\n 462,\n 465,\n 472,\n 473,\n 475,\n 476,\n 477,\n 479,\n 480,\n 482,\n 484,\n 485,\n 487,\n 489,\n 490,\n 494,\n 497,\n 500,\n 502,\n 504,\n 505,\n 506,\n 513,\n 514,\n 516,\n 517,\n 518,\n 522,\n 523,\n 529,\n 530,\n 534,\n 535,\n 536,\n 538,\n 541,\n 543,\n 546,\n 548,\n 552,\n 555,\n 559,\n 560,\n 561,\n 562,\n 569,\n 571,\n 574,\n 575,\n 579,\n 580,\n 582,\n 586,\n 592,\n 593,\n 595,\n 596,\n 597,\n 598,\n 605,\n 607,\n 610,\n 612,\n 613,\n 615,\n 622,\n 627,\n 632,\n 634,\n 636,\n 640,\n 645,\n 647,\n 660,\n 662,\n 665,\n 667,\n 675,\n 676,\n 677,\n 678,\n 679,\n 681,\n 682,\n 683,\n 689,\n 692,\n 693,\n 696,\n 697,\n 698,\n 700,\n 701,\n 703,\n 705,\n 707,\n 714,\n 716,\n 720,\n 721,\n 722,\n 726,\n 728,\n 731,\n 738,\n 740,\n 741,\n 744,\n 748,\n 749,\n 750,\n 752,\n 753,\n 755,\n 761,\n 763,\n 768,\n 769,\n 770,\n 771,\n 774,\n 775,\n 776,\n 777,\n 778,\n 781,\n 786,\n 787,\n 788,\n 798,\n 799,\n 802,\n 803,\n 805,\n 810,\n 817,\n 818,\n 819,\n 822,\n 824,\n 825,\n 826,\n 827,\n 830,\n 831,\n 837,\n 841,\n 844,\n 846,\n 849,\n 850,\n 853,\n 854,\n 855,\n 856,\n 857,\n 859,\n 861,\n 862,\n 865,\n 869,\n 873,\n 881,\n 883,\n 884,\n 885,\n 886,\n 887,\n 889,\n 892,\n 896,\n 897,\n 899,\n 903,\n 908,\n 913,\n 916,\n 917,\n 918,\n 919,\n 920,\n 921,\n 928,\n 929,\n 934,\n 938,\n 939,\n 943,\n 945,\n 949,\n 953,\n 956,\n 957,\n 958,\n 963,\n 966,\n 967,\n 970,\n 973,\n 979,\n 985,\n 992,\n 993,\n 995,\n 998,\n 999,\n 1000,\n 1001,\n 1002,\n 1003,\n 1005,\n 1006,\n 1010,\n 1011,\n 1014,\n 1015,\n 1019,\n 1020,\n 1021,\n 1023,\n 1027,\n 1034,\n 1037,\n 1038,\n 1039,\n 1042,\n 1045,\n 1052,\n 1053,\n 1056,\n 1059,\n 1060,\n 1061,\n 1062,\n 1064,\n 1065,\n 1067,\n 1076,\n 1077,\n 1080,\n 1081,\n 1084,\n 1085,\n 1086,\n 1087,\n 1088,\n 1090,\n 1092,\n 1094,\n 1095,\n 1096,\n 1097,\n 1101,\n 1105,\n 1106,\n 1109,\n 1111,\n 1112,\n 1117,\n 1121,\n 1122,\n 1125,\n 1127,\n 1129,\n 1131,\n 1136,\n 1137,\n 1141,\n 1143,\n 1147,\n 1149,\n 1154,\n 1159,\n 1160,\n 1161,\n 1162,\n 1170,\n 1171,\n 1174,\n 1178,\n 1182,\n 1185,\n 1189,\n 1191,\n 1194,\n 1195,\n 1197,\n 1199,\n 1200,\n 1201,\n 1211,\n 1217,\n 1219,\n 1220,\n 1221,\n 1227,\n 1228,\n 1232,\n 1236,\n 1238,\n 1239,\n 1241,\n 1242,\n 1243,\n 1245,\n 1246,\n 1247,\n 1249,\n 1253,\n 1254,\n 1267,\n 1273,\n 1275,\n 1276,\n 1279,\n 1281,\n 1284,\n 1285,\n 1286,\n 1289,\n 1296,\n 1297,\n 1298,\n 1299,\n 1300,\n 1301,\n 1305,\n 1309,\n 1314,\n 1315,\n 1317,\n 1323,\n 1325,\n 1328,\n 1330,\n 1331,\n 1335,\n 1337,\n 1339,\n 1341,\n 1342,\n 1343,\n 1345,\n 1346,\n 1347,\n 1351,\n 1352,\n 1353,\n 1355,\n 1359,\n 1366,\n 1368,\n 1369,\n 1372,\n 1375,\n 1377,\n 1378,\n 1381,\n 1382,\n 1385,\n 1390,\n 1391,\n 1395,\n 1397,\n 1401,\n 1404,\n 1405,\n 1407,\n 1409,\n 1412,\n 1413,\n 1414,\n 1423,\n 1434,\n 1438,\n 1439,\n 1441,\n 1446,\n 1448,\n 1451,\n 1454,\n 1462,\n 1463,\n 1465,\n 1466,\n 1472,\n 1474,\n 1475,\n 1476,\n 1477,\n 1478,\n 1482,\n 1485,\n 1486,\n 1489,\n 1493,\n 1496,\n 1497,\n 1498,\n 1501,\n 1502,\n 1503,\n 1504,\n 1506,\n 1512,\n 1515,\n 1518,\n 1521,\n 1531,\n 1538,\n 1544,\n 1545,\n 1546,\n 1548,\n 1551,\n 1555,\n 1562,\n 1563,\n 1564,\n 1565,\n 1567,\n 1569,\n 1570,\n 1573,\n 1574,\n 1575,\n 1576,\n 1578,\n 1579,\n 1580,\n 1581,\n 1583,\n 1584,\n 1587,\n 1589,\n 1592,\n 1594,\n 1603,\n 1607,\n 1612,\n 1615,\n 1617,\n 1618,\n 1620,\n 1625,\n 1627,\n 1629,\n 1634,\n 1638,\n 1643,\n 1647,\n 1649,\n 1650,\n 1653,\n 1654,\n 1662,\n 1665,\n 1679,\n 1681,\n 1683,\n 1684,\n 1685,\n 1688,\n 1689,\n 1692,\n 1698,\n 1699,\n 1700,\n 1709,\n 1711,\n 1712,\n 1713,\n 1719,\n 1722,\n 1723,\n 1727,\n 1728,\n 1729,\n 1732,\n 1736,\n 1739,\n 1742,\n 1745,\n 1747,\n 1750,\n 1753,\n 1754,\n 1758,\n 1762,\n 1765,\n 1768,\n 1770,\n 1772,\n 1774,\n 1777,\n 1778,\n 1780,\n 1785,\n 1789,\n 1797,\n 1798,\n 1799,\n 1801,\n 1802,\n 1804,\n 1808,\n 1812,\n 1816,\n 1817,\n 1821,\n 1822,\n 1826,\n 1827,\n 1829,\n 1830,\n 1833,\n 1838,\n 1843,\n 1844,\n 1848,\n 1853,\n 1855,\n 1856,\n 1857,\n 1859,\n 1865,\n 1867,\n 1873,\n 1874,\n 1878,\n 1879,\n 1888,\n 1890,\n 1891,\n 1892,\n 1899,\n 1901,\n 1903,\n 1904,\n 1905,\n 1910,\n 1913,\n 1919,\n 1922,\n 1926,\n 1928,\n 1930,\n 1931,\n 1933,\n 1934,\n 1935,\n 1936,\n 1938,\n 1941,\n 1943,\n 1947,\n 1949,\n 1951,\n 1952,\n 1955,\n 1961,\n 1962,\n 1964,\n 1967,\n 1969,\n 1974,\n 1975,\n 1977,\n 1979,\n 1983,\n 1991,\n 1998,\n 2000,\n 2001,\n 2004,\n 2005,\n 2008,\n 2010,\n 2013,\n 2014,\n 2015,\n 2018,\n 2019,\n 2025,\n 2028,\n 2029,\n 2033,\n 2036,\n 2040,\n 2041,\n 2052,\n 2057,\n 2058,\n 2059,\n 2060,\n 2061,\n 2062,\n 2063,\n 2064,\n 2065,\n 2067,\n 2070,\n 2074,\n 2075,\n 2076,\n 2079,\n 2086,\n 2088,\n 2089,\n 2096,\n 2097,\n 2099,\n 2106,\n 2108,\n 2115,\n 2116,\n 2118,\n 2120,\n 2129,\n 2136,\n 2141,\n 2144,\n 2146,\n 2151,\n 2153,\n 2158,\n 2164,\n 2165,\n 2167,\n 2168,\n 2169,\n 2170,\n 2171,\n 2172,\n 2174,\n 2176,\n 2177,\n 2182,\n 2183,\n 2184,\n 2186,\n 2188,\n 2194,\n 2195,\n 2196,\n 2200,\n 2201,\n 2202,\n 2205,\n 2207,\n 2211,\n 2213,\n 2216,\n 2219,\n 2222,\n 2223,\n 2224,\n 2227,\n 2229,\n 2232,\n 2233,\n 2235,\n 2236,\n 2242,\n 2243,\n 2244,\n 2246,\n 2247,\n 2249,\n 2253,\n 2254,\n 2255,\n 2257,\n 2258,\n 2259,\n 2267,\n 2268,\n 2272,\n 2274,\n 2276,\n 2277,\n 2278,\n 2281,\n 2283,\n 2287,\n 2290,\n 2293,\n 2294,\n 2295,\n 2298,\n 2299,\n 2300,\n 2304,\n 2310,\n 2313,\n 2315,\n 2322,\n 2323,\n 2324,\n 2325,\n 2328,\n 2335,\n 2337,\n 2338,\n 2339,\n 2342,\n 2347,\n 2348,\n 2358,\n 2360,\n 2366,\n 2367,\n 2369,\n 2370,\n 2372,\n 2379,\n 2381,\n 2382,\n 2383,\n 2385,\n 2387,\n 2390,\n 2391,\n 2393,\n 2394,\n 2396,\n 2400,\n 2403,\n 2404,\n 2405,\n 2406,\n 2407,\n 2409,\n 2410,\n 2411,\n 2418,\n 2419,\n 2420,\n 2421,\n 2422,\n 2429,\n 2430,\n 2431,\n 2434,\n 2439,\n 2440,\n 2441,\n 2444,\n 2448,\n 2449,\n 2451,\n 2452,\n 2455,\n 2459,\n 2460,\n 2462,\n 2463,\n 2467,\n 2468,\n 2471,\n 2477,\n 2478,\n 2482,\n 2483,\n 2486,\n 2490,\n 2495,\n 2496,\n 2497,\n 2498,\n 2502,\n 2503,\n 2506,\n 2509,\n 2513,\n 2518,\n 2521,\n 2522,\n 2523,\n 2526,\n 2527,\n 2528,\n 2529,\n 2533,\n 2534,\n 2536,\n 2538,\n 2540,\n 2541,\n 2545,\n 2548,\n 2552,\n 2553,\n 2556,\n 2557,\n 2559,\n 2560,\n 2562,\n 2564,\n 2567,\n 2568,\n 2570,\n 2571,\n 2574,\n 2575,\n 2578,\n 2580,\n 2582,\n 2584]","metadata":{"trusted":true,"scrolled":true,"jupyter":{"source_hidden":true},"_kg_hide-input":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.191003Z","iopub.execute_input":"2024-12-20T09:36:51.191268Z","iopub.status.idle":"2024-12-20T09:36:51.219924Z","shell.execute_reply.started":"2024-12-20T09:36:51.191243Z","shell.execute_reply":"2024-12-20T09:36:51.219040Z"}},"outputs":[],"execution_count":3},{"cell_type":"markdown","source":"# preprocessing","metadata":{}},{"cell_type":"code","source":"answer_cols = [\"AnswerAText\", \"AnswerBText\", \"AnswerCText\", \"AnswerDText\"]\nkeep_cols = [\"QuestionId\", \"CorrectAnswer\", \"ConstructName\", \"SubjectName\", \"QuestionText\" ]\n\ndef wide_to_long(df: pd.DataFrame) -> pd.DataFrame:\n answers_df = pd.melt(\n id_vars=keep_cols,\n frame=df[keep_cols + answer_cols],\n var_name='Answer', value_name='AnswerText'\n ).sort_values([\"QuestionId\", \"Answer\"]).reset_index(drop=True) \n return answers_df\n\ndef preprocess_text(x):\n x = re.sub(r\"http\\w+\", '', x) # Delete URL\n x = re.sub(r\"\\.+\", \".\", x) # Replace consecutive periods with a single period\n x = re.sub(r\"\\,+\", \",\", x) # Replace consecutive commas with a single comma\n x = re.sub(r\"\\\\\\\\\", r\"\\\\\", x) # Normalize multiple backslashes to double backslashes\n x = re.sub(r\"[ ]{2,}\", \" \", x) # Replace multiple spaces with a single space\n x = x.strip() # Remove empty characters at the beginning and end\n return x\n\ntask_description = 'Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.'\ndef get_text(row):\n text = f'''Instruct: {task_description}\n\nQuery:\n###Construct###:{row['ConstructName']}\n###Subject###:{row['SubjectName']}\n###Question###:{row['QuestionText']}\n###Correct Answer###:{row['CorrectAnswerText']}\n###Incorrect Answer###:{row['AnswerText']}\n###Misconception###:\n'''\n return preprocess_text(text)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.221573Z","iopub.execute_input":"2024-12-20T09:36:51.221859Z","iopub.status.idle":"2024-12-20T09:36:51.234513Z","shell.execute_reply.started":"2024-12-20T09:36:51.221834Z","shell.execute_reply":"2024-12-20T09:36:51.233784Z"}},"outputs":[],"execution_count":4},{"cell_type":"code","source":"df = wide_to_long(test)\ndf['AnswerId'] = df.Answer.str.replace('Answer', '').str.replace('Text', '')\n\nca_map_df = df[df['CorrectAnswer']==df['AnswerId']][['QuestionId', 'AnswerText']].reset_index(drop=True)\nca_map_df.columns = ['QuestionId', 'CorrectAnswerText']\ndf = pd.merge(df, ca_map_df, on='QuestionId', how='left')\n\ndf = df[df['CorrectAnswer']!=df['AnswerId']].reset_index(drop=True)\ndf['text'] = df.apply(get_text, axis=1)\ndf['QuestionId_Answer'] = df['QuestionId'].astype(str) + '_' + df['AnswerId'].astype(str)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.235313Z","iopub.execute_input":"2024-12-20T09:36:51.235540Z","iopub.status.idle":"2024-12-20T09:36:51.259820Z","shell.execute_reply.started":"2024-12-20T09:36:51.235517Z","shell.execute_reply":"2024-12-20T09:36:51.258966Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"print(df['text'][0])","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.260830Z","iopub.execute_input":"2024-12-20T09:36:51.261065Z","iopub.status.idle":"2024-12-20T09:36:51.265899Z","shell.execute_reply.started":"2024-12-20T09:36:51.261042Z","shell.execute_reply":"2024-12-20T09:36:51.265019Z"}},"outputs":[{"name":"stdout","text":"Instruct: Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.\n\nQuery:\n###Construct###:Use the order of operations to carry out calculations involving powers\n###Subject###:BIDMAS\n###Question###:\\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\n###Correct Answer###:\\( 3 \\times(2+4)-5 \\)\n###Incorrect Answer###:\\( 3 \\times 2+(4-5) \\)\n###Misconception###:\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"len(df)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.267142Z","iopub.execute_input":"2024-12-20T09:36:51.267464Z","iopub.status.idle":"2024-12-20T09:36:51.277187Z","shell.execute_reply.started":"2024-12-20T09:36:51.267428Z","shell.execute_reply":"2024-12-20T09:36:51.276171Z"}},"outputs":[{"execution_count":7,"output_type":"execute_result","data":{"text/plain":"9"},"metadata":{}}],"execution_count":7},{"cell_type":"code","source":"df.head(3)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.278299Z","iopub.execute_input":"2024-12-20T09:36:51.278606Z","iopub.status.idle":"2024-12-20T09:36:51.295275Z","shell.execute_reply.started":"2024-12-20T09:36:51.278568Z","shell.execute_reply":"2024-12-20T09:36:51.294395Z"}},"outputs":[{"execution_count":8,"output_type":"execute_result","data":{"text/plain":" QuestionId CorrectAnswer \\\n0 1869 A \n1 1869 A \n2 1869 A \n\n ConstructName SubjectName \\\n0 Use the order of operations to carry out calcu... BIDMAS \n1 Use the order of operations to carry out calcu... BIDMAS \n2 Use the order of operations to carry out calcu... BIDMAS \n\n QuestionText Answer \\\n0 \\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ... AnswerBText \n1 \\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ... AnswerCText \n2 \\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ... AnswerDText \n\n AnswerText AnswerId CorrectAnswerText \\\n0 \\( 3 \\times 2+(4-5) \\) B \\( 3 \\times(2+4)-5 \\) \n1 \\( 3 \\times(2+4-5) \\) C \\( 3 \\times(2+4)-5 \\) \n2 Does not need brackets D \\( 3 \\times(2+4)-5 \\) \n\n text QuestionId_Answer \n0 Instruct: Given a math question and a misconce... 1869_B \n1 Instruct: Given a math question and a misconce... 1869_C \n2 Instruct: Given a math question and a misconce... 1869_D ","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
QuestionIdCorrectAnswerConstructNameSubjectNameQuestionTextAnswerAnswerTextAnswerIdCorrectAnswerTexttextQuestionId_Answer
01869AUse the order of operations to carry out calcu...BIDMAS\\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ...AnswerBText\\( 3 \\times 2+(4-5) \\)B\\( 3 \\times(2+4)-5 \\)Instruct: Given a math question and a misconce...1869_B
11869AUse the order of operations to carry out calcu...BIDMAS\\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ...AnswerCText\\( 3 \\times(2+4-5) \\)C\\( 3 \\times(2+4)-5 \\)Instruct: Given a math question and a misconce...1869_C
21869AUse the order of operations to carry out calcu...BIDMAS\\[\\n3 \\times 2+4-5\\n\\]\\nWhere do the brackets ...AnswerDTextDoes not need bracketsD\\( 3 \\times(2+4)-5 \\)Instruct: Given a math question and a misconce...1869_D
\n
"},"metadata":{}}],"execution_count":8},{"cell_type":"code","source":"df.to_parquet(\"df_preprocessed.parquet\", index=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.296206Z","iopub.execute_input":"2024-12-20T09:36:51.296412Z","iopub.status.idle":"2024-12-20T09:36:51.320210Z","shell.execute_reply.started":"2024-12-20T09:36:51.296391Z","shell.execute_reply":"2024-12-20T09:36:51.319663Z"}},"outputs":[],"execution_count":9},{"cell_type":"markdown","source":"# Embedder","metadata":{}},{"cell_type":"code","source":"%%writefile run_embedder.py\n\nimport argparse\nfrom tqdm import tqdm, trange\nimport numpy as np\nimport pandas as pd\nfrom sklearn.metrics.pairwise import cosine_similarity\nimport torch\nimport torch.nn.functional as F\nfrom torch import Tensor\nfrom transformers import AutoTokenizer, Qwen2Model, BitsAndBytesConfig, set_seed\nfrom peft import LoraConfig, get_peft_model\n\n# パラメータ\nMODEL_PATH = \"/kaggle/input/qwen2.5-32b/transformers/default/1\"\nMAX_LENGTH = 512\nBATCH_SIZE = 8\nDEVICE = 'auto'\n\nset_seed(42)\n\ndef last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:\n \"\"\"\n Extract the embedding of the last valid token based on attention mask.\n \"\"\"\n if attention_mask[:, -1].all(): # Check if right padding is used\n return last_hidden_states[:, -1]\n sequence_lengths = attention_mask.sum(dim=1) - 1\n return last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]\n\n@torch.no_grad()\n@torch.amp.autocast('cuda')\ndef inference(df, model, tokenizer, query_text='query_text'):\n \"\"\"\n Perform inference to extract embeddings for a given dataframe of sentences.\n \"\"\"\n sentences = df[query_text].tolist()\n all_embeddings = [None] * len(sentences)\n\n # Sort sentences by length (descending)\n length_sorted_idx = np.argsort([-len(sen) for sen in sentences])\n sentences_sorted = [sentences[idx] for idx in length_sorted_idx]\n\n for start_idx in trange(0, len(sentences), BATCH_SIZE, desc=\"Batches\"):\n sentences_batch = sentences_sorted[start_idx: start_idx + BATCH_SIZE]\n features = tokenizer(sentences_batch, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors=\"pt\")\n features = {key: value.to(model.device) for key, value in features.items() if isinstance(value, Tensor)}\n\n outputs = model.model(**features)\n embeddings = last_token_pool(outputs.last_hidden_state, features['attention_mask'])\n embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu().numpy()\n\n for i, emb in enumerate(embeddings):\n original_idx = length_sorted_idx[start_idx + i]\n all_embeddings[original_idx] = emb\n\n return np.vstack(all_embeddings)\n\nif __name__ == \"__main__\":\n # Parse arguments\n parser = argparse.ArgumentParser(description=\"Run embedding inference.\")\n parser.add_argument(\"--df_path\", type=str, required=True)\n parser.add_argument(\"--lora_path\", type=str, required=True)\n parser.add_argument(\"--output_path\", type=str, required=True)\n args = parser.parse_args()\n\n # Load data\n df_path = args.df_path\n lora_path = args.lora_path\n output_path = args.output_path\n\n df = pd.read_parquet(df_path)\n misconceptions = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')\n\n # Load tokenizer and model\n tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)\n bnb_config = BitsAndBytesConfig(\n load_in_4bit=True,\n bnb_4bit_use_double_quant=True,\n bnb_4bit_quant_type=\"nf4\",\n bnb_4bit_compute_dtype=torch.bfloat16\n )\n model = Qwen2Model.from_pretrained(\n MODEL_PATH,\n quantization_config=bnb_config,\n device_map=DEVICE,\n )\n config = LoraConfig(\n r=64,\n lora_alpha=128,\n target_modules=[\n \"q_proj\",\n \"k_proj\",\n \"v_proj\",\n \"o_proj\",\n \"gate_proj\",\n \"up_proj\",\n \"down_proj\",\n ],\n bias=\"none\",\n lora_dropout=0.05,\n task_type=\"CAUSAL_LM\",\n )\n model = get_peft_model(model, config)\n d = torch.load(lora_path, map_location=model.device, weights_only=True)\n model.load_state_dict(d, strict=False)\n\n # Perform inference\n query_embeddings = inference(df, model, tokenizer, query_text='text')\n passage_embeddings = inference(misconceptions, model, tokenizer, query_text='MisconceptionName')\n\n # Compute similarity and save results\n similarity = cosine_similarity(query_embeddings, passage_embeddings)\n np.save(output_path, similarity)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:36:51.322886Z","iopub.execute_input":"2024-12-20T09:36:51.323104Z","iopub.status.idle":"2024-12-20T09:36:51.329503Z","shell.execute_reply.started":"2024-12-20T09:36:51.323083Z","shell.execute_reply":"2024-12-20T09:36:51.328756Z"}},"outputs":[{"name":"stdout","text":"Overwriting run_embedder.py\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"!pip install peft\n!pip install bitsandbytes\n!pip install accelerate","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:39:20.237450Z","iopub.execute_input":"2024-12-20T09:39:20.237851Z","iopub.status.idle":"2024-12-20T09:39:44.566957Z","shell.execute_reply.started":"2024-12-20T09:39:20.237810Z","shell.execute_reply":"2024-12-20T09:39:44.566020Z"}},"outputs":[{"name":"stdout","text":"Requirement already satisfied: peft in /opt/conda/lib/python3.10/site-packages (0.14.0)\nRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from peft) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from peft) (21.3)\nRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from peft) (5.9.3)\nRequirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from peft) (6.0.2)\nRequirement already satisfied: torch>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from peft) (2.4.0)\nRequirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from peft) (4.44.2)\nRequirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from peft) (4.66.4)\nRequirement already satisfied: accelerate>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from peft) (0.34.2)\nRequirement already satisfied: safetensors in /opt/conda/lib/python3.10/site-packages (from peft) (0.4.5)\nRequirement already satisfied: huggingface-hub>=0.25.0 in /opt/conda/lib/python3.10/site-packages (from peft) (0.25.0)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (3.15.1)\nRequirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (2024.6.1)\nRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (2.32.3)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.25.0->peft) (4.12.2)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->peft) (3.1.2)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (1.13.3)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft) (3.1.4)\nRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->peft) (2024.5.15)\nRequirement already satisfied: tokenizers<0.20,>=0.19 in /opt/conda/lib/python3.10/site-packages (from transformers->peft) (0.19.1)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft) (2.1.5)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (3.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.25.0->peft) (2024.8.30)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft) (1.3.0)\nRequirement already satisfied: bitsandbytes in /opt/conda/lib/python3.10/site-packages (0.45.0)\nRequirement already satisfied: torch in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (2.4.0)\nRequirement already satisfied: numpy in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (1.26.4)\nRequirement already satisfied: typing_extensions>=4.8.0 in /opt/conda/lib/python3.10/site-packages (from bitsandbytes) (4.12.2)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (3.15.1)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (1.13.3)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (3.1.4)\nRequirement already satisfied: fsspec in /opt/conda/lib/python3.10/site-packages (from torch->bitsandbytes) (2024.6.1)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch->bitsandbytes) (2.1.5)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy->torch->bitsandbytes) (1.3.0)\nRequirement already satisfied: accelerate in /opt/conda/lib/python3.10/site-packages (0.34.2)\nRequirement already satisfied: numpy<3.0.0,>=1.17 in /opt/conda/lib/python3.10/site-packages (from accelerate) (1.26.4)\nRequirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from accelerate) (21.3)\nRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate) (5.9.3)\nRequirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from accelerate) (6.0.2)\nRequirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate) (2.4.0)\nRequirement already satisfied: huggingface-hub>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from accelerate) (0.25.0)\nRequirement already satisfied: safetensors>=0.4.3 in /opt/conda/lib/python3.10/site-packages (from accelerate) (0.4.5)\nRequirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (3.15.1)\nRequirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (2024.6.1)\nRequirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\nRequirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (4.66.4)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\nRequirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging>=20.0->accelerate) (3.1.2)\nRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (1.13.3)\nRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (3.3)\nRequirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate) (3.1.4)\nRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5)\nRequirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (1.26.18)\nRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2024.8.30)\nRequirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n","output_type":"stream"}],"execution_count":21},{"cell_type":"code","source":"!python run_embedder.py \\\n --df_path df_preprocessed.parquet \\\n --lora_path /kaggle/input/2211-lora-14b/transformers/default/1 \\\n --output_path similarity1.npy","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T10:07:39.583084Z","iopub.execute_input":"2024-12-20T10:07:39.583805Z","iopub.status.idle":"2024-12-20T10:13:11.378130Z","shell.execute_reply.started":"2024-12-20T10:07:39.583767Z","shell.execute_reply":"2024-12-20T10:13:11.376855Z"}},"outputs":[{"name":"stdout","text":"Loading checkpoint shards: 100%|████████████████| 17/17 [04:54<00:00, 17.35s/it]\nTraceback (most recent call last):\n File \"/kaggle/working/run_embedder.py\", line 104, in \n d = torch.load(lora_path, map_location=model.device, weights_only=True)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 1065, in load\n with _open_file_like(f, 'rb') as opened_file:\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 468, in _open_file_like\n return _open_file(name_or_buffer, mode)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 449, in __init__\n super().__init__(open(name, mode))\nIsADirectoryError: [Errno 21] Is a directory: '/kaggle/input/2211-lora-14b/transformers/default/1'\n","output_type":"stream"}],"execution_count":45},{"cell_type":"code","source":"!python run_embedder.py \\\n --df_path df_preprocessed.parquet \\\n --lora_path /kaggle/input/embedder-lora-v6/transformers/default/1/adapter.bin \\\n --output_path similarity2.npy","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:44:20.174501Z","iopub.execute_input":"2024-12-20T09:44:20.174819Z","iopub.status.idle":"2024-12-20T09:49:00.609829Z","shell.execute_reply.started":"2024-12-20T09:44:20.174789Z","shell.execute_reply":"2024-12-20T09:49:00.608586Z"}},"outputs":[{"name":"stdout","text":"Loading checkpoint shards: 100%|████████████████| 17/17 [04:04<00:00, 14.41s/it]\nTraceback (most recent call last):\n File \"/kaggle/working/run_embedder.py\", line 104, in \n d = torch.load(lora_path, map_location=model.device, weights_only=True)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 1065, in load\n with _open_file_like(f, 'rb') as opened_file:\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 468, in _open_file_like\n return _open_file(name_or_buffer, mode)\n File \"/opt/conda/lib/python3.10/site-packages/torch/serialization.py\", line 449, in __init__\n super().__init__(open(name, mode))\nFileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/embedder-lora-v6/transformers/default/1/adapter.bin'\n","output_type":"stream"}],"execution_count":23},{"cell_type":"code","source":"sim1 = np.load('/kaggle/input/sub-embedder-reranker-ensemble-v3/similarity1.npy')\nsim2 = np.load('/kaggle/input/sub-embedder-reranker-ensemble-v3/similarity2.npy')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:00.480906Z","iopub.execute_input":"2024-12-20T09:53:00.481486Z","iopub.status.idle":"2024-12-20T09:53:00.493772Z","shell.execute_reply.started":"2024-12-20T09:53:00.481446Z","shell.execute_reply":"2024-12-20T09:53:00.492868Z"}},"outputs":[],"execution_count":28},{"cell_type":"code","source":"for i in range(sim1.shape[0]):\n sim1[i, zeros] *= 1.5\n sim2[i, zeros] *= 1.5","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:01.884017Z","iopub.execute_input":"2024-12-20T09:53:01.884376Z","iopub.status.idle":"2024-12-20T09:53:01.891969Z","shell.execute_reply.started":"2024-12-20T09:53:01.884344Z","shell.execute_reply":"2024-12-20T09:53:01.891197Z"}},"outputs":[],"execution_count":29},{"cell_type":"code","source":"top_n_indices1 = np.argsort(sim1, axis=1)[:, ::-1][:, :25]\ntop_n_indices2 = np.argsort(sim2, axis=1)[:, ::-1][:, :25]","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:02.201726Z","iopub.execute_input":"2024-12-20T09:53:02.202248Z","iopub.status.idle":"2024-12-20T09:53:02.210604Z","shell.execute_reply.started":"2024-12-20T09:53:02.202219Z","shell.execute_reply":"2024-12-20T09:53:02.209640Z"}},"outputs":[],"execution_count":30},{"cell_type":"code","source":"df = pd.read_parquet('df_preprocessed.parquet')\n\ndf['top_indices1'] = top_n_indices1.tolist()\ndf['top_indices2'] = top_n_indices2.tolist()\ndf.to_parquet('df_embedded.parquet', index=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:02.572997Z","iopub.execute_input":"2024-12-20T09:53:02.573307Z","iopub.status.idle":"2024-12-20T09:53:02.639880Z","shell.execute_reply.started":"2024-12-20T09:53:02.573277Z","shell.execute_reply":"2024-12-20T09:53:02.639199Z"}},"outputs":[],"execution_count":31},{"cell_type":"markdown","source":"# Reranker 1","metadata":{}},{"cell_type":"code","source":"import re\n\ndef get_candidates(c_indices):\n candidates = []\n\n mis_names = misconceptions[\"MisconceptionName\"].values\n for ix in c_indices:\n c_names = []\n for i, name in enumerate(mis_names[ix]):\n # A, B, C...\n c_names.append(f\"{chr(65 + i)}. {name}\")\n\n candidates.append(\"\\n\".join(c_names))\n\n return candidates\n\ndef preprocess_text(x):\n x = re.sub(r\"http\\w+\", '', x) # Delete URL\n x = re.sub(r\"\\.+\", \".\", x) # Replace consecutive periods with a single period\n x = re.sub(r\"\\,+\", \",\", x) # Replace consecutive commas with a single comma\n x = re.sub(r\"\\\\\\\\\", r\"\\\\\", x) # Normalize multiple backslashes to double backslashes\n x = re.sub(r\"[ ]{2,}\", \" \", x) # Replace multiple spaces with a single space\n x = x.strip() # Remove empty characters at the beginning and end\n return x\n\nPROMPT = \"\"\"Here is a question about {ConstructName}({SubjectName}).\nQuestion: {Question}\nCorrect Answer: {CorrectAnswer}\nIncorrect Answer: {IncorrectAnswer}\n\nYou are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.\nChoose the most appropriate letter corresponding to the misconception from the options below:\n\n{Retrival}\n\"\"\"\n\ndef preprocess_row(row):\n conversations = [\n {\n \"role\": \"user\",\n \"content\": preprocess_text(\n PROMPT.format(\n ConstructName=row[\"ConstructName\"],\n SubjectName=row[\"SubjectName\"],\n Question=row[\"QuestionText\"],\n CorrectAnswer=row[\"CorrectAnswerText\"],\n IncorrectAnswer=row[\"AnswerText\"],\n Retrival=row[\"retrieval\"]\n )\n )\n }\n ]\n return conversations","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:05.068502Z","iopub.execute_input":"2024-12-20T09:53:05.068841Z","iopub.status.idle":"2024-12-20T09:53:05.077336Z","shell.execute_reply.started":"2024-12-20T09:53:05.068813Z","shell.execute_reply":"2024-12-20T09:53:05.076413Z"}},"outputs":[],"execution_count":32},{"cell_type":"code","source":"def process_dataframe(top_indices_column, output_file, top_k):\n df = pd.read_parquet('df_embedded.parquet')\n top_indices = np.array(df[top_indices_column].tolist())[:, :top_k]\n df['retrieval'] = get_candidates(top_indices)\n df['conversations'] = df.apply(preprocess_row, axis=1)\n df['TOP_K'] = top_k\n df['id'] = range(len(df))\n df.to_parquet(output_file, index=False)\n print(df['conversations'][0][0]['content'])\n print('-'*50)\n\n# 定数の設定\nTOP_K = 25\n\n# ファイル処理の呼び出し\nprocess_dataframe('top_indices1', 'df_test1.parquet', TOP_K)\nprocess_dataframe('top_indices2', 'df_test2.parquet', TOP_K)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:31.032546Z","iopub.execute_input":"2024-12-20T09:53:31.032910Z","iopub.status.idle":"2024-12-20T09:53:31.061895Z","shell.execute_reply.started":"2024-12-20T09:53:31.032879Z","shell.execute_reply":"2024-12-20T09:53:31.061014Z"}},"outputs":[{"name":"stdout","text":"Here is a question about Use the order of operations to carry out calculations involving powers(BIDMAS).\nQuestion: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\nCorrect Answer: \\( 3 \\times(2+4)-5 \\)\nIncorrect Answer: \\( 3 \\times 2+(4-5) \\)\n\nYou are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.\nChoose the most appropriate letter corresponding to the misconception from the options below:\n\nA. Inserts brackets but not changed order of operation\nB. May have made a calculation error using the order of operations\nC. Carries out operations from left to right regardless of priority order, unless brackets are used\nD. Confuses the order of operations, believes subtraction comes before multiplication \nE. Has removed brackets but not performed the operation\nF. Has not realised that the answer may be changed by the insertion of brackets\nG. Thinks the subtraction sign means multiply\nH. Does not perform calculations in the numerator of a fraction before division by the denominator\nI. Applies BIDMAS in strict order (does not realize addition and subtraction, and multiplication and division, are of equal priority)\nJ. Uses addition instead of the associative property of multiplication\nK. Carries out operations from left to right regardless of priority order\nL. Thinks a divide and a negative sign next to each other makes a plus\nM. Believes order of operations does not affect the answer to a calculation\nN. Done a different calculation to the one given\nO. Does not include brackets when required\nP. When there's a negative sign in the question, thinks the answer must be negative\nQ. Thinks a divide and a negative sign next to each other makes a minus\nR. Thinks multiplication and addition are the same\nS. Carries out operations from right to left regardless of priority order\nT. Answers order of operations questions with brackets as if the brackets are not there\nU. Doesn't recognise commutativity of addition with negative numbers\nV. Confuses the order of operations, believes addition comes before multiplication \nW. Believes a subtraction cannot be partitioned into separate subtractions\nX. Assumes the negative sign in a power has no meaning\nY. When factorising into double brackets, finds the correct values for the non variable terms but swops the plus and minus sign\n--------------------------------------------------\nHere is a question about Use the order of operations to carry out calculations involving powers(BIDMAS).\nQuestion: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\nCorrect Answer: \\( 3 \\times(2+4)-5 \\)\nIncorrect Answer: \\( 3 \\times 2+(4-5) \\)\n\nYou are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.\nChoose the most appropriate letter corresponding to the misconception from the options below:\n\nA. Inserts brackets but not changed order of operation\nB. May have made a calculation error using the order of operations\nC. Carries out operations from left to right regardless of priority order, unless brackets are used\nD. Uses addition instead of the associative property of multiplication\nE. Confuses the order of operations, believes subtraction comes before multiplication \nF. Believes that adding a positive to a negative makes your answer more negative\nG. Has removed brackets but not performed the operation\nH. Doesn't recognise commutativity of addition with negative numbers\nI. When there's a negative sign in the question, thinks the answer must be negative\nJ. Has not realised that the answer may be changed by the insertion of brackets\nK. Done a different calculation to the one given\nL. When a subtraction of one positive number from another, results in a negative answer, they believe the answer is the sum of those 2 numbers with a negative sign put in front.\nM. Does not know the distributive property\nN. When two digits multiply to 10 or more during a multiplication problem, does not add one to the preceding digit\nO. Thinks multiplication and addition are the same\nP. Believes a subtraction cannot be partitioned into separate subtractions\nQ. When factorising into double brackets, finds the correct values for the non variable terms but swops the plus and minus sign\nR. Does not understand that adding on to a multiple can give you the next multiple\nS. Confuses the direction of vectors when adding or subtracting\nT. When adding negatives believes that they can just add their absolute values and add a negative sign to the answer\nU. Thinks the multiplication sign means to add\nV. Does not understand the question\nW. Believes that if one number in a product decreases, the answer must increase. \nX. Thinks the inverse of multiplication is addition\nY. Tries to add or subtract unlike terms\n--------------------------------------------------\n","output_type":"stream"}],"execution_count":35},{"cell_type":"code","source":"%%writefile run_reranker1.py\n\nimport argparse\nimport time\nfrom tqdm import tqdm\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nimport random\nimport numpy as np\nimport pandas as pd\nimport torch\nfrom transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed\nfrom transformers.data.data_collator import pad_without_fast_tokenizer_warning\nfrom peft import PeftModel\n\n# 固定パラメータ\nMODEL_DIR = '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'\nMAX_LENGTH = 1536\nBATCH_SIZE = 1\n\nset_seed(42)\n\ndef tokenize(tokenizer, conversations, max_length=MAX_LENGTH):\n texts = []\n for messages in conversations:\n text = tokenizer.apply_chat_template(\n messages,\n tokenize=False,\n add_generation_prompt=True\n )\n texts.append(text)\n\n tokenized = tokenizer(texts, add_special_tokens=False, max_length=max_length, truncation=True)\n input_ids = tokenized.input_ids\n attention_mask = tokenized.attention_mask\n return input_ids, attention_mask\n\n@torch.no_grad()\n@torch.amp.autocast('cuda')\ndef inference(df, model, label_idx, batch_size=BATCH_SIZE, max_length=MAX_LENGTH):\n scores = []\n\n # バッチごとに処理を行うループ\n for start_idx in tqdm(range(0, len(df), batch_size)):\n end_idx = min(start_idx + batch_size, len(df))\n tmp = df.iloc[start_idx:end_idx]\n input_ids = tmp[\"input_ids\"].to_list()\n attention_mask = tmp[\"attention_mask\"].to_list()\n \n # 入力のパディング処理\n inputs = pad_without_fast_tokenizer_warning(\n tokenizer,\n {\"input_ids\": input_ids, \"attention_mask\": attention_mask},\n padding=\"longest\",\n pad_to_multiple_of=None,\n return_tensors=\"pt\",\n )\n \n # モデル推論\n outputs = model(**inputs.to(model.device))\n proba = outputs.logits[:, -1, label_idx].softmax(-1).cpu()\n scores.extend(proba.tolist())\n \n df['score'] = scores\n return df\n\nif __name__ == \"__main__\":\n # コマンドライン引数を定義\n parser = argparse.ArgumentParser(description=\"Run reranker inference.\")\n parser.add_argument(\"--lora_dir1\", type=str, required=True)\n parser.add_argument(\"--input_path\", type=str, required=True)\n parser.add_argument(\"--output_path\", type=str, required=True)\n args = parser.parse_args()\n\n # 設定\n lora_dir1 = args.lora_dir1\n input_path = args.input_path\n output_path = args.output_path\n\n # トークナイザーとモデルの読み込み\n tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\n model = AutoModelForCausalLM.from_pretrained(\n MODEL_DIR,\n device_map='auto',\n use_cache=True,\n )\n\n # LoRAアダプターのロード\n model = PeftModel.from_pretrained(model, lora_dir1, adapter_name=\"lora1\")\n\n # データ読み込み\n df_test = pd.read_parquet(input_path)\n TOP_K = df_test['TOP_K'][0]\n label_idx = [tokenizer(f'{chr(65 + i)}', add_special_tokens=False)['input_ids'][-1] for i in range(TOP_K)]\n print(tokenizer.decode(label_idx))\n\n # データのトークン化\n data = pd.DataFrame()\n data[\"id\"] = df_test[\"id\"]\n data[\"input_ids\"], data[\"attention_mask\"] = tokenize(tokenizer, df_test[\"conversations\"])\n data[\"length\"] = data[\"input_ids\"].apply(len)\n data = data.sort_values(\"length\", ascending=False).reset_index(drop=True)\n\n # 推論と保存\n score_df = inference(data, model, label_idx).sort_values('id').reset_index(drop=True)\n score_df.to_parquet(output_path, index=False)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:32.583830Z","iopub.execute_input":"2024-12-20T09:53:32.584170Z","iopub.status.idle":"2024-12-20T09:53:32.591128Z","shell.execute_reply.started":"2024-12-20T09:53:32.584142Z","shell.execute_reply":"2024-12-20T09:53:32.590270Z"}},"outputs":[{"name":"stdout","text":"Overwriting run_reranker1.py\n","output_type":"stream"}],"execution_count":36},{"cell_type":"code","source":"!python run_reranker1.py \\\n --lora_dir1 /kaggle/input/reranker1-lora-v33/transformers/default/1 \\\n --input_path df_test1.parquet \\\n --output_path score_df1.parquet","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:33.050970Z","iopub.execute_input":"2024-12-20T09:53:33.051808Z","iopub.status.idle":"2024-12-20T09:53:42.227652Z","shell.execute_reply.started":"2024-12-20T09:53:33.051772Z","shell.execute_reply":"2024-12-20T09:53:42.226528Z"}},"outputs":[{"name":"stdout","text":"Traceback (most recent call last):\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 402, in cached_file\n resolved_file = hf_hub_download(\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py\", line 101, in inner_f\n return f(*args, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 106, in _inner_fn\n validate_repo_id(arg_value)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 154, in validate_repo_id\n raise HFValidationError(\nhuggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Use `repo_type` argument if needed.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/kaggle/working/run_reranker1.py\", line 80, in \n tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 834, in from_pretrained\n tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 666, in get_tokenizer_config\n resolved_config_file = cached_file(\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 466, in cached_file\n raise EnvironmentError(\nOSError: Incorrect path_or_model_id: '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Please provide either the path to a local folder or the repo_id of a model on the Hub.\n","output_type":"stream"}],"execution_count":37},{"cell_type":"code","source":"!python run_reranker1.py \\\n --lora_dir1 /kaggle/input/reranker1-lora-v65/transformers/default/1 \\\n --input_path df_test2.parquet \\\n --output_path score_df2.parquet","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:53:42.229720Z","iopub.execute_input":"2024-12-20T09:53:42.230036Z","iopub.status.idle":"2024-12-20T09:53:50.356546Z","shell.execute_reply.started":"2024-12-20T09:53:42.230006Z","shell.execute_reply":"2024-12-20T09:53:50.355692Z"}},"outputs":[{"name":"stdout","text":"Traceback (most recent call last):\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 402, in cached_file\n resolved_file = hf_hub_download(\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py\", line 101, in inner_f\n return f(*args, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 106, in _inner_fn\n validate_repo_id(arg_value)\n File \"/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py\", line 154, in validate_repo_id\n raise HFValidationError(\nhuggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Use `repo_type` argument if needed.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File \"/kaggle/working/run_reranker1.py\", line 80, in \n tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 834, in from_pretrained\n tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)\n File \"/opt/conda/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py\", line 666, in get_tokenizer_config\n resolved_config_file = cached_file(\n File \"/opt/conda/lib/python3.10/site-packages/transformers/utils/hub.py\", line 466, in cached_file\n raise EnvironmentError(\nOSError: Incorrect path_or_model_id: '/kaggle/input/qwen2-5-32b-instruct-bnb-4bit'. Please provide either the path to a local folder or the repo_id of a model on the Hub.\n","output_type":"stream"}],"execution_count":38},{"cell_type":"markdown","source":"# Submit","metadata":{}},{"cell_type":"code","source":"df_embedded = pd.read_parquet('df_embedded.parquet')\nscore_df1 = pd.read_parquet(\"/kaggle/input/sub-embedder-reranker-ensemble-v3/score_df1.parquet\")\nscore_df2 = pd.read_parquet(\"/kaggle/input/sub-embedder-reranker-ensemble-v3/score_df2.parquet\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:05.933245Z","iopub.execute_input":"2024-12-20T09:55:05.934133Z","iopub.status.idle":"2024-12-20T09:55:05.954696Z","shell.execute_reply.started":"2024-12-20T09:55:05.934084Z","shell.execute_reply":"2024-12-20T09:55:05.954066Z"}},"outputs":[],"execution_count":40},{"cell_type":"code","source":"# top_indices1 = np.array(df_embedded['top_indices1'].tolist())\n# top_indices2 = np.array(df_embedded['top_indices2'].tolist())\n\n# scores1 = np.array(score_df1['score'].tolist())\n# scores2 = np.array(score_df2['score'].tolist())\n\n# # 配列を結合\n# indices_combined = np.concatenate([top_indices1, top_indices2], axis=1) # 2次元で列方向に結合\n# scores_combined = np.concatenate([scores1, scores2], axis=1) # 同じく列方向に結合\n\n# # 各行でスコア降順にソート\n# sorted_indices = np.argsort(-scores_combined, axis=1) # スコアの降順ソートインデックス\n# sorted_top_indices = np.take_along_axis(indices_combined, sorted_indices, axis=1)\n\n# # 重複を削除して最終結果を取得\n# final_indices = []\n# for row in sorted_top_indices:\n# unique_row = np.unique(row, return_index=True)[1] # 重複を除外\n# final_indices.append(row[np.sort(unique_row)][:25]) # 元の順序を維持\n\n# final_indices = np.array(final_indices).tolist()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:09.262137Z","iopub.execute_input":"2024-12-20T09:55:09.262992Z","iopub.status.idle":"2024-12-20T09:55:09.267440Z","shell.execute_reply.started":"2024-12-20T09:55:09.262954Z","shell.execute_reply":"2024-12-20T09:55:09.266515Z"}},"outputs":[],"execution_count":41},{"cell_type":"code","source":"top_indices1 = np.array(df_embedded['top_indices1'].tolist())\ntop_indices2 = np.array(df_embedded['top_indices2'].tolist())\n\nscores1 = np.array(score_df1['score'].tolist())\nscores2 = np.array(score_df2['score'].tolist())\n\n# それぞれの行ごとに処理\nfinal_indices = []\nfor i in range(len(top_indices1)):\n # 現行の行のindexとスコア\n row_indices1 = top_indices1[i]\n row_scores1 = scores1[i]\n row_indices2 = top_indices2[i]\n row_scores2 = scores2[i]\n\n # インデックスとスコアをまとめるための辞書 (index -> score)\n combined_dict = {}\n\n # indices1, scores1を追加\n for idx, s in zip(row_indices1, row_scores1):\n combined_dict[idx] = s\n\n # indices2, scores2を追加。既に存在するindexなら平均化\n for idx, s in zip(row_indices2, row_scores2):\n if idx in combined_dict:\n combined_dict[idx] = (combined_dict[idx] + s) / 2.0\n else:\n combined_dict[idx] = s\n\n # `zeros` に含まれるインデックスのスコアを 1.3 倍\n for idx in zeros:\n if idx in combined_dict:\n combined_dict[idx] *= 1.3\n\n # 辞書から(インデックス, スコア)タプルのリストを作成し、スコア降順にソート\n combined_list = sorted(combined_dict.items(), key=lambda x: x[1], reverse=True)\n\n top_index = combined_list[0][0] # 가장 높은 점수의 인덱스 하나만 추출\n final_indices.append([top_index])\n","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:09.682176Z","iopub.execute_input":"2024-12-20T09:55:09.682912Z","iopub.status.idle":"2024-12-20T09:55:09.691523Z","shell.execute_reply.started":"2024-12-20T09:55:09.682883Z","shell.execute_reply":"2024-12-20T09:55:09.690819Z"}},"outputs":[],"execution_count":42},{"cell_type":"code","source":"for idx in final_indices:\n misconception_name = misconceptions.iloc[idx]['MisconceptionName']\n print(f\"Misconception Index: {idx}, Misconception: {misconception_name}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T09:55:10.219923Z","iopub.execute_input":"2024-12-20T09:55:10.220223Z","iopub.status.idle":"2024-12-20T09:55:10.231268Z","shell.execute_reply.started":"2024-12-20T09:55:10.220184Z","shell.execute_reply":"2024-12-20T09:55:10.230332Z"}},"outputs":[{"name":"stdout","text":"Misconception Index: [1005], Misconception: 1005 Carries out operations from left to right rega...\nName: MisconceptionName, dtype: object\nMisconception Index: [1507], Misconception: 1507 Carries out operations from left to right rega...\nName: MisconceptionName, dtype: object\nMisconception Index: [1507], Misconception: 1507 Carries out operations from left to right rega...\nName: MisconceptionName, dtype: object\nMisconception Index: [1755], Misconception: 1755 Incorrectly factorises a quadratic\nName: MisconceptionName, dtype: object\nMisconception Index: [2398], Misconception: 2398 Thinks you can divide terms by different facto...\nName: MisconceptionName, dtype: object\nMisconception Index: [1755], Misconception: 1755 Incorrectly factorises a quadratic\nName: MisconceptionName, dtype: object\nMisconception Index: [1287], Misconception: 1287 Believes if you changed all values by the same...\nName: MisconceptionName, dtype: object\nMisconception Index: [1287], Misconception: 1287 Believes if you changed all values by the same...\nName: MisconceptionName, dtype: object\nMisconception Index: [1073], Misconception: 1073 Believes if you add the same value to all numb...\nName: MisconceptionName, dtype: object\n","output_type":"stream"}],"execution_count":43},{"cell_type":"code","source":"from transformers import AutoTokenizer, AutoModelForCausalLM\nfrom peft import PeftModel\n\nMODEL_DIR = '/kaggle/input/qwen2.5-32b/transformers/default/1'\nLORA_DIR = '/kaggle/input/2211-lora-14b/transformers/default/1'\n\n# 모델과 토크나이저 로드\ntokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)\nmodel = AutoModelForCausalLM.from_pretrained(MODEL_DIR, device_map='auto')\nmodel = PeftModel.from_pretrained(model, LORA_DIR, adapter_name=\"lora1\")\n\n# 모델 저장\noutput_path = './exported_model'\nmodel.save_pretrained(output_path)\ntokenizer.save_pretrained(output_path)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2024-12-20T10:15:51.442501Z","iopub.execute_input":"2024-12-20T10:15:51.443391Z"}},"outputs":[{"name":"stderr","text":"/opt/conda/lib/python3.10/site-packages/accelerate/utils/modeling.py:1462: UserWarning: Current model requires 7113553152 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.\n warnings.warn(\n","output_type":"stream"},{"output_type":"display_data","data":{"text/plain":"Loading checkpoint shards: 0%| | 0/17 [00:00