{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [
"bmXln03G-6jt",
"NJN3_9QzBMFz",
"NJUwS6uF_nPO",
"aAlncS4tPKyR",
"qpg8QGRLckvg",
"pwDRSup3lhnX",
"7hDyA__j0_cg",
"DeD2dGjbMWQS",
"MnzenpVK1RUw",
"YLFxW6OTmkSz",
"He3sYIMuYHdj",
"GutjrXHHb0_B",
"u9NimG14vOGS",
"MSgsUE3a7F6V"
]
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"894f0647a28f4449a63e680841518b0d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_707e977651d6492082bb6615ba6b7d6e",
"IPY_MODEL_732150fa2e1a43c7aa61719cb083d6ac",
"IPY_MODEL_86eae60d0e184e27a6d1f73bc7f37560"
],
"layout": "IPY_MODEL_8ad5f7a38f9d4b4780a3a45a9842a56d"
}
},
"707e977651d6492082bb6615ba6b7d6e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_fb6e2cca3ca84dafbf09599481314a2a",
"placeholder": "",
"style": "IPY_MODEL_3188c60873aa4d2faa5be0ea04ea3e09",
"value": "Epoch 5: 100%"
}
},
"732150fa2e1a43c7aa61719cb083d6ac": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_befb029ec4814bf8b12ed8b7c222d8ab",
"max": 48,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_f801d767b2d446229f30c418604c6d84",
"value": 48
}
},
"86eae60d0e184e27a6d1f73bc7f37560": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b459c7018ee94121891d27687ba5ae59",
"placeholder": "",
"style": "IPY_MODEL_d1dad85ea0ae46fab93b0cc0cbc86310",
"value": " 48/48 [00:17<00:00, 2.67it/s, v_num=1, train_loss=25.90]"
}
},
"8ad5f7a38f9d4b4780a3a45a9842a56d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": "inline-flex",
"flex": null,
"flex_flow": "row wrap",
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "100%"
}
},
"fb6e2cca3ca84dafbf09599481314a2a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"3188c60873aa4d2faa5be0ea04ea3e09": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"befb029ec4814bf8b12ed8b7c222d8ab": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": "2",
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f801d767b2d446229f30c418604c6d84": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"b459c7018ee94121891d27687ba5ae59": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d1dad85ea0ae46fab93b0cc0cbc86310": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"dca4feb8999943458c52402e4821fd9b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_962de14c45d440f981cab7f98bb999d4",
"IPY_MODEL_e1670826b6984372b3e1f6d38dc775ce",
"IPY_MODEL_3cd0ed60902944d39d3f0c1acd41cdc3"
],
"layout": "IPY_MODEL_a73eb0974895406f923d3db27483084f"
}
},
"962de14c45d440f981cab7f98bb999d4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c7645ed1ba1a4401be3523e96b611adb",
"placeholder": "",
"style": "IPY_MODEL_d5d2baac77dd489c911e851aea84b2e8",
"value": "modules.json: 100%"
}
},
"e1670826b6984372b3e1f6d38dc775ce": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_de9347d60d8b43888b3603bbfbceb2af",
"max": 229,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_a627519796034c88a763f9cd2ec0808e",
"value": 229
}
},
"3cd0ed60902944d39d3f0c1acd41cdc3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7a868d4c9e3f4a70961c2d056d6a5653",
"placeholder": "",
"style": "IPY_MODEL_b2b43b9c16664528a1c55e66805eff76",
"value": " 229/229 [00:00<00:00, 4.52kB/s]"
}
},
"a73eb0974895406f923d3db27483084f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c7645ed1ba1a4401be3523e96b611adb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d5d2baac77dd489c911e851aea84b2e8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"de9347d60d8b43888b3603bbfbceb2af": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"a627519796034c88a763f9cd2ec0808e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"7a868d4c9e3f4a70961c2d056d6a5653": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b2b43b9c16664528a1c55e66805eff76": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"d02d64dc3c9a499491861bef90086d82": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_7aad2011378e434eac5d46c19a117631",
"IPY_MODEL_208acf230279450ca859204736946a28",
"IPY_MODEL_641efe28133240ccbcff70a3d4424249"
],
"layout": "IPY_MODEL_95152a8da2dd4333afeb475aec2b3def"
}
},
"7aad2011378e434eac5d46c19a117631": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_0388f73993794aa3aba737c2ff7a06e7",
"placeholder": "",
"style": "IPY_MODEL_89414b1d7b06439c8bdb33948be5d11e",
"value": "config_sentence_transformers.json: 100%"
}
},
"208acf230279450ca859204736946a28": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_d1f4213d8cbe4416a26520c67b8898ef",
"max": 122,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_d3d80ca0e7dd4ec396e96892ec86e17c",
"value": 122
}
},
"641efe28133240ccbcff70a3d4424249": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8843fcd259744d94979cc460e24f5ba3",
"placeholder": "",
"style": "IPY_MODEL_0de221a4d2d644c885c0d3f82f8b924c",
"value": " 122/122 [00:00<00:00, 1.57kB/s]"
}
},
"95152a8da2dd4333afeb475aec2b3def": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0388f73993794aa3aba737c2ff7a06e7": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"89414b1d7b06439c8bdb33948be5d11e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"d1f4213d8cbe4416a26520c67b8898ef": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d3d80ca0e7dd4ec396e96892ec86e17c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"8843fcd259744d94979cc460e24f5ba3": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0de221a4d2d644c885c0d3f82f8b924c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"9ed3dd803a884c52aff984be748b0fa2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_d8d8af05c11e4ecfa4c75f9eea5f581a",
"IPY_MODEL_134b61665b5f42ebaa18710e78e4b37a",
"IPY_MODEL_8257e6f8f14c4754a938dd2ac67002aa"
],
"layout": "IPY_MODEL_c16eb49089ee4cbfb42b58b09c529608"
}
},
"d8d8af05c11e4ecfa4c75f9eea5f581a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_be6a7355b7be437586bb693fecae802d",
"placeholder": "",
"style": "IPY_MODEL_4969a761d9f34a62b5502683ad93a723",
"value": "README.md: 100%"
}
},
"134b61665b5f42ebaa18710e78e4b37a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_f39aa003b8b243efa9992302d9c39ecb",
"max": 4126,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_6556302f910349f7aea105bcc4a4e516",
"value": 4126
}
},
"8257e6f8f14c4754a938dd2ac67002aa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_819d6c6459fc4a71bf37cce6daf4cd04",
"placeholder": "",
"style": "IPY_MODEL_6a5b78e5f5834238b48b3b893d1a797c",
"value": " 4.13k/4.13k [00:00<00:00, 79.0kB/s]"
}
},
"c16eb49089ee4cbfb42b58b09c529608": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"be6a7355b7be437586bb693fecae802d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4969a761d9f34a62b5502683ad93a723": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"f39aa003b8b243efa9992302d9c39ecb": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6556302f910349f7aea105bcc4a4e516": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"819d6c6459fc4a71bf37cce6daf4cd04": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6a5b78e5f5834238b48b3b893d1a797c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"d2acc67c459840c5872a0aed088a673a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_b22bd6adc4944ac2a8223d1fc54e278e",
"IPY_MODEL_9eed2253e2794b96b078174c26f1599e",
"IPY_MODEL_9c12caeb617e4a079bb4960698a9addf"
],
"layout": "IPY_MODEL_f96eeb6fb26243a0988eed5281101c36"
}
},
"b22bd6adc4944ac2a8223d1fc54e278e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_543c6458885e4c92911ac85a6bceff99",
"placeholder": "",
"style": "IPY_MODEL_be795953cbe942d891d3e2e1a857b1c7",
"value": "sentence_bert_config.json: 100%"
}
},
"9eed2253e2794b96b078174c26f1599e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_cffa853377644f2f8a8bba31795146a0",
"max": 53,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_08a68525c0174d77b8fa1f7a16b37f3d",
"value": 53
}
},
"9c12caeb617e4a079bb4960698a9addf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e5f8df883014424997370c858ca75119",
"placeholder": "",
"style": "IPY_MODEL_980b8f08ae1f441ba4fcd9fec7bba172",
"value": " 53.0/53.0 [00:00<00:00, 1.17kB/s]"
}
},
"f96eeb6fb26243a0988eed5281101c36": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"543c6458885e4c92911ac85a6bceff99": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"be795953cbe942d891d3e2e1a857b1c7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"cffa853377644f2f8a8bba31795146a0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"08a68525c0174d77b8fa1f7a16b37f3d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"e5f8df883014424997370c858ca75119": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"980b8f08ae1f441ba4fcd9fec7bba172": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"fb399b6ccf14488e8f3a5ee25ba1f519": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_9689a4b5fc3443c988355d51fb8b31dd",
"IPY_MODEL_07c7545951bd45838b37656a7d55c4f5",
"IPY_MODEL_375f551994f141a197d717059513a15a"
],
"layout": "IPY_MODEL_f8e89fd727a747d1a205f0ea3452a94a"
}
},
"9689a4b5fc3443c988355d51fb8b31dd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b0d6e4b6564c46348efe5ee5e24a5a36",
"placeholder": "",
"style": "IPY_MODEL_ab1c746c9a414d16a11db2ecad0797ab",
"value": "config.json: 100%"
}
},
"07c7545951bd45838b37656a7d55c4f5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7297eacf7ca34b3bba0029375b8c811a",
"max": 723,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_0075b2bdafa04862b6a871356351dfa1",
"value": 723
}
},
"375f551994f141a197d717059513a15a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_64db601a4cca46a882e32f771c3dba3d",
"placeholder": "",
"style": "IPY_MODEL_80946e3d99314550b777b8a62ffe0521",
"value": " 723/723 [00:00<00:00, 16.4kB/s]"
}
},
"f8e89fd727a747d1a205f0ea3452a94a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b0d6e4b6564c46348efe5ee5e24a5a36": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ab1c746c9a414d16a11db2ecad0797ab": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"7297eacf7ca34b3bba0029375b8c811a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0075b2bdafa04862b6a871356351dfa1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"64db601a4cca46a882e32f771c3dba3d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"80946e3d99314550b777b8a62ffe0521": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"f810aade9a2943aaa5949f30774ab26f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_ea5a3b55a5bc450aa0f92c88625778f8",
"IPY_MODEL_ba21fc338bfc4269bef02e50c0711b7c",
"IPY_MODEL_5a7c1b813c5b4ceea534f74765c5e605"
],
"layout": "IPY_MODEL_894779b9a1384b668593ae969055c32c"
}
},
"ea5a3b55a5bc450aa0f92c88625778f8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_75d1b0ba8e7048cfb30e507ec3f24ff8",
"placeholder": "",
"style": "IPY_MODEL_59dda900f0894c54b6ce7302e761ad43",
"value": "model.safetensors: 100%"
}
},
"ba21fc338bfc4269bef02e50c0711b7c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_bd87f489e77d44d9b1e066f16c0ca9ed",
"max": 1112201288,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_4e0bd61e94594f5db415e7d5315de7c0",
"value": 1112201288
}
},
"5a7c1b813c5b4ceea534f74765c5e605": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_c239ef650cb54478825730509e2a1bd2",
"placeholder": "",
"style": "IPY_MODEL_ee1503ea36ab41d2a48155fbe442d909",
"value": " 1.11G/1.11G [00:14<00:00, 95.1MB/s]"
}
},
"894779b9a1384b668593ae969055c32c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"75d1b0ba8e7048cfb30e507ec3f24ff8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"59dda900f0894c54b6ce7302e761ad43": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"bd87f489e77d44d9b1e066f16c0ca9ed": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"4e0bd61e94594f5db415e7d5315de7c0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"c239ef650cb54478825730509e2a1bd2": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ee1503ea36ab41d2a48155fbe442d909": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"8798e6b2ae124152bb6cca66b14c0d3a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_ba5e2e03809a428796f8905392eae113",
"IPY_MODEL_27067ff09cbe4ad091ffdd18ea30bc17",
"IPY_MODEL_d722357cd295474290b5dc0ebd704f4c"
],
"layout": "IPY_MODEL_d89ed637839f4c9abd4268f3d71f3eab"
}
},
"ba5e2e03809a428796f8905392eae113": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_63c52b32dbae41b4a82020cbf030ddb4",
"placeholder": "",
"style": "IPY_MODEL_3adaba744a5740318b2cf4188429acc0",
"value": "tokenizer_config.json: 100%"
}
},
"27067ff09cbe4ad091ffdd18ea30bc17": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_cc0fc21f8f2d43b4859f1886a33cf0d9",
"max": 402,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_d696138c3379401fae995f49d0d4cd48",
"value": 402
}
},
"d722357cd295474290b5dc0ebd704f4c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e291c590d5ad4f62b29f65a451fdea09",
"placeholder": "",
"style": "IPY_MODEL_f42a6bf637f846659b0f7d7d198e6eb4",
"value": " 402/402 [00:00<00:00, 23.5kB/s]"
}
},
"d89ed637839f4c9abd4268f3d71f3eab": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"63c52b32dbae41b4a82020cbf030ddb4": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"3adaba744a5740318b2cf4188429acc0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"cc0fc21f8f2d43b4859f1886a33cf0d9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d696138c3379401fae995f49d0d4cd48": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"e291c590d5ad4f62b29f65a451fdea09": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f42a6bf637f846659b0f7d7d198e6eb4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"4a74fa41f5d8471e8db6f79ee06d88a1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_df3a741ce3334174a1ba1c185bcf7e4d",
"IPY_MODEL_27e51c8b4ed54557987dc6362be11609",
"IPY_MODEL_af075877b5864bd3b6f98b13695f1e0d"
],
"layout": "IPY_MODEL_b8935da2360a4a8f8be0f1cf12049704"
}
},
"df3a741ce3334174a1ba1c185bcf7e4d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_0007b3003cf94edfbe879278e68ad521",
"placeholder": "",
"style": "IPY_MODEL_91d456c10ac14ddda54da8f6b0d6c45a",
"value": "sentencepiece.bpe.model: 100%"
}
},
"27e51c8b4ed54557987dc6362be11609": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_304f170991ee4e538908b36bc5ee5c71",
"max": 5069051,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_e1cebdb893b64ef0ac977255e216143a",
"value": 5069051
}
},
"af075877b5864bd3b6f98b13695f1e0d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b7327644f1a14c9f9b620747902501f1",
"placeholder": "",
"style": "IPY_MODEL_ef7f978170c746f09aaa8f1450c02548",
"value": " 5.07M/5.07M [00:00<00:00, 78.9MB/s]"
}
},
"b8935da2360a4a8f8be0f1cf12049704": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"0007b3003cf94edfbe879278e68ad521": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"91d456c10ac14ddda54da8f6b0d6c45a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"304f170991ee4e538908b36bc5ee5c71": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"e1cebdb893b64ef0ac977255e216143a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"b7327644f1a14c9f9b620747902501f1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ef7f978170c746f09aaa8f1450c02548": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"757581a0b5ca4ab5af853223f95e21e4": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_ace6230a55254740919ad2ee94309bff",
"IPY_MODEL_569e73779e594bc0acb72bdcec240c26",
"IPY_MODEL_506eec7a17cd4657a36d3970c81689b0"
],
"layout": "IPY_MODEL_d0cb73af415c49f8887fe530cb58e218"
}
},
"ace6230a55254740919ad2ee94309bff": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_b285870fcfd04f13a064b4f41bcdc15c",
"placeholder": "",
"style": "IPY_MODEL_3b63a5eb40bc46dbb59ddf0dab56f961",
"value": "tokenizer.json: 100%"
}
},
"569e73779e594bc0acb72bdcec240c26": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_39abfc34a8de449ab918374cd617983d",
"max": 9081518,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_6dbd7ac42eae437092baca2fcc660b50",
"value": 9081518
}
},
"506eec7a17cd4657a36d3970c81689b0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_31434f024801490aa52a110bb0814510",
"placeholder": "",
"style": "IPY_MODEL_61f56526d84e409d879cfd9935eb5084",
"value": " 9.08M/9.08M [00:00<00:00, 23.0MB/s]"
}
},
"d0cb73af415c49f8887fe530cb58e218": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"b285870fcfd04f13a064b4f41bcdc15c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"3b63a5eb40bc46dbb59ddf0dab56f961": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"39abfc34a8de449ab918374cd617983d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6dbd7ac42eae437092baca2fcc660b50": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"31434f024801490aa52a110bb0814510": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"61f56526d84e409d879cfd9935eb5084": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"ffdf2ed2b8d2412997a9845a85d1f4ab": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_5aa4451625f04032a62b35829b5cbfa3",
"IPY_MODEL_e8202c4e66814d07bf4e7e980e930a90",
"IPY_MODEL_a54eef1414914da0981e3037d19e79bd"
],
"layout": "IPY_MODEL_8dcd120dfe98453b9f4e9cb145696940"
}
},
"5aa4451625f04032a62b35829b5cbfa3": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_1741b6afdc08426ea2f7058457413999",
"placeholder": "",
"style": "IPY_MODEL_c87cf21dbf074c699c1c57aa604940e6",
"value": "special_tokens_map.json: 100%"
}
},
"e8202c4e66814d07bf4e7e980e930a90": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3dec68173930485d82b3a0d53deda040",
"max": 239,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_320ea31c60b34983bf6b9c1a6d258eeb",
"value": 239
}
},
"a54eef1414914da0981e3037d19e79bd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_1d7d3a59a8cc4e03825aebbd9fa8bdb6",
"placeholder": "",
"style": "IPY_MODEL_07215f1e31c44c8893b096808be4d96e",
"value": " 239/239 [00:00<00:00, 22.5kB/s]"
}
},
"8dcd120dfe98453b9f4e9cb145696940": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"1741b6afdc08426ea2f7058457413999": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c87cf21dbf074c699c1c57aa604940e6": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"3dec68173930485d82b3a0d53deda040": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"320ea31c60b34983bf6b9c1a6d258eeb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"1d7d3a59a8cc4e03825aebbd9fa8bdb6": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"07215f1e31c44c8893b096808be4d96e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"438a2b975062455fb5fc06eb3cdaf625": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_0445438e0b9c4ac680f5e82183708492",
"IPY_MODEL_5d0f54bb9df64b8786538424532d6397",
"IPY_MODEL_a325f3831f664a0c96061d16e1d9c358"
],
"layout": "IPY_MODEL_aac58743d8d64b8591abe9d421a43565"
}
},
"0445438e0b9c4ac680f5e82183708492": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_aa49127f1f4d4b168a5d5b69dc7fed0d",
"placeholder": "",
"style": "IPY_MODEL_fe79e4d23eb64d82bd818ac4c369303a",
"value": "1_Pooling%2Fconfig.json: 100%"
}
},
"5d0f54bb9df64b8786538424532d6397": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3d60222523434c1fbc1647b104d2af20",
"max": 190,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_44dba6a745254c3ab17a76ba184b4ecd",
"value": 190
}
},
"a325f3831f664a0c96061d16e1d9c358": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_d530476ab4934024a5753e5a520d7772",
"placeholder": "",
"style": "IPY_MODEL_6bc1fefb830a4636b06a0f692bf7d71b",
"value": " 190/190 [00:00<00:00, 10.9kB/s]"
}
},
"aac58743d8d64b8591abe9d421a43565": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"aa49127f1f4d4b168a5d5b69dc7fed0d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"fe79e4d23eb64d82bd818ac4c369303a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"3d60222523434c1fbc1647b104d2af20": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"44dba6a745254c3ab17a76ba184b4ecd": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"d530476ab4934024a5753e5a520d7772": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"6bc1fefb830a4636b06a0f692bf7d71b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# ***REGRESSION***"
],
"metadata": {
"id": "bmXln03G-6jt"
}
},
{
"cell_type": "code",
"source": [
"!pip install catboost\n",
"!pip install optuna"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DjZTGudd_ZNL",
"outputId": "27f69bb2-f6a3-467d-a88a-94f2ad9533ce"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting catboost\n",
" Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)\n",
"Requirement already satisfied: graphviz in /usr/local/lib/python3.11/dist-packages (from catboost) (0.20.3)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (from catboost) (3.10.0)\n",
"Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.11/dist-packages (from catboost) (1.26.4)\n",
"Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.11/dist-packages (from catboost) (2.2.2)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from catboost) (1.13.1)\n",
"Requirement already satisfied: plotly in /usr/local/lib/python3.11/dist-packages (from catboost) (5.24.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.11/dist-packages (from catboost) (1.17.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=0.24->catboost) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=0.24->catboost) (2025.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=0.24->catboost) (2025.1)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (4.56.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (1.4.8)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (24.2)\n",
"Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (11.1.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (3.2.1)\n",
"Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.11/dist-packages (from plotly->catboost) (9.0.0)\n",
"Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.7/98.7 MB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: catboost\n",
"Successfully installed catboost-1.2.7\n",
"Collecting optuna\n",
" Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)\n",
"Collecting alembic>=1.5.0 (from optuna)\n",
" Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)\n",
"Collecting colorlog (from optuna)\n",
" Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from optuna) (1.26.4)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from optuna) (24.2)\n",
"Requirement already satisfied: sqlalchemy>=1.4.2 in /usr/local/lib/python3.11/dist-packages (from optuna) (2.0.38)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from optuna) (4.67.1)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from optuna) (6.0.2)\n",
"Collecting Mako (from alembic>=1.5.0->optuna)\n",
" Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)\n",
"Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.11/dist-packages (from alembic>=1.5.0->optuna) (4.12.2)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.11/dist-packages (from sqlalchemy>=1.4.2->optuna) (3.1.1)\n",
"Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.11/dist-packages (from Mako->alembic>=1.5.0->optuna) (3.0.2)\n",
"Downloading optuna-4.2.1-py3-none-any.whl (383 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m383.6/383.6 kB\u001b[0m \u001b[31m19.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.6/233.6 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)\n",
"Downloading Mako-1.3.9-py3-none-any.whl (78 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: Mako, colorlog, alembic, optuna\n",
"Successfully installed Mako-1.3.9 alembic-1.14.1 colorlog-6.9.0 optuna-4.2.1\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.impute import SimpleImputer\n",
"import numpy as np\n",
"import pandas as pd\n",
"from catboost import CatBoostRegressor, Pool\n",
"from sklearn.metrics import mean_squared_error\n",
"import optuna\n",
"from sklearn.model_selection import train_test_split"
],
"metadata": {
"id": "Mm3YgMLcxrTF"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train = pd.read_csv('/content/train (6).csv', index_col='Id')\n",
"train.head(3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"id": "pMBWe5Bs-6RU",
"outputId": "041e66c6-2e4e-4c9d-c4a6-ee0f9b2e84d0"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"Id \n",
"1 60 RL 65.0 8450 Pave NaN Reg \n",
"2 20 RL 80.0 9600 Pave NaN Reg \n",
"3 60 RL 68.0 11250 Pave NaN IR1 \n",
"\n",
" LandContour Utilities LotConfig ... PoolArea PoolQC Fence MiscFeature \\\n",
"Id ... \n",
"1 Lvl AllPub Inside ... 0 NaN NaN NaN \n",
"2 Lvl AllPub FR2 ... 0 NaN NaN NaN \n",
"3 Lvl AllPub Inside ... 0 NaN NaN NaN \n",
"\n",
" MiscVal MoSold YrSold SaleType SaleCondition SalePrice \n",
"Id \n",
"1 0 2 2008 WD Normal 208500 \n",
"2 0 5 2007 WD Normal 181500 \n",
"3 0 9 2008 WD Normal 223500 \n",
"\n",
"[3 rows x 80 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MSSubClass | \n",
" MSZoning | \n",
" LotFrontage | \n",
" LotArea | \n",
" Street | \n",
" Alley | \n",
" LotShape | \n",
" LandContour | \n",
" Utilities | \n",
" LotConfig | \n",
" ... | \n",
" PoolArea | \n",
" PoolQC | \n",
" Fence | \n",
" MiscFeature | \n",
" MiscVal | \n",
" MoSold | \n",
" YrSold | \n",
" SaleType | \n",
" SaleCondition | \n",
" SalePrice | \n",
"
\n",
" \n",
" Id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 60 | \n",
" RL | \n",
" 65.0 | \n",
" 8450 | \n",
" Pave | \n",
" NaN | \n",
" Reg | \n",
" Lvl | \n",
" AllPub | \n",
" Inside | \n",
" ... | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" 2 | \n",
" 2008 | \n",
" WD | \n",
" Normal | \n",
" 208500 | \n",
"
\n",
" \n",
" 2 | \n",
" 20 | \n",
" RL | \n",
" 80.0 | \n",
" 9600 | \n",
" Pave | \n",
" NaN | \n",
" Reg | \n",
" Lvl | \n",
" AllPub | \n",
" FR2 | \n",
" ... | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" 5 | \n",
" 2007 | \n",
" WD | \n",
" Normal | \n",
" 181500 | \n",
"
\n",
" \n",
" 3 | \n",
" 60 | \n",
" RL | \n",
" 68.0 | \n",
" 11250 | \n",
" Pave | \n",
" NaN | \n",
" IR1 | \n",
" Lvl | \n",
" AllPub | \n",
" Inside | \n",
" ... | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" 9 | \n",
" 2008 | \n",
" WD | \n",
" Normal | \n",
" 223500 | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 80 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "train"
}
},
"metadata": {},
"execution_count": 25
}
]
},
{
"cell_type": "code",
"source": [
"columns = list(train.columns)\n",
"categorical_features = []\n",
"for i in columns:\n",
" if str(type(train[i].iloc[0])) == \"\":\n",
" categorical_features.append(i)"
],
"metadata": {
"id": "AmXBmYG2DK_N"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"missing_values_table = train.isna().sum().reset_index()\n",
"missing_values_table.columns = ['Column', 'Missing Values']\n",
"missing_values_table['% of Total Values'] = (missing_values_table['Missing Values'] / len(train)) * 100\n",
"\n",
"print(missing_values_table['% of Total Values'])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WWzL-Oe-EILC",
"outputId": "1118d6bb-f44f-4b4a-a0b4-f4b4bdcef6cf"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0 0.000000\n",
"1 0.000000\n",
"2 17.739726\n",
"3 0.000000\n",
"4 0.000000\n",
" ... \n",
"75 0.000000\n",
"76 0.000000\n",
"77 0.000000\n",
"78 0.000000\n",
"79 0.000000\n",
"Name: % of Total Values, Length: 80, dtype: float64\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"a = list(missing_values_table['% of Total Values'])\n",
"num = []\n",
"cat = []\n",
"drp = []\n",
"for i in range(len(columns)):\n",
" if missing_values_table['% of Total Values'].iloc[i] > 0:\n",
" if missing_values_table['% of Total Values'].iloc[i] > 40:\n",
" drp.append(str(missing_values_table['Column'].iloc[i]))\n",
" elif str(missing_values_table['Column'].iloc[i]) in categorical_features:\n",
" print(missing_values_table['Column'].iloc[i], missing_values_table['% of Total Values'].iloc[i], \"CATEGORICAL\")\n",
" cat.append(str(missing_values_table['Column'].iloc[i]))\n",
" else:\n",
" print(missing_values_table['Column'].iloc[i], missing_values_table['% of Total Values'].iloc[i])\n",
" num.append(str(missing_values_table['Column'].iloc[i]))\n",
"\n",
"print(drp)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "boRDfj2FE3MJ",
"outputId": "cde30a7f-cf35-48ce-8801-6d8ce7453020"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"LotFrontage 17.73972602739726\n",
"MasVnrArea 0.547945205479452\n",
"BsmtQual 2.5342465753424657 CATEGORICAL\n",
"BsmtCond 2.5342465753424657 CATEGORICAL\n",
"BsmtExposure 2.6027397260273974 CATEGORICAL\n",
"BsmtFinType1 2.5342465753424657 CATEGORICAL\n",
"BsmtFinType2 2.6027397260273974 CATEGORICAL\n",
"Electrical 0.0684931506849315 CATEGORICAL\n",
"GarageType 5.5479452054794525 CATEGORICAL\n",
"GarageYrBlt 5.5479452054794525\n",
"GarageFinish 5.5479452054794525 CATEGORICAL\n",
"GarageQual 5.5479452054794525 CATEGORICAL\n",
"GarageCond 5.5479452054794525 CATEGORICAL\n",
"['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"train = train.drop(drp, axis = 1)\n",
"\n",
"imputer = SimpleImputer(strategy='mean')\n",
"train[num] = imputer.fit_transform(train[num])\n",
"\n",
"imputer_cat = SimpleImputer(strategy='most_frequent')\n",
"train[cat] = imputer_cat.fit_transform(train[cat])"
],
"metadata": {
"id": "XoJkLYjGqzIM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"y = train['SalePrice']\n",
"x = train.drop('SalePrice', axis =1)\n",
"\n",
"train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.2)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 224
},
"id": "fjfWVR3__nMr",
"outputId": "a6688bbb-d91e-473d-e0e2-c2810100ce5f"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour \\\n",
"Id \n",
"973 120 RL 55.0 7892 Pave Reg Lvl \n",
"805 20 RL 75.0 9000 Pave Reg Lvl \n",
"1190 60 RL 60.0 7500 Pave Reg Lvl \n",
"\n",
" Utilities LotConfig LandSlope ... OpenPorchSF EnclosedPorch 3SsnPorch \\\n",
"Id ... \n",
"973 AllPub Inside Gtl ... 0 0 0 \n",
"805 AllPub Inside Gtl ... 0 0 0 \n",
"1190 AllPub Inside Gtl ... 60 0 0 \n",
"\n",
" ScreenPorch PoolArea MiscVal MoSold YrSold SaleType SaleCondition \n",
"Id \n",
"973 0 0 0 4 2010 WD Normal \n",
"805 0 0 0 6 2006 WD Family \n",
"1190 0 0 0 6 2010 WD Normal \n",
"\n",
"[3 rows x 73 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" MSSubClass | \n",
" MSZoning | \n",
" LotFrontage | \n",
" LotArea | \n",
" Street | \n",
" LotShape | \n",
" LandContour | \n",
" Utilities | \n",
" LotConfig | \n",
" LandSlope | \n",
" ... | \n",
" OpenPorchSF | \n",
" EnclosedPorch | \n",
" 3SsnPorch | \n",
" ScreenPorch | \n",
" PoolArea | \n",
" MiscVal | \n",
" MoSold | \n",
" YrSold | \n",
" SaleType | \n",
" SaleCondition | \n",
"
\n",
" \n",
" Id | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 973 | \n",
" 120 | \n",
" RL | \n",
" 55.0 | \n",
" 7892 | \n",
" Pave | \n",
" Reg | \n",
" Lvl | \n",
" AllPub | \n",
" Inside | \n",
" Gtl | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 4 | \n",
" 2010 | \n",
" WD | \n",
" Normal | \n",
"
\n",
" \n",
" 805 | \n",
" 20 | \n",
" RL | \n",
" 75.0 | \n",
" 9000 | \n",
" Pave | \n",
" Reg | \n",
" Lvl | \n",
" AllPub | \n",
" Inside | \n",
" Gtl | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 6 | \n",
" 2006 | \n",
" WD | \n",
" Family | \n",
"
\n",
" \n",
" 1190 | \n",
" 60 | \n",
" RL | \n",
" 60.0 | \n",
" 7500 | \n",
" Pave | \n",
" Reg | \n",
" Lvl | \n",
" AllPub | \n",
" Inside | \n",
" Gtl | \n",
" ... | \n",
" 60 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 6 | \n",
" 2010 | \n",
" WD | \n",
" Normal | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 73 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "train_x"
}
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"source": [
"columns = list(train_x.columns)\n",
"categorical_features = []\n",
"for i in columns:\n",
" if str(type(train[i].iloc[0])) == \"\":\n",
" categorical_features.append(i)"
],
"metadata": {
"id": "aGT0YgWAyag6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def objective(trial):\n",
" params = {\n",
" \"iterations\": trial.suggest_int(\"iterations\", 500, 1000),\n",
" \"depth\": trial.suggest_int(\"depth\", 3, 6),\n",
" \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.01, 0.3, log=True),\n",
" \"l2_leaf_reg\": trial.suggest_float(\"l2_leaf_reg\", 1, 5, log=True),\n",
" \"loss_function\": \"RMSE\",\n",
" \"eval_metric\": \"RMSE\",\n",
" \"random_seed\": 42,\n",
" \"verbose\": 0\n",
" }\n",
"\n",
" model = CatBoostRegressor(**params)\n",
" model.fit(train_x, train_y, eval_set=(val_x, val_y), verbose=1000, cat_features=categorical_features)\n",
"\n",
" preds = model.predict(val_x)\n",
" rmse = mean_squared_error(val_y, preds)\n",
" rmse = np.sqrt(rmse)\n",
" return rmse\n",
"\n",
"study = optuna.create_study(direction=\"minimize\")\n",
"study.optimize(objective, n_trials=20) # Количество итераций\n",
"\n",
"# 📌 6. Обучение модели с лучшими параметрами\n",
"best_params = study.best_params\n",
"best_model = CatBoostRegressor(**best_params)\n",
"best_model.fit(train_x, train_y, eval_set=(val_x, val_y), verbose=100, cat_features=categorical_features)\n",
"\n",
"# 📌 7. Оценка модели\n",
"final_preds = best_model.predict(val_x)\n",
"rmse = mean_squared_error(val_y, final_preds)\n",
"rmse = np.sqrt(rmse)\n",
"print(f\"Final RMSE: {rmse}\")"
],
"metadata": {
"id": "znndOvytsZjD",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "d89b6f06-5024-4d49-b9e5-3c4230f75ffc"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:31:59,003] A new study created in memory with name: no-name-58f52034-88e2-4d88-946d-5248e270279f\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"0:\tlearn: 78333.0649674\ttest: 79904.7958863\tbest: 79904.7958863 (0)\ttotal: 27.3ms\tremaining: 18s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:32:11,019] Trial 0 finished with value: 27590.170657313705 and parameters: {'iterations': 660, 'depth': 5, 'learning_rate': 0.016357363220883117, 'l2_leaf_reg': 1.0467755164243835}. Best is trial 0 with value: 27590.170657313705.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"659:\tlearn: 15502.3089534\ttest: 27590.1706573\tbest: 27590.1706573 (659)\ttotal: 11.8s\tremaining: 0us\n",
"\n",
"bestTest = 27590.17066\n",
"bestIteration = 659\n",
"\n",
"0:\tlearn: 78018.7607019\ttest: 79644.0190750\tbest: 79644.0190750 (0)\ttotal: 19.2ms\tremaining: 18.9s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:32:31,044] Trial 1 finished with value: 28054.269444924557 and parameters: {'iterations': 983, 'depth': 6, 'learning_rate': 0.023902850062876548, 'l2_leaf_reg': 3.5310269432730204}. Best is trial 0 with value: 27590.170657313705.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"982:\tlearn: 11591.3196453\ttest: 28054.5273026\tbest: 28054.2694449 (980)\ttotal: 19.9s\tremaining: 0us\n",
"\n",
"bestTest = 28054.26944\n",
"bestIteration = 980\n",
"\n",
"Shrink model to first 981 iterations.\n",
"0:\tlearn: 75845.8232014\ttest: 77563.7878037\tbest: 77563.7878037 (0)\ttotal: 5.46ms\tremaining: 5.36s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:32:36,495] Trial 2 finished with value: 28698.64513756474 and parameters: {'iterations': 982, 'depth': 3, 'learning_rate': 0.07393437088919934, 'l2_leaf_reg': 4.407243596530369}. Best is trial 0 with value: 27590.170657313705.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"981:\tlearn: 12432.3504777\ttest: 28710.5183190\tbest: 28698.6451376 (980)\ttotal: 5.33s\tremaining: 0us\n",
"\n",
"bestTest = 28698.64514\n",
"bestIteration = 980\n",
"\n",
"Shrink model to first 981 iterations.\n",
"0:\tlearn: 74955.9690638\ttest: 76649.8689567\tbest: 76649.8689567 (0)\ttotal: 7.68ms\tremaining: 3.96s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:32:41,288] Trial 3 finished with value: 27869.917555899843 and parameters: {'iterations': 517, 'depth': 4, 'learning_rate': 0.09140291137980973, 'l2_leaf_reg': 1.8278943374326093}. Best is trial 0 with value: 27590.170657313705.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"516:\tlearn: 10760.0069501\ttest: 27872.5799946\tbest: 27869.9175559 (514)\ttotal: 4.7s\tremaining: 0us\n",
"\n",
"bestTest = 27869.91756\n",
"bestIteration = 514\n",
"\n",
"Shrink model to first 515 iterations.\n",
"0:\tlearn: 75902.0846134\ttest: 77618.1762520\tbest: 77618.1762520 (0)\ttotal: 5.85ms\tremaining: 5.36s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:32:45,628] Trial 4 finished with value: 29733.270157741175 and parameters: {'iterations': 917, 'depth': 3, 'learning_rate': 0.07192589301350974, 'l2_leaf_reg': 3.2029754979257365}. Best is trial 0 with value: 27590.170657313705.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"916:\tlearn: 12609.4158087\ttest: 29733.2701577\tbest: 29733.2701577 (916)\ttotal: 4.23s\tremaining: 0us\n",
"\n",
"bestTest = 29733.27016\n",
"bestIteration = 916\n",
"\n",
"0:\tlearn: 75582.1119423\ttest: 77314.1873521\tbest: 77314.1873521 (0)\ttotal: 5.75ms\tremaining: 3.94s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:32:49,583] Trial 5 finished with value: 27968.432737151175 and parameters: {'iterations': 686, 'depth': 3, 'learning_rate': 0.07995901292964724, 'l2_leaf_reg': 4.308480258560373}. Best is trial 0 with value: 27590.170657313705.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"685:\tlearn: 14109.0275208\ttest: 27971.4125965\tbest: 27968.4327372 (682)\ttotal: 3.85s\tremaining: 0us\n",
"\n",
"bestTest = 27968.43274\n",
"bestIteration = 682\n",
"\n",
"Shrink model to first 683 iterations.\n",
"0:\tlearn: 75970.1743846\ttest: 77762.0780575\tbest: 77762.0780575 (0)\ttotal: 14.7ms\tremaining: 8.91s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:33:01,983] Trial 6 finished with value: 25505.915021309767 and parameters: {'iterations': 607, 'depth': 6, 'learning_rate': 0.0677948134301802, 'l2_leaf_reg': 2.0814646273997335}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"606:\tlearn: 7610.4218093\ttest: 25505.9150213\tbest: 25505.9150213 (606)\ttotal: 12.3s\tremaining: 0us\n",
"\n",
"bestTest = 25505.91502\n",
"bestIteration = 606\n",
"\n",
"0:\tlearn: 78495.5523435\ttest: 80059.4150816\tbest: 80059.4150816 (0)\ttotal: 7.88ms\tremaining: 6.68s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:33:08,075] Trial 7 finished with value: 29586.253948823174 and parameters: {'iterations': 848, 'depth': 4, 'learning_rate': 0.013374392400396598, 'l2_leaf_reg': 2.5661120089875267}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"847:\tlearn: 18062.8046939\ttest: 29605.7690621\tbest: 29586.2539488 (845)\ttotal: 5.97s\tremaining: 0us\n",
"\n",
"bestTest = 29586.25395\n",
"bestIteration = 845\n",
"\n",
"Shrink model to first 846 iterations.\n",
"0:\tlearn: 73604.1185921\ttest: 75594.7388721\tbest: 75594.7388721 (0)\ttotal: 16.3ms\tremaining: 14.4s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:33:29,081] Trial 8 finished with value: 25942.63417941992 and parameters: {'iterations': 882, 'depth': 6, 'learning_rate': 0.11850075769005632, 'l2_leaf_reg': 1.2815307519387589}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"881:\tlearn: 2670.1533352\ttest: 25988.0585841\tbest: 25942.6341794 (629)\ttotal: 20.8s\tremaining: 0us\n",
"\n",
"bestTest = 25942.63418\n",
"bestIteration = 629\n",
"\n",
"Shrink model to first 630 iterations.\n",
"0:\tlearn: 70492.3985001\ttest: 72724.8985510\tbest: 72724.8985510 (0)\ttotal: 15.7ms\tremaining: 14.4s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:33:50,797] Trial 9 finished with value: 28116.9240097469 and parameters: {'iterations': 920, 'depth': 6, 'learning_rate': 0.18979605485431683, 'l2_leaf_reg': 1.6195835013784619}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"919:\tlearn: 1285.3683811\ttest: 28119.6568015\tbest: 28116.9240097 (912)\ttotal: 21.6s\tremaining: 0us\n",
"\n",
"bestTest = 28116.92401\n",
"bestIteration = 912\n",
"\n",
"Shrink model to first 913 iterations.\n",
"0:\tlearn: 77376.3347012\ttest: 78983.1492222\tbest: 78983.1492222 (0)\ttotal: 10.7ms\tremaining: 5.35s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:33:57,061] Trial 10 finished with value: 28578.426794142797 and parameters: {'iterations': 500, 'depth': 5, 'learning_rate': 0.037494437856804716, 'l2_leaf_reg': 2.120553073873791}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"499:\tlearn: 13757.2642000\ttest: 28583.2570555\tbest: 28578.4267941 (496)\ttotal: 6.15s\tremaining: 0us\n",
"\n",
"bestTest = 28578.42679\n",
"bestIteration = 496\n",
"\n",
"Shrink model to first 497 iterations.\n",
"0:\tlearn: 71124.7124091\ttest: 73315.6531358\tbest: 73315.6531358 (0)\ttotal: 15.2ms\tremaining: 12.1s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:34:12,433] Trial 11 finished with value: 25654.218131104917 and parameters: {'iterations': 794, 'depth': 6, 'learning_rate': 0.174045018590583, 'l2_leaf_reg': 1.2553559685810602}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"793:\tlearn: 1806.1983257\ttest: 25665.0388059\tbest: 25654.2181311 (748)\ttotal: 15.2s\tremaining: 0us\n",
"\n",
"bestTest = 25654.21813\n",
"bestIteration = 748\n",
"\n",
"Shrink model to first 749 iterations.\n",
"0:\tlearn: 65965.9525773\ttest: 67916.9191772\tbest: 67916.9191772 (0)\ttotal: 11ms\tremaining: 8.49s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:34:21,637] Trial 12 finished with value: 27568.089228145127 and parameters: {'iterations': 772, 'depth': 5, 'learning_rate': 0.29984336832096536, 'l2_leaf_reg': 1.4187464720313705}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"771:\tlearn: 2005.5261278\ttest: 27917.9813807\tbest: 27568.0892281 (122)\ttotal: 9.07s\tremaining: 0us\n",
"\n",
"bestTest = 27568.08923\n",
"bestIteration = 122\n",
"\n",
"Shrink model to first 123 iterations.\n",
"0:\tlearn: 71830.7050936\ttest: 73968.8924491\tbest: 73968.8924491 (0)\ttotal: 18.1ms\tremaining: 10.6s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:34:33,113] Trial 13 finished with value: 27783.988974347536 and parameters: {'iterations': 584, 'depth': 6, 'learning_rate': 0.15753043024546495, 'l2_leaf_reg': 1.0844493978212573}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"583:\tlearn: 2932.3559304\ttest: 27843.9749369\tbest: 27783.9889743 (530)\ttotal: 11.3s\tremaining: 0us\n",
"\n",
"bestTest = 27783.98897\n",
"bestIteration = 530\n",
"\n",
"Shrink model to first 531 iterations.\n",
"0:\tlearn: 77459.1675696\ttest: 79062.3182394\tbest: 79062.3182394 (0)\ttotal: 13.9ms\tremaining: 10.6s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:34:41,591] Trial 14 finished with value: 26010.87795996257 and parameters: {'iterations': 765, 'depth': 5, 'learning_rate': 0.035862946535418744, 'l2_leaf_reg': 2.3974801719363676}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"764:\tlearn: 11631.4998202\ttest: 26010.9213248\tbest: 26010.8779600 (763)\ttotal: 8.32s\tremaining: 0us\n",
"\n",
"bestTest = 26010.87796\n",
"bestIteration = 763\n",
"\n",
"Shrink model to first 764 iterations.\n",
"0:\tlearn: 67429.2452416\ttest: 69891.3553373\tbest: 69891.3553373 (0)\ttotal: 33.2ms\tremaining: 22s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:34:55,343] Trial 15 finished with value: 28138.792254996108 and parameters: {'iterations': 663, 'depth': 6, 'learning_rate': 0.2632530567491658, 'l2_leaf_reg': 1.8431661317560617}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"662:\tlearn: 1571.5532066\ttest: 28146.6342828\tbest: 28138.7922550 (556)\ttotal: 13.6s\tremaining: 0us\n",
"\n",
"bestTest = 28138.79225\n",
"bestIteration = 556\n",
"\n",
"Shrink model to first 557 iterations.\n",
"0:\tlearn: 77293.1263869\ttest: 78982.5109645\tbest: 78982.5109645 (0)\ttotal: 15.7ms\tremaining: 12.7s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:35:11,098] Trial 16 finished with value: 26805.860728204792 and parameters: {'iterations': 811, 'depth': 6, 'learning_rate': 0.038422029485352796, 'l2_leaf_reg': 1.3242863864510208}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"810:\tlearn: 8503.8489310\ttest: 26808.4863444\tbest: 26805.8607282 (780)\ttotal: 15.6s\tremaining: 0us\n",
"\n",
"bestTest = 26805.86073\n",
"bestIteration = 780\n",
"\n",
"Shrink model to first 781 iterations.\n",
"0:\tlearn: 72807.3168340\ttest: 74567.4359805\tbest: 74567.4359805 (0)\ttotal: 8.14ms\tremaining: 4.83s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:35:16,427] Trial 17 finished with value: 27731.2561681143 and parameters: {'iterations': 594, 'depth': 4, 'learning_rate': 0.14231687844310859, 'l2_leaf_reg': 2.7513555972307113}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"593:\tlearn: 8708.7251480\ttest: 27745.5415055\tbest: 27731.2561681 (592)\ttotal: 5.21s\tremaining: 0us\n",
"\n",
"bestTest = 27731.25617\n",
"bestIteration = 592\n",
"\n",
"Shrink model to first 593 iterations.\n",
"0:\tlearn: 69632.6423041\ttest: 71491.3158993\tbest: 71491.3158993 (0)\ttotal: 11.4ms\tremaining: 8.13s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:35:24,653] Trial 18 finished with value: 28562.629881846802 and parameters: {'iterations': 712, 'depth': 5, 'learning_rate': 0.21317542189811298, 'l2_leaf_reg': 1.9932455456134346}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"711:\tlearn: 3451.4434144\ttest: 28588.2419094\tbest: 28562.6298818 (662)\ttotal: 8.09s\tremaining: 0us\n",
"\n",
"bestTest = 28562.62988\n",
"bestIteration = 662\n",
"\n",
"Shrink model to first 663 iterations.\n",
"0:\tlearn: 76839.6083414\ttest: 78564.7956370\tbest: 78564.7956370 (0)\ttotal: 15.2ms\tremaining: 9.2s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 13:35:36,463] Trial 19 finished with value: 27636.592120178866 and parameters: {'iterations': 608, 'depth': 6, 'learning_rate': 0.04833367065674872, 'l2_leaf_reg': 1.5504919469866252}. Best is trial 6 with value: 25505.915021309767.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"607:\tlearn: 9007.5238814\ttest: 27636.5921202\tbest: 27636.5921202 (607)\ttotal: 11.7s\tremaining: 0us\n",
"\n",
"bestTest = 27636.59212\n",
"bestIteration = 607\n",
"\n",
"0:\tlearn: 75633.7775578\ttest: 77511.7476552\tbest: 77511.7476552 (0)\ttotal: 14.1ms\tremaining: 8.56s\n",
"100:\tlearn: 18169.0442353\ttest: 31046.3187900\tbest: 31046.3187900 (100)\ttotal: 2.28s\tremaining: 11.4s\n",
"200:\tlearn: 14158.4309190\ttest: 29124.2526013\tbest: 29124.2526013 (200)\ttotal: 4.18s\tremaining: 8.45s\n",
"300:\tlearn: 11582.0708109\ttest: 28465.6510796\tbest: 28465.6510796 (300)\ttotal: 6.09s\tremaining: 6.19s\n",
"400:\tlearn: 9875.7318011\ttest: 28048.5426547\tbest: 28043.2554774 (398)\ttotal: 7.97s\tremaining: 4.09s\n",
"500:\tlearn: 8681.9566210\ttest: 27841.4177588\tbest: 27840.8079547 (493)\ttotal: 9.94s\tremaining: 2.1s\n",
"600:\tlearn: 7654.3280918\ttest: 27619.5164463\tbest: 27609.0403144 (598)\ttotal: 11.8s\tremaining: 118ms\n",
"606:\tlearn: 7594.7749448\ttest: 27610.8376438\tbest: 27609.0403144 (598)\ttotal: 11.9s\tremaining: 0us\n",
"\n",
"bestTest = 27609.04031\n",
"bestIteration = 598\n",
"\n",
"Shrink model to first 599 iterations.\n",
"Final RMSE: 27609.040314427017\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(best_params)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RjnaYan5DT38",
"outputId": "16e18b21-7947-496a-900d-4eaf300fe889"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'iterations': 607, 'depth': 6, 'learning_rate': 0.0677948134301802, 'l2_leaf_reg': 2.0814646273997335}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"test = pd.read_csv('/content/test (3).csv')\n",
"test.head(3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 193
},
"id": "6wFXbBuh3A_A",
"outputId": "ae1e95e7-d685-4451-d328-98ec736045c4"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n",
"0 1461 20 RH 80.0 11622 Pave NaN Reg \n",
"1 1462 20 RL 81.0 14267 Pave NaN IR1 \n",
"2 1463 60 RL 74.0 13830 Pave NaN IR1 \n",
"\n",
" LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature \\\n",
"0 Lvl AllPub ... 120 0 NaN MnPrv NaN \n",
"1 Lvl AllPub ... 0 0 NaN NaN Gar2 \n",
"2 Lvl AllPub ... 0 0 NaN MnPrv NaN \n",
"\n",
" MiscVal MoSold YrSold SaleType SaleCondition \n",
"0 0 6 2010 WD Normal \n",
"1 12500 6 2010 WD Normal \n",
"2 0 3 2010 WD Normal \n",
"\n",
"[3 rows x 80 columns]"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" MSSubClass | \n",
" MSZoning | \n",
" LotFrontage | \n",
" LotArea | \n",
" Street | \n",
" Alley | \n",
" LotShape | \n",
" LandContour | \n",
" Utilities | \n",
" ... | \n",
" ScreenPorch | \n",
" PoolArea | \n",
" PoolQC | \n",
" Fence | \n",
" MiscFeature | \n",
" MiscVal | \n",
" MoSold | \n",
" YrSold | \n",
" SaleType | \n",
" SaleCondition | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1461 | \n",
" 20 | \n",
" RH | \n",
" 80.0 | \n",
" 11622 | \n",
" Pave | \n",
" NaN | \n",
" Reg | \n",
" Lvl | \n",
" AllPub | \n",
" ... | \n",
" 120 | \n",
" 0 | \n",
" NaN | \n",
" MnPrv | \n",
" NaN | \n",
" 0 | \n",
" 6 | \n",
" 2010 | \n",
" WD | \n",
" Normal | \n",
"
\n",
" \n",
" 1 | \n",
" 1462 | \n",
" 20 | \n",
" RL | \n",
" 81.0 | \n",
" 14267 | \n",
" Pave | \n",
" NaN | \n",
" IR1 | \n",
" Lvl | \n",
" AllPub | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" NaN | \n",
" Gar2 | \n",
" 12500 | \n",
" 6 | \n",
" 2010 | \n",
" WD | \n",
" Normal | \n",
"
\n",
" \n",
" 2 | \n",
" 1463 | \n",
" 60 | \n",
" RL | \n",
" 74.0 | \n",
" 13830 | \n",
" Pave | \n",
" NaN | \n",
" IR1 | \n",
" Lvl | \n",
" AllPub | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" MnPrv | \n",
" NaN | \n",
" 0 | \n",
" 3 | \n",
" 2010 | \n",
" WD | \n",
" Normal | \n",
"
\n",
" \n",
"
\n",
"
3 rows × 80 columns
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "test"
}
},
"metadata": {},
"execution_count": 43
}
]
},
{
"cell_type": "code",
"source": [
"test = test.drop(drp, axis = 1)"
],
"metadata": {
"id": "gywjd_Mg6D6a"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"x = list(val_x.columns)\n",
"y = list(test.columns)\n",
"for i in y:\n",
" if i not in x:\n",
" print(i)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3YbK3HHZ6EPl",
"outputId": "ea20f06b-5170-4084-a6ed-2dd29efda442"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Id\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"missing_values_table = test.isna().sum().reset_index()\n",
"missing_values_table.columns = ['Column', 'Missing Values']\n",
"missing_values_table['% of Total Values'] = (missing_values_table['Missing Values'] / len(test)) * 100\n",
"\n",
"print(missing_values_table['% of Total Values'])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oUzMUS_e6_20",
"outputId": "4653b080-5814-4867-ffca-7d1f6d19f50c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0 0.0\n",
"1 0.0\n",
"2 0.0\n",
"3 0.0\n",
"4 0.0\n",
" ... \n",
"68 0.0\n",
"69 0.0\n",
"70 0.0\n",
"71 0.0\n",
"72 0.0\n",
"Name: % of Total Values, Length: 73, dtype: float64\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"a = list(missing_values_table['% of Total Values'])\n",
"num = []\n",
"cat = []\n",
"drp = []\n",
"for i in range(len(columns)):\n",
" if missing_values_table['% of Total Values'].iloc[i] > 0:\n",
" if missing_values_table['% of Total Values'].iloc[i] > 40:\n",
" drp.append(str(missing_values_table['Column'].iloc[i]))\n",
" elif str(missing_values_table['Column'].iloc[i]) in categorical_features:\n",
" print(missing_values_table['Column'].iloc[i], missing_values_table['% of Total Values'].iloc[i], \"CATEGORICAL\")\n",
" cat.append(str(missing_values_table['Column'].iloc[i]))\n",
" else:\n",
" print(missing_values_table['Column'].iloc[i], missing_values_table['% of Total Values'].iloc[i])\n",
" num.append(str(missing_values_table['Column'].iloc[i]))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3x5eDmQk7djd",
"outputId": "bff3649e-357f-4352-af6b-142a4e443774"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"MSZoning 0.2741603838245374 CATEGORICAL\n",
"LotFrontage 15.558601782042494\n",
"Utilities 0.1370801919122687 CATEGORICAL\n",
"Exterior1st 0.06854009595613435 CATEGORICAL\n",
"Exterior2nd 0.06854009595613435 CATEGORICAL\n",
"MasVnrArea 1.0281014393420151\n",
"BsmtQual 3.015764222069911 CATEGORICAL\n",
"BsmtCond 3.0843043180260454 CATEGORICAL\n",
"BsmtExposure 3.015764222069911 CATEGORICAL\n",
"BsmtFinType1 2.878684030157642 CATEGORICAL\n",
"BsmtFinSF1 0.06854009595613435\n",
"BsmtFinType2 2.878684030157642 CATEGORICAL\n",
"BsmtFinSF2 0.06854009595613435\n",
"BsmtUnfSF 0.06854009595613435\n",
"TotalBsmtSF 0.06854009595613435\n",
"BsmtFullBath 0.1370801919122687\n",
"BsmtHalfBath 0.1370801919122687\n",
"KitchenQual 0.06854009595613435 CATEGORICAL\n",
"Functional 0.1370801919122687 CATEGORICAL\n",
"GarageType 5.20904729266621 CATEGORICAL\n",
"GarageYrBlt 5.346127484578479\n",
"GarageFinish 5.346127484578479 CATEGORICAL\n",
"GarageCars 0.06854009595613435\n",
"GarageArea 0.06854009595613435\n",
"GarageQual 5.346127484578479 CATEGORICAL\n",
"GarageCond 5.346127484578479 CATEGORICAL\n",
"SaleType 0.06854009595613435 CATEGORICAL\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"imputer = SimpleImputer(strategy='mean')\n",
"test[num] = imputer.fit_transform(test[num])\n",
"\n",
"imputer_cat = SimpleImputer(strategy='most_frequent')\n",
"test[cat] = imputer_cat.fit_transform(test[cat])"
],
"metadata": {
"id": "mN0V3AR77rLz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"final_preds = best_model.predict(test)"
],
"metadata": {
"id": "OfjqJ21s5HAe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"final = pd.DataFrame({\n",
" \"Id\": test['Id'],\n",
" \"SalePrice\": final_preds\n",
"})\n",
"final.head(3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "UcsIk_te3HBw",
"outputId": "39b56c05-d2ff-419d-c2f4-e31af181fc37"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Id SalePrice\n",
"0 1461 116908.794116\n",
"1 1462 163613.449249\n",
"2 1463 184095.361491"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" SalePrice | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1461 | \n",
" 116908.794116 | \n",
"
\n",
" \n",
" 1 | \n",
" 1462 | \n",
" 163613.449249 | \n",
"
\n",
" \n",
" 2 | \n",
" 1463 | \n",
" 184095.361491 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "final",
"summary": "{\n \"name\": \"final\",\n \"rows\": 1459,\n \"fields\": [\n {\n \"column\": \"Id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 421,\n \"min\": 1461,\n \"max\": 2919,\n \"num_unique_values\": 1459,\n \"samples\": [\n 2782,\n 2297,\n 1874\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SalePrice\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 76481.96226474205,\n \"min\": 37583.029954204336,\n \"max\": 525656.1493670065,\n \"num_unique_values\": 1458,\n \"samples\": [\n 93322.11940372009,\n 363516.775079109,\n 206466.05067474546\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 45
}
]
},
{
"cell_type": "code",
"source": [
"final.to_csv('final_0.csv', index=False)"
],
"metadata": {
"id": "R6O-mhDg4Er8"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Classification"
],
"metadata": {
"id": "NJN3_9QzBMFz"
}
},
{
"cell_type": "code",
"source": [
"!pip install optuna\n",
"!pip install catboost"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "N98BauBOBSay",
"outputId": "cf59a174-7f19-4350-8f31-e6b3f41d8a8f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: optuna in /usr/local/lib/python3.11/dist-packages (4.2.1)\n",
"Requirement already satisfied: alembic>=1.5.0 in /usr/local/lib/python3.11/dist-packages (from optuna) (1.14.1)\n",
"Requirement already satisfied: colorlog in /usr/local/lib/python3.11/dist-packages (from optuna) (6.9.0)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from optuna) (1.26.4)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from optuna) (24.2)\n",
"Requirement already satisfied: sqlalchemy>=1.4.2 in /usr/local/lib/python3.11/dist-packages (from optuna) (2.0.38)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from optuna) (4.67.1)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from optuna) (6.0.2)\n",
"Requirement already satisfied: Mako in /usr/local/lib/python3.11/dist-packages (from alembic>=1.5.0->optuna) (1.3.9)\n",
"Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.11/dist-packages (from alembic>=1.5.0->optuna) (4.12.2)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.11/dist-packages (from sqlalchemy>=1.4.2->optuna) (3.1.1)\n",
"Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.11/dist-packages (from Mako->alembic>=1.5.0->optuna) (3.0.2)\n",
"Requirement already satisfied: catboost in /usr/local/lib/python3.11/dist-packages (1.2.7)\n",
"Requirement already satisfied: graphviz in /usr/local/lib/python3.11/dist-packages (from catboost) (0.20.3)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (from catboost) (3.10.0)\n",
"Requirement already satisfied: numpy<2.0,>=1.16.0 in /usr/local/lib/python3.11/dist-packages (from catboost) (1.26.4)\n",
"Requirement already satisfied: pandas>=0.24 in /usr/local/lib/python3.11/dist-packages (from catboost) (2.2.2)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.11/dist-packages (from catboost) (1.13.1)\n",
"Requirement already satisfied: plotly in /usr/local/lib/python3.11/dist-packages (from catboost) (5.24.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.11/dist-packages (from catboost) (1.17.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas>=0.24->catboost) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas>=0.24->catboost) (2025.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas>=0.24->catboost) (2025.1)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (1.3.1)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (4.56.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (1.4.8)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (24.2)\n",
"Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (11.1.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib->catboost) (3.2.1)\n",
"Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.11/dist-packages (from plotly->catboost) (9.0.0)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from sklearn.impute import SimpleImputer\n",
"import numpy as np\n",
"import pandas as pd\n",
"from catboost import CatBoostClassifier, Pool\n",
"import optuna\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score"
],
"metadata": {
"id": "jo-YFxu7CwV5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train = pd.read_csv('/content/train (7).csv', index_col=False)\n",
"train = train.drop('PassengerId', axis=1)\n",
"test = pd.read_csv('/content/test (4).csv', index_col=False)"
],
"metadata": {
"id": "LKe8gmdxC8ZU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train.head(3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "ZzkNv4UzLQ0B",
"outputId": "ae3a43e1-0e84-4723-cec4-ddf24d85a9a2"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Survived Pclass Name \\\n",
"0 0 3 Braund, Mr. Owen Harris \n",
"1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... \n",
"2 1 3 Heikkinen, Miss. Laina \n",
"\n",
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \n",
"0 male 22.0 1 0 A/5 21171 7.2500 NaN S \n",
"1 female 38.0 1 0 PC 17599 71.2833 C85 C \n",
"2 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" Braund, Mr. Owen Harris | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" A/5 21171 | \n",
" 7.2500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" PC 17599 | \n",
" 71.2833 | \n",
" C85 | \n",
" C | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 3 | \n",
" Heikkinen, Miss. Laina | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" STON/O2. 3101282 | \n",
" 7.9250 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "train",
"summary": "{\n \"name\": \"train\",\n \"rows\": 891,\n \"fields\": [\n {\n \"column\": \"Survived\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Pclass\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 3,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 891,\n \"samples\": [\n \"Moubarek, Master. Halim Gonios (\\\"William George\\\")\",\n \"Kvillner, Mr. Johan Henrik Johannesson\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"female\",\n \"male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.526497332334044,\n \"min\": 0.42,\n \"max\": 80.0,\n \"num_unique_values\": 88,\n \"samples\": [\n 0.75,\n 22.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SibSp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 0,\n \"max\": 8,\n \"num_unique_values\": 7,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Parch\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 6,\n \"num_unique_values\": 7,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Ticket\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 681,\n \"samples\": [\n \"11774\",\n \"248740\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n 
\"column\": \"Fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 49.693428597180905,\n \"min\": 0.0,\n \"max\": 512.3292,\n \"num_unique_values\": 248,\n \"samples\": [\n 11.2417,\n 51.8625\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cabin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 147,\n \"samples\": [\n \"D45\",\n \"B49\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Embarked\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"S\",\n \"C\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 69
}
]
},
{
"cell_type": "code",
"source": [
"missing_values_table = train.isna().sum().reset_index()\n",
"missing_values_table.columns = ['Column', 'Missing Values']\n",
"missing_values_table['% of Total Values'] = (missing_values_table['Missing Values'] / len(train)) * 100\n",
"\n",
"print(missing_values_table['% of Total Values'])"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4P7skW49DMAD",
"outputId": "3fd75205-5e77-4d06-d6a2-8046c51549a6"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0 0.000000\n",
"1 0.000000\n",
"2 0.000000\n",
"3 0.000000\n",
"4 19.865320\n",
"5 0.000000\n",
"6 0.000000\n",
"7 0.000000\n",
"8 0.000000\n",
"9 77.104377\n",
"10 0.224467\n",
"Name: % of Total Values, dtype: float64\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"columns = list(train.columns)\n",
"categorical_features = []\n",
"for i in columns:\n",
" if str(type(train[i].iloc[0])) == \"\":\n",
" categorical_features.append(i)"
],
"metadata": {
"id": "Q0MJb-FnGXTT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"a = list(missing_values_table['% of Total Values'])\n",
"num = []\n",
"cat = []\n",
"drp = []\n",
"for i in range(len(a)):\n",
" if missing_values_table['% of Total Values'].iloc[i] > 0:\n",
" if missing_values_table['% of Total Values'].iloc[i] > 40:\n",
" drp.append(str(missing_values_table['Column'].iloc[i]))\n",
" elif str(missing_values_table['Column'].iloc[i]) in categorical_features:\n",
" print(missing_values_table['Column'].iloc[i], missing_values_table['% of Total Values'].iloc[i], \"CATEGORICAL\")\n",
" cat.append(str(missing_values_table['Column'].iloc[i]))\n",
" else:\n",
" print(missing_values_table['Column'].iloc[i], missing_values_table['% of Total Values'].iloc[i])\n",
" num.append(str(missing_values_table['Column'].iloc[i]))\n",
"\n",
"print(drp)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CO6Z0u4TDTFx",
"outputId": "74fa99ed-db10-4781-9520-caba786bfa2e"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Age 19.865319865319865\n",
"Embarked 0.22446689113355783 CATEGORICAL\n",
"['Cabin']\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"train = train.drop(drp, axis = 1)\n",
"\n",
"imputer = SimpleImputer(strategy='mean')\n",
"train[num] = imputer.fit_transform(train[num])\n",
"\n",
"imputer_cat = SimpleImputer(strategy='most_frequent')\n",
"train[cat] = imputer_cat.fit_transform(train[cat])"
],
"metadata": {
"id": "9Z_pmurrF3NP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"y = train['Survived']\n",
"x = train.drop('Survived', axis =1)\n",
"\n",
"train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.2)"
],
"metadata": {
"id": "wxvdyGaYLD6d"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"columns = list(train_x.columns)\n",
"categorical_features = []\n",
"for i in columns:\n",
" if str(type(train[i].iloc[0])) == \"\":\n",
" categorical_features.append(i)"
],
"metadata": {
"id": "_a5vRqgOMK_n"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def objective(trial):\n",
" params = {\n",
" \"iterations\": trial.suggest_int(\"iterations\", 500, 2000),\n",
" \"depth\": trial.suggest_int(\"depth\", 4, 7),\n",
" \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.01, 0.3, log=True),\n",
" \"l2_leaf_reg\": trial.suggest_float(\"l2_leaf_reg\", 1, 5, log=True),\n",
" \"loss_function\": \"Logloss\",\n",
" \"eval_metric\": \"Accuracy\",\n",
" \"random_seed\": 42,\n",
" \"verbose\": 0\n",
" }\n",
"\n",
" model = CatBoostClassifier(**params)\n",
" model.fit(train_x, train_y, eval_set=(val_x, val_y), verbose=1000, cat_features=categorical_features)\n",
"\n",
" preds = model.predict(val_x)\n",
" accuracy = accuracy_score(val_y, preds)\n",
" return accuracy\n",
"\n",
"# 📌 5. Оптимизация гиперпараметров\n",
"study = optuna.create_study(direction=\"maximize\") # Для классификации — максимизируем accuracy\n",
"study.optimize(objective, n_trials=20) # Количество итераций\n",
"\n",
"# 📌 6. Обучение модели с лучшими параметрами\n",
"best_params = study.best_params\n",
"best_model = CatBoostClassifier(**best_params)\n",
"best_model.fit(train_x, train_y, eval_set=(val_x, val_y), verbose=100, cat_features=categorical_features)\n",
"\n",
"# 📌 7. Оценка модели\n",
"final_preds = best_model.predict(val_x)\n",
"accuracy = accuracy_score(val_y, final_preds)\n",
"print(f\"Final Accuracy: {accuracy:.4f}\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FXMxqKChGOlq",
"outputId": "1f6b7a2a-4bf8-440b-9546-2074e282d21b"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:28,374] A new study created in memory with name: no-name-5696c2d7-b3d1-4ecd-9c50-1c3b744c4f02\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"0:\tlearn: 0.8146067\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.92ms\tremaining: 3.15s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:30,557] Trial 0 finished with value: 0.8491620111731844 and parameters: {'iterations': 1078, 'depth': 5, 'learning_rate': 0.10281491066829387, 'l2_leaf_reg': 2.079064968064275}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1000:\tlearn: 0.9915730\ttest: 0.8212291\tbest: 0.8491620 (110)\ttotal: 1.92s\tremaining: 147ms\n",
"1077:\tlearn: 0.9929775\ttest: 0.8268156\tbest: 0.8491620 (110)\ttotal: 2.06s\tremaining: 0us\n",
"\n",
"bestTest = 0.8491620112\n",
"bestIteration = 110\n",
"\n",
"Shrink model to first 111 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 4.02ms\tremaining: 5.7s\n",
"1000:\tlearn: 0.9676966\ttest: 0.8268156\tbest: 0.8435754 (364)\ttotal: 2.31s\tremaining: 963ms\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:34,005] Trial 1 finished with value: 0.8435754189944135 and parameters: {'iterations': 1418, 'depth': 6, 'learning_rate': 0.043508919824835304, 'l2_leaf_reg': 3.6659717941077608}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1417:\tlearn: 0.9761236\ttest: 0.8268156\tbest: 0.8435754 (364)\ttotal: 3.29s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 364\n",
"\n",
"Shrink model to first 365 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 3.05ms\tremaining: 1.61s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:35,126] Trial 2 finished with value: 0.8379888268156425 and parameters: {'iterations': 529, 'depth': 6, 'learning_rate': 0.01588182180207621, 'l2_leaf_reg': 1.6640193240287826}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"528:\tlearn: 0.8918539\ttest: 0.8212291\tbest: 0.8379888 (183)\ttotal: 1.04s\tremaining: 0us\n",
"\n",
"bestTest = 0.8379888268\n",
"bestIteration = 183\n",
"\n",
"Shrink model to first 184 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.86ms\tremaining: 3.12s\n",
"1000:\tlearn: 0.9845506\ttest: 0.8044693\tbest: 0.8435754 (220)\ttotal: 2.29s\tremaining: 204ms\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:37,940] Trial 3 finished with value: 0.8435754189944135 and parameters: {'iterations': 1090, 'depth': 6, 'learning_rate': 0.07543482284922365, 'l2_leaf_reg': 4.14476144682591}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1089:\tlearn: 0.9873596\ttest: 0.7988827\tbest: 0.8435754 (220)\ttotal: 2.68s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 220\n",
"\n",
"Shrink model to first 221 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 5.43ms\tremaining: 5.62s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:41,497] Trial 4 finished with value: 0.8435754189944135 and parameters: {'iterations': 1035, 'depth': 7, 'learning_rate': 0.2571458943835452, 'l2_leaf_reg': 3.8609229716750417}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1000:\tlearn: 1.0000000\ttest: 0.7877095\tbest: 0.8435754 (73)\ttotal: 3.33s\tremaining: 113ms\n",
"1034:\tlearn: 1.0000000\ttest: 0.7877095\tbest: 0.8435754 (73)\ttotal: 3.42s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 73\n",
"\n",
"Shrink model to first 74 iterations.\n",
"0:\tlearn: 0.8075843\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 1.89ms\tremaining: 2.21s\n",
"1000:\tlearn: 0.9016854\ttest: 0.8156425\tbest: 0.8268156 (180)\ttotal: 1.56s\tremaining: 260ms\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:43,441] Trial 5 finished with value: 0.8268156424581006 and parameters: {'iterations': 1168, 'depth': 4, 'learning_rate': 0.01997922756376188, 'l2_leaf_reg': 4.656972624040588}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1167:\tlearn: 0.9073034\ttest: 0.8156425\tbest: 0.8268156 (180)\ttotal: 1.82s\tremaining: 0us\n",
"\n",
"bestTest = 0.8268156425\n",
"bestIteration = 180\n",
"\n",
"Shrink model to first 181 iterations.\n",
"0:\tlearn: 0.8146067\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.57ms\tremaining: 1.48s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:44,662] Trial 6 finished with value: 0.8435754189944135 and parameters: {'iterations': 574, 'depth': 5, 'learning_rate': 0.08834109776170529, 'l2_leaf_reg': 1.004762461065921}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"573:\tlearn: 0.9803371\ttest: 0.8156425\tbest: 0.8435754 (196)\ttotal: 1.14s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 196\n",
"\n",
"Shrink model to first 197 iterations.\n",
"0:\tlearn: 0.8146067\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.62ms\tremaining: 2.25s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:46,428] Trial 7 finished with value: 0.8435754189944135 and parameters: {'iterations': 859, 'depth': 5, 'learning_rate': 0.05228624368798647, 'l2_leaf_reg': 2.2087846373564632}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"858:\tlearn: 0.9620787\ttest: 0.8212291\tbest: 0.8435754 (269)\ttotal: 1.66s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 269\n",
"\n",
"Shrink model to first 270 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.57ms\tremaining: 2.59s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:48,963] Trial 8 finished with value: 0.8379888268156425 and parameters: {'iterations': 1008, 'depth': 6, 'learning_rate': 0.23851821727115008, 'l2_leaf_reg': 1.9779337548268094}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1000:\tlearn: 1.0000000\ttest: 0.8044693\tbest: 0.8379888 (13)\ttotal: 2.39s\tremaining: 16.7ms\n",
"1007:\tlearn: 1.0000000\ttest: 0.8044693\tbest: 0.8379888 (13)\ttotal: 2.41s\tremaining: 0us\n",
"\n",
"bestTest = 0.8379888268\n",
"bestIteration = 13\n",
"\n",
"Shrink model to first 14 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 4.03ms\tremaining: 6.37s\n",
"1000:\tlearn: 0.9957865\ttest: 0.7988827\tbest: 0.8435754 (180)\ttotal: 2.79s\tremaining: 1.61s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:53,178] Trial 9 finished with value: 0.8435754189944135 and parameters: {'iterations': 1581, 'depth': 6, 'learning_rate': 0.1067027642243077, 'l2_leaf_reg': 1.5977216451114515}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1580:\tlearn: 0.9985955\ttest: 0.7988827\tbest: 0.8435754 (180)\ttotal: 4.05s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 180\n",
"\n",
"Shrink model to first 181 iterations.\n",
"0:\tlearn: 0.8075843\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.32ms\tremaining: 4.15s\n",
"1000:\tlearn: 0.9859551\ttest: 0.8100559\tbest: 0.8379888 (70)\ttotal: 1.69s\tremaining: 1.33s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:05:56,240] Trial 10 finished with value: 0.8379888268156425 and parameters: {'iterations': 1790, 'depth': 4, 'learning_rate': 0.14047712034504312, 'l2_leaf_reg': 2.8601732035579426}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1789:\tlearn: 0.9943820\ttest: 0.8044693\tbest: 0.8379888 (70)\ttotal: 2.88s\tremaining: 0us\n",
"\n",
"bestTest = 0.8379888268\n",
"bestIteration = 70\n",
"\n",
"Shrink model to first 71 iterations.\n",
"0:\tlearn: 0.8202247\ttest: 0.8044693\tbest: 0.8044693 (0)\ttotal: 3.74ms\tremaining: 5.42s\n",
"1000:\tlearn: 0.9662921\ttest: 0.8212291\tbest: 0.8324022 (93)\ttotal: 2.87s\tremaining: 1.29s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:01,278] Trial 11 finished with value: 0.8324022346368715 and parameters: {'iterations': 1450, 'depth': 7, 'learning_rate': 0.028888991528930555, 'l2_leaf_reg': 3.0539039026362467}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1449:\tlearn: 0.9775281\ttest: 0.8044693\tbest: 0.8324022 (93)\ttotal: 4.86s\tremaining: 0us\n",
"\n",
"bestTest = 0.8324022346\n",
"bestIteration = 93\n",
"\n",
"Shrink model to first 94 iterations.\n",
"0:\tlearn: 0.8146067\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.62ms\tremaining: 3.66s\n",
"1000:\tlearn: 0.9634831\ttest: 0.8324022\tbest: 0.8435754 (789)\ttotal: 1.88s\tremaining: 752ms\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:04,107] Trial 12 finished with value: 0.8435754189944135 and parameters: {'iterations': 1401, 'depth': 5, 'learning_rate': 0.04369338314178926, 'l2_leaf_reg': 2.9064845966321844}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1400:\tlearn: 0.9733146\ttest: 0.8212291\tbest: 0.8435754 (789)\ttotal: 2.67s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 789\n",
"\n",
"Shrink model to first 790 iterations.\n",
"0:\tlearn: 0.8146067\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.54ms\tremaining: 5.06s\n",
"1000:\tlearn: 0.9620787\ttest: 0.8324022\tbest: 0.8379888 (559)\ttotal: 1.91s\tremaining: 1.9s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:08,112] Trial 13 finished with value: 0.8379888268156425 and parameters: {'iterations': 1997, 'depth': 5, 'learning_rate': 0.03491901521515997, 'l2_leaf_reg': 1.341232525010233}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1996:\tlearn: 0.9887640\ttest: 0.8156425\tbest: 0.8379888 (559)\ttotal: 3.81s\tremaining: 0us\n",
"\n",
"bestTest = 0.8379888268\n",
"bestIteration = 559\n",
"\n",
"Shrink model to first 560 iterations.\n",
"0:\tlearn: 0.8202247\ttest: 0.8044693\tbest: 0.8044693 (0)\ttotal: 4.51ms\tremaining: 6.08s\n",
"1000:\tlearn: 0.9157303\ttest: 0.8379888\tbest: 0.8379888 (911)\ttotal: 2.5s\tremaining: 868ms\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:12,325] Trial 14 finished with value: 0.8435754189944135 and parameters: {'iterations': 1348, 'depth': 7, 'learning_rate': 0.011244048959052837, 'l2_leaf_reg': 3.417468037759992}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1347:\tlearn: 0.9367978\ttest: 0.8379888\tbest: 0.8435754 (1013)\ttotal: 4.01s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 1013\n",
"\n",
"Shrink model to first 1014 iterations.\n",
"0:\tlearn: 0.8075843\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.96ms\tremaining: 2.35s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:13,805] Trial 15 finished with value: 0.8379888268156425 and parameters: {'iterations': 794, 'depth': 4, 'learning_rate': 0.17644799666045521, 'l2_leaf_reg': 2.4992825317390106}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"793:\tlearn: 0.9887640\ttest: 0.8156425\tbest: 0.8379888 (21)\ttotal: 1.36s\tremaining: 0us\n",
"\n",
"bestTest = 0.8379888268\n",
"bestIteration = 21\n",
"\n",
"Shrink model to first 22 iterations.\n",
"0:\tlearn: 0.8146067\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.62ms\tremaining: 4.26s\n",
"1000:\tlearn: 0.9845506\ttest: 0.8044693\tbest: 0.8379888 (419)\ttotal: 1.99s\tremaining: 1.25s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:17,181] Trial 16 finished with value: 0.8379888268156425 and parameters: {'iterations': 1630, 'depth': 5, 'learning_rate': 0.07402923733761196, 'l2_leaf_reg': 1.801948907451662}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1629:\tlearn: 0.9943820\ttest: 0.7988827\tbest: 0.8379888 (419)\ttotal: 3.2s\tremaining: 0us\n",
"\n",
"bestTest = 0.8379888268\n",
"bestIteration = 419\n",
"\n",
"Shrink model to first 420 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 3.28ms\tremaining: 4.12s\n",
"1000:\tlearn: 0.9648876\ttest: 0.8324022\tbest: 0.8435754 (614)\ttotal: 2.29s\tremaining: 588ms\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:20,264] Trial 17 finished with value: 0.8435754189944135 and parameters: {'iterations': 1258, 'depth': 6, 'learning_rate': 0.025923545234300217, 'l2_leaf_reg': 1.3320547003575962}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"1257:\tlearn: 0.9705056\ttest: 0.8324022\tbest: 0.8435754 (614)\ttotal: 2.92s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 614\n",
"\n",
"Shrink model to first 615 iterations.\n",
"0:\tlearn: 0.8188202\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 3.62ms\tremaining: 3.28s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:22,587] Trial 18 finished with value: 0.8379888268156425 and parameters: {'iterations': 907, 'depth': 6, 'learning_rate': 0.12819035429978284, 'l2_leaf_reg': 4.983150443455822}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"906:\tlearn: 0.9887640\ttest: 0.7988827\tbest: 0.8379888 (129)\ttotal: 2.19s\tremaining: 0us\n",
"\n",
"bestTest = 0.8379888268\n",
"bestIteration = 129\n",
"\n",
"Shrink model to first 130 iterations.\n",
"0:\tlearn: 0.8146067\ttest: 0.7988827\tbest: 0.7988827 (0)\ttotal: 2.39ms\tremaining: 1.74s\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-22 15:06:24,680] Trial 19 finished with value: 0.8435754189944135 and parameters: {'iterations': 728, 'depth': 5, 'learning_rate': 0.06324796994402315, 'l2_leaf_reg': 2.469288457865203}. Best is trial 0 with value: 0.8491620111731844.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"727:\tlearn: 0.9691011\ttest: 0.8268156\tbest: 0.8435754 (408)\ttotal: 1.96s\tremaining: 0us\n",
"\n",
"bestTest = 0.843575419\n",
"bestIteration = 408\n",
"\n",
"Shrink model to first 409 iterations.\n",
"0:\tlearn: 0.6306689\ttest: 0.6331918\tbest: 0.6331918 (0)\ttotal: 2.52ms\tremaining: 2.72s\n",
"100:\tlearn: 0.2803246\ttest: 0.4433972\tbest: 0.4266044 (31)\ttotal: 200ms\tremaining: 1.93s\n",
"200:\tlearn: 0.2079957\ttest: 0.4501661\tbest: 0.4266044 (31)\ttotal: 393ms\tremaining: 1.71s\n",
"300:\tlearn: 0.1601585\ttest: 0.4684582\tbest: 0.4266044 (31)\ttotal: 587ms\tremaining: 1.51s\n",
"400:\tlearn: 0.1227384\ttest: 0.4862125\tbest: 0.4266044 (31)\ttotal: 799ms\tremaining: 1.35s\n",
"500:\tlearn: 0.1003636\ttest: 0.5040966\tbest: 0.4266044 (31)\ttotal: 999ms\tremaining: 1.15s\n",
"600:\tlearn: 0.0832366\ttest: 0.5282830\tbest: 0.4266044 (31)\ttotal: 1.19s\tremaining: 942ms\n",
"700:\tlearn: 0.0718905\ttest: 0.5492214\tbest: 0.4266044 (31)\ttotal: 1.37s\tremaining: 738ms\n",
"800:\tlearn: 0.0637023\ttest: 0.5543152\tbest: 0.4266044 (31)\ttotal: 1.56s\tremaining: 540ms\n",
"900:\tlearn: 0.0556375\ttest: 0.5667068\tbest: 0.4266044 (31)\ttotal: 1.77s\tremaining: 347ms\n",
"1000:\tlearn: 0.0478476\ttest: 0.5991677\tbest: 0.4266044 (31)\ttotal: 1.96s\tremaining: 150ms\n",
"1077:\tlearn: 0.0433863\ttest: 0.6106439\tbest: 0.4266044 (31)\ttotal: 2.1s\tremaining: 0us\n",
"\n",
"bestTest = 0.4266044357\n",
"bestIteration = 31\n",
"\n",
"Shrink model to first 32 iterations.\n",
"Final Accuracy: 0.8156\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"print(best_params)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UQJrShMqMTG4",
"outputId": "5a8f0ccf-6eae-4997-cbf7-c5137903f4fe"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{'iterations': 1078, 'depth': 5, 'learning_rate': 0.10281491066829387, 'l2_leaf_reg': 2.079064968064275}\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"test.head(3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "qhjxdAn3NT0O",
"outputId": "2de94b17-f37a-4fc3-80a9-6e980fc72c73"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Pclass Name Sex Age SibSp \\\n",
"0 892 3 Kelly, Mr. James male 34.5 0 \n",
"1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 \n",
"2 894 2 Myles, Mr. Thomas Francis male 62.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 330911 7.8292 NaN Q \n",
"1 0 363272 7.0000 NaN S \n",
"2 0 240276 9.6875 NaN Q "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 892 | \n",
" 3 | \n",
" Kelly, Mr. James | \n",
" male | \n",
" 34.5 | \n",
" 0 | \n",
" 0 | \n",
" 330911 | \n",
" 7.8292 | \n",
" NaN | \n",
" Q | \n",
"
\n",
" \n",
" 1 | \n",
" 893 | \n",
" 3 | \n",
" Wilkes, Mrs. James (Ellen Needs) | \n",
" female | \n",
" 47.0 | \n",
" 1 | \n",
" 0 | \n",
" 363272 | \n",
" 7.0000 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 2 | \n",
" 894 | \n",
" 2 | \n",
" Myles, Mr. Thomas Francis | \n",
" male | \n",
" 62.0 | \n",
" 0 | \n",
" 0 | \n",
" 240276 | \n",
" 9.6875 | \n",
" NaN | \n",
" Q | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "test",
"summary": "{\n \"name\": \"test\",\n \"rows\": 418,\n \"fields\": [\n {\n \"column\": \"PassengerId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 120,\n \"min\": 892,\n \"max\": 1309,\n \"num_unique_values\": 418,\n \"samples\": [\n 1213,\n 1216,\n 1280\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Pclass\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 3,\n \"num_unique_values\": 3,\n \"samples\": [\n 3,\n 2,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 418,\n \"samples\": [\n \"Krekorian, Mr. Neshan\",\n \"Kreuchen, Miss. Emilie\",\n \"Canavan, Mr. Patrick\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Sex\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"female\",\n \"male\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 14.18120923562442,\n \"min\": 0.17,\n \"max\": 76.0,\n \"num_unique_values\": 79,\n \"samples\": [\n 10.0,\n 34.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"SibSp\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 8,\n \"num_unique_values\": 7,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Parch\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 9,\n \"num_unique_values\": 8,\n \"samples\": [\n 1,\n 6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Ticket\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 363,\n \"samples\": [\n \"2673\",\n \"W./C. 
6607\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Fare\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 55.90757617997383,\n \"min\": 0.0,\n \"max\": 512.3292,\n \"num_unique_values\": 169,\n \"samples\": [\n 41.5792,\n 57.75\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Cabin\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 76,\n \"samples\": [\n \"A21\",\n \"E45\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Embarked\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Q\",\n \"S\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 81
}
]
},
{
"cell_type": "code",
"source": [
"# Sort the columns that have missing values into numeric (`num`) and\n",
"# categorical (`cat`) lists; columns with more than 40% missing values\n",
"# are left out of both lists.\n",
"a = list(test.columns)  # no longer drives the loop; kept in case other cells read it\n",
"num = []\n",
"cat = []\n",
"# Iterate over the rows of missing_values_table itself. Bounding the loop by\n",
"# len(test.columns) raises IndexError when the table has fewer rows than the\n",
"# test frame has columns, and silently skips rows when it has more.\n",
"for i in range(len(missing_values_table)):\n",
"    column = missing_values_table['Column'].iloc[i]\n",
"    pct_missing = missing_values_table['% of Total Values'].iloc[i]\n",
"    # Written as a positive range test so NaN percentages are skipped as well\n",
"    if not (0 < pct_missing <= 40):\n",
"        continue\n",
"    if str(column) in categorical_features:\n",
"        print(column, pct_missing, \"CATEGORICAL\")\n",
"        cat.append(str(column))\n",
"    else:\n",
"        print(column, pct_missing)\n",
"        num.append(str(column))\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "D0kD2MfdQVO8",
"outputId": "211a6081-9c17-4a69-9976-7b10cfc8a321"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Age 19.865319865319865\n",
"Embarked 0.22446689113355783 CATEGORICAL\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Prepare the test frame: drop the `drp` columns, then fill missing values\n",
"# in the `num` and `cat` column lists.\n",
"# NOTE(review): both imputers are fit on the test frame itself; if the model\n",
"# was trained on data imputed with training-set statistics, consider reusing\n",
"# those fitted imputers with .transform() - confirm against the training cells.\n",
"test = test.drop(columns=drp)\n",
"\n",
"# Numeric columns: missing values become the column mean\n",
"imputer = SimpleImputer(strategy='mean')\n",
"numeric_filled = imputer.fit_transform(test[num])\n",
"test[num] = numeric_filled\n",
"\n",
"# Categorical columns: missing values become the most frequent value\n",
"imputer_cat = SimpleImputer(strategy='most_frequent')\n",
"categorical_filled = imputer_cat.fit_transform(test[cat])\n",
"test[cat] = categorical_filled"
],
"metadata": {
"id": "kPTizT0IPr4W"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Set the passenger identifiers aside for the submission file, then remove\n",
"# the column from the frame that is passed to the model.\n",
"# NOTE(review): `id` shadows the Python builtin of the same name; the name is\n",
"# kept because the submission cell reads it.\n",
"id = test['PassengerId']\n",
"test = test.drop(columns=['PassengerId'])"
],
"metadata": {
"id": "3g-o_Y5jUI0M"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Predict on the prepared test frame and assemble the two-column submission table\n",
"final_preds = best_model.predict(test)\n",
"\n",
"submission_columns = {\"PassengerId\": id, \"Survived\": final_preds}\n",
"final = pd.DataFrame(submission_columns)\n",
"final.head(3)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 143
},
"id": "r_WNf6D4Tgjz",
"outputId": "b13cc5d2-1c6f-4005-d7e7-46d41d1ad3f9"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" PassengerId Survived\n",
"0 892 0\n",
"1 893 1\n",
"2 894 0"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 892 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 893 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 894 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "final",
"summary": "{\n \"name\": \"final\",\n \"rows\": 418,\n \"fields\": [\n {\n \"column\": \"PassengerId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 120,\n \"min\": 892,\n \"max\": 1309,\n \"num_unique_values\": 418,\n \"samples\": [\n 1213,\n 1216,\n 1280\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Survived\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 96
}
]
},
{
"cell_type": "code",
"source": [
"final.to_csv('final_2.csv', index=False)"
],
"metadata": {
"id": "vJvWgfZGUGHM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Classification YOLO"
],
"metadata": {
"id": "NJUwS6uF_nPO"
}
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import os\n",
"from ultralytics import YOLO\n",
"\n",
"DATA_DIR = \"path/to/dataset\"\n",
"TEST_DIR = \"path/to/test\"\n",
"OUTPUT_CSV = \"submission.csv\"\n",
"# Model sizes: n - s - m - l - x\n",
"# Ultralytics names the YOLO11 checkpoints \"yolo11*\" (no \"v\"), the same form as\n",
"# the 'yolo11n-seg.pt' used in the segmentation section of this notebook\n",
"model = YOLO(\"yolo11x-cls.pt\")\n",
"\n",
"model.train(\n",
"    data=DATA_DIR,\n",
"    epochs=10,\n",
"    imgsz=640,\n",
"    batch=16,\n",
"    lr0=1e-3,\n",
"    momentum=0.9,\n",
"    augment=True\n",
")\n",
"\n",
"# Runs are written to an auto-incremented, task-specific directory\n",
"# (e.g. runs/classify/train, train2, ...), so take the checkpoint path from\n",
"# the trainer instead of hard-coding it\n",
"model_path = str(model.trainer.best)\n",
"model = YOLO(model_path)\n",
"\n",
"# Sorted for a reproducible row order; lower() also accepts upper-case extensions\n",
"test_images = sorted(\n",
"    f for f in os.listdir(TEST_DIR) if f.lower().endswith((\".jpg\", \".png\"))\n",
")\n",
"\n",
"predictions = []\n",
"\n",
"for img_name in test_images:\n",
"    img_path = os.path.join(TEST_DIR, img_name)\n",
"    results = model(img_path)\n",
"\n",
"    # probs.top1 is the index of the highest-probability class\n",
"    predicted_class = results[0].probs.top1\n",
"    predictions.append((img_name, predicted_class))\n",
"\n",
"df = pd.DataFrame(predictions, columns=[\"filename\", \"class\"])\n",
"df.to_csv(OUTPUT_CSV, index=False)\n",
"\n",
"print(f\"✅ Предсказания сохранены в {OUTPUT_CSV}\")\n"
],
"metadata": {
"id": "LOe1KDtKAo4H"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Detection YOLO"
],
"metadata": {
"id": "aAlncS4tPKyR"
}
},
{
"cell_type": "code",
"source": [
"# Convert CSV bounding-box annotations into YOLO-format .txt label files\n",
"# (one file per image, one line per box)\n",
"import os\n",
"import pandas as pd\n",
"from PIL import Image\n",
"\n",
"images_dir = \"path/to/images\"\n",
"csv_file = \"path/to/annotations.csv\"\n",
"output_dir = \"path/to/yolo_format\"\n",
"\n",
"# open(..., 'w') does not create directories, so make sure the target exists\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"annotations = pd.read_csv(csv_file)\n",
"\n",
"def normalize_coordinates(x1, y1, x2, y2, img_width, img_height):\n",
"    \"\"\"Convert pixel box corners to a normalized YOLO box.\n",
"\n",
"    Args:\n",
"        x1, y1, x2, y2: box corner coordinates in pixels.\n",
"        img_width, img_height: image size in pixels.\n",
"\n",
"    Returns:\n",
"        (x_center, y_center, width, height), each divided by the matching\n",
"        image dimension.\n",
"    \"\"\"\n",
"    x_center = (x1 + x2) / 2 / img_width\n",
"    y_center = (y1 + y2) / 2 / img_height\n",
"    width = (x2 - x1) / img_width\n",
"    height = (y2 - y1) / img_height\n",
"    return x_center, y_center, width, height\n",
"\n",
"# groupby hands over each image's rows once instead of re-filtering the whole\n",
"# frame for every image; sort=False keeps first-appearance order\n",
"for img_name, img_annotations in annotations.groupby('filename', sort=False):\n",
"    img_path = os.path.join(images_dir, img_name)\n",
"    # Only the size is needed; the context manager closes the file handle\n",
"    with Image.open(img_path) as img:\n",
"        img_width, img_height = img.size\n",
"\n",
"    # splitext swaps only the real extension, so names such as 'a.jpeg' or\n",
"    # 'my.jpg.backup.png' still map to a proper .txt file\n",
"    txt_file = os.path.join(output_dir, os.path.splitext(img_name)[0] + '.txt')\n",
"\n",
"    with open(txt_file, 'w') as f:\n",
"        for _, row in img_annotations.iterrows():\n",
"            class_id = row['class']\n",
"            x_center, y_center, width, height = normalize_coordinates(\n",
"                row['x1'], row['y1'], row['x2'], row['y2'], img_width, img_height\n",
"            )\n",
"            f.write(f\"{class_id} {x_center} {y_center} {width} {height}\\n\")\n",
"\n"
],
"metadata": {
"id": "2nZ4zfEcPP-N"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# dataset.yaml\n",
"# train: path/to/dataset/images/train # Путь к изображениям для обучения\n",
"# val: path/to/dataset/images/val # Путь к изображениям для валидации\n",
"# test: path/to/dataset/images/test # Путь к изображениям для тестирования\n",
"\n",
"# nc: 3 # Количество классов\n",
"# names: ['car', 'person', 'dog'] # Имена классов\n"
],
"metadata": {
"id": "1KghBRiCQz7o"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import os\n",
"import pandas as pd\n",
"from ultralytics import YOLO\n",
"\n",
"# === 1. DATA SETUP ===\n",
"DATA_YAML = \"path/to/dataset.yaml\"  # Dataset configuration file\n",
"TEST_DIR = \"path/to/test\"  # Directory with the test images\n",
"OUTPUT_CSV = \"submission.csv\"  # Output file for the predictions\n",
"\n",
"# === 2. TRAIN YOLO11 ===\n",
"# Nano variant for fast training; checkpoint names are \"yolo11*\" (no \"v\")\n",
"model = YOLO(\"yolo11n.pt\")\n",
"\n",
"model.train(\n",
"    data=DATA_YAML,  # Path to the dataset .yaml file\n",
"    epochs=10,  # Number of epochs\n",
"    imgsz=640,  # Input image size\n",
"    batch=16,  # Batch size\n",
"    lr0=1e-3,  # Initial learning rate\n",
"    momentum=0.9,  # Optimizer momentum\n",
"    # Intended to enable the built-in augmentations.\n",
"    # NOTE(review): `augment` is documented as a prediction-time setting -\n",
"    # confirm it has any effect during training\n",
"    augment=True\n",
")\n",
"\n",
"# Runs are written to an auto-incremented, task-specific directory\n",
"# (e.g. runs/detect/train, train2, ...), so take the checkpoint path from\n",
"# the trainer instead of hard-coding it\n",
"model_path = str(model.trainer.best)\n",
"model = YOLO(model_path)\n",
"\n",
"# Sorted for a reproducible row order; lower() also accepts upper-case extensions\n",
"test_images = sorted(\n",
"    f for f in os.listdir(TEST_DIR) if f.lower().endswith((\".jpg\", \".png\"))\n",
")\n",
"\n",
"predictions = []\n",
"\n",
"for img_name in test_images:\n",
"    img_path = os.path.join(TEST_DIR, img_name)\n",
"    results = model(img_path)\n",
"\n",
"    for result in results:\n",
"        # Each row of boxes.data is unpacked as (x1, y1, x2, y2, confidence, class)\n",
"        for box in result.boxes.data:\n",
"            x1, y1, x2, y2, conf, cls = box.tolist()\n",
"            predictions.append((img_name, int(cls), conf, x1, y1, x2, y2))\n",
"\n",
"df = pd.DataFrame(predictions, columns=[\"filename\", \"class\", \"confidence\", \"x1\", \"y1\", \"x2\", \"y2\"])\n",
"df.to_csv(OUTPUT_CSV, index=False)"
],
"metadata": {
"id": "lDVTY5XRRwQG"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Segmentation YOLO"
],
"metadata": {
"id": "qpg8QGRLckvg"
}
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import os\n",
"from PIL import Image\n",
"\n",
"csv_path = 'annotations.csv'\n",
"images_dir = 'path/to/images'\n",
"output_dir = 'yolo_annotations'\n",
"\n",
"# Make sure the directory for the label files exists\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"data = pd.read_csv(csv_path)\n",
"\n",
"def convert_to_yolo_format(image_width, image_height, x_min, y_min, x_max, y_max):\n",
"    \"\"\"Turn pixel box corners into a normalized (x_center, y_center, width, height) tuple.\"\"\"\n",
"    return (\n",
"        (x_min + x_max) / 2 / image_width,\n",
"        (y_min + y_max) / 2 / image_height,\n",
"        (x_max - x_min) / image_width,\n",
"        (y_max - y_min) / image_height,\n",
"    )\n",
"\n",
"# One label file is written per entry of images_dir; an entry with no matching\n",
"# CSV rows gets an empty .txt file.\n",
"# NOTE(review): each written line is a bounding box (class id + 4 values), not a\n",
"# polygon - confirm this is the label format the segmentation training expects.\n",
"for image_name in os.listdir(images_dir):\n",
"    image_path = os.path.join(images_dir, image_name)\n",
"\n",
"    with Image.open(image_path) as img:\n",
"        image_width, image_height = img.size\n",
"\n",
"    image_annotations = data.loc[data['image_name'] == image_name]\n",
"\n",
"    label_name = os.path.splitext(image_name)[0] + '.txt'\n",
"    annotation_file = os.path.join(output_dir, label_name)\n",
"    with open(annotation_file, 'w') as f:\n",
"        for _, row in image_annotations.iterrows():\n",
"            class_id = row['class_id']\n",
"            x_center, y_center, width, height = convert_to_yolo_format(\n",
"                image_width, image_height,\n",
"                row['x_min'], row['y_min'], row['x_max'], row['y_max'],\n",
"            )\n",
"            f.write(f\"{class_id} {x_center} {y_center} {width} {height}\\n\")\n"
],
"metadata": {
"id": "QeFxyZnicqk_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# train: path/to/train/images\n",
"# val: path/to/val/images\n",
"# test: path/to/test/images\n",
"\n",
"# nc: 2 # Количество классов\n",
"# names: ['class_0', 'class_1'] # Имена классов"
],
"metadata": {
"id": "BrvqznGjdW24"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from ultralytics import YOLO\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"# mask_to_rle uses the COCO mask utilities; importing them here keeps the cell\n",
"# from depending on an earlier cell having done so\n",
"from pycocotools import mask as mask_util\n",
"\n",
"def mask_to_rle(mask):\n",
"    \"\"\"Encode a 2-D binary mask as a COCO compressed-RLE string.\n",
"\n",
"    Args:\n",
"        mask: 2-D array-like of 0/1 (or boolean) values.\n",
"\n",
"    Returns:\n",
"        The 'counts' field of the encoded RLE, decoded to a UTF-8 str.\n",
"    \"\"\"\n",
"    mask = np.asarray(mask).astype(np.uint8)\n",
"\n",
"    # The encoder expects a Fortran-ordered (column-major) uint8 array\n",
"    rle = mask_util.encode(np.asfortranarray(mask))\n",
"\n",
"    rle_string = str(rle['counts'], encoding='utf-8')\n",
"    return rle_string\n",
"\n",
"data_path = 'path/to/data.yaml'\n",
"\n",
"model = YOLO('yolo11n-seg.pt')\n",
"\n",
"# The Ultralytics training argument is `batch` (as in the other training cells);\n",
"# `batch_size` is rejected as an unknown argument\n",
"model.train(data=data_path, epochs=50, imgsz=640, batch=16, augment=True)\n",
"\n",
"model.save('yolov11_segmentation_model.pt')\n",
"\n",
"test_path = 'path/to/test/images'\n",
"# predict() returns a list with one Results object per image\n",
"results = model.predict(test_path, save=True)\n",
"\n",
"predictions = []\n",
"for result in results:\n",
"    file_name = os.path.basename(result.path)\n",
"\n",
"    masks = result.masks\n",
"\n",
"    # masks is None when nothing was detected in the image\n",
"    if masks is not None:\n",
"        # masks.data is a (num_objects, H, W) tensor; converting it to NumPy makes\n",
"        # every slice a plain 2-D array that mask_to_rle can encode.\n",
"        # NOTE(review): these masks are at the inference resolution unless predict()\n",
"        # is called with retina_masks=True - confirm what the submission expects\n",
"        for idx, mask in enumerate(masks.data.cpu().numpy()):\n",
"            rle_string = mask_to_rle(mask)\n",
"            predictions.append([file_name, idx, rle_string])\n",
"\n",
"df = pd.DataFrame(predictions, columns=['file_name', 'object_id', 'rle'])\n",
"df.to_csv('final.csv', index=False)\n"
],
"metadata": {
"id": "bk8iuOpChbdr"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Pose Detection"
],
"metadata": {
"id": "pwDRSup3lhnX"
}
},
{
"cell_type": "code",
"source": [
"from ultralytics import YOLO\n",
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"from PIL import Image\n",
"\n",
"data_path = 'path/to/data.yaml'\n",
"\n",
"# Pose checkpoints carry a size letter (n/s/m/l/x) and no \"v\"; the nano variant\n",
"# is used here, like 'yolo11n-seg.pt' in the segmentation section\n",
"model = YOLO('yolo11n-pose.pt')\n",
"\n",
"# The Ultralytics training argument is `batch`; `batch_size` is rejected as an\n",
"# unknown argument\n",
"model.train(data=data_path, epochs=50, imgsz=640, batch=16)\n",
"\n",
"test_path = 'path/to/test/images'\n",
"# predict() returns a list with one Results object per image\n",
"results = model.predict(test_path, save=True)\n",
"\n",
"def extract_keypoints(pred):\n",
"    \"\"\"Return the keypoints of one prediction as plain Python data.\n",
"\n",
"    Args:\n",
"        pred: a single Results object produced by a pose model.\n",
"\n",
"    Returns:\n",
"        A list with one entry per detected person; each entry is a list of\n",
"        (x, y) float pairs. Empty when nothing was detected.\n",
"    \"\"\"\n",
"    if pred.keypoints is None:\n",
"        return []\n",
"    # keypoints.xy has shape (num_persons, num_keypoints, 2)\n",
"    xy = pred.keypoints.xy.cpu().numpy()\n",
"    return [[(float(x), float(y)) for x, y in person] for person in xy]\n",
"\n",
"predictions = []\n",
"for pred in results:\n",
"    file_name = os.path.basename(pred.path)\n",
"\n",
"    keypoints = extract_keypoints(pred)\n",
"\n",
"    for person_id, person_keypoints in enumerate(keypoints):\n",
"        # One output row per keypoint of each detected person\n",
"        for idx, (x, y) in enumerate(person_keypoints):\n",
"            predictions.append([file_name, person_id, idx, x, y])\n",
"\n",
"df = pd.DataFrame(predictions, columns=['file_name', 'person_id', 'keypoint_id', 'x', 'y'])\n",
"df.to_csv('final.csv', index=False)\n"
],
"metadata": {
"id": "2I1qy3cmlpW-"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# RecSys Surprise"
],
"metadata": {
"id": "7hDyA__j0_cg"
}
},
{
"cell_type": "code",
"source": [
"# %pip installs into the running kernel's environment; versions are pinned to\n",
"# the ones recorded in this cell's saved output\n",
"%pip install scikit-surprise==1.1.4 optuna==4.2.1"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Bvwtxw401Daf",
"outputId": "72dffabf-8da5-4eaa-9af5-0a1297fcdf21"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: scikit-surprise in /usr/local/lib/python3.11/dist-packages (1.1.4)\n",
"Collecting optuna\n",
" Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)\n",
"Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-surprise) (1.4.2)\n",
"Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.11/dist-packages (from scikit-surprise) (1.26.4)\n",
"Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-surprise) (1.13.1)\n",
"Collecting alembic>=1.5.0 (from optuna)\n",
" Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)\n",
"Collecting colorlog (from optuna)\n",
" Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from optuna) (24.2)\n",
"Requirement already satisfied: sqlalchemy>=1.4.2 in /usr/local/lib/python3.11/dist-packages (from optuna) (2.0.38)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from optuna) (4.67.1)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.11/dist-packages (from optuna) (6.0.2)\n",
"Collecting Mako (from alembic>=1.5.0->optuna)\n",
" Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)\n",
"Requirement already satisfied: typing-extensions>=4 in /usr/local/lib/python3.11/dist-packages (from alembic>=1.5.0->optuna) (4.12.2)\n",
"Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.11/dist-packages (from sqlalchemy>=1.4.2->optuna) (3.1.1)\n",
"Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.11/dist-packages (from Mako->alembic>=1.5.0->optuna) (3.0.2)\n",
"Downloading optuna-4.2.1-py3-none-any.whl (383 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m383.6/383.6 kB\u001b[0m \u001b[31m12.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.6/233.6 kB\u001b[0m \u001b[31m19.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)\n",
"Downloading Mako-1.3.9-py3-none-any.whl (78 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: Mako, colorlog, alembic, optuna\n",
"Successfully installed Mako-1.3.9 alembic-1.14.1 colorlog-6.9.0 optuna-4.2.1\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import random\n",
"import pandas as pd\n",
"from surprise import Dataset, Reader, SVD\n",
"from surprise.model_selection import train_test_split\n",
"from surprise import accuracy"
],
"metadata": {
"id": "tyKsXFGU2Ikq"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from surprise import Dataset, Reader, SVD\n",
"from surprise.model_selection import GridSearchCV, KFold, cross_validate\n",
"\n",
"# Fixed seed: pins both the CV folds and the SVD factor initialisation,\n",
"# so re-running this cell is repeatable instead of drifting run to run.\n",
"SEED = 42\n",
"\n",
"# Load the built-in MovieLens 100K ratings\n",
"data = Dataset.load_builtin('ml-100k')\n",
"\n",
"# Hyperparameter grid explored by the grid search\n",
"param_grid = {\n",
"    'n_factors': [50, 100, 150],      # number of latent factors\n",
"    'lr_all': [0.002, 0.005, 0.01],   # learning rate\n",
"    'reg_all': [0.02, 0.05, 0.1],     # regularisation\n",
"    'random_state': [SEED],           # single value: only seeds SVD, adds no grid points\n",
"}\n",
"\n",
"# Seeded splitter; an int random_state yields the same folds on every split() call,\n",
"# so the grid search and the re-evaluation below score on identical folds.\n",
"cv_folds = KFold(n_splits=5, random_state=SEED, shuffle=True)\n",
"\n",
"# Run the grid search (the seed is passed via the params, so it also holds with n_jobs=-1)\n",
"gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=cv_folds, n_jobs=-1)\n",
"gs.fit(data)\n",
"\n",
"# Report the best score and the parameters that achieved it\n",
"print(f\"📊 Лучший RMSE: {gs.best_score['rmse']:.4f}\")\n",
"print(f\"🔧 Лучшие параметры: {gs.best_params['rmse']}\")\n",
"\n",
"# Re-evaluate the best configuration with 5-fold CV (default measures: RMSE and MAE).\n",
"# Note: this cross-validates the model; it does not fit a final model on the full data.\n",
"best_model = SVD(**gs.best_params['rmse'])\n",
"cross_validate(best_model, data, cv=cv_folds, verbose=True)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_zNeD2K83qqp",
"outputId": "529eb700-2d6a-4dde-9077-bb9ed2ab7d96"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"📊 Лучший RMSE: 0.9181\n",
"🔧 Лучшие параметры: {'n_factors': 150, 'lr_all': 0.01, 'reg_all': 0.1}\n",
"Evaluating RMSE, MAE of algorithm SVD on 5 split(s).\n",
"\n",
" Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std \n",
"RMSE (testset) 0.9226 0.9176 0.9012 0.9308 0.9212 0.9187 0.0098 \n",
"MAE (testset) 0.7307 0.7283 0.7118 0.7365 0.7283 0.7271 0.0082 \n",
"Fit time 1.19 1.48 1.15 1.17 1.13 1.22 0.13 \n",
"Test time 0.11 0.15 0.07 0.15 0.07 0.11 0.03 \n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'test_rmse': array([0.92262018, 0.91755708, 0.90115132, 0.93077068, 0.92121232]),\n",
" 'test_mae': array([0.73069412, 0.72832189, 0.71182612, 0.7364805 , 0.72833285]),\n",
" 'fit_time': (1.185208797454834,\n",
" 1.483828067779541,\n",
" 1.1467053890228271,\n",
" 1.1749987602233887,\n",
" 1.1339025497436523),\n",
" 'test_time': (0.1124725341796875,\n",
" 0.15291881561279297,\n",
" 0.07221841812133789,\n",
" 0.14911413192749023,\n",
" 0.0736684799194336)}"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
"import optuna\n",
"import pandas as pd\n",
"from surprise import Dataset, Reader, KNNBasic\n",
"from surprise.model_selection import KFold, cross_validate\n",
"\n",
"# Fixed seed shared by the CV splitters and the Optuna sampler, so a re-run\n",
"# proposes the same trials and scores them on the same folds.\n",
"SEED = 42\n",
"\n",
"# Load the built-in MovieLens 100K ratings\n",
"data = Dataset.load_builtin('ml-100k')\n",
"\n",
"\n",
"def objective(trial):\n",
"    \"\"\"Score one Optuna trial for KNNBasic.\n",
"\n",
"    Samples the similarity measure, the user-/item-based switch and the\n",
"    neighbourhood size ``k``, then returns the mean RMSE over 3 CV folds\n",
"    (the study minimises this value).\n",
"    \"\"\"\n",
"    sim_options = {\n",
"        \"name\": trial.suggest_categorical(\"name\", [\"cosine\", \"pearson\", \"msd\"]),\n",
"        \"user_based\": trial.suggest_categorical(\"user_based\", [True, False])\n",
"    }\n",
"\n",
"    # verbose=False silences the per-fold \"Computing the ... similarity matrix\" messages\n",
"    model = KNNBasic(k=trial.suggest_int(\"k\", 10, 50), sim_options=sim_options, verbose=False)\n",
"\n",
"    # Identical seeded folds for every trial: without this, the same hyperparameters\n",
"    # get different scores and the \"best\" trial is partly decided by fold luck.\n",
"    folds = KFold(n_splits=3, random_state=SEED, shuffle=True)\n",
"    cv_results = cross_validate(model, data, measures=[\"rmse\"], cv=folds, verbose=False)\n",
"    return cv_results[\"test_rmse\"].mean()\n",
"\n",
"\n",
"# Hyperparameter search with Optuna (seeded sampler for a repeatable trial sequence)\n",
"study = optuna.create_study(direction=\"minimize\", sampler=optuna.samplers.TPESampler(seed=SEED))\n",
"study.optimize(objective, n_trials=30)\n",
"\n",
"# Report the best trial\n",
"best_params = study.best_params\n",
"print(f\"📊 Лучший RMSE: {study.best_value:.4f}\")\n",
"print(f\"🔧 Лучшие параметры: {best_params}\")\n",
"\n",
"# Re-evaluate the winning configuration with 5-fold CV\n",
"final_sim_options = {\"name\": best_params[\"name\"], \"user_based\": best_params[\"user_based\"]}\n",
"best_model = KNNBasic(k=best_params[\"k\"], sim_options=final_sim_options)\n",
"\n",
"cross_validate(best_model, data, measures=[\"rmse\"], cv=KFold(n_splits=5, random_state=SEED, shuffle=True), verbose=True)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YyEITx6a2hi2",
"outputId": "8d8680a7-ce25-4241-f56d-15be36497d5e"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:53:10,905] A new study created in memory with name: no-name-4ee24999-736b-4631-a413-5080a373b7b0\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:53:30,955] Trial 0 finished with value: 1.0273098011901922 and parameters: {'name': 'pearson', 'user_based': True, 'k': 19}. Best is trial 0 with value: 1.0273098011901922.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:53:40,559] Trial 1 finished with value: 1.040094723866775 and parameters: {'name': 'pearson', 'user_based': True, 'k': 11}. Best is trial 0 with value: 1.0273098011901922.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:53:53,032] Trial 2 finished with value: 1.073262901812824 and parameters: {'name': 'cosine', 'user_based': False, 'k': 16}. Best is trial 0 with value: 1.0273098011901922.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:54:06,531] Trial 3 finished with value: 1.050133624804347 and parameters: {'name': 'cosine', 'user_based': False, 'k': 25}. Best is trial 0 with value: 1.0273098011901922.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:54:18,437] Trial 4 finished with value: 1.0207003527145204 and parameters: {'name': 'cosine', 'user_based': True, 'k': 47}. Best is trial 4 with value: 1.0207003527145204.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:54:30,850] Trial 5 finished with value: 1.0596060578187558 and parameters: {'name': 'cosine', 'user_based': False, 'k': 21}. Best is trial 4 with value: 1.0207003527145204.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the cosine similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:54:41,313] Trial 6 finished with value: 1.035549661559487 and parameters: {'name': 'cosine', 'user_based': True, 'k': 15}. Best is trial 4 with value: 1.0207003527145204.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:54:52,725] Trial 7 finished with value: 1.0277808579160697 and parameters: {'name': 'pearson', 'user_based': True, 'k': 19}. Best is trial 4 with value: 1.0207003527145204.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:55:04,190] Trial 8 finished with value: 0.9869635347848069 and parameters: {'name': 'msd', 'user_based': True, 'k': 38}. Best is trial 8 with value: 0.9869635347848069.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:55:18,106] Trial 9 finished with value: 0.9867222892578903 and parameters: {'name': 'msd', 'user_based': False, 'k': 50}. Best is trial 9 with value: 0.9867222892578903.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:55:32,326] Trial 10 finished with value: 0.9858887823748562 and parameters: {'name': 'msd', 'user_based': False, 'k': 49}. Best is trial 10 with value: 0.9858887823748562.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:55:46,312] Trial 11 finished with value: 0.9870863575929674 and parameters: {'name': 'msd', 'user_based': False, 'k': 50}. Best is trial 10 with value: 0.9858887823748562.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:56:00,123] Trial 12 finished with value: 0.9851113891510596 and parameters: {'name': 'msd', 'user_based': False, 'k': 40}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:56:13,653] Trial 13 finished with value: 0.9864620349826786 and parameters: {'name': 'msd', 'user_based': False, 'k': 40}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:56:26,697] Trial 14 finished with value: 0.9856580660612981 and parameters: {'name': 'msd', 'user_based': False, 'k': 37}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:56:39,580] Trial 15 finished with value: 0.9853303657602779 and parameters: {'name': 'msd', 'user_based': False, 'k': 34}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:56:52,224] Trial 16 finished with value: 0.9869874201851742 and parameters: {'name': 'msd', 'user_based': False, 'k': 32}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:57:04,967] Trial 17 finished with value: 0.9869053043690581 and parameters: {'name': 'msd', 'user_based': False, 'k': 31}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:57:18,816] Trial 18 finished with value: 0.9860014872498327 and parameters: {'name': 'msd', 'user_based': False, 'k': 43}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:57:31,623] Trial 19 finished with value: 0.9860790732255814 and parameters: {'name': 'msd', 'user_based': False, 'k': 34}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:57:45,444] Trial 20 finished with value: 1.0577893280614548 and parameters: {'name': 'pearson', 'user_based': False, 'k': 28}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:57:58,649] Trial 21 finished with value: 0.9856411564475596 and parameters: {'name': 'msd', 'user_based': False, 'k': 36}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:58:12,506] Trial 22 finished with value: 0.9863498341826844 and parameters: {'name': 'msd', 'user_based': False, 'k': 43}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:58:25,961] Trial 23 finished with value: 0.9865147605031322 and parameters: {'name': 'msd', 'user_based': False, 'k': 35}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:58:38,301] Trial 24 finished with value: 0.9880187808024873 and parameters: {'name': 'msd', 'user_based': False, 'k': 27}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:58:51,496] Trial 25 finished with value: 0.9859721086989618 and parameters: {'name': 'msd', 'user_based': False, 'k': 43}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:59:04,643] Trial 26 finished with value: 0.9853186408076082 and parameters: {'name': 'msd', 'user_based': False, 'k': 40}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:59:17,650] Trial 27 finished with value: 0.987735307296505 and parameters: {'name': 'msd', 'user_based': False, 'k': 40}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the pearson similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:59:32,391] Trial 28 finished with value: 1.046721221705121 and parameters: {'name': 'pearson', 'user_based': False, 'k': 46}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"[I 2025-02-24 16:59:43,336] Trial 29 finished with value: 0.9900902320723297 and parameters: {'name': 'msd', 'user_based': True, 'k': 40}. Best is trial 12 with value: 0.9851113891510596.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"📊 Лучший RMSE: 0.9851\n",
"🔧 Лучшие параметры: {'name': 'msd', 'user_based': False, 'k': 40}\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Computing the msd similarity matrix...\n",
"Done computing similarity matrix.\n",
"Evaluating RMSE of algorithm KNNBasic on 5 split(s).\n",
"\n",
" Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std \n",
"RMSE (testset) 0.9697 0.9694 0.9790 0.9778 0.9769 0.9746 0.0041 \n",
"Fit time 0.76 0.53 0.48 0.52 0.52 0.56 0.10 \n",
"Test time 2.75 2.70 2.53 3.01 2.62 2.72 0.16 \n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'test_rmse': array([0.96973846, 0.96939912, 0.97902707, 0.97778443, 0.97694979]),\n",
" 'fit_time': (0.7602078914642334,\n",
" 0.5323970317840576,\n",
" 0.48438191413879395,\n",
" 0.5157642364501953,\n",
" 0.5216193199157715),\n",
" 'test_time': (2.753460168838501,\n",
" 2.6966280937194824,\n",
" 2.530104160308838,\n",
" 3.0083537101745605,\n",
" 2.6171016693115234)}"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"source": [
"\n",
"|   | Movielens 100k | RMSE | MAE | Time |\n",
"|:-:|:-:|:-:|:-:|:-:|\n",
"| a | SVD | 0.934 | 0.737 | 0:00:06 |\n",
"| b | SVD++ (`cache_ratings=False`) | 0.919 | 0.721 | 0:01:39 |\n",
"| c | SVD++ (`cache_ratings=True`) | 0.919 | 0.721 | 0:01:22 |\n",
"| d | NMF | 0.963 | 0.758 | 0:00:06 |\n",
"| e | Slope One | 0.946 | 0.743 | 0:00:09 |\n",
"| f | k-NN | 0.98 | 0.774 | 0:00:08 |\n",
"| g | Slope One | 0.946 | 0.743 | 0:00:09 |\n",
"| h | Centered k-NN | 0.951 | 0.749 | 0:00:09 |\n",
"| i | k-NN Baseline | 0.931 | 0.733 | 0:00:13 |\n",
"| j | Co-Clustering | 0.963 | 0.753 | 0:00:06 |\n",
"| k | Random | 1.518 | 1.219 | 0:00:01 |\n",
"| l | Baseline | 0.944 | 0.748 | 0:00:02 |\n",
"\n",
"\n"
],
"metadata": {
"id": "nRpCIQAxBtLW"
}
},
{
"cell_type": "markdown",
"source": [
"# RecSys Rectools"
],
"metadata": {
"id": "DeD2dGjbMWQS"
}
},
{
"cell_type": "code",
"source": [
"%pip install \"rectools[all]==0.12.0\""
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zCse_PYDM4iu",
"outputId": "4aaebdbd-cbfc-4a61-e5f9-bdb5e421a515"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: rectools in /usr/local/lib/python3.11/dist-packages (0.12.0)\n",
"Requirement already satisfied: attrs<24.0.0,>=19.1.0 in /usr/local/lib/python3.11/dist-packages (from rectools) (23.2.0)\n",
"Requirement already satisfied: implicit<0.8.0,>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from rectools) (0.7.2)\n",
"Requirement already satisfied: numpy<2.0.0,>=1.22 in /usr/local/lib/python3.11/dist-packages (from rectools) (1.26.4)\n",
"Requirement already satisfied: pandas<3.0.0,>=1.5.0 in /usr/local/lib/python3.11/dist-packages (from rectools) (2.2.2)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from rectools) (2.10.6)\n",
"Requirement already satisfied: pydantic-core<3.0.0,>=2.20.1 in /usr/local/lib/python3.11/dist-packages (from rectools) (2.27.2)\n",
"Requirement already satisfied: scipy<1.13,>=1.10.1 in /usr/local/lib/python3.11/dist-packages (from rectools) (1.12.0)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.27.0 in /usr/local/lib/python3.11/dist-packages (from rectools) (4.67.1)\n",
"Requirement already satisfied: typeguard<5.0.0,>=4.1.0 in /usr/local/lib/python3.11/dist-packages (from rectools) (4.4.2)\n",
"Requirement already satisfied: typing-extensions<5.0.0,>=4.12.2 in /usr/local/lib/python3.11/dist-packages (from rectools) (4.12.2)\n",
"Requirement already satisfied: threadpoolctl in /usr/local/lib/python3.11/dist-packages (from implicit<0.8.0,>=0.7.1->rectools) (3.5.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools) (2025.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools) (2025.1)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.8.2->rectools) (0.7.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas<3.0.0,>=1.5.0->rectools) (1.17.0)\n",
"Requirement already satisfied: rectools[all] in /usr/local/lib/python3.11/dist-packages (0.12.0)\n",
"Requirement already satisfied: attrs<24.0.0,>=19.1.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (23.2.0)\n",
"Requirement already satisfied: cupy-cuda12x<14.0.0,>=13.3.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (13.3.0)\n",
"Requirement already satisfied: implicit<0.8.0,>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (0.7.2)\n",
"Requirement already satisfied: ipywidgets<8.2,>=7.7 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (7.7.1)\n",
"Requirement already satisfied: nbformat>=4.2.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (5.10.4)\n",
"Requirement already satisfied: nmslib-metabrainz<3.0.0,>=2.1.3 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (2.1.3)\n",
"Requirement already satisfied: numpy<2.0.0,>=1.22 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (1.26.4)\n",
"Requirement already satisfied: pandas<3.0.0,>=1.5.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (2.2.2)\n",
"Requirement already satisfied: plotly<6.0.0,>=5.22.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (5.24.1)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (2.10.6)\n",
"Requirement already satisfied: pydantic-core<3.0.0,>=2.20.1 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (2.27.2)\n",
"Requirement already satisfied: pytorch-lightning<3.0.0,>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (2.5.0.post0)\n",
"Requirement already satisfied: rectools-lightfm<2.0.0,>=1.17.3 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (1.17.3)\n",
"Requirement already satisfied: scipy<1.13,>=1.10.1 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (1.12.0)\n",
"Requirement already satisfied: torch<3.0.0,>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (2.5.1+cu124)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.27.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (4.67.1)\n",
"Requirement already satisfied: typeguard<5.0.0,>=4.1.0 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (4.4.2)\n",
"Requirement already satisfied: typing-extensions<5.0.0,>=4.12.2 in /usr/local/lib/python3.11/dist-packages (from rectools[all]) (4.12.2)\n",
"Requirement already satisfied: fastrlock>=0.5 in /usr/local/lib/python3.11/dist-packages (from cupy-cuda12x<14.0.0,>=13.3.0->rectools[all]) (0.8.3)\n",
"Requirement already satisfied: threadpoolctl in /usr/local/lib/python3.11/dist-packages (from implicit<0.8.0,>=0.7.1->rectools[all]) (3.5.0)\n",
"Requirement already satisfied: ipykernel>=4.5.1 in /usr/local/lib/python3.11/dist-packages (from ipywidgets<8.2,>=7.7->rectools[all]) (6.17.1)\n",
"Requirement already satisfied: ipython-genutils~=0.2.0 in /usr/local/lib/python3.11/dist-packages (from ipywidgets<8.2,>=7.7->rectools[all]) (0.2.0)\n",
"Requirement already satisfied: traitlets>=4.3.1 in /usr/local/lib/python3.11/dist-packages (from ipywidgets<8.2,>=7.7->rectools[all]) (5.7.1)\n",
"Requirement already satisfied: widgetsnbextension~=3.6.0 in /usr/local/lib/python3.11/dist-packages (from ipywidgets<8.2,>=7.7->rectools[all]) (3.6.10)\n",
"Requirement already satisfied: ipython>=4.0.0 in /usr/local/lib/python3.11/dist-packages (from ipywidgets<8.2,>=7.7->rectools[all]) (7.34.0)\n",
"Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from ipywidgets<8.2,>=7.7->rectools[all]) (3.0.13)\n",
"Requirement already satisfied: fastjsonschema>=2.15 in /usr/local/lib/python3.11/dist-packages (from nbformat>=4.2.0->rectools[all]) (2.21.1)\n",
"Requirement already satisfied: jsonschema>=2.6 in /usr/local/lib/python3.11/dist-packages (from nbformat>=4.2.0->rectools[all]) (4.23.0)\n",
"Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /usr/local/lib/python3.11/dist-packages (from nbformat>=4.2.0->rectools[all]) (5.7.2)\n",
"Requirement already satisfied: pybind11>=2.2.3 in /usr/local/lib/python3.11/dist-packages (from nmslib-metabrainz<3.0.0,>=2.1.3->rectools[all]) (2.13.6)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from nmslib-metabrainz<3.0.0,>=2.1.3->rectools[all]) (5.9.5)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools[all]) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools[all]) (2025.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools[all]) (2025.1)\n",
"Requirement already satisfied: tenacity>=6.2.0 in /usr/local/lib/python3.11/dist-packages (from plotly<6.0.0,>=5.22.0->rectools[all]) (9.0.0)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from plotly<6.0.0,>=5.22.0->rectools[all]) (24.2)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.8.2->rectools[all]) (0.7.0)\n",
"Requirement already satisfied: PyYAML>=5.4 in /usr/local/lib/python3.11/dist-packages (from pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (6.0.2)\n",
"Requirement already satisfied: fsspec>=2022.5.0 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (2024.10.0)\n",
"Requirement already satisfied: torchmetrics>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (1.6.1)\n",
"Requirement already satisfied: lightning-utilities>=0.10.0 in /usr/local/lib/python3.11/dist-packages (from pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (0.12.0)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from rectools-lightfm<2.0.0,>=1.17.3->rectools[all]) (2.32.3)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (from rectools-lightfm<2.0.0,>=1.17.3->rectools[all]) (1.6.1)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (3.17.0)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (3.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (3.1.5)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (12.4.127)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (12.4.127)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (12.4.127)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (12.4.5.8)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (11.2.1.3)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (10.3.5.147)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (11.6.1.9)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (12.3.1.170)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (2.21.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (12.4.127)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (12.4.127)\n",
"Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (3.1.0)\n",
"Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[all]) (1.13.1)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch<3.0.0,>=1.6.0->rectools[all]) (1.3.0)\n",
"Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (3.11.12)\n",
"Requirement already satisfied: debugpy>=1.0 in /usr/local/lib/python3.11/dist-packages (from ipykernel>=4.5.1->ipywidgets<8.2,>=7.7->rectools[all]) (1.8.0)\n",
"Requirement already satisfied: jupyter-client>=6.1.12 in /usr/local/lib/python3.11/dist-packages (from ipykernel>=4.5.1->ipywidgets<8.2,>=7.7->rectools[all]) (6.1.12)\n",
"Requirement already satisfied: matplotlib-inline>=0.1 in /usr/local/lib/python3.11/dist-packages (from ipykernel>=4.5.1->ipywidgets<8.2,>=7.7->rectools[all]) (0.1.7)\n",
"Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.11/dist-packages (from ipykernel>=4.5.1->ipywidgets<8.2,>=7.7->rectools[all]) (1.6.0)\n",
"Requirement already satisfied: pyzmq>=17 in /usr/local/lib/python3.11/dist-packages (from ipykernel>=4.5.1->ipywidgets<8.2,>=7.7->rectools[all]) (24.0.1)\n",
"Requirement already satisfied: tornado>=6.1 in /usr/local/lib/python3.11/dist-packages (from ipykernel>=4.5.1->ipywidgets<8.2,>=7.7->rectools[all]) (6.4.2)\n",
"Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (75.1.0)\n",
"Requirement already satisfied: jedi>=0.16 in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.19.2)\n",
"Requirement already satisfied: decorator in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (4.4.2)\n",
"Requirement already satisfied: pickleshare in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.7.5)\n",
"Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (3.0.50)\n",
"Requirement already satisfied: pygments in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (2.18.0)\n",
"Requirement already satisfied: backcall in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.2.0)\n",
"Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.11/dist-packages (from ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (4.9.0)\n",
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->rectools[all]) (2024.10.1)\n",
"Requirement already satisfied: referencing>=0.28.4 in /usr/local/lib/python3.11/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->rectools[all]) (0.36.2)\n",
"Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema>=2.6->nbformat>=4.2.0->rectools[all]) (0.22.3)\n",
"Requirement already satisfied: platformdirs>=2.5 in /usr/local/lib/python3.11/dist-packages (from jupyter-core!=5.0.*,>=4.12->nbformat>=4.2.0->rectools[all]) (4.3.6)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas<3.0.0,>=1.5.0->rectools[all]) (1.17.0)\n",
"Requirement already satisfied: notebook>=4.4.1 in /usr/local/lib/python3.11/dist-packages (from widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (6.5.5)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch<3.0.0,>=1.6.0->rectools[all]) (3.0.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->rectools-lightfm<2.0.0,>=1.17.3->rectools[all]) (3.4.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->rectools-lightfm<2.0.0,>=1.17.3->rectools[all]) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->rectools-lightfm<2.0.0,>=1.17.3->rectools[all]) (2.3.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->rectools-lightfm<2.0.0,>=1.17.3->rectools[all]) (2025.1.31)\n",
"Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn->rectools-lightfm<2.0.0,>=1.17.3->rectools[all]) (1.4.2)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (2.4.6)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (1.3.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[all]) (1.18.3)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.4 in /usr/local/lib/python3.11/dist-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.8.4)\n",
"Requirement already satisfied: argon2-cffi in /usr/local/lib/python3.11/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (23.1.0)\n",
"Requirement already satisfied: nbconvert>=5 in /usr/local/lib/python3.11/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (7.16.6)\n",
"Requirement already satisfied: Send2Trash>=1.8.0 in /usr/local/lib/python3.11/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.8.3)\n",
"Requirement already satisfied: terminado>=0.8.3 in /usr/local/lib/python3.11/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.18.1)\n",
"Requirement already satisfied: prometheus-client in /usr/local/lib/python3.11/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.21.1)\n",
"Requirement already satisfied: nbclassic>=0.4.7 in /usr/local/lib/python3.11/dist-packages (from notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.2.0)\n",
"Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.11/dist-packages (from pexpect>4.3->ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.7.0)\n",
"Requirement already satisfied: wcwidth in /usr/local/lib/python3.11/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.2.13)\n",
"Requirement already satisfied: notebook-shim>=0.2.3 in /usr/local/lib/python3.11/dist-packages (from nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.2.4)\n",
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (4.13.3)\n",
"Requirement already satisfied: bleach!=5.0.0 in /usr/local/lib/python3.11/dist-packages (from bleach[css]!=5.0.0->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (6.2.0)\n",
"Requirement already satisfied: defusedxml in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.7.1)\n",
"Requirement already satisfied: jupyterlab-pygments in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.3.0)\n",
"Requirement already satisfied: mistune<4,>=2.0.3 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (3.1.2)\n",
"Requirement already satisfied: nbclient>=0.5.0 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.10.2)\n",
"Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.11/dist-packages (from nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.5.1)\n",
"Requirement already satisfied: argon2-cffi-bindings in /usr/local/lib/python3.11/dist-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (21.2.0)\n",
"Requirement already satisfied: webencodings in /usr/local/lib/python3.11/dist-packages (from bleach!=5.0.0->bleach[css]!=5.0.0->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (0.5.1)\n",
"Requirement already satisfied: tinycss2<1.5,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from bleach[css]!=5.0.0->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.4.0)\n",
"Requirement already satisfied: jupyter-server<3,>=1.8 in /usr/local/lib/python3.11/dist-packages (from notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.24.0)\n",
"Requirement already satisfied: cffi>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.17.1)\n",
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4->nbconvert>=5->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (2.6)\n",
"Requirement already satisfied: pycparser in /usr/local/lib/python3.11/dist-packages (from cffi>=1.0.1->argon2-cffi-bindings->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (2.22)\n",
"Requirement already satisfied: anyio<4,>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (3.7.1)\n",
"Requirement already satisfied: websocket-client in /usr/local/lib/python3.11/dist-packages (from jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.8.0)\n",
"Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.11/dist-packages (from anyio<4,>=3.1.0->jupyter-server<3,>=1.8->notebook-shim>=0.2.3->nbclassic>=0.4.7->notebook>=4.4.1->widgetsnbextension~=3.6.0->ipywidgets<8.2,>=7.7->rectools[all]) (1.3.1)\n",
"Requirement already satisfied: rectools[torch] in /usr/local/lib/python3.11/dist-packages (0.12.0)\n",
"Requirement already satisfied: attrs<24.0.0,>=19.1.0 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (23.2.0)\n",
"Requirement already satisfied: implicit<0.8.0,>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (0.7.2)\n",
"Requirement already satisfied: numpy<2.0.0,>=1.22 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (1.26.4)\n",
"Requirement already satisfied: pandas<3.0.0,>=1.5.0 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (2.2.2)\n",
"Requirement already satisfied: pydantic<3.0.0,>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (2.10.6)\n",
"Requirement already satisfied: pydantic-core<3.0.0,>=2.20.1 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (2.27.2)\n",
"Requirement already satisfied: pytorch-lightning<3.0.0,>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (2.5.0.post0)\n",
"Requirement already satisfied: scipy<1.13,>=1.10.1 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (1.12.0)\n",
"Requirement already satisfied: torch<3.0.0,>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (2.5.1+cu124)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.27.0 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (4.67.1)\n",
"Requirement already satisfied: typeguard<5.0.0,>=4.1.0 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (4.4.2)\n",
"Requirement already satisfied: typing-extensions<5.0.0,>=4.12.2 in /usr/local/lib/python3.11/dist-packages (from rectools[torch]) (4.12.2)\n",
"Requirement already satisfied: threadpoolctl in /usr/local/lib/python3.11/dist-packages (from implicit<0.8.0,>=0.7.1->rectools[torch]) (3.5.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools[torch]) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools[torch]) (2025.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas<3.0.0,>=1.5.0->rectools[torch]) (2025.1)\n",
"Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.11/dist-packages (from pydantic<3.0.0,>=2.8.2->rectools[torch]) (0.7.0)\n",
"Requirement already satisfied: PyYAML>=5.4 in /usr/local/lib/python3.11/dist-packages (from pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (6.0.2)\n",
"Requirement already satisfied: fsspec>=2022.5.0 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (2024.10.0)\n",
"Requirement already satisfied: torchmetrics>=0.7.0 in /usr/local/lib/python3.11/dist-packages (from pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (1.6.1)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (24.2)\n",
"Requirement already satisfied: lightning-utilities>=0.10.0 in /usr/local/lib/python3.11/dist-packages (from pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (0.12.0)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (3.17.0)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (3.4.2)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (3.1.5)\n",
"Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (12.4.127)\n",
"Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (12.4.127)\n",
"Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (12.4.127)\n",
"Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (9.1.0.70)\n",
"Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (12.4.5.8)\n",
"Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (11.2.1.3)\n",
"Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (10.3.5.147)\n",
"Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (11.6.1.9)\n",
"Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (12.3.1.170)\n",
"Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (2.21.5)\n",
"Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (12.4.127)\n",
"Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (12.4.127)\n",
"Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (3.1.0)\n",
"Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch<3.0.0,>=1.6.0->rectools[torch]) (1.13.1)\n",
"Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch<3.0.0,>=1.6.0->rectools[torch]) (1.3.0)\n",
"Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (3.11.12)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.11/dist-packages (from lightning-utilities>=0.10.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (75.1.0)\n",
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas<3.0.0,>=1.5.0->rectools[torch]) (1.17.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch<3.0.0,>=1.6.0->rectools[torch]) (3.0.2)\n",
"Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (2.4.6)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (1.3.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (1.5.0)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (6.1.0)\n",
"Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (0.2.1)\n",
"Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (1.18.3)\n",
"Requirement already satisfied: idna>=2.0 in /usr/local/lib/python3.11/dist-packages (from yarl<2.0,>=1.17.0->aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]>=2022.5.0->pytorch-lightning<3.0.0,>=1.6.0->rectools[torch]) (3.10)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"'''От пользователя требуется обычная таблица, где каждая строка отражает одно взаимодействие: в первом столбце — id пользователя, во втором — id айтема,\n",
" а в третьем — скор (вес) взаимодействия (например, купил / не купил). Если есть данные о времени взаимодействия, их тоже можно добавить.'''"
],
"metadata": {
"id": "lU1LEYZONAHR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from implicit.nearest_neighbours import TFIDFRecommender\n",
"\n",
"from rectools import Columns\n",
"from rectools.dataset import Dataset\n",
"from rectools.metrics import (\n",
" Precision,\n",
" NDCG,\n",
" AvgRecPopularity,\n",
" Intersection,\n",
" HitRate,\n",
" SufficientReco,\n",
" DebiasConfig,\n",
" IntraListDiversity,\n",
" Serendipity,\n",
" calc_metrics,\n",
")\n",
"from rectools.metrics.distances import PairwiseHammingDistanceCalculator\n",
"from rectools.models import *"
],
"metadata": {
"id": "XdEtxqrdRX7H"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Load the raw interactions table. The file has no header row, so the\n",
"# column names are supplied explicitly through `names`.\n",
"ratings = pd.read_csv(\n",
"    \"ratings.dat\",\n",
"    names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],\n",
"    header=None,\n",
"    sep=\"::\",\n",
"    engine=\"python\",  # multi-character separators require the Python parsing engine\n",
")\n",
"print(ratings.shape)\n",
"ratings.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 223
},
"id": "9rkswbQDRg5B",
"outputId": "74bb481e-157e-4159-ec70-ec7606ad6041"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(1000209, 4)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" user_id item_id weight datetime\n",
"0 1 1193 5 978300760\n",
"1 1 661 3 978302109\n",
"2 1 914 3 978301968\n",
"3 1 3408 4 978300275\n",
"4 1 2355 5 978824291"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" item_id | \n",
" weight | \n",
" datetime | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1193 | \n",
" 5 | \n",
" 978300760 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 661 | \n",
" 3 | \n",
" 978302109 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 914 | \n",
" 3 | \n",
" 978301968 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 3408 | \n",
" 4 | \n",
" 978300275 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 2355 | \n",
" 5 | \n",
" 978824291 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "ratings"
}
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"# Convert Unix epoch seconds to timestamps. Passing `unit=\"s\"` states the unit\n",
"# explicitly instead of scaling to nanoseconds by hand, and keeps the cell safe\n",
"# to re-run: multiplying an already-converted datetime column by 10 ** 9 would fail,\n",
"# whereas `pd.to_datetime` returns datetime input as-is.\n",
"ratings[\"datetime\"] = pd.to_datetime(ratings[\"datetime\"], unit=\"s\")\n",
"ratings[\"datetime\"].min(), ratings[\"datetime\"].max()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "H1Em3kpYRksu",
"outputId": "8ddf600d-16e2-403e-8497-1de5ebedf264"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(Timestamp('2000-04-25 23:05:32'), Timestamp('2003-02-28 17:49:50'))"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"source": [
"# Load item metadata (id, title, pipe-separated genres).\n",
"# NOTE(review): assumes the MovieLens-1M `movies.dat`, which is Latin-1 encoded.\n",
"# Decoding it as Latin-1 keeps accented characters in titles, whereas the default\n",
"# UTF-8 decoding with `encoding_errors=\"ignore\"` silently drops those bytes.\n",
"movies = pd.read_csv(\n",
"    \"movies.dat\",\n",
"    sep=\"::\",\n",
"    engine=\"python\",  # multi-character separators require the Python parsing engine\n",
"    header=None,\n",
"    names=[Columns.Item, \"title\", \"genres\"],\n",
"    encoding=\"latin-1\",\n",
")\n",
"print(movies.shape)\n",
"\n",
"# Time-based split: interactions strictly before `split_dt` form the training set,\n",
"# interactions on or after it are held out for evaluation.\n",
"split_dt = pd.Timestamp(\"2003-02-01\")\n",
"df_train = ratings.loc[ratings[\"datetime\"] < split_dt]\n",
"df_test = ratings.loc[ratings[\"datetime\"] >= split_dt]\n",
"\n",
"# The RecTools dataset is built from the training interactions only.\n",
"dataset = Dataset.construct(df_train)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-bR-lgPXSXvA",
"outputId": "a5e079e4-143f-4a6c-b4c5-b23f56f4d7c2"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(3883, 3)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from rectools.models import LightFMWrapperModel\n",
"from lightfm import LightFM\n",
"\n",
"# Single cut-off shared by recommendation generation and every metric below.\n",
"# Recommending fewer items than the metric cut-off leaves the tail ranks empty\n",
"# and deflates metrics that divide by k (e.g. Precision@k, Serendipity@k).\n",
"TOP_K = 10\n",
"\n",
"model = LightFMWrapperModel(\n",
"    # `no_components` is the dimensionality of the embeddings the model learns;\n",
"    # `random_state` seeds LightFM's random number generator for reproducible runs.\n",
"    model=LightFM(no_components=30, random_state=42)\n",
")\n",
"\n",
"model.fit(dataset)\n",
"recos = model.recommend(\n",
"    users=ratings[Columns.User].unique(),\n",
"    dataset=dataset,\n",
"    k=TOP_K,\n",
"    filter_viewed=True,\n",
")\n",
"\n",
"serendipity = Serendipity(k=TOP_K)\n",
"precision = Precision(k=TOP_K, r_precision=True)  # r_precision means division by min(k, n_user_test_items)\n",
"ndcg = NDCG(k=TOP_K, log_base=3)\n",
"\n",
"# Build a per-item genre indicator table (one column per genre, indexed by item_id);\n",
"# it is the feature matrix handed to the Hamming distance calculator below.\n",
"movies[\"genre\"] = movies[\"genres\"].str.split(\"|\")\n",
"genre_exploded = movies[[\"item_id\", \"genre\"]].set_index(\"item_id\").explode(\"genre\")\n",
"genre_dummies = pd.get_dummies(genre_exploded, prefix=\"\", prefix_sep=\"\").groupby(\"item_id\").sum()\n",
"\n",
"precision_value = precision.calc(reco=recos, interactions=df_test)\n",
"print(f\"precision: {precision_value}\")\n",
"\n",
"# Serendipity needs the item catalog; here it is every item seen in training.\n",
"catalog = df_train[Columns.Item].unique()\n",
"\n",
"serendipity_value = serendipity.calc(\n",
"    reco=recos,\n",
"    interactions=df_test,\n",
"    prev_interactions=df_train,\n",
"    catalog=catalog\n",
")\n",
"print(\"Serendipity: \", serendipity_value)\n",
"\n",
"print(\"NDCG: \", ndcg.calc(reco=recos, interactions=df_test))\n",
"\n",
"distance_calculator = PairwiseHammingDistanceCalculator(genre_dummies)\n",
"ild = IntraListDiversity(k=TOP_K, distance_calculator=distance_calculator)\n",
"print(\"ILD: \", ild.calc(reco=recos))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "c3yLXDPcfcS6",
"outputId": "07f3b781-b6d6-4c20-daf8-6b89c3d17c75"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"precision: 0.032996632996633\n",
"Serendipity: 4.163643706464913e-06\n",
"NDCG: 0.041591868823295505\n",
"ILD: 3.4595364238410595\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Single cut-off shared by recommendation generation and every metric below.\n",
"# Recommending fewer items than the metric cut-off leaves the tail ranks empty\n",
"# and deflates metrics that divide by k (e.g. Precision@k, Serendipity@k).\n",
"TOP_K = 10\n",
"\n",
"model = SASRecModel(\n",
"    session_max_len=20,\n",
"    loss=\"softmax\",\n",
"    n_factors=64,\n",
"    n_blocks=1,\n",
"    n_heads=4,\n",
"    dropout_rate=0.2,\n",
"    lr=0.001,\n",
"    batch_size=128,\n",
"    epochs=6,\n",
"    verbose=1,\n",
"    deterministic=True,\n",
")\n",
"\n",
"model.fit(dataset)\n",
"recos = model.recommend(\n",
"    users=ratings[Columns.User].unique(),\n",
"    dataset=dataset,\n",
"    k=TOP_K,\n",
"    filter_viewed=True,\n",
")\n",
"\n",
"serendipity = Serendipity(k=TOP_K)\n",
"precision = Precision(k=TOP_K, r_precision=True)  # r_precision means division by min(k, n_user_test_items)\n",
"ndcg = NDCG(k=TOP_K, log_base=3)\n",
"\n",
"# Build a per-item genre indicator table (one column per genre, indexed by item_id);\n",
"# it is the feature matrix handed to the Hamming distance calculator below.\n",
"movies[\"genre\"] = movies[\"genres\"].str.split(\"|\")\n",
"genre_exploded = movies[[\"item_id\", \"genre\"]].set_index(\"item_id\").explode(\"genre\")\n",
"genre_dummies = pd.get_dummies(genre_exploded, prefix=\"\", prefix_sep=\"\").groupby(\"item_id\").sum()\n",
"\n",
"precision_value = precision.calc(reco=recos, interactions=df_test)\n",
"print(f\"precision: {precision_value}\")\n",
"\n",
"# Serendipity needs the item catalog; here it is every item seen in training.\n",
"catalog = df_train[Columns.Item].unique()\n",
"\n",
"serendipity_value = serendipity.calc(\n",
"    reco=recos,\n",
"    interactions=df_test,\n",
"    prev_interactions=df_train,\n",
"    catalog=catalog\n",
")\n",
"print(\"Serendipity: \", serendipity_value)\n",
"\n",
"print(\"NDCG: \", ndcg.calc(reco=recos, interactions=df_test))\n",
"\n",
"distance_calculator = PairwiseHammingDistanceCalculator(genre_dummies)\n",
"ild = IntraListDiversity(k=TOP_K, distance_calculator=distance_calculator)\n",
"print(\"ILD: \", ild.calc(reco=recos))"
],
"metadata": {
"id": "ufAFUYjdjXZx",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 555,
"referenced_widgets": [
"894f0647a28f4449a63e680841518b0d",
"707e977651d6492082bb6615ba6b7d6e",
"732150fa2e1a43c7aa61719cb083d6ac",
"86eae60d0e184e27a6d1f73bc7f37560",
"8ad5f7a38f9d4b4780a3a45a9842a56d",
"fb6e2cca3ca84dafbf09599481314a2a",
"3188c60873aa4d2faa5be0ea04ea3e09",
"befb029ec4814bf8b12ed8b7c222d8ab",
"f801d767b2d446229f30c418604c6d84",
"b459c7018ee94121891d27687ba5ae59",
"d1dad85ea0ae46fab93b0cc0cbc86310"
]
},
"outputId": "fc606408-dce5-4aba-ac04-568c1261096f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False\n",
"INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores\n",
"INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs\n",
"/usr/local/lib/python3.11/dist-packages/rectools/dataset/identifiers.py:60: FutureWarning: unique with argument that is not not a Series, Index, ExtensionArray, or np.ndarray is deprecated and will raise in a future version.\n",
" unq_values = pd.unique(values)\n",
"/usr/local/lib/python3.11/dist-packages/rectools/models/nn/item_net.py:129: UserWarning: Ignoring `CatFeaturesItemNet` block because dataset doesn't contain item features.\n",
" warnings.warn(explanation)\n",
"/usr/local/lib/python3.11/dist-packages/pydantic/main.py:426: UserWarning: Pydantic serializer warnings:\n",
" Expected `str` but got `tuple` with value `('rectools.models.nn.item...net.CatFeaturesItemNet')` - serialized value may not be as expected\n",
" return self.__pydantic_serializer__.to_python(\n",
"/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.\n",
"INFO:pytorch_lightning.callbacks.model_summary:\n",
" | Name | Type | Params | Mode \n",
"-----------------------------------------------------------------\n",
"0 | torch_model | TransformerTorchBackbone | 242 K | train\n",
"-----------------------------------------------------------------\n",
"242 K Trainable params\n",
"0 Non-trainable params\n",
"242 K Total params\n",
"0.972 Total estimated model params size (MB)\n",
"23 Modules in train mode\n",
"0 Modules in eval mode\n",
"/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loops/fit_loop.py:310: The number of training batches (48) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"Training: | | 0/? [00:00, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "894f0647a28f4449a63e680841518b0d"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=6` reached.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"precision: 0.04781144781144782\n",
"Serendipity: 4.502274814780399e-05\n",
"NDCG: 0.03513844945033891\n",
"ILD: 3.287748344370861\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"model = BERT4RecModel(\n",
" mask_prob=0.15, # specify probability of masking tokens\n",
" deterministic=True,\n",
")\n",
"\n",
"model.fit(dataset)\n",
"recos = model.recommend(\n",
" users=ratings[Columns.User].unique(),\n",
" dataset=dataset,\n",
" k=5,\n",
" filter_viewed=True,\n",
")\n",
"\n",
"serendipity = Serendipity(k=10)\n",
"precision = Precision(k=10, r_precision=True) # r_precision means division by min(k, n_user_test_items)\n",
"ndcg = NDCG(k=10, log_base=3)\n",
"\n",
"movies[\"genre\"] = movies[\"genres\"].str.split(\"|\")\n",
"genre_exploded = movies[[\"item_id\", \"genre\"]].set_index(\"item_id\").explode(\"genre\")\n",
"genre_dummies = pd.get_dummies(genre_exploded, prefix=\"\", prefix_sep=\"\").groupby(\"item_id\").sum()\n",
"\n",
"precision_value = precision.calc(reco=recos, interactions=df_test)\n",
"print(f\"precision: {precision_value}\")\n",
"\n",
"catalog = df_train[Columns.Item].unique()\n",
"\n",
"serendipity_value = serendipity.calc(\n",
" reco=recos,\n",
" interactions=df_test,\n",
" prev_interactions=df_train,\n",
" catalog=catalog\n",
")\n",
"print(\"Serendipity: \", serendipity_value)\n",
"\n",
"print(\"NDCG: \", ndcg.calc(reco=recos, interactions=df_test))\n",
"\n",
"distance_calculator = PairwiseHammingDistanceCalculator(genre_dummies)\n",
"ild = IntraListDiversity(k=10, distance_calculator=distance_calculator)\n",
"print(\"ILD: \", ild.calc(reco=recos))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PL_0puloZAbQ",
"outputId": "7f563b85-dcb7-4194-a3ba-cfdb00980806"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False\n",
"INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores\n",
"INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs\n",
"/usr/local/lib/python3.11/dist-packages/rectools/dataset/identifiers.py:60: FutureWarning: unique with argument that is not not a Series, Index, ExtensionArray, or np.ndarray is deprecated and will raise in a future version.\n",
" unq_values = pd.unique(values)\n",
"/usr/local/lib/python3.11/dist-packages/rectools/models/nn/item_net.py:129: UserWarning: Ignoring `CatFeaturesItemNet` block because dataset doesn't contain item features.\n",
" warnings.warn(explanation)\n",
"/usr/local/lib/python3.11/dist-packages/pydantic/main.py:426: UserWarning: Pydantic serializer warnings:\n",
" Expected `str` but got `tuple` with value `('rectools.models.nn.item...net.CatFeaturesItemNet')` - serialized value may not be as expected\n",
" return self.__pydantic_serializer__.to_python(\n",
"/usr/local/lib/python3.11/dist-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.\n",
"INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"precision: 0.02630270963604297\n",
"Serendipity: 1.1552771056779242e-05\n",
"NDCG: 0.03232912926419424\n",
"ILD: 3.031059602649006\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"\n",
" \\begin{array}{|c|c|c|c|c|c|c|c|}\\hline\\\\ \\\\\n",
" \\mathcal{} & Movielens 1m & Precision & Serendipity & NDCG & ILD \\\\ \\hline\\\\\n",
" a & SASRec (6 epochs) & 0.04781 & 4.5023e-05 & 0.03513 & 3.2877 \\\\ \\hline\\\\ \\\\\n",
" b & BERT4Rec & 0.02630 & 1.1553e-05 & 0.03233 & 3.0311 \\\\ \\hline\\\\ \\\\\n",
" c & implicit ALS Wrapper & 0.0515 & 5.4056e-05 & 0.05165 & 2.8392 \\\\ \\hline\\\\ \\\\\n",
" d & implicit BPR-MF Wrapper & 0.02997 & 3.19868e-06 & 0.03615 & 3.86506 \\\\ \\hline\\\\ \\\\\n",
" e & implicit ItemKNN Wrapper & 0.0456 & 3.0093e-05 & 0.04615 & 3.1726 \\\\ \\hline\\\\ \\\\\n",
" f & LightFM Wrapper & 0.05858 & 4.7578e-06 & 0.0604 & 3.6395 \\\\ \\hline\\\\ \\\\\n",
" g & EASE & 0.0367 & 3.0522e-05 & 0.03431 & 2.8200 \\\\ \\hline\\\\ \\\\\n",
" h & PureSVD & 0.05952 & 2.5205e-05 & 0.05248 & 3.0020 \\\\ \\hline\\\\ \\\\\n",
" j & Popular & 0.0330 & 4.1636e-06 & 0.04160 & 3.4595 \\\\ \\hline\\\\ \\\\\n",
" l & Random & 0.0131 & 1.6940e-05 & 0.00487 & 2.6273 \\\\ \\hline\n",
" \\end{array}"
],
"metadata": {
"id": "bmPZPxd8cQ0o"
}
},
{
"cell_type": "markdown",
"source": [
"# Детекция ботов"
],
"metadata": {
"id": "MnzenpVK1RUw"
}
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from implicit.nearest_neighbours import TFIDFRecommender\n",
"\n",
"from rectools import Columns\n",
"from rectools.dataset import Dataset\n",
"from rectools.metrics import (\n",
" Precision,\n",
" NDCG,\n",
" IntraListDiversity,\n",
" Serendipity,\n",
" calc_metrics,\n",
")\n",
"from rectools.metrics.distances import PairwiseHammingDistanceCalculator\n",
"from rectools.models import LightFMWrapperModel\n",
"from lightfm import LightFM\n",
"\n",
"# 1️⃣ Загрузка данных\n",
"ratings = pd.read_csv(\n",
" \"ratings.dat\",\n",
" sep=\"::\",\n",
" engine=\"python\",\n",
" header=None,\n",
" names=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],\n",
")\n",
"\n",
"ratings[\"datetime\"] = pd.to_datetime(ratings[\"datetime\"] * 10 ** 9)\n",
"\n",
"movies = pd.read_csv(\n",
" \"movies.dat\",\n",
" sep=\"::\",\n",
" engine=\"python\",\n",
" header=None,\n",
" names=[Columns.Item, \"title\", \"genres\"],\n",
" encoding_errors=\"ignore\",\n",
")\n",
"\n",
"# 2️⃣ ДЕТЕКЦИЯ БОТОВ\n",
"\n",
"# 2.1 Определение аномальной активности пользователей\n",
"user_activity = ratings.groupby(Columns.User).agg(\n",
" num_ratings=(Columns.Item, \"count\"), # Количество оценок\n",
" unique_movies=(Columns.Item, \"nunique\"), # Уникальные фильмы\n",
" rating_std=(Columns.Weight, \"std\"), # Разброс оценок\n",
" first_rating_time=(\"datetime\", \"min\"),\n",
" last_rating_time=(\"datetime\", \"max\")\n",
")\n",
"\n",
"# 2.2 Добавляем скорость оценивания (оценки в день)\n",
"user_activity[\"rating_speed\"] = user_activity[\"num_ratings\"] / (\n",
" (user_activity[\"last_rating_time\"] - user_activity[\"first_rating_time\"]).dt.days + 1\n",
")\n",
"\n",
"# 2.3 Удаление ботов по критериям:\n",
"thresholds = {\n",
" \"num_ratings\": user_activity[\"num_ratings\"].quantile(0.99), # > 99-го перцентиля\n",
" \"unique_movies\": user_activity[\"unique_movies\"].quantile(0.01), # < 1-го перцентиля\n",
" \"rating_std\": 0.1, # Нет разброса в оценках\n",
" \"rating_speed\": user_activity[\"rating_speed\"].quantile(0.99), # > 99-го перцентиля\n",
"}\n",
"\n",
"bots = user_activity[\n",
" (user_activity[\"num_ratings\"] > thresholds[\"num_ratings\"]) |\n",
" (user_activity[\"unique_movies\"] < thresholds[\"unique_movies\"]) |\n",
" (user_activity[\"rating_std\"] < thresholds[\"rating_std\"]) |\n",
" (user_activity[\"rating_speed\"] > thresholds[\"rating_speed\"])\n",
"]\n",
"\n",
"# Удаляем ботов\n",
"ratings_cleaned = ratings[~ratings[Columns.User].isin(bots.index)]\n",
"print(f\"Удалено {len(bots)} подозрительных пользователей.\")\n",
"\n",
"# 3️⃣ Деление на train/test\n",
"split_dt = pd.Timestamp(\"2003-02-01\")\n",
"df_train = ratings_cleaned.loc[ratings_cleaned[\"datetime\"] < split_dt]\n",
"df_test = ratings_cleaned.loc[ratings_cleaned[\"datetime\"] >= split_dt]\n",
"\n",
"# 4️⃣ Обучение LightFM\n",
"dataset = Dataset.construct(df_train)\n",
"\n",
"model = LightFMWrapperModel(\n",
" model=LightFM(no_components=30)\n",
")\n",
"model.fit(dataset)\n",
"\n",
"# 5️⃣ Генерация рекомендаций\n",
"recos = model.recommend(\n",
" users=df_test[Columns.User].unique(),\n",
" dataset=dataset,\n",
" k=5,\n",
" filter_viewed=True,\n",
")\n",
"\n",
"# 6️⃣ Оценка метрик\n",
"serendipity = Serendipity(k=10)\n",
"precision = Precision(k=10, r_precision=True)\n",
"ndcg = NDCG(k=10, log_base=3)\n",
"\n",
"movies[\"genre\"] = movies[\"genres\"].str.split(\"|\")\n",
"genre_exploded = movies[[Columns.Item, \"genre\"]].set_index(Columns.Item).explode(\"genre\")\n",
"genre_dummies = pd.get_dummies(genre_exploded, prefix=\"\", prefix_sep=\"\").groupby(Columns.Item).sum()\n",
"\n",
"precision_value = precision.calc(reco=recos, interactions=df_test)\n",
"print(f\"precision: {precision_value}\")\n",
"\n",
"catalog = df_train[Columns.Item].unique()\n",
"serendipity_value = serendipity.calc(\n",
" reco=recos,\n",
" interactions=df_test,\n",
" prev_interactions=df_train,\n",
" catalog=catalog\n",
")\n",
"print(\"Serendipity: \", serendipity_value)\n",
"\n",
"print(\"NDCG: \", ndcg.calc(reco=recos, interactions=df_test))\n",
"\n",
"distance_calculator = PairwiseHammingDistanceCalculator(genre_dummies)\n",
"ild = IntraListDiversity(k=10, distance_calculator=distance_calculator)\n",
"print(\"ILD: \", ild.calc(reco=recos))\n"
],
"metadata": {
"id": "-uNXsWUD1TxJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "H2aMMUEh1me4"
}
},
{
"cell_type": "markdown",
"source": [
"# NLP NIR Natasha"
],
"metadata": {
"id": "YLFxW6OTmkSz"
}
},
{
"cell_type": "code",
"source": [
"!pip install natasha"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SDr6wjRTmtVG",
"outputId": "4fbf4295-a7ba-4682-8198-7f8236b4c262"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting natasha\n",
" Downloading natasha-1.6.0-py3-none-any.whl.metadata (23 kB)\n",
"Collecting pymorphy2 (from natasha)\n",
" Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)\n",
"Collecting razdel>=0.5.0 (from natasha)\n",
" Downloading razdel-0.5.0-py3-none-any.whl.metadata (10.0 kB)\n",
"Collecting navec>=0.9.0 (from natasha)\n",
" Downloading navec-0.10.0-py3-none-any.whl.metadata (21 kB)\n",
"Collecting slovnet>=0.6.0 (from natasha)\n",
" Downloading slovnet-0.6.0-py3-none-any.whl.metadata (34 kB)\n",
"Collecting yargy>=0.16.0 (from natasha)\n",
" Downloading yargy-0.16.0-py3-none-any.whl.metadata (3.5 kB)\n",
"Collecting ipymarkup>=0.8.0 (from natasha)\n",
" Downloading ipymarkup-0.9.0-py3-none-any.whl.metadata (5.6 kB)\n",
"Collecting intervaltree>=3 (from ipymarkup>=0.8.0->natasha)\n",
" Downloading intervaltree-3.1.0.tar.gz (32 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from navec>=0.9.0->natasha) (1.26.4)\n",
"Collecting dawg-python>=0.7.1 (from pymorphy2->natasha)\n",
" Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)\n",
"Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2->natasha)\n",
" Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl.metadata (2.1 kB)\n",
"Collecting docopt>=0.6 (from pymorphy2->natasha)\n",
" Downloading docopt-0.6.2.tar.gz (25 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: sortedcontainers<3.0,>=2.0 in /usr/local/lib/python3.11/dist-packages (from intervaltree>=3->ipymarkup>=0.8.0->natasha) (2.4.0)\n",
"Downloading natasha-1.6.0-py3-none-any.whl (34.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.4/34.4 MB\u001b[0m \u001b[31m32.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading ipymarkup-0.9.0-py3-none-any.whl (14 kB)\n",
"Downloading navec-0.10.0-py3-none-any.whl (23 kB)\n",
"Downloading razdel-0.5.0-py3-none-any.whl (21 kB)\n",
"Downloading slovnet-0.6.0-py3-none-any.whl (46 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.7/46.7 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading yargy-0.16.0-py3-none-any.whl (33 kB)\n",
"Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)\n",
"Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.2/8.2 MB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hBuilding wheels for collected packages: docopt, intervaltree\n",
" Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=1c63601457de054370a39a15b73fae84c828da1412d67838ceaca15fd87d8aec\n",
" Stored in directory: /root/.cache/pip/wheels/1a/b0/8c/4b75c4116c31f83c8f9f047231251e13cc74481cca4a78a9ce\n",
" Building wheel for intervaltree (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26097 sha256=4f8268b1c102f6a686fac1ad2b6f5e31d2e0c2c5f6fe16af2f6177850adac975\n",
" Stored in directory: /root/.cache/pip/wheels/31/d7/d9/eec6891f78cac19a693bd40ecb8365d2f4613318c145ec9816\n",
"Successfully built docopt intervaltree\n",
"Installing collected packages: razdel, pymorphy2-dicts-ru, docopt, dawg-python, pymorphy2, navec, intervaltree, yargy, slovnet, ipymarkup, natasha\n",
"Successfully installed dawg-python-0.7.2 docopt-0.6.2 intervaltree-3.1.0 ipymarkup-0.9.0 natasha-1.6.0 navec-0.10.0 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844 razdel-0.5.0 slovnet-0.6.0 yargy-0.16.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from natasha import (\n",
" Doc,\n",
" Segmenter,\n",
" MorphVocab,\n",
" NewsEmbedding,\n",
" NewsMorphTagger,\n",
" NewsSyntaxParser,\n",
" NewsNERTagger,\n",
")\n",
"\n",
"# Инициализация компонентов Natasha\n",
"segmenter = Segmenter()\n",
"morph_vocab = MorphVocab()\n",
"emb = NewsEmbedding()\n",
"morph_tagger = NewsMorphTagger(emb)\n",
"syntax_parser = NewsSyntaxParser(emb)\n",
"ner_tagger = NewsNERTagger(emb)\n",
"\n",
"# Функция для обработки предложения и извлечения сущностей\n",
"def extract_entities(sentence, sentence_id):\n",
" doc = Doc(sentence)\n",
" doc.segment(segmenter)\n",
" doc.tag_morph(morph_tagger)\n",
" doc.tag_ner(ner_tagger)\n",
"\n",
" entities = []\n",
" for span in doc.spans:\n",
" entities.append({\n",
" 'sentence_id': sentence_id,\n",
" 'entity': span.text,\n",
" 'entity_type': span.type\n",
" })\n",
" return entities\n",
"\n",
"# Пример предложения\n",
"sentence = \"Бурятия и Забайкальский край переданы из Сибирского федерального округа (СФО) в состав Дальневосточного (ДФО). Соответствующий указ подписал президент Владимир Путин, документ опубликован на официальном интернет-портале правовой информации. Этим же указом глава государства поручил руководителю своей администрации утвердить структуру и штатную численность аппаратов полномочных представителей президента в этих двух округах. После исключения Бурятии и Забайкалья в составе СФО остались десять регионов: Алтай, Алтайский край, Иркутская, Кемеровская, Новосибирская, Омская и Томская области, Красноярский край, Тува и Хакасия. Действующим полпредом президента в этом округе является бывший губернатор Севастополя, экс-заместитель командующего Черноморским флотом России Сергей Меняйло. В составе ДФО отныне 11 субъектов. Помимо Бурятии и Забайкалья, это Камчатский, Приморский и Хабаровский края, Амурская, Еврейская автономная, Магаданская и Сахалинская области, а также Якутия и Чукотка. Дальневосточное полпредство возглавляет Юрий Трутнев, совмещающий эту должность с постом вице-премьера в правительстве России. Федеральные округа были созданы в мае 2000 года в соответствии с указом президента Путина.\"\n",
"sentence_id = 1 # ID предложения\n",
"\n",
"# Извлечение сущностей\n",
"entities = extract_entities(sentence, sentence_id)\n",
"\n",
"# Создание DataFrame\n",
"df = pd.DataFrame(entities)\n",
"\n",
"# Вывод результата\n",
"df.head(20)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 677
},
"id": "wK8KTIjdpi4A",
"outputId": "f4364c32-d6c0-4ecc-d1ec-89cebcaa48fb"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" sentence_id entity entity_type\n",
"0 1 Бурятия LOC\n",
"1 1 Забайкальский край LOC\n",
"2 1 Сибирского федерального округа (СФО) LOC\n",
"3 1 Дальневосточного (ДФО) LOC\n",
"4 1 Владимир Путин PER\n",
"5 1 Бурятии LOC\n",
"6 1 Забайкалья LOC\n",
"7 1 СФО LOC\n",
"8 1 Алтай LOC\n",
"9 1 Алтайский край LOC\n",
"10 1 Иркутская LOC\n",
"11 1 Кемеровская LOC\n",
"12 1 Новосибирская LOC\n",
"13 1 Омская LOC\n",
"14 1 Томская области LOC\n",
"15 1 Красноярский край LOC\n",
"16 1 Тува LOC\n",
"17 1 Хакасия LOC\n",
"18 1 Севастополя LOC\n",
"19 1 Черноморским флотом ORG"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" sentence_id | \n",
" entity | \n",
" entity_type | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Бурятия | \n",
" LOC | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Забайкальский край | \n",
" LOC | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" Сибирского федерального округа (СФО) | \n",
" LOC | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" Дальневосточного (ДФО) | \n",
" LOC | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" Владимир Путин | \n",
" PER | \n",
"
\n",
" \n",
" 5 | \n",
" 1 | \n",
" Бурятии | \n",
" LOC | \n",
"
\n",
" \n",
" 6 | \n",
" 1 | \n",
" Забайкалья | \n",
" LOC | \n",
"
\n",
" \n",
" 7 | \n",
" 1 | \n",
" СФО | \n",
" LOC | \n",
"
\n",
" \n",
" 8 | \n",
" 1 | \n",
" Алтай | \n",
" LOC | \n",
"
\n",
" \n",
" 9 | \n",
" 1 | \n",
" Алтайский край | \n",
" LOC | \n",
"
\n",
" \n",
" 10 | \n",
" 1 | \n",
" Иркутская | \n",
" LOC | \n",
"
\n",
" \n",
" 11 | \n",
" 1 | \n",
" Кемеровская | \n",
" LOC | \n",
"
\n",
" \n",
" 12 | \n",
" 1 | \n",
" Новосибирская | \n",
" LOC | \n",
"
\n",
" \n",
" 13 | \n",
" 1 | \n",
" Омская | \n",
" LOC | \n",
"
\n",
" \n",
" 14 | \n",
" 1 | \n",
" Томская области | \n",
" LOC | \n",
"
\n",
" \n",
" 15 | \n",
" 1 | \n",
" Красноярский край | \n",
" LOC | \n",
"
\n",
" \n",
" 16 | \n",
" 1 | \n",
" Тува | \n",
" LOC | \n",
"
\n",
" \n",
" 17 | \n",
" 1 | \n",
" Хакасия | \n",
" LOC | \n",
"
\n",
" \n",
" 18 | \n",
" 1 | \n",
" Севастополя | \n",
" LOC | \n",
"
\n",
" \n",
" 19 | \n",
" 1 | \n",
" Черноморским флотом | \n",
" ORG | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 37,\n \"fields\": [\n {\n \"column\": \"sentence_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 1,\n \"max\": 1,\n \"num_unique_values\": 1,\n \"samples\": [\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"entity\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 34,\n \"samples\": [\n \"\\u041a\\u0440\\u0430\\u0441\\u043d\\u043e\\u044f\\u0440\\u0441\\u043a\\u0438\\u0439 \\u043a\\u0440\\u0430\\u0439\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"entity_type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"LOC\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 25
}
]
},
{
"cell_type": "markdown",
"source": [
"# Обработка + Word2Vec + TF-IDF"
],
"metadata": {
"id": "He3sYIMuYHdj"
}
},
{
"cell_type": "code",
"source": [
"!pip install pymorphy3"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "L4drvZGJZDma",
"outputId": "325b7fbc-ebe3-4c7b-8790-298ad296d167"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting pymorphy3\n",
" Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)\n",
"Collecting dawg2-python>=0.8.0 (from pymorphy3)\n",
" Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)\n",
"Collecting pymorphy3-dicts-ru (from pymorphy3)\n",
" Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)\n",
"Downloading pymorphy3-2.0.3-py3-none-any.whl (53 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.8/53.8 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)\n",
"Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.4/8.4 MB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg2-python, pymorphy3\n",
"Successfully installed dawg2-python-0.9.0 pymorphy3-2.0.3 pymorphy3-dicts-ru-2.4.417150.4580142\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"categories = {\n",
" \"Жизнь человека\": {\n",
" \"description\": \"Задания о ценности человеческой жизни, донорстве, осведомленности о заболеваниях\",\n",
" \"examples\": [\n",
" \"организация донорской акции\",\n",
" \"переливание крови\",\n",
" \"медицинское просвещение\",\n",
" \"профилактика заболеваний\",\n",
" \"первая помощь\"\n",
" ]\n",
" },\n",
" \"Достоинство человека\": {\n",
" \"description\": \"Задания об уважении к профессиям и людям разных социальных статусов\",\n",
" \"examples\": [\n",
" \"мастер-класс о уважении\",\n",
" \"социальный статус\",\n",
" \"профессиональная этика\",\n",
" \"равенство возможностей\",\n",
" \"толерантность\",\n",
" \"\"\n",
" ]\n",
" },\n",
" \"Права и свободы человека\": {\n",
" \"description\": \"Изучение и защита прав человека\",\n",
" \"examples\": [\n",
" \"тест о правах человека\",\n",
" \"конвенция о правах ребенка\",\n",
" \"правовая грамотность\",\n",
" \"защита свобод\",\n",
" \"гражданские права\"\n",
" ]\n",
" },\n",
" \"Патриотизм\": {\n",
" \"description\": \"История государства, предки, патриотические организации\",\n",
" \"examples\": [\n",
" \"Россия\",\n",
" \"Российская федерация\",\n",
" \"СССР\"\n",
" \"посещение военного музея\",\n",
" \"волонтерство 9 мая\",\n",
" \"история великой отечественной\",\n",
" \"патриотический флешмоб\",\n",
" \"герои россии\"\n",
" ]\n",
" },\n",
" \"Гражданственность\": {\n",
" \"description\": \"Процветание общества через личное участие\",\n",
" \"examples\": [\n",
" \"субботник\",\n",
" \"экологическая акция\",\n",
" \"гражданские инициативы\",\n",
" \"благоустройство города\",\n",
" \"общественный контроль\"\n",
" ]\n",
" },\n",
" \"Служение Отечеству и ответственность за его судьбу\": {\n",
" \"description\": \"История отечества и физическая подготовка граждан\",\n",
" \"examples\": [\n",
" \"военно-спортивные игры\",\n",
" \"уроки мужества\",\n",
" \"здоровье нации\",\n",
" \"историческая реконструкция\",\n",
" \"вахта памяти\"\n",
" ]\n",
" },\n",
" \"Высокие нравственные идеалы\": {\n",
" \"description\": \"Культура, идеи и творчество, формирующие мораль\",\n",
" \"examples\": [\n",
" \"обсуждение классической литературы\",\n",
" \"нравственные дилеммы\",\n",
" \"этические нормы\",\n",
" \"моральный выбор\",\n",
" \"духовные ценности\",\n",
" \"история\",\n",
" \"страны\"\n",
" ]\n",
" },\n",
" \"Крепкая семья\": {\n",
" \"description\": \"Совместная семейная деятельность и отношения\",\n",
" \"examples\": [\n",
" \"семейный квест\",\n",
" \"генеалогическое древо\",\n",
" \"совместный пикник\",\n",
" \"семейные традиции\",\n",
" \"родительский день\"\n",
" ]\n",
" },\n",
" \"Созидательный труд\": {\n",
" \"description\": \"Обучение практическим навыкам и физическая помощь\",\n",
" \"examples\": [\n",
" \"мастер-класс по ремеслам\",\n",
" \"помощь пожилым\",\n",
" \"трудовой десант\",\n",
" \"профессиональные пробы\",\n",
" \"социальный труд\"\n",
" ]\n",
" },\n",
" \"Приоритет духовного над материальным\": {\n",
" \"description\": \"Помощь нуждающимся вместо материальных благ\",\n",
" \"examples\": [\n",
" \"благотворительная акция\",\n",
" \"помощь приюту\",\n",
" \"духовные практики\",\n",
" \"волонтерство в храме\",\n",
" \"нематериальные ценности\"\n",
" ]\n",
" },\n",
" \"Гуманизм\": {\n",
" \"description\": \"Помощь людям и добрые дела\",\n",
" \"examples\": [\n",
" \"волонтерство в приюте\",\n",
" \"помощь бездомным\",\n",
" \"добрые письма\",\n",
" \"поддержка инвалидов\",\n",
" \"человеколюбие\"\n",
" ]\n",
" },\n",
" \"Милосердие\": {\n",
" \"description\": \"Помощь окружающим и информирование о болезнях\",\n",
" \"examples\": [\n",
" \"сбор вещей нуждающимся\",\n",
" \"помощь пожилым\",\n",
" \"донорство органов\",\n",
" \"паллиативная помощь\",\n",
" \"социальная поддержка\"\n",
" ]\n",
" },\n",
" \"Справедливость\": {\n",
" \"description\": \"Обсуждение и укрепление справедливости\",\n",
" \"examples\": [\n",
" \"дебаты о равенстве\",\n",
" \"правовые квесты\",\n",
" \"честное распределение\",\n",
" \"борьба с дискриминацией\",\n",
" \"равные возможности\"\n",
" ]\n",
" },\n",
" \"Коллективизм\": {\n",
" \"description\": \"Командная работа и взаимодействие в группе\",\n",
" \"examples\": [\n",
" \"групповой турпоход\",\n",
" \"командный квиз\",\n",
" \"совместный проект\",\n",
" \"работа в команде\",\n",
" \"коллективное решение\"\n",
" ]\n",
" },\n",
" \"Взаимопомощь и взаимоуважение\": {\n",
" \"description\": \"Волонтерство и поддержка окружающих\",\n",
" \"examples\": [\n",
" \"помощь одноклассникам\",\n",
" \"тьюторство\",\n",
" \"шефская помощь\",\n",
" \"социальное наставничество\",\n",
" \"взаимная поддержка\"\n",
" ]\n",
" },\n",
" \"Историческая память и преемственность поколений\": {\n",
" \"description\": \"История семьи и страны, связь поколений\",\n",
" \"examples\": [\n",
" \"интервью с ветеранами\",\n",
" \"семейная летопись\",\n",
" \"музей истории\",\n",
" \"реконструкция событий\",\n",
" \"устная история\"\n",
" ]\n",
" },\n",
" \"Единство народов России\": {\n",
" \"description\": \"Культурное многообразие и национальные традиции\",\n",
" \"examples\": [\n",
" \"фестиваль культур\",\n",
" \"этнографический музей\",\n",
" \"национальные ремесла\",\n",
" \"межнациональный диалог\",\n",
" \"традиционные обычаи\"\n",
" ]\n",
" }\n",
"}"
],
"metadata": {
"id": "97It6DvYZYOP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
"from string import punctuation\n",
"import pymorphy3\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"\n",
"nltk.download('stopwords')\n",
"\n",
"morph = pymorphy3.MorphAnalyzer()\n",
"russian_stopwords = stopwords.words('russian')\n",
"\n",
"def preprocess(text):\n",
" text = text.lower()\n",
" text = re.sub(f'[{punctuation}»«–…№©™•°]', '', text)\n",
" words = re.findall(r'\\b[a-zа-яё]+\\b', text)\n",
" lemmas = []\n",
" for word in words:\n",
" if word not in russian_stopwords and len(word) > 2:\n",
" lemma = morph.parse(word)[0].normal_form\n",
" lemmas.append(lemma)\n",
" return ' '.join(lemmas)\n",
"\n",
"# Создание корпуса документов\n",
"category_docs = []\n",
"for cat, data_s in categories.items():\n",
" doc = f\"{cat} {data_s['description']} {' '.join(data_s['examples'])}\"\n",
" category_docs.append(doc)\n",
"\n",
"# Инициализация TF-IDF\n",
"vectorizer = TfidfVectorizer(tokenizer=preprocess)\n",
"tfidf_matrix = vectorizer.fit_transform(category_docs)\n",
"\n",
"def classify_text(text):\n",
" processed_text = preprocess(text)\n",
"\n",
" text_vector = vectorizer.transform([processed_text])\n",
" print(text_vector)\n",
" similarities = cosine_similarity(text_vector, tfidf_matrix)\n",
" print(similarities)\n",
" best_match_idx = similarities.argmax()\n",
" best_category = list(categories.keys())[best_match_idx]\n",
" accuracy = similarities[0][best_match_idx]\n",
"\n",
"\n",
" return best_category, accuracy"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "MHBcm352YSBS",
"outputId": "95184d17-ba84-43e4-8128-47b4e7160cbb"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"/usr/local/lib/python3.11/dist-packages/sklearn/feature_extraction/text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
" warnings.warn(\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"input_text = \"Организация группового турпохода с одноклассниками и работа в команде\"\n",
"result = classify_text(input_text)\n",
"result"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 408
},
"id": "-MdsShmMZPJV",
"outputId": "6e4e1588-2856-499d-9167-b3ee1805a692"
},
"execution_count": null,
"outputs": [
{
"output_type": "error",
"ename": "ValueError",
"evalue": "setting an array element with a sequence.",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;31mTypeError\u001b[0m: float() argument must be a string or a real number, not 'csr_matrix'",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0minput_text\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Организация группового турпохода с одноклассниками и работа в команде\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclassify_text_x\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_text\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m\u001b[0m in \u001b[0;36mclassify_text_x\u001b[0;34m(text)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;31m# Считаем сходство с каждым классом\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 48\u001b[0;31m similarities = {cat: cosine_similarity(text_vector.reshape(1, -1), cat_vector.reshape(1, -1))[0][0] \n\u001b[0m\u001b[1;32m 49\u001b[0m for cat, cat_vector in category_vectors.items()}\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[0;31m# Считаем сходство с каждым классом\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 48\u001b[0;31m similarities = {cat: cosine_similarity(text_vector.reshape(1, -1), cat_vector.reshape(1, -1))[0][0] \n\u001b[0m\u001b[1;32m 49\u001b[0m for cat, cat_vector in category_vectors.items()}\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 214\u001b[0m )\n\u001b[1;32m 215\u001b[0m ):\n\u001b[0;32m--> 216\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 217\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mInvalidParameterError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0;31m# When the function is just a wrapper around an estimator, we allow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.11/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mcosine_similarity\u001b[0;34m(X, Y, dense_output)\u001b[0m\n\u001b[1;32m 1739\u001b[0m \u001b[0;31m# to avoid recursive import\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1740\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1741\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_pairwise_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1742\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1743\u001b[0m \u001b[0mX_normalized\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnormalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.11/dist-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype, accept_sparse, force_all_finite, ensure_all_finite, ensure_2d, copy)\u001b[0m\n\u001b[1;32m 198\u001b[0m )\n\u001b[1;32m 199\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 200\u001b[0;31m X = check_array(\n\u001b[0m\u001b[1;32m 201\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 202\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_all_finite, ensure_non_negative, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001b[0m\n\u001b[1;32m 1053\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1054\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1055\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_asarray_with_order\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mxp\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mxp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1056\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcomplex_warning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1057\u001b[0m raise ValueError(\n",
"\u001b[0;32m/usr/local/lib/python3.11/dist-packages/sklearn/utils/_array_api.py\u001b[0m in \u001b[0;36m_asarray_with_order\u001b[0;34m(array, dtype, order, copy, xp, device)\u001b[0m\n\u001b[1;32m 837\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 838\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 839\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 840\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 841\u001b[0m \u001b[0;31m# At this point array is a NumPy ndarray. We convert it to an array\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: setting an array element with a sequence."
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Определение похожих запросов"
],
"metadata": {
"id": "GutjrXHHb0_B"
}
},
{
"cell_type": "code",
"source": [
"import torch\n",
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import pandas as pd\n",
"\n",
"# ✅ Загружаем предобученную модель, поддерживающую русский язык\n",
"model = SentenceTransformer(\"sentence-transformers/paraphrase-multilingual-mpnet-base-v2\")\n",
"\n",
"# 📌 Пример запросов\n",
"queries = [\n",
" \"Как приготовить борщ?\",\n",
" \"Рецепт вкусного борща\",\n",
" \"Где найти рецепт лазаньи?\",\n",
" \"Как сварить суп?\"\n",
"]\n",
"\n",
"# 🔥 Получаем эмбеддинги\n",
"query_embeddings = model.encode(queries, convert_to_tensor=True)\n",
"\n",
"# 🔥 Вычисляем матрицу косинусного сходства\n",
"similarity_matrix = cosine_similarity(query_embeddings.cpu().numpy())\n",
"\n",
"# 📌 Выводим результаты\n",
"df = pd.DataFrame(similarity_matrix, index=queries, columns=queries)\n",
"print(df)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 560,
"referenced_widgets": [
"dca4feb8999943458c52402e4821fd9b",
"962de14c45d440f981cab7f98bb999d4",
"e1670826b6984372b3e1f6d38dc775ce",
"3cd0ed60902944d39d3f0c1acd41cdc3",
"a73eb0974895406f923d3db27483084f",
"c7645ed1ba1a4401be3523e96b611adb",
"d5d2baac77dd489c911e851aea84b2e8",
"de9347d60d8b43888b3603bbfbceb2af",
"a627519796034c88a763f9cd2ec0808e",
"7a868d4c9e3f4a70961c2d056d6a5653",
"b2b43b9c16664528a1c55e66805eff76",
"d02d64dc3c9a499491861bef90086d82",
"7aad2011378e434eac5d46c19a117631",
"208acf230279450ca859204736946a28",
"641efe28133240ccbcff70a3d4424249",
"95152a8da2dd4333afeb475aec2b3def",
"0388f73993794aa3aba737c2ff7a06e7",
"89414b1d7b06439c8bdb33948be5d11e",
"d1f4213d8cbe4416a26520c67b8898ef",
"d3d80ca0e7dd4ec396e96892ec86e17c",
"8843fcd259744d94979cc460e24f5ba3",
"0de221a4d2d644c885c0d3f82f8b924c",
"9ed3dd803a884c52aff984be748b0fa2",
"d8d8af05c11e4ecfa4c75f9eea5f581a",
"134b61665b5f42ebaa18710e78e4b37a",
"8257e6f8f14c4754a938dd2ac67002aa",
"c16eb49089ee4cbfb42b58b09c529608",
"be6a7355b7be437586bb693fecae802d",
"4969a761d9f34a62b5502683ad93a723",
"f39aa003b8b243efa9992302d9c39ecb",
"6556302f910349f7aea105bcc4a4e516",
"819d6c6459fc4a71bf37cce6daf4cd04",
"6a5b78e5f5834238b48b3b893d1a797c",
"d2acc67c459840c5872a0aed088a673a",
"b22bd6adc4944ac2a8223d1fc54e278e",
"9eed2253e2794b96b078174c26f1599e",
"9c12caeb617e4a079bb4960698a9addf",
"f96eeb6fb26243a0988eed5281101c36",
"543c6458885e4c92911ac85a6bceff99",
"be795953cbe942d891d3e2e1a857b1c7",
"cffa853377644f2f8a8bba31795146a0",
"08a68525c0174d77b8fa1f7a16b37f3d",
"e5f8df883014424997370c858ca75119",
"980b8f08ae1f441ba4fcd9fec7bba172",
"fb399b6ccf14488e8f3a5ee25ba1f519",
"9689a4b5fc3443c988355d51fb8b31dd",
"07c7545951bd45838b37656a7d55c4f5",
"375f551994f141a197d717059513a15a",
"f8e89fd727a747d1a205f0ea3452a94a",
"b0d6e4b6564c46348efe5ee5e24a5a36",
"ab1c746c9a414d16a11db2ecad0797ab",
"7297eacf7ca34b3bba0029375b8c811a",
"0075b2bdafa04862b6a871356351dfa1",
"64db601a4cca46a882e32f771c3dba3d",
"80946e3d99314550b777b8a62ffe0521",
"f810aade9a2943aaa5949f30774ab26f",
"ea5a3b55a5bc450aa0f92c88625778f8",
"ba21fc338bfc4269bef02e50c0711b7c",
"5a7c1b813c5b4ceea534f74765c5e605",
"894779b9a1384b668593ae969055c32c",
"75d1b0ba8e7048cfb30e507ec3f24ff8",
"59dda900f0894c54b6ce7302e761ad43",
"bd87f489e77d44d9b1e066f16c0ca9ed",
"4e0bd61e94594f5db415e7d5315de7c0",
"c239ef650cb54478825730509e2a1bd2",
"ee1503ea36ab41d2a48155fbe442d909",
"8798e6b2ae124152bb6cca66b14c0d3a",
"ba5e2e03809a428796f8905392eae113",
"27067ff09cbe4ad091ffdd18ea30bc17",
"d722357cd295474290b5dc0ebd704f4c",
"d89ed637839f4c9abd4268f3d71f3eab",
"63c52b32dbae41b4a82020cbf030ddb4",
"3adaba744a5740318b2cf4188429acc0",
"cc0fc21f8f2d43b4859f1886a33cf0d9",
"d696138c3379401fae995f49d0d4cd48",
"e291c590d5ad4f62b29f65a451fdea09",
"f42a6bf637f846659b0f7d7d198e6eb4",
"4a74fa41f5d8471e8db6f79ee06d88a1",
"df3a741ce3334174a1ba1c185bcf7e4d",
"27e51c8b4ed54557987dc6362be11609",
"af075877b5864bd3b6f98b13695f1e0d",
"b8935da2360a4a8f8be0f1cf12049704",
"0007b3003cf94edfbe879278e68ad521",
"91d456c10ac14ddda54da8f6b0d6c45a",
"304f170991ee4e538908b36bc5ee5c71",
"e1cebdb893b64ef0ac977255e216143a",
"b7327644f1a14c9f9b620747902501f1",
"ef7f978170c746f09aaa8f1450c02548",
"757581a0b5ca4ab5af853223f95e21e4",
"ace6230a55254740919ad2ee94309bff",
"569e73779e594bc0acb72bdcec240c26",
"506eec7a17cd4657a36d3970c81689b0",
"d0cb73af415c49f8887fe530cb58e218",
"b285870fcfd04f13a064b4f41bcdc15c",
"3b63a5eb40bc46dbb59ddf0dab56f961",
"39abfc34a8de449ab918374cd617983d",
"6dbd7ac42eae437092baca2fcc660b50",
"31434f024801490aa52a110bb0814510",
"61f56526d84e409d879cfd9935eb5084",
"ffdf2ed2b8d2412997a9845a85d1f4ab",
"5aa4451625f04032a62b35829b5cbfa3",
"e8202c4e66814d07bf4e7e980e930a90",
"a54eef1414914da0981e3037d19e79bd",
"8dcd120dfe98453b9f4e9cb145696940",
"1741b6afdc08426ea2f7058457413999",
"c87cf21dbf074c699c1c57aa604940e6",
"3dec68173930485d82b3a0d53deda040",
"320ea31c60b34983bf6b9c1a6d258eeb",
"1d7d3a59a8cc4e03825aebbd9fa8bdb6",
"07215f1e31c44c8893b096808be4d96e",
"438a2b975062455fb5fc06eb3cdaf625",
"0445438e0b9c4ac680f5e82183708492",
"5d0f54bb9df64b8786538424532d6397",
"a325f3831f664a0c96061d16e1d9c358",
"aac58743d8d64b8591abe9d421a43565",
"aa49127f1f4d4b168a5d5b69dc7fed0d",
"fe79e4d23eb64d82bd818ac4c369303a",
"3d60222523434c1fbc1647b104d2af20",
"44dba6a745254c3ab17a76ba184b4ecd",
"d530476ab4934024a5753e5a520d7772",
"6bc1fefb830a4636b06a0f692bf7d71b"
]
},
"id": "oED-a0bsb8ap",
"outputId": "b894e758-5c3d-4a9d-8793-36a9bf77bbb1"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"modules.json: 0%| | 0.00/229 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "dca4feb8999943458c52402e4821fd9b"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"config_sentence_transformers.json: 0%| | 0.00/122 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "d02d64dc3c9a499491861bef90086d82"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"README.md: 0%| | 0.00/4.13k [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "9ed3dd803a884c52aff984be748b0fa2"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"sentence_bert_config.json: 0%| | 0.00/53.0 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "d2acc67c459840c5872a0aed088a673a"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"config.json: 0%| | 0.00/723 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "fb399b6ccf14488e8f3a5ee25ba1f519"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"model.safetensors: 0%| | 0.00/1.11G [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "f810aade9a2943aaa5949f30774ab26f"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer_config.json: 0%| | 0.00/402 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "8798e6b2ae124152bb6cca66b14c0d3a"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"sentencepiece.bpe.model: 0%| | 0.00/5.07M [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "4a74fa41f5d8471e8db6f79ee06d88a1"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"tokenizer.json: 0%| | 0.00/9.08M [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "757581a0b5ca4ab5af853223f95e21e4"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"special_tokens_map.json: 0%| | 0.00/239 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "ffdf2ed2b8d2412997a9845a85d1f4ab"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"1_Pooling%2Fconfig.json: 0%| | 0.00/190 [00:00, ?B/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "438a2b975062455fb5fc06eb3cdaf625"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
" Как приготовить борщ? Рецепт вкусного борща \\\n",
"Как приготовить борщ? 1.000000 0.809563 \n",
"Рецепт вкусного борща 0.809563 1.000000 \n",
"Где найти рецепт лазаньи? 0.617756 0.644937 \n",
"Как сварить суп? 0.768549 0.594278 \n",
"\n",
" Где найти рецепт лазаньи? Как сварить суп? \n",
"Как приготовить борщ? 0.617756 0.768549 \n",
"Рецепт вкусного борща 0.644937 0.594278 \n",
"Где найти рецепт лазаньи? 1.000000 0.557159 \n",
"Как сварить суп? 0.557159 1.000000 \n"
]
}
]
},
{
"cell_type": "code",
"source": [
"df.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 175
},
"id": "VmZEvdLecCHc",
"outputId": "0f175449-e624-4c5b-b640-e855fd1f62a4"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" Как приготовить борщ? Рецепт вкусного борща \\\n",
"Как приготовить борщ? 1.000000 0.809563 \n",
"Рецепт вкусного борща 0.809563 1.000000 \n",
"Где найти рецепт лазаньи? 0.617756 0.644937 \n",
"Как сварить суп? 0.768549 0.594278 \n",
"\n",
" Где найти рецепт лазаньи? Как сварить суп? \n",
"Как приготовить борщ? 0.617756 0.768549 \n",
"Рецепт вкусного борща 0.644937 0.594278 \n",
"Где найти рецепт лазаньи? 1.000000 0.557159 \n",
"Как сварить суп? 0.557159 1.000000 "
],
"text/html": [
"\n",
" \n",
" \n",
"\n",
" \n",
" \n",
" \n",
" | \n",
" Как приготовить борщ? | \n",
" Рецепт вкусного борща | \n",
" Где найти рецепт лазаньи? | \n",
" Как сварить суп? | \n",
" \n",
" \n",
" \n",
" \n",
" Как приготовить борщ? | \n",
" 1.000000 | \n",
" 0.809563 | \n",
" 0.617756 | \n",
" 0.768549 | \n",
" \n",
" \n",
" Рецепт вкусного борща | \n",
" 0.809563 | \n",
" 1.000000 | \n",
" 0.644937 | \n",
" 0.594278 | \n",
" \n",
" \n",
" Где найти рецепт лазаньи? | \n",
" 0.617756 | \n",
" 0.644937 | \n",
" 1.000000 | \n",
" 0.557159 | \n",
" \n",
" \n",
" Как сварить суп? | \n",
" 0.768549 | \n",
" 0.594278 | \n",
" 0.557159 | \n",
" 1.000000 | \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" \n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "df",
"summary": "{\n \"name\": \"df\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"\\u041a\\u0430\\u043a \\u043f\\u0440\\u0438\\u0433\\u043e\\u0442\\u043e\\u0432\\u0438\\u0442\\u044c \\u0431\\u043e\\u0440\\u0449?\",\n \"properties\": {\n \"dtype\": \"float32\",\n \"num_unique_values\": 4,\n \"samples\": [\n 0.8095631003379822,\n 0.7685485482215881,\n 0.9999996423721313\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"\\u0420\\u0435\\u0446\\u0435\\u043f\\u0442 \\u0432\\u043a\\u0443\\u0441\\u043d\\u043e\\u0433\\u043e \\u0431\\u043e\\u0440\\u0449\\u0430\",\n \"properties\": {\n \"dtype\": \"float32\",\n \"num_unique_values\": 4,\n \"samples\": [\n 0.9999998807907104,\n 0.5942782163619995,\n 0.8095631003379822\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"\\u0413\\u0434\\u0435 \\u043d\\u0430\\u0439\\u0442\\u0438 \\u0440\\u0435\\u0446\\u0435\\u043f\\u0442 \\u043b\\u0430\\u0437\\u0430\\u043d\\u044c\\u0438?\",\n \"properties\": {\n \"dtype\": \"float32\",\n \"num_unique_values\": 4,\n \"samples\": [\n 0.6449370980262756,\n 0.5571587085723877,\n 0.6177558302879333\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"\\u041a\\u0430\\u043a \\u0441\\u0432\\u0430\\u0440\\u0438\\u0442\\u044c \\u0441\\u0443\\u043f?\",\n \"properties\": {\n \"dtype\": \"float32\",\n \"num_unique_values\": 4,\n \"samples\": [\n 0.5942782163619995,\n 1.000000238418579,\n 0.7685485482215881\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {},
"execution_count": 3
}
]
},
{
"cell_type": "markdown",
"source": [
"# rubert (semi-final version; sentiment analysys/text classification)"
],
"metadata": {
"id": "u9NimG14vOGS"
}
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from transformers import BertTokenizer, BertForSequenceClassification\n",
"from torch.utils.data import DataLoader, Dataset\n",
"import torch\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import f1_score"
],
"metadata": {
"id": "gQCIOMBERGSm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train=pd.read_csv(r'C:\\VisualCode\\T1\\tr.csv', engine='python', encoding='utf-8', on_bad_lines=\"skip\")\n",
"test=pd.read_csv(r'C:\\VisualCode\\T1\\ts.csv', engine='python', encoding='utf-8', on_bad_lines = \"skip\")\n",
"\n",
"# Разделение данных на признаки и метки\n",
"X_train, X_val, y_train, y_val = train_test_split(train['review'], train['sentiment'], test_size=0.2, random_state=42)"
],
"metadata": {
"id": "-QcGnSdXRLV1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"class ReviewsDataset(Dataset):\n",
" def __init__(self, reviews, labels=None, tokenizer=None, max_len=128):\n",
" self.reviews = reviews\n",
" self.labels = labels\n",
" self.tokenizer = tokenizer\n",
" self.max_len = max_len\n",
"\n",
" def __len__(self):\n",
" return len(self.reviews)\n",
"\n",
" def __getitem__(self, idx):\n",
" review = self.reviews.iloc[idx]\n",
" inputs = self.tokenizer.encode_plus(\n",
" review,\n",
" add_special_tokens=True,\n",
" max_length=self.max_len,\n",
" truncation=True,\n",
" padding='max_length',\n",
" return_tensors='pt'\n",
" )\n",
" item = {key: val.squeeze(0) for key, val in inputs.items()}\n",
" if self.labels is not None:\n",
" item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)\n",
" return item\n"
],
"metadata": {
"id": "qIaQzApYRRv1"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"tokenizer = BertTokenizer.from_pretrained(\"DeepPavlov/rubert-base-cased\")\n",
"model = BertForSequenceClassification.from_pretrained(\n",
" \"DeepPavlov/rubert-base-cased\",\n",
" num_labels=3 # У нас три класса: 0, 1, 2\n",
")"
],
"metadata": {
"id": "qvgwDYwURVWX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_dataset = ReviewsDataset(X_train, y_train, tokenizer)\n",
"val_dataset = ReviewsDataset(X_val, y_val, tokenizer)\n",
"test_dataset = ReviewsDataset(test['review'], tokenizer=tokenizer)\n",
"\n",
"train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n",
"val_loader = DataLoader(val_dataset, batch_size=16)\n",
"test_loader = DataLoader(test_dataset, batch_size=16)"
],
"metadata": {
"id": "JvYRnhONRYcD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from transformers import AdamW\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"model = model.to(device)\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=2e-5)\n",
"\n",
"# Функция обучения\n",
"def train_epoch(model, data_loader, optimizer, device):\n",
" model.train()\n",
" total_loss = 0\n",
" for batch in data_loader:\n",
" inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}\n",
" labels = batch['labels'].to(device)\n",
"\n",
" outputs = model(**inputs, labels=labels)\n",
" loss = outputs.loss\n",
" total_loss += loss.item()\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
" return total_loss / len(data_loader)"
],
"metadata": {
"id": "jiY9w624RZ1_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import time\n",
"\n",
"epochs = 3 # Можно увеличить для лучшего результата\n",
"\n",
"for epoch in range(epochs):\n",
" start_time = time.time()\n",
"\n",
" train_loss = train_epoch(model, train_loader, optimizer, device)\n",
"\n",
" end_time = time.time()\n",
" epoch_duration = end_time - start_time\n",
"\n",
" hours = int(epoch_duration // 3600)\n",
" minutes = int((epoch_duration % 3600) // 60)\n",
" seconds = int(epoch_duration % 60)\n",
"\n",
" torch.save(model.state_dict(), f\"model_epoch_{epoch+1}.pth\")\n",
" print(f\"Эпоха {epoch + 1}/{3} завершена.\")\n",
" print(f\"Потеря: {train_loss:.4f}, Время выполнения: {epoch_duration:.2f} секунд.\")\n"
],
"metadata": {
"id": "PumWrG3ZRb-p"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def predict(model, data_loader, device):\n",
" model.eval()\n",
" predictions = []\n",
" with torch.no_grad():\n",
" for batch in data_loader:\n",
" inputs = {key: val.to(device) for key, val in batch.items()}\n",
" outputs = model(**inputs)\n",
" logits = outputs.logits\n",
" preds = torch.argmax(logits, dim=1).cpu().numpy()\n",
" predictions.extend(preds)\n",
" return predictions\n",
"\n",
"# Предсказания\n",
"test['sentiment'] = predict(model, test_loader, device)\n"
],
"metadata": {
"id": "rpg4htSyRfGX"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"ans = test[['index', 'sentiment']]\n",
"ans.head(3)\n",
"ans.to_csv('poputka_0.csv', index = False)"
],
"metadata": {
"id": "euO3RuywRhVF"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# RuBert + Distilbert (for classification and sentiment analysys)"
],
"metadata": {
"id": "MSgsUE3a7F6V"
}
},
{
"cell_type": "code",
"source": [
"from transformers import RobertaTokenizer, RobertaForSequenceClassification\n",
"from datasets import load_dataset\n",
"from transformers import Trainer, TrainingArguments\n",
"import torch\n",
"\n",
"# Загрузка данных\n",
"train_dataset = load_dataset('csv', data_files='train.csv')['train']\n",
"test_dataset = load_dataset('csv', data_files='test.csv')['test']\n",
"\n",
"# Токенизация\n",
"tokenizer = RobertaTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')\n",
"\n",
"def tokenize_function(examples):\n",
" return tokenizer(examples['text'], padding='max_length', truncation=True)\n",
"\n",
"train_dataset = train_dataset.map(tokenize_function, batched=True)\n",
"test_dataset = test_dataset.map(tokenize_function, batched=True)\n",
"\n",
"# Загрузка модели для классификации текста\n",
"model = RobertaForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=3) # например, 3 класса\n",
"\n",
"# Аргументы для тренировки\n",
"training_args = TrainingArguments(\n",
" output_dir='./results',\n",
" num_train_epochs=3,\n",
" per_device_train_batch_size=8,\n",
" per_device_eval_batch_size=16,\n",
" evaluation_strategy=\"epoch\",\n",
" logging_dir='./logs',\n",
")\n",
"\n",
"# Создаем Trainer\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=test_dataset,\n",
")\n",
"\n",
"# Обучаем модель\n",
"trainer.train()\n",
"\n",
"# Получаем предсказания\n",
"predictions = trainer.predict(test_dataset)\n",
"pred_labels = torch.argmax(predictions.predictions, axis=1)\n",
"print(pred_labels)\n"
],
"metadata": {
"id": "vm4LZTDAJ3R9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from transformers import DistilBertTokenizer, DistilBertForSequenceClassification\n",
"from transformers import Trainer, TrainingArguments\n",
"from datasets import load_dataset\n",
"import torch\n",
"\n",
"# Загружаем датасет\n",
"train_dataset = load_dataset('csv', data_files='train.csv')['train']\n",
"test_dataset = load_dataset('csv', data_files='test.csv')['test']\n",
"\n",
"# Токенизация\n",
"tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')\n",
"\n",
"def tokenize_function(examples):\n",
" return tokenizer(examples['text'], padding='max_length', truncation=True)\n",
"\n",
"train_dataset = train_dataset.map(tokenize_function, batched=True)\n",
"test_dataset = test_dataset.map(tokenize_function, batched=True)\n",
"\n",
"# Загружаем модель для классификации текста\n",
"model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3) # 3 класса для примера\n",
"\n",
"# Аргументы для тренировки\n",
"training_args = TrainingArguments(\n",
" output_dir='./results',\n",
" num_train_epochs=3,\n",
" per_device_train_batch_size=8,\n",
" per_device_eval_batch_size=16,\n",
" evaluation_strategy=\"epoch\",\n",
" logging_dir='./logs',\n",
")\n",
"\n",
"# Создаем Trainer\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=test_dataset,\n",
")\n",
"\n",
"# Обучаем модель\n",
"trainer.train()\n",
"\n",
"# Получаем предсказания\n",
"predictions = trainer.predict(test_dataset)\n",
"pred_labels = torch.argmax(predictions.predictions, axis=1)\n",
"print(pred_labels)\n"
],
"metadata": {
"id": "3ahKB4_NP4zY"
},
"execution_count": null,
"outputs": []
}
]
} |