diff --git "a/Code_for_training_.ipynb" "b/Code_for_training_.ipynb" new file mode 100644--- /dev/null +++ "b/Code_for_training_.ipynb" @@ -0,0 +1,13973 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "eiQ3FKJDhBW0" + }, + "source": [ + "NLP EXERCISE: TEXT-CLASSIFICATION ON A DRUG DATASET.\n", + "\n", + "DATA WRANGLING, FINE-TUNING AND PUSHING THE MODEL TO THE HUGGING FACE HUB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HedrkDuchBW2" + }, + "outputs": [], + "source": [ + "#let's install the necessary libraries from Hugging Face\n", + "!pip install datasets evaluate transformers[sentencepiece] transformers[torch]" + ] + }, + { + "cell_type": "code", + "source": [ + "from huggingface_hub import notebook_login\n", + "#authenticate with the Hugging Face Hub\n", + "notebook_login()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 145, + "referenced_widgets": [ + "d02e8f7bdeb145d4bae8ebcbceab2e72", + "6db4a180376840d6aa8e38c46033cc93", + "893c0a9c9b51450d99c360a44aee61d4", + "3def0a68df05423baf2effef91ff64d0", + "3b78ea24be0c42ea8074a48ae4671e12", + "50b3cf9cb0c742f2907423f456e9863f", + "d252a4002a8d434db262f5056860c5c9", + "319deda6147e44ac9e1f53a75f6fdd41", + "bfe9fdcac47c4ba8b26b54a8d588afd7", + "ba29ea12a9ce4a099992df57d4b851c2", + "c5bb554a9f0e4b9b8178a25ed1fb1344", + "9d6442503c794161bf927884aaac959e", + "69f29945f16842d3bffea0b0dd50921e", + "948a520ec0e742868f2f61f00c329f0f", + "b9d706620bf24817b391cd76ee0a0866", + "a536259beed540aabdabbe694515eb56", + "b5089fdb55134b7a840d882b2488b658", + "2d7b18bfad5c45ccbce5edbc8adc364c", + "4515e7be5a5d4e0a83e7d3717eebe626", + "0ac78e74c26044faa5a484cf5e658856", + "a9cc217968cb447e875c25a60472bfea", + "4e2105e83c4743428555be2aa7f4076e", + "1160eec48195412ab7f7f9d29dcf4673", + "6b1e242e660649f0b00d75c419569e25", + "04c5b7bde3b840919d3ad3e1e04f7c6f", + "9fffc2e7f3654cc3addfe5a786d32620", + "120d065f31614ae395a30f3fdadf5b79", 
+ "dd54337179ed44fca73911f7177896a7", + "fc4149e666d940fead7acb14b6ed0ff7", + "da5f42f0817e407bbd9cb7001970d92b", + "2d508ef9a1304e40b725c03774c4a8d6", + "241d70158ae740f0ba73be9f2b633801" + ] + }, + "id": "Z4IolwVzaQQh", + "outputId": "c5901a51-7f5f-4ad9-b90d-0cbb1b8c381a" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "VBox(children=(HTML(value='
Epoch | \n", + "Training Loss | \n", + "Validation Loss | \n", + "Accuracy | \n", + "Precision | \n", + "Recall | \n", + "F1 | \n", + "
---|---|---|---|---|---|---|
1 | \n", + "0.846900 | \n", + "0.827508 | \n", + "0.767283 | \n", + "0.768594 | \n", + "0.767283 | \n", + "0.755066 | \n", + "
2 | \n", + "0.631900 | \n", + "0.689465 | \n", + "0.809415 | \n", + "0.808975 | \n", + "0.809415 | \n", + "0.797826 | \n", + "
3 | \n", + "0.411600 | \n", + "0.667829 | \n", + "0.837578 | \n", + "0.832482 | \n", + "0.837578 | \n", + "0.831665 | \n", + "
" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=40170, training_loss=0.7559129410606727, metrics={'train_runtime': 4322.1673, 'train_samples_per_second': 74.352, 'train_steps_per_second': 9.294, 'total_flos': 3.0660538964049024e+16, 'train_loss': 0.7559129410606727, 'epoch': 3.0})" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "language_info": { + "name": "python" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "d02e8f7bdeb145d4bae8ebcbceab2e72": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a9cc217968cb447e875c25a60472bfea", + "IPY_MODEL_4e2105e83c4743428555be2aa7f4076e", + "IPY_MODEL_1160eec48195412ab7f7f9d29dcf4673", + 
"IPY_MODEL_6b1e242e660649f0b00d75c419569e25" + ], + "layout": "IPY_MODEL_d252a4002a8d434db262f5056860c5c9" + } + }, + "6db4a180376840d6aa8e38c46033cc93": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_319deda6147e44ac9e1f53a75f6fdd41", + "placeholder": "", + "style": "IPY_MODEL_bfe9fdcac47c4ba8b26b54a8d588afd7", + "value": "