{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a1f0c3d2",
   "metadata": {},
   "source": [
    "# Mental-health FAQ responder\n",
    "\n",
    "Fits a TF-IDF vectorizer and a classifier on question/response pairs read from `data_train.csv`, then answers a typed question with the response the classifier predicts.\n",
    "\n",
    "Run top to bottom on a fresh kernel (Restart Kernel, then Run All). Required files: `data_train.csv` and `data.pickle`. `vectorizer.pkl` and `model.pkl` are used when present; otherwise fresh estimators are created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ace57031",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import pickle\n",
    "from pathlib import Path\n",
    "\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "import pandas as pd\n",
    "\n",
    "# numpy, torch and tensorflow are not referenced by the cells below; the imports\n",
    "# are retained (in their original order) so the kernel namespace is unchanged.\n",
    "import numpy as np\n",
    "import torch\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7e2d914",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: every path and tunable value used by the notebook.\n",
    "TRAIN_CSV = Path(\"data_train.csv\")\n",
    "DATA_PICKLE = Path(\"data.pickle\")\n",
    "VECTORIZER_PICKLE = Path(\"vectorizer.pkl\")\n",
    "MODEL_PICKLE = Path(\"model.pkl\")\n",
    "\n",
    "TEST_SIZE = 0.2  # fraction of rows held out for evaluation\n",
    "SEED = 42  # fixes the train/test split so reruns are comparable"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3a9f0e1",
   "metadata": {},
   "source": [
    "## Load data\n",
    "\n",
    "Column 0 of the CSV is used as the question id, column 1 as the question and column 2 as the response."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4b8a672",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_qa_rows(csv_path):\n",
    "    \"\"\"Parse the training CSV into parallel lists of ids, questions and responses.\n",
    "\n",
    "    ``csv.reader`` is used instead of ``line.split(\",\")`` so that quoted fields\n",
    "    containing commas or line breaks are kept intact rather than truncated.\n",
    "\n",
    "    Args:\n",
    "        csv_path: path of the CSV file to read.\n",
    "\n",
    "    Returns:\n",
    "        A tuple ``(ids, questions, responses, skipped)``. The first three are\n",
    "        lists of strings taken from columns 0, 1 and 2 of every row that has at\n",
    "        least three fields; ``skipped`` counts the rows that had fewer.\n",
    "    \"\"\"\n",
    "    ids, asked, answered = [], [], []\n",
    "    skipped = 0\n",
    "    # newline=\"\" lets the csv module handle line breaks inside quoted fields.\n",
    "    with open(csv_path, \"r\", newline=\"\") as handle:\n",
    "        for row in csv.reader(handle):\n",
    "            if len(row) < 3:\n",
    "                # Short or blank row: count it instead of silently swallowing an error.\n",
    "                skipped += 1\n",
    "                continue\n",
    "            ids.append(row[0])\n",
    "            asked.append(row[1])\n",
    "            answered.append(row[2])\n",
    "    return ids, asked, answered, skipped"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5c7b183",
   "metadata": {},
   "outputs": [],
   "source": [
    "q_id, questions, responses, skipped_rows = load_qa_rows(TRAIN_CSV)\n",
    "assert questions, f\"no usable rows found in {TRAIN_CSV}\"\n",
    "\n",
    "# NOTE(review): a header row, if the CSV has one, is loaded as an ordinary sample\n",
    "# (the previous line-splitting loader did the same) -- confirm and drop it if needed.\n",
    "data = pd.DataFrame({\"question_id\": q_id, \"question\": questions, \"response\": responses})\n",
    "print(f\"loaded {len(data)} rows, skipped {skipped_rows} short rows\")\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6d6c294",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.\n",
    "# These four values are not used by the cells below; they are loaded as before so\n",
    "# they remain available in the kernel.\n",
    "with open(DATA_PICKLE, \"rb\") as handle:\n",
    "    words, labels, training, output = pickle.load(handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f51e39d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('missing values:', data.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d697a39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame.info() prints its report itself and returns None, so wrapping it in\n",
    "# print() would add a stray \"None\" line to the output.\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a1b2c3d",
   "metadata": {},
   "source": [
    "## Train\n",
    "\n",
    "The questions are split first and the vectorizer is fitted on the training questions only, so nothing from the held-out questions influences the features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b38b2394",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_or_create(pickle_path, factory):\n",
    "    \"\"\"Return the object pickled at ``pickle_path``, or ``factory()`` if the file is absent.\n",
    "\n",
    "    Args:\n",
    "        pickle_path: location of a previously pickled estimator.\n",
    "        factory: zero-argument callable that builds a fresh estimator.\n",
    "\n",
    "    Returns:\n",
    "        The unpickled object when the file exists, otherwise ``factory()``.\n",
    "    \"\"\"\n",
    "    if Path(pickle_path).exists():\n",
    "        # SECURITY: pickle.load can execute arbitrary code; only load trusted files.\n",
    "        with open(pickle_path, \"rb\") as handle:\n",
    "            return pickle.load(handle)\n",
    "    return factory()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5dde0e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 2: split the raw text before vectorizing; SEED makes the split reproducible.\n",
    "questions_train, questions_test, y_train, y_test = train_test_split(\n",
    "    questions, responses, test_size=TEST_SIZE, random_state=SEED\n",
    ")\n",
    "\n",
    "# Step 3: reuse the pickled estimators when they exist, otherwise start from defaults.\n",
    "vectorizer = load_or_create(VECTORIZER_PICKLE, TfidfVectorizer)\n",
    "model = load_or_create(MODEL_PICKLE, LogisticRegression)\n",
    "\n",
    "# Step 4: fit on the training questions only, then project the held-out questions.\n",
    "X_train = vectorizer.fit_transform(questions_train)\n",
    "X_test = vectorizer.transform(questions_test)\n",
    "model.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b2c3d4e",
   "metadata": {},
   "source": [
    "## Evaluate\n",
    "\n",
    "Accuracy is an exact string match between the predicted and the true response. A response that occurs only in the held-out rows was never seen as a label during training and cannot be predicted, so a low score is expected when most responses are unique."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c3d4e5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 5: evaluate the model on the held-out questions.\n",
    "y_pred = model.predict(X_test)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(\"Accuracy:\", accuracy)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3d4e5f60",
   "metadata": {},
   "source": [
    "## Ask a question\n",
    "\n",
    "The next cell waits for keyboard input, so the prediction below it depends on what is typed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14406312",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_question = input(\"Ask me anything : \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b9198db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Step 6: vectorize the typed question with the fitted vectorizer and predict a response.\n",
    "new_question_vector = vectorizer.transform([new_question])\n",
    "prediction = model.predict(new_question_vector)\n",
    "print(\"Prediction:\", prediction)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}