{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "ace57031",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "import pickle\n",
    "\n",
    "# from transformers import AutoTokenizer, AutoModelForSequenceClassification,BertTokenizer, TFBertForSequenceClassification\n",
    "# from huggingface_hub import notebook_login\n",
    "import tensorflow as tf\n",
    "# notebook_login()\n",
    "# Step 1: Collect and preprocess data\n",
    "# Get all the questions from Questions column and responses from Questions column in the dataset data.csv\n",
    "# questions = data[\"Questions\"].tolist()\n",
    "# responses = data[\"Responses\"].tolist()\n",
    "questions = []\n",
    "responses = []\n",
    "q_id = []\n",
    "with open(\"data_train.csv\", \"r\") as f:\n",
    "    for line in f:\n",
    "        \n",
    "        array = line.split(\",\") \n",
    "        # questions.append(question)\n",
    "        # responses.append(response)\n",
    "        # q_id.append(question_id)\n",
    "        try:\n",
    "            question = array[1]\n",
    "            response = array[2]\n",
    "            question_id = array[0]\n",
    "            questions.append(question)\n",
    "            responses.append(response)\n",
    "            q_id.append(question_id)\n",
    "        except:\n",
    "            pass\n",
    "\n",
    "# data = pd.read_csv(\"data_train.csv\")\n",
    "# data.tail()\n",
    "\n",
    "with open(\"data.pickle\", \"rb\") as f:\n",
    "#   data = pd.read_pickle(f)\n",
    "  words, labels, training, output = pickle.load(f)\n",
    "\n",
    "# data = pd.read_pickle(\"data.pickle\")\n",
    "# data\n",
    "# type(data)\n",
    "# show shape of tuple\n",
    "# print(len(data))\n",
    "# df = pd.DataFrame(data)\n",
    "# df.to_csv('data_train.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8f51e39d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "missing values: 0      0\n",
      "1      0\n",
      "2      0\n",
      "3      0\n",
      "4      0\n",
      "      ..\n",
      "274    3\n",
      "275    3\n",
      "276    3\n",
      "277    3\n",
      "278    3\n",
      "Length: 279, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print('missing values:', data.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1d697a39",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4 entries, 0 to 3\n",
      "Columns: 279 entries, 0 to 278\n",
      "dtypes: object(279)\n",
      "memory usage: 8.8+ KB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "print(data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b38b2394",
   "metadata": {},
   "outputs": [],
   "source": [
    "# vectorize the data with question and response then save model as model.pkl and vectorizer as vectorizer.pkl\n",
    "vectorizer = TfidfVectorizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c5dde0e4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(148, 252)\n",
      "Accuracy: 0.0\n"
     ]
    }
   ],
   "source": [
    "# print(questions)\n",
    "# print(responses)\n",
    "\n",
    "\n",
    "# questions = [\"What are some symptoms of depression?\",\n",
    "#              \"How can I manage my anxiety?\",\n",
    "#              \"What are the treatments for bipolar disorder?\"]\n",
    "# responses = [\"Symptoms of depression include sadness, lack of energy, and loss of interest in activities.\",\n",
    "#              \"You can manage your anxiety through techniques such as deep breathing, meditation, and therapy.\",\n",
    "#              \"Treatments for bipolar disorder include medication, therapy, and lifestyle changes.\"]\n",
    "\n",
    "\n",
    "\n",
    "# vectorizer = TfidfVectorizer()\n",
    "vectorizer = pickle.load(open(\"vectorizer.pkl\", \"rb\"))\n",
    "X = vectorizer.fit_transform(questions)\n",
    "y = responses\n",
    "\n",
    "\n",
    "# Step 2: Split data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
    "\n",
    "# Step 3: Choose a machine learning algorithm\n",
    "# model = LogisticRegression()\n",
    "model = pickle.load(open(\"model.pkl\", \"rb\"))\n",
    "\n",
    "# Step 4: Train the model\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# model.push_to_hub(\"tabibu-ai/mental-health-chatbot\")\n",
    "# pt_model = DistilBertForSequenceClassification.from_pretrained(\"model.ipynb\", from_tf=True)\n",
    "# pt_model.save_pretrained(\"model.pt\")\n",
    "# load model from hub\n",
    "\n",
    "# Step 5: Evaluate the model\n",
    "y_pred = model.predict(X_test)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(\"Accuracy:\", accuracy)\n",
    "\n",
    "# Step 6: Use the model to make predictions\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "14406312",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_question = input(\"Ask me anything : \")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6b9198db",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction: ['\"Different kinds of therapy are more effective based on the nature of the mental health condition and/or symptoms and the person who has them (for example']\n"
     ]
    }
   ],
   "source": [
    "new_question_vector = vectorizer.transform([new_question])\n",
    "prediction = model.predict(new_question_vector)\n",
    "print(\"Prediction:\", prediction)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}