{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ace57031",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]\n",
      " ...\n",
      " [0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]\n",
      " [0 0 0 ... 0 0 0]]\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "empty vocabulary; perhaps the documents only contain stop words",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m/tmp/ipykernel_10290/970796795.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     35\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     36\u001b[0m \u001b[0mvectorizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTfidfVectorizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvectorizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquestions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     38\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, raw_documents, y)\u001b[0m\n\u001b[1;32m   2077\u001b[0m             \u001b[0msublinear_tf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msublinear_tf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2078\u001b[0m         )\n\u001b[0;32m-> 2079\u001b[0;31m         \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2080\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tfidf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2081\u001b[0m         \u001b[0;31m# X is already a transformed view of raw_documents so\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, raw_documents, y)\u001b[0m\n\u001b[1;32m   1336\u001b[0m                     \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1337\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1338\u001b[0;31m         \u001b[0mvocabulary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_count_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfixed_vocabulary_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1339\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1340\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36m_count_vocab\u001b[0;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[1;32m   1226\u001b[0m             \u001b[0mvocabulary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvocabulary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1227\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mvocabulary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1228\u001b[0;31m                 raise ValueError(\n\u001b[0m\u001b[1;32m   1229\u001b[0m                     \u001b[0;34m\"empty vocabulary; perhaps the documents only contain stop words\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1230\u001b[0m                 )\n",
      "\u001b[0;31mValueError\u001b[0m: empty vocabulary; perhaps the documents only contain stop words"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "import pickle\n",
    "import nltk\n",
    "import numpy as np\n",
    "from nltk.stem.lancaster import LancasterStemmer\n",
    "# Step 1: Collect and preprocess data\n",
    "# Get all the questions from Questions column and responses from Questions column in the dataset data.csv\n",
    "# questions = data[\"Questions\"].tolist()\n",
    "# responses = data[\"Responses\"].tolist()\n",
    "questions = []\n",
    "responses = []\n",
    "q_id = []\n",
    "stemmer = LancasterStemmer()\n",
    "# with open(\"data.csv\", \"r\") as f:\n",
    "with open(\"data.pickle\", \"rb\") as f:\n",
    "    # read the file\n",
    "    words, labels, training, output = pickle.load(f)\n",
    "\n",
    "    # get the questions and responses\n",
    "    # for line in f:\n",
    "    #     array = line.split(\",\") \n",
    "    #     try:\n",
    "    #         question = array[1]\n",
    "    #         response = array[2]\n",
    "    #         question_id = array[0]\n",
    "    #         questions.append(question)\n",
    "    #         responses.append(response)\n",
    "    #         q_id.append(question_id)\n",
    "    #     except:\n",
    "    #         pass\n",
    "\n",
    "print(labels)\n",
    "vectorizer = TfidfVectorizer()\n",
    "X = vectorizer.fit_transform(questions)\n",
    "y = responses\n",
    "\n",
    "# Step 2: Split data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
    "\n",
    "# Step 3: Choose a machine learning algorithm\n",
    "model = LogisticRegression()\n",
    "\n",
    "# Step 4: Train the model\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Step 5: Evaluate the model\n",
    "y_pred = model.predict(X_test)\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(\"Accuracy:\", accuracy)\n",
    "\n",
    "def bag_of_words(s, words):\n",
    "    bag = [0 for _ in range(len(words))]\n",
    "\n",
    "    s_words = nltk.word_tokenize(s)\n",
    "    s_words = [stemmer.stem(word.lower()) for word in s_words]\n",
    "\n",
    "    for se in s_words:\n",
    "        for i, w in enumerate(words):\n",
    "            if w == se:\n",
    "                bag[i] = 1\n",
    "            \n",
    "    return np.array(bag)\n",
    "\n",
    "# Step 6: Use the model to make predictions\n",
    "new_question = \"how are you\"\n",
    "# new_question_vector = vectorizer.transform([new_question])\n",
    "# prediction = model.predict(new_question_vector)\n",
    "\n",
    "predict =  model.predict([bag_of_words(new_question.lower(), words)])\n",
    "# print(\"Prediction:\", prediction)\n",
    "print(\"Prediction:\", predict)\n",
    "\n",
    "\n",
    "# Step 7: Save the model\n",
    "# import pickle\n",
    "# pickle.dump(model, open(\"model.pkl\", \"wb\"))\n",
    "# pickle.dump(vectorizer, open(\"vectorizer.pkl\", \"wb\"))\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}