{ "cells": [ { "cell_type": "code", "execution_count": 14, "id": "ace57031", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0 0 0 ... 0 0 0]\n", " [0 0 0 ... 0 0 0]\n", " [0 0 0 ... 0 0 0]\n", " ...\n", " [0 0 0 ... 0 0 0]\n", " [0 0 0 ... 0 0 0]\n", " [0 0 0 ... 0 0 0]]\n" ] }, { "ename": "ValueError", "evalue": "empty vocabulary; perhaps the documents only contain stop words", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_10290/970796795.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mvectorizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTfidfVectorizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvectorizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mquestions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresponses\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, raw_documents, y)\u001b[0m\n\u001b[1;32m 2077\u001b[0m \u001b[0msublinear_tf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msublinear_tf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2078\u001b[0m )\n\u001b[0;32m-> 2079\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2080\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tfidf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2081\u001b[0m \u001b[0;31m# X is already a transformed view of raw_documents so\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, raw_documents, y)\u001b[0m\n\u001b[1;32m 1336\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1337\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1338\u001b[0;31m \u001b[0mvocabulary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_count_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfixed_vocabulary_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1339\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1340\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbinary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.10/site-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36m_count_vocab\u001b[0;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[1;32m 1226\u001b[0m \u001b[0mvocabulary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvocabulary\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1227\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mvocabulary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1228\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1229\u001b[0m \u001b[0;34m\"empty vocabulary; perhaps the documents only contain stop words\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1230\u001b[0m )\n", "\u001b[0;31mValueError\u001b[0m: empty vocabulary; perhaps the documents only contain stop words" ] } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "import pickle\n", "import nltk\n", "import numpy as np\n", "from nltk.stem.lancaster import LancasterStemmer\n", "# Step 1: Collect and preprocess data\n", "# Get all the questions from Questions column and responses from Questions column in the dataset data.csv\n", "# questions = data[\"Questions\"].tolist()\n", "# responses = data[\"Responses\"].tolist()\n", "questions = []\n", "responses = []\n", "q_id = []\n", "stemmer = LancasterStemmer()\n", "# with open(\"data.csv\", \"r\") as f:\n", "with open(\"data.pickle\", \"rb\") as f:\n", " # read the file\n", " words, labels, training, output = pickle.load(f)\n", "\n", " # get the questions and responses\n", " # for line in f:\n", " # array = line.split(\",\") \n", " # try:\n", " # question = array[1]\n", " # response = array[2]\n", " # question_id = array[0]\n", " # questions.append(question)\n", " # responses.append(response)\n", " # q_id.append(question_id)\n", " # except:\n", " # pass\n", "\n", "print(labels)\n", "vectorizer = TfidfVectorizer()\n", "X = vectorizer.fit_transform(questions)\n", "y = responses\n", "\n", "# Step 2: Split data into training and testing sets\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n", "\n", "# Step 3: Choose a machine learning algorithm\n", "model = LogisticRegression()\n", "\n", "# Step 4: Train the model\n", "model.fit(X_train, y_train)\n", "\n", "# Step 5: Evaluate the model\n", "y_pred = model.predict(X_test)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Accuracy:\", accuracy)\n", "\n", "def bag_of_words(s, words):\n", " bag = [0 for _ in range(len(words))]\n", "\n", " s_words = nltk.word_tokenize(s)\n", " s_words = [stemmer.stem(word.lower()) for word in s_words]\n", "\n", " for se in s_words:\n", " for i, w in enumerate(words):\n", " if w == se:\n", " bag[i] = 1\n", " \n", " return np.array(bag)\n", "\n", "# Step 6: Use the model to make predictions\n", "new_question = \"how are you\"\n", "# new_question_vector = vectorizer.transform([new_question])\n", "# prediction = model.predict(new_question_vector)\n", "\n", "predict = model.predict([bag_of_words(new_question.lower(), words)])\n", "# print(\"Prediction:\", prediction)\n", "print(\"Prediction:\", predict)\n", "\n", "\n", "# Step 7: Save the model\n", "# import pickle\n", "# pickle.dump(model, open(\"model.pkl\", \"wb\"))\n", "# pickle.dump(vectorizer, open(\"vectorizer.pkl\", \"wb\"))\n", "\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.7" }, "vscode": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" } } }, "nbformat": 4, "nbformat_minor": 5 }