{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "source": [ "##Installing Data" ], "metadata": { "id": "8RPHufKzKDHP" } }, { "cell_type": "code", "source": [ "!pip install opendatasets" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "40Tc5Jja7oI2", "outputId": "3eb4348e-b408-4d9c-f144-e13664b553ea" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting opendatasets\n", " Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from opendatasets) (4.66.2)\n", "Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (from opendatasets) (1.5.16)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from opendatasets) (8.1.7)\n", "Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.10/dist-packages (from kaggle->opendatasets) (1.16.0)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from kaggle->opendatasets) (2024.2.2)\n", "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from kaggle->opendatasets) (2.8.2)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from kaggle->opendatasets) (2.31.0)\n", "Requirement already satisfied: python-slugify in /usr/local/lib/python3.10/dist-packages (from kaggle->opendatasets) (8.0.4)\n", "Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from kaggle->opendatasets) (2.0.7)\n", "Requirement already satisfied: bleach in /usr/local/lib/python3.10/dist-packages (from kaggle->opendatasets) (6.1.0)\n", "Requirement already satisfied: webencodings 
in /usr/local/lib/python3.10/dist-packages (from bleach->kaggle->opendatasets) (0.5.1)\n", "Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.10/dist-packages (from python-slugify->kaggle->opendatasets) (1.3)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle->opendatasets) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle->opendatasets) (3.6)\n", "Installing collected packages: opendatasets\n", "Successfully installed opendatasets-0.1.22\n" ] } ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "yFQD_jMqzKHL" }, "outputs": [], "source": [ "dataset_link = 'https://www.kaggle.com/datasets/zarajamshaid/language-identification-datasst/download?datasetVersionNumber=1'" ] }, { "cell_type": "code", "source": [ "import opendatasets as od\n", "od.download(dataset_link)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sX9440bT7-pY", "outputId": "872b7528-059e-47dc-95d9-ed76fbd58a5a" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading language-identification-datasst.zip to ./language-identification-datasst\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "100%|██████████| 5.53M/5.53M [00:00<00:00, 45.5MB/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ "\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "\n" ] } ] }, { "cell_type": "markdown", "source": [ "##Data Analyzing" ], "metadata": { "id": "ALKPYmoaCajZ" } }, { "cell_type": "code", "source": [ "import pandas as pd\n", "df = pd.read_csv('/content/language-identification-datasst/dataset.csv') #Initializing the data as dataframe using pandas" ], "metadata": { "id": "Lzdz-Irt8Cap" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "df.shape" ], "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "IyX0rNLW8Ie1", "outputId": "9acf1992-4468-4ae9-fd92-ab9fd2a79e95" }, "execution_count": 3, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(22000, 2)" ] }, "metadata": {}, "execution_count": 3 } ] }, { "cell_type": "code", "source": [ "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "dDYoxrf8-eHS", "outputId": "c62d1df0-502f-4b47-a656-2dd3e11d330a" }, "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text language\n", "0 klement gottwaldi surnukeha palsameeriti ning ... Estonian\n", "1 sebes joseph pereira thomas på eng the jesuit... Swedish\n", "2 ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ... Thai\n", "3 விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர... Tamil\n", "4 de spons behoort tot het geslacht haliclona en... Dutch" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlanguage
0klement gottwaldi surnukeha palsameeriti ning ...Estonian
1sebes joseph pereira thomas på eng the jesuit...Swedish
2ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...Thai
3விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...Tamil
4de spons behoort tot het geslacht haliclona en...Dutch
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 22000,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21859,\n \"samples\": [\n \"semua kucing dalam genus ini berbagi nenek moyang yang sama yang mungkin hidup sekitar \\u2013 juta tahun yang lalu di asia hubungan yang tepat dalam felidae dekat tetapi masih belum pasti misalnya kucing gunung cina kadang-kadang diklasifikasikan dengan nama felis silvestris bieti sebagai upaspesies kucing liar seperti varietas afrika utara f s lybica\",\n \"not completely happy with their temporary name in the summer of a sign caught citos eye just outside the freestate town of kroonstad pointing to a little town called \\\"wonderboom\\\" in danny de wet left the band and was replaced by garth mcleod formerly of respected south african rock band sugardrive in garth mcleod was killed in a motorcycle accident and was replaced by jonathan bell\",\n \"tetraglenes bucculenta \\u00e4r en skalbaggsart som beskrevs av charles joseph gahan tetraglenes bucculenta ing\\u00e5r i sl\\u00e4ktet tetraglenes och familjen l\\u00e5nghorningar\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"language\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 22,\n \"samples\": [\n \"Estonian\",\n \"Korean\",\n \"Urdu\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "source": [ "df['language'].unique()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_BhNeq7V-evT", "outputId": "7ae6b564-389d-4197-983d-ba6cb02dd73b" }, "execution_count": 5, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['Estonian', 'Swedish', 'Thai', 'Tamil', 'Dutch', 'Japanese',\n", " 'Turkish', 'Latin', 
'Urdu', 'Indonesian', 'Portugese', 'French',\n", " 'Chinese', 'Korean', 'Hindi', 'Spanish', 'Pushto', 'Persian',\n", " 'Romanian', 'Russian', 'English', 'Arabic'], dtype=object)" ] }, "metadata": {}, "execution_count": 5 } ] }, { "cell_type": "code", "source": [ "df.language.value_counts() #All texts and languages are equal" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "I2xp4hIA-ujt", "outputId": "c3220884-b561-41c2-83f9-33888920fe80" }, "execution_count": 6, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Estonian 1000\n", "Swedish 1000\n", "Thai 1000\n", "Tamil 1000\n", "Dutch 1000\n", "Japanese 1000\n", "Turkish 1000\n", "Latin 1000\n", "Urdu 1000\n", "Indonesian 1000\n", "Portugese 1000\n", "French 1000\n", "Chinese 1000\n", "Korean 1000\n", "Hindi 1000\n", "Spanish 1000\n", "Pushto 1000\n", "Persian 1000\n", "Romanian 1000\n", "Russian 1000\n", "English 1000\n", "Arabic 1000\n", "Name: language, dtype: int64" ] }, "metadata": {}, "execution_count": 6 } ] }, { "cell_type": "code", "source": [ "df.isnull().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sA092yoQ-1Au", "outputId": "b34fe8be-9b13-4c9f-f94e-21c4e850b5b9" }, "execution_count": 7, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Text 0\n", "language 0\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "df.dtypes" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jsbcBy99-3gI", "outputId": "3e1f5397-b7e2-4978-b010-e84b5d1b467e" }, "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Text object\n", "language object\n", "dtype: object" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "markdown", "source": [ "##Data Preprocessing" ], "metadata": { "id": "CkRJDD3gKQTl" } }, { "cell_type": "code", "source": [ "# dropping duplicate samples\n", "df = 
df.drop_duplicates(subset='Text')\n", "df = df.reset_index(drop=True)" ], "metadata": { "id": "ATmJtyAw-4Sq" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "df.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8SkwcKJB_BDi", "outputId": "cbf90cf3-2349-4268-80c4-bd5bdca150c8" }, "execution_count": 10, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(21859, 2)" ] }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "# non-alphanumeric characters to remove\n", "nonalphanumeric = ['\\'', '.', ',', '\\\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',\n", " '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\\\', '?',\n", " '/','>', '<', '|', ' ']\n", "len(nonalphanumeric)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XvpnoNtV_DOk", "outputId": "3f722d14-5d23-4df5-ca83-9987b1226483" }, "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "31" ] }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "markdown", "source": [ "The function clean_text():\n", "\n", "1. tokenizes the text into a list of words\n", "2. lowers all words into lowercase\n", "3. removes non alphanumeric from the list of lowered words\n", "stems the words\n", "4. 
returns them in the form of a string separated by \" \"" ], "metadata": { "id": "DyMi3cfF_Z1m" } }, { "cell_type": "code", "source": [ "from nltk.tokenize import word_tokenize\n", "import nltk\n", "\n", "# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab',\n", "# older releases from 'punkt'. Fetch both so that word_tokenize() works\n", "# whichever NLTK version is installed.\n", "nltk.download('punkt')\n", "nltk.download('punkt_tab')\n", "\n", "def clean_text(text):\n", "    \"\"\"\n", "    Tokenize `text`, drop standalone punctuation tokens and lowercase the rest.\n", "\n", "    Only tokens that are exactly equal to an entry of the global\n", "    `nonalphanumeric` list are removed; no stemming or lemmatization is done.\n", "\n", "    Parameters\n", "    ----------\n", "    text : str\n", "        Raw input text.\n", "\n", "    Returns\n", "    -------\n", "    str\n", "        The remaining tokens, lowercased and joined by single spaces.\n", "    \"\"\"\n", "    # Tokenize the text\n", "    tokens = word_tokenize(text)\n", "\n", "    # Drop tokens listed in `nonalphanumeric` and lowercase what is left\n", "    words = [word.lower() for word in tokens if word not in nonalphanumeric]\n", "\n", "    # Join the cleaned words back into a single string\n", "    cleaned_text = \" \".join(words)\n", "\n", "    return cleaned_text\n", "\n", "# Example usage\n", "text = \"This is an example sentence for lemmatization and stemming.\"\n", "cleaned_text_result = clean_text(text)\n", "print(cleaned_text_result)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tnjimn0-_S2c", "outputId": "c9ca9776-9991-4a4c-ade4-e4cc0b22d829" }, "execution_count": 43, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "this is an example sentence for lemmatization and stemming\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ] }, { "cell_type": "code", "source": [ "# applying clean_text function to all rows in 'Text' column\n", "df['clean_text'] = df['Text'].apply(clean_text)" ], "metadata": { "id": "74MBlrEN_sZf" }, "execution_count": 13, "outputs": [] }, { "cell_type": "markdown", "source": [ "#### Label Encoding" ], "metadata": { "id": "A89S5oMZLilB" } }, { "cell_type": "code", "source": [ "from sklearn.preprocessing import LabelEncoder\n", "\n", "# using LabelEncoder to get placeholder number values for categorical variable 'language'\n", "le = LabelEncoder()\n", "df['language_encoded'] = le.fit_transform(df['language'])\n", "df.head()" ], "metadata": { "colab": { "base_uri": 
"https://localhost:8080/", "height": 206 }, "id": "0LHYyglR_uZD", "outputId": "3836e669-03c0-4cfc-844d-c4aeaca69686" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Text language \\\n", "0 klement gottwaldi surnukeha palsameeriti ning ... Estonian \n", "1 sebes joseph pereira thomas på eng the jesuit... Swedish \n", "2 ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ... Thai \n", "3 விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர... Tamil \n", "4 de spons behoort tot het geslacht haliclona en... Dutch \n", "\n", " clean_text language_encoded \n", "0 klement gottwaldi surnukeha palsameeriti ning ... 4 \n", "1 sebes joseph pereira thomas på eng the jesuits... 17 \n", "2 ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ... 19 \n", "3 விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர... 18 \n", "4 de spons behoort tot het geslacht haliclona en... 2 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Textlanguageclean_textlanguage_encoded
0klement gottwaldi surnukeha palsameeriti ning ...Estonianklement gottwaldi surnukeha palsameeriti ning ...4
1sebes joseph pereira thomas på eng the jesuit...Swedishsebes joseph pereira thomas på eng the jesuits...17
2ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...Thaiถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...19
3விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...Tamilவிசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...18
4de spons behoort tot het geslacht haliclona en...Dutchde spons behoort tot het geslacht haliclona en...2
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 21859,\n \"fields\": [\n {\n \"column\": \"Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21859,\n \"samples\": [\n \"semua kucing dalam genus ini berbagi nenek moyang yang sama yang mungkin hidup sekitar \\u2013 juta tahun yang lalu di asia hubungan yang tepat dalam felidae dekat tetapi masih belum pasti misalnya kucing gunung cina kadang-kadang diklasifikasikan dengan nama felis silvestris bieti sebagai upaspesies kucing liar seperti varietas afrika utara f s lybica\",\n \"not completely happy with their temporary name in the summer of a sign caught citos eye just outside the freestate town of kroonstad pointing to a little town called \\\"wonderboom\\\" in danny de wet left the band and was replaced by garth mcleod formerly of respected south african rock band sugardrive in garth mcleod was killed in a motorcycle accident and was replaced by jonathan bell\",\n \"tetraglenes bucculenta \\u00e4r en skalbaggsart som beskrevs av charles joseph gahan tetraglenes bucculenta ing\\u00e5r i sl\\u00e4ktet tetraglenes och familjen l\\u00e5nghorningar\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"language\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 22,\n \"samples\": [\n \"Estonian\",\n \"Korean\",\n \"Urdu\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"clean_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 21857,\n \"samples\": [\n \"semua kucing dalam genus ini berbagi nenek moyang yang sama yang mungkin hidup sekitar \\u2013 juta tahun yang lalu di asia hubungan yang tepat dalam felidae dekat tetapi masih belum pasti misalnya kucing gunung cina kadang-kadang diklasifikasikan dengan nama felis silvestris bieti sebagai upaspesies kucing liar seperti 
varietas afrika utara f s lybica\",\n \"\\u0628\\u0627\\u0632\\u06cc\\u0627\\u0641\\u062a \\u066a \\u062f\\u0631\\u0635\\u062f\\u06cc \\u0627\\u062a\\u0648\\u0645\\u0628\\u06cc\\u0644\\u0633\\u0627\\u0644 \\u0627\\u06cc\\u0646 \\u0627\\u062a\\u0648\\u0645\\u0628\\u06cc\\u0644 \\u0628\\u0631\\u0627\\u06cc \\u0627\\u0648\\u0644\\u06cc\\u0646 \\u0628\\u0627\\u0631 \\u062f\\u0631 \\u0631\\u0648\\u0645\\u0627\\u0646\\u06cc \\u062a\\u062d\\u062a \\u0628\\u0631\\u0646\\u062f dacia \\u0639\\u0631\\u0636\\u0647 \\u06af\\u0631\\u062f\\u06cc\\u062f \\u0627\\u0645\\u0627 \\u062f\\u0631 \\u0628\\u0631\\u062e\\u06cc \\u06a9\\u0634\\u0648\\u0631\\u0647\\u0627 \\u062a\\u062d\\u062a \\u0628\\u0631\\u0646\\u062f \\u0631\\u0646\\u0648 \\u0648 \\u062f\\u0631 \\u0628\\u0631\\u062e\\u06cc \\u0646\\u06cc\\u0632 \\u062a\\u062d\\u062a \\u0628\\u0631\\u0646\\u062f \\u0646\\u06cc\\u0633\\u0627\\u0646 \\u0639\\u0631\\u0636\\u0647 \\u0645\\u06cc\\u200c\\u06af\\u0631\\u062f\\u062f \\u0627\\u06cc\\u0646 \\u0627\\u062a\\u0648\\u0645\\u0628\\u06cc\\u0644 \\u062f\\u0631 \\u0646\\u062a\\u06cc\\u062c\\u0647 \\u0628\\u0631 \\u067e\\u0627\\u06cc\\u0647 \\u067e\\u0644\\u0627\\u062a\\u0641\\u0631\\u0645 \\u067e\\u0631\\u0648\\u0698\\u0647 \\u0627\\u06cc\\u06a9\\u0633 \\u0646\\u0648\\u062f \\u062a\\u062d\\u062a \\u0647\\u062f\\u0627\\u06cc\\u062a louis schweitzer \\u0637\\u0631\\u0627\\u062d\\u06cc \\u0648 \\u0628\\u0647 \\u0645\\u0631\\u062d\\u0644\\u0647 \\u062a\\u0648\\u0644\\u06cc\\u062f \\u0631\\u0633\\u06cc\\u062f\\u0647\\u200c\\u0627\\u0633\\u062a\",\n \"noile cerin\\u021be pentru sistemul existent au condus la eforturi de modernizare a sistemului gps prin implementarea noii genera\\u021bii de sateli\\u021bi gps iii \\u0219i urm\\u0103toarea genera\\u021bie a sistemului opera\\u021bional de control operational control segment ocx\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"language_encoded\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6,\n 
\"min\": 0,\n \"max\": 21,\n \"num_unique_values\": 22,\n \"samples\": [\n 4,\n 9,\n 21\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "# list of languages, ordered so that each language's index is its encoded label\n", "# (le.classes_ is what le.inverse_transform(range(n_classes)) returns, without\n", "#  hard-coding the number of classes)\n", "lang_list = le.classes_.tolist()\n", "lang_list" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "F4ORoPxVBHHu", "outputId": "0987547e-014f-4b89-b639-d8120cb6168a" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['Arabic',\n", " 'Chinese',\n", " 'Dutch',\n", " 'English',\n", " 'Estonian',\n", " 'French',\n", " 'Hindi',\n", " 'Indonesian',\n", " 'Japanese',\n", " 'Korean',\n", " 'Latin',\n", " 'Persian',\n", " 'Portugese',\n", " 'Pushto',\n", " 'Romanian',\n", " 'Russian',\n", " 'Spanish',\n", " 'Swedish',\n", " 'Tamil',\n", " 'Thai',\n", " 'Turkish',\n", " 'Urdu']" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "markdown", "source": [ "### Data Visualization" ], "metadata": { "id": "Pb5fm5eOKeWV" } }, { "cell_type": "code", "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "# Define a color palette with a different color for each language\n", "palette = sns.color_palette(\"hsv\", df['language'].nunique())\n", "\n", "# Plotting a language-wise frequency distribution for the number of samples in each language\n", "fig, ax = plt.subplots(figsize=(7, 5))\n", "# seaborn deprecates `palette` without `hue`: map hue to the same column and\n", "# hide the redundant legend to get the same picture without the FutureWarning\n", "sns.countplot(data=df, y='language', hue='language', palette=palette, legend=False, ax=ax)\n", "ax.set_title('Language Counts')\n", "plt.show()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 579 }, "id": "lBQ2FBA8BOiT", "outputId": "abc01118-fe8b-4ea4-a108-f2f89b232875" }, "execution_count": 16, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "import re\n", "\n", "def remove_english(text):\n", "    \"\"\"\n", "    Return `text` with every run of ASCII letters (a-z, A-Z) removed.\n", "\n", "    Used to strip Latin-script (e.g. English) words out of non-Latin text;\n", "    whitespace, digits and all other characters are left untouched.\n", "    \"\"\"\n", "    return re.sub(r\"[a-zA-Z]+\", \"\", text)" ], "metadata": { "id": "45aWka3aBXZe" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "source": [ "# Removing english words from chinese texts.\n", "# The rows are updated in place through a boolean mask, which avoids the\n", "# SettingWithCopyWarning and DataFrame.append (removed in pandas 2.0).\n", "# Do not name a variable `clean_text` here: it would shadow the clean_text()\n", "# function that predict_language() calls later in the notebook.\n", "is_chinese = df['language'] == 'Chinese'\n", "df.loc[is_chinese, 'clean_text'] = df.loc[is_chinese, 'clean_text'].apply(remove_english)\n", "\n", "# cleaned Chinese subset, kept as an independent copy for inspection\n", "df_Chinese = df.loc[is_chinese].copy()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4fiKFPygB8EY", "outputId": "0626982f-cd28-428a-d89a-bb4f2f054023" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "# shuffling dataframe and resetting index (seeded so that a re-run gives the same order)\n", "df = df.sample(frac=1, random_state=2007).reset_index(drop=True)" ], "metadata": { "id": "GBBGjcIQCGFv" }, "execution_count": 19, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Feature Vectorizing" ], "metadata": { "id": "HSO-Zh7PL1p5" } }, { "cell_type": "code", "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "# Input variable\n", "# vectorizing input variable 'clean_text' into a document-term count matrix\n", "features = df['clean_text']\n", "\n", "cv = CountVectorizer() # ngram_range=(1,2)\n", "features = cv.fit_transform(features)\n", "\n", "# changing the datatype of the counts into uint8 to consume less memory\n", "# NOTE: uint8 only holds 0-255, so a token occurring more than 255 times in one sample would wrap around\n", "features = features.astype('uint8') # uint8 and float32" ], "metadata": { "id": "WKFKd7o9Ck9K" }, "execution_count": 20, "outputs": [] }, { "cell_type": "code", "source": [ "# defining target variable\n", "targets = df['language_encoded']" ], "metadata": { "id": "jzc4Z0x2DAhJ" }, "execution_count": 21, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Data Splitting" ], "metadata": { "id": "5taxSYqzKkcz" } }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# splitting data into training and validation datasets\n", "X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=2007)" ], "metadata": { "id": "xBgeKOT7DCvP" }, "execution_count": 22, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Machine Learning" ], "metadata": { "id": "JeP6F0odl2Mk" } }, { "cell_type": "markdown", "source": [ "#### Testing a Machine Learning Algorithm on this data" ], "metadata": { "id": "qbiKiFo6MDwH" } }, { "cell_type": "code", "source": [ "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score\n", "\n", "# Initialize and train the Naive Bayes classifier\n", 
"nb_classifier = MultinomialNB()\n", "nb_classifier.fit(X_train, y_train)\n", "\n", "# Make predictions on the validation set\n", "y_pred = nb_classifier.predict(X_val)\n", "\n", "# Evaluate the model\n", "accuracy = accuracy_score(y_val, y_pred)\n", "print(\"Validation Accuracy:\", accuracy)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vlpNuFClDRpR", "outputId": "53f5112b-f9d7-4689-87d5-4d6d96616b5e" }, "execution_count": 23, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Validation Accuracy: 0.9595150960658737\n" ] } ] }, { "cell_type": "markdown", "source": [ "It's giving very good accuracy overall!" ], "metadata": { "id": "3U8v0ResMOnN" } }, { "cell_type": "markdown", "source": [ "## Deep Learning" ], "metadata": { "id": "5BUq4hkZmez_" } }, { "cell_type": "code", "source": [ "# Converting the vectorized features into dense arrays for training\n", "X_train = X_train.toarray()\n", "X_val = X_val.toarray()" ], "metadata": { "id": "IuctL4bPCKSV" }, "execution_count": 24, "outputs": [] }, { "cell_type": "code", "source": [ "print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GwdZkUJpCR8O", "outputId": "be59a569-afa6-495e-a841-72ad9f039b10" }, "execution_count": 25, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(17487, 277227) (4372, 277227) (17487,) (4372,)\n" ] } ] }, { "cell_type": "code", "source": [ "input_size = X_train.shape[1]\n", "input_size" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KUXH90VlCTdJ", "outputId": "819bc838-6599-4144-b4b5-c31e66835054" }, "execution_count": 26, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "277227" ] }, "metadata": {}, "execution_count": 26 } ] }, { "cell_type": "code", "source": [ "# output size hyperparameter (number of language classes)\n", "\n", "output_size = len(df['language_encoded'].unique())\n", "num_classes = len(le.classes_)\n", "output_size, num_classes" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vAvlEJ1nCU16", "outputId": "59b8947c-afd9-4dfa-9e95-7a93799373d7" }, "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(22, 22)" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "code", "source": [ "# Model architecture: 4 dense layers, ReLU activation in the first 3 and softmax\n", "# in the last one to get a probability for every language.\n", "import tensorflow as tf\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Embedding, Flatten, Dense\n", "\n", "\n", "model = Sequential([\n", "    Dense(100, activation='relu', kernel_initializer='he_normal', input_shape=(input_size,)),\n", "    Dense(80, activation='relu', kernel_initializer='he_normal'),\n", "    Dense(50, activation='relu', kernel_initializer='he_normal'),\n", "    Dense(output_size, activation='softmax')\n", "])\n" ], "metadata": { "id": "Q9oCGyv-myIc" }, "execution_count": 28, "outputs": [] }, { "cell_type": "code", "source": [ "import math\n", "\n", "# Learning rate scheduling\n", "def lr_scheduler(epoch, lr):\n", "    \"\"\"\n", "    Schedule for the LearningRateScheduler callback.\n", "\n", "    The learning rate is left unchanged while `epoch` < 3; from then on it is\n", "    multiplied by exp(-0.1) at every epoch.\n", "\n", "    Always returns a plain Python float: Keras 3's LearningRateScheduler\n", "    raises on tensor return values, which `lr * tf.math.exp(-0.1)` produced.\n", "    \"\"\"\n", "    if epoch < 3:\n", "        return float(lr)\n", "    return float(lr * math.exp(-0.1))" ], "metadata": { "id": "n_c2D7eNDB23" }, "execution_count": 29, "outputs": [] }, { "cell_type": "code", "source": [ "# Initializing early stopping, optimizer and learning rate scheduler\n", "from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler\n", "\n", "early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)\n", "lr_scheduler_callback = LearningRateScheduler(lr_scheduler)\n", "optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)" ], "metadata": { "id": "D9mQNhQEDChh" }, "execution_count": 30, "outputs": [] }, { "cell_type": "code", "source": [ "model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])" ], "metadata": { "id": "F_5FShAFBx2b" }, 
"execution_count": 31, "outputs": [] }, { "cell_type": "code", "source": [ "model.summary() # Checking model parameters and layers" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Tge5dPw_FyRN", "outputId": "360e1d2f-0b97-40cc-d86e-76e0df1b0900" }, "execution_count": 32, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " dense (Dense) (None, 100) 27722800 \n", " \n", " dense_1 (Dense) (None, 80) 8080 \n", " \n", " dense_2 (Dense) (None, 50) 4050 \n", " \n", " dense_3 (Dense) (None, 22) 1122 \n", " \n", "=================================================================\n", "Total params: 27736052 (105.80 MB)\n", "Trainable params: 27736052 (105.80 MB)\n", "Non-trainable params: 0 (0.00 Byte)\n", "_________________________________________________________________\n" ] } ] }, { "cell_type": "code", "source": [ "from tensorflow.keras.utils import plot_model\n", "plot_model(model)" ], "metadata": { "id": "bD0KN5YbFzwN", "colab": { "base_uri": "https://localhost:8080/", "height": 466 }, "outputId": "2a10bd1b-defc-4c22-f51a-f66d281a9333" }, "execution_count": 36, "outputs": [ { "output_type": "execute_result", "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "execution_count": 36 } ] }, { "cell_type": "code", "source": [ "# One-hot encoding the labels, as required by the categorical_crossentropy loss.\n", "# num_classes comes from the fitted LabelEncoder instead of a hard-coded 22.\n", "y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)\n", "y_val_encoded = tf.keras.utils.to_categorical(y_val, num_classes=num_classes)" ], "metadata": { "id": "p7R8zRG7EqlF" }, "execution_count": 33, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Training" ], "metadata": { "id": "oNTRkl2qKuAF" } }, { "cell_type": "code", "source": [ "hist = model.fit(\n", "    X_train, y_train_encoded,\n", "    epochs=10, batch_size=32,\n", "    validation_data=(X_val, y_val_encoded),\n", "    callbacks=[early_stopping, lr_scheduler_callback],\n", ")" ], "metadata": { "id": "T09B1AeSByhS", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "23517910-cff7-4a8c-d761-ea6f1c71bed8" }, "execution_count": 34, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/10\n", "547/547 [==============================] - 16s 23ms/step - loss: 0.3566 - accuracy: 0.9326 - val_loss: 0.0814 - val_accuracy: 0.9774 - lr: 0.0010\n", "Epoch 2/10\n", "547/547 [==============================] - 10s 18ms/step - loss: 0.0071 - accuracy: 0.9988 - val_loss: 0.0810 - val_accuracy: 0.9790 - lr: 0.0010\n", "Epoch 3/10\n", "547/547 [==============================] - 10s 17ms/step - loss: 9.0363e-04 - accuracy: 0.9999 - val_loss: 0.0883 - val_accuracy: 0.9748 - lr: 0.0010\n", "Epoch 4/10\n", "547/547 [==============================] - 9s 16ms/step - loss: 6.1621e-04 - accuracy: 0.9999 - val_loss: 0.1048 - val_accuracy: 0.9716 - lr: 9.0484e-04\n", "Epoch 5/10\n", "547/547 [==============================] - 9s 17ms/step - loss: 5.1501e-04 - accuracy: 0.9999 - val_loss: 0.0975 - val_accuracy: 0.9764 - lr: 8.1873e-04\n", "Epoch 6/10\n", "547/547 [==============================] - 10s 18ms/step - loss: 5.1643e-04 - accuracy: 0.9999 - val_loss: 0.1120 - val_accuracy: 0.9705 - lr: 7.4082e-04\n" ] } ] }, { "cell_type": "markdown", "source": [ "## Evaluation" ], "metadata": { "id": "0QwsQCK8KwYt" } }, { "cell_type": "code", "source": [ "loss, accuracy = model.evaluate(X_val, y_val_encoded)\n", "print(\"Validation Accuracy:\", accuracy)" ], "metadata": { "id": "is6FP7DtB1yY", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "dbac023d-9d19-4f0f-84ab-bbff637f9c9b" }, "execution_count": 35, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "137/137 [==============================] - 1s 7ms/step - loss: 0.0810 - accuracy: 0.9790\n", "Validation Accuracy: 0.9789569973945618\n" ] } ] }, { "cell_type": "code", "source": [ 
"plt.title('Learning Curve')\n", "plt.xlabel('Epochs')\n", "plt.ylabel('Categorical Crossentropy')\n", "plt.plot(hist.history['loss'], label='train')\n", "plt.plot(hist.history['val_loss'], label='val')\n", "plt.legend()\n", "plt.show()" ], "metadata": { "id": "hhVxYLQJB1vX", "colab": { "base_uri": "https://localhost:8080/", "height": 472 }, "outputId": "5cf694c7-7ea5-47d3-df8c-c25c507790fc" }, "execution_count": 37, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "code", "source": [ "plt.title('Learning Curve')\n", "plt.xlabel('Epochs')\n", "plt.ylabel('Accuracy')\n", "plt.plot(hist.history['accuracy'], label='train')\n", "plt.plot(hist.history['val_accuracy'], label='val')\n", "plt.legend()\n", "plt.show()" ], "metadata": { "id": "MIh8fOjcNWAg", "colab": { "base_uri": "https://localhost:8080/", "height": 472 }, "outputId": "4aa4e096-4725-4cc6-f4a7-d86356863ab1" }, "execution_count": 38, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/png": "\n" }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "##Saving" ], "metadata": { "id": "kNyuIssDK36s" } }, { "cell_type": "code", "source": [ "model.save('language_identifcation_model.h5')" ], "metadata": { "id": "MxpbSlWnN1Cm", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0e3cb1ab-305c-4821-bfd5-43b8aa5d882e" }, "execution_count": 39, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py:3103: UserWarning: You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.\n", " saving_api.save_model(\n" ] } ] }, { "cell_type": "markdown", "source": [ "##Testing" ], "metadata": { "id": "O2WkRJfNK6kA" } }, { "cell_type": "code", "source": [ "# using the model for prediction\n", "import numpy as np\n", "sent = \"\"\"आप कितना सोचते हो\n", "अगर आप ठिठुरती रातों को गिनें\n", "अरे क्या आप मिल सकते हैं (अरे, क्या आप मिल सकते हैं?)\n", "क्या तुम मिलोगे (क्या तुम मिलोगे?)\n", "सर्दियों का अंत बताओ\n", "एक कोमल वसंत के दिन तक\n", "मैं चाहता हूं कि तुम तब तक रहो जब तक फूल खिल न जाएं\n", "ज्यों का त्यों\"\"\"\n", "\n", "\n", "def predict_language(text, model, cv, le):\n", " cleaned_text = clean_text(text) #Cleaning the text\n", " text_vectorized = cv.transform([cleaned_text]) #Converting tokens into vectors\n", " prediction = model.predict(text_vectorized) #Making predictions\n", " predicted_label = le.inverse_transform([np.argmax(prediction)])[0] #Get the first element of the list\n", " return predicted_label\n", "\n", "predict_language(sent, model, cv, le)" ], "metadata": { "id": "5c2aczzzN64J", "colab": { "base_uri": "https://localhost:8080/", "height": 55 }, "outputId": "ea8bc990-cf9e-4dd4-d7c1-bf13925d1930" }, "execution_count": 44, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1/1 
[==============================] - 0s 315ms/step\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "'Hindi'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 44 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "RTC8BPTYJvgO" }, "execution_count": null, "outputs": [] } ] }