{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KX0IBmbjeydD", "outputId": "da61afd9-8a4e-478a-fcbc-4c7d0466ced9" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ], "source": [ "!pip --q install datasets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "PbGUt9dbnriq" }, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "from IPython.display import Audio\n", "import torch\n", "import numpy as np\n", "import pandas as pd\n", "import soundfile as sf\n", "import matplotlib.pyplot as plt\n", "plt.style.use(\"seaborn-whitegrid\")\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zl13ArjoGBq9", "outputId": "eb811f29-0434-4e0e-a043-017b25d42c3d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wUSLY8BmnrfA", "outputId": "04ce4306-3c03-4c36-e9c5-4f01ffe9d908" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Downloading cleansada-version-01.zip to /content\n", " 98% 1.01G/1.03G [00:11<00:00, 109MB/s]\n", "100% 1.03G/1.03G [00:11<00:00, 96.0MB/s]\n" ] } ], "source": [ "def DownloadDataset(username,key):\n", " import json\n", " keys={\"username\":username,\"key\":key}\n", " ! mkdir ~/.kaggle\n", " json_object = json.dumps(keys, indent=4)\n", " with open(r\"/root/.kaggle/kaggle.json\", \"w\") as outfile:\n", " outfile.write(json_object)\n", " ! chmod 600 ~/.kaggle/kaggle.json\n", " ! kaggle datasets download --unzip engmahmoodanaam/cleansada-version-01\n", "\n", "#.............................................\n", "\n", "DownloadDataset( username = \"engmahmoodanaam\",\n", " key = \"4a457b4fc3516f9d9b913d770ea64884\"\n", " )" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "l74W1-6ZnrbX" }, "outputs": [], "source": [ "def GetDataset(path_csv,path_audio):\n", " df = pd.read_csv(path_csv)\n", " audios_data = []\n", " audios_samplerate = []\n", " for idx, row in df.iterrows():\n", " filename = f\"{path_audio}/{row['SegmentID']}.wav\"\n", " audiodata, samplerate = sf.read(filename)\n", " audios_data.append(audiodata)\n", " audios_samplerate.append(samplerate)\n", "\n", " df['audio'] = audios_data\n", " df['samplerate'] = audios_samplerate\n", " df['text'] = df['ProcessedText']\n", " df = df[['text','audio','samplerate']]\n", "\n", "\n", " return df\n", "\n", "#.............................................\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "mfMp94fvnrXw", "outputId": "26e03dbe-2713-4651-ad7a-6973ec6ce9cd" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "summary": "{\n \"name\": \"df\",\n \"rows\": 70,\n \"fields\": [\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 69,\n \"samples\": [\n \"\\u0648\\u0634 \\u0633\\u0648\\u064a\\u062a \\u062d\\u062a\\u0649 \\u062a\\u0635\\u064a\\u0631\\u0648\\u0646 \\u0628\\u0647\\u0627\\u0644\\u062c\\u062d\\u0648\\u062f \\u0627\\u064a\\u0647\",\n \"\\u064a\\u0627 \\u0633\\u0644\\u0627\\u0645 \\u0627\\u0646\\u062a\\u0647\\u0649 \\u0627\\u0644\\u0645\\u0648\\u0636\\u0648\\u0639\",\n \"\\u0645\\u0627 \\u0631\\u0627\\u062d \\u0645\\u0631\\u0631 \\u0644\\u0647 \\u0628\\u0633\\u0647\\u0648\\u0644\\u0629\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"audio\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"samplerate\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 16000,\n \"max\": 16000,\n \"num_unique_values\": 1,\n \"samples\": [\n 16000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}", "type": "dataframe", "variable_name": "df" }, "text/html": [ "\n", "
\n", " | text | \n", "audio | \n", "samplerate | \n", "
---|---|---|---|
0 | \n", "يا سلام انتهى الموضوع | \n", "[3.0517578125e-05, 0.000823974609375, 0.001464... | \n", "16000 | \n", "
1 | \n", "يعني يا ابو مسامح | \n", "[-0.0538330078125, -0.0531005859375, -0.046112... | \n", "16000 | \n", "
2 | \n", "حصة موافقة | \n", "[0.001007080078125, -0.004058837890625, -0.003... | \n", "16000 | \n", "
3 | \n", "والله يا هي فكرة | \n", "[-0.02789306640625, -0.03045654296875, -0.0335... | \n", "16000 | \n", "
4 | \n", "فكرة تبي تخليك تطير من الفرح | \n", "[0.05316162109375, 0.031829833984375, 0.021728... | \n", "16000 | \n", "