{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This file was created by jhlfrfufyfn for choose speaker from the Belarusian Mozilla Voice corpus\n",
    "#\n",
    "#\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import os\n",
    "import librosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# unpackage tar gz file cv-corpus-12.0-2022-12-07-be.tar.gz\n",
    "# import tarfile\n",
    "# tar = tarfile.open(\"cv-corpus-12.0-2022-12-07-be.tar.gz\", \"r:gz\")\n",
    "# tar.extractall()\n",
    "# tar.close()\n",
    "\n",
    "corpuspath = '/a/cv-corpus'\n",
    "outputpath = '/storage/filtered_dataset'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# open validated.tsv\n",
    "df = pd.read_csv(corpuspath+'/be/validated.tsv', sep='\\t' ,low_memory=False)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop from df columns age, accents\n",
    "df = df.drop(['age', 'accents', 'gender', 'variant', 'locale', 'segment'], axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# count number of recordes with down_votes > 0\n",
    "df[df['down_votes'] > 0].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# count number of recordes with up_votes == 0\n",
    "df[df['up_votes'] == 0].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop all rows with down_votes > 0 and up_votes == 0\n",
    "df = df[df['down_votes'] == 0]\n",
    "df = df[df['up_votes'] > 0]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop column down_votes and up_votes\n",
    "df = df.drop(['down_votes', 'up_votes'], axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sort by count\n",
    "df_sorted = df.groupby('client_id').count().sort_values(by='path', ascending=False)\n",
    "df_sorted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get top 10 speakers\n",
    "top_10_speakers = df_sorted.head(10)\n",
    "top_10_speakers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get for the first speaker ten random paths to audio files\n",
    "def get_speaker_audio_list(speaker_id, n=10):\n",
    "    return df[df['client_id'] == speaker_id].sample(n)['path'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CHOOSE : which speaker will we use\n",
    "speaker_index = 0\n",
    "speaker_audio_list = get_speaker_audio_list(top_10_speakers.index[speaker_index])\n",
    "print(speaker_audio_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# open audio files from speaker_audio_list and play them\n",
    "# audio files lie in cv-corpus-12.0-2022-12-07/be/clips\n",
    "import IPython.display as ipd\n",
    "for audio in speaker_audio_list:\n",
    "    audio = corpuspath+'/be/clips/' + audio\n",
    "    audio_data = ipd.Audio(audio)\n",
    "    display(audio_data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 0 is pretty good\n",
    "# 1 is bad\n",
    "# 2 is partly 0, other are different\n",
    "# 3 is bad\n",
    "# 4 is pretty fast and clear, but not good\n",
    "# 5 is echoing, sometimes mic cracks\n",
    "# 6 is really slow and clear, but accent?\n",
    "# 7 has a lot of intonation, but is pretty clear\n",
    "# 8 is clear and slow, sometimes little mic crack\n",
    "# 9 has background noise, whispering\n",
    "\n",
    "# options: 0, 6, 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate speech rate in words per minute for each speaker\n",
    "def get_speech_rate(speaker_id):\n",
    "    df_speaker = df[df['client_id'] == speaker_id]\n",
    "    # get 1000 random samples to calculate speech rate\n",
    "    df_speaker = df_speaker.sample(1000)\n",
    "    # get duration of each audio file\n",
    "    df_speaker['duration'] = df_speaker['path'].apply(lambda x: librosa.get_duration(path=corpuspath+'/be/clips/' + x))\n",
    "    # get number of words in each audio file\n",
    "    df_speaker['words'] = df_speaker['sentence'].apply(lambda x: len(x.split()))\n",
    "    # calculate speech rate\n",
    "    df_speaker['speech_rate'] = df_speaker['words'] / df_speaker['duration'] * 60\n",
    "    # return mean speech rate\n",
    "    return df_speaker['speech_rate'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate speech rate for each speaker\n",
    "print(f'Speech rate for speaker {speaker_index}: ', get_speech_rate(top_10_speakers.index[speaker_index]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_average_duration(df_speaker):\n",
    "    # get 1000 random samples to calculate speech rate\n",
    "    df_speaker = df_speaker.sample(1000)\n",
    "    # get duration of each audio file\n",
    "    df_speaker['duration'] = df_speaker['path'].apply(lambda x: librosa.get_duration(path=corpuspath+'/be/clips/' + x))\n",
    "    return df_speaker['duration'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_speaker = df[df['client_id'] == top_10_speakers.index[speaker_index]]\n",
    "\n",
    "avg_duration = get_average_duration(df_speaker)\n",
    "avg_total_duration = avg_duration * len(df_speaker.index)\n",
    "print(f'Average duration for speaker {speaker_index}: ', avg_duration, \", average total duration(hours): \",(avg_total_duration/60.0/60.0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get df with speaker_index speaker \n",
    "df_speaker = df[df['client_id'] == top_10_speakers.index[speaker_index]]\n",
    "df_speaker = df_speaker.drop(['client_id'], axis=1)\n",
    "\n",
    "# get only x latest hours\n",
    "limit_hours = 30\n",
    "limit_files = round(limit_hours*60*60 / avg_duration)\n",
    "df_speaker = df_speaker.tail(limit_files)\n",
    "\n",
    "df_speaker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # move all files of that speaker to another folder\n",
    "# # use multiprocessing to speed up\n",
    "# # add progress bar\n",
    "# from tqdm import tqdm\n",
    "# import multiprocessing\n",
    "# from multiprocessing import Pool\n",
    "# import shutil\n",
    "\n",
    "# def move_file(file):\n",
    "#     shutil.move(corpuspath+'/be/clips/' + file, corpuspath+'/be/speaker_0/' + file)\n",
    "\n",
    "# # get list of files to move\n",
    "# files = df_speaker['path'].values.tolist()\n",
    "\n",
    "# # move files\n",
    "# with Pool(multiprocessing.cpu_count()) as p:\n",
    "#     r = list(tqdm(p.imap(move_file, files), total=len(files)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cleanup output and save text lines to csv\n",
    "if os.path.isdir(outputpath):\n",
    "    for file in os.scandir(outputpath):\n",
    "        os.remove(file.path)\n",
    "else:\n",
    "    os.mkdir(outputpath)\n",
    "\n",
    "df_speaker['path2'] = df_speaker['path'].str.replace('\\.mp3$','.wav', regex=True)\n",
    "df_speaker[['path2','sentence']].to_csv(outputpath+'/df_speaker.csv', sep='|', header=False, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make rate=22050 of all mp3 files in speaker_0 folder with multiprocessing and tqdm\n",
    "import multiprocessing\n",
    "from multiprocessing import Pool\n",
    "from tqdm import tqdm\n",
    "from pydub import AudioSegment\n",
    "\n",
    "def convert_mp3_to_wav(file):\n",
    "    sound = AudioSegment.from_mp3(corpuspath+'/be/clips/' + file)\n",
    "    sound = sound.set_frame_rate(22050)\n",
    "    sound.export(outputpath+'/' + file[:-4] + '.wav', format='wav')\n",
    "\n",
    "# get list of files to convert\n",
    "files = df_speaker['path'].values.tolist()\n",
    "\n",
    "# convert files\n",
    "with Pool(multiprocessing.cpu_count()) as p:\n",
    "    r = list(tqdm(p.imap(convert_mp3_to_wav, files), total=len(files)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}