'''
File Name: prepare_sft_dataset.py
Author: Nikhil Malhotra
Date: 21/7/2024
Purpose: Create a high-quality SFT dataset for Project Indus.
The source dataset is obtained from Hugging Face.
The dataset is then translated into the requisite dialects supported by Google Translate.
The dataset is split into train and test, and the corresponding files are created.
Each output file name carries the source dataset, the dialect it was translated
into, and the split type.
'''

import configparser
import os

import pandas as pd
from googletrans import Translator

source_dir_path = "source"
translated_dir_path = "translated"

# Create the output directories up front so the CSV writes below do not fail.
os.makedirs(source_dir_path, exist_ok=True)
os.makedirs(translated_dir_path, exist_ok=True)

config = configparser.ConfigParser(default_section="DATASETS")

'''
Language codes for the dialects used in translation (as supported by Google Translate):
1. English  : en
2. Hindi    : hi
3. Dogri    : doi
4. Bhojpuri : bho
5. Maithili : mai
'''
translated_dialect = "hi"
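
# A minimal sketch of the "dataset.ini" consumed in __main__, assuming an
# [openai] section whose "name" key holds a quoted Hugging Face repo id (the
# code below strips the quotes). The repo id shown is only an illustrative
# placeholder, not taken from this file:
#
#   [openai]
#   name = "openai/gsm8k"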


def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """
    Fetch data from the Hugging Face source and store it in a CSV file.
    This method also triggers translation into the configured dialect.

    Input:
        name_of_dataset: name of the dataset fetched from Hugging Face
        data_frame: DataFrame containing questions and answers, or prompts and messages
        split_type: "train" or "test", depending on how the data is split
    """
    # Carry the split type in the file name so train and test files do not collide.
    file_name = str(os.path.join(source_dir_path, name_of_dataset.replace("/", "_"))) + split_type + ".csv"
    if not os.path.isfile(file_name):
        print("Writing file.....", file_name)
        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", file_name)

    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)
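
# With name_of_dataset = "openai/gsm8k" (an illustrative repo id, not taken
# from this file) and split_type = "train", store_sft_dataset writes
# source/openai_gsm8ktrain.csv, and the translation step below then writes
# translated/openai_gsm8ktrain_hi_translated.csv.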


def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """
    Translate a dataset held in a DataFrame into the given dialect and store
    the result as a CSV file.

    Input:
        name_of_dataset: name of the dataset fetched from Hugging Face
        data_frame: DataFrame containing questions and answers, or prompts and messages
        split_type: "train" or "test", depending on how the data is split
        dialect_name: code of the dialect into which the data needs to be translated
    """
    # Set USE_COUNT to True to stop after the first five rows, which is handy
    # for a quick dry run before translating a full dataset.
    USE_COUNT = False
    count = 0

    print("Translating now....")
    translator = Translator()

    translated_append_list = []
    for _, val in data_frame.iterrows():
        translated_ques = translator.translate(val["question"], dest=dialect_name).text
        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
        translated_append_list.append({"question": translated_ques, "answer": translated_ans})
        count += 1

        if USE_COUNT and count == 5:
            break

    df = pd.DataFrame(translated_append_list, columns=["question", "answer"])

    translated_file_name = str(os.path.join(translated_dir_path,
                                            name_of_dataset.replace("/", "_")
                                            )) + split_type + "_" + dialect_name + "_translated.csv"

    if not os.path.isfile(translated_file_name):
        print("Writing file.....", translated_file_name)
        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", translated_file_name)


if __name__ == "__main__":
    """
    Main method: read the config file, then store and translate the dataset to
    produce a high-quality SFT dataset.
    """
    config.read("dataset.ini", encoding="utf-8")
    for key in config["openai"]:
        if key.lower().strip() == "name":
            name_of_dataset = config["openai"][key].replace('"', "").strip().strip("/")
            splits = {"train": "main/train-00000-of-00001.parquet",
                      "test": "main/test-00000-of-00001.parquet"}
            # Fetch and process both splits so each file name reflects the
            # split it actually contains.
            for split_type, split_path in splits.items():
                split_df = pd.read_parquet("hf://datasets/" + name_of_dataset + "/" + split_path)
                store_sft_dataset(name_of_dataset, split_df, split_type)
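
# Typical invocation, assuming "dataset.ini" sits in the working directory:
#   python prepare_sft_dataset.py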