'''
    File Name: prepare_sft_dataset.py      Author: Nikhil Malhotra
    Date: 21/7/2024

    Purpose: This file creates a high-quality SFT dataset for Project Indus.

    The source dataset is obtained from Hugging Face and provides high-quality SFT data.
    The dataset is then translated into the requisite dialects supported by Google Translate.
    The dataset is also split into train and test sets, and the corresponding files are created.
    The name of each output file carries the source, the dialect it was translated into, and the split type.
'''
#Imports
import os
import pandas as pd
import configparser
from googletrans import Translator
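# Note: the googletrans API has changed across releases; this script assumes a
# release where Translator.translate() is synchronous (for example the commonly
# used googletrans==3.1.0a0). Pin the installed version accordingly.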


#Configuration settings
source_dir_path = "source"
translated_dir_path = "translated"
config = configparser.ConfigParser(default_section="DATASETS")
'''
    Dialect language codes used for translation:
    1. English  : en
    2. Hindi    : hi
    3. Dogri    : doi
    4. Bhojpuri : bho
    5. Maithili : mai
'''
translated_dialect = "hi"
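# A minimal sanity check for a dialect code (a sketch, assuming the synchronous
# googletrans API; support for the newer codes such as doi/bho/mai depends on
# the installed googletrans version):
#
#   from googletrans import Translator
#   print(Translator().translate("How are you?", dest=translated_dialect).text)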

def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """
        Store data fetched from the Hugging Face source in a CSV file and
        trigger its translation into the configured dialect.

        name_of_dataset : name of the dataset fetched from Hugging Face
        data_frame      : data frame containing questions/answers or prompts/messages
        split_type      : "train" or "test", matching the split of the data
    """
    file_name = os.path.join(source_dir_path, name_of_dataset.replace("/", "_")) + "_" + split_type + ".csv"
    if not os.path.isfile(file_name):
        print("Writing file.....", file_name)
        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", file_name)

    #Change translated_dialect at the top of the file depending upon the dialect needed
    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)
   

def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """
        Translate a dataset held in a data frame into the given dialect and
        store the result as a CSV file.

        name_of_dataset : name of the dataset fetched from Hugging Face
        data_frame      : data frame containing questions/answers or prompts/messages
        split_type      : "train" or "test", matching the split of the data
        dialect_name    : dialect into which the data needs to be translated
    """
    USE_COUNT = False  # Only set this to True if you need to test with a small sample
    count = 0
    print("Translating now....")
    translator = Translator()
    # Translate each row, then assemble the results into a new data frame
    translated_append_list = []
    for index, val in data_frame.iterrows():
        translated_ques = translator.translate(val["question"], dest=dialect_name).text
        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
        translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
        count += 1
        # If USE_COUNT is enabled, the run is treated as a test and
        # the loop stops after 5 data points.
        if USE_COUNT and count == 5:
            break

    df = pd.DataFrame(translated_append_list, columns=["question", "answer"])

    translated_file_name = os.path.join(translated_dir_path,
                                        name_of_dataset.replace("/", "_")
                                        ) + "_" + split_type + "_" + dialect_name + "_translated.csv"

    if not os.path.isfile(translated_file_name):
        print("Writing file.....", translated_file_name)
        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", translated_file_name)


if __name__ == "__main__":
    """

        Main method to read config file and then use it to store and translate to produce

        a high quality SFT dataset

    """
    #Read the config file and get the requisite section to call for the dataset
    config.read("dataset.ini", encoding="utf-8")
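    # The expected dataset.ini layout is an assumption inferred from how the
    # config is read below; a minimal example (the dataset name is illustrative):
    #
    #   [DATASETS]
    #
    #   [openai]
    #   name = "openai/gsm8k"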
    for key in config['openai']:
        if key.lower().strip() == "name":
            name_of_dataset = config['openai'][key].replace('"', '')
            splits = {'train': 'main/train-00000-of-00001.parquet',
                      'test': 'main/test-00000-of-00001.parquet'}
            # Fetch and process both splits so the train and test files are created
            for split_type, split_path in splits.items():
                df_1 = pd.read_parquet("hf://datasets/" + name_of_dataset.strip("/") + "/" + split_path)
                store_sft_dataset(name_of_dataset, df_1, split_type)