'''
File Name: prepare_sft_dataset.py
Author: Nikhil Malhotra
Date: 21/7/2024
Purpose: Create a high-quality SFT dataset for Project Indus.
The source dataset is obtained from Hugging Face and stored as a CSV file.
It is then translated into the requisite dialects supported by Google Translate.
The data is split into train and test sets, and a file is written per split.
Each file name carries the source dataset, the dialect it was translated into,
and the split type.
'''
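# Illustrative example of the files this script writes, assuming a hypothetical
# dataset id "some-org/some-sft-dataset" (the real id comes from dataset.ini):
#
#   source/some-org_some-sft-dataset_test.csv
#   translated/some-org_some-sft-dataset_test_hi_translated.csv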
# Imports
import os
import configparser

import pandas as pd
from googletrans import Translator

# Configuration settings
source_dir_path = "source"
translated_dir_path = "translated"
config = configparser.ConfigParser(default_section="DATASETS")
'''
Dialect codes used for translation (as understood by googletrans):
1. English : en
2. Hindi : hi
3. Dogri : doi
4. Bhojpuri : bho
5. Maithili : mai
'''
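# A minimal sketch of a single translation call with one of the codes above,
# kept as a comment so it does not run on import. It mirrors the synchronous
# googletrans usage in this file (Translator().translate(...).text); the exact
# library version is not pinned here:
#
#   translator = Translator()
#   print(translator.translate("How are you?", dest="hi").text)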
translated_dialect = "hi"
def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """
    Store data sourced from Hugging Face in a CSV file,
    then trigger translation into the configured dialect.
    Input:
        name_of_dataset: name of the dataset fetched from Hugging Face
        data_frame: data frame containing questions/answers or prompts/messages
        split_type: "test" or "train", matching how the data is split
    """
    # The file name carries the dataset name and the split type
    file_name = str(os.path.join(source_dir_path, name_of_dataset.replace("/", "_"))) + "_" + split_type + ".csv"
    if not os.path.isfile(file_name):
        print("Opening file.....", file_name)
        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", file_name)
    # Change translated_dialect at the top of the file depending upon the dialect needed
    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)
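# Hypothetical usage sketch (the column names match what translate_in_dialects
# expects; the dataset id is a placeholder):
#
#   df = pd.DataFrame({"question": ["What is SFT?"], "answer": ["Supervised fine-tuning."]})
#   store_sft_dataset("some-org/some-sft-dataset", df, "test")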
def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """
    Translate a dataset held in a data frame into the given dialect
    and store the result as a CSV file.
    Input:
        name_of_dataset: name of the dataset fetched from Hugging Face
        data_frame: data frame containing questions/answers or prompts/messages
        split_type: "test" or "train", matching how the data is split
        dialect_name: dialect into which the data needs to be converted
    """
    USE_COUNT = False  # Only set this if you need to test with a small sample
    count = 0
    print("Translating now....")
    translator = Translator()
    # Make a new dataframe for the translation
    translate_df = pd.DataFrame(columns=["question", "answer"])
    translated_append_list = []
    for _, val in data_frame.iterrows():
        translated_ques = translator.translate(val["question"], dest=dialect_name).text
        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
        translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
        count += 1
        # If USE_COUNT is set, the program assumes you are testing.
        # It breaks the loop after 5 data points.
        if USE_COUNT:
            if count == 5:
                break
    df = pd.concat([translate_df, pd.DataFrame(translated_append_list)])
    translated_file_name = str(os.path.join(translated_dir_path,
                                            name_of_dataset.replace("/", "_")
                                            )) + "_" + split_type + "_" + dialect_name + "_translated.csv"
    if not os.path.isfile(translated_file_name):
        print("Opening file.....", translated_file_name)
        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", translated_file_name)
if __name__ == "__main__":
    """
    Main method: read the config file, then use it to store and translate the
    data to produce a high-quality SFT dataset.
    """
    # Read the config file and get the requisite section for the dataset
    config.read("dataset.ini", encoding="utf-8")
    for key in config['openai']:
        if key.lower().strip() == "name":
            name_of_dataset = config['openai'][key].replace('"', '')
    splits = {'train': 'main/train-00000-of-00001.parquet',
              'test': 'main/test-00000-of-00001.parquet'
              }
    # A "/" separator is needed between the dataset id and the split path
    df_1 = pd.read_parquet("hf://datasets/" + name_of_dataset + "/" + splits["test"])
    store_sft_dataset(name_of_dataset, df_1, "test")