'''
File Name: prepare_sft_dataset.py
Author: Nikhil Malhotra
Date: 21/7/2024
Purpose: Create a high-quality SFT dataset for Project Indus.
The source dataset is obtained from Hugging Face and stored as a CSV file.
It is then translated into the requisite dialects supported by Google Translate.
The data is split into train and test sets, and a file is written per split.
Each file name carries the source dataset, the dialect it was translated into,
and the split type.
'''
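# Illustrative example of the files this script writes, assuming a hypothetical
# dataset id "some-org/some-sft-dataset" (the real id comes from dataset.ini):
#
#   source/some-org_some-sft-dataset_test.csv
#   translated/some-org_some-sft-dataset_test_hi_translated.csv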
# Imports
import os
import configparser

import pandas as pd
from googletrans import Translator

# Configuration settings
source_dir_path = "source"
translated_dir_path = "translated"
config = configparser.ConfigParser(default_section="DATASETS")
'''
Dialect codes used for translation (as understood by googletrans):
1. English : en
2. Hindi : hi
3. Dogri : doi
4. Bhojpuri : bho
5. Maithili : mai
'''
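# A minimal sketch of a single translation call with one of the codes above,
# kept as a comment so it does not run on import. It mirrors the synchronous
# googletrans usage in this file (Translator().translate(...).text); the exact
# library version is not pinned here:
#
#   translator = Translator()
#   print(translator.translate("How are you?", dest="hi").text)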
translated_dialect = "hi"
def store_sft_dataset(name_of_dataset, data_frame, split_type):
    """
    Store data sourced from Hugging Face in a CSV file,
    then trigger translation into the configured dialect.
    Input:
        name_of_dataset: name of the dataset fetched from Hugging Face
        data_frame: data frame containing questions/answers or prompts/messages
        split_type: "test" or "train", matching how the data is split
    """
    # The file name carries the dataset name and the split type
    file_name = str(os.path.join(source_dir_path, name_of_dataset.replace("/", "_"))) + "_" + split_type + ".csv"
    if not os.path.isfile(file_name):
        print("Opening file.....", file_name)
        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", file_name)
    # Change translated_dialect at the top of the file depending upon the dialect needed
    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)
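# Hypothetical usage sketch (the column names match what translate_in_dialects
# expects; the dataset id is a placeholder):
#
#   df = pd.DataFrame({"question": ["What is SFT?"], "answer": ["Supervised fine-tuning."]})
#   store_sft_dataset("some-org/some-sft-dataset", df, "test")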
def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
    """
    Translate a dataset held in a data frame into the given dialect
    and store the result as a CSV file.
    Input:
        name_of_dataset: name of the dataset fetched from Hugging Face
        data_frame: data frame containing questions/answers or prompts/messages
        split_type: "test" or "train", matching how the data is split
        dialect_name: dialect into which the data needs to be converted
    """
    USE_COUNT = False  # Only set this if you need to test with a small sample
    count = 0
    print("Translating now....")
    translator = Translator()
    # Make a new dataframe for the translation
    translate_df = pd.DataFrame(columns=["question", "answer"])
    translated_append_list = []
    for _, val in data_frame.iterrows():
        translated_ques = translator.translate(val["question"], dest=dialect_name).text
        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
        translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
        count += 1
        # If USE_COUNT is set, the program assumes you are testing.
        # It breaks the loop after 5 data points.
        if USE_COUNT:
            if count == 5:
                break
    df = pd.concat([translate_df, pd.DataFrame(translated_append_list)])
    translated_file_name = str(os.path.join(translated_dir_path,
                                            name_of_dataset.replace("/", "_")
                                            )) + "_" + split_type + "_" + dialect_name + "_translated.csv"
    if not os.path.isfile(translated_file_name):
        print("Opening file.....", translated_file_name)
        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
        print("Finished writing file....", translated_file_name)
if __name__ == "__main__":
    """
    Main method: read the config file, then use it to store and translate the
    data to produce a high-quality SFT dataset.
    """
    # Read the config file and get the requisite section for the dataset
    config.read("dataset.ini", encoding="utf-8")
    for key in config['openai']:
        if key.lower().strip() == "name":
            name_of_dataset = config['openai'][key].replace('"', '')
    splits = {'train': 'main/train-00000-of-00001.parquet',
              'test': 'main/test-00000-of-00001.parquet'
              }
    # A "/" separator is needed between the dataset id and the split path
    df_1 = pd.read_parquet("hf://datasets/" + name_of_dataset + "/" + splits["test"])
    store_sft_dataset(name_of_dataset, df_1, "test")