nickmalhotra committed
Commit f05550b
Parent(s): 19871d5
Upload 4 files
Files added for the same
- SFT/Philosophy.txt +17 -0
- SFT/dataset.ini +8 -0
- SFT/prepare_sft_dataset.py +104 -9
- SFT/source/openai_gsm8k_train.csv +0 -0
SFT/Philosophy.txt
ADDED
@@ -0,0 +1,17 @@
+The philosophy of the SFT dataset is that it is a JSON dictionary structure containing the following:
+{'prompt', 'prompt_id', 'message', 'dialect'}
+
+These examples can be single turn or multi turn.
+
+Example (Hindi)
+{ 'dialect': 'hi',
+  'prompt': 'मुझे लंदन में कौन से प्रसिद्ध स्थलों का दौरा करना चाहिए?',
+  'message': 'आपको बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जैसी जगहों पर जाना चाहिए।'
+}
+
+An example in another Hindi dialect would be as follows.
+Example (Hindi - Dogri)
+{ 'dialect': 'do',
+  'prompt': 'लंदन च किस मशहूर लैंडमार्क दा दौरा लाना चाहिदा ?',
+  'message': 'तुसें गी बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जनेह् थाह्रें दा दौरा लाना चाहिदा।'
+}
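For illustration, here is a minimal Python sketch (not part of the commit) of how one record in this format could be built and serialized. The field names follow Philosophy.txt; the helper name build_sft_record and the prompt_id value are hypothetical.

import json

def build_sft_record(prompt, message, dialect, prompt_id):
    # Assemble one single-turn SFT record with the fields listed in Philosophy.txt
    return {
        "prompt": prompt,
        "prompt_id": prompt_id,
        "message": message,
        "dialect": dialect,
    }

record = build_sft_record(
    prompt="मुझे लंदन में कौन से प्रसिद्ध स्थलों का दौरा करना चाहिए?",
    message="आपको बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जैसी जगहों पर जाना चाहिए।",
    dialect="hi",
    prompt_id=0,  # hypothetical id; the commit does not show how ids are assigned
)
print(json.dumps(record, ensure_ascii=False, indent=2))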
SFT/dataset.ini
ADDED
@@ -0,0 +1,8 @@
+[DATASETS]
+
+[openai]
+name = "openai/gsm8k/"
+structure = ["question", "answer"]
+initial_url = "hf://datasets/"
+train_data = "main/train-00000-of-00001.parquet"
+test_data = "main/test-00000-of-00001.parquet"
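As a sketch (assuming this dataset.ini sits in the working directory), the [openai] section can be read with configparser and combined into the full parquet path. The quote-stripping mirrors what prepare_sft_dataset.py does; the use of initial_url and train_data shown here is illustrative, not what the committed script itself does.

import configparser

config = configparser.ConfigParser(default_section="DATASETS")
config.read("dataset.ini", encoding="utf-8")

openai_cfg = config["openai"]
# configparser keeps the surrounding quotes, so strip them as the script does
name = openai_cfg["name"].replace('"', '')                 # openai/gsm8k/
initial_url = openai_cfg["initial_url"].replace('"', '')   # hf://datasets/
train_path = openai_cfg["train_data"].replace('"', '')

print(initial_url + name + train_path)
# hf://datasets/openai/gsm8k/main/train-00000-of-00001.parquet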
SFT/prepare_sft_dataset.py
CHANGED
@@ -1,9 +1,104 @@
-'''
-File Name: prepare_sft_dataset.py Author: Nikhil Malhotra
-Date: 21/7/2024
-purpose: The purpose of a file is to create high quality SFT dataset for Project Indus.
-Dataset source is obtained from Hugging face and enables to get high quality SFT dataset
-Dataset is then translated in requisite dialects as supported by Google
-Dataset is also split in train and test and enables to create requisite files
-Name of the file carries the source, translated into a dialect along with split type
-'''
+'''
+File Name: prepare_sft_dataset.py   Author: Nikhil Malhotra
+Date: 21/7/2024
+Purpose: This file creates a high quality SFT dataset for Project Indus.
+The dataset source is obtained from Hugging Face, which provides high quality data for SFT.
+The dataset is then translated into the requisite dialects as supported by Google Translate.
+The dataset is also split into train and test, and the requisite files are created for each split.
+The name of each output file carries the source, the dialect it was translated into, and the split type.
+'''
+# Imports
+import os
+import pandas as pd
+import configparser
+from googletrans import Translator
+
+
+# Configuration settings
+source_dir_path = "source"
+translated_dir_path = "translated"
+config = configparser.ConfigParser(default_section="DATASETS")
+'''
+Dialect codes used for translation:
+1. English : en
+2. Hindi : hi
+3. Dogri : doi
+4. Bhojpuri : bho
+5. Maithili : mai
+'''
+translated_dialect = "hi"
+
+def store_sft_dataset(name_of_dataset, data_frame, split_type):
+    """
+    Fetch data from the Hugging Face source and store it in a csv file.
+    This method also calls translation into dialects.
+    name_of_dataset : name of the dataset fetched from Hugging Face
+    data_frame : data frame containing the questions/answers (prompts and messages)
+    split_type : "train" or "test", according to how the data is split
+    """
+    file_name = str(os.path.join(source_dir_path, name_of_dataset.replace("/", "_"))) + split_type + ".csv"
+    if not os.path.isfile(file_name):
+        print("Opening file.....", file_name)
+        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
+        print("Finished writing file....", file_name)
+
+    # Change translated_dialect at the top depending upon the dialect needed
+    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)
+
+
+def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
+    """
+    Translate a dataset held in a data frame into the given dialect and
+    store the result as a csv file.
+    name_of_dataset : name of the dataset fetched from Hugging Face
+    data_frame : data frame containing the questions/answers (prompts and messages)
+    split_type : "train" or "test", according to how the data is split
+    dialect_name : dialect into which the data needs to be converted
+    """
+    USE_COUNT = False  # Only use this if you need to test with a small amount of data
+    count = 0
+    print("Translating now....")
+    translator = Translator()
+    # Make a new dataframe for the translation
+    translate_df = pd.DataFrame(columns=["question", "answer"])
+    translated_append_list = []
+    for index, val in data_frame.iterrows():
+        translated_ques = translator.translate(val["question"], dest=dialect_name).text
+        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
+        translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
+        count += 1
+        # If USE_COUNT is set, the program assumes you are testing.
+        # It breaks the loop after 5 data points.
+        if USE_COUNT:
+            if count == 5:
+                break
+
+    df = pd.concat([translate_df, pd.DataFrame(translated_append_list)])
+
+    translated_file_name = str(os.path.join(translated_dir_path,
+                                            name_of_dataset.replace("/", "_")
+                                            )) + split_type + "_" + dialect_name + "_translated.csv"
+
+    if not os.path.isfile(translated_file_name):
+        print("Opening file.....", translated_file_name)
+        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
+        print("Finished writing file....", translated_file_name)
+
+
+if __name__ == "__main__":
+    """
+    Main method: read the config file and then use it to store and translate the data,
+    producing a high quality SFT dataset.
+    """
+    # Read the config file and get the requisite section to call for the dataset
+    config.read("dataset.ini", encoding="utf-8")
+    for key in config['openai']:
+        if key.lower().strip() == "name":
+            name_of_dataset = config['openai'][key].replace('"', '')
+    splits = {'train': 'main/train-00000-of-00001.parquet',
+              'test': 'main/test-00000-of-00001.parquet'
+              }
+    df_1 = pd.read_parquet("hf://datasets/" + name_of_dataset + splits["train"])
+    store_sft_dataset(name_of_dataset, df_1, "train")
+
+
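For reference, a minimal usage sketch (assuming the googletrans and huggingface_hub packages are installed and the source/ and translated/ directories exist next to the script) of how the two functions could be driven directly on a small sample instead of the full GSM8K split. The five-row head() is an illustrative shortcut, not something the committed script does; the "doi" code follows the dialect list at the top of the script.

import pandas as pd
from prepare_sft_dataset import store_sft_dataset, translate_in_dialects

# Load a handful of rows from the train split to keep the Google Translate calls cheap
df = pd.read_parquet("hf://datasets/openai/gsm8k/main/train-00000-of-00001.parquet").head(5)

# Writes source/openai_gsm8k_train.csv and then a translated csv such as
# translated/openai_gsm8k_train_hi_translated.csv (dialect set by translated_dialect)
store_sft_dataset("openai/gsm8k/", df, "train")

# Or translate into another supported dialect directly, e.g. Dogri ("doi")
translate_in_dialects("openai/gsm8k/", df, "train", dialect_name="doi")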
SFT/source/openai_gsm8k_train.csv
ADDED
The diff for this file is too large to render; see the raw diff.