nickmalhotra committed on
Commit f05550b
1 Parent(s): 19871d5

Upload 4 files

Files added for the same

SFT/Philosophy.txt ADDED
@@ -0,0 +1,17 @@
+ The philosophy of the SFT dataset is that each example is a JSON dictionary containing the following keys:
+ {'prompt', 'prompt_id', 'message', 'dialect'}
+
+ These examples can be single turn or multi turn.
+
+ Example (Hindi)
+ { 'dialect': 'hi',
+   'prompt': 'मुझे लंदन में कौन से प्रसिद्ध स्थलों का दौरा करना चाहिए?',
+   'message': 'आपको बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जैसी जगहों पर जाना चाहिए।'
+ }
+
+ An example in another Hindi dialect, Dogri, would be as follows.
+ Example (Hindi - Dogri)
+ { 'dialect': 'doi',
+   'prompt': 'लंदन च किस मशहूर लैंडमार्क दा दौरा लाना चाहिदा ?',
+   'message': 'तुसें गी बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जनेह् थाह्रें दा दौरा लाना चाहिदा।'
+ }
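
For readers who don't parse Devanagari: the Hindi example asks "Which famous landmarks should I visit in London?" and the reply recommends Big Ben, the Tower of London, the London Eye and the Greenwich museum; the Dogri example carries the same exchange. Below is a minimal sketch, not part of the commit, of checking a record against this schema in Python; the validator name is hypothetical, and the prompt_id value is made up since the examples omit that key.

import json

REQUIRED_KEYS = {"prompt", "prompt_id", "message", "dialect"}

def is_valid_sft_record(record: dict) -> bool:
    # A record must carry all four keys named in Philosophy.txt.
    return REQUIRED_KEYS.issubset(record.keys())

record = {
    "dialect": "hi",
    "prompt_id": "hi-0001",  # hypothetical id; the schema lists prompt_id but the examples omit it
    "prompt": "मुझे लंदन में कौन से प्रसिद्ध स्थलों का दौरा करना चाहिए?",
    "message": "आपको बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जैसी जगहों पर जाना चाहिए।",
}
print(is_valid_sft_record(record))                       # True
print(json.dumps(record, ensure_ascii=False, indent=2))  # round-trips cleanly as JSON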
SFT/dataset.ini ADDED
@@ -0,0 +1,8 @@
+ [DATASETS]
+
+ [openai]
+ name = "openai/gsm8k/"
+ structure = ["question", "answer"]
+ initial_url = "hf://datasets/"
+ train_data = "main/train-00000-of-00001.parquet"
+ test_data = "main/test-00000-of-00001.parquet"
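
A minimal sketch, not part of the commit, of how this ini resolves to a Hugging Face parquet path when read with configparser the same way prepare_sft_dataset.py below does; the unquote helper is hypothetical, added because the values are stored with literal quotes.

import configparser

config = configparser.ConfigParser(default_section="DATASETS")
config.read("dataset.ini", encoding="utf-8")

def unquote(value: str) -> str:
    # Values in dataset.ini are stored with literal double quotes.
    return value.strip().strip('"')

name = unquote(config["openai"]["name"])              # openai/gsm8k/
base = unquote(config["openai"]["initial_url"])       # hf://datasets/
train_path = unquote(config["openai"]["train_data"])  # main/train-00000-of-00001.parquet

print(base + name + train_path)
# hf://datasets/openai/gsm8k/main/train-00000-of-00001.parquet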
SFT/prepare_sft_dataset.py CHANGED
@@ -1,9 +1,104 @@
- '''
- File Name: prepare_sft_dataset.py Author: Nikhil Malhotra
- Date: 21/7/2024
- purpose: The purpose of a file is to create high quality SFT dataset for Project Indus.
- Dataset source is obtained from Hugging face and enables to get high quality SFT dataset
- Dataset is then translated in requisite dialects as supported by Google
- Dataset is also split in train and test and enables to create requisite files
- Name of the file carries the source, translated into a dialect along with split type
- '''
+ '''
+ File Name: prepare_sft_dataset.py    Author: Nikhil Malhotra
+ Date: 21/7/2024
+ Purpose: Create a high quality SFT dataset for Project Indus.
+ The source dataset is obtained from Hugging Face.
+ It is then translated into the requisite dialects supported by Google Translate,
+ and each split (train/test) is written out to its own file.
+ The file name carries the source, the dialect it was translated into and the split type.
+ '''
+ # Imports
+ import os
+ import pandas as pd
+ import configparser
+ from googletrans import Translator
+
+
+ # Configuration settings
+ source_dir_path = "source"
+ translated_dir_path = "translated"
+ config = configparser.ConfigParser(default_section="DATASETS")
+ '''
+ Dialect codes used for translation:
+ 1. English : en
+ 2. Hindi : hi
+ 3. Dogri : doi
+ 4. Bhojpuri : bho
+ 5. Maithili : mai
+ '''
+ translated_dialect = "hi"
+
+ def store_sft_dataset(name_of_dataset, data_frame, split_type):
+     """
+     Store data fetched from the Hugging Face source in a CSV file,
+     then trigger translation into the configured dialect.
+     Input:
+         name_of_dataset: name of the dataset fetched from Hugging Face
+         data_frame: data frame containing questions and answers (or prompts and messages)
+         split_type: "train" or "test", depending on the split
+     """
+     file_name = str(os.path.join(source_dir_path, name_of_dataset.replace("/", "_"))) + split_type + ".csv"
+     if not os.path.isfile(file_name):
+         print("Opening file.....", file_name)
+         data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
+         print("Finished writing file....", file_name)
+
+     # Change translated_dialect at the top depending upon the dialect needed
+     translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)
+
+
+ def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
+     """
+     Translate a dataset held in a data frame into the given dialect and
+     store the result as a CSV file.
+     Input:
+         name_of_dataset: name of the dataset fetched from Hugging Face
+         data_frame: data frame containing questions and answers (or prompts and messages)
+         split_type: "train" or "test", depending on the split
+         dialect_name: dialect into which the data needs to be converted
+     """
+     USE_COUNT = False  # Only set this to True to test with a small amount of data
+     count = 0
+     print("Translating now....")
+     translator = Translator()
+     # Make a new dataframe for the translation
+     translate_df = pd.DataFrame(columns=["question", "answer"])
+     translated_append_list = []
+     for index, val in data_frame.iterrows():
+         translated_ques = translator.translate(val["question"], dest=dialect_name).text
+         translated_ans = translator.translate(val["answer"], dest=dialect_name).text
+         translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
+         count += 1
+         # If USE_COUNT is set, the program assumes you are testing
+         # and breaks the loop after 5 data points.
+         if USE_COUNT:
+             if count == 5:
+                 break
+
+     df = pd.concat([translate_df, pd.DataFrame(translated_append_list)])
+
+     translated_file_name = str(os.path.join(translated_dir_path,
+                                             name_of_dataset.replace("/", "_")
+                                             )) + split_type + "_" + dialect_name + "_translated.csv"
+
+     if not os.path.isfile(translated_file_name):
+         print("Opening file.....", translated_file_name)
+         df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
+         print("Finished writing file....", translated_file_name)
+
+
+ if __name__ == "__main__":
+     """
+     Main method: read the config file, then store and translate the dataset
+     to produce a high quality SFT dataset.
+     """
+     # Read the config file and get the requisite section for the dataset
+     config.read("dataset.ini", encoding="utf-8")
+     name_of_dataset = config['openai']['name'].replace('"', '')
+     splits = {'train': 'main/train-00000-of-00001.parquet',
+               'test': 'main/test-00000-of-00001.parquet'}
+     # Fetch each split's parquet from Hugging Face and store/translate it
+     # under its matching split label ("train" or "test").
+     for split_type, split_path in splits.items():
+         df_split = pd.read_parquet("hf://datasets/" + name_of_dataset + split_path)
+         store_sft_dataset(name_of_dataset, df_split, split_type)
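
As a usage note, here is a small sketch, assuming the source/ and translated/ directories exist next to the script and the default Hindi dialect is kept, of the file names the script composes; the source name matches the openai_gsm8k_train.csv committed below.

import os

name_of_dataset = "openai/gsm8k/"  # from dataset.ini
split_type, dialect = "train", "hi"

source_file = os.path.join("source", name_of_dataset.replace("/", "_")) + split_type + ".csv"
translated_file = (os.path.join("translated", name_of_dataset.replace("/", "_"))
                   + split_type + "_" + dialect + "_translated.csv")

print(source_file)      # source/openai_gsm8k_train.csv
print(translated_file)  # translated/openai_gsm8k_train_hi_translated.csv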
SFT/source/openai_gsm8k_train.csv ADDED
The diff for this file is too large to render. See raw diff