nickmalhotra committed
Commit f05550b
Parent(s): 19871d5
Upload 4 files
Files added for the same
- SFT/Philosophy.txt +17 -0
- SFT/dataset.ini +8 -0
- SFT/prepare_sft_dataset.py +104 -9
- SFT/source/openai_gsm8k_train.csv +0 -0
SFT/Philosophy.txt
ADDED
@@ -0,0 +1,17 @@
+The philosophy of the SFT dataset is that it is a JSON dictionary structure containing the following:
+{'prompt', 'prompt_id', 'message', 'dialect'}
+
+These examples can be single turn or multi turn.
+
+Example (Hindi)
+{ 'dialect': 'hi',
+  'prompt': 'मुझे लंदन में कौन से प्रसिद्ध स्थलों का दौरा करना चाहिए?',
+  'message': 'आपको बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जैसी जगहों पर जाना चाहिए।'
+}
+
+An example in another Hindi dialect would be as follows.
+Example (Hindi - Dogri)
+{ 'dialect': 'do',
+  'prompt': 'लंदन च किस मशहूर लैंडमार्क दा दौरा लाना चाहिदा ?',
+  'message': 'तुसें गी बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जनेह् थाह्रें दा दौरा लाना चाहिदा।'
+}
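For illustration, here is a minimal Python sketch (not part of the commit) of how one record in this format could be built and serialized. The field names follow Philosophy.txt; the helper name build_sft_record and the prompt_id value are hypothetical.

import json

def build_sft_record(prompt, message, dialect, prompt_id):
    # Assemble one single-turn SFT record with the fields listed in Philosophy.txt
    return {
        "prompt": prompt,
        "prompt_id": prompt_id,
        "message": message,
        "dialect": dialect,
    }

record = build_sft_record(
    prompt="मुझे लंदन में कौन से प्रसिद्ध स्थलों का दौरा करना चाहिए?",
    message="आपको बिग बेन, टावर ऑफ लंदन, लंदन आई, ग्रीनविच म्यूजियम जैसी जगहों पर जाना चाहिए।",
    dialect="hi",
    prompt_id=0,  # hypothetical id; the commit does not show how ids are assigned
)
print(json.dumps(record, ensure_ascii=False, indent=2))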
SFT/dataset.ini
ADDED
@@ -0,0 +1,8 @@
+[DATASETS]
+
+[openai]
+name = "openai/gsm8k/"
+structure = ["question", "answer"]
+initial_url = "hf://datasets/"
+train_data = "main/train-00000-of-00001.parquet"
+test_data = "main/test-00000-of-00001.parquet"
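As a sketch (assuming this dataset.ini sits in the working directory), the [openai] section can be read with configparser and combined into the full parquet path. The quote-stripping mirrors what prepare_sft_dataset.py does; the use of initial_url and train_data shown here is illustrative, not what the committed script itself does.

import configparser

config = configparser.ConfigParser(default_section="DATASETS")
config.read("dataset.ini", encoding="utf-8")

openai_cfg = config["openai"]
# configparser keeps the surrounding quotes, so strip them as the script does
name = openai_cfg["name"].replace('"', '')                 # openai/gsm8k/
initial_url = openai_cfg["initial_url"].replace('"', '')   # hf://datasets/
train_path = openai_cfg["train_data"].replace('"', '')

print(initial_url + name + train_path)
# hf://datasets/openai/gsm8k/main/train-00000-of-00001.parquet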
SFT/prepare_sft_dataset.py
CHANGED
@@ -1,9 +1,104 @@
-'''
-File Name: prepare_sft_dataset.py Author: Nikhil Malhotra
-Date: 21/7/2024
-purpose: The purpose of a file is to create high quality SFT dataset for Project Indus.
-Dataset source is obtained from Hugging face and enables to get high quality SFT dataset
-Dataset is then translated in requisite dialects as supported by Google
-Dataset is also split in train and test and enables to create requisite files
-Name of the file carries the source, translated into a dialect along with split type
-'''
+'''
+File Name: prepare_sft_dataset.py   Author: Nikhil Malhotra
+Date: 21/7/2024
+Purpose: This file creates a high quality SFT dataset for Project Indus.
+The dataset source is obtained from Hugging Face, which provides high quality data for SFT.
+The dataset is then translated into the requisite dialects as supported by Google Translate.
+The dataset is also split into train and test, and the requisite files are created for each split.
+The name of each output file carries the source, the dialect it was translated into, and the split type.
+'''
+# Imports
+import os
+import pandas as pd
+import configparser
+from googletrans import Translator
+
+
+# Configuration settings
+source_dir_path = "source"
+translated_dir_path = "translated"
+config = configparser.ConfigParser(default_section="DATASETS")
+'''
+Dialect codes used for translation:
+1. English : en
+2. Hindi : hi
+3. Dogri : doi
+4. Bhojpuri : bho
+5. Maithili : mai
+'''
+translated_dialect = "hi"
+
+def store_sft_dataset(name_of_dataset, data_frame, split_type):
+    """
+    Fetch data from the Hugging Face source and store it in a csv file.
+    This method also calls translation into dialects.
+    name_of_dataset : name of the dataset fetched from Hugging Face
+    data_frame : data frame containing the questions/answers (prompts and messages)
+    split_type : "train" or "test", according to how the data is split
+    """
+    file_name = str(os.path.join(source_dir_path, name_of_dataset.replace("/", "_"))) + split_type + ".csv"
+    if not os.path.isfile(file_name):
+        print("Opening file.....", file_name)
+        data_frame.to_csv(file_name, encoding="utf-8", index=False, header=True)
+        print("Finished writing file....", file_name)
+
+    # Change translated_dialect at the top depending upon the dialect needed
+    translate_in_dialects(name_of_dataset, data_frame, split_type, translated_dialect)
+
+
+def translate_in_dialects(name_of_dataset, data_frame, split_type, dialect_name="hi"):
+    """
+    Translate a dataset held in a data frame into the given dialect and
+    store the result as a csv file.
+    name_of_dataset : name of the dataset fetched from Hugging Face
+    data_frame : data frame containing the questions/answers (prompts and messages)
+    split_type : "train" or "test", according to how the data is split
+    dialect_name : dialect into which the data needs to be converted
+    """
+    USE_COUNT = False  # Only use this if you need to test with a small amount of data
+    count = 0
+    print("Translating now....")
+    translator = Translator()
+    # Make a new dataframe for the translation
+    translate_df = pd.DataFrame(columns=["question", "answer"])
+    translated_append_list = []
+    for index, val in data_frame.iterrows():
+        translated_ques = translator.translate(val["question"], dest=dialect_name).text
+        translated_ans = translator.translate(val["answer"], dest=dialect_name).text
+        translated_append_list.append({'question': translated_ques, 'answer': translated_ans})
+        count += 1
+        # If USE_COUNT is set, the program assumes you are testing.
+        # It breaks the loop after 5 data points.
+        if USE_COUNT:
+            if count == 5:
+                break
+
+    df = pd.concat([translate_df, pd.DataFrame(translated_append_list)])
+
+    translated_file_name = str(os.path.join(translated_dir_path,
+                                            name_of_dataset.replace("/", "_")
+                                            )) + split_type + "_" + dialect_name + "_translated.csv"
+
+    if not os.path.isfile(translated_file_name):
+        print("Opening file.....", translated_file_name)
+        df.to_csv(translated_file_name, encoding="utf-8", index=False, header=True)
+        print("Finished writing file....", translated_file_name)
+
+
+if __name__ == "__main__":
+    """
+    Main method: read the config file and then use it to store and translate the data,
+    producing a high quality SFT dataset.
+    """
+    # Read the config file and get the requisite section to call for the dataset
+    config.read("dataset.ini", encoding="utf-8")
+    for key in config['openai']:
+        if key.lower().strip() == "name":
+            name_of_dataset = config['openai'][key].replace('"', '')
+    splits = {'train': 'main/train-00000-of-00001.parquet',
+              'test': 'main/test-00000-of-00001.parquet'
+              }
+    df_1 = pd.read_parquet("hf://datasets/" + name_of_dataset + splits["train"])
+    store_sft_dataset(name_of_dataset, df_1, "train")
+
+
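For reference, a minimal usage sketch (assuming the googletrans and huggingface_hub packages are installed and the source/ and translated/ directories exist next to the script) of how the two functions could be driven directly on a small sample instead of the full GSM8K split. The five-row head() is an illustrative shortcut, not something the committed script does; the "doi" code follows the dialect list at the top of the script.

import pandas as pd
from prepare_sft_dataset import store_sft_dataset, translate_in_dialects

# Load a handful of rows from the train split to keep the Google Translate calls cheap
df = pd.read_parquet("hf://datasets/openai/gsm8k/main/train-00000-of-00001.parquet").head(5)

# Writes source/openai_gsm8k_train.csv and then a translated csv such as
# translated/openai_gsm8k_train_hi_translated.csv (dialect set by translated_dialect)
store_sft_dataset("openai/gsm8k/", df, "train")

# Or translate into another supported dialect directly, e.g. Dogri ("doi")
translate_in_dialects("openai/gsm8k/", df, "train", dialect_name="doi")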
SFT/source/openai_gsm8k_train.csv
ADDED
The diff for this file is too large to render; see the raw diff.