Corey Morris committed on
Commit
c32735e
·
1 Parent(s): 83a34f0

WIP commit. The file-finding logic can be identical to the method in results_data_processor.

Browse files
details_data_processor.py CHANGED
@@ -10,12 +10,14 @@ class DetailsDataProcessor:
10
  # Download
11
  #url example https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
12
 
13
- def __init__(self, directory='results', pattern='results*.json'):
14
  self.directory = directory
15
  self.pattern = pattern
16
  # self.data = self.process_data()
17
  # self.ranked_data = self.rank_data()
18
 
 
 
19
  # download a file from a single url and save it to a local directory
20
  @staticmethod
21
  def download_file(url, filename):
@@ -49,7 +51,35 @@ class DetailsDataProcessor:
49
  constructed_url = base_url + organization + '/' + model + '/' + other_chunk + filename
50
  return constructed_url
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
 
 
 
 
 
 
 
 
 
 
53
  # @staticmethod
54
  # def _find_files(directory, pattern):
55
  # for root, dirs, files in os.walk(directory):
 
10
  # Download
11
  #url example https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
12
 
13
+ def __init__(self, directory='results', pattern='moral*.json'):
14
  self.directory = directory
15
  self.pattern = pattern
16
  # self.data = self.process_data()
17
  # self.ranked_data = self.rank_data()
18
 
19
+
20
+
21
  # download a file from a single url and save it to a local directory
22
  @staticmethod
23
  def download_file(url, filename):
 
51
  constructed_url = base_url + organization + '/' + model + '/' + other_chunk + filename
52
  return constructed_url
53
 
54
+ # @staticmethod
55
+ # def _find_files(directory, pattern):
56
+ # for root, dirs, files in os.walk(directory):
57
+ # for basename in files:
58
+ # if fnmatch.fnmatch(basename, pattern):
59
+ # filename = os.path.join(root, basename)
60
+ # yield filename
61
+
62
+
63
+
64
+ def _find_files(self, directory, pattern):
65
+ matching_files = [] # List to hold matching filenames
66
+ for root, dirs, files in os.walk(directory):
67
+ for basename in files:
68
+ if fnmatch.fnmatch(basename, pattern):
69
+ filename = os.path.join(root, basename)
70
+ matching_files.append(filename) # Append the matching filename to the list
71
+ return matching_files # Return the list of matching filenames
72
 
73
+
74
+ def pipeline(self):
75
+ dataframes = []
76
+ for file_path in self._find_files(self.directory, self.pattern):
77
+ print(file_path)
78
+ url = self.generate_url(file_path)
79
+ file_path = file_path.split('/')[-1]
80
+ df = self.single_file_pipeline(url, file_path)
81
+ dataframes.append(df)
82
+ return dataframes
83
  # @staticmethod
84
  # def _find_files(directory, pattern):
85
  # for root, dirs, files in os.walk(directory):
test_details_data_processing.py CHANGED
@@ -28,6 +28,20 @@ class TestDetailsDataProcessor(unittest.TestCase):
28
  constructed_url = self.processor.generate_url(results_file_path)
29
  self.assertEqual(expected_url, constructed_url)
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  if __name__ == '__main__':
33
  unittest.main()
 
28
  constructed_url = self.processor.generate_url(results_file_path)
29
  self.assertEqual(expected_url, constructed_url)
30
 
31
+ def test_pipeline(self):
32
+ df = self.processor.pipeline()
33
+ print(100 * "****")
34
+ print(df)
35
+ self.assertIsInstance(df, pd.DataFrame)
36
+
37
+ def test_find_files(self):
38
+ directory = 'results'
39
+ pattern = '*moral*.json'
40
+ files = self.processor._find_files(directory, pattern)
41
+ breakpoint()
42
+ print(files)
43
+ self.assertIsInstance(files, list)
44
+
45
 
46
  if __name__ == '__main__':
47
  unittest.main()