gardarjuto committed on
Commit
4e425e4
·
1 Parent(s): 6b89838

Fix wrong GED. Load all datasets at startup

Browse files
Files changed (1) hide show
  1. quiz.py +13 -7
quiz.py CHANGED
@@ -42,6 +42,15 @@ BENCHMARKS = {
42
  },
43
  }
44
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Dataset specific preprocessing and standardization
47
  def winogrande_preprocessing(sample):
@@ -65,7 +74,7 @@ def icelandic_sentence_gec_preprocessing(sample):
65
  f"Inniheldur eftirfarandi málsgrein villu?<p style='margin-left: 25px;'><i>{sample['sentence']}</i></p>"
66
  )
67
  new_sample["options"] = "Villa", "Engin villa"
68
- new_sample["answer"] = "Engin villa" if sample["correct"] else "Villa"
69
  new_sample["instruction"] = "Valkostir"
70
  return new_sample
71
 
@@ -161,12 +170,9 @@ class BenchmarkQuiz:
161
  return self.state
162
 
163
  def load_benchmark(self, benchmark_name: str) -> List[Dict[str, Any]]:
164
- dataset = load_dataset(
165
- BENCHMARKS[benchmark_name]["path"],
166
- name=BENCHMARKS[benchmark_name].get("config_name"),
167
- split=BENCHMARKS[benchmark_name].get("split", "train"),
168
- )
169
- samples = random.sample(list(dataset), 5)
170
  if benchmark_name == "icelandic-winogrande":
171
  samples = [winogrande_preprocessing(sample) for sample in samples]
172
  elif benchmark_name == "grammatical-error-detection":
 
42
  },
43
  }
44
 
45
+ DATASETS = {
46
+ dataset_name: load_dataset(
47
+ BENCHMARKS[dataset_name]["path"],
48
+ name=BENCHMARKS[dataset_name].get("config_name"),
49
+ split=BENCHMARKS[dataset_name].get("split", "train"),
50
+ )
51
+ for dataset_name in BENCHMARKS
52
+ }
53
+
54
 
55
  # Dataset specific preprocessing and standardization
56
  def winogrande_preprocessing(sample):
 
74
  f"Inniheldur eftirfarandi málsgrein villu?<p style='margin-left: 25px;'><i>{sample['sentence']}</i></p>"
75
  )
76
  new_sample["options"] = "Villa", "Engin villa"
77
+ new_sample["answer"] = "Engin villa" if sample["correct"] == "false" else "Villa"
78
  new_sample["instruction"] = "Valkostir"
79
  return new_sample
80
 
 
170
  return self.state
171
 
172
  def load_benchmark(self, benchmark_name: str) -> List[Dict[str, Any]]:
173
+ dataset = DATASETS[benchmark_name]
174
+ random_indices = random.sample(range(len(dataset)), 5)
175
+ samples = dataset.select(random_indices)
 
 
 
176
  if benchmark_name == "icelandic-winogrande":
177
  samples = [winogrande_preprocessing(sample) for sample in samples]
178
  elif benchmark_name == "grammatical-error-detection":