Elron committed on
Commit
3e28aad
·
1 Parent(s): 6a1bcc1

Upload dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dataset.py +116 -0
dataset.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###############
2
+ # `ls -1 src/unitxt | grep '\.py$' | grep -Ev 'dataset\.py|__init__\.py' | sort`:
3
+ # artifact.py
4
+ # blocks.py
5
+ # card.py
6
+ # catalog.py
7
+ # collections.py
8
+ # common.py
9
+ # file_utils.py
10
+ # fusion.py
11
+ # generator_utils.py
12
+ # instructions.py
13
+ # loaders.py
14
+ # load.py
15
+ # metric.py
16
+ # metrics.py
17
+ # normalizers.py
18
+ # operator.py
19
+ # operators.py
20
+ # processors.py
21
+ # recipe.py
22
+ # register.py
23
+ # splitters.py
24
+ # split_utils.py
25
+ # stream.py
26
+ # task.py
27
+ # templates.py
28
+ # text_utils.py
29
+ # utilize.py
30
+ # validate.py
31
+ #####
32
+ # imports for hf system:
33
+ #####
34
+ from .artifact import __file__ as _
35
+ from .blocks import __file__ as _
36
+ from .card import __file__ as _
37
+ from .catalog import __file__ as _
38
+ from .collections import __file__ as _
39
+ from .common import __file__ as _
40
+ from .file_utils import __file__ as _
41
+
42
+ # from .fusion import __file__
43
+ from .generator_utils import __file__ as _
44
+ from .instructions import __file__ as _
45
+ from .loaders import __file__ as _
46
+ from .load import __file__ as _
47
+ from .metric import __file__ as _
48
+ from .metrics import __file__ as _
49
+ from .normalizers import __file__ as _
50
+ from .operator import __file__ as _
51
+ from .operators import __file__ as _
52
+ from .processors import __file__ as _
53
+ from .recipe import __file__ as _
54
+ from .register import __file__ as _
55
+ from .schema import __file__ as _
56
+ from .splitters import __file__ as _
57
+ from .split_utils import __file__ as _
58
+ from .stream import __file__ as _
59
+ from .task import __file__ as _
60
+ from .templates import __file__ as _
61
+ from .text_utils import __file__ as _
62
+
63
+ # from .utilize import __file__ as _
64
+ # from .validate import __file__ as _
65
+ #############
66
+
67
+ from .register import register_blocks
68
+ from .artifact import Artifact
69
+
70
+ import datasets
71
+
72
+
73
def parse(query: str):
    """Parse a query of the form 'key1=value1,key2=value2,...' into a dictionary.

    Each value is converted as follows:

    * only digits (``"12"``) -> ``int``
    * digits with a single ``.`` (``"1.5"``) -> ``float``
    * anything else (including signed numbers such as ``"-3"``) -> ``str``

    Args:
        query: Comma-separated ``key=value`` pairs. Only the first ``=`` of a
            pair separates key from value, so a value may itself contain ``=``.

    Returns:
        A dict mapping every key to its converted value. When a key appears
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a pair does not contain an ``=`` separator.
    """
    result = {}
    for kv in query.split(","):
        key, separator, value = kv.partition("=")
        if not separator:
            raise ValueError(
                f"Malformed query segment {kv!r}: expected 'key=value' in {query!r}"
            )
        if value.isdigit():
            result[key] = int(value)
        elif value.replace(".", "", 1).isdigit():
            result[key] = float(value)
        else:
            # Keep the raw string only when no numeric conversion applied;
            # storing it unconditionally would discard the int/float above.
            result[key] = value

    return result
88
+
89
+
90
class Unitext(datasets.GeneratorBasedBuilder):
    """Dataset builder whose splits come from a recipe described by the config name.

    The config name is parsed as a 'key=value,...' query, turned into an
    artifact with ``Artifact.from_dict`` and then called. The call's result is
    used as a mapping from split name to an iterable of rows.
    """

    VERSION = datasets.Version("1.1.1")
    builder_configs = {}

    @property
    def generators(self):
        """Return the per-split row sources, building and caching them on first use."""
        # Runs on every access, before the cache is consulted.
        register_blocks()
        if getattr(self, "_generators", None) is None:
            recipe_args = parse(self.config.name)
            # Fall back to the default artifact type when the query names none.
            recipe_args.setdefault("type", "common_recipe")
            build_streams = Artifact.from_dict(recipe_args)
            self._generators = build_streams()
        return self._generators

    def _info(self):
        """Return an empty ``datasets.DatasetInfo``."""
        return datasets.DatasetInfo()

    def _split_generators(self, _):
        """Create one ``SplitGenerator`` per key of ``self.generators``."""
        split_generators = []
        for name in self.generators.keys():
            split_generators.append(
                datasets.SplitGenerator(name=name, gen_kwargs={"split_name": name})
            )
        return split_generators

    def _generate_examples(self, split_name):
        """Yield ``(index, row)`` pairs for the rows of the requested split."""
        yield from enumerate(self.generators[split_name])