Elron committed on
Commit
4577f71
·
verified ·
1 Parent(s): 228b86b

Upload struct_data_operators.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. struct_data_operators.py +192 -5
struct_data_operators.py CHANGED
@@ -1,14 +1,20 @@
1
- """This section describes unitxt operators for tabular data.
2
 
3
- These operators are specialized in handling tabular data.
4
- Input table format is assumed as:
5
  {
6
  "header": ["col1", "col2"],
7
  "rows": [["row11", "row12"], ["row21", "row22"], ["row31", "row32"]]
8
  }
9
 
 
 
 
 
 
10
  ------------------------
11
  """
 
12
  import random
13
  from abc import ABC, abstractmethod
14
  from copy import deepcopy
@@ -19,6 +25,8 @@ from typing import (
19
  Optional,
20
  )
21
 
 
 
22
  from .dict_utils import dict_get
23
  from .operators import FieldOperator, StreamInstanceOperator
24
 
@@ -35,12 +43,10 @@ class SerializeTable(ABC, FieldOperator):
35
  pass
36
 
37
  # method to process table header
38
- @abstractmethod
39
  def process_header(self, header: List):
40
  pass
41
 
42
  # method to process a table row
43
- @abstractmethod
44
  def process_row(self, row: List, row_index: int):
45
  pass
46
 
@@ -140,6 +146,80 @@ class SerializeTableAsMarkdown(SerializeTable):
140
  return row_str
141
 
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  # truncate cell value to maximum allowed length
144
  def truncate_cell(cell_value, max_len):
145
  if cell_value is None:
@@ -362,3 +442,110 @@ class ListToKeyValPairs(StreamInstanceOperator):
362
  instance[self.to_field] = output_dict
363
 
364
  return instance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This section describes unitxt operators for structured data.
2
 
3
+ These operators are specialized in handling structured data like tables.
4
+ For tables, expected input format is:
5
  {
6
  "header": ["col1", "col2"],
7
  "rows": [["row11", "row12"], ["row21", "row22"], ["row31", "row32"]]
8
  }
9
 
10
+ For triples, expected input format is:
11
+ [[ "subject1", "relation1", "object1" ], [ "subject1", "relation2", "object2"]]
12
+
13
+ For key-value pairs, expected input format is:
14
+ {"key1": "value1", "key2": "value2", "key3": "value3"}
15
  ------------------------
16
  """
17
+ import json
18
  import random
19
  from abc import ABC, abstractmethod
20
  from copy import deepcopy
 
25
  Optional,
26
  )
27
 
28
+ import pandas as pd
29
+
30
  from .dict_utils import dict_get
31
  from .operators import FieldOperator, StreamInstanceOperator
32
 
 
43
  pass
44
 
45
  # method to process table header
 
46
  def process_header(self, header: List):
47
  pass
48
 
49
  # method to process a table row
 
50
  def process_row(self, row: List, row_index: int):
51
  pass
52
 
 
146
  return row_str
147
 
148
 
149
class SerializeTableAsDFLoader(SerializeTable):
    """DFLoader Table Serializer.

    Serializes a table as a pandas ``pd.DataFrame(...)`` constructor snippet.
    Format(Sample):
    pd.DataFrame({
    "name": ["Alex", "Diana", "Donald"], "age": [26, 34, 39]},
    index=[0, 1, 2])
    """

    def process_value(self, table: Any) -> Any:
        """Serialize a deep copy of ``table`` and return the resulting string."""
        table_input = deepcopy(table)
        return self.serialize_table(table_content=table_input)

    def serialize_table(self, table_content: Dict) -> str:
        """Render ``table_content`` as a ``pd.DataFrame({...}, index=[...])`` snippet.

        Args:
            table_content: Table in the prescribed input format, i.e. a dict
                with a non-empty "header" list and a non-empty "rows" list.

        Returns:
            The code-snippet string describing the table.

        Raises:
            AssertionError: If "header" or "rows" is missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        # Raised explicitly (instead of via `assert`) so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if not (header and rows):
            raise AssertionError("Incorrect input table format")

        # Let pandas pivot the row-major input into a column -> values mapping.
        df = pd.DataFrame(rows, columns=header)
        data_dict = df.to_dict(orient="list")

        # json.dumps wraps the mapping in its own "{...}". The surrounding
        # template already supplies the braces, so strip the outer pair to
        # avoid emitting "pd.DataFrame({{...}}, ...)".
        # NOTE: values are JSON-encoded (e.g. None is rendered as null).
        columns_str = json.dumps(data_dict)[1:-1]
        index_str = str(list(range(len(rows))))

        return "pd.DataFrame({\n" + columns_str + "},\nindex=" + index_str + ")"
187
+
188
+
189
class SerializeTableAsJson(SerializeTable):
    """JSON Table Serializer.

    Json format based serializer.
    Format(Sample):
    {
        "0":{"name":"Alex","age":26},
        "1":{"name":"Diana","age":34},
        "2":{"name":"Donald","age":39}
    }
    """

    def process_value(self, table: Any) -> Any:
        """Serialize a deep copy of ``table`` and return the resulting string."""
        table_input = deepcopy(table)
        return self.serialize_table(table_content=table_input)

    def serialize_table(self, table_content: Dict) -> str:
        """Render ``table_content`` as a JSON object keyed by row index.

        Args:
            table_content: Table in the prescribed input format, i.e. a dict
                with a non-empty "header" list and a non-empty "rows" list.

        Returns:
            A JSON string mapping each row index to a
            ``{column name: cell value}`` object.

        Raises:
            AssertionError: If "header" or "rows" is missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        # Raised explicitly (instead of via `assert`) so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if not (header and rows):
            raise AssertionError("Incorrect input table format")

        # Cells are paired with column names by position. Integer row indices
        # become string keys once json.dumps encodes the mapping.
        output_dict = {
            row_index: {header[col_index]: value for col_index, value in enumerate(row)}
            for row_index, row in enumerate(rows)
        }

        return json.dumps(output_dict)
221
+
222
+
223
  # truncate cell value to maximum allowed length
224
  def truncate_cell(cell_value, max_len):
225
  if cell_value is None:
 
442
  instance[self.to_field] = output_dict
443
 
444
  return instance
445
+
446
+
447
class ConvertTableColNamesToSequential(FieldOperator):
    """Replaces actual table column names with static sequential names like col_0, col_1,...

    Sample input:
    {
        "header": ["name", "age"],
        "rows": [["Alex", 21], ["Donald", 34]]
    }
    Sample output:
    {
        "header": ["col_0", "col_1"],
        "rows": [["Alex", 21], ["Donald", 34]]
    }
    """

    def process_value(self, table: Any) -> Any:
        """Return a deep copy of ``table`` whose header is replaced."""
        table_input = deepcopy(table)
        return self.replace_header(table_content=table_input)

    def replace_header(self, table_content: Dict) -> Dict:
        """Overwrite the "header" entry of ``table_content`` with col_0, col_1, ...

        Args:
            table_content: Table dict with a non-empty "header" list. It is
                modified in place.

        Returns:
            The same ``table_content`` dict, with one generated name per
            original column.

        Raises:
            AssertionError: If "header" is missing or empty.
        """
        header = table_content.get("header", [])

        # Raised explicitly (instead of via `assert`) so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if not header:
            raise AssertionError("Input table missing header")

        # Only the number of columns matters; the original names are dropped.
        table_content["header"] = [f"col_{i}" for i in range(len(header))]

        return table_content
477
+
478
+
479
class ShuffleTableRows(FieldOperator):
    """Shuffles the input table rows randomly.

    Sample Input:
    {
        "header": ["name", "age"],
        "rows": [["Alex", 26], ["Raj", 34], ["Donald", 39]],
    }

    Sample Output:
    {
        "header": ["name", "age"],
        "rows": [["Donald", 39], ["Raj", 34], ["Alex", 26]],
    }
    """

    def process_value(self, table: Any) -> Any:
        """Return a deep copy of ``table`` with its rows shuffled."""
        table_input = deepcopy(table)
        return self.shuffle_rows(table_content=table_input)

    def shuffle_rows(self, table_content: Dict) -> Dict:
        """Shuffle the "rows" list of ``table_content`` in place.

        Uses the module-level ``random`` generator, so results are
        reproducible only if the caller seeds it.

        Args:
            table_content: Table dict with non-empty "header" and "rows".

        Returns:
            The same ``table_content`` dict with its rows reordered.

        Raises:
            AssertionError: If "header" or "rows" is missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        # Raised explicitly (instead of via `assert`) so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if not (header and rows):
            raise AssertionError("Incorrect input table format")

        random.shuffle(rows)
        table_content["rows"] = rows

        return table_content
511
+
512
+
513
class ShuffleTableColumns(FieldOperator):
    """Shuffles the table columns randomly.

    Sample Input:
    {
        "header": ["name", "age"],
        "rows": [["Alex", 26], ["Raj", 34], ["Donald", 39]],
    }

    Sample Output:
    {
        "header": ["age", "name"],
        "rows": [[26, "Alex"], [34, "Raj"], [39, "Donald"]],
    }
    """

    def process_value(self, table: Any) -> Any:
        """Return a deep copy of ``table`` with its columns shuffled."""
        table_input = deepcopy(table)
        return self.shuffle_columns(table_content=table_input)

    def shuffle_columns(self, table_content: Dict) -> Dict:
        """Apply one random column permutation to the header and every row.

        Uses the module-level ``random`` generator, so results are
        reproducible only if the caller seeds it.

        Args:
            table_content: Table dict with non-empty "header" and "rows".
                Every row is indexed by each header position.

        Returns:
            The same ``table_content`` dict with "header" and "rows" replaced
            by their permuted versions.

        Raises:
            AssertionError: If "header" or "rows" is missing or empty.
        """
        header = table_content.get("header", [])
        rows = table_content.get("rows", [])

        # Raised explicitly (instead of via `assert`) so the check still runs
        # under `python -O`; the exception type is kept for existing callers.
        if not (header and rows):
            raise AssertionError("Incorrect input table format")

        # Draw a single permutation of column positions and reuse it for the
        # header and all rows so that cells stay aligned with their column.
        indices = list(range(len(header)))
        random.shuffle(indices)

        table_content["header"] = [header[i] for i in indices]
        table_content["rows"] = [[row[i] for i in indices] for row in rows]

        return table_content