Upload folder using huggingface_hub
Browse files- README.md +5 -5
- api.py +2 -2
- metrics.py +14 -0
- struct_data_operators.py +21 -0
- templates.py +6 -3
- text_utils.py +1 -1
- version.py +1 -1
README.md
CHANGED
@@ -40,11 +40,11 @@ https://github.com/IBM/unitxt/assets/23455264/baef9131-39d4-4164-90b2-05da52919f
|
|
40 |
|
41 |
### 🦄 Currently on Unitxt Catalog
|
42 |
|
43 |
-
![
|
44 |
-
![Dataset Cards](https://img.shields.io/badge/Dataset_Cards-
|
45 |
-
![Templates](https://img.shields.io/badge/Templates-
|
46 |
-
![
|
47 |
-
![Metrics](https://img.shields.io/badge/Metrics-
|
48 |
|
49 |
### 🦄 Run Unitxt Exploration Dashboard
|
50 |
|
|
|
40 |
|
41 |
### 🦄 Currently on Unitxt Catalog
|
42 |
|
43 |
+
![Abstract Tasks](https://img.shields.io/badge/Abstract_Tasks-62-blue)
|
44 |
+
![Dataset Cards](https://img.shields.io/badge/Dataset_Cards-3025-blue)
|
45 |
+
![Templates](https://img.shields.io/badge/Templates-342-blue)
|
46 |
+
![Benchmarks](https://img.shields.io/badge/Benchmarks-4-blue)
|
47 |
+
![Metrics](https://img.shields.io/badge/Metrics-422-blue)
|
48 |
|
49 |
### 🦄 Run Unitxt Exploration Dashboard
|
50 |
|
api.py
CHANGED
@@ -145,8 +145,8 @@ def _source_to_dataset(
|
|
145 |
cache_dir = dir_to_be_deleted if not use_cache else None
|
146 |
ds_builder = UnitxtDataset(
|
147 |
dataset_name="unitxt",
|
148 |
-
config_name="recipe-" + short_hex_hash(source
|
149 |
-
hash=hash(source
|
150 |
version=constants.version,
|
151 |
cache_dir=cache_dir,
|
152 |
)
|
|
|
145 |
cache_dir = dir_to_be_deleted if not use_cache else None
|
146 |
ds_builder = UnitxtDataset(
|
147 |
dataset_name="unitxt",
|
148 |
+
config_name="recipe-" + short_hex_hash(repr(source)),
|
149 |
+
hash=hash(repr(source)),
|
150 |
version=constants.version,
|
151 |
cache_dir=cache_dir,
|
152 |
)
|
metrics.py
CHANGED
@@ -3355,6 +3355,8 @@ class CustomF1(GlobalMetric):
|
|
3355 |
|
3356 |
|
3357 |
class NER(CustomF1):
|
|
|
|
|
3358 |
prediction_type = List[Tuple[str, str]]
|
3359 |
|
3360 |
def get_element_group(self, element, additional_input):
|
@@ -3364,6 +3366,18 @@ class NER(CustomF1):
|
|
3364 |
return str(element)
|
3365 |
|
3366 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3367 |
def normalize_answer(s):
|
3368 |
"""Lower text and remove punctuation, articles and extra whitespace."""
|
3369 |
|
|
|
3355 |
|
3356 |
|
3357 |
class NER(CustomF1):
|
3358 |
+
"""F1 Metrics that receives as input a list of (Entity,EntityType) pairs."""
|
3359 |
+
|
3360 |
prediction_type = List[Tuple[str, str]]
|
3361 |
|
3362 |
def get_element_group(self, element, additional_input):
|
|
|
3366 |
return str(element)
|
3367 |
|
3368 |
|
3369 |
+
class KeyValueExtraction(CustomF1):
|
3370 |
+
"""F1 Metrics that receives as input a list of (Key,Value) pairs."""
|
3371 |
+
|
3372 |
+
prediction_type = List[Tuple[str, str]]
|
3373 |
+
|
3374 |
+
def get_element_group(self, element, additional_input):
|
3375 |
+
return element[0]
|
3376 |
+
|
3377 |
+
def get_element_representation(self, element, additional_input):
|
3378 |
+
return str(element)
|
3379 |
+
|
3380 |
+
|
3381 |
def normalize_answer(s):
|
3382 |
"""Lower text and remove punctuation, articles and extra whitespace."""
|
3383 |
|
struct_data_operators.py
CHANGED
@@ -23,6 +23,7 @@ For key-value pairs, expected input format is:
|
|
23 |
{"key1": "value1", "key2": value2, "key3": "value3"}
|
24 |
"""
|
25 |
|
|
|
26 |
import json
|
27 |
import random
|
28 |
from abc import ABC, abstractmethod
|
@@ -31,12 +32,14 @@ from typing import (
|
|
31 |
Dict,
|
32 |
List,
|
33 |
Optional,
|
|
|
34 |
)
|
35 |
|
36 |
import pandas as pd
|
37 |
|
38 |
from .augmentors import TypeDependentAugmentor
|
39 |
from .dict_utils import dict_get
|
|
|
40 |
from .operators import FieldOperator, InstanceOperator
|
41 |
from .random_utils import new_random_generator
|
42 |
from .serializers import ImageSerializer, TableSerializer
|
@@ -1019,3 +1022,21 @@ class ShuffleColumnsNames(TypeDependentAugmentor):
|
|
1019 |
random.shuffle(shuffled_header)
|
1020 |
|
1021 |
return {"header": shuffled_header, "rows": table["rows"]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
{"key1": "value1", "key2": value2, "key3": "value3"}
|
24 |
"""
|
25 |
|
26 |
+
import ast
|
27 |
import json
|
28 |
import random
|
29 |
from abc import ABC, abstractmethod
|
|
|
32 |
Dict,
|
33 |
List,
|
34 |
Optional,
|
35 |
+
Tuple,
|
36 |
)
|
37 |
|
38 |
import pandas as pd
|
39 |
|
40 |
from .augmentors import TypeDependentAugmentor
|
41 |
from .dict_utils import dict_get
|
42 |
+
from .error_utils import UnitxtWarning
|
43 |
from .operators import FieldOperator, InstanceOperator
|
44 |
from .random_utils import new_random_generator
|
45 |
from .serializers import ImageSerializer, TableSerializer
|
|
|
1022 |
random.shuffle(shuffled_header)
|
1023 |
|
1024 |
return {"header": shuffled_header, "rows": table["rows"]}
|
1025 |
+
|
1026 |
+
|
1027 |
+
class JsonStrToListOfKeyValuePairs(FieldOperator):
|
1028 |
+
def process_value(self, text: str) -> List[Tuple[str, str]]:
|
1029 |
+
text = text.replace("null", "None")
|
1030 |
+
|
1031 |
+
try:
|
1032 |
+
dict_value = ast.literal_eval(text)
|
1033 |
+
except Exception as e:
|
1034 |
+
UnitxtWarning(
|
1035 |
+
f"Unable to convert input text to json format in JsonStrToListOfKeyValuePairs due to {e}. Text: {text}"
|
1036 |
+
)
|
1037 |
+
dict_value = {}
|
1038 |
+
return [
|
1039 |
+
(str(key), str(value))
|
1040 |
+
for key, value in dict_value.items()
|
1041 |
+
if value is not None
|
1042 |
+
]
|
templates.py
CHANGED
@@ -533,7 +533,8 @@ class MultipleChoiceTemplate(InputFormatTemplate):
|
|
533 |
input and reference dictionaries.
|
534 |
target_field (str): The key under which the correct choice is stored in the
|
535 |
reference dictionary (can be integer index or textual label).
|
536 |
-
choices_separator (str): A string used to join formatted
|
|
|
537 |
source_choice_format (str): A Python format string used for displaying each choice
|
538 |
in the input fields (e.g. "{choice_numeral}. {choice_text}").
|
539 |
target_choice_format (str): A Python format string used for displaying each choice
|
@@ -544,8 +545,10 @@ class MultipleChoiceTemplate(InputFormatTemplate):
|
|
544 |
set with `shuffle_choices_seed`.
|
545 |
shuffle_choices_seed (int, optional): If provided, the choices are shuffled with
|
546 |
this fixed integer seed for reproducibility.
|
547 |
-
sort_choices_by_length (bool): If True, sorts choices
|
548 |
-
|
|
|
|
|
549 |
reverse_choices (bool): If True, reverses the order of the choices after any
|
550 |
sorting has been applied. Defaults to False to preserve backward compatibility.
|
551 |
"""
|
|
|
533 |
input and reference dictionaries.
|
534 |
target_field (str): The key under which the correct choice is stored in the
|
535 |
reference dictionary (can be integer index or textual label).
|
536 |
+
choices_separator (str): A string used to join formatted
|
537 |
+
choices (e.g. ", ").
|
538 |
source_choice_format (str): A Python format string used for displaying each choice
|
539 |
in the input fields (e.g. "{choice_numeral}. {choice_text}").
|
540 |
target_choice_format (str): A Python format string used for displaying each choice
|
|
|
545 |
set with `shuffle_choices_seed`.
|
546 |
shuffle_choices_seed (int, optional): If provided, the choices are shuffled with
|
547 |
this fixed integer seed for reproducibility.
|
548 |
+
sort_choices_by_length (bool): If True, sorts choices
|
549 |
+
by their length (ascending).
|
550 |
+
sort_choices_alphabetically (bool): If True, sorts choices
|
551 |
+
in alphabetical order.
|
552 |
reverse_choices (bool): If True, reverses the order of the choices after any
|
553 |
sorting has been applied. Defaults to False to preserve backward compatibility.
|
554 |
"""
|
text_utils.py
CHANGED
@@ -232,7 +232,7 @@ def construct_dict_as_yaml_lines(d, indent_delta=2) -> List[str]:
|
|
232 |
|
233 |
# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
|
234 |
d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
|
235 |
-
if "\\n" in d1:
|
236 |
d1 = f'"{d1}"'
|
237 |
return [d1]
|
238 |
|
|
|
232 |
|
233 |
# d1 = re.sub(r"(\n+)", r'"\1"', str(d))
|
234 |
d1 = str(d).replace("\n", "\\n").replace('"', '\\"')
|
235 |
+
if "\\n" in d1 or d1 == "":
|
236 |
d1 = f'"{d1}"'
|
237 |
return [d1]
|
238 |
|
version.py
CHANGED
@@ -1 +1 @@
|
|
1 |
-
version = "1.
|
|
|
1 |
+
version = "1.18.0"
|