Spaces:
Sleeping
Sleeping
# Copyright 2021 The HuggingFace Team. All rights reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import json | |
from dataclasses import asdict | |
from os.path import exists | |
import pandas as pd | |
from datasets import Dataset, get_dataset_infos, load_dataset, load_from_disk | |
# treating inf values as NaN as well | |
pd.set_option("use_inf_as_na", True) | |
## String names used in Hugging Face dataset configs. | |
HF_FEATURE_FIELD = "features" | |
HF_LABEL_FIELD = "label" | |
HF_DESC_FIELD = "description" | |
CACHE_DIR = "cache_dir" | |
## String names we are using within this code. | |
# These are not coming from the stored dataset nor HF config, | |
# but rather used as identifiers in our dicts and dataframes. | |
OUR_TEXT_FIELD = "text" | |
OUR_LABEL_FIELD = "label" | |
TOKENIZED_FIELD = "tokenized_text" | |
EMBEDDING_FIELD = "embedding" | |
LENGTH_FIELD = "length" | |
VOCAB = "vocab" | |
WORD = "word" | |
CNT = "count" | |
PROP = "proportion" | |
TEXT_NAN_CNT = "text_nan_count" | |
TXT_LEN = "text lengths" | |
DEDUP_TOT = "dedup_total" | |
TOT_WORDS = "total words" | |
TOT_OPEN_WORDS = "total open words" | |
_DATASET_LIST = [ | |
"c4", | |
"squad", | |
"squad_v2", | |
"hate_speech18", | |
"hate_speech_offensive", | |
"glue", | |
"super_glue", | |
"wikitext", | |
"imdb", | |
"HuggingFaceM4/OBELICS", | |
] | |
_STREAMABLE_DATASET_LIST = [ | |
"c4", | |
"wikitext", | |
"HuggingFaceM4/OBELICS", | |
] | |
_MAX_ROWS = 100 | |
def load_truncated_dataset( | |
dataset_name, | |
config_name, | |
split_name, | |
num_rows=_MAX_ROWS, | |
cache_name=None, | |
use_cache=True, | |
use_streaming=True, | |
): | |
""" | |
This function loads the first `num_rows` items of a dataset for a | |
given `config_name` and `split_name`. | |
If `cache_name` exists, the truncated dataset is loaded from `cache_name`. | |
Otherwise, a new truncated dataset is created and immediately saved | |
to `cache_name`. | |
When the dataset is streamable, we iterate through the first | |
`num_rows` examples in streaming mode, write them to a jsonl file, | |
then create a new dataset from the json. | |
This is the most direct way to make a Dataset from an IterableDataset | |
as of datasets version 1.6.1. | |
Otherwise, we download the full dataset and select the first | |
`num_rows` items | |
Args: | |
dataset_name (string): | |
dataset id in the dataset library | |
config_name (string): | |
dataset configuration | |
split_name (string): | |
split name | |
num_rows (int): | |
number of rows to truncate the dataset to | |
cache_name (string): | |
name of the cache directory | |
use_cache (bool): | |
whether to load form the cache if it exists | |
use_streaming (bool): | |
whether to use streaming when the dataset supports it | |
Returns: | |
Dataset: the truncated dataset as a Dataset object | |
""" | |
if cache_name is None: | |
cache_name = f"{dataset_name}_{config_name}_{split_name}_{num_rows}" | |
if exists(cache_name): | |
dataset = load_from_disk(cache_name) | |
else: | |
if use_streaming and dataset_name in _STREAMABLE_DATASET_LIST: | |
iterable_dataset = load_dataset( | |
dataset_name, | |
name=config_name, | |
split=split_name, | |
streaming=True, | |
).take(num_rows) | |
rows = list(iterable_dataset) | |
f = open("temp.jsonl", "w", encoding="utf-8") | |
for row in rows: | |
_ = f.write(json.dumps(row) + "\n") | |
f.close() | |
dataset = Dataset.from_json( | |
"temp.jsonl", features=iterable_dataset.features, split=split_name | |
) | |
else: | |
full_dataset = load_dataset( | |
dataset_name, | |
name=config_name, | |
split=split_name, | |
) | |
dataset = full_dataset.select(range(num_rows)) | |
dataset.save_to_disk(cache_name) | |
return dataset | |
def intersect_dfs(df_dict): | |
started = 0 | |
new_df = None | |
for key, df in df_dict.items(): | |
if df is None: | |
continue | |
for key2, df2 in df_dict.items(): | |
if df2 is None: | |
continue | |
if key == key2: | |
continue | |
if started: | |
new_df = new_df.join(df2, how="inner", lsuffix="1", rsuffix="2") | |
else: | |
new_df = df.join(df2, how="inner", lsuffix="1", rsuffix="2") | |
started = 1 | |
return new_df.copy() | |
def get_typed_features(features, ftype="string", parents=None): | |
""" | |
Recursively get a list of all features of a certain dtype | |
:param features: | |
:param ftype: | |
:param parents: | |
:return: a list of tuples > e.g. ('A', 'B', 'C') for feature example['A']['B']['C'] | |
""" | |
if parents is None: | |
parents = [] | |
typed_features = [] | |
for name, feat in features.items(): | |
if isinstance(feat, dict): | |
if feat.get("dtype", None) == ftype or feat.get("feature", {}).get( | |
("dtype", None) == ftype | |
): | |
typed_features += [tuple(parents + [name])] | |
elif "feature" in feat: | |
if feat["feature"].get("dtype", None) == ftype: | |
typed_features += [tuple(parents + [name])] | |
elif isinstance(feat["feature"], dict): | |
typed_features += get_typed_features( | |
feat["feature"], ftype, parents + [name] | |
) | |
else: | |
for k, v in feat.items(): | |
if isinstance(v, dict): | |
typed_features += get_typed_features( | |
v, ftype, parents + [name, k] | |
) | |
elif name == "dtype" and feat == ftype: | |
typed_features += [tuple(parents)] | |
return typed_features | |
def get_label_features(features, parents=None): | |
""" | |
Recursively get a list of all features that are ClassLabels | |
:param features: | |
:param parents: | |
:return: pairs of tuples as above and the list of class names | |
""" | |
if parents is None: | |
parents = [] | |
label_features = [] | |
for name, feat in features.items(): | |
if isinstance(feat, dict): | |
if "names" in feat: | |
label_features += [(tuple(parents + [name]), feat["names"])] | |
elif "feature" in feat: | |
if "names" in feat: | |
label_features += [ | |
(tuple(parents + [name]), feat["feature"]["names"]) | |
] | |
elif isinstance(feat["feature"], dict): | |
label_features += get_label_features( | |
feat["feature"], parents + [name] | |
) | |
else: | |
for k, v in feat.items(): | |
if isinstance(v, dict): | |
label_features += get_label_features(v, parents + [name, k]) | |
elif name == "names": | |
label_features += [(tuple(parents), feat)] | |
return label_features | |
# get the info we need for the app sidebar in dict format | |
def dictionarize_info(dset_info): | |
info_dict = asdict(dset_info) | |
res = { | |
"config_name": info_dict["config_name"], | |
"splits": { | |
spl: 100 #spl_info["num_examples"] | |
for spl, spl_info in info_dict["splits"].items() | |
}, | |
"features": { | |
"string": get_typed_features(info_dict["features"], "string"), | |
"int32": get_typed_features(info_dict["features"], "int32"), | |
"float32": get_typed_features(info_dict["features"], "float32"), | |
"label": get_label_features(info_dict["features"]), | |
}, | |
"description": dset_info.description, | |
} | |
return res | |
def get_dataset_info_dicts(dataset_id=None): | |
""" | |
Creates a dict from dataset configs. | |
Uses the datasets lib's get_dataset_infos | |
:return: Dictionary mapping dataset names to their configurations | |
""" | |
if dataset_id != None: | |
ds_name_to_conf_dict = { | |
dataset_id: { | |
config_name: dictionarize_info(config_info) | |
for config_name, config_info in get_dataset_infos(dataset_id).items() | |
} | |
} | |
else: | |
ds_name_to_conf_dict = { | |
ds_id: { | |
config_name: dictionarize_info(config_info) | |
for config_name, config_info in get_dataset_infos(ds_id).items() | |
} | |
for ds_id in _DATASET_LIST | |
} | |
return ds_name_to_conf_dict | |
# get all instances of a specific field in a dataset | |
def extract_field(examples, field_path, new_field_name=None): | |
if new_field_name is None: | |
new_field_name = "_".join(field_path) | |
field_list = [] | |
# TODO: Breaks the CLI if this isn't checked. | |
if isinstance(field_path, str): | |
field_path = [field_path] | |
item_list = examples[field_path[0]] | |
for field_name in field_path[1:]: | |
item_list = [ | |
next_item | |
for item in item_list | |
for next_item in ( | |
item[field_name] | |
if isinstance(item[field_name], list) | |
else [item[field_name]] | |
) | |
] | |
field_list += [ | |
field | |
for item in item_list | |
for field in (item if isinstance(item, list) else [item]) | |
] | |
return {new_field_name: field_list} | |