whisper-youtube-2-hf_dataset

Runtime error

Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset

7288748 over 1 year ago

No virus

1.32 kB

	# Adapted from Eduardo Matallanas
	from datasets import load_dataset, Dataset
	from datasets.data_files import EmptyDatasetError

	class HFDataset:
	    """
	    Create a dataset to save the transcripts from Youtube.

	    Attributes set by the constructor:
	        name: identifier passed to ``load_dataset`` and ``push_to_hub``.
	        dataset: whatever ``load_dataset(name)`` returned, or an empty
	            ``Dataset`` when nothing was loaded.
	        exist: False when ``name`` is empty or loading raised
	            ``FileNotFoundError``; True otherwise.
	        is_empty: False only when ``load_dataset`` succeeded.
	        list_of_ids: IDs derived from the ``URL`` column of every split;
	            an empty list whenever ``is_empty`` is True.
	    """

	    def __init__(self, name) -> None:
	        """Load the dataset called ``name``; an empty name skips loading."""
	        self.name = name
	        if name != "":
	            self._init_dataset()
	        else:
	            # Goes through the same helper as the error paths so that
	            # ``list_of_ids`` is always defined on the instance.
	            self._set_empty(exist=False)

	    def _set_empty(self, exist):
	        """Install an empty dataset and record the given ``exist`` flag."""
	        self.dataset = Dataset.from_dict({})
	        self.exist = exist
	        self.is_empty = True
	        self.list_of_ids = []

	    def _init_dataset(self):
	        """Try to load ``self.name`` and fill in the state attributes."""
	        try:
	            loaded = load_dataset(self.name)
	        # EmptyDatasetError stays first, as in the original ordering: if it
	        # is a FileNotFoundError subclass, swapping the handlers would hide it.
	        except EmptyDatasetError:
	            self._set_empty(exist=True)
	        except FileNotFoundError:
	            self._set_empty(exist=False)
	        else:
	            self.dataset = loaded
	            self.exist = True
	            self.is_empty = False
	            self.list_of_ids = self._get_list_of_id()

	    def upload(self):
	        """Push ``self.dataset`` to the hub under ``self.name``."""
	        self.dataset.push_to_hub(self.name)

	    def _get_list_of_id(self):
	        """
	        Return one flat list with the ID of every URL in every split.

	        The ID is the text after the last ``=`` in the URL. ``self.dataset``
	        is expected to map split names to objects exposing a ``"URL"``
	        column, which is what iterating it by split name already assumed.
	        """
	        # Reading the column directly avoids building a mapped copy of the
	        # dataset just to collect the IDs.
	        return [
	            url.split("=")[-1]
	            for split_ds in self.dataset.values()
	            for url in split_ds["URL"]
	        ]