whisper-youtube-2-hf_dataset

Runtime error

App Files Files Community

whisper-youtube-2-hf_dataset / transforming /whispertransform.py

juancopi81

Duplicate from Whispering-GPT/whisper-youtube-2-hf_dataset

7288748 over 1 year ago

raw

history blame

No virus

2.26 kB

	import os
	from pathlib import Path
	from typing import Any
	from collections import OrderedDict

	from pytube import YouTube
	import whisper

	from transforming.transform import Transform
	from video import YoutubeVideo
	from utils import accepts_types

	class WhisperTransform(Transform):
	"""
	Transform a Video object using Whisper model. It's a
	concrete Transform.
	Args:
	model (`str`):
	Size of Whisper model. Can be tiny, base (default), small, medium, and large.
	without_timestamps (`bool`, defaults to `False`):
	To add phrase-level timestamps.
	"""

	def __init__(self, model: str="base", without_timestamps: bool=False) -> None:
	self.model = whisper.load_model(model)
	self.without_timestamps = without_timestamps

	@accepts_types(YoutubeVideo)
	def apply(self, video: YoutubeVideo) -> YoutubeVideo:
	"""Creates a new video with transcriptions created by Whisper.
	"""
	# Create a YouTube object
	yt = YouTube(video.url)

	# Get audio from video
	try:
	audio_file = self._get_audio_from_video(yt)

	except Exception as e:
	print(f"Exception: {e}")

	result = self.model.transcribe(audio_file,
	without_timestamps=self.without_timestamps)
	transcription = result["text"]

	data = []
	for seg in result['segments']:
	data.append(OrderedDict({'start': seg['start'], 'end': seg['end'],'text': seg['text']}))

	os.remove(audio_file)

	return YoutubeVideo(channel_name = video.channel_name,
	url = video.url,
	title = video.title,
	description = video.description,
	transcription = transcription,
	segments = data)

	def _get_audio_from_video(self, yt: Any) -> Path:
	# TODO: Add credits
	video = yt.streams.filter(only_audio=True).first()
	out_file = video.download(output_path=".")
	base, _ = os.path.splitext(out_file)
	new_file = base + ".mp3"
	os.rename(out_file, new_file)
	return new_file