from typing import List, Generator, Tuple
from pathlib import Path
from itertools import islice
from urllib.parse import parse_qs, urlparse

import scrapetube
from youtubesearchpython import ChannelsSearch
from pytube import Playlist

from utils import accepts_types
from loading.serialization import Serializer


class YoutubeVideoPreprocessor:
    """Write one JSON descriptor file per video of a YouTube channel or playlist.

    Each JSON file holds the following information:
        - channel_name: The name passed to `preprocess` (channel mode) or the
          playlist title (playlist mode).
        - url: The watch URL of the video.

    Args:
        mode (`str`):
            "channel_name" to look a channel up by name, "playlist" to read a
            playlist URL. Any other value makes `preprocess` return fixed
            placeholder paths (not implemented yet).
        serializer:
            Object exposing `dump(obj=..., save_path=...)`. Defaults to the
            `Serializer` class itself (not an instance).

    TODO: Change it to accept also URL of video list, name of video list, etc.
    """

    def __init__(self, mode: str = "channel_name", serializer = Serializer) -> None:
        self.mode = mode
        self.serializer = serializer

    def preprocess(self, name: str, num_videos: int, videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Resolve the videos for `name` and write their JSON descriptor files.

        Args:
            name (`str`):
                Channel name in "channel_name" mode, playlist URL in
                "playlist" mode.
            num_videos (`int`):
                Maximum number of JSON files to write.
            videos_in_ds (`List[str]`):
                Video ids to skip; compared against each video's "videoId".

        Returns:
            `Tuple[List[Path], Path]`: The paths of the written JSON files and
            the folder that contains them.

        Raises:
            IndexError: In "channel_name" mode, when the search returns no
                channel for `name`.
        """
        if self.mode == "channel_name":
            # TODO: Add credits
            matches = ChannelsSearch(name, limit=1).result()['result']
            if not matches:
                # Same exception type the bare `[0]` lookup raised, but with a
                # message that tells the caller what actually went wrong.
                raise IndexError(f"No YouTube channel found for {name!r}")
            videos = scrapetube.get_channel(channel_id=matches[0]['id'])
            return self._convert_videos_to_json_files(name, videos, num_videos, videos_in_ds)

        if self.mode == "playlist":
            playlist_id = self._extract_playlist_id(name)
            # The playlist title replaces the URL as folder / channel name.
            playlist_title = Playlist(name).title
            videos = scrapetube.get_playlist(playlist_id)
            return self._convert_videos_to_json_files(playlist_title, videos, num_videos, videos_in_ds)

        # TODO: implement this part
        test_files_folder = self._youtube_folder()/"test/files"
        return [Path("test.json"), Path("test1.json")], test_files_folder

    @staticmethod
    def _youtube_folder() -> Path:
        """Return the base output folder under the current user's home."""
        return Path.home()/"whisper_gpt_pipeline/youtube_transcriber"

    @staticmethod
    def _extract_playlist_id(playlist_url: str) -> str:
        """Return the playlist id carried by `playlist_url`.

        Prefers the `list` query parameter so that URLs with extra parameters
        after it (e.g. `...?list=PL123&si=abc`) still yield the playlist id.
        Falls back to the text after the last "=" when no `list` parameter is
        present.
        """
        list_values = parse_qs(urlparse(playlist_url).query).get("list")
        if list_values:
            return list_values[0]
        return playlist_url.split("=")[-1]

    def _convert_videos_to_json_files(self, name:str, videos: Generator, num_videos: int, videos_in_ds: List[str]) -> Tuple[List[Path], Path]:
        """Write up to `num_videos` JSON descriptor files for unseen videos.

        Args:
            name (`str`):
                Stored as "channel_name" in every file and used as the name of
                the output folder.
            videos (`Generator`):
                Yields mappings with a "videoId" key. Any iterable works; it is
                consumed only as far as needed.
            num_videos (`int`):
                Maximum number of files to write.
            videos_in_ds (`List[str]`):
                Video ids to skip.

        Returns:
            `Tuple[List[Path], Path]`: The written file paths, in order, and
            the folder that contains them.
        """
        dataset_folder = self._youtube_folder()/name
        dataset_folder.mkdir(parents=True, exist_ok=True)

        load_paths: List[Path] = []
        if num_videos <= 0:
            # Nothing requested: do not pull anything from `videos`.
            return load_paths, dataset_folder

        # Build the lookup once; only for plain sequences so that any other
        # container keeps its own `in` semantics.
        if isinstance(videos_in_ds, (list, tuple)):
            known_ids = set(videos_in_ds)
        else:
            known_ids = videos_in_ds

        # NOTE(review): file indices restart at 0 on every call, so a second
        # run into the same folder reuses the names 0.json, 1.json, ... --
        # confirm that this is intended.
        for video in videos:
            video_id = video["videoId"]
            if video_id in known_ids:
                continue

            save_path = dataset_folder/f"{len(load_paths)}.json"
            save_path.touch(exist_ok=True)
            video_dict = {"channel_name": name, "url":f"https://www.youtube.com/watch?v={video_id}"}
            self.serializer.dump(obj=video_dict, save_path=save_path)
            load_paths.append(save_path)

            # Stop before requesting another item once the quota is reached.
            if len(load_paths) >= num_videos:
                break

        return load_paths, dataset_folder