File size: 1,134 Bytes
1801c3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44efe1c
1801c3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import subprocess
from pathlib import Path
from typing import List

from tqdm import tqdm

# Resolve *before* walking up the tree: when ``__file__`` is a relative path
# with a single component (e.g. the script is launched from its own directory
# on an interpreter that leaves ``__file__`` relative), the unresolved path has
# only one parent and ``parents[1]`` would raise IndexError.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Root folder for the id lists and the downloaded videos.
DATA_DIR = REPO_ROOT / "data"
# Destination folder for the downloaded ``<video_id>.mp4`` files.
VIDEO_DIR = DATA_DIR / "videos"
# Folder scanned for ``*.txt`` files that list video ids, one per line.
VIDEO_ID_FOLDER = DATA_DIR / "ids"


def get_id(url: str) -> str:
    """Return the value of the ``v`` query parameter of a watch URL.

    Args:
        url: A URL (or bare query string) containing ``v=<video id>``,
            e.g. ``https://www.youtube.com/watch?v=<id>&t=10``.

    Returns:
        The text following ``v=`` up to the next ``&`` or ``#``.

    Raises:
        ValueError: If ``url`` contains no ``v`` parameter.
    """
    # Anchor ``v=`` to the start of the string or to a ``?``/``&`` separator so
    # that parameters whose name merely ends in "v" (``rev=``, ``env=``) are not
    # mistaken for the video id; stop at ``#`` so a URL fragment is excluded.
    match = re.search(r"(?:^|[?&])v=([^&#]+)", url)
    if match is None:
        raise ValueError(f"No 'v' query parameter found in URL: {url!r}")
    return match.group(1)


def download_videos(video_ids: List[str], *, format_id: str = "135") -> None:
    """Download each video in ``video_ids`` to ``VIDEO_DIR/<video_id>.mp4``.

    Videos whose target file already exists are skipped. Each download is
    delegated to the ``yt-dlp`` command-line tool; a non-zero exit status is
    reported but does not abort the remaining downloads.

    Args:
        video_ids: YouTube video ids to download.
        format_id: Value passed to yt-dlp's ``-f`` option. The default
            ``"135"`` is the format this function has always requested.
    """
    VIDEO_DIR.mkdir(exist_ok=True, parents=True)
    # Ignore everything in the video folder so downloads never get committed.
    (VIDEO_DIR / ".gitignore").write_text("*")

    failed = []
    for video_id in tqdm(video_ids):
        video_url = f"https://www.youtube.com/watch?v={video_id}"
        video_path = VIDEO_DIR / f"{video_id}.mp4"
        if video_path.exists():
            # tqdm.write prints without breaking the active progress bar.
            tqdm.write(f"Skipping {video_path} because it already exists")
            continue
        result = subprocess.run(
            ["yt-dlp", "--quiet", "-f", format_id, "-o", str(video_path), video_url]
        )
        # Best-effort batch: remember the failure and carry on with the rest.
        if result.returncode != 0:
            failed.append(video_id)
            tqdm.write(
                f"yt-dlp exited with code {result.returncode} for {video_url}"
            )

    if failed:
        print(f"Failed to download {len(failed)} video(s): {', '.join(failed)}")


if __name__ == "__main__":
    # Script entry point: gather the unique video ids listed in every *.txt
    # file under VIDEO_ID_FOLDER and download them.
    print("Downloading videos...")
    ids = set()
    for id_file in VIDEO_ID_FOLDER.glob("*.txt"):
        # Strip each line individually so trailing spaces or tabs on a line do
        # not end up inside the id; blank lines are dropped.
        stripped_lines = (line.strip() for line in id_file.read_text().splitlines())
        ids.update(line for line in stripped_lines if line)
    # download_videos is annotated to take a list; sorting also makes the
    # download order reproducible between runs (set order is not).
    download_videos(sorted(ids))