sradc
added gitignores when downloading videos/video-ids, and added download_videos.py to run_pipeline.sh
04848c9
raw
history blame
1.13 kB
import re
import subprocess
from pathlib import Path
from typing import List
from tqdm import tqdm
REPO_ROOT = Path(__file__).parents[1].resolve()
DATA_DIR = REPO_ROOT / "data"
VIDEO_DIR = DATA_DIR / "videos"
VIDEO_ID_FOLDER = DATA_DIR / "ids"
def get_id(url: str) -> str:
return re.search(r"(?<=v=)[^&]+", url).group(0)
def download_videos(video_ids: List[str]) -> None:
VIDEO_DIR.mkdir(exist_ok=True, parents=True)
(VIDEO_DIR / "gitignore").write_text("*")
for video_id in tqdm(video_ids):
video_url = f"https://www.youtube.com/watch?v={video_id}"
video_path = VIDEO_DIR / f"{video_id}.mp4"
if video_path.exists():
print(f"Skipping {video_path} because it already exists")
continue
subprocess.run(
["yt-dlp", "--quiet", "-f", "135", "-o", str(video_path), video_url]
)
if __name__ == "__main__":
print("Downloading videos...")
ids = set()
for file in VIDEO_ID_FOLDER.glob("*.txt"):
ids.update(
[x for x in file.read_text().strip().splitlines(keepends=False) if x]
)
download_videos(ids)