{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "\n",
    "def collect_dataset_info(root_dir):\n",
    "    dataset_info = []\n",
    "\n",
    "    for mode in [\"train\", \"val\", \"test\"]:\n",
    "        mode_dir = os.path.join(root_dir, mode)\n",
    "        if not os.path.exists(mode_dir):\n",
    "            continue\n",
    "\n",
    "        for video_folder in os.listdir(mode_dir):\n",
    "            video_id = video_folder  # Folder name is used as video_id\n",
    "            video_folder_path = os.path.join(mode_dir, video_folder)\n",
    "\n",
    "            if os.path.isdir(video_folder_path):\n",
    "                video_path, audio_path, motion_path = None, None, None\n",
    "\n",
    "                for file_name in os.listdir(video_folder_path):\n",
    "                    file_path = os.path.join(video_folder_path, file_name)\n",
    "\n",
    "                    if file_name.endswith(\".mp4\"):\n",
    "                        video_path = file_path\n",
    "                    elif file_name.endswith(\".wav\"):\n",
    "                        audio_path = file_path\n",
    "                    elif file_name.endswith(\".pkl\"):\n",
    "                        motion_path = file_path\n",
    "\n",
    "                # Create an entry only if all the necessary files are present\n",
    "                if video_path and audio_path and motion_path:\n",
    "                    dataset_info.append(\n",
    "                        {\"video_id\": video_id, \"video_path\": video_path, \"audio_path\": audio_path, \"motion_path\": motion_path, \"mode\": mode}\n",
    "                    )\n",
    "\n",
    "    return dataset_info\n",
    "\n",
    "\n",
    "# Set the root directory path of your dataset\n",
    "root_dir = \"/path/to/ExpressiveWholeBodyDatasetReleaseV1.0\"\n",
    "dataset_info = collect_dataset_info(root_dir)\n",
    "output_file = \"dataset_info.json\"\n",
    "with open(output_file, \"w\") as json_file:\n",
    "    json.dump(dataset_info, json_file, indent=4)\n",
    "print(f\"Dataset information saved to {output_file}\")\n",
    "\n",
    "\n",
    "import os\n",
    "import json\n",
    "import pickle\n",
    "import wave\n",
    "\n",
    "\n",
    "def load_pkl(pkl_path):\n",
    "    try:\n",
    "        with open(pkl_path, \"rb\") as f:\n",
    "            data = pickle.load(f)\n",
    "        return data\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading {pkl_path}: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def load_wav(wav_path):\n",
    "    try:\n",
    "        with wave.open(wav_path, \"rb\") as f:\n",
    "            frames = f.getnframes()\n",
    "        return frames\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading {wav_path}: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def generate_clips(data, stride, window_length):\n",
    "    clips = []\n",
    "    for entry in data:\n",
    "        pkl_data = load_pkl(entry[\"motion_path\"])\n",
    "        wav_frames = load_wav(entry[\"audio_path\"])\n",
    "\n",
    "        # Only continue if both the pkl and wav files are successfully loaded\n",
    "        if pkl_data is None or wav_frames is None:\n",
    "            continue\n",
    "\n",
    "        # Determine the total length of the sequence from pkl data\n",
    "        total_frames = len(pkl_data)  # Assuming pkl contains motion data frames\n",
    "\n",
    "        # Generate clips based on stride and window_length\n",
    "        for start_idx in range(0, total_frames - window_length + 1, stride):\n",
    "            end_idx = start_idx + window_length\n",
    "            clip = {\n",
    "                \"video_id\": entry[\"video_id\"],\n",
    "                \"video_path\": entry[\"video_path\"],\n",
    "                \"audio_path\": entry[\"audio_path\"],\n",
    "                \"motion_path\": entry[\"motion_path\"],\n",
    "                \"mode\": entry[\"mode\"],\n",
    "                \"start_idx\": start_idx,\n",
    "                \"end_idx\": end_idx,\n",
    "            }\n",
    "            clips.append(clip)\n",
    "\n",
    "    return clips\n",
    "\n",
    "\n",
    "# Load the existing dataset JSON file\n",
    "input_json = \"dataset_info.json\"\n",
    "with open(input_json, \"r\") as f:\n",
    "    dataset_info = json.load(f)\n",
    "\n",
    "# Set stride and window length\n",
    "stride = 5  # Adjust stride as needed\n",
    "window_length = 10  # Adjust window length as needed\n",
    "\n",
    "# Generate clips for all data\n",
    "clips_data = generate_clips(dataset_info, stride, window_length)\n",
    "\n",
    "# Save the filtered clips data to a new JSON file\n",
    "output_json = \"filtered_clips_data.json\"\n",
    "with open(output_json, \"w\") as f:\n",
    "    json.dump(clips_data, f, indent=4)\n",
    "\n",
    "print(f\"Filtered clips data saved to {output_json}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import pickle\n",
    "\n",
    "with open(\n",
    "    \"/Users/haiyang/download_backup/oliver/Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm/test/214438-00_07_16-00_07_26/214438-00_07_16-00_07_26.pkl\",\n",
    "    \"rb\",\n",
    ") as f:\n",
    "    # Load the file by mapping any GPU tensors to CPU\n",
    "    pkl_example = torch.load(f, map_location=torch.device(\"cpu\"))\n",
    "\n",
    "# Now check the type of the object\n",
    "print(type(pkl_example))\n",
    "\n",
    "# If it's a dictionary, print its keys\n",
    "if isinstance(pkl_example, dict):\n",
    "    print(pkl_example.keys())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}