{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "ename": "AttributeError",
     "evalue": "module 'torch' has no attribute 'set_grad_enabled'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdistributed\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mdist\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mvlmeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfig\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m supported_VLM\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mvlmeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m build_dataset\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mvlmeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01minference\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m infer_data_job\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/VLMEvalKit/vlmeval/__init__.py:7\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msmp\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[0;32m----> 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/VLMEvalKit/vlmeval/api/__init__.py:4\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhf_chat_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HFChatModel\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mgemini\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GeminiWrapper, GeminiProVision\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mqwen_vl_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m QwenVLWrapper, QwenVLAPI, Qwen2VLAPI\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mqwen_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m QwenAPI\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mclaude\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Claude_Wrapper, Claude3V\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/VLMEvalKit/vlmeval/api/qwen_vl_api.py:8\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mvlmeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msmp\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mvlmeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mapi\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseAPI\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mvlmeval\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvlm\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mqwen2_vl\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mprompt\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Qwen2VLPromptMixin\n\u001b[1;32m     11\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mensure_image_url\u001b[39m(image: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[1;32m     12\u001b[0m     prefixes \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhttp://\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhttps://\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfile://\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata:image;\u001b[39m\u001b[38;5;124m'\u001b[39m]\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/VLMEvalKit/vlmeval/vlm/__init__.py:3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mset_grad_enabled\u001b[49m(\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m      4\u001b[0m torch\u001b[38;5;241m.\u001b[39mmanual_seed(\u001b[38;5;241m1234\u001b[39m)\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01maria\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Aria\n",
      "\u001b[0;31mAttributeError\u001b[0m: module 'torch' has no attribute 'set_grad_enabled'"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.distributed as dist\n",
    "\n",
    "from vlmeval.config import supported_VLM\n",
    "from vlmeval.dataset import build_dataset\n",
    "from vlmeval.inference import infer_data_job\n",
    "from vlmeval.inference_video import infer_data_job_video\n",
    "from vlmeval.inference_mt import infer_data_job_mt\n",
    "from vlmeval.smp import *\n",
    "from vlmeval.utils.result_transfer import MMMU_result_transfer, MMTBench_result_transfer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found existing installation: torch 2.5.1\n",
      "Uninstalling torch-2.5.1:\n",
      "  Successfully uninstalled torch-2.5.1\n",
      "Found existing installation: torchvision 0.20.1\n",
      "Uninstalling torchvision-0.20.1:\n",
      "  Successfully uninstalled torchvision-0.20.1\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "# %pip (unlike !pip) runs pip for the interpreter of the running kernel.\n",
    "%pip uninstall -y torch torchvision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "operator torchvision::nms does not exist",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorchvision\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtransforms\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtransforms\u001b[39;00m\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages/torchvision/__init__.py:10\u001b[0m\n\u001b[1;32m      7\u001b[0m \u001b[38;5;66;03m# Don't re-order these, we need to load the _C extension (done when importing\u001b[39;00m\n\u001b[1;32m      8\u001b[0m \u001b[38;5;66;03m# .extensions) before entering _meta_registrations.\u001b[39;00m\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mextension\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _HAS_OPS  \u001b[38;5;66;03m# usort:skip\u001b[39;00m\n\u001b[0;32m---> 10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorchvision\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _meta_registrations, datasets, io, models, ops, transforms, utils  \u001b[38;5;66;03m# usort:skip\u001b[39;00m\n\u001b[1;32m     12\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m     13\u001b[0m     \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__  \u001b[38;5;66;03m# noqa: F401\u001b[39;00m\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages/torchvision/_meta_registrations.py:164\u001b[0m\n\u001b[1;32m    153\u001b[0m     torch\u001b[38;5;241m.\u001b[39m_check(\n\u001b[1;32m    154\u001b[0m         grad\u001b[38;5;241m.\u001b[39mdtype \u001b[38;5;241m==\u001b[39m rois\u001b[38;5;241m.\u001b[39mdtype,\n\u001b[1;32m    155\u001b[0m         \u001b[38;5;28;01mlambda\u001b[39;00m: (\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    158\u001b[0m         ),\n\u001b[1;32m    159\u001b[0m     )\n\u001b[1;32m    160\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m grad\u001b[38;5;241m.\u001b[39mnew_empty((batch_size, channels, height, width))\n\u001b[1;32m    163\u001b[0m \u001b[38;5;129;43m@torch\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlibrary\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister_fake\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtorchvision::nms\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;43;01mdef\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;21;43mmeta_nms\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mdets\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscores\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43miou_threshold\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m    165\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdets\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdim\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m:\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mboxes should be a 2d tensor, got \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mdets\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdim\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43mD\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    166\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdets\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msize\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mboxes should have 4 elements in dimension 1, got \u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mdets\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msize\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages/torch/library.py:795\u001b[0m, in \u001b[0;36mregister_fake.<locals>.register\u001b[0;34m(func)\u001b[0m\n\u001b[1;32m    793\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    794\u001b[0m     use_lib \u001b[38;5;241m=\u001b[39m lib\n\u001b[0;32m--> 795\u001b[0m \u001b[43muse_lib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_register_fake\u001b[49m\u001b[43m(\u001b[49m\u001b[43mop_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_stacklevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstacklevel\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    796\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages/torch/library.py:184\u001b[0m, in \u001b[0;36mLibrary._register_fake\u001b[0;34m(self, op_name, fn, _stacklevel)\u001b[0m\n\u001b[1;32m    181\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    182\u001b[0m     func_to_register \u001b[38;5;241m=\u001b[39m fn\n\u001b[0;32m--> 184\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[43mentry\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfake_impl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mregister\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunc_to_register\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msource\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    185\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_registration_handles\u001b[38;5;241m.\u001b[39mappend(handle)\n",
      "File \u001b[0;32m/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages/torch/_library/fake_impl.py:31\u001b[0m, in \u001b[0;36mFakeImplHolder.register\u001b[0;34m(self, func, source)\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkernel \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m     26\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m     27\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mregister_fake(...): the operator \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqualname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     28\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124malready has an fake impl registered at \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     29\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mkernel\u001b[38;5;241m.\u001b[39msource\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     30\u001b[0m     )\n\u001b[0;32m---> 31\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dispatch_has_kernel_for_dispatch_key\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqualname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mMeta\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m:\n\u001b[1;32m     32\u001b[0m     
\u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m     33\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mregister_fake(...): the operator \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqualname\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     34\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124malready has an DispatchKey::Meta implementation via a \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     37\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mregister_fake.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     38\u001b[0m     )\n\u001b[1;32m     40\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39m_C\u001b[38;5;241m.\u001b[39m_dispatch_has_kernel_for_dispatch_key(\n\u001b[1;32m     41\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqualname, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCompositeImplicitAutograd\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m     42\u001b[0m ):\n",
      "\u001b[0;31mRuntimeError\u001b[0m: operator torchvision::nms does not exist"
     ]
    }
   ],
   "source": [
    "import torchvision.transforms as transforms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
      "\u001b[0mRequirement already satisfied: torch in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (2.5.1)\n",
      "Requirement already satisfied: torchvision in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (0.20.1)\n",
      "Requirement already satisfied: filelock in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (3.13.1)\n",
      "Requirement already satisfied: typing-extensions>=4.8.0 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (4.11.0)\n",
      "Requirement already satisfied: networkx in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (3.3)\n",
      "Requirement already satisfied: jinja2 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (3.1.4)\n",
      "Requirement already satisfied: fsspec in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (2024.6.1)\n",
      "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (12.4.127)\n",
      "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (12.4.127)\n",
      "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (12.4.127)\n",
      "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (9.1.0.70)\n",
      "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (12.4.5.8)\n",
      "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (11.2.1.3)\n",
      "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (10.3.5.147)\n",
      "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (11.6.1.9)\n",
      "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (12.3.1.170)\n",
      "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (2.21.5)\n",
      "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (12.4.127)\n",
      "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (12.4.127)\n",
      "Requirement already satisfied: triton==3.1.0 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (3.1.0)\n",
      "Requirement already satisfied: sympy==1.13.1 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torch) (1.13.1)\n",
      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from sympy==1.13.1->torch) (1.3.0)\n",
      "Requirement already satisfied: numpy in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torchvision) (1.26.4)\n",
      "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from torchvision) (11.0.0)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages (from jinja2->torch) (2.1.3)\n",
      "\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
      "\u001b[0m\u001b[33mWARNING: Ignoring invalid distribution -ransformers (/dscilab_dungvo/workspace/bin/envs/vlmeval/lib/python3.10/site-packages)\u001b[0m\u001b[33m\n",
      "\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "# Versions are the pair reported in this cell's recorded output; pinning keeps reruns reproducible.\n",
    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
    "%pip install torch==2.5.1 torchvision==0.20.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def build_model_from_config(cfg):\n",
    "    \"\"\"Instantiate the model described by ``cfg``.\n",
    "\n",
    "    Args:\n",
    "        cfg: Mapping that holds a ``'class'`` entry naming an attribute of\n",
    "            ``vlmeval.api`` or ``vlmeval.vlm``; every remaining entry is passed\n",
    "            to that class as a keyword argument. ``cfg`` itself is not modified.\n",
    "\n",
    "    Returns:\n",
    "        The object produced by calling the named class. ``vlmeval.api`` is\n",
    "        searched before ``vlmeval.vlm``.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: If ``cfg`` has no ``'class'`` entry.\n",
    "        ValueError: If neither module exposes the requested name.\n",
    "    \"\"\"\n",
    "    # Import `copy` locally rather than using the `cp` alias: no visible cell defines\n",
    "    # `cp` explicitly (presumably it leaks in through a wildcard import).\n",
    "    import copy\n",
    "    import vlmeval.api\n",
    "    import vlmeval.vlm\n",
    "\n",
    "    # Work on a deep copy so popping 'class' never mutates the caller's mapping.\n",
    "    config = copy.deepcopy(cfg)\n",
    "    assert 'class' in config, \"model config must contain a 'class' entry\"\n",
    "    cls_name = config.pop('class')\n",
    "    for module in (vlmeval.api, vlmeval.vlm):\n",
    "        if hasattr(module, cls_name):\n",
    "            return getattr(module, cls_name)(**config)\n",
    "    raise ValueError(f'Class {cls_name} is not supported in `vlmeval.api` or `vlmeval.vlm`')\n",
    "    \n",
    "\n",
    "def build_dataset_from_config(cfg):\n",
    "    \"\"\"Instantiate the dataset described by ``cfg``.\n",
    "\n",
    "    Args:\n",
    "        cfg: Mapping that holds a ``'class'`` entry naming an attribute of\n",
    "            ``vlmeval.dataset``; every remaining entry is passed to that class\n",
    "            as a keyword argument. ``cfg`` itself is not modified.\n",
    "\n",
    "    Returns:\n",
    "        The object produced by calling the named class.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: If ``cfg`` has no ``'class'`` entry.\n",
    "        ValueError: If ``vlmeval.dataset`` does not expose the requested name.\n",
    "    \"\"\"\n",
    "    # Import `copy` locally rather than using the `cp` alias: no visible cell defines\n",
    "    # `cp` explicitly (presumably it leaks in through a wildcard import).\n",
    "    import copy\n",
    "    import vlmeval.dataset\n",
    "\n",
    "    # Work on a deep copy so popping 'class' never mutates the caller's mapping.\n",
    "    config = copy.deepcopy(cfg)\n",
    "    assert 'class' in config, \"dataset config must contain a 'class' entry\"\n",
    "    cls_name = config.pop('class')\n",
    "    if not hasattr(vlmeval.dataset, cls_name):\n",
    "        raise ValueError(f'Class {cls_name} is not supported in `vlmeval.dataset`')\n",
    "    return getattr(vlmeval.dataset, cls_name)(**config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "def parse_args(argv=None):\n",
    "    \"\"\"Build the evaluation command-line parser and parse the arguments.\n",
    "\n",
    "    Args:\n",
    "        argv: Optional list of argument strings, without the program name.\n",
    "            When ``None`` (the default) argparse falls back to ``sys.argv[1:]``,\n",
    "            which is what the zero-argument form of this function always did.\n",
    "            Passing a list lets a notebook parse options without touching\n",
    "            ``sys.argv``.\n",
    "\n",
    "    Returns:\n",
    "        argparse.Namespace holding the parsed options.\n",
    "    \"\"\"\n",
    "    # Imported locally: no visible cell imports the `argparse` module by name.\n",
    "    import argparse\n",
    "\n",
    "    help_msg = \"\"\"\\\n",
    "You can launch the evaluation by setting either --data and --model or --config.\n",
    "\n",
    "--data and --model:\n",
    "    Each Arg should be a list of strings, specifying the names of datasets and models.\n",
    "    To find all supported model names, please refer to the `vlmeval/config.py` or check the output of the command \\\n",
    "        `vlmutil mlist all` in the terminal (you should first have vlmeval installed).\n",
    "    To find all supported dataset names, please refer to the `vlmeval/dataset/__init__.py` file. The python script \\\n",
    "        to print all supported dataset names is as follows:\n",
    "        ```python\n",
    "        from vlmeval.dataset import SUPPORTED_DATASETS\n",
    "        print(SUPPORTED_DATASETS)\n",
    "        ```\n",
    "        or you can check the output of the command `vlmutil dlist all` in the terminal.\n",
    "\n",
    "--config:\n",
    "    Launch the evaluation by specifying the path to the config json file. Sample Json Content:\n",
    "    ```json\n",
    "    {\n",
    "        \"model\": {\n",
    "            \"GPT4o_20240806_T00_HIGH\": {\n",
    "                \"class\": \"GPT4V\",\n",
    "                \"model\": \"gpt-4o-2024-08-06\",\n",
    "                \"temperature\": 0,\n",
    "                \"img_detail\": \"high\"\n",
    "            },\n",
    "            \"GPT4o_20240806_T10_Low\": {\n",
    "                \"class\": \"GPT4V\",\n",
    "                \"model\": \"gpt-4o-2024-08-06\",\n",
    "                \"temperature\": 1.0,\n",
    "                \"img_detail\": \"low\"\n",
    "            }\n",
    "        },\n",
    "        \"data\": {\n",
    "            \"MME-RealWorld-Lite\": {\n",
    "                \"class\": \"MMERealWorld\",\n",
    "                \"dataset\": \"MME-RealWorld-Lite\"\n",
    "            },\n",
    "            \"MMBench_DEV_EN_V11\": {\n",
    "                \"class\": \"ImageMCQDataset\",\n",
    "                \"dataset\": \"MMBench_DEV_EN_V11\"\n",
    "            }\n",
    "        }\n",
    "    }\n",
    "    ```\n",
    "    Currently, only `model` and `data` are supported fields. The content of each field is a dictionary.\n",
    "    For `model`, the key is the name of the model, and the value is a dictionary containing the following keys:\n",
    "    - `class`: The class name of the model, which should be a class in `vlmeval.vlm` or `vlmeval.api`.\n",
    "    - Other keys are specific to the model, please refer to the corresponding class.\n",
    "    For `data`, the key is the name of the dataset (should be the same as the `dataset` field in most cases, \\\n",
    "        except for video datasets), and the value is a dictionary containing the following keys:\n",
    "    - `class`: The class name of the dataset, which should be a class in `vlmeval.dataset`.\n",
    "    - `dataset`: The name of the dataset, which should be a string that is accepted by the `dataset` argument of the \\\n",
    "        corresponding class.\n",
    "    - Other keys are specific to the dataset, please refer to the corresponding class.\n",
    "\n",
    "    The keys in the `model` and `data` fields will be used for naming the prediction files and evaluation results.\n",
    "    When launching with `--config`, args for video datasets, such as `--nframe`, `--pack`, `--use-subtitle`, `--fps`, \\\n",
    "        and args for API VLMs, such as `--retry`, `--verbose`, will be ignored.\n",
    "\"\"\"\n",
    "    # RawTextHelpFormatter keeps the hand-formatted line breaks of help_msg intact.\n",
    "    parser = argparse.ArgumentParser(description=help_msg, formatter_class=argparse.RawTextHelpFormatter)\n",
    "    # Essential Args, Setting the Names of Datasets and Models\n",
    "    parser.add_argument('--data', type=str, nargs='+', help='Names of Datasets')\n",
    "    parser.add_argument('--model', type=str, nargs='+', help='Names of Models')\n",
    "    parser.add_argument('--config', type=str, help='Path to the Config Json File', default=None)\n",
    "    # Args that only apply to Video Dataset\n",
    "    parser.add_argument('--nframe', type=int, default=8)\n",
    "    parser.add_argument('--pack', action='store_true')\n",
    "    parser.add_argument('--use-subtitle', action='store_true')\n",
    "    parser.add_argument('--fps', type=float, default=-1)\n",
    "    # Work Dir\n",
    "    parser.add_argument('--work-dir', type=str, default='./outputs', help='select the output directory')\n",
    "    # Infer + Eval or Infer Only\n",
    "    parser.add_argument('--mode', type=str, default='all', choices=['all', 'infer'])\n",
    "    # API Kwargs, Apply to API VLMs and Judge API LLMs\n",
    "    parser.add_argument('--nproc', type=int, default=4, help='Parallel API calling')\n",
    "    parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs')\n",
    "    # Explicitly Set the Judge Model\n",
    "    parser.add_argument('--judge', type=str, default=None)\n",
    "    # Logging Utils\n",
    "    parser.add_argument('--verbose', action='store_true')\n",
    "    # Configuration for Resume\n",
    "    # Ignore: will not rerun failed VLM inference\n",
    "    parser.add_argument('--ignore', action='store_true', help='Ignore failed indices. ')\n",
    "    # Reuse: will reuse the existing prediction files\n",
    "    parser.add_argument('--reuse', action='store_true')\n",
    "\n",
    "    # argv=None makes argparse read sys.argv[1:], preserving the original call form.\n",
    "    return parser.parse_args(argv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'parse_args' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[2], line 27\u001b[0m\n\u001b[1;32m     22\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m args\n\u001b[1;32m     23\u001b[0m \u001b[38;5;66;03m# command2args('notebook --arg1 10 --arg2 hello', parser)\u001b[39;00m\n\u001b[1;32m     24\u001b[0m \n\u001b[1;32m     25\u001b[0m \u001b[38;5;66;03m# python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose\u001b[39;00m\n\u001b[1;32m     26\u001b[0m \u001b[38;5;66;03m# args = command2args('python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model SmolVLM --verbose')\u001b[39;00m\n\u001b[0;32m---> 27\u001b[0m args \u001b[38;5;241m=\u001b[39m \u001b[43mcommand2args\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mpython run.py --data COCO_VAL --model SmolVLM --verbose\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "Cell \u001b[0;32mIn[2], line 21\u001b[0m, in \u001b[0;36mcommand2args\u001b[0;34m(command)\u001b[0m\n\u001b[1;32m     14\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcommand2args\u001b[39m(command):\n\u001b[1;32m     15\u001b[0m     \n\u001b[1;32m     16\u001b[0m     \u001b[38;5;66;03m# remove file name\u001b[39;00m\n\u001b[1;32m     19\u001b[0m     sys\u001b[38;5;241m.\u001b[39margv \u001b[38;5;241m=\u001b[39m command\u001b[38;5;241m.\u001b[39msplit()[\u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m---> 21\u001b[0m     args \u001b[38;5;241m=\u001b[39m \u001b[43mparse_args\u001b[49m()\n\u001b[1;32m     22\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m args\n",
      "\u001b[0;31mNameError\u001b[0m: name 'parse_args' is not defined"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "from argparse import ArgumentParser\n",
    "\n",
    "# parser = ArgumentParser(description=\"Example argparse in Jupyter Notebook\")\n",
    "# parser.add_argument('--arg1', type=int, help='An integer argument')\n",
    "# parser.add_argument('--arg2', type=str, help='A string argument')\n",
    "# sys.argv = ['notebook', '--arg1', '10', '--arg2', 'hello']\n",
    "# args = parser.parse_args()\n",
    "# print(f\"arg1: {args.arg1}, arg2: {args.arg2}\")\n",
    "\n",
    "\n",
    "# notebook --arg1 10 --arg2 hello\n",
    "\n",
    "def command2args(command):\n",
    "    \n",
    "    # remove file name\n",
    "    \n",
    "    \n",
    "    sys.argv = command.split()[1:]\n",
    "\n",
    "    args = parse_args()\n",
    "    return args\n",
    "# command2args('notebook --arg1 10 --arg2 hello', parser)\n",
    "\n",
    "# python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose\n",
    "# args = command2args('python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model SmolVLM --verbose')\n",
    "args = command2args('python run.py --data COCO_VAL --model SmolVLM --verbose')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2024-12-13 07:39:07,083] WARNING - RUN - 245429849.py: <module> - 14: --reuse is not set, will not reuse previous (before one day) temporary files\n",
      "[2024-12-13 07:39:07] WARNING - 245429849.py: <module> - 14: --reuse is not set, will not reuse previous (before one day) temporary files\n"
     ]
    }
   ],
   "source": [
    "logger = get_logger('RUN')\n",
    "rank, world_size = get_rank_and_world_size()\n",
    "if args.config is not None:\n",
    "    assert args.data is None and args.model is None, '--data and --model should not be set when using --config'\n",
    "    use_config, cfg = True, load(args.config)\n",
    "    args.model = list(cfg['model'].keys())\n",
    "    args.data = list(cfg['data'].keys())\n",
    "else:\n",
    "    assert len(args.data), '--data should be a list of data files'\n",
    "    \n",
    "    \n",
    "if rank == 0:\n",
    "    if not args.reuse:\n",
    "        logger.warning('--reuse is not set, will not reuse previous (before one day) temporary files')\n",
    "    else:\n",
    "        logger.warning('--reuse is set, will reuse the latest prediction & temporary pickle files')\n",
    "\n",
    "if 'MMEVAL_ROOT' in os.environ:\n",
    "    args.work_dir = os.environ['MMEVAL_ROOT']\n",
    "\n",
    "use_config, cfg = False, None\n",
    "if not use_config:\n",
    "    for k, v in supported_VLM.items():\n",
    "        if hasattr(v, 'keywords') and 'retry' in v.keywords and args.retry is not None:\n",
    "            v.keywords['retry'] = args.retry\n",
    "            supported_VLM[k] = v\n",
    "        if hasattr(v, 'keywords') and 'verbose' in v.keywords and args.verbose is not None:\n",
    "            v.keywords['verbose'] = args.verbose\n",
    "            supported_VLM[k] = v\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['COCO_VAL']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "args.data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = args.data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/dscilab_dungvo/workspace/VLMEvalKit/vlmeval/dataset/image_base.py:93: UserWarning: The dataset tsv is not downloaded\n",
      "  warnings.warn('The dataset tsv is not downloaded')\n",
      "COCO_VAL.tsv: 345MB [00:48, 7.09MB/s]                                                                                                                      \n"
     ]
    }
   ],
   "source": [
    "list_datasets = []\n",
    "dataset_kwargs = {}\n",
    "if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']:\n",
    "    dataset_kwargs['model'] = model_name\n",
    "if dataset_name == 'MMBench-Video':\n",
    "    dataset_kwargs['pack'] = args.pack\n",
    "if dataset_name == 'Video-MME':\n",
    "    dataset_kwargs['use_subtitle'] = args.use_subtitle\n",
    "for args_data in args.data:\n",
    "    list_datasets.append(build_dataset(args_data, **dataset_kwargs))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vlmeval",
   "language": "python",
   "name": "vlmeval"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}