{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "One-off conversion / data-wrangling script: it replaces each frame's `image_path` with the base64-encoded image itself, to avoid re-downloading the videos." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import base64\n", "\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "df = pd.read_parquet(\"../data/dataset_original.parquet\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
video_idframe_idxtimestampimage_pathdim_0dim_1dim_2dim_3dim_4dim_5...dim_502dim_503dim_504dim_505dim_506dim_507dim_508dim_509dim_510dim_511
08Ilh1ewceco00.0data/images/8Ilh1ewceco/0.jpg-0.013127-0.022996-0.049374-0.0063060.013602-0.003762...-0.0069200.0138310.0566470.007946-0.002478-0.030497-0.0117700.067427-0.031810-0.025615
18Ilh1ewceco1455.0data/images/8Ilh1ewceco/145.jpg0.0090400.0033380.029684-0.0330580.040864-0.006447...0.033575-0.0190760.047166-0.010574-0.018608-0.013465-0.0200170.086240-0.0296530.035949
28Ilh1ewceco29010.0data/images/8Ilh1ewceco/290.jpg0.0048910.0065270.004417-0.0003230.006400-0.024191...-0.043122-0.0106950.0056720.000172-0.014442-0.014647-0.0168400.1002850.0137940.015046
38Ilh1ewceco43515.0data/images/8Ilh1ewceco/435.jpg-0.0221590.020703-0.021607-0.019721-0.006067-0.035070...-0.017047-0.018341-0.006733-0.007040-0.0083680.009755-0.0456620.116601-0.000572-0.000985
48Ilh1ewceco58020.0data/images/8Ilh1ewceco/580.jpg-0.0159030.0335450.009257-0.0335400.010586-0.028067...-0.0165320.0123880.020868-0.0126350.0109140.009203-0.0100780.063971-0.0380240.025840
\n", "

5 rows × 516 columns

\n", "
" ], "text/plain": [ " video_id frame_idx timestamp image_path \\\n", "0 8Ilh1ewceco 0 0.0 data/images/8Ilh1ewceco/0.jpg \n", "1 8Ilh1ewceco 145 5.0 data/images/8Ilh1ewceco/145.jpg \n", "2 8Ilh1ewceco 290 10.0 data/images/8Ilh1ewceco/290.jpg \n", "3 8Ilh1ewceco 435 15.0 data/images/8Ilh1ewceco/435.jpg \n", "4 8Ilh1ewceco 580 20.0 data/images/8Ilh1ewceco/580.jpg \n", "\n", " dim_0 dim_1 dim_2 dim_3 dim_4 dim_5 ... dim_502 \\\n", "0 -0.013127 -0.022996 -0.049374 -0.006306 0.013602 -0.003762 ... -0.006920 \n", "1 0.009040 0.003338 0.029684 -0.033058 0.040864 -0.006447 ... 0.033575 \n", "2 0.004891 0.006527 0.004417 -0.000323 0.006400 -0.024191 ... -0.043122 \n", "3 -0.022159 0.020703 -0.021607 -0.019721 -0.006067 -0.035070 ... -0.017047 \n", "4 -0.015903 0.033545 0.009257 -0.033540 0.010586 -0.028067 ... -0.016532 \n", "\n", " dim_503 dim_504 dim_505 dim_506 dim_507 dim_508 dim_509 \\\n", "0 0.013831 0.056647 0.007946 -0.002478 -0.030497 -0.011770 0.067427 \n", "1 -0.019076 0.047166 -0.010574 -0.018608 -0.013465 -0.020017 0.086240 \n", "2 -0.010695 0.005672 0.000172 -0.014442 -0.014647 -0.016840 0.100285 \n", "3 -0.018341 -0.006733 -0.007040 -0.008368 0.009755 -0.045662 0.116601 \n", "4 0.012388 0.020868 -0.012635 0.010914 0.009203 -0.010078 0.063971 \n", "\n", " dim_510 dim_511 \n", "0 -0.031810 -0.025615 \n", "1 -0.029653 0.035949 \n", "2 0.013794 0.015046 \n", "3 -0.000572 -0.000985 \n", "4 -0.038024 0.025840 \n", "\n", "[5 rows x 516 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 71761/71761 [00:49<00:00, 1458.75it/s]\n" ] } ], "source": [ "# Encode every frame image as base64 text and store it in place of its path.\n", "# The column is built as a list and assigned in one go (positionally). Writing\n", "# row-by-row with `new_df.loc[i, ...]` is label-based, so it only lands on the\n", "# right rows when the index is exactly 0..n-1; otherwise it silently appends rows.\n", "image_paths = df[\"image_path\"].tolist()\n", "encoded_images = []\n", "for img in tqdm(image_paths):\n", "    # Stored paths are relative to the notebook's parent directory, hence the \"../\".\n", "    with open(f\"../{img}\", \"rb\") as image_file:\n", "        encoded_images.append(base64.b64encode(image_file.read()).decode())\n", "\n", "# Same frame, with the path column renamed and its values swapped for the image data.\n", "new_df = df.rename(columns={\"image_path\": \"base64_image\"})\n", "new_df[\"base64_image\"] = encoded_images" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/sidneyradcliffe/miniforge3/envs/visual-content-search-over-videos/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from pipeline.process_videos import DATAFRAME_PATH\n", "\n", "new_df.to_parquet(DATAFRAME_PATH, index=False)" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# reload, check it's correct\n", "new_df = pd.read_parquet(DATAFRAME_PATH)" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
video_idframe_idxtimestampbase64_imagedim_0dim_1dim_2dim_3dim_4dim_5...dim_502dim_503dim_504dim_505dim_506dim_507dim_508dim_509dim_510dim_511
08Ilh1ewceco00.0b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...-0.013127-0.022996-0.049374-0.0063060.013602-0.003762...-0.0069200.0138310.0566470.007946-0.002478-0.030497-0.0117700.067427-0.031810-0.025615
18Ilh1ewceco1455.0b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...0.0090400.0033380.029684-0.0330580.040864-0.006447...0.033575-0.0190760.047166-0.010574-0.018608-0.013465-0.0200170.086240-0.0296530.035949
28Ilh1ewceco29010.0b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...0.0048910.0065270.004417-0.0003230.006400-0.024191...-0.043122-0.0106950.0056720.000172-0.014442-0.014647-0.0168400.1002850.0137940.015046
38Ilh1ewceco43515.0b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...-0.0221590.020703-0.021607-0.019721-0.006067-0.035070...-0.017047-0.018341-0.006733-0.007040-0.0083680.009755-0.0456620.116601-0.000572-0.000985
48Ilh1ewceco58020.0b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...-0.0159030.0335450.009257-0.0335400.010586-0.028067...-0.0165320.0123880.020868-0.0126350.0109140.009203-0.0100780.063971-0.0380240.025840
\n", "

5 rows × 516 columns

\n", "
" ], "text/plain": [ " video_id frame_idx timestamp \\\n", "0 8Ilh1ewceco 0 0.0 \n", "1 8Ilh1ewceco 145 5.0 \n", "2 8Ilh1ewceco 290 10.0 \n", "3 8Ilh1ewceco 435 15.0 \n", "4 8Ilh1ewceco 580 20.0 \n", "\n", " base64_image dim_0 dim_1 \\\n", "0 b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH... -0.013127 -0.022996 \n", "1 b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH... 0.009040 0.003338 \n", "2 b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH... 0.004891 0.006527 \n", "3 b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH... -0.022159 0.020703 \n", "4 b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH... -0.015903 0.033545 \n", "\n", " dim_2 dim_3 dim_4 dim_5 ... dim_502 dim_503 dim_504 \\\n", "0 -0.049374 -0.006306 0.013602 -0.003762 ... -0.006920 0.013831 0.056647 \n", "1 0.029684 -0.033058 0.040864 -0.006447 ... 0.033575 -0.019076 0.047166 \n", "2 0.004417 -0.000323 0.006400 -0.024191 ... -0.043122 -0.010695 0.005672 \n", "3 -0.021607 -0.019721 -0.006067 -0.035070 ... -0.017047 -0.018341 -0.006733 \n", "4 0.009257 -0.033540 0.010586 -0.028067 ... 
-0.016532 0.012388 0.020868 \n", "\n", " dim_505 dim_506 dim_507 dim_508 dim_509 dim_510 dim_511 \n", "0 0.007946 -0.002478 -0.030497 -0.011770 0.067427 -0.031810 -0.025615 \n", "1 -0.010574 -0.018608 -0.013465 -0.020017 0.086240 -0.029653 0.035949 \n", "2 0.000172 -0.014442 -0.014647 -0.016840 0.100285 0.013794 0.015046 \n", "3 -0.007040 -0.008368 0.009755 -0.045662 0.116601 -0.000572 -0.000985 \n", "4 -0.012635 0.010914 0.009203 -0.010078 0.063971 -0.038024 0.025840 \n", "\n", "[5 rows x 516 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df.head()" ] } ], "metadata": { "kernelspec": { "display_name": "visual-content-search-over-videos", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }