diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..b07961f7ff7e7298b2d45abc3b32b8372552a9bc 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +pages/4M-21/video_1.mp4 filter=lfs diff=lfs merge=lfs -text +pages/Depth[[:space:]]Anything/video_1.mp4 filter=lfs diff=lfs merge=lfs -text +pages/RT-DETR/video_1.mp4 filter=lfs diff=lfs merge=lfs -text diff --git a/Home.py b/Home.py new file mode 100644 index 0000000000000000000000000000000000000000..f7baa867f275f48d021719c1899e1979d01bf6dd --- /dev/null +++ b/Home.py @@ -0,0 +1,16 @@ +import streamlit as st + +st.set_page_config(page_title="Home",page_icon="๐Ÿ ") + +# st.image("image_of_a_Turkish_lofi_girl_sitting_at_a_desk_writing_summaries_of_scientific_publications_ghibli_anime_like_hd.jpeg", use_column_width=True) + +st.write("# Vision Papers ๐Ÿ“š") + + +st.markdown( + """ + I've created a simple Streamlit App where I list summaries of papers (my browser bookmarks or Twitter bookmarks were getting messy). + Since you're one of my sources for bibliography, I thought you might be interested in having all your summaries grouped together somewhere + (average of 0.73 summaries per week, I don't know what it's your fuel but that's impressive). + """ +) \ No newline at end of file diff --git a/README.md b/README.md index 062a46741fe717e1e5c6fa7d79f176eebbeb0826..1e63885b44afe7a61d0f3a09ca90a4af2c7e9066 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ --- title: Vision Papers -emoji: ๐Ÿ“Š -colorFrom: yellow -colorTo: indigo +emoji: ๐Ÿ’ป +colorFrom: indigo +colorTo: blue sdk: streamlit sdk_version: 1.37.0 -app_file: app.py +app_file: Home.py pinned: false --- diff --git a/pages/10_Painter.py b/pages/10_Painter.py new file mode 100644 index 0000000000000000000000000000000000000000..ab097f9d446945a195355827f3411c5aa1b71d0a --- /dev/null +++ b/pages/10_Painter.py @@ -0,0 +1,53 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Painter") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1771542172946354643) (March 23, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""I read the Painter [paper](https://t.co/r3aHp29mjf) by [BAAIBeijing](https://x.com/BAAIBeijing) to convert the weights to ๐Ÿค— Transformers, and I absolutely loved the approach they took so I wanted to take time to unfold it here! +""") +st.markdown(""" """) + +st.image("pages/Painter/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""So essentially this model takes inspiration from in-context learning, as in, in LLMs you give an example input output and give the actual input that you want model to complete (one-shot learning) they adapted this to images, thus the name "images speak in images". + +This model doesn't have any multimodal parts, it just has an image encoder and a decoder head (linear layer, conv layer, another linear layer) so it's a single modality. 
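+
+To make the "decoder head" part a bit more concrete, here is a tiny PyTorch sketch of a linear -> conv -> linear head like the one described above (purely illustrative: the layer details and dimensions are my own guesses, not the paper's exact code):
+
+```python
+import torch
+import torch.nn as nn
+
+class ToyPixelHead(nn.Module):
+    # Illustrative only: maps ViT patch features (B, N, C) back to per-patch pixels
+    def __init__(self, embed_dim=1024, patch_size=16, out_channels=3):
+        super().__init__()
+        self.fc_in = nn.Linear(embed_dim, embed_dim)                                # linear
+        self.conv = nn.Conv2d(embed_dim, embed_dim, kernel_size=3, padding=1)       # conv
+        self.fc_out = nn.Linear(embed_dim, patch_size * patch_size * out_channels)  # linear
+
+    def forward(self, tokens, grid_hw):
+        h, w = grid_hw                                      # patch grid, e.g. (14, 14)
+        x = self.fc_in(tokens)                              # (B, N, C)
+        x = x.transpose(1, 2).reshape(x.size(0), -1, h, w)  # to a (B, C, h, w) feature map
+        x = self.conv(x)                                    # mix neighbouring patches
+        x = x.flatten(2).transpose(1, 2)                    # back to (B, N, C)
+        return self.fc_out(x)                               # per-patch pixel predictions
+
+head = ToyPixelHead()
+print(head(torch.randn(1, 196, 1024), grid_hw=(14, 14)).shape)  # torch.Size([1, 196, 768])
+```
+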
+ +The magic sauce is the data: they input the task in the form of image and associated transformation and another image they want the transformation to take place and take smooth L2 loss over the predictions and ground truth this is like T5 of image models ๐Ÿ˜€ +""") +st.markdown(""" """) + +st.image("pages/Painter/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""What is so cool about it is that it can actually adapt to out of domain tasks, meaning, in below chart, it was trained on the tasks above the dashed line, and the authors found out it generalized to the tasks below the line, image tasks are well generalized ๐Ÿคฏ +""") +st.markdown(""" """) + +st.image("pages/Painter/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[Images Speak in Images: A Generalist Painter for In-Context Visual Learning](https://arxiv.org/abs/2212.02499) +by Xinlong Wang, Wen Wang, Yue Cao, Chunhua Shen, Tiejun Huang (2022) +[GitHub](https://github.com/baaivision/Painter)""", icon="๐Ÿ“š") + + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("LLaVA-NeXT") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("SegGPT") \ No newline at end of file diff --git a/pages/11_SegGPT.py b/pages/11_SegGPT.py new file mode 100644 index 0000000000000000000000000000000000000000..a8d5a366d1ee404028c6bb10f65a279f05560c0b --- /dev/null +++ b/pages/11_SegGPT.py @@ -0,0 +1,70 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("SegGPT") + +st.success("""[Original tweet](https://x.com/mervenoyann/status/1773056450790666568) (March 27, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""SegGPT is a vision generalist on image segmentation, quite like GPT for computer vision โœจ +It comes with the last release of ๐Ÿค— Transformers ๐ŸŽ +Technical details, demo and how-to's under this! +""") +st.markdown(""" """) + +st.image("pages/SegGPT/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""SegGPT is an extension of the Painter where you speak to images with images: the model takes in an image prompt, transformed version of the image prompt, the actual image you want to see the same transform, and expected to output the transformed image. + +SegGPT consists of a vanilla ViT with a decoder on top (linear, conv, linear). The model is trained on diverse segmentation examples, where they provide example image-mask pairs, the actual input to be segmented, and the decoder head learns to reconstruct the mask output. ๐Ÿ‘‡๐Ÿป +""", unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SegGPT/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +This generalizes pretty well! +The authors do not claim state-of-the-art results as the model is mainly used zero-shot and few-shot inference. They also do prompt tuning, where they freeze the parameters of the model and only optimize the image tensor (the input context). +""") +st.markdown(""" """) + +st.image("pages/SegGPT/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Thanks to ๐Ÿค— Transformers you can use this model easily! See [here](https://t.co/U5pVpBhkfK). 
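+
+If you want a head start, here is roughly what inference looks like (a minimal sketch from memory of the model docs, so double-check the processor arguments there; the prompt image/mask paths below are placeholders):
+
+```python
+import torch
+from PIL import Image
+from transformers import SegGptImageProcessor, SegGptForImageSegmentation
+
+checkpoint = "BAAI/seggpt-vit-large"
+processor = SegGptImageProcessor.from_pretrained(checkpoint)
+model = SegGptForImageSegmentation.from_pretrained(checkpoint)
+
+# an example image/mask pair (the "prompt") and the image you actually want segmented
+prompt_image = Image.open("prompt_image.png")
+prompt_mask = Image.open("prompt_mask.png")
+image = Image.open("input_image.png")
+
+inputs = processor(
+    images=image,
+    prompt_images=prompt_image,
+    prompt_masks=prompt_mask,
+    return_tensors="pt",
+)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+target_sizes = [image.size[::-1]]  # (height, width)
+mask = processor.post_process_semantic_segmentation(outputs, target_sizes)[0]
+```
+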
+""") +st.markdown(""" """) + +st.image("pages/SegGPT/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +I have built an app for you to try it out. I combined SegGPT with Depth Anything Model, so you don't have to upload image mask prompts in your prompt pair ๐Ÿค— +Try it [here](https://t.co/uJIwqJeYUy). Also check out the [collection](https://t.co/HvfjWkAEzP). +""") +st.markdown(""" """) + +st.image("pages/SegGPT/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) +by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang (2023) +[GitHub](https://github.com/baaivision/Painter)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Painter") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Grounding DINO") \ No newline at end of file diff --git a/pages/12_Grounding_DINO.py b/pages/12_Grounding_DINO.py new file mode 100644 index 0000000000000000000000000000000000000000..1c430e688011a08d8adffcffcf01c57ffa123f47 --- /dev/null +++ b/pages/12_Grounding_DINO.py @@ -0,0 +1,92 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Grounding DINO") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1780558859221733563) (April 17, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown(""" +We have merged Grounding DINO in ๐Ÿค— Transformers ๐Ÿฆ– +It's an amazing zero-shot object detection model, here's why ๐Ÿงถ +""") +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""There are two zero-shot object detection models as of now, one is OWL series by Google Brain and the other one is Grounding DINO ๐Ÿฆ• +Grounding DINO pays immense attention to detail โฌ‡๏ธ +Also [try yourself](https://t.co/UI0CMxphE7). +""") +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_2.jpeg", use_column_width=True) +st.image("pages/Grounding_DINO/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""I have also built another [application](https://t.co/4EHpOwEpm0) for GroundingSAM, combining GroundingDINO and Segment Anything by Meta for cutting edge zero-shot image segmentation. +""") +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Grounding DINO is essentially a model with connected image encoder (Swin transformer), text encoder (BERT) and on top of both, a decoder that outputs bounding boxes ๐Ÿฆ– +This is quite similar to OWL series, which uses a ViT-based detector on CLIP. 
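+
+Before diving deeper, here is a minimal zero-shot detection sketch with the Transformers integration (the checkpoint name and thresholds are just the ones I remember from the docs):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+model_id = "IDEA-Research/grounding-dino-tiny"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+text = "a cat. a remote control."  # queries are lowercase and end with a dot
+
+inputs = processor(images=image, text=text, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+results = processor.post_process_grounded_object_detection(
+    outputs,
+    inputs.input_ids,
+    box_threshold=0.4,
+    text_threshold=0.3,
+    target_sizes=[image.size[::-1]],
+)
+print(results[0]["labels"], results[0]["scores"], results[0]["boxes"])
+```
+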
+""", unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""The authors train Swin-L/T with BERT contrastively (not like CLIP where they match the images to texts by means of similarity) where they try to approximate the region outputs to language phrases at the head outputs ๐Ÿคฉ +""") +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_6.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""The authors also form the text features on the sub-sentence level. +This means it extracts certain noun phrases from training data to remove the influence between words while removing fine-grained information. +""") +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_7.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Thanks to all of this, Grounding DINO has great performance on various REC/object detection benchmarks ๐Ÿ†๐Ÿ“ˆ +""") +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_8.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Thanks to ๐Ÿค— Transformers, you can use Grounding DINO very easily! +You can also check out [NielsRogge](https://twitter.com/NielsRogge)'s [notebook here](https://t.co/8ADGFdVkta). +""") +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_9.jpeg", use_column_width=True) + + +st.info("""Ressources: +[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) +by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang (2023) +[GitHub](https://github.com/IDEA-Research/GroundingDINO) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/grounding-dino)""", icon="๐Ÿ“š") + + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("SegGPT") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("DocOwl 1.5") \ No newline at end of file diff --git a/pages/13_DocOwl_1.5.py b/pages/13_DocOwl_1.5.py new file mode 100644 index 0000000000000000000000000000000000000000..224da4322d4cf6ae6e81e08f4b8159ffdfe05898 --- /dev/null +++ b/pages/13_DocOwl_1.5.py @@ -0,0 +1,100 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("DocOwl 1.5") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1782421257591357824) (April 22, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""DocOwl 1.5 is the state-of-the-art document understanding model by Alibaba with Apache 2.0 license ๐Ÿ˜๐Ÿ“ +Time to dive in and learn more ๐Ÿงถ +""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""This model consists of a ViT-based visual encoder part that takes in crops of image and the original image itself. +Then the outputs of the encoder goes through a convolution based model, after that the outputs are merged with text and then fed to LLM. 
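+
+To make the convolution-based part (called H-Reducer in the paper) a bit more concrete, here is a toy sketch of the idea: merge horizontally adjacent visual features with a 1x4 convolution so that text lines cost fewer tokens. The dimensions below are made up by me, this is not the released code.
+
+```python
+import torch
+import torch.nn as nn
+
+# Toy illustration of an H-Reducer-style layer (not mPLUG-DocOwl's actual implementation)
+patch_features = torch.randn(1, 1024, 32, 32)           # (B, C, H, W) features from the ViT
+h_reducer = nn.Conv2d(1024, 4096, kernel_size=(1, 4), stride=(1, 4))
+reduced = h_reducer(patch_features)                     # (1, 4096, 32, 8): 4x fewer tokens per row
+visual_tokens = reduced.flatten(2).transpose(1, 2)      # (1, 256, 4096), ready to be merged with text
+```
+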
+""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Initially, the authors only train the convolution based part (called H-Reducer) and vision encoder while keeping LLM frozen. +Then for fine-tuning (on image captioning, VQA etc), they freeze vision encoder and train H-Reducer and LLM. +""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Also they use simple linear projection on text and documents. You can see below how they model the text prompts and outputs ๐Ÿค“ +""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""They train the model various downstream tasks including: +- document understanding (DUE benchmark and more) +- table parsing (TURL, PubTabNet) +- chart parsing (PlotQA and more) +- image parsing (OCR-CC) +- text localization (DocVQA and more) +""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +They contribute a new model called DocOwl 1.5-Chat by: +1. creating a new document-chat dataset with questions from document VQA datasets +2. feeding them to ChatGPT to get long answers +3. fine-tune the base model with it (which IMO works very well!) +""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_6.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Resulting generalist model and the chat model are pretty much state-of-the-art ๐Ÿ˜ +Below you can see how it compares to fine-tuned models. +""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_7.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""All the models and the datasets (also some eval datasets on above tasks!) are in this [organization](https://t.co/sJdTw1jWTR). +The [Space](https://t.co/57E9DbNZXf). +""") +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_8.jpeg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding](https://arxiv.org/abs/2403.12895) +by Anwen Hu, Haiyang Xu, Jiabo Ye, Ming Yan, Liang Zhang, Bo Zhang, Chen Li, Ji Zhang, Qin Jin, Fei Huang, Jingren Zhou (2024) +[GitHub](https://github.com/X-PLUG/mPLUG-DocOwl)""", icon="๐Ÿ“š") + + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Grounding DINO") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("PLLaVA") \ No newline at end of file diff --git a/pages/14_PLLaVA.py b/pages/14_PLLaVA.py new file mode 100644 index 0000000000000000000000000000000000000000..d872a77b7c06ce1393c0d039ed62fb772a84621d --- /dev/null +++ b/pages/14_PLLaVA.py @@ -0,0 +1,65 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("PLLaVA") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1786336055425138939) (May 3, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Parameter-free LLaVA for video captioning works like magic! ๐Ÿคฉ Let's take a look! 
+""") +st.markdown(""" """) + +st.image("pages/PLLaVA/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Most of the video captioning models work by downsampling video frames to reduce computational complexity and memory requirements without losing a lot of information in the process. +PLLaVA on the other hand, uses pooling! ๐Ÿคฉ + +How? ๐Ÿง +It takes in frames of video, passed to ViT and then projection layer, and then output goes through average pooling where input shape is (# frames, width, height, text decoder input dim) ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/PLLaVA/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Pooling operation surprisingly reduces the loss of spatial and temporal information. See below some examples on how it can capture the details ๐Ÿค— +""") +st.markdown(""" """) + +st.image("pages/PLLaVA/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""According to authors' findings, it performs way better than many of the existing models (including proprietary VLMs) and scales very well (on text decoder). +""") +st.markdown(""" """) + +st.image("pages/PLLaVA/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Model repositories ๐Ÿค— [7B](https://t.co/AeSdYsz1U7), [13B](https://t.co/GnI1niTxO7), [34B](https://t.co/HWAM0ZzvDc) +Spaces๐Ÿค— [7B](https://t.co/Oms2OLkf7O), [13B](https://t.co/C2RNVNA4uR) +""") +st.markdown(""" """) + +st.info(""" +Ressources: +[PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994) +by Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024) +[GitHub](https://github.com/magic-research/PLLaVA)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("DocOwl 1.5") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("CuMo") \ No newline at end of file diff --git a/pages/15_CuMo.py b/pages/15_CuMo.py new file mode 100644 index 0000000000000000000000000000000000000000..70c10bee7062ef6d947c445df681a0669c30427a --- /dev/null +++ b/pages/15_CuMo.py @@ -0,0 +1,61 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("CuMo") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1790665706205307191) (May 15, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown(""" +It's raining vision language models โ˜”๏ธ +CuMo is a new vision language model that has MoE in every step of the VLM (image encoder, MLP and text decoder) and uses Mistral-7B for the decoder part ๐Ÿค“ +""") +st.markdown(""" """) + +st.image("pages/CuMo/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The authors firstly did pre-training of MLP with the by freezing the image encoder and text decoder, then they warmup the whole network by unfreezing and finetuning which they state to stabilize the visual instruction tuning when bringing in the experts. 
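+
+As a mental model of that freeze/unfreeze schedule, here is a tiny sketch (the module names and shapes are placeholders I made up, not CuMo's code):
+
+```python
+import torch.nn as nn
+
+# Placeholders standing in for the real components (illustration only)
+vision_encoder = nn.Identity()
+mlp_connector = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 4096))
+llm = nn.Identity()
+
+def set_trainable(module, trainable):
+    for p in module.parameters():
+        p.requires_grad = trainable
+
+# Stage 1 (pre-training): only the MLP connector learns
+set_trainable(vision_encoder, False)
+set_trainable(llm, False)
+set_trainable(mlp_connector, True)
+
+# Stage 2 (pre-finetuning warm-up): everything unfrozen before the MoE experts come in
+for module in (vision_encoder, mlp_connector, llm):
+    set_trainable(module, True)
+```
+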
+""") +st.markdown(""" """) + +st.image("pages/CuMo/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The mixture of experts MLP blocks above are simply the same MLP blocks initialized from the single MLP that was trained during pre-training and fine-tuned in pre-finetuning ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/CuMo/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +It works very well (also tested myself) that it outperforms the previous SOTA of it's size LLaVA-NeXT! ๐Ÿ˜ +I wonder how it would compare to IDEFICS2-8B You can try it yourself [here](https://t.co/MLIYKVh5Ee). +""", unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/CuMo/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts](https://arxiv.org/abs/2405.05949) +by Jiachen Li, Xinyao Wang, Sijie Zhu, Chia-Wen Kuo, Lu Xu, Fan Chen, Jitesh Jain, Humphrey Shi, Longyin Wen (2024) +[GitHub](https://github.com/SHI-Labs/CuMo)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("PLLaVA") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("DenseConnector") \ No newline at end of file diff --git a/pages/16_DenseConnector.py b/pages/16_DenseConnector.py new file mode 100644 index 0000000000000000000000000000000000000000..258505464d24915003519fee2e59c4dc1275f2be --- /dev/null +++ b/pages/16_DenseConnector.py @@ -0,0 +1,69 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("DenseConnector") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1796089181988352216) (May 30, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Do we fully leverage image encoders in vision language models? ๐Ÿ‘€ +A new paper built a dense connector that does it better! Let's dig in ๐Ÿงถ +""") +st.markdown(""" """) + +st.image("pages/DenseConnector/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +VLMs consist of an image encoder block, a projection layer that projects image embeddings to text embedding space and then a text decoder sequentially connected ๐Ÿ“– +This [paper](https://t.co/DPQzbj0eWm) explores using intermediate states of image encoder and not a single output ๐Ÿคฉ +""") +st.markdown(""" """) + +st.image("pages/DenseConnector/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The authors explore three different ways of instantiating dense connector: sparse token integration, sparse channel integration and dense channel integration (each of them just take intermediate outputs and put them together in different ways, see below). +""") +st.markdown(""" """) + +st.image("pages/DenseConnector/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +They explore all three of them integrated to LLaVA 1.5 and found out each of the new models are superior to the original LLaVA 1.5. 
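+
+Here is a rough sketch of what the dense channel integration variant boils down to (the layer indices and dimensions are toy values I picked, not the paper's):
+
+```python
+import torch
+import torch.nn as nn
+
+# Fake intermediate outputs of a 24-layer ViT: each is (batch, num_patches, dim)
+batch, num_patches, dim = 1, 576, 1024
+hidden_states = [torch.randn(batch, num_patches, dim) for _ in range(24)]
+
+# Dense channel integration: concatenate a few tapped layers along the feature dimension
+selected = [hidden_states[i] for i in (7, 15, 23)]   # which layers to tap is a design choice
+dense_features = torch.cat(selected, dim=-1)         # (1, 576, 3 * 1024)
+
+# ...then the usual projector maps them into the LLM's embedding space
+projector = nn.Sequential(nn.Linear(3 * dim, 4096), nn.GELU(), nn.Linear(4096, 4096))
+visual_tokens = projector(dense_features)            # (1, 576, 4096)
+```
+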
+""") +st.markdown(""" """) + +st.image("pages/DenseConnector/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +I tried the [model](https://huggingface.co/spaces/HuanjinYao/DenseConnector-v1.5-8B) and it seems to work very well ๐Ÿฅน +The authors have released various [checkpoints](https://t.co/iF8zM2qvDa) based on different decoders (Vicuna 7/13B and Llama 3-8B). +""") +st.markdown(""" """) + +st.image("pages/DenseConnector/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[Dense Connector for MLLMs](https://arxiv.org/abs/2405.13800) +by Huanjin Yao, Wenhao Wu, Taojiannan Yang, YuXin Song, Mengxi Zhang, Haocheng Feng, Yifan Sun, Zhiheng Li, Wanli Ouyang, Jingdong Wang (2024) +[GitHub](https://github.com/HJYao00/DenseConnector)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("CuMo") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Depth Anything v2") \ No newline at end of file diff --git a/pages/17_Depth_Anything_V2.py b/pages/17_Depth_Anything_V2.py new file mode 100644 index 0000000000000000000000000000000000000000..0a4fa1698c33af8c0343e37f4821ba4f0b36f8e2 --- /dev/null +++ b/pages/17_Depth_Anything_V2.py @@ -0,0 +1,74 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Depth Anything V2") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1803063120354492658) (June 18, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown(""" +I love Depth Anything V2 ๐Ÿ˜ +Itโ€™s Depth Anything, but scaled with both larger teacher model and a gigantic dataset! Letโ€™s unpack ๐Ÿค“๐Ÿงถ! +""", unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The authors have analyzed Marigold, a diffusion based model against Depth Anything and found out whatโ€™s up with using synthetic images vs real images for MDE: +๐Ÿ”– Real data has a lot of label noise, inaccurate depth maps (caused by depth sensors missing transparent objects etc) +๐Ÿ”– Synthetic data have more precise and detailed depth labels and they are truly ground-truth, but thereโ€™s a distribution shift between real and synthetic images, and they have restricted scene coverage +""") +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The authors train different image encoders only on synthetic images and find out unless the encoder is very large the model canโ€™t generalize well (but large models generalize inherently anyway) ๐Ÿง +But they still fail encountering real images that have wide distribution in labels ๐Ÿฅฒ +""") +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Depth Anything v2 framework is to... +๐Ÿฆ– Train a teacher model based on DINOv2-G based on 595K synthetic images +๐Ÿท๏ธ Label 62M real images using teacher model +๐Ÿฆ• Train a student model using the real images labelled by teacher +Result: 10x faster and more accurate than Marigold! 
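+
+Trying the released student models is basically a one-liner with the depth-estimation pipeline (the checkpoint name below is the small variant as I remember it from the collection, swap in any size):
+
+```python
+import requests
+from PIL import Image
+from transformers import pipeline
+
+pipe = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+depth = pipe(image)["depth"]  # a PIL image containing the predicted depth map
+depth.save("depth.png")
+```
+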
+""") +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The authors also construct a new benchmark called DA-2K that is less noisy, highly detailed and more diverse! +I have created a [collection](https://t.co/3fAB9b2sxi) that has the models, the dataset, the demo and CoreML converted model ๐Ÿ˜š +""") +st.markdown(""" """) + +st.info(""" +Ressources: +[Depth Anything V2](https://arxiv.org/abs/2406.09414) +by Lihe Yang, Bingyi Kang, Zilong Huang, Zhen Zhao, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) +[GitHub](https://github.com/DepthAnything/Depth-Anything-V2) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything_v2)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("DenseConnector") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Florence-2") \ No newline at end of file diff --git a/pages/18_Florence-2.py b/pages/18_Florence-2.py new file mode 100644 index 0000000000000000000000000000000000000000..b702a84eb3f7661fc3e90861103ea41473b06249 --- /dev/null +++ b/pages/18_Florence-2.py @@ -0,0 +1,78 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Florence-2") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1803769866878623819) (June 20, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Florence-2 is a new vision foundation model by Microsoft capable of a wide variety of tasks ๐Ÿคฏ +Let's unpack! ๐Ÿงถ +""") +st.markdown(""" """) + +st.image("pages/Florence-2/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +This model is can handle tasks that vary from document understanding to semantic segmentation ๐Ÿคฉ +[Demo](https://t.co/7YJZvjhw84) | [Collection](https://t.co/Ub7FGazDz1) +""") +st.markdown(""" """) + +st.image("pages/Florence-2/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The difference from previous models is that the authors have compiled a dataset that consists of 126M images with 5.4B annotations labelled with their own data engine โ†“โ†“ +""") +st.markdown(""" """) + +st.image("pages/Florence-2/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The dataset also offers more variety in annotations compared to other datasets, it has region level and image level annotations with more variety in semantic granularity as well! +""") +st.markdown(""" """) + +st.image("pages/Florence-2/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The model is a similar architecture to previous models, an image encoder, a multimodality encoder with text decoder. 
+The authors have compiled the multitask dataset with prompts for each task which makes the model trainable on multiple tasks ๐Ÿค— +""") +st.markdown(""" """) + +st.image("pages/Florence-2/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +You also fine-tune this model on any task of choice, the authors also released different results on downstream tasks and report their results when un/freezing vision encoder ๐Ÿค“๐Ÿ“‰ +They have released fine-tuned models too, you can find them in the collection above ๐Ÿค— +""") +st.markdown(""" """) + +st.image("pages/Florence-2/image_6.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242) +by Bin Xiao, Haiping Wu, Weijian Xu, Xiyang Dai, Houdong Hu, Yumao Lu, Michael Zeng, Ce Liu, Lu Yuan (2023) +[Hugging Face blog post](https://huggingface.co/blog/finetune-florence2)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Depth Anything V2") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("4M-21") \ No newline at end of file diff --git a/pages/19_4M-21.py b/pages/19_4M-21.py new file mode 100644 index 0000000000000000000000000000000000000000..59ff57899ce8c0eef3dfd2a18c8ae6fa31b20e0f --- /dev/null +++ b/pages/19_4M-21.py @@ -0,0 +1,70 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("4M-21") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1804138208814309626) (June 21, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown(""" +EPFL and Apple just released 4M-21: single any-to-any model that can do anything from text-to-image generation to generating depth masks! ๐Ÿ™€ +Let's unpack ๐Ÿงถ +""") +st.markdown(""" """) + +st.image("pages/4M-21/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""4M is a multimodal training [framework](https://t.co/jztLublfSF) introduced by Apple and EPFL. +Resulting model takes image and text and output image and text ๐Ÿคฉ +[Models](https://t.co/1LC0rAohEl) | [Demo](https://t.co/Ra9qbKcWeY) +""") +st.markdown(""" """) + +st.video("pages/4M-21/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(""" +This model consists of transformer encoder and decoder, where the key to multimodality lies in input and output data: +input and output tokens are decoded to generate bounding boxes, generated image's pixels, captions and more! +""") +st.markdown(""" """) + +st.image("pages/4M-21/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +This model also learnt to generate canny maps, SAM edges and other things for steerable text-to-image generation ๐Ÿ–ผ๏ธ +The authors only added image-to-all capabilities for the demo, but you can try to use this model for text-to-image generation as well โ˜บ๏ธ +""") +st.markdown(""" """) + +st.image("pages/4M-21/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +In the project page you can also see the model's text-to-image and steered generation capabilities with model's own outputs as control masks! 
+""") +st.markdown(""" """) + +st.video("pages/4M-21/video_2.mp4", format="video/mp4") +st.markdown(""" """) + +st.info(""" +Ressources +[4M-21: An Any-to-Any Vision Model for Tens of Tasks and Modalities](https://arxiv.org/abs/2406.09406) by Roman Bachmann, OฤŸuzhan Fatih Kar, David Mizrahi, Ali Garjani, Mingfei Gao, David Griffiths, Jiaming Hu, Afshin Dehghan, Amir Zamir (2024) +[GitHub](https://github.com/apple/ml-4m/)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Florence-2") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("RT-DETR") \ No newline at end of file diff --git a/pages/1_MobileSAM.py b/pages/1_MobileSAM.py new file mode 100644 index 0000000000000000000000000000000000000000..80e21e5e4cc13211869262ef9775baf217b32187 --- /dev/null +++ b/pages/1_MobileSAM.py @@ -0,0 +1,79 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("MobileSAM") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1738959605542076863) (December 24, 2023)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Read the MobileSAM paper this weekend ๐Ÿ“– Sharing some insights! +The idea ๐Ÿ’ก: SAM model consist of three parts, a heavy image encoder, a prompt encoder (prompt can be text, bounding box, mask or point) and a mask decoder. + +To make the SAM model smaller without compromising from the performance, the authors looked into three types of distillation. +First one is distilling the decoder outputs directly (a more naive approach) with a completely randomly initialized small ViT and randomly initialized mask decoder. +However, when the ViT and the decoder are both in a bad state, this doesn't work well. +""") +st.markdown(""" """) + +st.image("pages/MobileSAM/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The second type of distillation is called semi-coupled, where the authors only randomly initialized the ViT image encoder and kept the mask decoder. +This is called semi-coupled because the image encoder distillation still depends on the mask decoder (see below ๐Ÿ‘‡) +""") +st.markdown(""" """) + +st.image("pages/MobileSAM/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The last type of distillation, [decoupled distillation](https://openaccess.thecvf.com/content/CVPR2022/papers/Zhao_Decoupled_Knowledge_Distillation_CVPR_2022_paper.pdf), is the most intuitive IMO. +The authors have "decoupled" image encoder altogether and have frozen the mask decoder and didn't really distill based on generated masks. +This makes sense as the bottleneck here is the encoder itself and most of the time, distillation works well with encoding. +""") +st.markdown(""" """) + +st.image("pages/MobileSAM/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Finally, they found out that decoupled distillation performs better than coupled distillation by means of mean IoU and requires much less compute! 
โ™ฅ๏ธ +""") +st.markdown(""" """) + +st.image("pages/MobileSAM/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Wanted to leave some links here if you'd like to try yourself ๐Ÿ‘‡ +- MobileSAM [demo](https://huggingface.co/spaces/dhkim2810/MobileSAMMobileSAM) +- Model [repository](https://huggingface.co/dhkim2810/MobileSAM) + +If you'd like to experiment around TinyViT, [timm library](https://huggingface.co/docs/timm/index) ([Ross Wightman](https://x.com/wightmanr)) has a bunch of [checkpoints available](https://huggingface.co/models?sort=trending&search=timm%2Ftinyvit). +""") +st.markdown(""" """) + +st.image("pages/MobileSAM/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + + +st.info(""" +Ressources: +[Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289) +by Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, Choong Seon Hong (2023) +[GitHub](https://github.com/ChaoningZhang/MobileSAM)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Home") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("OneFormer") \ No newline at end of file diff --git a/pages/20_RT-DETR.py b/pages/20_RT-DETR.py new file mode 100644 index 0000000000000000000000000000000000000000..9226f4b4b371754c867c0a624d24bc8d5b4c93a0 --- /dev/null +++ b/pages/20_RT-DETR.py @@ -0,0 +1,67 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("RT-DETR") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1807790959884665029) (July 1, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Real-time DEtection Transformer (RT-DETR) landed in ๐Ÿค— Transformers with Apache 2.0 license ๐Ÿ˜ +Do DETRs Beat YOLOs on Real-time Object Detection? Keep reading ๐Ÿ‘€ +""") +st.markdown(""" """) + +st.video("pages/RT-DETR/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(""" +Short answer, it does! ๐Ÿ“– [notebook](https://t.co/NNRpG9cAEa), ๐Ÿ”– [models](https://t.co/ctwWQqNcEt), ๐Ÿ”– [demo](https://t.co/VrmDDDjoNw) + +YOLO models are known to be super fast for real-time computer vision, but they have a downside with being volatile to NMS ๐Ÿฅฒ +Transformer-based models on the other hand are computationally not as efficient ๐Ÿฅฒ +Isn't there something in between? Enter RT-DETR! + +The authors combined CNN backbone, multi-stage hybrid decoder (combining convs and attn) with a transformer decoder โ‡“ +""") +st.markdown(""" """) + +st.image("pages/RT-DETR/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +In the paper, authors also claim one can adjust speed by changing decoder layers without retraining altogether. +They also conduct many ablation studies and try different decoders. 
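+
+Here is what basic inference looks like with the Transformers integration (a minimal sketch; the checkpoint name is the base r50vd release as I remember it):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import RTDetrImageProcessor, RTDetrForObjectDetection
+
+checkpoint = "PekingU/rtdetr_r50vd"
+image_processor = RTDetrImageProcessor.from_pretrained(checkpoint)
+model = RTDetrForObjectDetection.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = image_processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+results = image_processor.post_process_object_detection(
+    outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3
+)
+for score, label_id, box in zip(results[0]["scores"], results[0]["labels"], results[0]["boxes"]):
+    print(model.config.id2label[label_id.item()], round(score.item(), 2), box.tolist())
+```
+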
+""") +st.markdown(""" """) + +st.image("pages/RT-DETR/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The authors find out that the model performs better in terms of speed and accuracy compared to the previous state-of-the-art ๐Ÿคฉ +""") +st.markdown(""" """) + +st.image("pages/RT-DETR/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069) +by Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen (2023) +[GitHub](https://github.com/lyuwenyu/RT-DETR/) +[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/rt_detr)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("4M-21") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Llava-NeXT-Interleave") \ No newline at end of file diff --git a/pages/21_Llava-NeXT-Interleave.py b/pages/21_Llava-NeXT-Interleave.py new file mode 100644 index 0000000000000000000000000000000000000000..39ec59ce3607368df17b2bf14ec0ecf8cdec221b --- /dev/null +++ b/pages/21_Llava-NeXT-Interleave.py @@ -0,0 +1,86 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Llava-NeXT-Interleave") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1813560292397203630) (July 17, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""The vision language model in this video is 0.5B and can take in image, video and 3D! ๐Ÿคฏ +Llava-NeXT-Interleave is a new vision language model trained on interleaved image, video and 3D data keep reading โฅฅโฅฅ +""") +st.markdown(""" """) + +st.video("pages/Llava-NeXT-Interleave/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown("""This model comes with 0.5B, 7B and 7B-DPO variants, all can be used with Transformers ๐Ÿ˜ +[Collection of models](https://t.co/sZsaglSXa3) | [Demo](https://t.co/FbpaMWJY8k) +See how to use below ๐Ÿ‘‡๐Ÿป +""") +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Authors of this paper have explored training LLaVA-NeXT on interleaved data where the data consists of multiple modalities, including image(s), video, 3D ๐Ÿ“š +They have discovered that interleaved data increases results across all benchmarks! +""", unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The model can do task transfer from single image tasks to multiple images ๐Ÿคฏ +The authors have trained the model on single images and code yet the model can solve coding with multiple images. +""") +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Same applies to other modalities, see below for video: +""") +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The model also has document understanding capabilities and many real-world application areas. 
+""") +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +This release also comes with the dataset this model was fine-tuned on ๐Ÿ“– [M4-Instruct-Data](https://t.co/rutXMtNC0I) +""") +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_6.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/) +by Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, Chunyuan Li (2024) +[GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("RT-DETR") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Chameleon") \ No newline at end of file diff --git a/pages/22_Chameleon.py b/pages/22_Chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..e6cc368034b2714c4270593275792af0ac5347ca --- /dev/null +++ b/pages/22_Chameleon.py @@ -0,0 +1,88 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Chameleon") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1814278511785312320) (July 19, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Chameleon ๐ŸฆŽ by Meta is now available in ๐Ÿค— Transformers. +A multimodal model that comes in 7B and 34B sizes ๐Ÿคฉ +But what makes this model so special? Keep reading โ‡ฃ +""") +st.markdown(""" """) + +st.video("pages/Chameleon/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(""" +[Demo](https://t.co/GsGE17fSdI) | [Models](https://t.co/cWUiVbsRz6) +Find below the API to load this model locally use it โฌ‡๏ธ +""") +st.markdown(""" """) + +st.image("pages/Chameleon/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Chameleon is a unique model: it attempts to scale early fusion ๐Ÿคจ +But what is early fusion? +Modern vision language models use a vision encoder with a projection layer to project image embeddings so it can be promptable to text decoder.""") +st.markdown(""" """) + +st.image("pages/Chameleon/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Early fusion on the other hand attempts to fuse all features together (image patches and text) by using an image tokenizer and all tokens are projected into a shared space, which enables seamless generation ๐Ÿ˜ +""") +st.markdown(""" """) + +st.image("pages/Chameleon/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Authors have also introduced different architectural improvements (QK norm and revise placement of layer norms) for scalable and stable training. +This way they were able to increase the token count (5x tokens compared to Llama 3 which is a must with early-fusion IMO) . 
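+
+By the way, here is roughly what the "load it locally" API mentioned above looks like (a sketch from memory; double-check the prompt format and the `<image>` placeholder against the model card):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
+
+model_id = "facebook/chameleon-7b"
+processor = ChameleonProcessor.from_pretrained(model_id)
+model = ChameleonForConditionalGeneration.from_pretrained(
+    model_id, torch_dtype=torch.bfloat16, device_map="auto"
+)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+prompt = "What do you see in this image?<image>"
+
+inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+out = model.generate(**inputs, max_new_tokens=50)
+print(processor.decode(out[0], skip_special_tokens=True))
+```
+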
+""") +st.markdown(""" """) + +st.image("pages/Chameleon/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +This model is an any-to-any model thanks to early fusion: it can take image and text input and output image and text, but image generation are disabled to prevent malicious use. +""") +st.markdown(""" """) + +st.image("pages/Chameleon/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +One can also do text-only prompting, authors noted the model catches up with larger LLMs, and you can also see how it compares to VLMs with image-text prompting. +""") +st.markdown(""" """) + +st.image("pages/Chameleon/image_6.jpg", use_column_width=True) +st.image("pages/Chameleon/image_6.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818) +by Chameleon Team (2024) +[GitHub](https://github.com/facebookresearch/chameleon) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Llava-NeXT-Interleave") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Video-LLaVA") \ No newline at end of file diff --git a/pages/23_Video-LLaVA.py b/pages/23_Video-LLaVA.py new file mode 100644 index 0000000000000000000000000000000000000000..3434308320c15ecd1f40507cbc6f8edc7e95fb0a --- /dev/null +++ b/pages/23_Video-LLaVA.py @@ -0,0 +1,70 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Video-LLaVA") + +st.success("""[Original tweet](https://x.com/mervenoyann/status/1816427325073842539) (July 25, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""We have recently merged Video-LLaVA to ๐Ÿค— Transformers! ๐ŸŽž๏ธ +What makes this model different? Keep reading โ‡Š +""") +st.markdown(""" """) + +st.video("pages/Video-LLaVA/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown("""[Demo](https://t.co/MVP14uEj9e) | [Model](https://t.co/oqSCMUqwJo) +See below how to initialize the model and processor and infer โฌ‡๏ธ +""") +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Compared to other models that take image and video input and either project them separately or downsampling video and projecting selected frames, Video-LLaVA is converting images and videos to unified representation and project them using a shared projection layer. +""") +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +It uses Vicuna 1.5 as the language model and LanguageBind's own encoders that's based on OpenCLIP, these encoders project the modalities to an unified representation before passing to projection layer. +""") +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +I feel like one of the coolest features of this model is the joint understanding which is also introduced recently with many models. +It's a relatively older model but ahead of it's time and works very well! 
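+
+Here is a rough inference sketch with the Transformers classes (the PyAV frame sampling, prompt template and checkpoint name are from memory, and the video path is a placeholder):
+
+```python
+import av
+import numpy as np
+import torch
+from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
+
+model_id = "LanguageBind/Video-LLaVA-7B-hf"
+processor = VideoLlavaProcessor.from_pretrained(model_id)
+model = VideoLlavaForConditionalGeneration.from_pretrained(
+    model_id, torch_dtype=torch.float16, device_map="auto"
+)
+
+# Uniformly sample 8 frames from a local clip
+container = av.open("my_video.mp4")
+frames = [frame.to_ndarray(format="rgb24") for frame in container.decode(video=0)]
+indices = np.linspace(0, len(frames) - 1, 8).astype(int)
+clip = np.stack([frames[i] for i in indices])
+
+prompt = "USER: <video>What is happening in this video? ASSISTANT:"
+inputs = processor(text=prompt, videos=clip, return_tensors="pt").to(model.device, torch.float16)
+out = model.generate(**inputs, max_new_tokens=60)
+print(processor.batch_decode(out, skip_special_tokens=True)[0])
+```
+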
+""") +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122) +by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan (2023) +[GitHub](https://github.com/PKU-YuanGroup/Video-LLaVA) +[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/video_llava) +""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Chameleon") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("SAMv2") \ No newline at end of file diff --git a/pages/24_SAMv2.py b/pages/24_SAMv2.py new file mode 100644 index 0000000000000000000000000000000000000000..3a26086dddde5749508ae1cd44083ea509b34310 --- /dev/null +++ b/pages/24_SAMv2.py @@ -0,0 +1,88 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("SAMv2") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1818675981634109701) (July 31, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""SAMv2 is just mindblowingly good ๐Ÿ˜ +Learn what makes this model so good at video segmentation, keep reading ๐Ÿฆ†โ‡“ +""") +st.markdown(""" """) + +col1, col2, col3 = st.columns(3) +with col2: + st.video("pages/SAMv2/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(""" +Check out the [demo](https://t.co/35ixEZgPaf) by [skalskip92](https://x.com/skalskip92) to see how to use the model locally. +Check out Meta's [demo](https://t.co/Bcbli9Cfim) where you can edit segmented instances too! + +Segment Anything Model by Meta was released as a universal segmentation model in which you could prompt a box or point prompt to segment the object of interest +SAM consists of an image encoder to encode images, a prompt encoder to encode prompts, then outputs of these two are given to a mask decoder to generate masks. +""") +st.markdown(""" """) + +st.image("pages/SAMv2/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +However SAM doesn't naturally track object instances in videos, one needs to make sure to prompt the same mask or point prompt for that instance in each frame and feed each frame, which is infeasible ๐Ÿ˜” +But don't fret, that is where SAMv2 comes in with a memory module! + +SAMv2 defines a new task called "masklet prediction" here masklet refers to the same mask instance throughout the frames ๐ŸŽž๏ธ +Unlike SAM, SAM 2 decoder is not fed the image embedding directly from an image encoder, but attention of memories of prompted frames and object pointers. +""") +st.markdown(""" """) + +st.image("pages/SAMv2/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +๐Ÿ–ผ๏ธ These "memories" are essentially past predictions of object of interest up to a number of recent frames, +and are in form of feature maps of location info (spatial feature maps). +๐Ÿ‘‰๐Ÿป The object pointers are high level semantic information of the object of interest based on. 
+ +Just like SAM paper SAMv2 depends on a data engine, and the dataset it generated comes with the release: SA-V ๐Ÿคฏ +This dataset is gigantic, it has 190.9K manual masklet annotations and 451.7K automatic masklets! +""") +st.markdown(""" """) + +st.image("pages/SAMv2/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Initially they apply SAM to each frame to assist human annotators to annotate a video at six FPS for high quality data, +in the second phase they add SAM and SAM2 to generate masklets across time consistently. Finally they use SAM2 to refine the masklets. + +They have evaluated this model on J&F score (Jaccard Index + F-measure for contour acc) which is used to evaluate zero-shot +video segmentation benchmarks. +SAMv2 seems to outperform two previously sota models that are built on top of SAM! ๐Ÿฅน +""") +st.markdown(""" """) + +st.image("pages/SAMv2/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[SAM 2: Segment Anything in Images and Videos]() +by Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman Rรคdle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Dollรกr, Christoph Feichtenhofer (2024) +[GitHub](https://github.com/facebookresearch/segment-anything-2) +[Hugging Face documentation]()""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Video-LLaVA") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Home") \ No newline at end of file diff --git a/pages/2_Oneformer.py b/pages/2_Oneformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c6b2996b0e8cec6324ee7313ab998633e47a931f --- /dev/null +++ b/pages/2_Oneformer.py @@ -0,0 +1,62 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("OneFormer") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1739707076501221608) (December 26, 2023)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown(""" +OneFormer: one model to segment them all? ๐Ÿคฏ +I was looking into paperswithcode leaderboards when I came across OneFormer for the first time so it was time to dig in! +""") +st.markdown(""" """) + +st.image("pages/OneFormer/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""OneFormer is a "truly universal" model for semantic, instance and panoptic segmentation tasks โš”๏ธ +What makes is truly universal is that it's a single model that is trained only once and can be used across all tasks ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/OneFormer/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The enabler here is the text conditioning, i.e. the model is given a text query that states task type along with the appropriate input, and using contrastive loss, the model learns the difference between different task types ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/OneFormer/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Thanks to ๐Ÿค— Transformers, you can easily use the model! 
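+
+For example, semantic mode looks roughly like this (a minimal sketch with the ADE20k Swin-tiny checkpoint; swap `task_inputs` to "instance" or "panoptic" to switch tasks):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation
+
+checkpoint = "shi-labs/oneformer_ade20k_swin_tiny"
+processor = OneFormerProcessor.from_pretrained(checkpoint)
+model = OneFormerForUniversalSegmentation.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# the task token is what switches the single model between the three tasks
+inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+semantic_map = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+print(semantic_map.shape)  # (height, width) tensor of class ids
+```
+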
+I have drafted a [notebook](https://t.co/cBylk1Uv20) for you to try right away ๐Ÿ˜Š +You can also check out the [Space](https://t.co/31GxlVo1W5) without checking out the code itself. +""") +st.markdown(""" """) + +st.image("pages/OneFormer/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) +by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi (2022) +[GitHub](https://github.com/SHI-Labs/OneFormer) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/oneformer)""", icon="๐Ÿ“š") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("MobileSAM") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("VITMAE") \ No newline at end of file diff --git a/pages/3_VITMAE.py b/pages/3_VITMAE.py new file mode 100644 index 0000000000000000000000000000000000000000..582c71f3a0f4f3e77f71afbd658a3a18dfea9869 --- /dev/null +++ b/pages/3_VITMAE.py @@ -0,0 +1,63 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("VITMAE") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1740688304784183664) (December 29, 2023)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Just read VitMAE paper, sharing some highlights ๐Ÿงถ +ViTMAE is a simply yet effective self-supervised pre-training technique, where authors combined vision transformer with masked autoencoder. +The images are first masked (75 percent of the image!) and then the model tries to learn about the features through trying to reconstruct the original image! +""") +st.markdown(""" """) + +st.image("pages/VITMAE/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""The image is not masked, but rather only the visible patches are fed to the encoder (and that is the only thing encoder sees!). +Next, a mask token is added to where the masked patches are (a bit like BERT, if you will) and the mask tokens and encoded patches are fed to decoder. +The decoder then tries to reconstruct the original image. +""") +st.markdown(""" """) + +st.image("pages/VITMAE/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""As a result, the authors found out that high masking ratio works well in fine-tuning for downstream tasks and linear probing ๐Ÿคฏ๐Ÿคฏ +""") +st.markdown(""" """) + +st.image("pages/VITMAE/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""If you want to try the model or fine-tune, all the pre-trained VITMAE models released released by Meta are available on [Huggingface](https://t.co/didvTL9Zkm). +We've built a [demo](https://t.co/PkuACJiKrB) for you to see the intermediate outputs and reconstruction by VITMAE. + +Also there's a nice [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb) by [@NielsRogge](https://twitter.com/NielsRogge). 
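+
+If you just want to poke at the reconstruction objective in code, a minimal sketch looks like this (the checkpoint name is the base MAE as far as I remember):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, ViTMAEForPreTraining
+
+checkpoint = "facebook/vit-mae-base"
+processor = AutoImageProcessor.from_pretrained(checkpoint)
+model = ViTMAEForPreTraining.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+print(outputs.loss)          # reconstruction loss on the masked patches
+print(outputs.logits.shape)  # per-patch pixel predictions
+print(outputs.mask.shape)    # which patches were masked (1) vs. kept (0)
+```
+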
+
+""")
+st.markdown(""" """)
+
+st.image("pages/VITMAE/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v3)
+by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollรกr, Ross Girshick (2021)
+[GitHub](https://github.com/facebookresearch/mae)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/vit_mae)""", icon="๐Ÿ“š")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+    if st.button('Previous paper', use_container_width=True):
+        switch_page("OneFormer")
+with col2:
+    if st.button('Home', use_container_width=True):
+        switch_page("Home")
+with col3:
+    if st.button('Next paper', use_container_width=True):
+        switch_page("DINOV2")
\ No newline at end of file
diff --git a/pages/4M-21/4M-21.md b/pages/4M-21/4M-21.md
new file mode 100644
index 0000000000000000000000000000000000000000..95a6a6a980ec5976f7da454b4f5cc3c2485bbb11
--- /dev/null
+++ b/pages/4M-21/4M-21.md
@@ -0,0 +1,32 @@
+EPFL and Apple just released 4M-21: a single any-to-any model that can do anything from text-to-image generation to generating depth masks! ๐Ÿ™€ Let's unpack ๐Ÿงถ
+
+![image_1](image_1.jpg)
+
+4M is a multimodal training [framework](https://t.co/jztLublfSF) introduced by Apple and EPFL.
+The resulting model takes image and text and outputs image and text ๐Ÿคฉ
+[Models](https://t.co/1LC0rAohEl) | [Demo](https://t.co/Ra9qbKcWeY)
+
+![video_1](video_1.mp4)
+
+This model consists of a transformer encoder and decoder, where the key to multimodality lies in the input and output data: input and output tokens are decoded to generate bounding boxes, the generated image's pixels, captions and more!
+
+![image_2](image_2.jpg)
+
+This model also learnt to generate Canny maps, SAM edges and other things for steerable text-to-image generation ๐Ÿ–ผ๏ธ
+The authors only added image-to-all capabilities for the demo, but you can try to use this model for text-to-image generation as well โ˜บ๏ธ
+
+![image_3](image_3.jpg)
+
+On the project page you can also see the model's text-to-image and steered generation capabilities with the model's own outputs as control masks!
+ +![video_2](video_2.mp4) + + +> [!TIP] +Ressources: +[4M-21: An Any-to-Any Vision Model for Tens of Tasks and Modalities](https://arxiv.org/abs/2406.09406) +by Roman Bachmann, OฤŸuzhan Fatih Kar, David Mizrahi, Ali Garjani, Mingfei Gao, David Griffiths, Jiaming Hu, Afshin Dehghan, Amir Zamir (2024) +[GitHub](https://github.com/apple/ml-4m/) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1804138208814309626) (June 21, 2024) \ No newline at end of file diff --git a/pages/4M-21/image_1.jpg b/pages/4M-21/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..94763ed7f6873c9377554a8f10eb47dcaa28a4f5 Binary files /dev/null and b/pages/4M-21/image_1.jpg differ diff --git a/pages/4M-21/image_2.jpg b/pages/4M-21/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7f52556f301204c1f823ad6999e0adb9ca734367 Binary files /dev/null and b/pages/4M-21/image_2.jpg differ diff --git a/pages/4M-21/image_3.jpg b/pages/4M-21/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6b45be29fca4a21c65a1f7acbe09c5ed58eb726d Binary files /dev/null and b/pages/4M-21/image_3.jpg differ diff --git a/pages/4M-21/video_1.mp4 b/pages/4M-21/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a2c6c5032811b446e83f1e25ff8998b88081e702 --- /dev/null +++ b/pages/4M-21/video_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd40cb677314a9384da8e644ad3bb9eba3e23a39e776f5ce8c1437ebf3d06d8 +size 1073547 diff --git a/pages/4M-21/video_2.mp4 b/pages/4M-21/video_2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..df457d56eb4360537627e22d0a9b3e3e5a79ce44 Binary files /dev/null and b/pages/4M-21/video_2.mp4 differ diff --git a/pages/4_DINOv2.py b/pages/4_DINOv2.py new file mode 100644 index 0000000000000000000000000000000000000000..2d365f7eac441248d7329eea2a130143aad144cd --- /dev/null +++ b/pages/4_DINOv2.py @@ -0,0 +1,78 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("DINOv2") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1743290724672495827) (January 5, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""DINOv2 is the king for self-supervised learning in images ๐Ÿฆ–๐Ÿฆ• +But how does it work? I've tried to explain how it works but let's expand on it ๐Ÿงถ +""") +st.markdown(""" """) + +st.image("pages/DINOv2/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +DINOv2 is essentially DINO on steroids, so let's talk about DINOv1 first ๐Ÿฆ• +It's essentially a pre-training technique to train ViTs with self-supervision, that uses an unusual way of distillation ๐ŸงŸโ€โ™‚๏ธ๐Ÿ‘จ๐Ÿปโ€๐Ÿซ. +Distillation is a technique where there's a large pre-trained model (teacher), and you have a smaller model (student) initialized randomly. +Then during training the student, you take both models'outputs, calculate divergence between them and then update the loss accordingly. +In this case, we have no labels! And the teacher is not pretrained!!!! ๐Ÿคฏ +Well, the outputs here are the distributions, and teacher is iteratively updated according to student, which is called exponential moving average. 
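+
+As a rough sketch (simplified, not the exact DINO recipe), the teacher update looks like this:
+```python
+import torch
+
+@torch.no_grad()
+def ema_update(teacher, student, momentum=0.996):
+    # teacher parameters drift slowly towards the student's: an exponential moving average
+    for t_param, s_param in zip(teacher.parameters(), student.parameters()):
+        t_param.mul_(momentum).add_(s_param.detach(), alpha=1 - momentum)
+```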
+
+""")
+st.markdown(""" """)
+
+st.image("pages/DINOv2/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+DINO doesn't use any contrastive loss or clustering but only cross-entropy loss (again, what a paper), which leads the model to collapse.
+This can be avoided by normalizing the teacher output multiple times, but the authors instead center (to squish logits) and sharpen (through temperature) the teacher outputs.
+Finally, local and global crops are given to the student and only global crops are given to the teacher, which pushes the student to identify context from small parts of the image.
+""")
+st.markdown(""" """)
+
+st.image("pages/DINOv2/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""How does DINOv2 improve DINO?
+โšก๏ธ More efficient thanks to FSDP and Flash Attention
+๐Ÿฆ– Has a very efficient data augmentation technique that apparently scales to 100M+ images (shown below)
+๐Ÿ‘จ๐Ÿปโ€๐Ÿซ Uses ViT-g instead of training from scratch
+""")
+st.markdown(""" """)
+
+st.image("pages/DINOv2/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The model is so powerful that you can use DINOv2 even with kNN or linear classifiers without needing to fine-tune!
+But if you'd like DINOv2 to work even better, [NielsRogge](https://twitter.com/NielsRogge) has built a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Fine\_tune\_DINOv2\_for\_image\_classification\_%5Bminimal%5D.ipynb) to fine-tune it using `Trainer` ๐Ÿ“–
+He also has a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train\_a\_linear\_classifier\_on\_top\_of\_DINOv2\_for\_semantic\_segmentation.ipynb) if you feel like training a linear classifier only ๐Ÿ“”
+All the different DINO/v2 model checkpoints are [here](https://huggingface.co/models?search=dino).
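+
+If you just want features for a kNN or linear classifier, a minimal sketch with Transformers looks like this (the checkpoint name is just an example):
+```python
+import torch
+from PIL import Image
+import requests
+from transformers import AutoImageProcessor, AutoModel
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
+model = AutoModel.from_pretrained("facebook/dinov2-base")
+
+with torch.no_grad():
+    outputs = model(**processor(images=image, return_tensors="pt"))
+
+# CLS token embedding: a single vector you can feed to a kNN or linear classifier
+features = outputs.last_hidden_state[:, 0]
+```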
+
+Lastly, special thanks to [ykilcher](https://twitter.com/ykilcher) as I couldn't make sense of certain things in the paper and watched his awesome [tutorial](https://youtube.com/watch?v=h3ij3F) ๐Ÿคฉ
+""")
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+by Maxime Oquab, Timothรฉe Darcet, Thรฉo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervรฉ Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski (2023)
+[GitHub](https://github.com/facebookresearch/dinov2)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/dinov2)""", icon="๐Ÿ“š")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+    if st.button('Previous paper', use_container_width=True):
+        switch_page("VITMAE")
+with col2:
+    if st.button('Home', use_container_width=True):
+        switch_page("Home")
+with col3:
+    if st.button('Next paper', use_container_width=True):
+        switch_page("SigLIP")
\ No newline at end of file
diff --git a/pages/5_SigLIP.py b/pages/5_SigLIP.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cba9330d032284b95a3b20e3585f1cc60c8a8f0
--- /dev/null
+++ b/pages/5_SigLIP.py
@@ -0,0 +1,78 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("SigLIP")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1745476609686089800) (January 11, 2024)""", icon="โ„น๏ธ")
+st.markdown(""" """)
+
+st.markdown("""SigLIP just got merged to ๐Ÿค— Transformers and it's super easy to use!
+To celebrate this, I have created a repository of various SigLIP-based projects!
+But what is it and how does it work?
+SigLIP is a vision-text pre-training technique based on contrastive learning. It jointly trains an image encoder and a text encoder such that the dot product of their embeddings is highest for matching text-image pairs.
+The image below is taken from CLIP, where this contrastive pre-training takes place with softmax, but SigLIP replaces softmax with sigmoid. ๐Ÿ“Ž
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Highlights โœจ
+๐Ÿ–ผ๏ธ๐Ÿ“ The authors used a medium-sized B/16 ViT for the image encoder and a B-sized transformer for the text encoder
+๐Ÿ˜ More performant than CLIP on zero-shot
+๐Ÿ—ฃ๏ธ The authors trained a multilingual model too!
+โšก๏ธ Super efficient: the sigmoid loss enables up to 1M items per batch, but the authors chose 32k (see performance saturation below)
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Below you can find prior CLIP models and SigLIP across different image encoder sizes, and their performance on different datasets ๐Ÿ‘‡๐Ÿป
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+With the ๐Ÿค— Transformers integration comes the zero-shot-image-classification pipeline, which makes SigLIP super easy to use!
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+What to use SigLIP for?
๐Ÿง +Honestly the possibilities are endless, but you can use it for image/text retrieval, zero-shot classification, training multimodal models! +I have made a repository with notebooks and applications that are also hosted on [Spaces](https://t.co/Ah1CrHVuPY). +I have built ["Draw to Search Art"](https://t.co/DcmQWMc1qd) where you can input image (upload one or draw) and search among 10k images in wikiart! +I've also built apps to [compare](https://t.co/m699TMvuW9) CLIP and SigLIP outputs. +""") +st.markdown(""" """) + +st.image("pages/SigLIP/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) +by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer (2023) +[GitHub](https://github.com/google-research/big_vision) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/siglip)""", icon="๐Ÿ“š") +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("DINOv2") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("OWLv2") \ No newline at end of file diff --git a/pages/6_OWLv2.py b/pages/6_OWLv2.py new file mode 100644 index 0000000000000000000000000000000000000000..3fb749dacc41e7dd5213e1daf77dd2d64338703d --- /dev/null +++ b/pages/6_OWLv2.py @@ -0,0 +1,87 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("OWLv2") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1748411972675150040) (January 19, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Explaining the ๐Ÿ‘‘ of zero-shot open-vocabulary object detection: OWLv2 ๐Ÿฆ‰๐Ÿงถ""") +st.markdown(""" """) + +st.image("pages/OWLv2/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +OWLv2 is scaled version of a model called OWL-ViT, so let's take a look at that first ๐Ÿ“ +OWLViT is an open vocabulary object detector, meaning, it can detect objects it didn't explicitly see during the training ๐Ÿ‘€ +What's cool is that it can take both image and text queries! This is thanks to how the image and text features aren't fused together. +""") +st.markdown(""" """) + +st.image("pages/OWLv2/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Taking a look at the architecture, the authors firstly do contrastive pre-training of a vision and a text encoder (just like CLIP). +They take that model, remove the final pooling layer and attach a lightweight classification and box detection head and fine-tune. +""") +st.markdown(""" """) + +st.image("pages/OWLv2/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""During fine-tuning for object detection, they calculate the loss over bipartite matches. +Simply put, loss is calculated over the predicted objects against ground truth objects and the goal is to find a perfect match of these two sets where each object is matched to one object in ground truth. + +OWL-ViT is very scalable. +One can easily scale most language models or vision-language models because they require no supervision, but this isn't the case for object detection: you still need supervision. +Moreover, only scaling the encoders creates a bottleneck after a while. 
+""") +st.markdown(""" """) + +st.image("pages/OWLv2/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +The authors wanted to scale OWL-ViT with more data, so they used OWL-ViT for labelling to train a better detector, "self-train" a new detector on the labels, and fine-tune the model on human-annotated data. +""") +st.markdown(""" """) + +st.image("pages/OWLv2/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Thanks to this, OWLv2 scaled very well and is tops leaderboards on open vocabulary object detection ๐Ÿ‘‘ +""") +st.markdown(""" """) + +st.image("pages/OWLv2/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Want to try OWL models? +I've created a [notebook](https://t.co/ick5tA6nyx) for you to see how to use it with ๐Ÿค— Transformers. +If you want to play with it directly, you can use this [Space](https://t.co/oghdLOtoa5). +All the models and the applications of OWL-series is in this [collection](https://huggingface.co/collections/merve/owl-series-65aaac3114e6582c300544df). +""") +st.markdown(""" """) + +st.info(""" +Ressources: +[Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) +by Matthias Minderer, Alexey Gritsenko, Neil Houlsby (2023) +[GitHub](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/owlv2)""", icon="๐Ÿ“š") +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("SigLIP") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Backbone") \ No newline at end of file diff --git a/pages/7_Backbone.py b/pages/7_Backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c4db92e123f3651f8c9f2969a6dcedcbc7a4a4 --- /dev/null +++ b/pages/7_Backbone.py @@ -0,0 +1,63 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Backbone") + +st.success("""[Original tweet](https://x.com/mervenoyann/status/1749841426177810502) (January 23, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Many cutting-edge computer vision models consist of multiple stages: +โžฐ backbone extracts the features, +โžฐ neck refines the features, +โžฐ head makes the detection for the task. +Implementing this is cumbersome, so ๐Ÿค— Transformers has an API for this: Backbone! +""") +st.markdown(""" """) + +st.image("pages/Backbone/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Let's see an example of such model. +Assuming we would like to initialize a multi-stage instance segmentation model with ResNet backbone and MaskFormer neck and a head, you can use the backbone API like following (left comments for clarity) ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/Backbone/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""One can also use a backbone just to get features from any stage. You can initialize any backbone with `AutoBackbone` class. 
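+For instance, a rough sketch (the checkpoint and `out_indices` here are just an example):
+```python
+import torch
+from transformers import AutoBackbone
+
+# any backbone checkpoint works; out_indices selects which stages to return
+backbone = AutoBackbone.from_pretrained("microsoft/resnet-50", out_indices=(1, 2, 3, 4))
+
+pixel_values = torch.randn(1, 3, 224, 224)
+outputs = backbone(pixel_values)
+
+# one feature map per requested stage
+print([fm.shape for fm in outputs.feature_maps])
+```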
+See below how to initialize a backbone and getting the feature maps at any stage ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/Backbone/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Backbone API also supports any timm backbone of your choice! Check out a variation of timm backbones [here](https://t.co/Voiv0QCPB3). +""") +st.markdown(""" """) + +st.image("pages/Backbone/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Leaving some links ๐Ÿ”— +๐Ÿ“– I've created a [notebook](https://t.co/PNfmBvdrtt) for you to play with it +๐Ÿ“’ [Backbone API docs](https://t.co/Yi9F8qAigO) +๐Ÿ““ [AutoBackbone docs](https://t.co/PGo9oILHDw) (all written with love by me!๐Ÿ’œ)""") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("OWLv2") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Depth Anything") \ No newline at end of file diff --git a/pages/8_Depth_Anything.py b/pages/8_Depth_Anything.py new file mode 100644 index 0000000000000000000000000000000000000000..5d014e17b5cccc3836d292d0dc49212c586f4e62 --- /dev/null +++ b/pages/8_Depth_Anything.py @@ -0,0 +1,100 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("Depth Anything") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1750531698008498431) (January 25, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""Explaining a new state-of-the-art monocular depth estimation model: Depth Anything โœจ๐Ÿงถ +It has just been integrated in transformers for super-easy use. +We compared it against DPTs and benchmarked it as well! You can find the usage, benchmark, demos and more below ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.video("pages/Depth_Anything/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(""" +The paper starts with highlighting previous depth estimation methods and the limitations regarding the data coverage. ๐Ÿ‘€ +The model's success heavily depends on unlocking the use of unlabeled datasets, although initially the authors used self-training and failed. + +What the authors have done: +โžฐ Train a teacher model on labelled dataset +โžฐ Guide the student using teacher and also use unlabelled datasets pseudolabelled by the teacher. However, this was the cause of the failure, as both architectures were similar, the outputs were the same. +""") +st.markdown(""" """) + +st.image("pages/Depth_Anything/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +So the authors have added a more difficult optimization target for student to learn additional knowledge on unlabeled images that went through color jittering, distortions, Gaussian blurring and spatial distortion, so it can learn more invariant representations from them. + +The architecture consists of DINOv2 encoder to extract the features followed by DPT decoder. At first, they train the teacher model on labelled images, and then they jointly train the student model and add in the dataset pseudo-labelled by ViT-L. +""", unsafe_allow_html=True) + +st.markdown(""" """) + +st.image("pages/Depth_Anything/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""Thanks to this, Depth Anything performs very well! 
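+As a reference, the benchmarks below go through the `pipeline` API; a minimal sketch looks like this (the checkpoint name is an example, any Depth Anything checkpoint works):
+```python
+from transformers import pipeline
+from PIL import Image
+import requests
+
+pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# returns a dict with the predicted depth tensor and a PIL image of the depth map
+result = pipe(image)
+result["depth"].save("depth.png")
+```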
I have also benchmarked the inference duration of the model against different models here. I also ran `torch.compile` benchmarks across them and got nice speed-ups ๐Ÿš€ + +On T4 GPU, mean of 30 inferences for each. Inferred using `pipeline` (pre-processing and post-processing included with model inference). + +| Model/Batch Size | 16 | 4 | 1 | +| ----------------------------- | --------- | -------- | ------- | +| intel/dpt-large | 2709.652 | 667.799 | 172.617 | +| facebook/dpt-dinov2-small-nyu | 2534.854 | 654.822 | 159.754 | +| facebook/dpt-dinov2-base-nyu | 4316.8733 | 1090.824 | 266.699 | +| Intel/dpt-beit-large-512 | 7961.386 | 2036.743 | 497.656 | +| depth-anything-small | 1692.368 | 415.915 | 143.379 | + +`torch.compile`โ€™s benchmarks with reduce-overhead mode: we have compiled the model and loaded it to the pipeline for the benchmarks to be fair. + +| Model/Batch Size | 16 | 4 | 1 | +| ----------------------------- | -------- | -------- | ------- | +| intel/dpt-large | 2556.668 | 645.750 | 155.153 | +| facebook/dpt-dinov2-small-nyu | 2415.25 | 610.967 | 148.526 | +| facebook/dpt-dinov2-base-nyu | 4057.909 | 1035.672 | 245.692 | +| Intel/dpt-beit-large-512 | 7417.388 | 1795.882 | 426.546 | +| depth-anything-small | 1664.025 | 384.688 | 97.865 | + +""") +st.markdown(""" """) + +st.image("pages/Depth_Anything/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +You can use Depth Anything easily thanks to ๐Ÿค— Transformers with three lines of code! โœจ +We have also built an app for you to [compare different depth estimation models](https://t.co/6uq4osdwWG) ๐Ÿ ๐ŸŒธ +See all the available Depth Anything checkpoints [here](https://t.co/Ex0IIyx7XC). +""") +st.markdown(""" """) + +st.image("pages/Depth_Anything/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(""" +Ressources: +[Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) +by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) +[GitHub](https://github.com/LiheYoung/Depth-Anything) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything)""", icon="๐Ÿ“š") + + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Backbone") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("LLaVA-NeXT") \ No newline at end of file diff --git a/pages/9_LLaVA-NeXT.py b/pages/9_LLaVA-NeXT.py new file mode 100644 index 0000000000000000000000000000000000000000..ae3c3a9c5254d85df3394436ef803dcd2beb7c99 --- /dev/null +++ b/pages/9_LLaVA-NeXT.py @@ -0,0 +1,74 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + +st.title("LLaVA-NeXT") + +st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1770832875551682563) (March 21, 2024)""", icon="โ„น๏ธ") +st.markdown(""" """) + +st.markdown("""LLaVA-NeXT is recently merged to ๐Ÿค— Transformers and it outperforms many of the proprietary models like Gemini on various benchmarks!๐Ÿคฉ +For those who don't know LLaVA, it's a language model that can take image ๐Ÿ’ฌ +Let's take a look, demo and more in this. 
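+
+Here's the gist of standalone usage as a sketch (the full snippets are in the screenshots further down; the model ID shown is the Mistral-7B variant):
+```python
+import torch
+from PIL import Image
+import requests
+from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+
+model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
+processor = LlavaNextProcessor.from_pretrained(model_id)
+model = LlavaNextForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+prompt = "[INST] <image> What is shown in this image? [/INST]"
+inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
+
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
+```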
+""") +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +LLaVA is essentially a vision-language model that consists of ViT-based CLIP encoder, a MLP projection and Vicuna as decoder โœจ +LLaVA 1.5 was released with Vicuna, but LLaVA NeXT (1.6) is released with four different LLMs: +- Nous-Hermes-Yi-34B +- Mistral-7B +- Vicuna 7B & 13B +""") +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(""" +Thanks to Transformers integration, it is very easy to use LLaVA NeXT, not only standalone but also with 4-bit loading and Flash Attention 2 ๐Ÿ’œ +See below on standalone usage ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""To fit large models and make it even faster and memory efficient, you can enable Flash Attention 2 and load model into 4-bit using bitsandbytes โšก๏ธ transformers makes it very easy to do this! See below ๐Ÿ‘‡ +""") +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown("""If you want to try the code right away, here's the [notebook](https://t.co/NvoxvY9z1u). +Lastly, you can directly play with the LLaVA-NeXT based on Mistral-7B through the demo [here](https://t.co/JTDlqMUwEh) ๐Ÿค— +""") +st.markdown(""" """) + +st.video("pages/LLaVA-NeXT/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.info(""" +Ressources: +[LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) +by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee (2024) +[GitHub](https://github.com/haotian-liu/LLaVA/tree/main) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/llava_next)""", icon="๐Ÿ“š") + + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3 = st.columns(3) +with col1: + if st.button('Previous paper', use_container_width=True): + switch_page("Depth Anything") +with col2: + if st.button('Home', use_container_width=True): + switch_page("Home") +with col3: + if st.button('Next paper', use_container_width=True): + switch_page("Painter") \ No newline at end of file diff --git a/pages/Backbone/Backbone.md b/pages/Backbone/Backbone.md new file mode 100644 index 0000000000000000000000000000000000000000..dd40848799fd10bc23c64732011759e712510249 --- /dev/null +++ b/pages/Backbone/Backbone.md @@ -0,0 +1,31 @@ +๏ปฟMany cutting-edge computer vision models consist of multiple stages: +โžฐ backbone extracts the features, +โžฐ neck refines the features, +โžฐ head makes the detection for the task. +Implementing this is cumbersome, so ๐Ÿค— transformers has an API for this: Backbone! + +![image_1](image_1.jpg) + +Let's see an example of such model. +Assuming we would like to initialize a multi-stage instance segmentation model with ResNet backbone and MaskFormer neck and a head, you can use the backbone API like following (left comments for clarity) ๐Ÿ‘‡ + +![image_2](image_2.jpg) + +One can also use a backbone just to get features from any stage. You can initialize any backbone with `AutoBackbone` class. +See below how to initialize a backbone and getting the feature maps at any stage ๐Ÿ‘‡ + +![image_3](image_3.jpg) + +Backbone API also supports any timm backbone of your choice! 
Check out a variation of timm backbones [here](https://t.co/Voiv0QCPB3).
+
+![image_4](image_4.jpg)
+
+Leaving some links ๐Ÿ”—:
+๐Ÿ“– I've created a [notebook](https://t.co/PNfmBvdrtt) for you to play with it
+๐Ÿ“’ [Backbone API docs](https://t.co/Yi9F8qAigO)
+๐Ÿ““ [AutoBackbone docs](https://t.co/PGo9oILHDw) ๐Ÿ’œ
+(all written with love by me!)
+
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1749841426177810502) (January 23, 2024)
diff --git a/pages/Backbone/image_1.jpeg b/pages/Backbone/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..e13b273736aa7a089dd01dd4690c1668272065c8
Binary files /dev/null and b/pages/Backbone/image_1.jpeg differ
diff --git a/pages/Backbone/image_2.jpeg b/pages/Backbone/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..539e19d52fde3cf4f0cf594d2496acc9a169e48a
Binary files /dev/null and b/pages/Backbone/image_2.jpeg differ
diff --git a/pages/Backbone/image_3.jpeg b/pages/Backbone/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..d199b14d45ee7c922fd2aba0b5275fdb3916c84e
Binary files /dev/null and b/pages/Backbone/image_3.jpeg differ
diff --git a/pages/Backbone/image_4.jpeg b/pages/Backbone/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..8612ac2422f1bf5230d4ab0d53d0468f098f8359
Binary files /dev/null and b/pages/Backbone/image_4.jpeg differ
diff --git a/pages/Chameleon/Chameleon.md b/pages/Chameleon/Chameleon.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a4c7df8d9aae726149b489545f098085024c27c
--- /dev/null
+++ b/pages/Chameleon/Chameleon.md
@@ -0,0 +1,43 @@
+Chameleon ๐ŸฆŽ by Meta is now available in @huggingface transformers ๐Ÿ˜
+A multimodal model that comes in 7B and 34B sizes ๐Ÿคฉ
+But what makes this model so special? Keep reading โ‡ฃ
+
+![video_1](video_1.mp4)
+
+[Demo](https://t.co/GsGE17fSdI) | [Models](https://t.co/cWUiVbsRz6)
+Find below the API to load this model locally and use it โฌ‡๏ธ
+
+![image_1](image_1.jpg)
+
+Chameleon is a unique model: it attempts to scale early fusion ๐Ÿคจ But what is early fusion?
+Modern vision language models use a vision encoder with a projection layer to project image embeddings so they can be used to prompt the text decoder.
+
+![image_2](image_2.jpg)
+
+Early fusion, on the other hand, attempts to fuse all features together (image patches and text) by using an image tokenizer; all tokens are projected into a shared space, which enables seamless generation ๐Ÿ˜
+
+![image_3](image_3.jpg)
+
+The authors have also introduced different architectural improvements (QK norm and revised placement of layer norms) for scalable and stable training. This way they were able to increase the token count (5x tokens compared to Llama 3, which is a must with early fusion IMO).
+
+![image_4](image_4.jpg)
+
+This model is an any-to-any model thanks to early fusion: it can take image and text input and output image and text, but image generation is disabled to prevent malicious use.
+
+![image_5](image_5.jpg)
+
+One can also do text-only prompting: the authors noted the model catches up with larger LLMs, and you can also see how it compares to VLMs with image-text prompting.
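+
+Since the model can be loaded locally, here's a rough sketch of what that looks like with the transformers integration (7B checkpoint as an example; exact processor call may differ slightly, treat this as a sketch):
+
+```python
+import torch
+from PIL import Image
+import requests
+from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
+
+processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
+model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="auto")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# the <image> token marks where the image goes in the prompt
+prompt = "What do you see in this image?<image>"
+inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
+
+output = model.generate(**inputs, max_new_tokens=50)
+print(processor.decode(output[0], skip_special_tokens=True))
+```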
+ +![image_6](image_6.jpg) + +![image_7](image_7.jpg) + +> [!TIP] +Ressources: +[Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818) +by Chameleon Team (2024) +[GitHub](https://github.com/facebookresearch/chameleon) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1814278511785312320) (July 19, 2024) \ No newline at end of file diff --git a/pages/Chameleon/image_1.jpg b/pages/Chameleon/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a570cd2c03d3930ffd3ebfa3bfccbc5eec6de0bf Binary files /dev/null and b/pages/Chameleon/image_1.jpg differ diff --git a/pages/Chameleon/image_2.jpg b/pages/Chameleon/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9351b4847b428ee62207a729f8d5cd3a6b712708 Binary files /dev/null and b/pages/Chameleon/image_2.jpg differ diff --git a/pages/Chameleon/image_3.jpg b/pages/Chameleon/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..72ae18c89d7dfae4df81e7dee2e3e1090cc5ea07 Binary files /dev/null and b/pages/Chameleon/image_3.jpg differ diff --git a/pages/Chameleon/image_4.jpg b/pages/Chameleon/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5fec1ea78d2abab4647ee35f0e5099bfc1ef0dff Binary files /dev/null and b/pages/Chameleon/image_4.jpg differ diff --git a/pages/Chameleon/image_5.jpg b/pages/Chameleon/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f155e8508c41db578e266e0f3e35d11a76d9560c Binary files /dev/null and b/pages/Chameleon/image_5.jpg differ diff --git a/pages/Chameleon/image_6.jpg b/pages/Chameleon/image_6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..716dc3dcef0cca2b2e17fb2d6ff419d13466bbd0 Binary files /dev/null and b/pages/Chameleon/image_6.jpg differ diff --git a/pages/Chameleon/image_7.jpg b/pages/Chameleon/image_7.jpg new file mode 100644 index 0000000000000000000000000000000000000000..aef4d21ffbaf689aaf714669cd956518d39fcd94 Binary files /dev/null and b/pages/Chameleon/image_7.jpg differ diff --git a/pages/Chameleon/video_1.mp4 b/pages/Chameleon/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a16eeaa29ff5c30a3b963c4d6b608af86b25e32f Binary files /dev/null and b/pages/Chameleon/video_1.mp4 differ diff --git a/pages/CuMo/CuMo.md b/pages/CuMo/CuMo.md new file mode 100644 index 0000000000000000000000000000000000000000..8a5608bbe1d7d8b6b91ca40fffd84b347b0c502f --- /dev/null +++ b/pages/CuMo/CuMo.md @@ -0,0 +1,24 @@ +๏ปฟIt's raining vision language models โ˜”๏ธ CuMo is a new vision language model that has MoE in every step of the VLM (image encoder, MLP and text decoder) and uses Mistral-7B for the decoder part ๐Ÿค“ + +![image_1](image_1.jpg) + +The authors firstly did pre-training of MLP with the by freezing the image encoder and text decoder, then they warmup the whole network by unfreezing and finetuning which they state to stabilize the visual instruction tuning when bringing in the experts. + +![image_2](image_2.jpg) + +The mixture of experts MLP blocks above are simply the same MLP blocks initialized from the single MLP that was trained during pre-training and fine-tuned in pre-finetuning ๐Ÿ‘‡ + +![image_3](image_3.jpg) + +It works very well (also tested myself) that it outperforms the previous sota of it's size LLaVA NeXt! 
๐Ÿ˜ I wonder how it would compare to IDEFICS2-8B You can try it yourself [here](https://t.co/MLIYKVh5Ee). + +![image_4](image_4.jpg) + +> [!TIP] +Ressources: +[CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts](https://arxiv.org/abs/2405.05949) +by Jiachen Li, Xinyao Wang, Sijie Zhu, Chia-Wen Kuo, Lu Xu, Fan Chen, Jitesh Jain, Humphrey Shi, Longyin Wen (2024) +[GitHub](https://github.com/SHI-Labs/CuMo) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1790665706205307191) (May 15, 2024) \ No newline at end of file diff --git a/pages/CuMo/image_1.jpg b/pages/CuMo/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..da96a21983fe8e0cc003f054e0e74dbde87b7e80 Binary files /dev/null and b/pages/CuMo/image_1.jpg differ diff --git a/pages/CuMo/image_2.jpg b/pages/CuMo/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e8e76bce137b1fecb8c5397c3a76767d353961f3 Binary files /dev/null and b/pages/CuMo/image_2.jpg differ diff --git a/pages/CuMo/image_3.jpg b/pages/CuMo/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..15f23d840ac3561bf3da06e72ef4dc6f18c0cec4 Binary files /dev/null and b/pages/CuMo/image_3.jpg differ diff --git a/pages/CuMo/image_4.jpg b/pages/CuMo/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c8a1f378004044c4295898f862848f0b6be60649 Binary files /dev/null and b/pages/CuMo/image_4.jpg differ diff --git a/pages/DINOv2/Dinov2.md b/pages/DINOv2/Dinov2.md new file mode 100644 index 0000000000000000000000000000000000000000..4479d2fdf9f025b5049eb068a8a9f89e03066b2b --- /dev/null +++ b/pages/DINOv2/Dinov2.md @@ -0,0 +1,40 @@ +๏ปฟDINOv2 is the king for self-supervised learning in images ๐Ÿฆ–๐Ÿฆ• But how does it work? I've tried to explain how it works but let's expand on it ๐Ÿงถ + +![image_1](image_1.jpg) + +DINOv2 is essentially DINO on steroids, so let's talk about DINO first. +๐Ÿฆ• It's essentially a pre-training technique to train ViTs with self-supervision, that uses an unusual way of distillation ๐ŸงŸโ€โ™‚๏ธ๐Ÿง‘๐Ÿปโ€๐Ÿซ +Distillation is a technique where there's a large pre-trained model (teacher), and you have a smaller model (student) initialized randomly. +Then during training the student, you take both models'outputs, calculate divergence between them and then update the loss accordingly. +In this case, we have no labels! And the teacher is not pretrained!!!! ๐Ÿคฏ +Well, the outputs here are the distributions, and teacher is iteratively updated according to student, which is called exponential moving average. + +![image_2](image_2.jpg) + +DINO doesn't use any contrastive loss or clustering but only cross entropy loss (again, what a paper) which leads the model to collapse. +This can be avoided by normalizing the teacher output multiple times, but authors center (to squish logits) and sharpen (through temperature) the teacher outputs. +Finally, local and global crops are given to student and only global crops are given to teacher and this sort of pushes student to identify context from small parts of the image. + +![image_3](image_3.jpg) + +How does DINOv2 improve DINO? 
โšก๏ธ +More efficient thanks to FSDP and Flash Attention ๐Ÿฆ– Has a very efficient data augmentation technique that apparently scales to 100M+ images (put below) ๐Ÿง‘๐Ÿปโ€๐Ÿซ +Uses ViT-g instead of training from scratch + +![image_4](image_4.jpg) + +The model is so powerful that you can use DINOv2 even with knn or linear classifiers without need to fine-tuning! +But if you'd like DINOv2 to work even better, [NielsRogge](https://twitter.com/NielsRogge) has built a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Fine\_tune\_DINOv2\_for\_image\_classification\_%5Bminimal%5D.ipynb) to fine-tune it using `Trainer`. +๐Ÿ“– He also has a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train\_a\_linear\_classifier\_on\_top\_of\_DINOv2\_for\_semantic\_segmentation.ipynb) if you feel like training a linear classifier only +๐Ÿ“” All the different DINO/v2 model checkpoints are [here](https://huggingface.co/models?search=dinoLastly). +Special thanks to [ykilcher](https://twitter.com/ykilcher) as I couldn't make sense of certain things in the paper and watched his awesome [tutorial](https://youtube.com/watch?v=h3ij3F) ๐Ÿคฉ + +> [!TIP] +Ressources: +[DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) +by Maxime Oquab, Timothรฉe Darcet, Thรฉo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervรฉ Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski (2023) +[GitHub](https://github.com/facebookresearch/dinov2) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/dinov2) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1743290724672495827) (January 5, 2024) \ No newline at end of file diff --git a/pages/DINOv2/image_1.jpeg b/pages/DINOv2/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e3f21d0a43896cb736fc3eb5a6036a79c01e2300 Binary files /dev/null and b/pages/DINOv2/image_1.jpeg differ diff --git a/pages/DINOv2/image_2.jpg b/pages/DINOv2/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..89302d47a095fa17b2afdebae4850c48c2a09e27 Binary files /dev/null and b/pages/DINOv2/image_2.jpg differ diff --git a/pages/DINOv2/image_3.jpeg b/pages/DINOv2/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..dbd02a8b045dd66c1bab29bf09c36ef722e12164 Binary files /dev/null and b/pages/DINOv2/image_3.jpeg differ diff --git a/pages/DINOv2/image_4.jpeg b/pages/DINOv2/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..711fb761881886fc419fadea50459674e2e119a4 Binary files /dev/null and b/pages/DINOv2/image_4.jpeg differ diff --git a/pages/DenseConnector/DenseConnector.md b/pages/DenseConnector/DenseConnector.md new file mode 100644 index 0000000000000000000000000000000000000000..03dc21c286910c5029ce4bad4932c42780997960 --- /dev/null +++ b/pages/DenseConnector/DenseConnector.md @@ -0,0 +1,32 @@ +๏ปฟDo we fully leverage image encoders in vision language models? ๐Ÿ‘€ +A new paper built a dense connector that does it better! 
Let's dig in ๐Ÿงถ + +![image_1](image_1.jpg) + +VLMs consist of an image encoder block, a projection layer that projects image embeddings to text embedding space and then a text decoder sequentially connected ๐Ÿ“– +This [paper](https://t.co/DPQzbj0eWm) explores using intermediate states of image encoder and not a single output ๐Ÿคฉ + +![image_2](image_2.jpg) + +The authors explore three different ways of instantiating dense connector: sparse token integration, sparse channel integration and dense channel integration (each of them just take intermediate outputs and put them together in different ways, see below). + +![image_3](image_3.jpg) + +They explore all three of them integrated to LLaVA 1.5 and found out each of the new models are superior to the original LLaVA 1.5. + +![image_4](image_4.jpg) + +I tried the model and it seems to work very well ๐Ÿฅน +The authors have released various [checkpoints](https://t.co/iF8zM2qvDa) based on different decoders (Vicuna 7/13B and Llama 3-8B). + +![image_5](image_5.jpg) + + +> [!TIP] +Ressources: +[Dense Connector for MLLMs](https://arxiv.org/abs/2405.13800) +by Huanjin Yao, Wenhao Wu, Taojiannan Yang, YuXin Song, Mengxi Zhang, Haocheng Feng, Yifan Sun, Zhiheng Li, Wanli Ouyang, Jingdong Wang (2024) +[GitHub](https://github.com/HJYao00/DenseConnector) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1796089181988352216) (May 30, 2024) \ No newline at end of file diff --git a/pages/DenseConnector/image_1.jpg b/pages/DenseConnector/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..54049e36e6ef94330ad77d552939cfbcfd794e8a Binary files /dev/null and b/pages/DenseConnector/image_1.jpg differ diff --git a/pages/DenseConnector/image_2.jpg b/pages/DenseConnector/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3ead1232393d16a3f9090f683da48f2efa93fe36 Binary files /dev/null and b/pages/DenseConnector/image_2.jpg differ diff --git a/pages/DenseConnector/image_3.jpg b/pages/DenseConnector/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..acde3e3b162a0f0b8fbfd4425e276385cbd0274e Binary files /dev/null and b/pages/DenseConnector/image_3.jpg differ diff --git a/pages/DenseConnector/image_4.jpg b/pages/DenseConnector/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..be881825ff172de57f08a05057b1fed561df73ec Binary files /dev/null and b/pages/DenseConnector/image_4.jpg differ diff --git a/pages/DenseConnector/image_5.jpg b/pages/DenseConnector/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4c6639141526dedb627b1cbfa720f0a7456df257 Binary files /dev/null and b/pages/DenseConnector/image_5.jpg differ diff --git a/pages/Depth Anything/Depth Anything.md b/pages/Depth Anything/Depth Anything.md new file mode 100644 index 0000000000000000000000000000000000000000..b3f6a43bc6c5eb7e1aa1a638ff5087cdfd3cb790 --- /dev/null +++ b/pages/Depth Anything/Depth Anything.md @@ -0,0 +1,61 @@ +๏ปฟExplaining a new state-of-the-art monocular depth estimation model: Depth Anything โœจ +It has just been integrated in transformers for super-easy use. +We compared it against DPTs and benchmarked it as well! You can the usage, benchmark, demos and more below ๐Ÿ‘‡ + +![video_1](video_1.mp4) + +The paper starts with highlighting previous depth estimation methods and the limitations regarding the data coverage. 
๐Ÿ‘€ +The model's success heavily depends on unlocking the use of unlabeled datasets, although initially the authors used self-training and failed. + +What the authors have done: +โžฐ Train a teacher model on labelled dataset +โžฐ Guide the student using teacher and also use unlabelled datasets pseudolabelled by the teacher. However, this was the cause of the failure, as both architectures were similar, the outputs were the same. + +![image_1](image_1.jpg) + +So the authors have added a more difficult optimization target for student to learn additional knowledge on unlabeled images that went through color jittering, distortions, Gaussian blurring and spatial distortion, so it can learn more invariant representations from them. + +The architecture consists of DINOv2 encoder to extract the features followed by DPT decoder. At first, they train the teacher model on labelled images, and then they jointly train the student model and add in the dataset pseudo-labelled by ViT-L. + +![image_1](image_1.jpg) + +Thanks to this, Depth Anything performs very well! I have also benchmarked the inference duration of the model against different models here. I also ran `torch.compile` benchmarks across them and got nice speed-ups ๐Ÿš€ + +On T4 GPU, mean of 30 inferences for each. Inferred using `pipeline` (pre-processing and post-processing included with model inference). + +| Model/Batch Size | 16 | 4 | 1 | +| ----------------------------- | --------- | -------- | ------- | +| intel/dpt-large | 2709.652 | 667.799 | 172.617 | +| facebook/dpt-dinov2-small-nyu | 2534.854 | 654.822 | 159.754 | +| facebook/dpt-dinov2-base-nyu | 4316.8733 | 1090.824 | 266.699 | +| Intel/dpt-beit-large-512 | 7961.386 | 2036.743 | 497.656 | +| depth-anything-small | 1692.368 | 415.915 | 143.379 | + +torch.compileโ€™s benchmarks with reduce-overhead mode: we have compiled the model and loaded it to the pipeline for the benchmarks to be fair. + +| Model/Batch Size | 16 | 4 | 1 | +| ----------------------------- | -------- | -------- | ------- | +| intel/dpt-large | 2556.668 | 645.750 | 155.153 | +| facebook/dpt-dinov2-small-nyu | 2415.25 | 610.967 | 148.526 | +| facebook/dpt-dinov2-base-nyu | 4057.909 | 1035.672 | 245.692 | +| Intel/dpt-beit-large-512 | 7417.388 | 1795.882 | 426.546 | +| depth-anything-small | 1664.025 | 384.688 | 97.865 | + + +![image_2](image_2.jpg) + + +You can use Depth Anything easily thanks to ๐Ÿค— Transformers with three lines of code! โœจ We have also built an app for you to [compare different depth estimation models](https://t.co/6uq4osdwWG) ๐Ÿ ๐ŸŒธ See all the available Depth Anything checkpoints [here](https://t.co/Ex0IIyx7XC). 
+ +![image_3](image_3.jpg) + + +> [!TIP] +Ressources: +[Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) +by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) +[GitHub](https://github.com/LiheYoung/Depth-Anything) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1750531698008498431) (January 25, 2024) \ No newline at end of file diff --git a/pages/Depth Anything/image_1.jpg b/pages/Depth Anything/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9f9f597f1ee050d81546ca12df764fd495c9033b Binary files /dev/null and b/pages/Depth Anything/image_1.jpg differ diff --git a/pages/Depth Anything/image_2.jpg b/pages/Depth Anything/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5764194e35b849f479f6b112e3d10ab243aec6f5 Binary files /dev/null and b/pages/Depth Anything/image_2.jpg differ diff --git a/pages/Depth Anything/image_3.jpg b/pages/Depth Anything/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..330fcf92754f3798f78dff64d3a092ef07a4a503 Binary files /dev/null and b/pages/Depth Anything/image_3.jpg differ diff --git a/pages/Depth Anything/video_1.mp4 b/pages/Depth Anything/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..44bbc6257f0bd16776517efa7ce99fbc78df19bf --- /dev/null +++ b/pages/Depth Anything/video_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2587f63d7a6622ca913f0260aa45b2fea8c806f261a09cb5b692ec2644b51066 +size 2026722 diff --git a/pages/Depth_Anything_v2/Depth Anything v2.md b/pages/Depth_Anything_v2/Depth Anything v2.md new file mode 100644 index 0000000000000000000000000000000000000000..5314188ec02c86258dc34d6c863858196dfa0d70 --- /dev/null +++ b/pages/Depth_Anything_v2/Depth Anything v2.md @@ -0,0 +1,34 @@ +๏ปฟI love Depth Anything V2 ๐Ÿ˜ Itโ€™s Depth Anything, but scaled with both larger teacher model and a gigantic dataset! Letโ€™s unpack ๐Ÿค“๐Ÿงถ! + +![image_1](image_1.jpg) + +The authors have analyzed Marigold, a diffusion based model against Depth Anything and found out whatโ€™s up with using synthetic images vs real images for MDE: ๐Ÿ”– +Real data has a lot of label noise, inaccurate depth maps (caused by depth sensors missing transparent objects etc). + +![image_2](image_2.jpg) + +The authors train different image encoders only on synthetic images and find out unless the encoder is very large the model canโ€™t generalize well (but large models generalize inherently anyway) ๐Ÿง But they still fail encountering real images that have wide distribution in labels. + +![image_3](image_3.jpg) + +Depth Anything v2 framework is to... +๐Ÿฆ– Train a teacher model based on DINOv2-G based on 595K synthetic images +๐Ÿท๏ธ Label 62M real images using teacher model +๐Ÿฆ• Train a student model using the real images labelled by teacher +Result: 10x faster and more accurate than Marigold! + +![image_4](image_4.jpg) + + +The authors also construct a new benchmark called DA-2K that is less noisy, highly detailed and more diverse! 
+I have created a [collection](https://t.co/3fAB9b2sxi) that has the models, the dataset, the demo and CoreML converted model ๐Ÿ˜š + +> [!TIP] +Ressources: +[Depth Anything V2](https://arxiv.org/abs/2406.09414) +by Lihe Yang, Bingyi Kang, Zilong Huang, Zhen Zhao, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) +[GitHub](https://github.com/DepthAnything/Depth-Anything-V2) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything_v2) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1803063120354492658) (June 18, 2024) \ No newline at end of file diff --git a/pages/Depth_Anything_v2/image_1.jpg b/pages/Depth_Anything_v2/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5d15680b6ee378948b983770176c4667360a9811 Binary files /dev/null and b/pages/Depth_Anything_v2/image_1.jpg differ diff --git a/pages/Depth_Anything_v2/image_2.jpg b/pages/Depth_Anything_v2/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0585867a67c18a7fd02d8c38fdfdbe63f5fa5f03 Binary files /dev/null and b/pages/Depth_Anything_v2/image_2.jpg differ diff --git a/pages/Depth_Anything_v2/image_3.jpg b/pages/Depth_Anything_v2/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2acbdffed71edbbce1e1b0859b18635db56d0c0a Binary files /dev/null and b/pages/Depth_Anything_v2/image_3.jpg differ diff --git a/pages/Depth_Anything_v2/image_4.jpg b/pages/Depth_Anything_v2/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..eb78a815f9dd79ddf7fc7ff74488cdf1f6c9d481 Binary files /dev/null and b/pages/Depth_Anything_v2/image_4.jpg differ diff --git a/pages/DocOwl_1.5/DocOwl 1.5.md b/pages/DocOwl_1.5/DocOwl 1.5.md new file mode 100644 index 0000000000000000000000000000000000000000..b92eaae30e321c52f4b179c12a879e6f5b6dfcb9 --- /dev/null +++ b/pages/DocOwl_1.5/DocOwl 1.5.md @@ -0,0 +1,52 @@ +๏ปฟDocOwl 1.5 is the state-of-the-art document understanding model by Alibaba with Apache 2.0 license ๐Ÿ˜๐Ÿ“ time to dive in and learn more ๐Ÿงถ + +![image_1](image_1.jpeg) + +This model consists of a ViT-based visual encoder part that takes in crops of image and the original image itself Then the outputs of the encoder goes through a convolution based model, after that the outputs are merged with text and then fed to LLM + +![image_2](image_2.jpeg) + +Initially, the authors only train the convolution based part (called H-Reducer) and vision encoder while keeping LLM frozen Then for fine-tuning (on image captioning, VQA etc), they freeze vision encoder and train H-Reducer and LLM + +![image_3](image_3.jpeg) + +Also they use simple linear projection on text and documents. You can see below how they model the text prompts and outputs ๐Ÿค“ + +![image_4](image_4.jpeg) + +They train the model various downstream tasks including: +- document understanding (DUE benchmark and more) +- table parsing (TURL, PubTabNet) +- chart parsing (PlotQA and more) +- image parsing (OCR-CC) +- text localization (DocVQA and more) + +![image_5](image_5.jpeg) + +They contribute a new model called DocOwl 1.5-Chat by: +1. creating a new document-chat dataset with questions from document VQA datasets +2. feeding them to ChatGPT to get long answers +3. fine-tune the base model with it (which IMO works very well!) 
+ +![image_6](image_6.jpeg) + +Resulting generalist model and the chat model are pretty much state-of-the-art ๐Ÿ˜ Below you can see how it compares to fine-tuned models + +![image_7](image_7.jpeg) + +Very good paper, read it [here](https://t.co/T23JOAPkv1). +All the models and the datasets (also some eval datasets on above tasks!) are in this [organization](https://t.co/sJdTw1jWTR). +The [Space](https://t.co/57E9DbNZXf). + +Thanks a lot for reading! + +![image_8](image_8.jpeg) + +> [!TIP] +Ressources: +[mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding](https://arxiv.org/abs/2403.12895) +by Anwen Hu, Haiyang Xu, Jiabo Ye, Ming Yan, Liang Zhang, Bo Zhang, Chen Li, Ji Zhang, Qin Jin, Fei Huang, Jingren Zhou (2024) +[GitHub](https://github.com/X-PLUG/mPLUG-DocOwl) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1782421257591357824) (April 22, 2024) \ No newline at end of file diff --git a/pages/DocOwl_1.5/image_1.jpg b/pages/DocOwl_1.5/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f6acdefc776a4586fdefd65e0f36ef2dc30aab30 Binary files /dev/null and b/pages/DocOwl_1.5/image_1.jpg differ diff --git a/pages/DocOwl_1.5/image_2.jpeg b/pages/DocOwl_1.5/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..70369f08bb9f6421696ae51f4a7452c7468fa130 Binary files /dev/null and b/pages/DocOwl_1.5/image_2.jpeg differ diff --git a/pages/DocOwl_1.5/image_3.jpeg b/pages/DocOwl_1.5/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..5e05f6401ec03b4c7afd67437911edb6b455d547 Binary files /dev/null and b/pages/DocOwl_1.5/image_3.jpeg differ diff --git a/pages/DocOwl_1.5/image_4.jpeg b/pages/DocOwl_1.5/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..557d61cc4e621c25073c9409a82e305e77477af3 Binary files /dev/null and b/pages/DocOwl_1.5/image_4.jpeg differ diff --git a/pages/DocOwl_1.5/image_5.jpeg b/pages/DocOwl_1.5/image_5.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..63442fcf91dc3cbee5fa9f4bef1e6022817e94e5 Binary files /dev/null and b/pages/DocOwl_1.5/image_5.jpeg differ diff --git a/pages/DocOwl_1.5/image_6.jpeg b/pages/DocOwl_1.5/image_6.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..190062b9a42597245aa718e2bd7b5bb68e25b8de Binary files /dev/null and b/pages/DocOwl_1.5/image_6.jpeg differ diff --git a/pages/DocOwl_1.5/image_7.jpeg b/pages/DocOwl_1.5/image_7.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..11e88429bd61efbf20a7710d8786fc366ffca638 Binary files /dev/null and b/pages/DocOwl_1.5/image_7.jpeg differ diff --git a/pages/DocOwl_1.5/image_8.jpeg b/pages/DocOwl_1.5/image_8.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..18a9f12a7d4f667ed014ea3946d3563b61caae00 Binary files /dev/null and b/pages/DocOwl_1.5/image_8.jpeg differ diff --git a/pages/Florence-2/Florence-2.md b/pages/Florence-2/Florence-2.md new file mode 100644 index 0000000000000000000000000000000000000000..57a19dee06847eef5ce619b70cf41ed5b3a82ae1 --- /dev/null +++ b/pages/Florence-2/Florence-2.md @@ -0,0 +1,34 @@ +๏ปฟFlorence-2 is a new vision foundation model by MSFT capable of a wide variety of tasks ๐Ÿคฏ Let's unpack! 
๐Ÿงถ Demo, models and more on the next one ๐Ÿฃ + +![image_1](image_1.jpg) + +This model is can handle tasks that vary from document understanding to semantic segmentation ๐Ÿคฉ +[Demo](https://t.co/7YJZvjhw84) | [Collection](https://t.co/Ub7FGazDz1) + +![image_2](image_2.jpg) + +The difference from previous models is that the authors have compiled a dataset that consists of 126M images with 5.4B annotations labelled with their own data engine โ†“โ†“ + +![image_3](image_3.jpg) + +The dataset also offers more variety in annotations compared to other datasets, it has region level and image level annotations with more variety in semantic granularity as well! + +![image_4](image_4.jpg) + +The model is a similar architecture to previous models, an image encoder, a multimodality encoder with text decoder. The authors have compiled the multitask dataset with prompts for each task which makes the model trainable on multiple tasks ๐Ÿค— + +![image_5](image_5.jpg) + +You also fine-tune this model on any task of choice, the authors also released different results on downstream tasks and report their results when un/freezing vision encoder ๐Ÿค“๐Ÿ“‰ +They have released fine-tuned models too, you can find them in the collection above ๐Ÿค— + +![image_6](image_6.jpg) + +> [!TIP] +Ressources: +[Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242) +by Bin Xiao, Haiping Wu, Weijian Xu, Xiyang Dai, Houdong Hu, Yumao Lu, Michael Zeng, Ce Liu, Lu Yuan (2023) +[Hugging Face blog post](https://huggingface.co/blog/finetune-florence2) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1803769866878623819) (June 20, 2024) diff --git a/pages/Florence-2/image_1.jpg b/pages/Florence-2/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..31bae32a5d350cd79e15a629fe9d3cf8a3cb738c Binary files /dev/null and b/pages/Florence-2/image_1.jpg differ diff --git a/pages/Florence-2/image_2.jpg b/pages/Florence-2/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3035fbb46dc585536f39a6ed924ab9c638a25e3f Binary files /dev/null and b/pages/Florence-2/image_2.jpg differ diff --git a/pages/Florence-2/image_3.jpg b/pages/Florence-2/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..469963370d22a8950425d86e84c56512004f611d Binary files /dev/null and b/pages/Florence-2/image_3.jpg differ diff --git a/pages/Florence-2/image_4.jpg b/pages/Florence-2/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..63e030e11386b122873c2dd44db99164d1d05a49 Binary files /dev/null and b/pages/Florence-2/image_4.jpg differ diff --git a/pages/Florence-2/image_5.jpg b/pages/Florence-2/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b105948d26684849940773d47ce93d1b023aff99 Binary files /dev/null and b/pages/Florence-2/image_5.jpg differ diff --git a/pages/Florence-2/image_6.jpg b/pages/Florence-2/image_6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..880e6e7e76c521eee3322d1049fdb6b0f197f13f Binary files /dev/null and b/pages/Florence-2/image_6.jpg differ diff --git a/pages/Grounding_DINO/Grounding DINO.md b/pages/Grounding_DINO/Grounding DINO.md new file mode 100644 index 0000000000000000000000000000000000000000..18dfedbff03e116d41d80467beb1dd56d318824c --- /dev/null +++ b/pages/Grounding_DINO/Grounding DINO.md @@ -0,0 +1,45 @@ +๏ปฟWe have merged Grounding DINO in ๐Ÿค— Transformers +It's an amazing zero-shot object 
detection model, here's why ๐Ÿงถ also I have built two applications on top of it. + +![image_1](image_1.jpg) + +There are two zero-shot object detection models as of now: one is the OWL series by Google Brain and the other one is Grounding DINO ๐Ÿฆ• Grounding DINO pays immense attention to detail โฌ‡๏ธ +Also [try it yourself](https://t.co/UI0CMxphE7). + +![image_2](image_2.jpg) + +![image_3](image_3.jpg) + +I have also built another [application](https://t.co/4EHpOwEpm0) for GroundingSAM, combining GroundingDINO and Segment Anything by Meta for cutting-edge zero-shot image segmentation. + +![image_4](image_4.jpg) + +Grounding DINO is essentially a model with a connected image encoder (Swin transformer), a text encoder (BERT) and, on top of both, a decoder that outputs bounding boxes ๐Ÿฆ– This is quite similar to OWLv2, which uses a ViT-based detector on CLIP. + +![image_5](image_5.jpg) + +The authors train Swin-L/T with BERT contrastively (not like CLIP, where images are matched to texts by means of similarity): they try to align the region outputs with language phrases at the head outputs ๐Ÿคฉ + +![image_6](image_6.jpg) + +The authors also form the text features on the sub-sentence level. This means the model extracts certain noun phrases from the training data to remove the influence between unrelated words while retaining fine-grained information. + +![image_7](image_7.jpg) + +Thanks to all of this, Grounding DINO has great performance on various REC/object detection benchmarks ๐Ÿ†๐Ÿ“ˆ + +![image_8](image_8.jpg) + +Thanks to transformers, you can use Grounding DINO very easily! You can also check out [NielsRogge](https://twitter.com/NielsRogge)'s [notebook here](https://t.co/8ADGFdVkta). + +![image_9](image_9.jpg) + +> [!TIP] +Ressources: +[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) +by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang (2023) +[GitHub](https://github.com/IDEA-Research/GroundingDINO) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/grounding-dino) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1780558859221733563) \ No newline at end of file diff --git a/pages/Grounding_DINO/image_1.jpeg b/pages/Grounding_DINO/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..808589168a625f34be81d385949e94f6f1433066 Binary files /dev/null and b/pages/Grounding_DINO/image_1.jpeg differ diff --git a/pages/Grounding_DINO/image_2.jpeg b/pages/Grounding_DINO/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..571ee25ca84f429642e141ae837db5e76b91ab21 Binary files /dev/null and b/pages/Grounding_DINO/image_2.jpeg differ diff --git a/pages/Grounding_DINO/image_3.jpeg b/pages/Grounding_DINO/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..4b2fdcb1ae3071e3cbb57f4c7d8663a3f012cdc7 Binary files /dev/null and b/pages/Grounding_DINO/image_3.jpeg differ diff --git a/pages/Grounding_DINO/image_4.jpeg b/pages/Grounding_DINO/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..953d513030a01ef62ca29818ea946e59a9ea9c02 Binary files /dev/null and b/pages/Grounding_DINO/image_4.jpeg differ diff --git a/pages/Grounding_DINO/image_5.jpeg b/pages/Grounding_DINO/image_5.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..cf95438c91dd411b7bd67cf8525f9108bef6cb13 Binary files 
/dev/null and b/pages/Grounding_DINO/image_5.jpeg differ diff --git a/pages/Grounding_DINO/image_6.jpeg b/pages/Grounding_DINO/image_6.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..fc54bffdab07e5021d9b39dccecddc960a1c46a2 Binary files /dev/null and b/pages/Grounding_DINO/image_6.jpeg differ diff --git a/pages/Grounding_DINO/image_7.jpeg b/pages/Grounding_DINO/image_7.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..bfdb3872472f1caacd0bc166781f6a9e0d597b18 Binary files /dev/null and b/pages/Grounding_DINO/image_7.jpeg differ diff --git a/pages/Grounding_DINO/image_8.jpeg b/pages/Grounding_DINO/image_8.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..c646bbb5edd1844ebbfdfda58b7ca6ce4d8df824 Binary files /dev/null and b/pages/Grounding_DINO/image_8.jpeg differ diff --git a/pages/Grounding_DINO/image_9.jpeg b/pages/Grounding_DINO/image_9.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..1ba01db76dff84a7d467f05295a6d370d7ac659d Binary files /dev/null and b/pages/Grounding_DINO/image_9.jpeg differ diff --git a/pages/LLaVA-NeXT/LLaVA-NeXT.md b/pages/LLaVA-NeXT/LLaVA-NeXT.md new file mode 100644 index 0000000000000000000000000000000000000000..314b9484734adf10686ed7de5277dd3e56446dff --- /dev/null +++ b/pages/LLaVA-NeXT/LLaVA-NeXT.md @@ -0,0 +1,33 @@ +๏ปฟLLaVA-NeXT is recently merged to ๐Ÿค— Transformers and it outperforms many of the proprietary models like Gemini on various benchmarks! +๐Ÿคฉ For those who don't know LLaVA, it's a language model that can take image ๐Ÿ’ฌ Let's take a look, demo and more in this. + +![image_1](image_1.jpg) + +LLaVA is essentially a vision-language model that consists of ViT-based CLIP encoder, a MLP projection and Vicuna as decoder โœจ LLaVA 1.5 was released with Vicuna, but LLaVA NeXT (1.6) is released with four different LLMs: +- Nous-Hermes-Yi-34B +- Mistral-7B +- Vicuna 7B & 13B + +![image_2](image_2.jpg) + +Thanks to Transformers integration, it is very easy to use LLaVA NeXT, not only standalone but also with 4-bit loading and Flash Attention 2 ๐Ÿ’œ See below on standalone usage ๐Ÿ‘‡ + +![image_3](image_3.jpg) + +To fit large models and make it even faster and memory efficient, you can enable Flash Attention 2 and load model into 4-bit using bitsandbytes โšก๏ธ transformers makes it very easy to do this! See below ๐Ÿ‘‡ + +![image_4](image_4.jpg) + +If you want to try the code right away, here's the [notebook](https://t.co/NvoxvY9z1u). 
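If you'd rather see it in text form, here's a minimal sketch of standalone usage with Transformers (the Mistral-7B checkpoint and the [INST] prompt template below are one example; 4-bit loading additionally needs bitsandbytes and Flash Attention 2 needs flash-attn installed):

```python
import torch
from PIL import Image
from transformers import BitsAndBytesConfig, LlavaNextForConditionalGeneration, LlavaNextProcessor

model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # optional: 4-bit loading with bitsandbytes
    attn_implementation="flash_attention_2",                    # optional: Flash Attention 2
    device_map="auto",
)

image = Image.open("image.jpg")  # any local image
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"  # Mistral-style chat template

inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
```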
Lastly, you can directly play with the LLaVA-NeXT based on Mistral-7B through the demo [here](https://t.co/JTDlqMUwEh) ๐Ÿค— + +![video_1](video_1.mp4) + +> [!TIP] +Ressources: +[LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) +by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee (2024) +[GitHub](https://github.com/haotian-liu/LLaVA/tree/main) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/llava_next) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1770832875551682563) (March 21, 2024) \ No newline at end of file diff --git a/pages/LLaVA-NeXT/image_1.jpeg b/pages/LLaVA-NeXT/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..39adbf76d3d0c567b92effdc65d1fce50db6b5d9 Binary files /dev/null and b/pages/LLaVA-NeXT/image_1.jpeg differ diff --git a/pages/LLaVA-NeXT/image_2.jpeg b/pages/LLaVA-NeXT/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..529939bc6590519de33cd984a7dfd9beb94bfc16 Binary files /dev/null and b/pages/LLaVA-NeXT/image_2.jpeg differ diff --git a/pages/LLaVA-NeXT/image_3.jpeg b/pages/LLaVA-NeXT/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..a4668bd51dbbfdb1209adfb77b127c82f95b7895 Binary files /dev/null and b/pages/LLaVA-NeXT/image_3.jpeg differ diff --git a/pages/LLaVA-NeXT/image_4.jpeg b/pages/LLaVA-NeXT/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..b2d7e74936615b26d8ebc35d4092d4cee32a4d74 Binary files /dev/null and b/pages/LLaVA-NeXT/image_4.jpeg differ diff --git a/pages/LLaVA-NeXT/video_1.mp4 b/pages/LLaVA-NeXT/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..8e47ba5d9ee035b225516232f7ef54ee8897833b Binary files /dev/null and b/pages/LLaVA-NeXT/video_1.mp4 differ diff --git a/pages/Llava-NeXT-Interleave/Llava-NeXT-Interleave.md b/pages/Llava-NeXT-Interleave/Llava-NeXT-Interleave.md new file mode 100644 index 0000000000000000000000000000000000000000..defbad684b7abf9a6b969db5546d0e75cd6e1a57 --- /dev/null +++ b/pages/Llava-NeXT-Interleave/Llava-NeXT-Interleave.md @@ -0,0 +1,39 @@ +๏ปฟThe vision language model in this video is 0.5B and can take in image, video and 3D! ๐Ÿคฏ Llava-NeXT-Interleave is a new vision language model trained on interleaved image, video and 3D data keep reading โฅฅโฅฅ + +![video_1](video_1.jpg) + +This model comes with 0.5B, 7B and 7B-DPO variants, all can be used with Transformers ๐Ÿ˜ +[Collection of models](https://t.co/sZsaglSXa3) | [Demo](https://t.co/FbpaMWJY8k) +See how to use below ๐Ÿ‘‡๐Ÿป + +![image_1](image_1.jpg) + +Authors of this paper have explored training Llava-NeXT on interleaved data where the data consists of multiple modalities, including image(s), video, 3D ๐Ÿ“š +They have discovered that interleaved data increases results across all benchmarks! + +![image_2](image_2.jpg) + +The model can do task transfer from single image tasks to multiple images ๐Ÿคฏ The authors have trained the model on single images and code yet the model can solve coding with multiple images. 
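As a rough sketch of multi-image inference (assuming the llava-hf conversion of the interleave checkpoints loads with the standard Llava classes in Transformers; the checkpoint id and the Qwen-style prompt markers below are assumptions, so double-check the model cards in the collection above):

```python
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"  # assumed checkpoint id, see the collection above
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# one <image> placeholder per image; the chat markers are an assumption, check the model card
prompt = "<|im_start|>user <image><image>\nWhat is the difference between these two images?<|im_end|><|im_start|>assistant"
images = [Image.open("image_a.jpg"), Image.open("image_b.jpg")]

inputs = processor(images=images, text=prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(out[0], skip_special_tokens=True))
```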
+ +![image_3](image_3.jpg) + +Same applies to other modalities, see below for video: + +![image_4](image_4.jpg) + +The model also has document understanding capabilities and many real-world application areas + +![image_5](image_5.jpg) + +This release also comes with the dataset this model was fine-tuned on ๐Ÿ“– [M4-Instruct-Data](https://t.co/rutXMtNC0I) + +![image_6](image_6.jpg) + +> [!TIP] +Ressources: +[LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/) +by Feng Li, Renrui Zhang*, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, Chunyuan Li (2024) +[GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1813560292397203630) (July 17, 2024) \ No newline at end of file diff --git a/pages/Llava-NeXT-Interleave/image_1.jpg b/pages/Llava-NeXT-Interleave/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..565e4d4f1aacabec08a290b141ec7a24c7f8b65a Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_1.jpg differ diff --git a/pages/Llava-NeXT-Interleave/image_2.jpg b/pages/Llava-NeXT-Interleave/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2d3b5295fc880bc124ff52fe949389ebe7c0434a Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_2.jpg differ diff --git a/pages/Llava-NeXT-Interleave/image_3.jpg b/pages/Llava-NeXT-Interleave/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6932fa5368a8dc5c702fc26203e8c5d67182697c Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_3.jpg differ diff --git a/pages/Llava-NeXT-Interleave/image_4.jpg b/pages/Llava-NeXT-Interleave/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9a765ea6dbfff0edf1d4b2beeaa3dd3e91f22b5d Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_4.jpg differ diff --git a/pages/Llava-NeXT-Interleave/image_5.jpg b/pages/Llava-NeXT-Interleave/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6180371a72fd732699de94bc05d06b4f229d8539 Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_5.jpg differ diff --git a/pages/Llava-NeXT-Interleave/image_6.jpg b/pages/Llava-NeXT-Interleave/image_6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d0f03d353e60db0d39cfae84ae742cfba799fdf7 Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_6.jpg differ diff --git a/pages/Llava-NeXT-Interleave/video_1.mp4 b/pages/Llava-NeXT-Interleave/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..dd61907079b0a479ca9e253809bbe66a2964cb04 Binary files /dev/null and b/pages/Llava-NeXT-Interleave/video_1.mp4 differ diff --git a/pages/MobileSAM/MobileSAM.md b/pages/MobileSAM/MobileSAM.md new file mode 100644 index 0000000000000000000000000000000000000000..edeb658b719f37814df3c64879d1c4c9349cff14 --- /dev/null +++ b/pages/MobileSAM/MobileSAM.md @@ -0,0 +1,38 @@ +๏ปฟRead the MobileSAM paper this weekend ๐Ÿ“– Sharing some insights! +The idea ๐Ÿ’ก: SAM model consist of three parts, a heavy image encoder, a prompt encoder (prompt can be text, bounding box, mask or point) and a mask decoder. +To make the SAM model smaller without compromising from the performance, the authors looked into three types of distillation. 
+First one is distilling the decoder outputs directly (a more naive approach) with a completely randomly initialized small ViT and randomly initialized mask decoder. +However, when the ViT and the decoder are both in a bad state, this doesn't work well. + +![image_1](image_1.jpg) + +The second type of distillation is called semi-coupled, where the authors only randomly initialized the ViT image encoder and kept the mask decoder. +This is called semi-coupled because the image encoder distillation still depends on the mask decoder (see below ๐Ÿ‘‡ ) + +![image_2](image_2.jpg) + +The last type of distillation, decoupled distillation, is the most intuitive IMO. +The authors have "decoupled" image encoder altogether and have frozen the mask decoder and didn't really distill based on generated masks. +This makes sense as the bottleneck here is the encoder itself and most of the time, distillation works well with encoding. + +![image_3](image_3.jpg) + +Finally, they found out that decoupled distillation performs better than coupled distillation by means of mean IoU and requires much less compute! โ™ฅ๏ธ + +![image_4](image_4.jpg) + +Wanted to leave some links here if you'd like to try yourself ๐Ÿ‘‡ +- MobileSAM [demo](https://huggingface.co/spaces/dhkim2810/MobileSAMMobileSAM) +- Model [repository](https://huggingface.co/dhkim2810/MobileSAM) +If you'd like to experiment around TinyViT, timm library has a bunch of [checkpoints available](https://huggingface.co/models?sort=trending&search=timm%2Ftinyvit). + +![image_5](image_5.jpg) + +> [!TIP] +Ressources: +[Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289) +by Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, Choong Seon Hong (2023) +[GitHub](https://github.com/ChaoningZhang/MobileSAM) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1738959605542076863) (December 24, 2023) \ No newline at end of file diff --git a/pages/MobileSAM/image_1.jpeg b/pages/MobileSAM/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..ef60949c0d7df312dc478f8e55d609cd1d304456 Binary files /dev/null and b/pages/MobileSAM/image_1.jpeg differ diff --git a/pages/MobileSAM/image_2.jpg b/pages/MobileSAM/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..05c05faed896fe13547e17e9af6195b1a825768c Binary files /dev/null and b/pages/MobileSAM/image_2.jpg differ diff --git a/pages/MobileSAM/image_3.jpeg b/pages/MobileSAM/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..6bef1b50007b2a48c7538b918015ee40faa2241b Binary files /dev/null and b/pages/MobileSAM/image_3.jpeg differ diff --git a/pages/MobileSAM/image_4.jpg b/pages/MobileSAM/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f883304123672c22f38c11521f6298ee3b801244 Binary files /dev/null and b/pages/MobileSAM/image_4.jpg differ diff --git a/pages/MobileSAM/image_5.jpeg b/pages/MobileSAM/image_5.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..62b176d575b8e381be10fa4001abec4fcd90bbd9 Binary files /dev/null and b/pages/MobileSAM/image_5.jpeg differ diff --git a/pages/OWLv2/OWLv2.md b/pages/OWLv2/OWLv2.md new file mode 100644 index 0000000000000000000000000000000000000000..5ea5e41ed0d183042162a595e8fc52e38780413e --- /dev/null +++ b/pages/OWLv2/OWLv2.md @@ -0,0 +1,46 @@ +๏ปฟExplaining the ๐Ÿ‘‘ of zero-shot open-vocabulary object detection: OWLv2 ๐Ÿฆ‰๐Ÿงถ + +![image_1](image_1.jpg) 
+ +OWLv2 is a scaled-up version of a model called OWL-ViT, so let's take a look at that first. +๐Ÿ“ OWL-ViT is an open-vocabulary object detector, meaning it can detect objects it didn't explicitly see during training. +๐Ÿ‘€ What's cool is that it can take both image and text queries! This is thanks to the image and text features not being fused together. + +![image_2](image_2.jpg) + +Taking a look at the architecture, the authors first do contrastive pre-training of a vision and a text encoder (just like CLIP). +They take that model, remove the final pooling layer, attach a lightweight classification and box detection head, and fine-tune. + +![image_3](image_3.jpg) + +During fine-tuning for object detection, they calculate the loss over bipartite matches. +Simply put, the loss is calculated between predicted objects and ground-truth objects, and the goal is to find a perfect matching of these two sets where each prediction is matched to one ground-truth object. + +OWL-ViT is very scalable. +One can easily scale most language models or vision-language models because they require no manual supervision, but this isn't the case for object detection: you still need supervision. +Moreover, only scaling the encoders creates a bottleneck after a while. + +![image_1](image_1.jpg) + +The authors wanted to scale OWL-ViT with more data, so they used OWL-ViT for labelling to train a better detector: they "self-train" a new detector on the generated labels and then fine-tune the model on human-annotated data. (see below) + +![image_4](image_4.jpg) + +Thanks to this, OWLv2 scaled very well and tops leaderboards on open-vocabulary object detection ๐Ÿ‘‘ + +![image_5](image_5.jpg) + +Want to try OWL models? I've created a [notebook](https://t.co/ick5tA6nyx) for you to see how to use it with ๐Ÿค— Transformers. +If you want to play with it directly, you can use this [Space](https://t.co/oghdLOtoa5). +All the models and applications of the OWL series are in this [collection](https://huggingface.co/collections/merve/owl-series-65aaac3114e6582c300544df). 
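Here's a minimal sketch of what zero-shot detection looks like with the pipeline API (the ensemble checkpoint below is one of the released OWLv2 models):

```python
from PIL import Image
from transformers import pipeline

detector = pipeline(
    task="zero-shot-object-detection",
    model="google/owlv2-base-patch16-ensemble",
)

image = Image.open("street.jpg")  # any local image
predictions = detector(image, candidate_labels=["a person", "a bicycle", "a traffic light"])
for pred in predictions:
    # each prediction has a label, a confidence score and a bounding box in pixel coordinates
    print(pred["label"], round(pred["score"], 3), pred["box"])
```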
+ +> [!TIP] +Ressources: +[Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) +by Matthias Minderer, Alexey Gritsenko, Neil Houlsby (2023) +[GitHub](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/owlv2) + + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1748411972675150040) (January 19, 2024) diff --git a/pages/OWLv2/image_1.jpeg b/pages/OWLv2/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..72ee9ad988e520f29252428ae825af0cbeaf8181 Binary files /dev/null and b/pages/OWLv2/image_1.jpeg differ diff --git a/pages/OWLv2/image_2.jpeg b/pages/OWLv2/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..405c58b14f8c866d96805784d9d145de09d485f3 Binary files /dev/null and b/pages/OWLv2/image_2.jpeg differ diff --git a/pages/OWLv2/image_3.jpeg b/pages/OWLv2/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..c0c7921bf1b3a35087466bb990364605a85370b9 Binary files /dev/null and b/pages/OWLv2/image_3.jpeg differ diff --git a/pages/OWLv2/image_4.jpeg b/pages/OWLv2/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..ca8e3933972379a28b580280441035563ed13fd9 Binary files /dev/null and b/pages/OWLv2/image_4.jpeg differ diff --git a/pages/OWLv2/image_5.jpeg b/pages/OWLv2/image_5.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e7fba927ba4c152cdd1eb334707c8d744df477ae Binary files /dev/null and b/pages/OWLv2/image_5.jpeg differ diff --git a/pages/OneFormer/OneFormer.md b/pages/OneFormer/OneFormer.md new file mode 100644 index 0000000000000000000000000000000000000000..27d1602909a3fa452c5a9e35c40e6cafdb45319a --- /dev/null +++ b/pages/OneFormer/OneFormer.md @@ -0,0 +1,28 @@ +๏ปฟOneFormer: one model to segment them all? ๐Ÿคฏ +I was looking into paperswithcode leaderboards when I came across OneFormer for the first time so it was time to dig in! + +![image_1](image_1.jpg) + +OneFormer is a "truly universal" model for semantic, instance and panoptic segmentation tasks โš”๏ธ +What makes is truly universal is that it's a single model that is trained only once and can be used across all tasks ๐Ÿ‘‡ + +![image_2](image_2.jpg) + +The enabler here is the text conditioning, i.e. the model is given a text query that states task type along with the appropriate input, and using contrastive loss, the model learns the difference between different task types ๐Ÿ‘‡ + +![image_3](image_3.jpg) + +Thanks to ๐Ÿค— Transformers, you can easily use the model! 
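As a minimal sketch (using one of the released ADE20K checkpoints), you pass the task as a text input next to the image and post-process for the task you asked for:

```python
from PIL import Image
from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation

ckpt = "shi-labs/oneformer_ade20k_swin_tiny"
processor = OneFormerProcessor.from_pretrained(ckpt)
model = OneFormerForUniversalSegmentation.from_pretrained(ckpt)

image = Image.open("scene.jpg")  # any local image

# the task token ("semantic", "instance" or "panoptic") conditions the single model on the task
inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt")
outputs = model(**inputs)

# post-process into a (height, width) map of class ids
semantic_map = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(semantic_map.shape)
```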
I have drafted a [notebook](https://t.co/cBylk1Uv20) for you to try right away ๐Ÿ˜Š +You can also check out the [Space](https://t.co/31GxlVo1W5) without checking out the code itself + +![image_4](image_4.jpg) + +> [!TIP] +Ressources: +[OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) +by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi (2022) +[GitHub](https://github.com/SHI-Labs/OneFormer) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/oneformer) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1739707076501221608) (December 26, 2023) \ No newline at end of file diff --git a/pages/OneFormer/image_1.jpeg b/pages/OneFormer/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..6ec704afb2e0397037bdf778053e87dbd00e4fc4 Binary files /dev/null and b/pages/OneFormer/image_1.jpeg differ diff --git a/pages/OneFormer/image_2.jpeg b/pages/OneFormer/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..235fbca92c28d1ed82ec04fa375a2cf11b091fe3 Binary files /dev/null and b/pages/OneFormer/image_2.jpeg differ diff --git a/pages/OneFormer/image_3.jpeg b/pages/OneFormer/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..a6f76731199a868aeab4556e774c32daf12c9407 Binary files /dev/null and b/pages/OneFormer/image_3.jpeg differ diff --git a/pages/OneFormer/image_4.jpeg b/pages/OneFormer/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..f37500ae6e880ba6ccd56b72803a1761bc4a2bdf Binary files /dev/null and b/pages/OneFormer/image_4.jpeg differ diff --git a/pages/PLLaVA/PLLaVA .md b/pages/PLLaVA/PLLaVA .md new file mode 100644 index 0000000000000000000000000000000000000000..4ada3bc5fd9e1dc9501c41d1796565f276968626 --- /dev/null +++ b/pages/PLLaVA/PLLaVA .md @@ -0,0 +1,30 @@ +๏ปฟParameter-free LLaVA for video captioning works like magic! ๐Ÿคฉ Let's take a look! + +![image_1](image_1.jpg) + +Most of the video captioning models work by downsampling video frames to reduce computational complexity and memory requirements without losing a lot of information in the process. +PLLaVA on the other hand, uses pooling! ๐Ÿคฉ + +How? ๐Ÿง It takes in frames of video, passed to ViT and then projection layer, and then output goes through average pooling where input shape is (# frames, width, height, text decoder input dim) ๐Ÿ‘‡ + +![image_2](image_2.jpg) + +Pooling operation surprisingly reduces the loss of spatial and temporal information. 
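To make the idea concrete, here's a toy sketch of that kind of adaptive average pooling over frame features (made-up shapes, illustrative only, not the actual PLLaVA code):

```python
import torch
import torch.nn.functional as F

# made-up example: 16 frames of 24x24 visual tokens, each projected to a 4096-dim LLM embedding
frame_feats = torch.randn(16, 24, 24, 4096)             # (frames, height, width, hidden_dim)

x = frame_feats.permute(3, 0, 1, 2).unsqueeze(0)        # (1, hidden_dim, frames, height, width)
x = F.adaptive_avg_pool3d(x, output_size=(16, 12, 12))  # pool spatially, keep the temporal axis
pooled = x.squeeze(0).permute(1, 2, 3, 0)               # back to (frames, h', w', hidden_dim)

visual_tokens = pooled.flatten(0, 2)                    # (16*12*12, 4096) sequence fed to the LLM
print(visual_tokens.shape)
```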
See below some examples on how it can capture the details ๐Ÿค— + +![image_3](image_3.jpg) + +according to authors' findings, it performs way better than many of the existing models (including proprietary VLMs) and scales very well (on text decoder) + +![image_4](image_4.jpg) + +Model repositories ๐Ÿค— [7B](https://t.co/AeSdYsz1U7), [13B](https://t.co/GnI1niTxO7), [34B](https://t.co/HWAM0ZzvDc) +Spaces๐Ÿค— [7B](https://t.co/Oms2OLkf7O), [13B](https://t.co/C2RNVNA4uR) + +> [!TIP] +Ressources: +[PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994) +by Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024) +[GitHub](https://github.com/magic-research/PLLaVA) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1786336055425138939) (May 3, 2024) \ No newline at end of file diff --git a/pages/PLLaVA/image_1.jpg b/pages/PLLaVA/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b1982963ed57d3e93a1435cb9e8e87be1704a7ba Binary files /dev/null and b/pages/PLLaVA/image_1.jpg differ diff --git a/pages/PLLaVA/image_2.jpeg b/pages/PLLaVA/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..465a9e1c566028c066fee65a41183bb50bfbdce8 Binary files /dev/null and b/pages/PLLaVA/image_2.jpeg differ diff --git a/pages/PLLaVA/image_3.jpeg b/pages/PLLaVA/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..58444be4d809d573317af1eb384adcebf01c59f5 Binary files /dev/null and b/pages/PLLaVA/image_3.jpeg differ diff --git a/pages/PLLaVA/image_4.jpeg b/pages/PLLaVA/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..1feea29a228a4ab8595cb12d8b7f152cc6fc174d Binary files /dev/null and b/pages/PLLaVA/image_4.jpeg differ diff --git a/pages/Painter/Painter .md b/pages/Painter/Painter .md new file mode 100644 index 0000000000000000000000000000000000000000..245c9449afa07a24b41f5c17e83878f72d5ebbe6 --- /dev/null +++ b/pages/Painter/Painter .md @@ -0,0 +1,24 @@ +๏ปฟI read the Painter [paper](https://t.co/r3aHp29mjf) by BAAIBeijing to convert the weights to transformers, and I absolutely loved the approach they took so I wanted to take time to unfold it here! 
+ +![image_1](image_1.jpg) + +so essentially this model takes inspiration from in-context learning, as in, in LLMs you give an example input output and give the actual input that you want model to complete (one-shot learning) they adapted this to images, thus the name "images speak in images" + +this model doesn't have any multimodal parts, it just has an image encoder and a decoder head (linear layer, conv layer, another linear layer) so it's a single modality + +the magic sauce is the data: they input the task in the form of image and associated transformation and another image they want the transformation to take place and take smooth l2 loss over the predictions and ground truth this is like T5 of image models ๐Ÿ˜€ + +![image_2](image_2.jpg) + +What is so cool about it is that it can actually adapt to out of domain tasks, meaning, in below chart, it was trained on the tasks above the dashed line, and the authors found out it generalized to the tasks below the line, image tasks are well generalized ๐Ÿคฏ + +![image_3](image_3.jpg) + +> [!TIP] +Ressources: +[Images Speak in Images: A Generalist Painter for In-Context Visual Learning](https://arxiv.org/abs/2212.02499) +by Xinlong Wang, Wen Wang, Yue Cao, Chunhua Shen, Tiejun Huang (2022) +[GitHub](https://github.com/baaivision/Painter) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1771542172946354643) (March 23, 2024) \ No newline at end of file diff --git a/pages/Painter/image_1.jpeg b/pages/Painter/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..de15559d4c88c52c9dcf160c5f693b9dc3f784f5 Binary files /dev/null and b/pages/Painter/image_1.jpeg differ diff --git a/pages/Painter/image_2.jpeg b/pages/Painter/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..0ac63a33928ff9ce06650ff79f3a8eadeb9c13aa Binary files /dev/null and b/pages/Painter/image_2.jpeg differ diff --git a/pages/Painter/image_3.jpeg b/pages/Painter/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..97833003915047644e73bcf0650282eeff9ae1ac Binary files /dev/null and b/pages/Painter/image_3.jpeg differ diff --git a/pages/RT-DETR/RT-DETR.md b/pages/RT-DETR/RT-DETR.md new file mode 100644 index 0000000000000000000000000000000000000000..f64e5b8c1242c513d4449132d5d79a9a5968149b --- /dev/null +++ b/pages/RT-DETR/RT-DETR.md @@ -0,0 +1,34 @@ +๏ปฟReal-time DEtection Transformer (RT-DETR) landed in @huggingface transformers ๐Ÿคฉ with Apache 2.0 license ๐Ÿ˜ +Do DETRs Beat YOLOs on Real-time Object Detection? keep reading ๐Ÿ‘€ + +![video_1](video_1.mp4) + +Short answer, it does! +๐Ÿ“– [notebook](https://t.co/NNRpG9cAEa), ๐Ÿ”– [models](https://t.co/ctwWQqNcEt), ๐Ÿ”– [demo](https://t.co/VrmDDDjoNw) + +YOLO models are known to be super fast for real-time computer vision, but they have a downside with being volatile to NMS ๐Ÿฅฒ +Transformer-based models on the other hand are computationally not as efficient ๐Ÿฅฒ Isn't there something in between? Enter RT-DETR! 
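Here's a minimal sketch of running it with Transformers (the PekingU/rtdetr_r50vd checkpoint id is an assumption on my end; see the models link above for the released ones):

```python
import torch
from PIL import Image
from transformers import RTDetrForObjectDetection, RTDetrImageProcessor

ckpt = "PekingU/rtdetr_r50vd"  # assumed checkpoint id, check the models link above
processor = RTDetrImageProcessor.from_pretrained(ckpt)
model = RTDetrForObjectDetection.from_pretrained(ckpt)

image = Image.open("street.jpg")  # any local image
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# no NMS needed: the detection head directly outputs a fixed set of boxes
results = processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.5
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```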
+ +The authors combined a CNN backbone, a multi-stage hybrid encoder (combining convs and attn) and a transformer decoder โ‡“ + +![image_1](image_1.jpg) + +In the paper, the authors also claim one can adjust speed by changing the number of decoder layers without retraining altogether. They also conduct many ablation studies and try different decoders (see below). + +![image_2](image_2.jpg) + +The authors find that the model performs better in terms of both speed and accuracy compared to the previous state-of-the-art ๐Ÿคฉ + +![image_3](image_3.jpg) + +> [!TIP] +Ressources: +[DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069) +by Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen (2023) +[GitHub](https://github.com/lyuwenyu/RT-DETR/) +[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/rt_detr) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1807790959884665029) (July 1, 2024) \ No newline at end of file diff --git a/pages/RT-DETR/image_1.jpg b/pages/RT-DETR/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f1327e2043d79df9986726a7991ac6ae29fa1755 Binary files /dev/null and b/pages/RT-DETR/image_1.jpg differ diff --git a/pages/RT-DETR/image_2.jpg b/pages/RT-DETR/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3843ce411d841815591969ba2969a3220b2d0eda Binary files /dev/null and b/pages/RT-DETR/image_2.jpg differ diff --git a/pages/RT-DETR/image_3.jpg b/pages/RT-DETR/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4584fbe2239f41fc1e05e06f3d3fed132db8f580 Binary files /dev/null and b/pages/RT-DETR/image_3.jpg differ diff --git a/pages/RT-DETR/video_1.mp4 b/pages/RT-DETR/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..dab487c3591444cc565ecb4f816ee2ba901736fe --- /dev/null +++ b/pages/RT-DETR/video_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6655d13b0522985f24da98aa6abb732b2f87230c118d7d3d74c6712b091f71 +size 2649878 diff --git a/pages/SAMv2/SAMv2.md b/pages/SAMv2/SAMv2.md new file mode 100644 index 0000000000000000000000000000000000000000..012828a0344741b556a5ce38cb5c568c18d01644 --- /dev/null +++ b/pages/SAMv2/SAMv2.md @@ -0,0 +1,36 @@ +๏ปฟSAMv2 is just mindblowingly good ๐Ÿ˜ Learn what makes this model so good at video segmentation, keep reading ๐Ÿฆ†โ‡“ + +![video_1](video_1.mp4) + +Check out the [demo](https://t.co/35ixEZgPaf) by @skalskip92 to see how to use the model locally. +Check out Meta's [demo](https://t.co/Bcbli9Cfim) where you can edit segmented instances too! + +![image_1](image_1.jpg) + +However, SAM doesn't naturally track object instances in videos: one needs to prompt the same mask or point for that instance in every frame and feed each frame separately, which is infeasible ๐Ÿ˜” But don't fret, that is where SAMv2 comes in with a memory module! + +SAMv2 defines a new task called "masklet prediction", where a masklet refers to the same mask instance tracked throughout the frames ๐ŸŽž๏ธ Unlike SAM, the SAM 2 decoder is not fed the image embedding directly from the image encoder, but an embedding that attends to memories of prompted frames and object pointers. 
+ +![image_2](image_2.jpg) + +๐Ÿ–ผ๏ธ These "memories" are essentially past predictions of object of interest up to a number of recent frames, and are in form of feature maps of location info (spatial feature maps) ๐Ÿ‘‰๐Ÿป The object pointers are high level semantic information of the object of interest based on. + +Just like SAM paper SAMv2 depends on a data engine, and the dataset it generated comes with the release: SA-V ๐Ÿคฏ This dataset is gigantic, it has 190.9K manual masklet annotations and 451.7K automatic masklets! + +![image_3](image_3.jpg) + +Initially they apply SAM to each frame to assist human annotators to annotate a video at six FPS for high quality data, in the second phase they add SAM and SAM2 to generate masklets across time consistently Finally they use SAM2 to refine the masklets. + +They have evaluated this model on J&F score (Jaccard Index + F-measure for contour acc) which is used to evaluate zero-shot video segmentation benchmarks SAMv2 seems to outperform two previously sota models that are built on top of SAM! ๐Ÿฅน + +![image_4](image_4.jpg) + +> [!TIP] +Ressources: +[SAM 2: Segment Anything in Images and Videos]() +by Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman Rรคdle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Dollรกr, Christoph Feichtenhofer (2024) +[GitHub](https://github.com/facebookresearch/segment-anything-2) +[Hugging Face documentation]() + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1818675981634109701) (July 31, 2024) \ No newline at end of file diff --git a/pages/SAMv2/image_1.jpg b/pages/SAMv2/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..940e725eb2cd8f6aa75cdb90db442de18f1f68fb Binary files /dev/null and b/pages/SAMv2/image_1.jpg differ diff --git a/pages/SAMv2/image_2.jpg b/pages/SAMv2/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cf27c7fc476d3730357270e84e7f57fe400695a6 Binary files /dev/null and b/pages/SAMv2/image_2.jpg differ diff --git a/pages/SAMv2/image_3.jpg b/pages/SAMv2/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cacf6bc0c374958f55ae4eb22d6ee0f91a8ef77b Binary files /dev/null and b/pages/SAMv2/image_3.jpg differ diff --git a/pages/SAMv2/image_4.jpg b/pages/SAMv2/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..31c92169f7a556884e4b0aec6b0c38b904e72e4a Binary files /dev/null and b/pages/SAMv2/image_4.jpg differ diff --git a/pages/SAMv2/video_1.mp4 b/pages/SAMv2/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fb602d5996c0e8f5535a4cb084d4c38b5eddb839 Binary files /dev/null and b/pages/SAMv2/video_1.mp4 differ diff --git a/pages/SegGPT/SegGPT.md b/pages/SegGPT/SegGPT.md new file mode 100644 index 0000000000000000000000000000000000000000..e4d917b1ff0ab07327048313eb4e8250396bc15e --- /dev/null +++ b/pages/SegGPT/SegGPT.md @@ -0,0 +1,34 @@ +๏ปฟSegGPT is a vision generalist on image segmentation, quite like GPT for computer vision โœจ +It comes with the last release of ๐Ÿค— Transformers. +๐ŸŽ Technical details, demo and how-to's under this! 
+ +![image_1](image_1.jpg) + +SegGPT is an extension of the Painter, where you speak to images with images: the model takes in an image prompt, transformed version of the image prompt, the actual image you want to see the same transform, and expected to output the transformed image. + +SegGPT consists of a vanilla ViT with a decoder on top (linear, conv, linear). The model is trained on diverse segmentation examples, where they provide example image-mask pairs, the actual input to be segmented, and the decoder head learns to reconstruct the mask output. ๐Ÿ‘‡๐Ÿป + +![image_2](image_2.jpg) + +This generalizes pretty well! The authors do not claim state-of-the-art results as the model is mainly used zero-shot and few-shot inference. They also do prompt tuning, where they freeze the parameters of the model and only optimize the image tensor (the input context). + +![image_3](image_3.jpg) + +Thanks to ๐Ÿค— Transformers you can use this model easily! See [here](https://t.co/U5pVpBhkfK). + +![image_4](image_4.jpg) + +I have built an app for you to try it out. I combined SegGPT with Depth Anything Model, so you don't have to upload image mask prompts in your prompt pair ๐Ÿค— +Try it [here](https://t.co/uJIwqJeYUy). Also check out the [collection](https://t.co/HvfjWkAEzP). + +![image_5](image_5.jpg) + +> [!TIP] +Ressources: +[SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) +by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang (2023) +[GitHub](https://github.com/baaivision/Painter) + +> [!NOTE] +[Original tweet](https://x.com/mervenoyann/status/1773056450790666568) (March 27, 2024) + diff --git a/pages/SegGPT/image_1.jpeg b/pages/SegGPT/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..03e16ddfce8180ba6f9681d4fcb3031f93ff30d5 Binary files /dev/null and b/pages/SegGPT/image_1.jpeg differ diff --git a/pages/SegGPT/image_2.jpg b/pages/SegGPT/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4a7091327314a58e8eb97cd6ba63473b61db23ec Binary files /dev/null and b/pages/SegGPT/image_2.jpg differ diff --git a/pages/SegGPT/image_3.jpg b/pages/SegGPT/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2c38f1049ea41b1d4b2a3ed876b480ceafb21584 Binary files /dev/null and b/pages/SegGPT/image_3.jpg differ diff --git a/pages/SegGPT/image_4.jpeg b/pages/SegGPT/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..f196104473057b2f77c901102c44ce60c19fd3aa Binary files /dev/null and b/pages/SegGPT/image_4.jpeg differ diff --git a/pages/SegGPT/image_5.jpeg b/pages/SegGPT/image_5.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..53c314f0da8f1737acc2b2d3d7ba3331fff388c8 Binary files /dev/null and b/pages/SegGPT/image_5.jpeg differ diff --git a/pages/SigLIP/SigLIP.md b/pages/SigLIP/SigLIP.md new file mode 100644 index 0000000000000000000000000000000000000000..323e74c16a790e610e7525c843d67e54b5807d0e --- /dev/null +++ b/pages/SigLIP/SigLIP.md @@ -0,0 +1,40 @@ +๏ปฟSigLIP just got merged to ๐Ÿค—transformers and it's super easy to use! To celebrate this, I have created a repository on various SigLIP based projects! +But what is it and how does it work? SigLIP an vision-text pre-training technique based on contrastive learning. +It jointly trains an image encoder and text encoder such that the dot product of embeddings are most similar for the appropriate text-image pairs. 
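In code terms, the training objective looks roughly like this (a toy sketch with random embeddings and a fixed temperature and bias, not the authors' implementation):

```python
import torch
import torch.nn.functional as F

batch = 8
img_emb = F.normalize(torch.randn(batch, 512), dim=-1)   # image embeddings from the vision tower
txt_emb = F.normalize(torch.randn(batch, 512), dim=-1)   # text embeddings from the text tower

t, b = 10.0, -10.0                                        # temperature and bias (learned in the paper)
logits = img_emb @ txt_emb.t() * t + b                    # pairwise similarities

labels = 2 * torch.eye(batch) - 1                         # +1 for matching pairs, -1 for all others
loss = -F.logsigmoid(labels * logits).mean()              # every pair is an independent binary problem
print(loss)
```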
+The image below is taken from CLIP, where this contrastive pre-training takes place with softmax, but SigLIP replaces softmax with sigmoid. ๐Ÿ“Ž + +![image_1](image_1.jpg) + +Highlights โœจ +๐Ÿ–ผ๏ธ๐Ÿ“ Authors used a medium-sized B/16 ViT for the image encoder and a B-sized transformer for the text encoder +๐Ÿ˜ More performant than CLIP on zero-shot tasks +๐Ÿ—ฃ๏ธ Authors trained a multilingual model too! +โšก๏ธ Super efficient: the sigmoid loss enables batch sizes of up to 1M items, but the authors chose 32k (performance saturates after that, see below) + +![image_2](image_2.jpg) + +Below you can find prior CLIP models and SigLIP across different image encoder sizes and their performance on different datasets ๐Ÿ‘‡๐Ÿป + +![image_3](image_3.jpg) + +With the ๐Ÿค— Transformers integration comes the zero-shot-image-classification pipeline, which makes SigLIP super easy to use! + +![image_4](image_4.jpg) + +What to use SigLIP for? ๐Ÿง +Honestly the possibilities are endless, but you can use it for image/text retrieval, zero-shot classification, training multimodal models! +I have made a repository with notebooks and applications that are also hosted on [Spaces](https://t.co/Ah1CrHVuPY). +I have built ["Draw to Search Art"](https://t.co/DcmQWMc1qd) where you can input an image (upload one or draw) and search among 10k images in wikiart! +I've also built apps to [compare](https://t.co/m699TMvuW9) CLIP and SigLIP outputs. + +![image_5](image_5.jpg) + +> [!TIP] +Ressources: +[Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) +by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer (2023) +[GitHub](https://github.com/google-research/big_vision) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/siglip) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1745476609686089800) (January 11, 
2024) \ No newline at end of file diff --git a/pages/SigLIP/image_1.jpg b/pages/SigLIP/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0fd3aeaad1a19acc88487c75e8f97fff7575f01c Binary files /dev/null and b/pages/SigLIP/image_1.jpg differ diff --git a/pages/SigLIP/image_2.jpg b/pages/SigLIP/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ff186dd4abdc81fcfda6ffcf54724cb99b9bd56b Binary files /dev/null and b/pages/SigLIP/image_2.jpg differ diff --git a/pages/SigLIP/image_3.jpg b/pages/SigLIP/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..27e5d501bae71ed831bbb168cea1f2c4891ec01e Binary files /dev/null and b/pages/SigLIP/image_3.jpg differ diff --git a/pages/SigLIP/image_4.jpg b/pages/SigLIP/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e3fba6d6ddff448b95a4087f5cada65a60bd22a6 Binary files /dev/null and b/pages/SigLIP/image_4.jpg differ diff --git a/pages/SigLIP/image_5.jpg b/pages/SigLIP/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f9684cf65024fe63665694fd6f67317cee4fd0fd Binary files /dev/null and b/pages/SigLIP/image_5.jpg differ diff --git a/pages/SigLIP/image_6.jpeg b/pages/SigLIP/image_6.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..88c6a6244655c950d953fc2c5885755efecace43 Binary files /dev/null and b/pages/SigLIP/image_6.jpeg differ diff --git a/pages/VITMAE/VITMAE.md b/pages/VITMAE/VITMAE.md new file mode 100644 index 0000000000000000000000000000000000000000..ec69e7bc8d04bb118307d8e695639833767aae6b --- /dev/null +++ b/pages/VITMAE/VITMAE.md @@ -0,0 +1,31 @@ +๏ปฟJust read VitMAE paper, sharing some highlights ๐Ÿงถ ViTMAE is a simply yet effective self-supervised pre-training technique, where authors combined vision transformer with masked autoencoder. +The images are first masked (75 percent of the image!) and then the model tries to learn about the features through trying to reconstruct the original image! + +![image_1](image_1.jpg) + +The image is not masked, but rather only the visible patches are fed to the encoder (and that is the only thing encoder sees!). +Next, a mask token is added to where the masked patches are (a bit like BERT, if you will) and the mask tokens and encoded patches are fed to decoder. +The decoder then tries to reconstruct the original image. + +![image_2](image_2.jpg) + +As a result, the authors found out that high masking ratio works well in fine-tuning for downstream tasks and linear probing ๐Ÿคฏ๐Ÿคฏ + +![image_3](image_3.jpg) + +If you want to try the model or fine-tune, all the pre-trained VITMAE models released released by Meta are available on [Huggingface](https://t.co/didvTL9Zkm). +We've built a [demo](https://t.co/PkuACJiKrB) for you to see the intermediate outputs and reconstruction by VITMAE. + +Also there's a nice [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb) by [@NielsRogge](https://twitter.com/NielsRogge). 
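And here's a minimal sketch of poking at the reconstruction outputs yourself (using the facebook/vit-mae-base checkpoint):

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, ViTMAEForPreTraining

ckpt = "facebook/vit-mae-base"
processor = AutoImageProcessor.from_pretrained(ckpt)
model = ViTMAEForPreTraining.from_pretrained(ckpt)

image = Image.open("cat.jpg")  # any local image
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# logits hold the reconstructed pixel values per patch; mask marks which patches were hidden (1 = masked)
print(outputs.logits.shape)  # (batch, num_patches, patch_size**2 * 3)
print(int(outputs.mask.sum().item()), "of", outputs.mask.numel(), "patches were masked")
```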
+ +![image_4](image_4.jpg) + +> [!TIP] +Ressources: +[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v3) +by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollรกr, Ross Girshick (2021) +[GitHub](https://github.com/facebookresearch/mae) +[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/vit_mae) + +> [!NOTE] +[Original tweet](https://twitter.com/mervenoyann/status/1740688304784183664) (December 29, 2023) diff --git a/pages/VITMAE/image_1.jpeg b/pages/VITMAE/image_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e3ec2fb793c4aec42c5bd276c92d92c21190355c Binary files /dev/null and b/pages/VITMAE/image_1.jpeg differ diff --git a/pages/VITMAE/image_2.jpeg b/pages/VITMAE/image_2.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..c768c72863efcaaf31aa2da5026b0fa15a0fdc82 Binary files /dev/null and b/pages/VITMAE/image_2.jpeg differ diff --git a/pages/VITMAE/image_3.jpeg b/pages/VITMAE/image_3.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..1eaadcb6298bc045b1c631d63b28417955610bce Binary files /dev/null and b/pages/VITMAE/image_3.jpeg differ diff --git a/pages/VITMAE/image_4.jpeg b/pages/VITMAE/image_4.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..560d4ad4ae4df23a8f1b195257627db9bb403a51 Binary files /dev/null and b/pages/VITMAE/image_4.jpeg differ diff --git a/pages/Video-LLaVA/Video-LLaVA.md b/pages/Video-LLaVA/Video-LLaVA.md new file mode 100644 index 0000000000000000000000000000000000000000..51bac3ce0a42ffe0c80ee19b41943ca4e8a966b0 --- /dev/null +++ b/pages/Video-LLaVA/Video-LLaVA.md @@ -0,0 +1,32 @@ +๏ปฟWe have recently merged Video-LLaVA to @huggingface transformers! ๐Ÿค— +๐ŸŽž๏ธ What makes this model different? keep reading โ‡Š + +![video](video_1.mp4) + +[Demo](https://t.co/MVP14uEj9e) | [Model](https://t.co/oqSCMUqwJo) +See below how to initialize the model and processor and infer โฌ‡๏ธ + + +![image_1](image_1.jpg) + +Compared to other models that take image and video input and either project them separately or downsample the video and project selected frames, Video-LLaVA converts images and videos to a unified representation and projects them using a shared projection layer. + +![image_2](image_2.jpg) + +It uses Vicuna 1.5 as the language model and LanguageBind's own encoders, which are based on OpenCLIP; these encoders map the modalities to a unified representation before it is passed to the projection layer. + +![image_3](image_3.jpg) + +I feel like one of the coolest features of this model is joint image-video understanding, which many models have only introduced recently. It's a relatively older model, but it was ahead of its time and works very well! 
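For reference, initialization and inference look roughly like this (the LanguageBind/Video-LLaVA-7B-hf checkpoint id and the USER/ASSISTANT prompt format are what I recall from the model card, so double-check there; frame reading below uses PyAV):

```python
import av  # pyav, for decoding video frames
import numpy as np
import torch
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor

model_id = "LanguageBind/Video-LLaVA-7B-hf"
processor = VideoLlavaProcessor.from_pretrained(model_id)
model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# sample 8 roughly evenly spaced frames from a local clip
container = av.open("clip.mp4")
frames = [f.to_ndarray(format="rgb24") for f in container.decode(video=0)]
clip = np.stack(frames[:: max(1, len(frames) // 8)][:8])

prompt = "USER: <video>\nWhy is this video funny? ASSISTANT:"
inputs = processor(text=prompt, videos=clip, return_tensors="pt").to(model.device)

out = model.generate(**inputs, max_new_tokens=80)
print(processor.batch_decode(out, skip_special_tokens=True)[0])
```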
+ +![image_4](image_4.jpg) + +> [!TIP] +Ressources: +[Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122) +by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan (2023) +[GitHub](https://github.com/PKU-YuanGroup/Video-LLaVA) +[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/video_llava) + +> [!NOTE] +[Original tweet](https://x.com/mervenoyann/status/1816427325073842539) (July 25, 2024) \ No newline at end of file diff --git a/pages/Video-LLaVA/image_1.jpg b/pages/Video-LLaVA/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6ef2142413d91d3a62b3548be756e743c85eb63f Binary files /dev/null and b/pages/Video-LLaVA/image_1.jpg differ diff --git a/pages/Video-LLaVA/image_2.jpg b/pages/Video-LLaVA/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..63d46e29066a8d1deadb2637a52770d0cae00547 Binary files /dev/null and b/pages/Video-LLaVA/image_2.jpg differ diff --git a/pages/Video-LLaVA/image_3.jpg b/pages/Video-LLaVA/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3ea5e94b62652fbf17a1465aa4bdb7b6189bb5fe Binary files /dev/null and b/pages/Video-LLaVA/image_3.jpg differ diff --git a/pages/Video-LLaVA/image_4.jpg b/pages/Video-LLaVA/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..662e32e2d8c775c203b5fb3d93cfe31d2b8347c4 Binary files /dev/null and b/pages/Video-LLaVA/image_4.jpg differ diff --git a/pages/Video-LLaVA/video_1.mp4 b/pages/Video-LLaVA/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..fbf5f3e97502d4f06f433be09972bc87d11a60b9 Binary files /dev/null and b/pages/Video-LLaVA/video_1.mp4 differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..07aec85c48f5f456bae61c72a2608594f342ae9d --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +streamlit-extras \ No newline at end of file