diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..b07961f7ff7e7298b2d45abc3b32b8372552a9bc 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+pages/4M-21/video_1.mp4 filter=lfs diff=lfs merge=lfs -text
+pages/Depth[[:space:]]Anything/video_1.mp4 filter=lfs diff=lfs merge=lfs -text
+pages/RT-DETR/video_1.mp4 filter=lfs diff=lfs merge=lfs -text
diff --git a/Home.py b/Home.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7baa867f275f48d021719c1899e1979d01bf6dd
--- /dev/null
+++ b/Home.py
@@ -0,0 +1,16 @@
+import streamlit as st
+
+st.set_page_config(page_title="Home",page_icon="๐ ")
+
+# st.image("image_of_a_Turkish_lofi_girl_sitting_at_a_desk_writing_summaries_of_scientific_publications_ghibli_anime_like_hd.jpeg", use_column_width=True)
+
+st.write("# Vision Papers ๐")
+
+
+st.markdown(
+ """
+ I've created a simple Streamlit App where I list summaries of papers (my browser bookmarks or Twitter bookmarks were getting messy).
+ Since you're one of my sources for bibliography, I thought you might be interested in having all your summaries grouped together somewhere
+ (an average of 0.73 summaries per week; I don't know what your fuel is, but that's impressive).
+ """
+)
\ No newline at end of file
diff --git a/README.md b/README.md
index 062a46741fe717e1e5c6fa7d79f176eebbeb0826..1e63885b44afe7a61d0f3a09ca90a4af2c7e9066 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,11 @@
---
title: Vision Papers
-emoji: ๐
-colorFrom: yellow
-colorTo: indigo
+emoji: ๐ป
+colorFrom: indigo
+colorTo: blue
sdk: streamlit
sdk_version: 1.37.0
-app_file: app.py
+app_file: Home.py
pinned: false
---
diff --git a/pages/10_Painter.py b/pages/10_Painter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab097f9d446945a195355827f3411c5aa1b71d0a
--- /dev/null
+++ b/pages/10_Painter.py
@@ -0,0 +1,53 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Painter")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1771542172946354643) (March 23, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""I read the Painter [paper](https://t.co/r3aHp29mjf) by [BAAIBeijing](https://x.com/BAAIBeijing) to convert the weights to ๐ค Transformers, and I absolutely loved the approach they took so I wanted to take time to unfold it here!
+""")
+st.markdown(""" """)
+
+st.image("pages/Painter/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""So essentially this model takes inspiration from in-context learning: just like in LLMs, where you give an example input-output pair and then the actual input you want the model to complete (one-shot learning), they adapted the idea to images, thus the name "images speak in images".
+
+This model doesn't have any multimodal parts: it just has an image encoder and a decoder head (a linear layer, a conv layer, another linear layer), so it's single-modality.
+
+The magic sauce is the data: they input the task as an example image and its associated transformation, plus another image they want the transformation applied to, and take a smooth L2 loss over the predictions and the ground truth. This is like the T5 of image models ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/Painter/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""What is so cool about it is that it can actually adapt to out-of-domain tasks: in the chart below, the model was trained on the tasks above the dashed line, and the authors found that it generalized to the tasks below the line. Image tasks generalize well ๐คฏ
+""")
+st.markdown(""" """)
+
+st.image("pages/Painter/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Images Speak in Images: A Generalist Painter for In-Context Visual Learning](https://arxiv.org/abs/2212.02499)
+by Xinlong Wang, Wen Wang, Yue Cao, Chunhua Shen, Tiejun Huang (2022)
+[GitHub](https://github.com/baaivision/Painter)""", icon="๐")
+
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("LLaVA-NeXT")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("SegGPT")
\ No newline at end of file
diff --git a/pages/11_SegGPT.py b/pages/11_SegGPT.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8d5a366d1ee404028c6bb10f65a279f05560c0b
--- /dev/null
+++ b/pages/11_SegGPT.py
@@ -0,0 +1,70 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("SegGPT")
+
+st.success("""[Original tweet](https://x.com/mervenoyann/status/1773056450790666568) (March 27, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""SegGPT is a vision generalist for image segmentation, quite like GPT for computer vision โจ
+It comes with the latest release of ๐ค Transformers ๐
+Technical details, demo and how-to's below!
+""")
+st.markdown(""" """)
+
+st.image("pages/SegGPT/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""SegGPT is an extension of Painter where you speak to images with images: the model takes in an image prompt, a transformed version of that prompt, and the actual image you want the same transform applied to, and it is expected to output the transformed image.
+
+SegGPT consists of a vanilla ViT with a decoder on top (linear, conv, linear). The model is trained on diverse segmentation examples, where they provide example image-mask pairs, the actual input to be segmented, and the decoder head learns to reconstruct the mask output. ๐๐ป
+""", unsafe_allow_html=True)
+st.markdown(""" """)
+
+st.image("pages/SegGPT/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+This generalizes pretty well!
+The authors do not claim state-of-the-art results, as the model is mainly used for zero-shot and few-shot inference. They also do prompt tuning, where they freeze the parameters of the model and only optimize the image tensor (the input context).
+""")
+st.markdown(""" """)
+
+st.image("pages/SegGPT/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Thanks to ๐ค Transformers, you can use this model easily! See [here](https://t.co/U5pVpBhkfK); a rough usage sketch is also included below.
+""")
+st.markdown(""" """)
+
+st.image("pages/SegGPT/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+I have built an app for you to try it out. I combined SegGPT with the Depth Anything model, so you don't have to upload an image-mask pair as the prompt ๐ค
+Try it [here](https://t.co/uJIwqJeYUy). Also check out the [collection](https://t.co/HvfjWkAEzP).
+""")
+st.markdown(""" """)
+
+st.image("pages/SegGPT/image_5.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284)
+by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang (2023)
+[GitHub](https://github.com/baaivision/Painter)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Painter")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Grounding DINO")
\ No newline at end of file
diff --git a/pages/12_Grounding_DINO.py b/pages/12_Grounding_DINO.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c430e688011a08d8adffcffcf01c57ffa123f47
--- /dev/null
+++ b/pages/12_Grounding_DINO.py
@@ -0,0 +1,92 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Grounding DINO")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1780558859221733563) (April 17, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""
+We have merged Grounding DINO into ๐ค Transformers ๐ฆ
+It's an amazing zero-shot object detection model, here's why ๐งถ
+""")
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""There are two zero-shot object detection models as of now: one is the OWL series by Google Brain and the other one is Grounding DINO ๐ฆ
+Grounding DINO pays immense attention to detail โฌ๏ธ
+Also [try it yourself](https://t.co/UI0CMxphE7).
+""")
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_2.jpeg", use_column_width=True)
+st.image("pages/Grounding_DINO/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""I have also built another [application](https://t.co/4EHpOwEpm0) for GroundingSAM, combining GroundingDINO and Segment Anything by Meta for cutting edge zero-shot image segmentation.
+""")
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Grounding DINO is essentially a model that connects an image encoder (Swin Transformer), a text encoder (BERT) and, on top of both, a decoder that outputs bounding boxes ๐ฆ
+This is quite similar to the OWL series, which uses a ViT-based detector on CLIP.
+""", unsafe_allow_html=True)
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_5.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""The authors train Swin-L/T with BERT contrastively (not like CLIP, where images are matched to texts by overall similarity); instead, they try to align the region outputs with language phrases at the head outputs ๐คฉ
+""")
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_6.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""The authors also form the text features on the sub-sentence level.
+This means it extracts certain noun phrases from the training data to remove the influence between unrelated words while keeping fine-grained information.
+""")
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_7.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Thanks to all of this, Grounding DINO has great performance on various REC/object detection benchmarks ๐๐
+""")
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_8.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Thanks to ๐ค Transformers, you can use Grounding DINO very easily!
+You can also check out [NielsRogge](https://twitter.com/NielsRogge)'s [notebook here](https://t.co/8ADGFdVkta).
+""")
+st.markdown(""" """)
+
+st.image("pages/Grounding_DINO/image_9.jpeg", use_column_width=True)
+
+
+st.info("""Resources:
+[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)
+by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang (2023)
+[GitHub](https://github.com/IDEA-Research/GroundingDINO)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/grounding-dino)""", icon="๐")
+
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("SegGPT")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("DocOwl 1.5")
\ No newline at end of file
diff --git a/pages/13_DocOwl_1.5.py b/pages/13_DocOwl_1.5.py
new file mode 100644
index 0000000000000000000000000000000000000000..224da4322d4cf6ae6e81e08f4b8159ffdfe05898
--- /dev/null
+++ b/pages/13_DocOwl_1.5.py
@@ -0,0 +1,100 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("DocOwl 1.5")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1782421257591357824) (April 22, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""DocOwl 1.5 is the state-of-the-art document understanding model by Alibaba with Apache 2.0 license ๐๐
+Time to dive in and learn more ๐งถ
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""This model consists of a ViT-based visual encoder that takes in crops of the image along with the original image itself.
+The encoder outputs then go through a convolution-based module (see the conceptual sketch below); after that, they are merged with the text and fed to the LLM.
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Initially, the authors only train the convolution-based part (called H-Reducer) and the vision encoder while keeping the LLM frozen.
+Then, for fine-tuning (on image captioning, VQA, etc.), they freeze the vision encoder and train the H-Reducer and the LLM.
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""They also use a simple linear projection for text and documents. You can see below how they model the text prompts and outputs ๐ค
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""They train the model on various downstream tasks, including:
+- document understanding (DUE benchmark and more)
+- table parsing (TURL, PubTabNet)
+- chart parsing (PlotQA and more)
+- image parsing (OCR-CC)
+- text localization (DocVQA and more)
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_5.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+They contribute a new model called DocOwl 1.5-Chat by:
+1. creating a new document-chat dataset with questions from document VQA datasets
+2. feeding them to ChatGPT to get long answers
+3. fine-tuning the base model with it (which IMO works very well!)
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_6.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The resulting generalist model and the chat model are pretty much state-of-the-art ๐
+Below you can see how they compare to fine-tuned models.
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_7.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""All the models and the datasets (including some eval datasets for the above tasks!) are in this [organization](https://t.co/sJdTw1jWTR).
+Also check out the [Space](https://t.co/57E9DbNZXf).
+""")
+st.markdown(""" """)
+
+st.image("pages/DocOwl_1.5/image_8.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding](https://arxiv.org/abs/2403.12895)
+by Anwen Hu, Haiyang Xu, Jiabo Ye, Ming Yan, Liang Zhang, Bo Zhang, Chen Li, Ji Zhang, Qin Jin, Fei Huang, Jingren Zhou (2024)
+[GitHub](https://github.com/X-PLUG/mPLUG-DocOwl)""", icon="๐")
+
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Grounding DINO")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("PLLaVA")
\ No newline at end of file
diff --git a/pages/14_PLLaVA.py b/pages/14_PLLaVA.py
new file mode 100644
index 0000000000000000000000000000000000000000..d872a77b7c06ce1393c0d039ed62fb772a84621d
--- /dev/null
+++ b/pages/14_PLLaVA.py
@@ -0,0 +1,65 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("PLLaVA")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1786336055425138939) (May 3, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Parameter-free LLaVA for video captioning works like magic! ๐คฉ Let's take a look!
+""")
+st.markdown(""" """)
+
+st.image("pages/PLLaVA/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Most video captioning models work by downsampling video frames to reduce computational complexity and memory requirements without losing too much information in the process.
+PLLaVA, on the other hand, uses pooling! ๐คฉ
+
+How? ๐ง
+It takes in video frames, passes them through the ViT and then a projection layer, and the output goes through average pooling, where the input shape is (# frames, width, height, text decoder input dim) ๐ (see the toy sketch below)
+""")
+st.markdown(""" """)
+
+st.image("pages/PLLaVA/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""The pooling operation surprisingly reduces the loss of spatial and temporal information. See below some examples of how it can capture the details ๐ค
+""")
+st.markdown(""" """)
+
+st.image("pages/PLLaVA/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""According to the authors' findings, it performs way better than many of the existing models (including proprietary VLMs) and scales very well (with the text decoder).
+""")
+st.markdown(""" """)
+
+st.image("pages/PLLaVA/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Model repositories ๐ค [7B](https://t.co/AeSdYsz1U7), [13B](https://t.co/GnI1niTxO7), [34B](https://t.co/HWAM0ZzvDc)
+Spaces๐ค [7B](https://t.co/Oms2OLkf7O), [13B](https://t.co/C2RNVNA4uR)
+""")
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994)
+by Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024)
+[GitHub](https://github.com/magic-research/PLLaVA)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("DocOwl 1.5")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("CuMo")
\ No newline at end of file
diff --git a/pages/15_CuMo.py b/pages/15_CuMo.py
new file mode 100644
index 0000000000000000000000000000000000000000..70c10bee7062ef6d947c445df681a0669c30427a
--- /dev/null
+++ b/pages/15_CuMo.py
@@ -0,0 +1,61 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("CuMo")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1790665706205307191) (May 15, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""
+It's raining vision language models โ๏ธ
+CuMo is a new vision language model that has MoE in every step of the VLM (image encoder, MLP and text decoder) and uses Mistral-7B for the decoder part ๐ค
+""")
+st.markdown(""" """)
+
+st.image("pages/CuMo/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors first pre-train the MLP while keeping the image encoder and text decoder frozen, then warm up the whole network by unfreezing and fine-tuning it, which they state stabilizes the visual instruction tuning when the experts are brought in.
+""")
+st.markdown(""" """)
+
+st.image("pages/CuMo/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The mixture-of-experts MLP blocks above are simply copies of the single MLP that was trained during pre-training and fine-tuned in pre-finetuning (a conceptual sketch follows below) ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/CuMo/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+It works very well (I also tested it myself): it outperforms the previous SOTA of its size, LLaVA-NeXT! ๐
+I wonder how it would compare to IDEFICS2-8B. You can try it yourself [here](https://t.co/MLIYKVh5Ee).
+""", unsafe_allow_html=True)
+st.markdown(""" """)
+
+st.image("pages/CuMo/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts](https://arxiv.org/abs/2405.05949)
+by Jiachen Li, Xinyao Wang, Sijie Zhu, Chia-Wen Kuo, Lu Xu, Fan Chen, Jitesh Jain, Humphrey Shi, Longyin Wen (2024)
+[GitHub](https://github.com/SHI-Labs/CuMo)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("PLLaVA")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("DenseConnector")
\ No newline at end of file
diff --git a/pages/16_DenseConnector.py b/pages/16_DenseConnector.py
new file mode 100644
index 0000000000000000000000000000000000000000..258505464d24915003519fee2e59c4dc1275f2be
--- /dev/null
+++ b/pages/16_DenseConnector.py
@@ -0,0 +1,69 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("DenseConnector")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1796089181988352216) (May 30, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Do we fully leverage image encoders in vision language models? ๐
+A new paper built a dense connector that does it better! Let's dig in ๐งถ
+""")
+st.markdown(""" """)
+
+st.image("pages/DenseConnector/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+VLMs consist of an image encoder block, a projection layer that projects image embeddings into the text embedding space, and a text decoder, connected sequentially ๐
+This [paper](https://t.co/DPQzbj0eWm) explores using the intermediate states of the image encoder instead of a single output ๐คฉ
+""")
+st.markdown(""" """)
+
+st.image("pages/DenseConnector/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors explore three different ways of instantiating the dense connector: sparse token integration, sparse channel integration and dense channel integration (each of them just takes intermediate outputs and puts them together in a different way; see below and the conceptual sketch that follows).
+""")
+st.markdown(""" """)
+
+st.image("pages/DenseConnector/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+They integrate all three of them into LLaVA 1.5 and find that each of the new models is superior to the original LLaVA 1.5.
+""")
+st.markdown(""" """)
+
+st.image("pages/DenseConnector/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+I tried the [model](https://huggingface.co/spaces/HuanjinYao/DenseConnector-v1.5-8B) and it seems to work very well ๐ฅน
+The authors have released various [checkpoints](https://t.co/iF8zM2qvDa) based on different decoders (Vicuna 7/13B and Llama 3-8B).
+""")
+st.markdown(""" """)
+
+st.image("pages/DenseConnector/image_5.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Dense Connector for MLLMs](https://arxiv.org/abs/2405.13800)
+by Huanjin Yao, Wenhao Wu, Taojiannan Yang, YuXin Song, Mengxi Zhang, Haocheng Feng, Yifan Sun, Zhiheng Li, Wanli Ouyang, Jingdong Wang (2024)
+[GitHub](https://github.com/HJYao00/DenseConnector)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("CuMo")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Depth Anything v2")
\ No newline at end of file
diff --git a/pages/17_Depth_Anything_V2.py b/pages/17_Depth_Anything_V2.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a4fa1698c33af8c0343e37f4821ba4f0b36f8e2
--- /dev/null
+++ b/pages/17_Depth_Anything_V2.py
@@ -0,0 +1,74 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Depth Anything V2")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1803063120354492658) (June 18, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""
+I love Depth Anything V2 ๐
+It's Depth Anything, but scaled with both a larger teacher model and a gigantic dataset! Let's unpack ๐ค๐งถ!
+""", unsafe_allow_html=True)
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything_v2/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors analyzed Marigold, a diffusion-based model, against Depth Anything and found out what's up with using synthetic images vs real images for monocular depth estimation (MDE):
+๐ Real data has a lot of label noise and inaccurate depth maps (caused by depth sensors missing transparent objects, etc.)
+๐ Synthetic data has more precise and detailed depth labels that are truly ground truth, but there's a distribution shift between real and synthetic images, and it has restricted scene coverage
+""")
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything_v2/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors train different image encoders only on synthetic images and find that unless the encoder is very large, the model can't generalize well (though large models generalize inherently anyway) ๐ง
+But even then, the models still fail on real images that have a wide distribution in labels ๐ฅฒ
+""")
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything_v2/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The Depth Anything V2 framework is to...
+๐ฆ Train a teacher model based on DINOv2-G on 595K synthetic images
+๐ท๏ธ Label 62M real images using the teacher model
+๐ฆ Train a student model using the real images labelled by the teacher
+Result: 10x faster and more accurate than Marigold!
+""")
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything_v2/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors also construct a new benchmark called DA-2K that is less noisy, highly detailed and more diverse!
+I have created a [collection](https://t.co/3fAB9b2sxi) that has the models, the dataset, the demo and a CoreML-converted model ๐ (a rough inference sketch follows below)
+""")
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Depth Anything V2](https://arxiv.org/abs/2406.09414)
+by Lihe Yang, Bingyi Kang, Zilong Huang, Zhen Zhao, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024)
+[GitHub](https://github.com/DepthAnything/Depth-Anything-V2)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything_v2)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("DenseConnector")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Florence-2")
\ No newline at end of file
diff --git a/pages/18_Florence-2.py b/pages/18_Florence-2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b702a84eb3f7661fc3e90861103ea41473b06249
--- /dev/null
+++ b/pages/18_Florence-2.py
@@ -0,0 +1,78 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Florence-2")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1803769866878623819) (June 20, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Florence-2 is a new vision foundation model by Microsoft capable of a wide variety of tasks ๐คฏ
+Let's unpack! ๐งถ
+""")
+st.markdown(""" """)
+
+st.image("pages/Florence-2/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+This model can handle tasks that vary from document understanding to semantic segmentation ๐คฉ
+[Demo](https://t.co/7YJZvjhw84) | [Collection](https://t.co/Ub7FGazDz1)
+""")
+st.markdown(""" """)
+
+st.image("pages/Florence-2/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The difference from previous models is that the authors have compiled a dataset that consists of 126M images with 5.4B annotations labelled with their own data engine โโ
+""")
+st.markdown(""" """)
+
+st.image("pages/Florence-2/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The dataset also offers more variety in annotations compared to other datasets: it has region-level and image-level annotations, with more variety in semantic granularity as well!
+""")
+st.markdown(""" """)
+
+st.image("pages/Florence-2/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The model has a similar architecture to previous models: an image encoder, and a multimodality encoder with a text decoder.
+The authors have compiled the multitask dataset with prompts for each task, which makes the model trainable on multiple tasks (see the usage sketch below) ๐ค
+""")
+st.markdown(""" """)
+
+st.image("pages/Florence-2/image_5.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+You can also fine-tune this model on any task of your choice. The authors released results on different downstream tasks and report them with the vision encoder frozen and unfrozen ๐ค๐
+They have released fine-tuned models too, you can find them in the collection above ๐ค
+""")
+st.markdown(""" """)
+
+st.image("pages/Florence-2/image_6.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242)
+by Bin Xiao, Haiping Wu, Weijian Xu, Xiyang Dai, Houdong Hu, Yumao Lu, Michael Zeng, Ce Liu, Lu Yuan (2023)
+[Hugging Face blog post](https://huggingface.co/blog/finetune-florence2)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Depth Anything V2")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("4M-21")
\ No newline at end of file
diff --git a/pages/19_4M-21.py b/pages/19_4M-21.py
new file mode 100644
index 0000000000000000000000000000000000000000..59ff57899ce8c0eef3dfd2a18c8ae6fa31b20e0f
--- /dev/null
+++ b/pages/19_4M-21.py
@@ -0,0 +1,70 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("4M-21")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1804138208814309626) (June 21, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""
+EPFL and Apple just released 4M-21: a single any-to-any model that can do anything from text-to-image generation to generating depth masks! ๐
+Let's unpack ๐งถ
+""")
+st.markdown(""" """)
+
+st.image("pages/4M-21/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""4M is a multimodal training [framework](https://t.co/jztLublfSF) introduced by Apple and EPFL.
+The resulting model takes in image and text and outputs image and text ๐คฉ
+[Models](https://t.co/1LC0rAohEl) | [Demo](https://t.co/Ra9qbKcWeY)
+""")
+st.markdown(""" """)
+
+st.video("pages/4M-21/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.markdown("""
+This model consists of a transformer encoder and decoder, where the key to multimodality lies in the input and output data:
+input and output tokens are decoded to generate bounding boxes, the pixels of generated images, captions and more!
+""")
+st.markdown(""" """)
+
+st.image("pages/4M-21/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+This model also learnt to generate canny maps, SAM edges and other things for steerable text-to-image generation ๐ผ๏ธ
+The authors only added image-to-all capabilities for the demo, but you can try to use this model for text-to-image generation as well โบ๏ธ
+""")
+st.markdown(""" """)
+
+st.image("pages/4M-21/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+On the project page you can also see the model's text-to-image and steered generation capabilities, with the model's own outputs as control masks!
+""")
+st.markdown(""" """)
+
+st.video("pages/4M-21/video_2.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[4M-21: An Any-to-Any Vision Model for Tens of Tasks and Modalities](https://arxiv.org/abs/2406.09406) by Roman Bachmann, Oğuzhan Fatih Kar, David Mizrahi, Ali Garjani, Mingfei Gao, David Griffiths, Jiaming Hu, Afshin Dehghan, Amir Zamir (2024)
+[GitHub](https://github.com/apple/ml-4m/)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Florence-2")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("RT-DETR")
\ No newline at end of file
diff --git a/pages/1_MobileSAM.py b/pages/1_MobileSAM.py
new file mode 100644
index 0000000000000000000000000000000000000000..80e21e5e4cc13211869262ef9775baf217b32187
--- /dev/null
+++ b/pages/1_MobileSAM.py
@@ -0,0 +1,79 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("MobileSAM")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1738959605542076863) (December 24, 2023)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Read the MobileSAM paper this weekend ๐ Sharing some insights!
+The idea ๐ก: the SAM model consists of three parts: a heavy image encoder, a prompt encoder (the prompt can be text, a bounding box, a mask or a point) and a mask decoder.
+
+To make the SAM model smaller without compromising performance, the authors looked into three types of distillation.
+The first one is distilling the decoder outputs directly (a more naive approach) with a completely randomly initialized small ViT and a randomly initialized mask decoder.
+However, when the ViT and the decoder are both in a bad state, this doesn't work well.
+""")
+st.markdown(""" """)
+
+st.image("pages/MobileSAM/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The second type of distillation is called semi-coupled, where the authors only randomly initialized the ViT image encoder and kept the mask decoder.
+This is called semi-coupled because the image encoder distillation still depends on the mask decoder (see below ๐)
+""")
+st.markdown(""" """)
+
+st.image("pages/MobileSAM/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The last type of distillation, [decoupled distillation](https://openaccess.thecvf.com/content/CVPR2022/papers/Zhao_Decoupled_Knowledge_Distillation_CVPR_2022_paper.pdf), is the most intuitive IMO.
+The authors have "decoupled" the image encoder distillation altogether: they froze the mask decoder and didn't distill based on generated masks at all (see the toy sketch below).
+This makes sense, as the bottleneck here is the encoder itself and, most of the time, distillation works well with encodings.
+""")
+st.markdown(""" """)
+
+st.image("pages/MobileSAM/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Finally, they found out that decoupled distillation performs better than coupled distillation in terms of mean IoU and requires much less compute! โฅ๏ธ
+""")
+st.markdown(""" """)
+
+st.image("pages/MobileSAM/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Wanted to leave some links here if you'd like to try it yourself ๐
+- MobileSAM [demo](https://huggingface.co/spaces/dhkim2810/MobileSAMMobileSAM)
+- Model [repository](https://huggingface.co/dhkim2810/MobileSAM)
+
+If you'd like to experiment with TinyViT, the [timm library](https://huggingface.co/docs/timm/index) ([Ross Wightman](https://x.com/wightmanr)) has a bunch of [checkpoints available](https://huggingface.co/models?sort=trending&search=timm%2Ftinyvit).
+""")
+st.markdown(""" """)
+
+st.image("pages/MobileSAM/image_5.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+
+st.info("""
+Resources:
+[Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289)
+by Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, Choong Seon Hong (2023)
+[GitHub](https://github.com/ChaoningZhang/MobileSAM)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3= st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Home")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("OneFormer")
\ No newline at end of file
diff --git a/pages/20_RT-DETR.py b/pages/20_RT-DETR.py
new file mode 100644
index 0000000000000000000000000000000000000000..9226f4b4b371754c867c0a624d24bc8d5b4c93a0
--- /dev/null
+++ b/pages/20_RT-DETR.py
@@ -0,0 +1,67 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("RT-DETR")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1807790959884665029) (July 1, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Real-time DEtection Transformer (RT-DETR) landed in ๐ค Transformers with Apache 2.0 license ๐
+Do DETRs Beat YOLOs on Real-time Object Detection? Keep reading ๐
+""")
+st.markdown(""" """)
+
+st.video("pages/RT-DETR/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.markdown("""
+Short answer, it does! ๐ [notebook](https://t.co/NNRpG9cAEa), ๐ [models](https://t.co/ctwWQqNcEt), ๐ [demo](https://t.co/VrmDDDjoNw)
+
+YOLO models are known to be super fast for real-time computer vision, but they have a downside: they are sensitive to NMS post-processing ๐ฅฒ
+Transformer-based models, on the other hand, are not as computationally efficient ๐ฅฒ
+Isn't there something in between? Enter RT-DETR!
+
+The authors combined a CNN backbone and a multi-stage hybrid encoder (combining convs and attention) with a transformer decoder โ
+""")
+st.markdown(""" """)
+
+st.image("pages/RT-DETR/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+In the paper, the authors also claim that one can adjust inference speed by changing the number of decoder layers without retraining altogether.
+They also conduct many ablation studies and try different decoders.
+""")
+st.markdown(""" """)
+
+st.image("pages/RT-DETR/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors find that the model performs better in terms of speed and accuracy compared to the previous state-of-the-art ๐คฉ
+""")
+st.markdown(""" """)
+
+st.image("pages/RT-DETR/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069)
+by Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen (2023)
+[GitHub](https://github.com/lyuwenyu/RT-DETR/)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/rt_detr)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("4M-21")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Llava-NeXT-Interleave")
\ No newline at end of file
diff --git a/pages/21_Llava-NeXT-Interleave.py b/pages/21_Llava-NeXT-Interleave.py
new file mode 100644
index 0000000000000000000000000000000000000000..39ec59ce3607368df17b2bf14ec0ecf8cdec221b
--- /dev/null
+++ b/pages/21_Llava-NeXT-Interleave.py
@@ -0,0 +1,86 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Llava-NeXT-Interleave")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1813560292397203630) (July 17, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""The vision language model in this video is 0.5B and can take in image, video and 3D! ๐คฏ
+Llava-NeXT-Interleave is a new vision language model trained on interleaved image, video and 3D data. Keep reading โฅฅโฅฅ
+""")
+st.markdown(""" """)
+
+st.video("pages/Llava-NeXT-Interleave/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.markdown("""This model comes in 0.5B, 7B and 7B-DPO variants, all of which can be used with Transformers ๐
+[Collection of models](https://t.co/sZsaglSXa3) | [Demo](https://t.co/FbpaMWJY8k)
+See how to use below ๐๐ป
+""")
+st.markdown(""" """)
+
+st.image("pages/Llava-NeXT-Interleave/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors of this paper have explored training LLaVA-NeXT on interleaved data, where the data consists of multiple modalities, including image(s), video and 3D ๐
+They have discovered that interleaved data improves results across all benchmarks!
+""", unsafe_allow_html=True)
+st.markdown(""" """)
+
+st.image("pages/Llava-NeXT-Interleave/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The model can transfer tasks from single images to multiple images ๐คฏ
+The authors have trained the model on single images and code, yet the model can solve coding tasks with multiple images.
+""")
+st.markdown(""" """)
+
+st.image("pages/Llava-NeXT-Interleave/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Same applies to other modalities, see below for video:
+""")
+st.markdown(""" """)
+
+st.image("pages/Llava-NeXT-Interleave/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The model also has document understanding capabilities and many real-world application areas.
+""")
+st.markdown(""" """)
+
+st.image("pages/Llava-NeXT-Interleave/image_5.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+This release also comes with the dataset this model was fine-tuned on ๐ [M4-Instruct-Data](https://t.co/rutXMtNC0I)
+""")
+st.markdown(""" """)
+
+st.image("pages/Llava-NeXT-Interleave/image_6.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/)
+by Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, Chunyuan Li (2024)
+[GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("RT-DETR")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Chameleon")
\ No newline at end of file
diff --git a/pages/22_Chameleon.py b/pages/22_Chameleon.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6cc368034b2714c4270593275792af0ac5347ca
--- /dev/null
+++ b/pages/22_Chameleon.py
@@ -0,0 +1,88 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Chameleon")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1814278511785312320) (July 19, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Chameleon ๐ฆ by Meta is now available in ๐ค Transformers.
+A multimodal model that comes in 7B and 34B sizes ๐คฉ
+But what makes this model so special? Keep reading โฃ
+""")
+st.markdown(""" """)
+
+st.video("pages/Chameleon/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.markdown("""
+[Demo](https://t.co/GsGE17fSdI) | [Models](https://t.co/cWUiVbsRz6)
+Find below the API to load this model locally and use it โฌ๏ธ
+""")
+st.markdown(""" """)
+
+st.image("pages/Chameleon/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Chameleon is a unique model: it attempts to scale early fusion ๐คจ
+But what is early fusion?
+Modern vision language models use a vision encoder with a projection layer to project image embeddings so they can be used as prompts for the text decoder.""")
+st.markdown(""" """)
+
+st.image("pages/Chameleon/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Early fusion, on the other hand, attempts to fuse all features together (image patches and text) by using an image tokenizer: all tokens are projected into a shared space, which enables seamless generation ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/Chameleon/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors have also introduced different architectural improvements (QK norm and revised placement of layer norms) for scalable and stable training.
+This way they were able to increase the token count (5x the tokens compared to Llama 3, which is a must with early fusion IMO).
+""")
+st.markdown(""" """)
+
+st.image("pages/Chameleon/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+This model is an any-to-any model thanks to early fusion: it can take image and text input and output image and text, but image generation is disabled to prevent malicious use.
+""")
+st.markdown(""" """)
+
+st.image("pages/Chameleon/image_5.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+One can also do text-only prompting: the authors noted that the model catches up with larger LLMs, and you can also see how it compares to VLMs with image-text prompting.
+""")
+st.markdown(""" """)
+
+st.image("pages/Chameleon/image_6.jpg", use_column_width=True)
+st.image("pages/Chameleon/image_6.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818)
+by Chameleon Team (2024)
+[GitHub](https://github.com/facebookresearch/chameleon)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Llava-NeXT-Interleave")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Video-LLaVA")
\ No newline at end of file
diff --git a/pages/23_Video-LLaVA.py b/pages/23_Video-LLaVA.py
new file mode 100644
index 0000000000000000000000000000000000000000..3434308320c15ecd1f40507cbc6f8edc7e95fb0a
--- /dev/null
+++ b/pages/23_Video-LLaVA.py
@@ -0,0 +1,70 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Video-LLaVA")
+
+st.success("""[Original tweet](https://x.com/mervenoyann/status/1816427325073842539) (July 25, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""We have recently merged Video-LLaVA to ๐ค Transformers! ๐๏ธ
+What makes this model different? Keep reading โ
+""")
+st.markdown(""" """)
+
+st.video("pages/Video-LLaVA/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.markdown("""[Demo](https://t.co/MVP14uEj9e) | [Model](https://t.co/oqSCMUqwJo)
+See below how to initialize the model and processor and infer โฌ๏ธ
+""")
+st.markdown(""" """)
+
+st.image("pages/Video-LLaVA/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Compared to other models that take image and video input and either project them separately or downsample the video and project selected frames, Video-LLaVA converts images and videos to a unified representation and projects them using a shared projection layer.
+""")
+st.markdown(""" """)
+
+st.image("pages/Video-LLaVA/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+It uses Vicuna 1.5 as the language model and LanguageBind's own encoders, which are based on OpenCLIP; these encoders project the modalities to a unified representation before passing them to the projection layer.
+""")
+st.markdown(""" """)
+
+st.image("pages/Video-LLaVA/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+I feel like one of the coolest features of this model is the joint understanding, which has also been introduced recently in many models.
+It's a relatively older model, but it was ahead of its time and works very well!
+""")
+st.markdown(""" """)
+
+st.image("pages/Video-LLaVA/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122)
+by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan (2023)
+[GitHub](https://github.com/PKU-YuanGroup/Video-LLaVA)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/video_llava)
+""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Chameleon")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("SAMv2")
\ No newline at end of file
diff --git a/pages/24_SAMv2.py b/pages/24_SAMv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a26086dddde5749508ae1cd44083ea509b34310
--- /dev/null
+++ b/pages/24_SAMv2.py
@@ -0,0 +1,88 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("SAMv2")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1818675981634109701) (July 31, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""SAMv2 is just mind-blowingly good ๐
+Learn what makes this model so good at video segmentation. Keep reading ๐ฆโ
+""")
+st.markdown(""" """)
+
+col1, col2, col3 = st.columns(3)
+with col2:
+ st.video("pages/SAMv2/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.markdown("""
+Check out the [demo](https://t.co/35ixEZgPaf) by [skalskip92](https://x.com/skalskip92) to see how to use the model locally.
+Check out Meta's [demo](https://t.co/Bcbli9Cfim) where you can edit segmented instances too!
+
+The Segment Anything Model (SAM) by Meta was released as a universal segmentation model in which you could give a box or point prompt to segment the object of interest.
+SAM consists of an image encoder to encode images and a prompt encoder to encode prompts; the outputs of these two are then given to a mask decoder to generate masks.
+""")
+st.markdown(""" """)
+
+st.image("pages/SAMv2/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+However, SAM doesn't naturally track object instances in videos: one would need to give the same mask or point prompt for that instance in every frame and feed the frames one by one, which is infeasible ๐
+But don't fret, that is where SAMv2 comes in with a memory module!
+
+SAMv2 defines a new task called "masklet prediction", where a masklet refers to the same mask instance throughout the frames ๐๏ธ
+Unlike SAM, the SAM 2 decoder is not fed the image embedding directly from the image encoder, but rather attention over memories of prompted frames and object pointers.
+""")
+st.markdown(""" """)
+
+st.image("pages/SAMv2/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+๐ผ๏ธ These "memories" are essentially the past predictions of the object of interest over a number of recent frames,
+and they are in the form of feature maps with location info (spatial feature maps).
+๐๐ป The object pointers are high-level semantic information about the object of interest.
+
+Just like the SAM paper, SAMv2 depends on a data engine, and the dataset it generated comes with the release: SA-V ๐คฏ
+This dataset is gigantic: it has 190.9K manual masklet annotations and 451.7K automatic masklets!
+""")
+st.markdown(""" """)
+
+st.image("pages/SAMv2/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Initially, they apply SAM to each frame to assist human annotators in annotating a video at six FPS for high-quality data;
+in the second phase they add SAM and SAM 2 to generate masklets consistently across time. Finally, they use SAM 2 to refine the masklets.
+
+They evaluated this model with the J&F score (Jaccard index + F-measure for contour accuracy), which is used in zero-shot
+video segmentation benchmarks.
+SAMv2 seems to outperform the two previously SOTA models that are built on top of SAM! ๐ฅน
+""")
+st.markdown(""" """)
+
+st.image("pages/SAMv2/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[SAM 2: Segment Anything in Images and Videos]()
+by Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman Rädle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Dollár, Christoph Feichtenhofer (2024)
+[GitHub](https://github.com/facebookresearch/segment-anything-2)
+[Hugging Face documentation]()""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Video-LLaVA")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Home")
\ No newline at end of file
diff --git a/pages/2_Oneformer.py b/pages/2_Oneformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6b2996b0e8cec6324ee7313ab998633e47a931f
--- /dev/null
+++ b/pages/2_Oneformer.py
@@ -0,0 +1,62 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("OneFormer")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1739707076501221608) (December 26, 2023)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""
+OneFormer: one model to segment them all? ๐คฏ
+I was looking into Papers with Code leaderboards when I came across OneFormer for the first time, so it was time to dig in!
+""")
+st.markdown(""" """)
+
+st.image("pages/OneFormer/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""OneFormer is a "truly universal" model for semantic, instance and panoptic segmentation tasks โ๏ธ
+What makes it truly universal is that it's a single model that is trained only once and can be used across all tasks ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/OneFormer/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The enabler here is the text conditioning, i.e. the model is given a text query that states the task type along with the appropriate input, and using a contrastive loss, the model learns the difference between different task types ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/OneFormer/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Thanks to ๐ค Transformers, you can easily use the model!
+I have drafted a [notebook](https://t.co/cBylk1Uv20) for you to try right away ๐
+You can also check out the [Space](https://t.co/31GxlVo1W5) without checking out the code itself.
+""")
+st.markdown(""" """)
+
+st.image("pages/OneFormer/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220)
+by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi (2022)
+[GitHub](https://github.com/SHI-Labs/OneFormer)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/oneformer)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("MobileSAM")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("VITMAE")
\ No newline at end of file
diff --git a/pages/3_VITMAE.py b/pages/3_VITMAE.py
new file mode 100644
index 0000000000000000000000000000000000000000..582c71f3a0f4f3e77f71afbd658a3a18dfea9869
--- /dev/null
+++ b/pages/3_VITMAE.py
@@ -0,0 +1,63 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("VITMAE")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1740688304784183664) (December 29, 2023)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Just read VitMAE paper, sharing some highlights ๐งถ
+ViTMAE is a simply yet effective self-supervised pre-training technique, where authors combined vision transformer with masked autoencoder.
+The images are first masked (75 percent of the image!) and then the model tries to learn about the features through trying to reconstruct the original image!
+""")
+st.markdown(""" """)
+
+st.image("pages/VITMAE/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""The image is not masked, but rather only the visible patches are fed to the encoder (and that is the only thing encoder sees!).
+Next, a mask token is added to where the masked patches are (a bit like BERT, if you will) and the mask tokens and encoded patches are fed to decoder.
+The decoder then tries to reconstruct the original image.
+""")
+st.markdown(""" """)
+
+st.image("pages/VITMAE/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""As a result, the authors found out that high masking ratio works well in fine-tuning for downstream tasks and linear probing ๐คฏ๐คฏ
+""")
+st.markdown(""" """)
+
+st.image("pages/VITMAE/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""If you want to try the model or fine-tune, all the pre-trained VITMAE models released released by Meta are available on [Huggingface](https://t.co/didvTL9Zkm).
+We've built a [demo](https://t.co/PkuACJiKrB) for you to see the intermediate outputs and reconstruction by VITMAE.
+
+Also there's a nice [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb) by [@NielsRogge](https://twitter.com/NielsRogge).
+""")
+st.markdown(""" """)
+
+st.image("pages/VITMAE/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v3)
+by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick (2021)
+[GitHub](https://github.com/facebookresearch/mae)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/vit_mae)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("OneFormer")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("DINOV2")
\ No newline at end of file
diff --git a/pages/4M-21/4M-21.md b/pages/4M-21/4M-21.md
new file mode 100644
index 0000000000000000000000000000000000000000..95a6a6a980ec5976f7da454b4f5cc3c2485bbb11
--- /dev/null
+++ b/pages/4M-21/4M-21.md
@@ -0,0 +1,32 @@
+๏ปฟEPFL and Apple just released 4M-21: single any-to-any model that can do anything from text-to-image generation to generating depth masks! ๐ Let's unpack ๐งถ
+
+![image_1](image_1.jpg)
+
+4M is a multimodal training [framework](https://t.co/jztLublfSF) introduced by Apple and EPFL.
+The resulting model takes in image and text and outputs image and text ๐คฉ
+[Models](https://t.co/1LC0rAohEl) | [Demo](https://t.co/Ra9qbKcWeY)
+
+![video_1](video_1.mp4)
+
+This model consists of a transformer encoder and decoder, where the key to multimodality lies in the input and output data: the input and output tokens are decoded to generate bounding boxes, the pixels of a generated image, captions and more!
+
+![image_2](image_2.jpg)
+
+This model also learnt to generate canny maps, SAM edges and other things for steerable text-to-image generation ๐ผ๏ธ
+The authors only added image-to-all capabilities for the demo, but you can try to use this model for text-to-image generation as well โบ๏ธ
+
+![image_3](image_3.jpg)
+
+On the project page you can also see the model's text-to-image and steered generation capabilities with the model's own outputs as control masks!
+
+![video_2](video_2.mp4)
+
+
+> [!TIP]
+Resources:
+[4M-21: An Any-to-Any Vision Model for Tens of Tasks and Modalities](https://arxiv.org/abs/2406.09406)
+by Roman Bachmann, Oฤuzhan Fatih Kar, David Mizrahi, Ali Garjani, Mingfei Gao, David Griffiths, Jiaming Hu, Afshin Dehghan, Amir Zamir (2024)
+[GitHub](https://github.com/apple/ml-4m/)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1804138208814309626) (June 21, 2024)
\ No newline at end of file
diff --git a/pages/4M-21/image_1.jpg b/pages/4M-21/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..94763ed7f6873c9377554a8f10eb47dcaa28a4f5
Binary files /dev/null and b/pages/4M-21/image_1.jpg differ
diff --git a/pages/4M-21/image_2.jpg b/pages/4M-21/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..7f52556f301204c1f823ad6999e0adb9ca734367
Binary files /dev/null and b/pages/4M-21/image_2.jpg differ
diff --git a/pages/4M-21/image_3.jpg b/pages/4M-21/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6b45be29fca4a21c65a1f7acbe09c5ed58eb726d
Binary files /dev/null and b/pages/4M-21/image_3.jpg differ
diff --git a/pages/4M-21/video_1.mp4 b/pages/4M-21/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a2c6c5032811b446e83f1e25ff8998b88081e702
--- /dev/null
+++ b/pages/4M-21/video_1.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cd40cb677314a9384da8e644ad3bb9eba3e23a39e776f5ce8c1437ebf3d06d8
+size 1073547
diff --git a/pages/4M-21/video_2.mp4 b/pages/4M-21/video_2.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..df457d56eb4360537627e22d0a9b3e3e5a79ce44
Binary files /dev/null and b/pages/4M-21/video_2.mp4 differ
diff --git a/pages/4_DINOv2.py b/pages/4_DINOv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d365f7eac441248d7329eea2a130143aad144cd
--- /dev/null
+++ b/pages/4_DINOv2.py
@@ -0,0 +1,78 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("DINOv2")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1743290724672495827) (January 5, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""DINOv2 is the king for self-supervised learning in images ๐ฆ๐ฆ
+But how does it work? I've tried to explain how it works but let's expand on it ๐งถ
+""")
+st.markdown(""" """)
+
+st.image("pages/DINOv2/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+DINOv2 is essentially DINO on steroids, so let's talk about DINOv1 first ๐ฆ
+It's essentially a pre-training technique to train ViTs with self-supervision, using an unusual form of distillation ๐งโโ๏ธ๐จ๐ปโ๐ซ.
+Distillation is a technique where there's a large pre-trained model (teacher), and you have a smaller model (student) initialized randomly.
+Then, while training the student, you take both models' outputs, calculate the divergence between them and update the loss accordingly.
+In this case, we have no labels! And the teacher is not pretrained!!!! ๐คฏ
+Well, the outputs here are distributions, and the teacher is iteratively updated from the student using an exponential moving average.
+""")
+st.markdown(""" """)
+
+st.image("pages/DINOv2/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+DINO doesn't use any contrastive loss or clustering but only a cross-entropy loss (again, what a paper), which on its own could lead the model to collapse.
+This can be avoided by normalizing the teacher output multiple times, but the authors instead center (to squish the logits) and sharpen (through temperature) the teacher outputs.
+Finally, local and global crops are given to the student while only global crops are given to the teacher, and this pushes the student to identify context from small parts of the image.
+""")
+st.markdown(""" """)
+
+st.image("pages/DINOv2/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""How does DINOv2 improve DINO?
+โก๏ธ More efficient thanks to FSDP and Flash Attention
+๐ฆ Has a very efficient data augmentation technique that apparently scales to 100M+ images (put below)
+๐จ๐ปโ๐ซ Uses ViT-g instead of training from scratch
+""")
+st.markdown(""" """)
+
+st.image("pages/DINOv2/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The model is so powerful that you can use DINOv2 even with knn or linear classifiers without the need for fine-tuning!
+But if you'd like DINOv2 to work even better, [NielsRogge](https://twitter.com/NielsRogge) has built a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Fine\_tune\_DINOv2\_for\_image\_classification\_%5Bminimal%5D.ipynb) to fine-tune it using Trainer ๐
+He also has a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train\_a\_linear\_classifier\_on\_top\_of\_DINOv2\_for\_semantic\_segmentation.ipynb) if you feel like training a linear classifier only ๐
+All the different DINO/v2 model checkpoints are [here](https://huggingface.co/models?search=dino).
+Lastly, special thanks to [ykilcher](https://twitter.com/ykilcher) as I couldn't make sense of certain things in the paper and watched his awesome [tutorial](https://youtube.com/watch?v=h3ij3F) ๐คฉ
+""")
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+by Maxime Oquab, Timothรฉe Darcet, Thรฉo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervรฉ Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski (2023)
+[GitHub](https://github.com/facebookresearch/dinov2)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/dinov2)""", icon="๐")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("VITMAE")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("SigLIP")
\ No newline at end of file
diff --git a/pages/5_SigLIP.py b/pages/5_SigLIP.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cba9330d032284b95a3b20e3585f1cc60c8a8f0
--- /dev/null
+++ b/pages/5_SigLIP.py
@@ -0,0 +1,78 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("SigLIP")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1745476609686089800) (January 11. 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""SigLIP just got merged to ๐ค Transformers and it's super easy to use!
+To celebrate this, I have created a repository on various SigLIP based projects!
+But what is it and how does it work?
+SigLIP is a vision-text pre-training technique based on contrastive learning. It jointly trains an image encoder and a text encoder such that the dot product of the embeddings is highest for matching text-image pairs.
+The image below is taken from CLIP, where this contrastive pre-training takes place with softmax, but SigLIP replaces softmax with sigmoid. ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Highlightsโจ
+๐ผ๏ธ๐ Authors used medium sized B/16 ViT for image encoder and B-sized transformer for text encoder
+๐ More performant than CLIP on zero-shot
+๐ฃ๏ธ Authors trained a multilingual model too!
+โก๏ธ Super efficient: the sigmoid loss enables batch sizes of up to 1M items, but the authors chose 32k (see the performance saturation below)
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Below you can find prior CLIP models and SigLIP across different image encoder sizes and their performance on different datasets ๐๐ป
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The ๐ค Transformers integration comes with a zero-shot-image-classification pipeline, which makes SigLIP super easy to use!
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_4.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+What to use SigLIP for? ๐ง
+Honestly the possibilities are endless, but you can use it for image/text retrieval, zero-shot classification, training multimodal models!
+I have made a repository with notebooks and applications that are also hosted on [Spaces](https://t.co/Ah1CrHVuPY).
+I have built ["Draw to Search Art"](https://t.co/DcmQWMc1qd) where you can input an image (upload one or draw it) and search among 10k images in WikiArt!
+I've also built apps to [compare](https://t.co/m699TMvuW9) CLIP and SigLIP outputs.
+""")
+st.markdown(""" """)
+
+st.image("pages/SigLIP/image_5.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343)
+by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer (2023)
+[GitHub](https://github.com/google-research/big_vision)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/siglip)""", icon="๐")
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("DINOv2")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("OWLv2")
\ No newline at end of file
diff --git a/pages/6_OWLv2.py b/pages/6_OWLv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fb749dacc41e7dd5213e1daf77dd2d64338703d
--- /dev/null
+++ b/pages/6_OWLv2.py
@@ -0,0 +1,87 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("OWLv2")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1748411972675150040) (January 19, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Explaining the ๐ of zero-shot open-vocabulary object detection: OWLv2 ๐ฆ๐งถ""")
+st.markdown(""" """)
+
+st.image("pages/OWLv2/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+OWLv2 is a scaled-up version of a model called OWL-ViT, so let's take a look at that first ๐
+OWL-ViT is an open-vocabulary object detector, meaning it can detect objects it didn't explicitly see during training ๐
+What's cool is that it can take both image and text queries! This is possible because the image and text features aren't fused together.
+""")
+st.markdown(""" """)
+
+st.image("pages/OWLv2/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Taking a look at the architecture, the authors firstly do contrastive pre-training of a vision and a text encoder (just like CLIP).
+They take that model, remove the final pooling layer and attach a lightweight classification and box detection head and fine-tune.
+""")
+st.markdown(""" """)
+
+st.image("pages/OWLv2/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""During fine-tuning for object detection, they calculate the loss over bipartite matches.
+Simply put, loss is calculated over the predicted objects against ground truth objects and the goal is to find a perfect match of these two sets where each object is matched to one object in ground truth.
+
+OWL-ViT is very scalable.
+One can easily scale most language models or vision-language models because they require no supervision, but this isn't the case for object detection: you still need supervision.
+Moreover, only scaling the encoders creates a bottleneck after a while.
+""")
+st.markdown(""" """)
+
+st.image("pages/OWLv2/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+The authors wanted to scale OWL-ViT with more data, so they used OWL-ViT to pseudo-label a large dataset, "self-trained" a new detector on those labels, and then fine-tuned the model on human-annotated data.
+""")
+st.markdown(""" """)
+
+st.image("pages/OWLv2/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Thanks to this, OWLv2 scaled very well and tops the leaderboards on open-vocabulary object detection ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/OWLv2/image_5.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Want to try OWL models?
+I've created a [notebook](https://t.co/ick5tA6nyx) for you to see how to use it with ๐ค Transformers.
+If you want to play with it directly, you can use this [Space](https://t.co/oghdLOtoa5).
+All the models and applications of the OWL series are in this [collection](https://huggingface.co/collections/merve/owl-series-65aaac3114e6582c300544df).
+""")
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)
+by Matthias Minderer, Alexey Gritsenko, Neil Houlsby (2023)
+[GitHub](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/owlv2)""", icon="๐")
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("SigLIP")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Backbone")
\ No newline at end of file
diff --git a/pages/7_Backbone.py b/pages/7_Backbone.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3c4db92e123f3651f8c9f2969a6dcedcbc7a4a4
--- /dev/null
+++ b/pages/7_Backbone.py
@@ -0,0 +1,63 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Backbone")
+
+st.success("""[Original tweet](https://x.com/mervenoyann/status/1749841426177810502) (January 23, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Many cutting-edge computer vision models consist of multiple stages:
+โฐ backbone extracts the features,
+โฐ neck refines the features,
+โฐ head makes the detection for the task.
+Implementing this is cumbersome, so ๐ค Transformers has an API for this: Backbone!
+""")
+st.markdown(""" """)
+
+st.image("pages/Backbone/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Let's see an example of such a model.
+Assuming we would like to initialize a multi-stage instance segmentation model with a ResNet backbone and a MaskFormer neck and head, you can use the backbone API as follows (comments left in for clarity) ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/Backbone/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
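+
+st.markdown("""
+In text form, a minimal config-level sketch along the lines of the screenshot above (not necessarily the exact snippet shown) looks like this:
+""")
+st.code("""
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
+
+# configure the backbone that will extract the features ...
+backbone_config = ResNetConfig()
+
+# ... and hand it to the MaskFormer config, which adds the neck and the head
+config = MaskFormerConfig(backbone_config=backbone_config)
+model = MaskFormerForInstanceSegmentation(config)
+""", language="python")
+st.markdown(""" """)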
+
+st.markdown("""One can also use a backbone just to get features from any stage. You can initialize any backbone with `AutoBackbone` class.
+See below how to initialize a backbone and getting the feature maps at any stage ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/Backbone/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
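+
+st.markdown("""
+In text form, a minimal `AutoBackbone` sketch (the checkpoint and the chosen stages are just illustrative) looks like this:
+""")
+st.code("""
+import requests
+import torch
+from PIL import Image
+from transformers import AutoBackbone, AutoImageProcessor
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+# out_indices selects which stages to return feature maps from
+processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
+backbone = AutoBackbone.from_pretrained("microsoft/resnet-50", out_indices=[1, 2, 3, 4])
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = backbone(**inputs)
+
+# one feature map per requested stage
+print([fm.shape for fm in outputs.feature_maps])
+""", language="python")
+st.markdown(""" """)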
+
+st.markdown("""
+The Backbone API also supports any timm backbone of your choice! Check out the variety of timm backbones [here](https://t.co/Voiv0QCPB3).
+""")
+st.markdown(""" """)
+
+st.image("pages/Backbone/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Leaving some links ๐
+๐ I've created a [notebook](https://t.co/PNfmBvdrtt) for you to play with it
+๐ [Backbone API docs](https://t.co/Yi9F8qAigO)
+๐ [AutoBackbone docs](https://t.co/PGo9oILHDw) (all written with love by me!๐)""")
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("OWLv2")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Depth Anything")
\ No newline at end of file
diff --git a/pages/8_Depth_Anything.py b/pages/8_Depth_Anything.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d014e17b5cccc3836d292d0dc49212c586f4e62
--- /dev/null
+++ b/pages/8_Depth_Anything.py
@@ -0,0 +1,100 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("Depth Anything")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1750531698008498431) (January 25, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""Explaining a new state-of-the-art monocular depth estimation model: Depth Anything โจ๐งถ
+It has just been integrated in transformers for super-easy use.
+We compared it against DPTs and benchmarked it as well! You can find the usage, benchmark, demos and more below ๐
+""")
+st.markdown(""" """)
+
+st.video("pages/Depth_Anything/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.markdown("""
+The paper starts by highlighting previous depth estimation methods and their limitations regarding data coverage. ๐
+The model's success heavily depends on unlocking the use of unlabeled datasets, although initially the authors used self-training and failed.
+
+What the authors have done:
+โฐ Train a teacher model on the labelled dataset
+โฐ Guide the student using the teacher and also use unlabelled datasets pseudo-labelled by the teacher. However, this was the cause of the initial failure: since both architectures were similar, the outputs were the same.
+""")
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+So the authors have added a more difficult optimization target for student to learn additional knowledge on unlabeled images that went through color jittering, distortions, Gaussian blurring and spatial distortion, so it can learn more invariant representations from them.
+
+The architecture consists of DINOv2 encoder to extract the features followed by DPT decoder. At first, they train the teacher model on labelled images, and then they jointly train the student model and add in the dataset pseudo-labelled by ViT-L.
+""", unsafe_allow_html=True)
+
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything/image_1.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""Thanks to this, Depth Anything performs very well! I have also benchmarked the inference duration of the model against different models here. I also ran `torch.compile` benchmarks across them and got nice speed-ups ๐
+
+On T4 GPU, mean of 30 inferences for each. Inferred using `pipeline` (pre-processing and post-processing included with model inference).
+
+| Model/Batch Size | 16 | 4 | 1 |
+| ----------------------------- | --------- | -------- | ------- |
+| intel/dpt-large | 2709.652 | 667.799 | 172.617 |
+| facebook/dpt-dinov2-small-nyu | 2534.854 | 654.822 | 159.754 |
+| facebook/dpt-dinov2-base-nyu | 4316.8733 | 1090.824 | 266.699 |
+| Intel/dpt-beit-large-512 | 7961.386 | 2036.743 | 497.656 |
+| depth-anything-small | 1692.368 | 415.915 | 143.379 |
+
+`torch.compile`โs benchmarks with reduce-overhead mode: we have compiled the model and loaded it to the pipeline for the benchmarks to be fair.
+
+| Model/Batch Size | 16 | 4 | 1 |
+| ----------------------------- | -------- | -------- | ------- |
+| intel/dpt-large | 2556.668 | 645.750 | 155.153 |
+| facebook/dpt-dinov2-small-nyu | 2415.25 | 610.967 | 148.526 |
+| facebook/dpt-dinov2-base-nyu | 4057.909 | 1035.672 | 245.692 |
+| Intel/dpt-beit-large-512 | 7417.388 | 1795.882 | 426.546 |
+| depth-anything-small | 1664.025 | 384.688 | 97.865 |
+
+""")
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything/image_2.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+You can use Depth Anything easily thanks to ๐ค Transformers with three lines of code! โจ
+We have also built an app for you to [compare different depth estimation models](https://t.co/6uq4osdwWG) ๐ ๐ธ
+See all the available Depth Anything checkpoints [here](https://t.co/Ex0IIyx7XC).
+""")
+st.markdown(""" """)
+
+st.image("pages/Depth_Anything/image_3.jpg", use_column_width=True)
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891)
+by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024)
+[GitHub](https://github.com/LiheYoung/Depth-Anything)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything)""", icon="๐")
+
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Backbone")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("LLaVA-NeXT")
\ No newline at end of file
diff --git a/pages/9_LLaVA-NeXT.py b/pages/9_LLaVA-NeXT.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae3c3a9c5254d85df3394436ef803dcd2beb7c99
--- /dev/null
+++ b/pages/9_LLaVA-NeXT.py
@@ -0,0 +1,74 @@
+import streamlit as st
+from streamlit_extras.switch_page_button import switch_page
+
+st.title("LLaVA-NeXT")
+
+st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1770832875551682563) (March 21, 2024)""", icon="โน๏ธ")
+st.markdown(""" """)
+
+st.markdown("""LLaVA-NeXT is recently merged to ๐ค Transformers and it outperforms many of the proprietary models like Gemini on various benchmarks!๐คฉ
+For those who don't know LLaVA, it's a language model that can take image ๐ฌ
+Let's take a look, demo and more in this.
+""")
+st.markdown(""" """)
+
+st.image("pages/LLaVA-NeXT/image_1.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+LLaVA is essentially a vision-language model that consists of a ViT-based CLIP encoder, an MLP projection and Vicuna as the decoder โจ
+LLaVA 1.5 was released with Vicuna, but LLaVA NeXT (1.6) is released with four different LLMs:
+- Nous-Hermes-Yi-34B
+- Mistral-7B
+- Vicuna 7B & 13B
+""")
+st.markdown(""" """)
+
+st.image("pages/LLaVA-NeXT/image_2.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""
+Thanks to the Transformers integration, it is very easy to use LLaVA-NeXT, not only standalone but also with 4-bit loading and Flash Attention 2 ๐
+See below for standalone usage ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/LLaVA-NeXT/image_3.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""To fit large models and make it even faster and memory efficient, you can enable Flash Attention 2 and load model into 4-bit using bitsandbytes โก๏ธ transformers makes it very easy to do this! See below ๐
+""")
+st.markdown(""" """)
+
+st.image("pages/LLaVA-NeXT/image_4.jpeg", use_column_width=True)
+st.markdown(""" """)
+
+st.markdown("""If you want to try the code right away, here's the [notebook](https://t.co/NvoxvY9z1u).
+Lastly, you can directly play with the LLaVA-NeXT based on Mistral-7B through the demo [here](https://t.co/JTDlqMUwEh) ๐ค
+""")
+st.markdown(""" """)
+
+st.video("pages/LLaVA-NeXT/video_1.mp4", format="video/mp4")
+st.markdown(""" """)
+
+st.info("""
+Resources:
+[LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/)
+by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee (2024)
+[GitHub](https://github.com/haotian-liu/LLaVA/tree/main)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/llava_next)""", icon="๐")
+
+
+st.markdown(""" """)
+st.markdown(""" """)
+st.markdown(""" """)
+col1, col2, col3 = st.columns(3)
+with col1:
+ if st.button('Previous paper', use_container_width=True):
+ switch_page("Depth Anything")
+with col2:
+ if st.button('Home', use_container_width=True):
+ switch_page("Home")
+with col3:
+ if st.button('Next paper', use_container_width=True):
+ switch_page("Painter")
\ No newline at end of file
diff --git a/pages/Backbone/Backbone.md b/pages/Backbone/Backbone.md
new file mode 100644
index 0000000000000000000000000000000000000000..dd40848799fd10bc23c64732011759e712510249
--- /dev/null
+++ b/pages/Backbone/Backbone.md
@@ -0,0 +1,31 @@
+๏ปฟMany cutting-edge computer vision models consist of multiple stages:
+โฐ backbone extracts the features,
+โฐ neck refines the features,
+โฐ head makes the detection for the task.
+Implementing this is cumbersome, so ๐ค transformers has an API for this: Backbone!
+
+![image_1](image_1.jpg)
+
+Let's see an example of such a model.
+Assuming we would like to initialize a multi-stage instance segmentation model with a ResNet backbone and a MaskFormer neck and head, you can use the backbone API as follows (comments left in for clarity) ๐
+
+![image_2](image_2.jpg)
+
+One can also use a backbone just to get features from any stage. You can initialize any backbone with the `AutoBackbone` class.
+See below how to initialize a backbone and get the feature maps at any stage ๐
+
+![image_3](image_3.jpg)
+
+The Backbone API also supports any timm backbone of your choice! Check out the variety of timm backbones [here](https://t.co/Voiv0QCPB3).
+
+![image_4](image_4.jpg)
+
+Leaving some links ๐:
+๐ I've created a [notebook](https://t.co/PNfmBvdrtt) for you to play with it
+๐ [Backbone API docs](https://t.co/Yi9F8qAigO)
+๐ [AutoBackbone docs](https://t.co/PGo9oILHDw) ๐
+(all written with love by me!)
+
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1749841426177810502) (January 23, 2024)
diff --git a/pages/Backbone/image_1.jpeg b/pages/Backbone/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..e13b273736aa7a089dd01dd4690c1668272065c8
Binary files /dev/null and b/pages/Backbone/image_1.jpeg differ
diff --git a/pages/Backbone/image_2.jpeg b/pages/Backbone/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..539e19d52fde3cf4f0cf594d2496acc9a169e48a
Binary files /dev/null and b/pages/Backbone/image_2.jpeg differ
diff --git a/pages/Backbone/image_3.jpeg b/pages/Backbone/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..d199b14d45ee7c922fd2aba0b5275fdb3916c84e
Binary files /dev/null and b/pages/Backbone/image_3.jpeg differ
diff --git a/pages/Backbone/image_4.jpeg b/pages/Backbone/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..8612ac2422f1bf5230d4ab0d53d0468f098f8359
Binary files /dev/null and b/pages/Backbone/image_4.jpeg differ
diff --git a/pages/Chameleon/Chameleon.md b/pages/Chameleon/Chameleon.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a4c7df8d9aae726149b489545f098085024c27c
--- /dev/null
+++ b/pages/Chameleon/Chameleon.md
@@ -0,0 +1,43 @@
+๏ปฟChameleon ๐ฆ by Meta is now available in @huggingface transformers ๐
+A multimodal model that comes in 7B and 34B sizes ๐คฉ
+But what makes this model so special? keep reading โฃ
+
+![video_1](video_1.mp4)
+
+[Demo](https://t.co/GsGE17fSdI) | [Models](https://t.co/cWUiVbsRz6)
+Find below the API to load this model locally and use it โฌ๏ธ
+
+![image_1](image_1.jpg)
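+
+As a minimal sketch (following the Transformers Chameleon docs; treat the checkpoint name and prompt format as assumptions rather than the exact snippet in the image above):
+
+```python
+import requests
+from PIL import Image
+from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
+model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b")
+
+# the <image> token marks where the image goes in the prompt
+prompt = "What do you see in this image?<image>"
+inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+output = model.generate(**inputs, max_new_tokens=50)
+print(processor.decode(output[0], skip_special_tokens=True))
+```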
+
+Chameleon is a unique model: it attempts to scale early fusion ๐คจ But what is early fusion?
+Modern vision language models use a vision encoder with a projection layer to project the image embeddings so they can be used as prompts for the text decoder.
+
+![image_2](image_2.jpg)
+
+Early fusion, on the other hand, attempts to fuse all features together (image patches and text) by using an image tokenizer: all tokens are projected into a shared space, which enables seamless generation ๐
+
+![image_3](image_3.jpg)
+
+The authors also introduced architectural improvements (QK norm and revised placement of layer norms) for scalable and stable training. This way they were able to increase the token count (5x the tokens compared to Llama 3, which is a must with early fusion IMO).
+
+![image_4](image_4.jpg)
+
+This model is an any-to-any model thanks to early fusion: it can take image and text input and output image and text, but image generation is disabled to prevent malicious use.
+
+![image_5](image_5.jpg)
+
+One can also do text-only prompting: the authors note the model catches up with larger LLMs, and you can also see how it compares to VLMs with image-text prompting.
+
+![image_6](image_6.jpg)
+
+![image_7](image_7.jpg)
+
+> [!TIP]
+Resources:
+[Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818)
+by Chameleon Team (2024)
+[GitHub](https://github.com/facebookresearch/chameleon)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1814278511785312320) (July 19, 2024)
\ No newline at end of file
diff --git a/pages/Chameleon/image_1.jpg b/pages/Chameleon/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a570cd2c03d3930ffd3ebfa3bfccbc5eec6de0bf
Binary files /dev/null and b/pages/Chameleon/image_1.jpg differ
diff --git a/pages/Chameleon/image_2.jpg b/pages/Chameleon/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9351b4847b428ee62207a729f8d5cd3a6b712708
Binary files /dev/null and b/pages/Chameleon/image_2.jpg differ
diff --git a/pages/Chameleon/image_3.jpg b/pages/Chameleon/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..72ae18c89d7dfae4df81e7dee2e3e1090cc5ea07
Binary files /dev/null and b/pages/Chameleon/image_3.jpg differ
diff --git a/pages/Chameleon/image_4.jpg b/pages/Chameleon/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5fec1ea78d2abab4647ee35f0e5099bfc1ef0dff
Binary files /dev/null and b/pages/Chameleon/image_4.jpg differ
diff --git a/pages/Chameleon/image_5.jpg b/pages/Chameleon/image_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f155e8508c41db578e266e0f3e35d11a76d9560c
Binary files /dev/null and b/pages/Chameleon/image_5.jpg differ
diff --git a/pages/Chameleon/image_6.jpg b/pages/Chameleon/image_6.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..716dc3dcef0cca2b2e17fb2d6ff419d13466bbd0
Binary files /dev/null and b/pages/Chameleon/image_6.jpg differ
diff --git a/pages/Chameleon/image_7.jpg b/pages/Chameleon/image_7.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..aef4d21ffbaf689aaf714669cd956518d39fcd94
Binary files /dev/null and b/pages/Chameleon/image_7.jpg differ
diff --git a/pages/Chameleon/video_1.mp4 b/pages/Chameleon/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..a16eeaa29ff5c30a3b963c4d6b608af86b25e32f
Binary files /dev/null and b/pages/Chameleon/video_1.mp4 differ
diff --git a/pages/CuMo/CuMo.md b/pages/CuMo/CuMo.md
new file mode 100644
index 0000000000000000000000000000000000000000..8a5608bbe1d7d8b6b91ca40fffd84b347b0c502f
--- /dev/null
+++ b/pages/CuMo/CuMo.md
@@ -0,0 +1,24 @@
+๏ปฟIt's raining vision language models โ๏ธ CuMo is a new vision language model that has MoE in every step of the VLM (image encoder, MLP and text decoder) and uses Mistral-7B for the decoder part ๐ค
+
+![image_1](image_1.jpg)
+
+The authors first pre-trained the MLP by freezing the image encoder and text decoder, then warmed up the whole network by unfreezing and fine-tuning it, which they state stabilizes the visual instruction tuning when bringing in the experts.
+
+![image_2](image_2.jpg)
+
+The mixture of experts MLP blocks above are simply the same MLP blocks initialized from the single MLP that was trained during pre-training and fine-tuned in pre-finetuning ๐
+
+![image_3](image_3.jpg)
+
+It works very well (I also tested it myself): it outperforms the previous sota of its size, LLaVA-NeXT! ๐ I wonder how it would compare to IDEFICS2-8B. You can try it yourself [here](https://t.co/MLIYKVh5Ee).
+
+![image_4](image_4.jpg)
+
+> [!TIP]
+Resources:
+[CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts](https://arxiv.org/abs/2405.05949)
+by Jiachen Li, Xinyao Wang, Sijie Zhu, Chia-Wen Kuo, Lu Xu, Fan Chen, Jitesh Jain, Humphrey Shi, Longyin Wen (2024)
+[GitHub](https://github.com/SHI-Labs/CuMo)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1790665706205307191) (May 15, 2024)
\ No newline at end of file
diff --git a/pages/CuMo/image_1.jpg b/pages/CuMo/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..da96a21983fe8e0cc003f054e0e74dbde87b7e80
Binary files /dev/null and b/pages/CuMo/image_1.jpg differ
diff --git a/pages/CuMo/image_2.jpg b/pages/CuMo/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e8e76bce137b1fecb8c5397c3a76767d353961f3
Binary files /dev/null and b/pages/CuMo/image_2.jpg differ
diff --git a/pages/CuMo/image_3.jpg b/pages/CuMo/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..15f23d840ac3561bf3da06e72ef4dc6f18c0cec4
Binary files /dev/null and b/pages/CuMo/image_3.jpg differ
diff --git a/pages/CuMo/image_4.jpg b/pages/CuMo/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c8a1f378004044c4295898f862848f0b6be60649
Binary files /dev/null and b/pages/CuMo/image_4.jpg differ
diff --git a/pages/DINOv2/Dinov2.md b/pages/DINOv2/Dinov2.md
new file mode 100644
index 0000000000000000000000000000000000000000..4479d2fdf9f025b5049eb068a8a9f89e03066b2b
--- /dev/null
+++ b/pages/DINOv2/Dinov2.md
@@ -0,0 +1,40 @@
+๏ปฟDINOv2 is the king for self-supervised learning in images ๐ฆ๐ฆ But how does it work? I've tried to explain how it works but let's expand on it ๐งถ
+
+![image_1](image_1.jpg)
+
+DINOv2 is essentially DINO on steroids, so let's talk about DINO first.
+๐ฆ It's essentially a pre-training technique to train ViTs with self-supervision, using an unusual form of distillation ๐งโโ๏ธ๐ง๐ปโ๐ซ
+Distillation is a technique where there's a large pre-trained model (teacher), and you have a smaller model (student) initialized randomly.
+Then, while training the student, you take both models' outputs, calculate the divergence between them and update the loss accordingly.
+In this case, we have no labels! And the teacher is not pretrained!!!! ๐คฏ
+Well, the outputs here are distributions, and the teacher is iteratively updated from the student using an exponential moving average.
+
+![image_2](image_2.jpg)
+
+DINO doesn't use any contrastive loss or clustering but only a cross-entropy loss (again, what a paper), which on its own could lead the model to collapse.
+This can be avoided by normalizing the teacher output multiple times, but the authors instead center (to squish the logits) and sharpen (through temperature) the teacher outputs.
+Finally, local and global crops are given to the student while only global crops are given to the teacher, and this pushes the student to identify context from small parts of the image.
+
+![image_3](image_3.jpg)
+
+How does DINOv2 improve DINO?
+โก๏ธ More efficient thanks to FSDP and Flash Attention
+๐ฆ Has a very efficient data augmentation technique that apparently scales to 100M+ images (put below)
+๐ง๐ปโ๐ซ Uses ViT-g instead of training from scratch
+
+![image_4](image_4.jpg)
+
+The model is so powerful that you can use DINOv2 even with knn or linear classifiers without the need for fine-tuning!
+But if you'd like DINOv2 to work even better, [NielsRogge](https://twitter.com/NielsRogge) has built a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Fine\_tune\_DINOv2\_for\_image\_classification\_%5Bminimal%5D.ipynb) to fine-tune it using `Trainer`.
+๐ He also has a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train\_a\_linear\_classifier\_on\_top\_of\_DINOv2\_for\_semantic\_segmentation.ipynb) if you feel like training a linear classifier only
+๐ All the different DINO/v2 model checkpoints are [here](https://huggingface.co/models?search=dino).
+Special thanks to [ykilcher](https://twitter.com/ykilcher) as I couldn't make sense of certain things in the paper and watched his awesome [tutorial](https://youtube.com/watch?v=h3ij3F) ๐คฉ
+
+> [!TIP]
+Resources:
+[DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193)
+by Maxime Oquab, Timothรฉe Darcet, Thรฉo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervรฉ Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski (2023)
+[GitHub](https://github.com/facebookresearch/dinov2)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/dinov2)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1743290724672495827) (January 5, 2024)
\ No newline at end of file
diff --git a/pages/DINOv2/image_1.jpeg b/pages/DINOv2/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..e3f21d0a43896cb736fc3eb5a6036a79c01e2300
Binary files /dev/null and b/pages/DINOv2/image_1.jpeg differ
diff --git a/pages/DINOv2/image_2.jpg b/pages/DINOv2/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..89302d47a095fa17b2afdebae4850c48c2a09e27
Binary files /dev/null and b/pages/DINOv2/image_2.jpg differ
diff --git a/pages/DINOv2/image_3.jpeg b/pages/DINOv2/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..dbd02a8b045dd66c1bab29bf09c36ef722e12164
Binary files /dev/null and b/pages/DINOv2/image_3.jpeg differ
diff --git a/pages/DINOv2/image_4.jpeg b/pages/DINOv2/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..711fb761881886fc419fadea50459674e2e119a4
Binary files /dev/null and b/pages/DINOv2/image_4.jpeg differ
diff --git a/pages/DenseConnector/DenseConnector.md b/pages/DenseConnector/DenseConnector.md
new file mode 100644
index 0000000000000000000000000000000000000000..03dc21c286910c5029ce4bad4932c42780997960
--- /dev/null
+++ b/pages/DenseConnector/DenseConnector.md
@@ -0,0 +1,32 @@
+๏ปฟDo we fully leverage image encoders in vision language models? ๐
+A new paper built a dense connector that does it better! Let's dig in ๐งถ
+
+![image_1](image_1.jpg)
+
+VLMs consist of an image encoder block, a projection layer that projects image embeddings to text embedding space and then a text decoder sequentially connected ๐
+This [paper](https://t.co/DPQzbj0eWm) explores using the intermediate states of the image encoder rather than a single final output ๐คฉ
+
+![image_2](image_2.jpg)
+
+The authors explore three different ways of instantiating the dense connector: sparse token integration, sparse channel integration and dense channel integration (each of them just takes intermediate outputs and puts them together in different ways, see below).
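+
+A rough sketch of the idea behind the channel-wise variant (this is not the paper's code, just an illustration with a CLIP vision encoder from Transformers; the layer indices are arbitrary):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import CLIPImageProcessor, CLIPVisionModel
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
+encoder = CLIPVisionModel.from_pretrained("openai/clip-vit-large-patch14")
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    out = encoder(**inputs, output_hidden_states=True)
+
+# pick a few intermediate layers and concatenate them along the channel dim,
+# "dense channel integration" style; the result would go through the projector
+layers = [out.hidden_states[i] for i in (8, 16, 24)]
+dense_features = torch.cat(layers, dim=-1)  # (batch, seq_len, 3 * hidden_size)
+```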
+
+![image_3](image_3.jpg)
+
+They evaluate all three of them integrated into LLaVA 1.5 and find that each of the new models is superior to the original LLaVA 1.5.
+
+![image_4](image_4.jpg)
+
+I tried the model and it seems to work very well ๐ฅน
+The authors have released various [checkpoints](https://t.co/iF8zM2qvDa) based on different decoders (Vicuna 7/13B and Llama 3-8B).
+
+![image_5](image_5.jpg)
+
+
+> [!TIP]
+Resources:
+[Dense Connector for MLLMs](https://arxiv.org/abs/2405.13800)
+by Huanjin Yao, Wenhao Wu, Taojiannan Yang, YuXin Song, Mengxi Zhang, Haocheng Feng, Yifan Sun, Zhiheng Li, Wanli Ouyang, Jingdong Wang (2024)
+[GitHub](https://github.com/HJYao00/DenseConnector)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1796089181988352216) (May 30, 2024)
\ No newline at end of file
diff --git a/pages/DenseConnector/image_1.jpg b/pages/DenseConnector/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..54049e36e6ef94330ad77d552939cfbcfd794e8a
Binary files /dev/null and b/pages/DenseConnector/image_1.jpg differ
diff --git a/pages/DenseConnector/image_2.jpg b/pages/DenseConnector/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3ead1232393d16a3f9090f683da48f2efa93fe36
Binary files /dev/null and b/pages/DenseConnector/image_2.jpg differ
diff --git a/pages/DenseConnector/image_3.jpg b/pages/DenseConnector/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..acde3e3b162a0f0b8fbfd4425e276385cbd0274e
Binary files /dev/null and b/pages/DenseConnector/image_3.jpg differ
diff --git a/pages/DenseConnector/image_4.jpg b/pages/DenseConnector/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..be881825ff172de57f08a05057b1fed561df73ec
Binary files /dev/null and b/pages/DenseConnector/image_4.jpg differ
diff --git a/pages/DenseConnector/image_5.jpg b/pages/DenseConnector/image_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4c6639141526dedb627b1cbfa720f0a7456df257
Binary files /dev/null and b/pages/DenseConnector/image_5.jpg differ
diff --git a/pages/Depth Anything/Depth Anything.md b/pages/Depth Anything/Depth Anything.md
new file mode 100644
index 0000000000000000000000000000000000000000..b3f6a43bc6c5eb7e1aa1a638ff5087cdfd3cb790
--- /dev/null
+++ b/pages/Depth Anything/Depth Anything.md
@@ -0,0 +1,61 @@
+๏ปฟExplaining a new state-of-the-art monocular depth estimation model: Depth Anything โจ
+It has just been integrated in transformers for super-easy use.
+We compared it against DPTs and benchmarked it as well! You can find the usage, benchmark, demos and more below ๐
+
+![video_1](video_1.mp4)
+
+The paper starts by highlighting previous depth estimation methods and their limitations regarding data coverage. ๐
+The model's success heavily depends on unlocking the use of unlabeled datasets, although initially the authors used self-training and failed.
+
+What the authors have done:
+โฐ Train a teacher model on the labelled dataset
+โฐ Guide the student using the teacher and also use unlabelled datasets pseudo-labelled by the teacher. However, this was the cause of the initial failure: since both architectures were similar, the outputs were the same.
+
+![image_1](image_1.jpg)
+
+So the authors have added a more difficult optimization target for student to learn additional knowledge on unlabeled images that went through color jittering, distortions, Gaussian blurring and spatial distortion, so it can learn more invariant representations from them.
+
+The architecture consists of DINOv2 encoder to extract the features followed by DPT decoder. At first, they train the teacher model on labelled images, and then they jointly train the student model and add in the dataset pseudo-labelled by ViT-L.
+
+![image_1](image_1.jpg)
+
+Thanks to this, Depth Anything performs very well! I have also benchmarked the inference duration of the model against different models here. I also ran `torch.compile` benchmarks across them and got nice speed-ups ๐
+
+On T4 GPU, mean of 30 inferences for each. Inferred using `pipeline` (pre-processing and post-processing included with model inference).
+
+| Model/Batch Size | 16 | 4 | 1 |
+| ----------------------------- | --------- | -------- | ------- |
+| intel/dpt-large | 2709.652 | 667.799 | 172.617 |
+| facebook/dpt-dinov2-small-nyu | 2534.854 | 654.822 | 159.754 |
+| facebook/dpt-dinov2-base-nyu | 4316.8733 | 1090.824 | 266.699 |
+| Intel/dpt-beit-large-512 | 7961.386 | 2036.743 | 497.656 |
+| depth-anything-small | 1692.368 | 415.915 | 143.379 |
+
+torch.compileโs benchmarks with reduce-overhead mode: we have compiled the model and loaded it to the pipeline for the benchmarks to be fair.
+
+| Model/Batch Size | 16 | 4 | 1 |
+| ----------------------------- | -------- | -------- | ------- |
+| intel/dpt-large | 2556.668 | 645.750 | 155.153 |
+| facebook/dpt-dinov2-small-nyu | 2415.25 | 610.967 | 148.526 |
+| facebook/dpt-dinov2-base-nyu | 4057.909 | 1035.672 | 245.692 |
+| Intel/dpt-beit-large-512 | 7417.388 | 1795.882 | 426.546 |
+| depth-anything-small | 1664.025 | 384.688 | 97.865 |
+
+
+![image_2](image_2.jpg)
+
+
+You can use Depth Anything easily thanks to ๐ค Transformers with three lines of code! โจ We have also built an app for you to [compare different depth estimation models](https://t.co/6uq4osdwWG) ๐ ๐ธ See all the available Depth Anything checkpoints [here](https://t.co/Ex0IIyx7XC).
+
+![image_3](image_3.jpg)
+
+
+> [!TIP]
+Resources:
+[Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891)
+by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024)
+[GitHub](https://github.com/LiheYoung/Depth-Anything)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1750531698008498431) (January 25, 2024)
\ No newline at end of file
diff --git a/pages/Depth Anything/image_1.jpg b/pages/Depth Anything/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9f9f597f1ee050d81546ca12df764fd495c9033b
Binary files /dev/null and b/pages/Depth Anything/image_1.jpg differ
diff --git a/pages/Depth Anything/image_2.jpg b/pages/Depth Anything/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5764194e35b849f479f6b112e3d10ab243aec6f5
Binary files /dev/null and b/pages/Depth Anything/image_2.jpg differ
diff --git a/pages/Depth Anything/image_3.jpg b/pages/Depth Anything/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..330fcf92754f3798f78dff64d3a092ef07a4a503
Binary files /dev/null and b/pages/Depth Anything/image_3.jpg differ
diff --git a/pages/Depth Anything/video_1.mp4 b/pages/Depth Anything/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..44bbc6257f0bd16776517efa7ce99fbc78df19bf
--- /dev/null
+++ b/pages/Depth Anything/video_1.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2587f63d7a6622ca913f0260aa45b2fea8c806f261a09cb5b692ec2644b51066
+size 2026722
diff --git a/pages/Depth_Anything_v2/Depth Anything v2.md b/pages/Depth_Anything_v2/Depth Anything v2.md
new file mode 100644
index 0000000000000000000000000000000000000000..5314188ec02c86258dc34d6c863858196dfa0d70
--- /dev/null
+++ b/pages/Depth_Anything_v2/Depth Anything v2.md
@@ -0,0 +1,34 @@
+๏ปฟI love Depth Anything V2 ๐ Itโs Depth Anything, but scaled with both larger teacher model and a gigantic dataset! Letโs unpack ๐ค๐งถ!
+
+![image_1](image_1.jpg)
+
+The authors analyzed Marigold, a diffusion-based model, against Depth Anything and found out whatโs up with using synthetic vs real images for MDE: ๐
+Real data has a lot of label noise and inaccurate depth maps (caused by depth sensors missing transparent objects etc).
+
+![image_2](image_2.jpg)
+
+The authors train different image encoders only on synthetic images and find that unless the encoder is very large, the model canโt generalize well (though large models generalize inherently anyway) ๐ง Yet they still fail when encountering real images that have a wide distribution in labels.
+
+![image_3](image_3.jpg)
+
+The Depth Anything V2 framework is to...
+๐ฆ Train a teacher model based on DINOv2-G on 595K synthetic images
+๐ท๏ธ Label 62M real images using the teacher model
+๐ฆ Train a student model using the real images labelled by the teacher
+Result: 10x faster and more accurate than Marigold!
+
+![image_4](image_4.jpg)
+
+
+The authors also construct a new benchmark called DA-2K that is less noisy, highly detailed and more diverse!
+I have created a [collection](https://t.co/3fAB9b2sxi) that has the models, the dataset, the demo and CoreML converted model ๐
+
+> [!TIP]
+Resources:
+[Depth Anything V2](https://arxiv.org/abs/2406.09414)
+by Lihe Yang, Bingyi Kang, Zilong Huang, Zhen Zhao, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024)
+[GitHub](https://github.com/DepthAnything/Depth-Anything-V2)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything_v2)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1803063120354492658) (June 18, 2024)
\ No newline at end of file
diff --git a/pages/Depth_Anything_v2/image_1.jpg b/pages/Depth_Anything_v2/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5d15680b6ee378948b983770176c4667360a9811
Binary files /dev/null and b/pages/Depth_Anything_v2/image_1.jpg differ
diff --git a/pages/Depth_Anything_v2/image_2.jpg b/pages/Depth_Anything_v2/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0585867a67c18a7fd02d8c38fdfdbe63f5fa5f03
Binary files /dev/null and b/pages/Depth_Anything_v2/image_2.jpg differ
diff --git a/pages/Depth_Anything_v2/image_3.jpg b/pages/Depth_Anything_v2/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2acbdffed71edbbce1e1b0859b18635db56d0c0a
Binary files /dev/null and b/pages/Depth_Anything_v2/image_3.jpg differ
diff --git a/pages/Depth_Anything_v2/image_4.jpg b/pages/Depth_Anything_v2/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..eb78a815f9dd79ddf7fc7ff74488cdf1f6c9d481
Binary files /dev/null and b/pages/Depth_Anything_v2/image_4.jpg differ
diff --git a/pages/DocOwl_1.5/DocOwl 1.5.md b/pages/DocOwl_1.5/DocOwl 1.5.md
new file mode 100644
index 0000000000000000000000000000000000000000..b92eaae30e321c52f4b179c12a879e6f5b6dfcb9
--- /dev/null
+++ b/pages/DocOwl_1.5/DocOwl 1.5.md
@@ -0,0 +1,52 @@
+๏ปฟDocOwl 1.5 is the state-of-the-art document understanding model by Alibaba with Apache 2.0 license ๐๐ time to dive in and learn more ๐งถ
+
+![image_1](image_1.jpeg)
+
+This model consists of a ViT-based visual encoder that takes in crops of the image along with the original image itself. The outputs of the encoder then go through a convolution-based model; after that, the outputs are merged with the text and fed to the LLM.
+
+![image_2](image_2.jpeg)
+
+Initially, the authors only train the convolution-based part (called H-Reducer) and the vision encoder while keeping the LLM frozen. Then for fine-tuning (on image captioning, VQA etc.), they freeze the vision encoder and train the H-Reducer and the LLM.
+
+![image_3](image_3.jpeg)
+
+They also use a simple linear projection on the text and documents. You can see below how they model the text prompts and outputs ๐ค
+
+![image_4](image_4.jpeg)
+
+They train the model on various downstream tasks including:
+- document understanding (DUE benchmark and more)
+- table parsing (TURL, PubTabNet)
+- chart parsing (PlotQA and more)
+- image parsing (OCR-CC)
+- text localization (DocVQA and more)
+
+![image_5](image_5.jpeg)
+
+They contribute a new model called DocOwl 1.5-Chat by:
+1. creating a new document-chat dataset with questions from document VQA datasets
+2. feeding them to ChatGPT to get long answers
+3. fine-tuning the base model with it (which IMO works very well!)
+
+![image_6](image_6.jpeg)
+
+The resulting generalist model and the chat model are pretty much state-of-the-art ๐ Below you can see how they compare to fine-tuned models.
+
+![image_7](image_7.jpeg)
+
+Very good paper, read it [here](https://t.co/T23JOAPkv1).
+All the models and the datasets (also some eval datasets on above tasks!) are in this [organization](https://t.co/sJdTw1jWTR).
+Try out the [Space](https://t.co/57E9DbNZXf).
+
+Thanks a lot for reading!
+
+![image_8](image_8.jpeg)
+
+> [!TIP]
+Resources:
+[mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding](https://arxiv.org/abs/2403.12895)
+by Anwen Hu, Haiyang Xu, Jiabo Ye, Ming Yan, Liang Zhang, Bo Zhang, Chen Li, Ji Zhang, Qin Jin, Fei Huang, Jingren Zhou (2024)
+[GitHub](https://github.com/X-PLUG/mPLUG-DocOwl)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1782421257591357824) (April 22, 2024)
\ No newline at end of file
diff --git a/pages/DocOwl_1.5/image_1.jpg b/pages/DocOwl_1.5/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f6acdefc776a4586fdefd65e0f36ef2dc30aab30
Binary files /dev/null and b/pages/DocOwl_1.5/image_1.jpg differ
diff --git a/pages/DocOwl_1.5/image_2.jpeg b/pages/DocOwl_1.5/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..70369f08bb9f6421696ae51f4a7452c7468fa130
Binary files /dev/null and b/pages/DocOwl_1.5/image_2.jpeg differ
diff --git a/pages/DocOwl_1.5/image_3.jpeg b/pages/DocOwl_1.5/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5e05f6401ec03b4c7afd67437911edb6b455d547
Binary files /dev/null and b/pages/DocOwl_1.5/image_3.jpeg differ
diff --git a/pages/DocOwl_1.5/image_4.jpeg b/pages/DocOwl_1.5/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..557d61cc4e621c25073c9409a82e305e77477af3
Binary files /dev/null and b/pages/DocOwl_1.5/image_4.jpeg differ
diff --git a/pages/DocOwl_1.5/image_5.jpeg b/pages/DocOwl_1.5/image_5.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..63442fcf91dc3cbee5fa9f4bef1e6022817e94e5
Binary files /dev/null and b/pages/DocOwl_1.5/image_5.jpeg differ
diff --git a/pages/DocOwl_1.5/image_6.jpeg b/pages/DocOwl_1.5/image_6.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..190062b9a42597245aa718e2bd7b5bb68e25b8de
Binary files /dev/null and b/pages/DocOwl_1.5/image_6.jpeg differ
diff --git a/pages/DocOwl_1.5/image_7.jpeg b/pages/DocOwl_1.5/image_7.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..11e88429bd61efbf20a7710d8786fc366ffca638
Binary files /dev/null and b/pages/DocOwl_1.5/image_7.jpeg differ
diff --git a/pages/DocOwl_1.5/image_8.jpeg b/pages/DocOwl_1.5/image_8.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..18a9f12a7d4f667ed014ea3946d3563b61caae00
Binary files /dev/null and b/pages/DocOwl_1.5/image_8.jpeg differ
diff --git a/pages/Florence-2/Florence-2.md b/pages/Florence-2/Florence-2.md
new file mode 100644
index 0000000000000000000000000000000000000000..57a19dee06847eef5ce619b70cf41ed5b3a82ae1
--- /dev/null
+++ b/pages/Florence-2/Florence-2.md
@@ -0,0 +1,34 @@
+๏ปฟFlorence-2 is a new vision foundation model by MSFT capable of a wide variety of tasks ๐คฏ Let's unpack! ๐งถ Demo, models and more on the next one ๐ฃ
+
+![image_1](image_1.jpg)
+
+This model can handle tasks that vary from document understanding to semantic segmentation ๐คฉ
+[Demo](https://t.co/7YJZvjhw84) | [Collection](https://t.co/Ub7FGazDz1)
+
+![image_2](image_2.jpg)
+
+The difference from previous models is that the authors have compiled a dataset that consists of 126M images with 5.4B annotations labelled with their own data engine โโ
+
+![image_3](image_3.jpg)
+
+The dataset also offers more variety in annotations compared to other datasets, it has region level and image level annotations with more variety in semantic granularity as well!
+
+![image_4](image_4.jpg)
+
+The model has a similar architecture to previous models: an image encoder and a multimodal encoder with a text decoder. The authors have compiled the multitask dataset with prompts for each task, which makes the model trainable on multiple tasks ๐ค
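+
+As a sketch of how these task prompts look in practice (the checkpoint id and the `<OD>` task token follow the Hugging Face model card, and `post_process_generation` comes from the model's remote code, so treat the exact calls as assumptions):
+
+```python
+import requests
+from PIL import Image
+from transformers import AutoModelForCausalLM, AutoProcessor
+
+model_id = "microsoft/Florence-2-base"  # ships its modeling code on the Hub
+model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+task = "<OD>"  # object detection; other tasks use prompts like <CAPTION> or <OCR>
+inputs = processor(text=task, images=image, return_tensors="pt")
+generated_ids = model.generate(**inputs, max_new_tokens=512)
+text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+# parse the generated text into boxes and labels for the requested task
+print(processor.post_process_generation(text, task=task, image_size=(image.width, image.height)))
+```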
+
+![image_5](image_5.jpg)
+
+You can also fine-tune this model on any task of your choice. The authors released results on different downstream tasks and report their scores with the vision encoder frozen and unfrozen ๐ค๐
+They have released fine-tuned models too, you can find them in the collection above ๐ค
+
+![image_6](image_6.jpg)
+
+> [!TIP]
+Resources:
+[Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242)
+by Bin Xiao, Haiping Wu, Weijian Xu, Xiyang Dai, Houdong Hu, Yumao Lu, Michael Zeng, Ce Liu, Lu Yuan (2023)
+[Hugging Face blog post](https://huggingface.co/blog/finetune-florence2)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1803769866878623819) (June 20, 2024)
diff --git a/pages/Florence-2/image_1.jpg b/pages/Florence-2/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..31bae32a5d350cd79e15a629fe9d3cf8a3cb738c
Binary files /dev/null and b/pages/Florence-2/image_1.jpg differ
diff --git a/pages/Florence-2/image_2.jpg b/pages/Florence-2/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3035fbb46dc585536f39a6ed924ab9c638a25e3f
Binary files /dev/null and b/pages/Florence-2/image_2.jpg differ
diff --git a/pages/Florence-2/image_3.jpg b/pages/Florence-2/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..469963370d22a8950425d86e84c56512004f611d
Binary files /dev/null and b/pages/Florence-2/image_3.jpg differ
diff --git a/pages/Florence-2/image_4.jpg b/pages/Florence-2/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..63e030e11386b122873c2dd44db99164d1d05a49
Binary files /dev/null and b/pages/Florence-2/image_4.jpg differ
diff --git a/pages/Florence-2/image_5.jpg b/pages/Florence-2/image_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b105948d26684849940773d47ce93d1b023aff99
Binary files /dev/null and b/pages/Florence-2/image_5.jpg differ
diff --git a/pages/Florence-2/image_6.jpg b/pages/Florence-2/image_6.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..880e6e7e76c521eee3322d1049fdb6b0f197f13f
Binary files /dev/null and b/pages/Florence-2/image_6.jpg differ
diff --git a/pages/Grounding_DINO/Grounding DINO.md b/pages/Grounding_DINO/Grounding DINO.md
new file mode 100644
index 0000000000000000000000000000000000000000..18dfedbff03e116d41d80467beb1dd56d318824c
--- /dev/null
+++ b/pages/Grounding_DINO/Grounding DINO.md
@@ -0,0 +1,45 @@
+We have merged Grounding DINO into ๐ค Transformers!
+It's an amazing zero-shot object detection model, here's why ๐งถ I have also built two applications on top of it.
+
+![image_1](image_1.jpg)
+
+There are two families of zero-shot object detection models as of now: the OWL series by Google Brain and Grounding DINO ๐ฆ Grounding DINO pays immense attention to detail โฌ๏ธ
+You can also [try it yourself](https://t.co/UI0CMxphE7).
+
+![image_2](image_2.jpg)
+
+![image_3](image_3.jpg)
+
+I have also built another [application](https://t.co/4EHpOwEpm0) for GroundingSAM, combining GroundingDINO and Segment Anything by Meta for cutting edge zero-shot image segmentation.
+
+![image_4](image_4.jpg)
+
+Grounding DINO is essentially a model that connects an image encoder (Swin transformer) and a text encoder (BERT), with a decoder on top of both that outputs bounding boxes ๐ฆ This is quite similar to OWLv2, which uses a ViT-based detector on top of CLIP.
+
+![image_5](image_5.jpg)
+
+The authors train Swin-L/T with BERT contrastively (not like CLIP, where images are matched to texts by means of similarity): they instead try to align the region outputs with language phrases at the head outputs ๐คฉ
+
+![image_6](image_6.jpg)
+
+The authors also form the text features on the sub-sentence level. This means the model extracts certain noun phrases from the training data to remove unwanted influence between words while retaining fine-grained information.
+
+![image_7](image_7.jpg)
+
+Thanks to all of this, Grounding DINO has great performance on various REC/object detection benchmarks ๐๐
+
+![image_8](image_8.jpg)
+
+Thanks to transformers, you can use Grounding DINO very easily! You can also check out [NielsRogge](https://twitter.com/NielsRogge)'s [notebook here](https://t.co/8ADGFdVkta).
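+
+Here's a minimal sketch along the lines of the official documentation (the tiny checkpoint and the thresholds are just example choices):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+model_id = "IDEA-Research/grounding-dino-tiny"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+text = "a cat. a remote control."  # queries are lowercased and separated by dots
+
+inputs = processor(images=image, text=text, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+results = processor.post_process_grounded_object_detection(
+    outputs, inputs.input_ids,
+    box_threshold=0.4, text_threshold=0.3,
+    target_sizes=[image.size[::-1]],  # (height, width)
+)
+print(results[0]["labels"], results[0]["boxes"])
+```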
+
+![image_9](image_9.jpg)
+
+> [!TIP]
+Resources:
+[Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)
+by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang (2023)
+[GitHub](https://github.com/IDEA-Research/GroundingDINO)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/grounding-dino)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1780558859221733563)
\ No newline at end of file
diff --git a/pages/Grounding_DINO/image_1.jpeg b/pages/Grounding_DINO/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..808589168a625f34be81d385949e94f6f1433066
Binary files /dev/null and b/pages/Grounding_DINO/image_1.jpeg differ
diff --git a/pages/Grounding_DINO/image_2.jpeg b/pages/Grounding_DINO/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..571ee25ca84f429642e141ae837db5e76b91ab21
Binary files /dev/null and b/pages/Grounding_DINO/image_2.jpeg differ
diff --git a/pages/Grounding_DINO/image_3.jpeg b/pages/Grounding_DINO/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..4b2fdcb1ae3071e3cbb57f4c7d8663a3f012cdc7
Binary files /dev/null and b/pages/Grounding_DINO/image_3.jpeg differ
diff --git a/pages/Grounding_DINO/image_4.jpeg b/pages/Grounding_DINO/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..953d513030a01ef62ca29818ea946e59a9ea9c02
Binary files /dev/null and b/pages/Grounding_DINO/image_4.jpeg differ
diff --git a/pages/Grounding_DINO/image_5.jpeg b/pages/Grounding_DINO/image_5.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..cf95438c91dd411b7bd67cf8525f9108bef6cb13
Binary files /dev/null and b/pages/Grounding_DINO/image_5.jpeg differ
diff --git a/pages/Grounding_DINO/image_6.jpeg b/pages/Grounding_DINO/image_6.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..fc54bffdab07e5021d9b39dccecddc960a1c46a2
Binary files /dev/null and b/pages/Grounding_DINO/image_6.jpeg differ
diff --git a/pages/Grounding_DINO/image_7.jpeg b/pages/Grounding_DINO/image_7.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..bfdb3872472f1caacd0bc166781f6a9e0d597b18
Binary files /dev/null and b/pages/Grounding_DINO/image_7.jpeg differ
diff --git a/pages/Grounding_DINO/image_8.jpeg b/pages/Grounding_DINO/image_8.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c646bbb5edd1844ebbfdfda58b7ca6ce4d8df824
Binary files /dev/null and b/pages/Grounding_DINO/image_8.jpeg differ
diff --git a/pages/Grounding_DINO/image_9.jpeg b/pages/Grounding_DINO/image_9.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..1ba01db76dff84a7d467f05295a6d370d7ac659d
Binary files /dev/null and b/pages/Grounding_DINO/image_9.jpeg differ
diff --git a/pages/LLaVA-NeXT/LLaVA-NeXT.md b/pages/LLaVA-NeXT/LLaVA-NeXT.md
new file mode 100644
index 0000000000000000000000000000000000000000..314b9484734adf10686ed7de5277dd3e56446dff
--- /dev/null
+++ b/pages/LLaVA-NeXT/LLaVA-NeXT.md
@@ -0,0 +1,33 @@
+LLaVA-NeXT was recently merged into ๐ค Transformers and it outperforms many proprietary models like Gemini on various benchmarks!
+๐คฉ For those who don't know LLaVA, it's a language model that can also take images as input ๐ฌ Let's take a look: demo and more below.
+
+![image_1](image_1.jpg)
+
+LLaVA is essentially a vision-language model that consists of a ViT-based CLIP encoder, an MLP projection and Vicuna as the decoder โจ LLaVA 1.5 was released with Vicuna, but LLaVA-NeXT (1.6) is released with four different LLMs:
+- Nous-Hermes-Yi-34B
+- Mistral-7B
+- Vicuna 7B & 13B
+
+![image_2](image_2.jpg)
+
+Thanks to the Transformers integration, it is very easy to use LLaVA-NeXT, not only standalone but also with 4-bit loading and Flash Attention 2 ๐ See below for standalone usage ๐
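+
+Here's a rough standalone sketch of what the screenshot below shows, assuming the `llava-hf/llava-v1.6-mistral-7b-hf` conversion and its Mistral-style prompt format:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+
+model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
+processor = LlavaNextProcessor.from_pretrained(model_id)
+model = LlavaNextForConditionalGeneration.from_pretrained(
+    model_id, torch_dtype=torch.float16, device_map="auto"
+)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"  # Mistral-style chat format
+
+inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, torch.float16)
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
+```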
+
+![image_3](image_3.jpg)
+
+To fit large models and make inference even faster and more memory efficient, you can enable Flash Attention 2 and load the model in 4-bit using bitsandbytes โก๏ธ Transformers makes it very easy to do this! See below ๐
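+
+A minimal sketch of that loading setup (assuming `bitsandbytes` and `flash-attn` are installed and you have a recent GPU):
+
+```python
+import torch
+from transformers import BitsAndBytesConfig, LlavaNextForConditionalGeneration
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.float16,
+)
+model = LlavaNextForConditionalGeneration.from_pretrained(
+    "llava-hf/llava-v1.6-mistral-7b-hf",
+    quantization_config=quantization_config,   # 4-bit weights via bitsandbytes
+    attn_implementation="flash_attention_2",   # requires flash-attn and an Ampere+ GPU
+    device_map="auto",
+)
+```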
+
+![image_4](image_4.jpg)
+
+If you want to try the code right away, here's the [notebook](https://t.co/NvoxvY9z1u). Lastly, you can directly play with the LLaVA-NeXT based on Mistral-7B through the demo [here](https://t.co/JTDlqMUwEh) ๐ค
+
+![video_1](video_1.mp4)
+
+> [!TIP]
+Resources:
+[LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/)
+by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee (2024)
+[GitHub](https://github.com/haotian-liu/LLaVA/tree/main)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/llava_next)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1770832875551682563) (March 21, 2024)
\ No newline at end of file
diff --git a/pages/LLaVA-NeXT/image_1.jpeg b/pages/LLaVA-NeXT/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..39adbf76d3d0c567b92effdc65d1fce50db6b5d9
Binary files /dev/null and b/pages/LLaVA-NeXT/image_1.jpeg differ
diff --git a/pages/LLaVA-NeXT/image_2.jpeg b/pages/LLaVA-NeXT/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..529939bc6590519de33cd984a7dfd9beb94bfc16
Binary files /dev/null and b/pages/LLaVA-NeXT/image_2.jpeg differ
diff --git a/pages/LLaVA-NeXT/image_3.jpeg b/pages/LLaVA-NeXT/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..a4668bd51dbbfdb1209adfb77b127c82f95b7895
Binary files /dev/null and b/pages/LLaVA-NeXT/image_3.jpeg differ
diff --git a/pages/LLaVA-NeXT/image_4.jpeg b/pages/LLaVA-NeXT/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..b2d7e74936615b26d8ebc35d4092d4cee32a4d74
Binary files /dev/null and b/pages/LLaVA-NeXT/image_4.jpeg differ
diff --git a/pages/LLaVA-NeXT/video_1.mp4 b/pages/LLaVA-NeXT/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..8e47ba5d9ee035b225516232f7ef54ee8897833b
Binary files /dev/null and b/pages/LLaVA-NeXT/video_1.mp4 differ
diff --git a/pages/Llava-NeXT-Interleave/Llava-NeXT-Interleave.md b/pages/Llava-NeXT-Interleave/Llava-NeXT-Interleave.md
new file mode 100644
index 0000000000000000000000000000000000000000..defbad684b7abf9a6b969db5546d0e75cd6e1a57
--- /dev/null
+++ b/pages/Llava-NeXT-Interleave/Llava-NeXT-Interleave.md
@@ -0,0 +1,39 @@
+The vision language model in this video is 0.5B and can take in images, video and 3D! ๐คฏ Llava-NeXT-Interleave is a new vision language model trained on interleaved image, video and 3D data. Keep reading โฅฅโฅฅ
+
+![video_1](video_1.mp4)
+
+This model comes with 0.5B, 7B and 7B-DPO variants, all of which can be used with Transformers ๐
+[Collection of models](https://t.co/sZsaglSXa3) | [Demo](https://t.co/FbpaMWJY8k)
+See how to use below ๐๐ป
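+
+Here's a rough sketch of multi-image inference, assuming the `llava-hf/llava-interleave-qwen-0.5b-hf` conversion; the exact chat format should be double-checked on the model card:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+
+img1 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+img2 = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
+
+# one <image> placeholder per image; check the model card for the exact chat template
+prompt = "<|im_start|>user <image><image>\nWhat do these two images have in common?<|im_end|><|im_start|>assistant"
+inputs = processor(text=prompt, images=[img1, img2], return_tensors="pt").to(model.device, torch.float16)
+out = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(out[0], skip_special_tokens=True))
+```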
+
+![image_1](image_1.jpg)
+
+The authors of this paper explored training Llava-NeXT on interleaved data, where the data consists of multiple modalities, including image(s), video and 3D ๐
+They discovered that interleaved data improves results across all benchmarks!
+
+![image_2](image_2.jpg)
+
+The model can do task transfer from single-image tasks to multiple images ๐คฏ The authors trained the model on single images and code, yet the model can solve coding tasks with multiple images.
+
+![image_3](image_3.jpg)
+
+The same applies to other modalities, see below for video:
+
+![image_4](image_4.jpg)
+
+The model also has document understanding capabilities and many real-world application areas.
+
+![image_5](image_5.jpg)
+
+This release also comes with the dataset this model was fine-tuned on ๐ [M4-Instruct-Data](https://t.co/rutXMtNC0I)
+
+![image_6](image_6.jpg)
+
+> [!TIP]
+Resources:
+[LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/)
+by Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, Chunyuan Li (2024)
+[GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1813560292397203630) (July 17, 2024)
\ No newline at end of file
diff --git a/pages/Llava-NeXT-Interleave/image_1.jpg b/pages/Llava-NeXT-Interleave/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..565e4d4f1aacabec08a290b141ec7a24c7f8b65a
Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_1.jpg differ
diff --git a/pages/Llava-NeXT-Interleave/image_2.jpg b/pages/Llava-NeXT-Interleave/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2d3b5295fc880bc124ff52fe949389ebe7c0434a
Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_2.jpg differ
diff --git a/pages/Llava-NeXT-Interleave/image_3.jpg b/pages/Llava-NeXT-Interleave/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6932fa5368a8dc5c702fc26203e8c5d67182697c
Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_3.jpg differ
diff --git a/pages/Llava-NeXT-Interleave/image_4.jpg b/pages/Llava-NeXT-Interleave/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9a765ea6dbfff0edf1d4b2beeaa3dd3e91f22b5d
Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_4.jpg differ
diff --git a/pages/Llava-NeXT-Interleave/image_5.jpg b/pages/Llava-NeXT-Interleave/image_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6180371a72fd732699de94bc05d06b4f229d8539
Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_5.jpg differ
diff --git a/pages/Llava-NeXT-Interleave/image_6.jpg b/pages/Llava-NeXT-Interleave/image_6.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d0f03d353e60db0d39cfae84ae742cfba799fdf7
Binary files /dev/null and b/pages/Llava-NeXT-Interleave/image_6.jpg differ
diff --git a/pages/Llava-NeXT-Interleave/video_1.mp4 b/pages/Llava-NeXT-Interleave/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..dd61907079b0a479ca9e253809bbe66a2964cb04
Binary files /dev/null and b/pages/Llava-NeXT-Interleave/video_1.mp4 differ
diff --git a/pages/MobileSAM/MobileSAM.md b/pages/MobileSAM/MobileSAM.md
new file mode 100644
index 0000000000000000000000000000000000000000..edeb658b719f37814df3c64879d1c4c9349cff14
--- /dev/null
+++ b/pages/MobileSAM/MobileSAM.md
@@ -0,0 +1,38 @@
+๏ปฟRead the MobileSAM paper this weekend ๐ Sharing some insights!
+The idea ๐ก: the SAM model consists of three parts, a heavy image encoder, a prompt encoder (the prompt can be text, bounding box, mask or point) and a mask decoder.
+To make the SAM model smaller without compromising performance, the authors looked into three types of distillation.
+The first one is distilling the decoder outputs directly (a more naive approach) with a completely randomly initialized small ViT and a randomly initialized mask decoder.
+However, when the ViT and the decoder are both in a bad state, this doesn't work well.
+
+![image_1](image_1.jpg)
+
+The second type of distillation is called semi-coupled, where the authors only randomly initialized the ViT image encoder and kept the mask decoder.
+This is called semi-coupled because the image encoder distillation still depends on the mask decoder (see below ๐ )
+
+![image_2](image_2.jpg)
+
+The last type of distillation, decoupled distillation, is the most intuitive IMO.
+The authors "decoupled" the image encoder distillation altogether: they froze the mask decoder and didn't really distill based on generated masks.
+This makes sense as the bottleneck here is the encoder itself and most of the time, distillation works well with encoding.
+
+![image_3](image_3.jpg)
+
+Finally, they found out that decoupled distillation performs better than coupled distillation in terms of mean IoU and requires much less compute! โฅ๏ธ
+
+![image_4](image_4.jpg)
+
+Wanted to leave some links here if you'd like to try it yourself ๐
+- MobileSAM [demo](https://huggingface.co/spaces/dhkim2810/MobileSAM)
+- Model [repository](https://huggingface.co/dhkim2810/MobileSAM)
+If you'd like to experiment with TinyViT, the timm library has a bunch of [checkpoints available](https://huggingface.co/models?sort=trending&search=timm%2Ftinyvit).
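+
+A minimal sketch of loading one of those TinyViT checkpoints with timm (the exact model name is just one example from the search linked above):
+
+```python
+import timm
+import torch
+
+# any TinyViT checkpoint from the Hub search works; this name is just one example
+model = timm.create_model("tiny_vit_21m_224.dist_in22k_ft_in1k", pretrained=True)
+model.eval()
+
+x = torch.randn(1, 3, 224, 224)    # dummy 224x224 image batch
+with torch.no_grad():
+    print(model(x).shape)          # ImageNet-1k logits, e.g. torch.Size([1, 1000])
+```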
+
+![image_5](image_5.jpg)
+
+> [!TIP]
+Resources:
+[Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289)
+by Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, Choong Seon Hong (2023)
+[GitHub](https://github.com/ChaoningZhang/MobileSAM)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1738959605542076863) (December 24, 2023)
\ No newline at end of file
diff --git a/pages/MobileSAM/image_1.jpeg b/pages/MobileSAM/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..ef60949c0d7df312dc478f8e55d609cd1d304456
Binary files /dev/null and b/pages/MobileSAM/image_1.jpeg differ
diff --git a/pages/MobileSAM/image_2.jpg b/pages/MobileSAM/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..05c05faed896fe13547e17e9af6195b1a825768c
Binary files /dev/null and b/pages/MobileSAM/image_2.jpg differ
diff --git a/pages/MobileSAM/image_3.jpeg b/pages/MobileSAM/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..6bef1b50007b2a48c7538b918015ee40faa2241b
Binary files /dev/null and b/pages/MobileSAM/image_3.jpeg differ
diff --git a/pages/MobileSAM/image_4.jpg b/pages/MobileSAM/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f883304123672c22f38c11521f6298ee3b801244
Binary files /dev/null and b/pages/MobileSAM/image_4.jpg differ
diff --git a/pages/MobileSAM/image_5.jpeg b/pages/MobileSAM/image_5.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..62b176d575b8e381be10fa4001abec4fcd90bbd9
Binary files /dev/null and b/pages/MobileSAM/image_5.jpeg differ
diff --git a/pages/OWLv2/OWLv2.md b/pages/OWLv2/OWLv2.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ea5e41ed0d183042162a595e8fc52e38780413e
--- /dev/null
+++ b/pages/OWLv2/OWLv2.md
@@ -0,0 +1,46 @@
+๏ปฟExplaining the ๐ of zero-shot open-vocabulary object detection: OWLv2 ๐ฆ๐งถ
+
+![image_1](image_1.jpg)
+
+OWLv2 is a scaled-up version of a model called OWL-ViT, so let's take a look at that first.
+๐ OWL-ViT is an open-vocabulary object detector, meaning it can detect objects it didn't explicitly see during training.
+๐ What's cool is that it can take both image and text queries! This is thanks to how the image and text features aren't fused together.
+
+![image_2](image_2.jpg)
+
+Taking a look at the architecture, the authors firstly do contrastive pre-training of a vision and a text encoder (just like CLIP).
+They take that model, remove the final pooling layer and attach a lightweight classification and box detection head and fine-tune.
+
+![image_3](image_3.jpg)
+
+During fine-tuning for object detection, they calculate the loss over bipartite matches.
+Simply put, the loss is calculated between the predicted objects and the ground-truth objects, and the goal is to find a perfect matching of these two sets where each prediction is matched to one ground-truth object.
+
+OWL-ViT is very scalable.
+One can easily scale most language models or vision-language models because they require no supervision, but this isn't the case for object detection: you still need supervision.
+Moreover, only scaling the encoders creates a bottleneck after a while.
+
+![image_1](image_1.jpg)
+
+The authors wanted to scale OWL-ViT with more data, so they used OWL-ViT to pseudo-label a large set of images, "self-train" a new detector on those labels, and then fine-tune the model on human-annotated data (see below).
+
+![image_4](image_4.jpg)
+
+Thanks to this, OWLv2 scaled very well and tops the leaderboards on open-vocabulary object detection ๐
+
+![image_5](image_5.jpg)
+
+Want to try OWL models? I've created a [notebook](https://t.co/ick5tA6nyx) for you to see how to use them with ๐ค Transformers.
+If you want to play with it directly, you can use this [Space](https://t.co/oghdLOtoa5).
+All the models and the applications of the OWL series are in this [collection](https://huggingface.co/collections/merve/owl-series-65aaac3114e6582c300544df).
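+
+Here's a minimal sketch along the lines of the official documentation, using the base ensemble checkpoint as an example:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import Owlv2Processor, Owlv2ForObjectDetection
+
+processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
+model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+texts = [["a photo of a cat", "a photo of a remote control"]]  # one list of queries per image
+
+inputs = processor(text=texts, images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+results = processor.post_process_object_detection(
+    outputs, threshold=0.2, target_sizes=torch.tensor([image.size[::-1]])
+)
+for score, label, box in zip(results[0]["scores"], results[0]["labels"], results[0]["boxes"]):
+    print(texts[0][label], round(score.item(), 2), box.tolist())
+```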
+
+> [!TIP]
+Resources:
+[Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)
+by Matthias Minderer, Alexey Gritsenko, Neil Houlsby (2023)
+[GitHub](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/owlv2)
+
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1748411972675150040) (January 19, 2024)
diff --git a/pages/OWLv2/image_1.jpeg b/pages/OWLv2/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..72ee9ad988e520f29252428ae825af0cbeaf8181
Binary files /dev/null and b/pages/OWLv2/image_1.jpeg differ
diff --git a/pages/OWLv2/image_2.jpeg b/pages/OWLv2/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..405c58b14f8c866d96805784d9d145de09d485f3
Binary files /dev/null and b/pages/OWLv2/image_2.jpeg differ
diff --git a/pages/OWLv2/image_3.jpeg b/pages/OWLv2/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c0c7921bf1b3a35087466bb990364605a85370b9
Binary files /dev/null and b/pages/OWLv2/image_3.jpeg differ
diff --git a/pages/OWLv2/image_4.jpeg b/pages/OWLv2/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..ca8e3933972379a28b580280441035563ed13fd9
Binary files /dev/null and b/pages/OWLv2/image_4.jpeg differ
diff --git a/pages/OWLv2/image_5.jpeg b/pages/OWLv2/image_5.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..e7fba927ba4c152cdd1eb334707c8d744df477ae
Binary files /dev/null and b/pages/OWLv2/image_5.jpeg differ
diff --git a/pages/OneFormer/OneFormer.md b/pages/OneFormer/OneFormer.md
new file mode 100644
index 0000000000000000000000000000000000000000..27d1602909a3fa452c5a9e35c40e6cafdb45319a
--- /dev/null
+++ b/pages/OneFormer/OneFormer.md
@@ -0,0 +1,28 @@
+๏ปฟOneFormer: one model to segment them all? ๐คฏ
+I was looking into paperswithcode leaderboards when I came across OneFormer for the first time so it was time to dig in!
+
+![image_1](image_1.jpg)
+
+OneFormer is a "truly universal" model for semantic, instance and panoptic segmentation tasks โ๏ธ
+What makes it truly universal is that it's a single model that is trained only once and can be used across all tasks ๐
+
+![image_2](image_2.jpg)
+
+The enabler here is the text conditioning: the model is given a text query that states the task type along with the appropriate input, and using a contrastive loss, the model learns the difference between the task types ๐
+
+![image_3](image_3.jpg)
+
+Thanks to ๐ค Transformers, you can easily use the model! I have drafted a [notebook](https://t.co/cBylk1Uv20) for you to try right away ๐
+You can also check out the [Space](https://t.co/31GxlVo1W5) if you don't want to go through the code itself.
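+
+A minimal sketch along the lines of the documentation, using the ADE20k Swin-tiny checkpoint as an example; the task token is what switches between semantic, instance and panoptic modes:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation
+
+checkpoint = "shi-labs/oneformer_ade20k_swin_tiny"
+processor = OneFormerProcessor.from_pretrained(checkpoint)
+model = OneFormerForUniversalSegmentation.from_pretrained(checkpoint)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+# the task token ("semantic", "instance" or "panoptic") is what switches the behaviour
+inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+semantic_map = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+print(semantic_map.shape)  # (height, width) tensor of predicted class ids
+```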
+
+![image_4](image_4.jpg)
+
+> [!TIP]
+Resources:
+[OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220)
+by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi (2022)
+[GitHub](https://github.com/SHI-Labs/OneFormer)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/oneformer)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1739707076501221608) (December 26, 2023)
\ No newline at end of file
diff --git a/pages/OneFormer/image_1.jpeg b/pages/OneFormer/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..6ec704afb2e0397037bdf778053e87dbd00e4fc4
Binary files /dev/null and b/pages/OneFormer/image_1.jpeg differ
diff --git a/pages/OneFormer/image_2.jpeg b/pages/OneFormer/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..235fbca92c28d1ed82ec04fa375a2cf11b091fe3
Binary files /dev/null and b/pages/OneFormer/image_2.jpeg differ
diff --git a/pages/OneFormer/image_3.jpeg b/pages/OneFormer/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..a6f76731199a868aeab4556e774c32daf12c9407
Binary files /dev/null and b/pages/OneFormer/image_3.jpeg differ
diff --git a/pages/OneFormer/image_4.jpeg b/pages/OneFormer/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..f37500ae6e880ba6ccd56b72803a1761bc4a2bdf
Binary files /dev/null and b/pages/OneFormer/image_4.jpeg differ
diff --git a/pages/PLLaVA/PLLaVA .md b/pages/PLLaVA/PLLaVA .md
new file mode 100644
index 0000000000000000000000000000000000000000..4ada3bc5fd9e1dc9501c41d1796565f276968626
--- /dev/null
+++ b/pages/PLLaVA/PLLaVA .md
@@ -0,0 +1,30 @@
+๏ปฟParameter-free LLaVA for video captioning works like magic! ๐คฉ Let's take a look!
+
+![image_1](image_1.jpg)
+
+Most of the video captioning models work by downsampling video frames to reduce computational complexity and memory requirements without losing a lot of information in the process.
+PLLaVA, on the other hand, uses pooling! ๐คฉ
+
+How? ๐ง It takes in video frames, passes them through the ViT and then a projection layer, and the output then goes through average pooling, where the input shape is (# frames, width, height, text decoder input dim) ๐
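+
+Purely as an illustration of the idea (this is not the authors' exact pooling configuration), adaptive average pooling can shrink the visual token grid before it reaches the text decoder:
+
+```python
+import torch
+import torch.nn as nn
+
+frames, h, w, dim = 16, 24, 24, 4096            # hypothetical shapes
+features = torch.randn(1, frames, h, w, dim)    # (batch, # frames, width, height, decoder dim)
+
+pool = nn.AdaptiveAvgPool3d((frames, 12, 12))   # keep the time axis, halve the spatial grid
+pooled = pool(features.permute(0, 4, 1, 2, 3))  # pooling expects (batch, dim, frames, h, w)
+tokens = pooled.flatten(2).transpose(1, 2)      # (batch, frames * 12 * 12, dim)
+print(tokens.shape)                             # torch.Size([1, 2304, 4096])
+```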
+
+![image_2](image_2.jpg)
+
+The pooling operation surprisingly reduces the loss of spatial and temporal information. See below for some examples of how it can capture the details ๐ค
+
+![image_3](image_3.jpg)
+
+According to the authors' findings, it performs way better than many of the existing models (including proprietary VLMs) and scales very well (with the text decoder size).
+
+![image_4](image_4.jpg)
+
+Model repositories ๐ค [7B](https://t.co/AeSdYsz1U7), [13B](https://t.co/GnI1niTxO7), [34B](https://t.co/HWAM0ZzvDc)
+Spaces๐ค [7B](https://t.co/Oms2OLkf7O), [13B](https://t.co/C2RNVNA4uR)
+
+> [!TIP]
+Resources:
+[PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994)
+by Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024)
+[GitHub](https://github.com/magic-research/PLLaVA)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1786336055425138939) (May 3, 2024)
\ No newline at end of file
diff --git a/pages/PLLaVA/image_1.jpg b/pages/PLLaVA/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b1982963ed57d3e93a1435cb9e8e87be1704a7ba
Binary files /dev/null and b/pages/PLLaVA/image_1.jpg differ
diff --git a/pages/PLLaVA/image_2.jpeg b/pages/PLLaVA/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..465a9e1c566028c066fee65a41183bb50bfbdce8
Binary files /dev/null and b/pages/PLLaVA/image_2.jpeg differ
diff --git a/pages/PLLaVA/image_3.jpeg b/pages/PLLaVA/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..58444be4d809d573317af1eb384adcebf01c59f5
Binary files /dev/null and b/pages/PLLaVA/image_3.jpeg differ
diff --git a/pages/PLLaVA/image_4.jpeg b/pages/PLLaVA/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..1feea29a228a4ab8595cb12d8b7f152cc6fc174d
Binary files /dev/null and b/pages/PLLaVA/image_4.jpeg differ
diff --git a/pages/Painter/Painter .md b/pages/Painter/Painter .md
new file mode 100644
index 0000000000000000000000000000000000000000..245c9449afa07a24b41f5c17e83878f72d5ebbe6
--- /dev/null
+++ b/pages/Painter/Painter .md
@@ -0,0 +1,24 @@
+๏ปฟI read the Painter [paper](https://t.co/r3aHp29mjf) by BAAIBeijing to convert the weights to transformers, and I absolutely loved the approach they took so I wanted to take time to unfold it here!
+
+![image_1](image_1.jpg)
+
+So essentially this model takes inspiration from in-context learning: as in LLMs, you give an example input-output pair and then the actual input that you want the model to complete (one-shot learning). They adapted this to images, thus the name "images speak in images".
+
+This model doesn't have any multimodal parts, it just has an image encoder and a decoder head (linear layer, conv layer, another linear layer), so it's a single modality.
+
+The magic sauce is the data: they input the task in the form of an image and its associated transformation, plus another image they want the transformation applied to, and take a smooth L2 loss over the predictions and ground truth. This is like the T5 of image models ๐
+
+![image_2](image_2.jpg)
+
+What is so cool about it is that it can actually adapt to out of domain tasks, meaning, in below chart, it was trained on the tasks above the dashed line, and the authors found out it generalized to the tasks below the line, image tasks are well generalized ๐คฏ
+
+![image_3](image_3.jpg)
+
+> [!TIP]
+Resources:
+[Images Speak in Images: A Generalist Painter for In-Context Visual Learning](https://arxiv.org/abs/2212.02499)
+by Xinlong Wang, Wen Wang, Yue Cao, Chunhua Shen, Tiejun Huang (2022)
+[GitHub](https://github.com/baaivision/Painter)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1771542172946354643) (March 23, 2024)
\ No newline at end of file
diff --git a/pages/Painter/image_1.jpeg b/pages/Painter/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..de15559d4c88c52c9dcf160c5f693b9dc3f784f5
Binary files /dev/null and b/pages/Painter/image_1.jpeg differ
diff --git a/pages/Painter/image_2.jpeg b/pages/Painter/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..0ac63a33928ff9ce06650ff79f3a8eadeb9c13aa
Binary files /dev/null and b/pages/Painter/image_2.jpeg differ
diff --git a/pages/Painter/image_3.jpeg b/pages/Painter/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..97833003915047644e73bcf0650282eeff9ae1ac
Binary files /dev/null and b/pages/Painter/image_3.jpeg differ
diff --git a/pages/RT-DETR/RT-DETR.md b/pages/RT-DETR/RT-DETR.md
new file mode 100644
index 0000000000000000000000000000000000000000..f64e5b8c1242c513d4449132d5d79a9a5968149b
--- /dev/null
+++ b/pages/RT-DETR/RT-DETR.md
@@ -0,0 +1,34 @@
+Real-time DEtection Transformer (RT-DETR) landed in @huggingface transformers ๐คฉ with an Apache 2.0 license ๐
+Do DETRs Beat YOLOs on Real-time Object Detection? Keep reading ๐
+
+![video_1](video_1.mp4)
+
+Short answer: they do!
+๐ [notebook](https://t.co/NNRpG9cAEa), ๐ [models](https://t.co/ctwWQqNcEt), ๐ [demo](https://t.co/VrmDDDjoNw)
+
+YOLO models are known to be super fast for real-time computer vision, but their speed and accuracy suffer from the NMS (non-maximum suppression) post-processing step ๐ฅฒ
+Transformer-based detectors, on the other hand, are not as computationally efficient ๐ฅฒ Isn't there something in between? Enter RT-DETR!
+
+The authors combine a CNN backbone and a multi-stage hybrid encoder (combining convs and attention) with a transformer decoder โ
+
+![image_1](image_1.jpg)
+
+In the paper, the authors also claim that one can adjust the speed by changing the number of decoder layers without retraining altogether. They also conduct many ablation studies and try different decoders (see below).
+
+![image_2](image_2.jpg)
+
+The authors find that the model performs better in terms of both speed and accuracy compared to the previous state-of-the-art ๐คฉ
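+
+Here's a minimal sketch along the lines of the documentation, using the `PekingU/rtdetr_r50vd` checkpoint as an example:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
+
+image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+inputs = image_processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# no NMS needed: the decoder directly predicts a fixed set of boxes
+results = image_processor.post_process_object_detection(
+    outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.5
+)
+for score, label, box in zip(results[0]["scores"], results[0]["labels"], results[0]["boxes"]):
+    print(model.config.id2label[label.item()], round(score.item(), 2), box.tolist())
+```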
+
+![image_3](image_3.jpg)
+
+> [!TIP]
+Resources:
+[DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069)
+by Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen (2023)
+[GitHub](https://github.com/lyuwenyu/RT-DETR/)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/rt_detr)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1807790959884665029) (July 1, 2024)
\ No newline at end of file
diff --git a/pages/RT-DETR/image_1.jpg b/pages/RT-DETR/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f1327e2043d79df9986726a7991ac6ae29fa1755
Binary files /dev/null and b/pages/RT-DETR/image_1.jpg differ
diff --git a/pages/RT-DETR/image_2.jpg b/pages/RT-DETR/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3843ce411d841815591969ba2969a3220b2d0eda
Binary files /dev/null and b/pages/RT-DETR/image_2.jpg differ
diff --git a/pages/RT-DETR/image_3.jpg b/pages/RT-DETR/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4584fbe2239f41fc1e05e06f3d3fed132db8f580
Binary files /dev/null and b/pages/RT-DETR/image_3.jpg differ
diff --git a/pages/RT-DETR/video_1.mp4 b/pages/RT-DETR/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..dab487c3591444cc565ecb4f816ee2ba901736fe
--- /dev/null
+++ b/pages/RT-DETR/video_1.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d6655d13b0522985f24da98aa6abb732b2f87230c118d7d3d74c6712b091f71
+size 2649878
diff --git a/pages/SAMv2/SAMv2.md b/pages/SAMv2/SAMv2.md
new file mode 100644
index 0000000000000000000000000000000000000000..012828a0344741b556a5ce38cb5c568c18d01644
--- /dev/null
+++ b/pages/SAMv2/SAMv2.md
@@ -0,0 +1,36 @@
+๏ปฟSAMv2 is just mindblowingly good ๐ Learn what makes this model so good at video segmentation, keep reading ๐ฆโ
+
+![video_1](video_1.mp4)
+
+Check out the [demo](https://t.co/35ixEZgPaf) by @skalskip92 to see how to use the model locally.
+Check out Meta's [demo](https://t.co/Bcbli9Cfim) where you can edit segmented instances too!
+
+![image_1](image_1.jpg)
+
+However, SAM doesn't naturally track object instances in videos: one would need to prompt the same instance with a mask or point in every single frame and feed each frame separately, which is infeasible ๐ But don't fret, that is where SAMv2 comes in with a memory module!
+
+SAMv2 defines a new task called "masklet prediction", where a masklet refers to the same mask instance throughout the frames ๐๏ธ Unlike SAM, the SAM 2 decoder is not fed the image embedding directly from the image encoder, but an embedding conditioned, through attention, on memories of prompted frames and on object pointers.
+
+![image_2](image_2.jpg)
+
+๐ผ๏ธ These "memories" are essentially past predictions of the object of interest for up to a number of recent frames, stored in the form of spatial feature maps of location info ๐๐ป The object pointers are high-level semantic information about the object of interest.
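+
+Purely as an illustrative pseudo-sketch of the memory idea (not Meta's implementation), the current frame's features can cross-attend to a bank of spatial memories and object pointers before mask decoding:
+
+```python
+import torch
+import torch.nn as nn
+
+dim = 256
+frame_tokens = torch.randn(1, 64 * 64, dim)          # current frame feature map, flattened
+spatial_memory = torch.randn(1, 7 * 64 * 64, dim)    # memories from up to 7 recent frames
+object_pointers = torch.randn(1, 7, dim)             # one high-level pointer token per memory
+
+memory_bank = torch.cat([spatial_memory, object_pointers], dim=1)
+memory_attention = nn.MultiheadAttention(dim, num_heads=8, batch_first=True)
+conditioned, _ = memory_attention(query=frame_tokens, key=memory_bank, value=memory_bank)
+print(conditioned.shape)  # same shape as frame_tokens, now conditioned on past frames
+```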
+
+Just like the SAM paper, SAMv2 depends on a data engine, and the dataset it generated comes with the release: SA-V ๐คฏ This dataset is gigantic: it has 190.9K manual masklet annotations and 451.7K automatic masklets!
+
+![image_3](image_3.jpg)
+
+Initially, they apply SAM to each frame to assist human annotators in annotating a video at six FPS for high-quality data. In the second phase, they add SAM and SAM 2 to generate masklets consistently across time. Finally, they use SAM 2 to refine the masklets.
+
+They evaluated this model on the J&F score (Jaccard Index + F-measure for contour accuracy), which is used in zero-shot video segmentation benchmarks. SAMv2 seems to outperform the two previously state-of-the-art models that are built on top of SAM! ๐ฅน
+
+![image_4](image_4.jpg)
+
+> [!TIP]
+Resources:
+[SAM 2: Segment Anything in Images and Videos]()
+by Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman Rรคdle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Dollรกr, Christoph Feichtenhofer (2024)
+[GitHub](https://github.com/facebookresearch/segment-anything-2)
+[Hugging Face documentation]()
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1818675981634109701) (July 31, 2024)
\ No newline at end of file
diff --git a/pages/SAMv2/image_1.jpg b/pages/SAMv2/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..940e725eb2cd8f6aa75cdb90db442de18f1f68fb
Binary files /dev/null and b/pages/SAMv2/image_1.jpg differ
diff --git a/pages/SAMv2/image_2.jpg b/pages/SAMv2/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cf27c7fc476d3730357270e84e7f57fe400695a6
Binary files /dev/null and b/pages/SAMv2/image_2.jpg differ
diff --git a/pages/SAMv2/image_3.jpg b/pages/SAMv2/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cacf6bc0c374958f55ae4eb22d6ee0f91a8ef77b
Binary files /dev/null and b/pages/SAMv2/image_3.jpg differ
diff --git a/pages/SAMv2/image_4.jpg b/pages/SAMv2/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..31c92169f7a556884e4b0aec6b0c38b904e72e4a
Binary files /dev/null and b/pages/SAMv2/image_4.jpg differ
diff --git a/pages/SAMv2/video_1.mp4 b/pages/SAMv2/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..fb602d5996c0e8f5535a4cb084d4c38b5eddb839
Binary files /dev/null and b/pages/SAMv2/video_1.mp4 differ
diff --git a/pages/SegGPT/SegGPT.md b/pages/SegGPT/SegGPT.md
new file mode 100644
index 0000000000000000000000000000000000000000..e4d917b1ff0ab07327048313eb4e8250396bc15e
--- /dev/null
+++ b/pages/SegGPT/SegGPT.md
@@ -0,0 +1,34 @@
+SegGPT is a vision generalist for image segmentation, quite like GPT for computer vision โจ
+It comes with the latest release of ๐ค Transformers.
+๐ Technical details, demo and how-to's under this!
+
+![image_1](image_1.jpg)
+
+SegGPT is an extension of Painter, where you speak to images with images: the model takes in an image prompt, a transformed version of the image prompt, and the actual image you want the same transform applied to, and it is expected to output the transformed image.
+
+SegGPT consists of a vanilla ViT with a decoder on top (linear, conv, linear). The model is trained on diverse segmentation examples, where they provide example image-mask pairs and the actual input to be segmented, and the decoder head learns to reconstruct the mask output. ๐๐ป
+
+![image_2](image_2.jpg)
+
+This generalizes pretty well! The authors do not claim state-of-the-art results, as the model is mainly meant for zero-shot and few-shot inference. They also do prompt tuning, where they freeze the parameters of the model and only optimize the image tensor (the input context).
+
+![image_3](image_3.jpg)
+
+Thanks to ๐ค Transformers you can use this model easily! See [here](https://t.co/U5pVpBhkfK).
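+
+A minimal sketch of in-context segmentation with the `BAAI/seggpt-vit-large` checkpoint; the file names are placeholders and the exact pre/post-processing arguments are in the docs:
+
+```python
+import torch
+from PIL import Image
+from transformers import SegGptImageProcessor, SegGptForImageSegmentation
+
+checkpoint = "BAAI/seggpt-vit-large"
+image_processor = SegGptImageProcessor.from_pretrained(checkpoint)
+model = SegGptForImageSegmentation.from_pretrained(checkpoint)
+
+# placeholder local files: an example image, its mask, and the new image to segment the same way
+prompt_image = Image.open("prompt.png")
+prompt_mask = Image.open("prompt_mask.png")
+input_image = Image.open("input.png")
+
+inputs = image_processor(
+    images=input_image, prompt_images=prompt_image, prompt_masks=prompt_mask, return_tensors="pt"
+)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+mask = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[input_image.size[::-1]])[0]
+print(mask.shape)  # (height, width) mask for the input image
+```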
+
+![image_4](image_4.jpg)
+
+I have built an app for you to try it out. I combined SegGPT with Depth Anything Model, so you don't have to upload image mask prompts in your prompt pair ๐ค
+Try it [here](https://t.co/uJIwqJeYUy). Also check out the [collection](https://t.co/HvfjWkAEzP).
+
+![image_5](image_5.jpg)
+
+> [!TIP]
+Resources:
+[SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284)
+by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang (2023)
+[GitHub](https://github.com/baaivision/Painter)
+
+> [!NOTE]
+[Original tweet](https://x.com/mervenoyann/status/1773056450790666568) (March 27, 2024)
+
diff --git a/pages/SegGPT/image_1.jpeg b/pages/SegGPT/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..03e16ddfce8180ba6f9681d4fcb3031f93ff30d5
Binary files /dev/null and b/pages/SegGPT/image_1.jpeg differ
diff --git a/pages/SegGPT/image_2.jpg b/pages/SegGPT/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4a7091327314a58e8eb97cd6ba63473b61db23ec
Binary files /dev/null and b/pages/SegGPT/image_2.jpg differ
diff --git a/pages/SegGPT/image_3.jpg b/pages/SegGPT/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2c38f1049ea41b1d4b2a3ed876b480ceafb21584
Binary files /dev/null and b/pages/SegGPT/image_3.jpg differ
diff --git a/pages/SegGPT/image_4.jpeg b/pages/SegGPT/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..f196104473057b2f77c901102c44ce60c19fd3aa
Binary files /dev/null and b/pages/SegGPT/image_4.jpeg differ
diff --git a/pages/SegGPT/image_5.jpeg b/pages/SegGPT/image_5.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..53c314f0da8f1737acc2b2d3d7ba3331fff388c8
Binary files /dev/null and b/pages/SegGPT/image_5.jpeg differ
diff --git a/pages/SigLIP/SigLIP.md b/pages/SigLIP/SigLIP.md
new file mode 100644
index 0000000000000000000000000000000000000000..323e74c16a790e610e7525c843d67e54b5807d0e
--- /dev/null
+++ b/pages/SigLIP/SigLIP.md
@@ -0,0 +1,40 @@
+๏ปฟSigLIP just got merged to ๐คtransformers and it's super easy to use! To celebrate this, I have created a repository on various SigLIP based projects!
+But what is it and how does it work? SigLIP is a vision-text pre-training technique based on contrastive learning.
+It jointly trains an image encoder and a text encoder such that the dot product of the embeddings is highest for matching text-image pairs.
+The image below is taken from CLIP, where this contrastive pre-training takes place with softmax, but SigLIP replaces softmax with sigmoid. ๐
+
+![image_1](image_1.jpg)
+
+Highlightsโจ
+๐ผ๏ธ๐ The authors used a medium-sized B/16 ViT as the image encoder and a B-sized transformer as the text encoder
+๐ More performant than CLIP on zero-shot
+๐ฃ๏ธ Authors trained a multilingual model too!
+โก๏ธ Super efficient: the sigmoid loss enables batch sizes of up to 1M items, but the authors chose 32k (performance saturates after that, see below)
+
+![image_2](image_2.jpg)
+
+Below you can find prior CLIP models and SigLIP across different image encoder sizes and their performance on different datasets ๐๐ป
+
+![image_3](image_3.jpg)
+
+The ๐ค Transformers integration comes with a zero-shot-image-classification pipeline, which makes SigLIP super easy to use!
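+
+A minimal sketch with the pipeline (the checkpoint and labels are just example choices):
+
+```python
+from transformers import pipeline
+
+classifier = pipeline("zero-shot-image-classification", model="google/siglip-base-patch16-224")
+results = classifier(
+    "http://images.cocodataset.org/val2017/000000039769.jpg",
+    candidate_labels=["2 cats", "a plane", "a remote control"],
+)
+print(results)  # list of {"label", "score"} dicts, scored with the sigmoid head
+```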
+
+![image_4](image_4.jpg)
+
+What to use SigLIP for? ๐ง
+Honestly the possibilities are endless, but you can use it for image/text retrieval, zero-shot classification, training multimodal models!
+I have made a repository with notebooks and applications that are also hosted on [Spaces](https://t.co/Ah1CrHVuPY).
+I have built ["Draw to Search Art"](https://t.co/DcmQWMc1qd) where you can input an image (upload one or draw) and search among 10k images in WikiArt!
+I've also built apps to [compare](https://t.co/m699TMvuW9) CLIP and SigLIP outputs.
+
+![image_5](image_5.jpg)
+
+> [!TIP]
+Resources:
+[Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343)
+by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer (2023)
+[GitHub](https://github.com/google-research/big_vision)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/siglip)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1745476609686089800) (January 11, 2024)
\ No newline at end of file
diff --git a/pages/SigLIP/image_1.jpg b/pages/SigLIP/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0fd3aeaad1a19acc88487c75e8f97fff7575f01c
Binary files /dev/null and b/pages/SigLIP/image_1.jpg differ
diff --git a/pages/SigLIP/image_2.jpg b/pages/SigLIP/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..ff186dd4abdc81fcfda6ffcf54724cb99b9bd56b
Binary files /dev/null and b/pages/SigLIP/image_2.jpg differ
diff --git a/pages/SigLIP/image_3.jpg b/pages/SigLIP/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..27e5d501bae71ed831bbb168cea1f2c4891ec01e
Binary files /dev/null and b/pages/SigLIP/image_3.jpg differ
diff --git a/pages/SigLIP/image_4.jpg b/pages/SigLIP/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e3fba6d6ddff448b95a4087f5cada65a60bd22a6
Binary files /dev/null and b/pages/SigLIP/image_4.jpg differ
diff --git a/pages/SigLIP/image_5.jpg b/pages/SigLIP/image_5.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f9684cf65024fe63665694fd6f67317cee4fd0fd
Binary files /dev/null and b/pages/SigLIP/image_5.jpg differ
diff --git a/pages/SigLIP/image_6.jpeg b/pages/SigLIP/image_6.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..88c6a6244655c950d953fc2c5885755efecace43
Binary files /dev/null and b/pages/SigLIP/image_6.jpeg differ
diff --git a/pages/VITMAE/VITMAE.md b/pages/VITMAE/VITMAE.md
new file mode 100644
index 0000000000000000000000000000000000000000..ec69e7bc8d04bb118307d8e695639833767aae6b
--- /dev/null
+++ b/pages/VITMAE/VITMAE.md
@@ -0,0 +1,31 @@
+Just read the ViTMAE paper, sharing some highlights ๐งถ ViTMAE is a simple yet effective self-supervised pre-training technique, where the authors combined a vision transformer with a masked autoencoder.
+The images are first masked (75 percent of the image!) and then the model learns the features by trying to reconstruct the original image!
+
+![image_1](image_1.jpg)
+
+The full image is not fed to the encoder: only the visible patches are (and that is the only thing the encoder sees!).
+Next, a mask token is added where the masked patches are (a bit like BERT, if you will) and the mask tokens and the encoded patches are fed to the decoder.
+The decoder then tries to reconstruct the original image.
+
+![image_2](image_2.jpg)
+
+As a result, the authors found out that a high masking ratio works well both for fine-tuning on downstream tasks and for linear probing ๐คฏ๐คฏ
+
+![image_3](image_3.jpg)
+
+If you want to try the model or fine-tune it, all the pre-trained ViTMAE models released by Meta are available on [Hugging Face](https://t.co/didvTL9Zkm).
+We've built a [demo](https://t.co/PkuACJiKrB) for you to see the intermediate outputs and reconstruction by VITMAE.
+
+Also there's a nice [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb) by [@NielsRogge](https://twitter.com/NielsRogge).
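+
+A minimal sketch of running the pre-trained model and inspecting the masking/reconstruction outputs, using the `facebook/vit-mae-base` checkpoint as an example:
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, ViTMAEForPreTraining
+
+image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
+model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+inputs = image_processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+
+print(outputs.loss)          # reconstruction loss on the masked patches
+print(outputs.mask.shape)    # which patches were masked (1) vs. kept (0)
+print(outputs.logits.shape)  # per-patch pixel reconstructions
+```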
+
+![image_4](image_4.jpg)
+
+> [!TIP]
+Resources:
+[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v3)
+by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollรกr, Ross Girshick (2021)
+[GitHub](https://github.com/facebookresearch/mae)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/vit_mae)
+
+> [!NOTE]
+[Original tweet](https://twitter.com/mervenoyann/status/1740688304784183664) (December 29, 2023)
diff --git a/pages/VITMAE/image_1.jpeg b/pages/VITMAE/image_1.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..e3ec2fb793c4aec42c5bd276c92d92c21190355c
Binary files /dev/null and b/pages/VITMAE/image_1.jpeg differ
diff --git a/pages/VITMAE/image_2.jpeg b/pages/VITMAE/image_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..c768c72863efcaaf31aa2da5026b0fa15a0fdc82
Binary files /dev/null and b/pages/VITMAE/image_2.jpeg differ
diff --git a/pages/VITMAE/image_3.jpeg b/pages/VITMAE/image_3.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..1eaadcb6298bc045b1c631d63b28417955610bce
Binary files /dev/null and b/pages/VITMAE/image_3.jpeg differ
diff --git a/pages/VITMAE/image_4.jpeg b/pages/VITMAE/image_4.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..560d4ad4ae4df23a8f1b195257627db9bb403a51
Binary files /dev/null and b/pages/VITMAE/image_4.jpeg differ
diff --git a/pages/Video-LLaVA/Video-LLaVA.md b/pages/Video-LLaVA/Video-LLaVA.md
new file mode 100644
index 0000000000000000000000000000000000000000..51bac3ce0a42ffe0c80ee19b41943ca4e8a966b0
--- /dev/null
+++ b/pages/Video-LLaVA/Video-LLaVA.md
@@ -0,0 +1,32 @@
+We have recently merged Video-LLaVA into @huggingface transformers! ๐ค
+๐๏ธ What makes this model different? Keep reading โ
+
+![video](video_1.mp4)
+
+[Demo](https://t.co/MVP14uEj9e) | [Model](https://t.co/oqSCMUqwJo)
+See below how to initialize the model and the processor and run inference โฌ๏ธ
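+
+Here's a rough sketch of what the screenshot below shows, assuming the `LanguageBind/Video-LLaVA-7B-hf` conversion; here the video is a random-array stand-in for a real clip of sampled frames:
+
+```python
+import numpy as np
+import torch
+from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
+
+model_id = "LanguageBind/Video-LLaVA-7B-hf"
+processor = VideoLlavaProcessor.from_pretrained(model_id)
+model = VideoLlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+
+# stand-in for a real clip: 8 sampled frames as (frames, height, width, channels)
+video = np.random.randint(0, 255, size=(8, 224, 224, 3), dtype=np.uint8)
+
+prompt = "USER: <video>\nWhat is happening in this video? ASSISTANT:"
+inputs = processor(text=prompt, videos=video, return_tensors="pt").to(model.device, torch.float16)
+out = model.generate(**inputs, max_new_tokens=80)
+print(processor.batch_decode(out, skip_special_tokens=True)[0])
+```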
+
+
+![image_1](image_1.jpg)
+
+Compared to other models that take image and video input and either project them separately or downsample the video and project only selected frames, Video-LLaVA converts images and videos to a unified representation and projects them using a shared projection layer.
+
+![image_2](image_2.jpg)
+
+It uses Vicuna 1.5 as the language model and LanguageBind's own encoders, which are based on OpenCLIP; these encoders map each modality to a unified representation before it is passed to the projection layer.
+
+![image_3](image_3.jpg)
+
+I feel like one of the coolest features of this model is its joint image-video understanding, which many models have only introduced recently. It's a relatively older model, but it was ahead of its time and works very well!
+
+![image_4](image_4.jpg)
+
+> [!TIP]
+Resources:
+[Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122)
+by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan (2023)
+[GitHub](https://github.com/PKU-YuanGroup/Video-LLaVA)
+[Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/video_llava)
+
+> [!NOTE]
+[Original tweet](https://x.com/mervenoyann/status/1816427325073842539) (July 25, 2024)
\ No newline at end of file
diff --git a/pages/Video-LLaVA/image_1.jpg b/pages/Video-LLaVA/image_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6ef2142413d91d3a62b3548be756e743c85eb63f
Binary files /dev/null and b/pages/Video-LLaVA/image_1.jpg differ
diff --git a/pages/Video-LLaVA/image_2.jpg b/pages/Video-LLaVA/image_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..63d46e29066a8d1deadb2637a52770d0cae00547
Binary files /dev/null and b/pages/Video-LLaVA/image_2.jpg differ
diff --git a/pages/Video-LLaVA/image_3.jpg b/pages/Video-LLaVA/image_3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3ea5e94b62652fbf17a1465aa4bdb7b6189bb5fe
Binary files /dev/null and b/pages/Video-LLaVA/image_3.jpg differ
diff --git a/pages/Video-LLaVA/image_4.jpg b/pages/Video-LLaVA/image_4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..662e32e2d8c775c203b5fb3d93cfe31d2b8347c4
Binary files /dev/null and b/pages/Video-LLaVA/image_4.jpg differ
diff --git a/pages/Video-LLaVA/video_1.mp4 b/pages/Video-LLaVA/video_1.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..fbf5f3e97502d4f06f433be09972bc87d11a60b9
Binary files /dev/null and b/pages/Video-LLaVA/video_1.mp4 differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..07aec85c48f5f456bae61c72a2608594f342ae9d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+streamlit-extras
\ No newline at end of file