import streamlit as st
from streamlit_extras.switch_page_button import switch_page
translations = {
    'en': {
        'title': 'NVEagle',
        'original_tweet':
            """
[Original tweet](https://x.com/mervenoyann/status/1829144958101561681) (August 29, 2024)
            """,
        'tweet_1':
            """
NVIDIA just dropped NVEagle 🦅
Super impressive vision language model that comes in 7B, 13B and 13B fine-tuned on chat, with improved visual perception thanks to MoE vision encoders 💬
Keep reading for details and links ⇓
            """,
        'tweet_2':
            """
[Model repositories](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) | Try it [here](https://huggingface.co/spaces/NVEagle/Eagle-X5-13B-Chat) 💬 (works very well! 🤯)
            """,
        'tweet_3':
            """
This model essentially explores having different experts (MoE) and fusion strategies for image encoders.
I have been <a href='MiniGemini' target='_self'>talking</a> about how VLMs improve when using multiple encoders in parallel, so seeing this MoE paper made me happy! 🥲
            """,
        'tweet_4':
            """
How? 🧐
The authors concatenate the vision encoder output tokens and apply "pre-alignment": essentially fine-tuning the experts with a frozen text encoder.
The rest of the architecture is quite similar to <a href='LLaVA-NeXT' target='_self'>LLaVA</a>.
            """,
        'tweet_5':
            """
Then they freeze both the experts and the decoder and train only the projection layer; finally, they unfreeze everything for supervised fine-tuning ✨
<br>
They explore different fusion strategies and encoders, extending the basic CLIP encoder, and find that simply concatenating visual tokens works well 🥹
See below the performance of the different experts ⇓⇓
            """,
        'ressources':
            """
Resources:
[Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders](https://www.arxiv.org/abs/2408.15998)
by Min Shi, Fuxiao Liu, Shihao Wang, Shijia Liao, Subhashree Radhakrishnan, De-An Huang, Hongxu Yin, Karan Sapra, Yaser Yacoob, Humphrey Shi, Bryan Catanzaro, Andrew Tao, Jan Kautz, Zhiding Yu, Guilin Liu (2024)
[GitHub](https://github.com/NVlabs/Eagle)
[Models and Demos Collection](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26)
            """
    },
    'fr': {
        'title': 'NVEagle',
        'original_tweet':
            """
[Tweet de base](https://x.com/mervenoyann/status/1829144958101561681) (en anglais) (29 août 2024)
            """,
        'tweet_1':
            """
NVIDIA vient de sortir NVEagle 🦅
Un modèle langage-vision très impressionnant disponible en taille 7B, 13B et 13B, finetuné sur des données de chat.
Il dispose d'une perception visuelle améliorée via un mélange d'experts (MoE) d'encodeurs de vision 💬
Continuez à lire pour plus de détails et des liens ⇓
            """,
        'tweet_2':
            """
[Répertoire des modèles](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) | [Essayez-le ici](https://huggingface.co/spaces/NVEagle/Eagle-X5-13B-Chat) 💬 (fonctionne très bien ! 🤯)
            """,
        'tweet_3':
            """
Ce modèle explore le fait d'avoir différents experts et des stratégies de fusion pour les encodeurs d'images.
J'ai <a href='MiniGemini' target='_self'>parlé</a> de la façon dont les VLM s'améliorent lors de l'utilisation de plusieurs encodeurs en parallèle. Ce papier m'a ainsi rendue heureuse ! 🥲
            """,
        'tweet_4':
            """
Comment ? 🧐
Les auteurs concatènent les tokens de sortie de l'encodeur de vision ensemble, et ils appliquent un « pré-alignement » : ils finetunent les experts avec un encodeur de texte gelé. Le reste de l'architecture est assez similaire à <a href='LLaVA-NeXT' target='_self'>LLaVA</a>.
            """,
        'tweet_5':
            """
Ensuite, ils gèlent les experts et le décodeur et entraînent simplement la couche de projection. Finalement, ils dégèlent le tout pour un finetuning supervisé ✨
<br>
Ils explorent différentes stratégies de fusion et d'encodeurs, étendant l'encodeur CLIP de base, et découvrent que la simple concaténation de tokens visuels fonctionne bien 🥹
Voir ci-dessous les performances de différents experts ⇓⇓
            """,
        'ressources':
            """
Ressources :
[Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders](https://www.arxiv.org/abs/2408.15998)
de Min Shi, Fuxiao Liu, Shihao Wang, Shijia Liao, Subhashree Radhakrishnan, De-An Huang, Hongxu Yin, Karan Sapra, Yaser Yacoob, Humphrey Shi, Bryan Catanzaro, Andrew Tao, Jan Kautz, Zhiding Yu, Guilin Liu (2024)
[GitHub](https://github.com/NVlabs/Eagle)
[Models and Demos Collection](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26)
            """
    }
}
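
# The tweet_3/tweet_4/tweet_5 texts above describe Eagle's core idea: run several vision
# encoders (the "experts") in parallel, concatenate their output tokens, and project the
# fused tokens into the LLM embedding space. The helper below is a minimal, illustrative
# sketch of that fusion step, not the actual Eagle implementation: the encoder callables,
# token dimensions, channel-wise concatenation, and llm_hidden_size default are assumptions.
# It is never called by this page; torch is imported lazily so the app does not need it.
def _fuse_vision_experts_sketch(pixel_values, vision_experts, llm_hidden_size=4096):
    """Concatenate per-expert visual tokens, then project them to the LLM width (sketch)."""
    import torch  # lazy import: only needed if this illustrative helper is ever called

    # Each expert maps the same images to token embeddings of shape (batch, num_tokens, dim);
    # we assume all experts emit the same number of tokens so they can be fused token-wise.
    expert_tokens = [expert(pixel_values) for expert in vision_experts]

    # Channel-wise concatenation of the experts' tokens: (batch, num_tokens, sum of dims).
    fused = torch.cat(expert_tokens, dim=-1)

    # A linear projection maps the fused visual tokens into the LLM embedding space.
    # In the paper this projector is trained while the experts and decoder stay frozen;
    # here it is built on the fly purely to show the shapes involved.
    projector = torch.nn.Linear(fused.shape[-1], llm_hidden_size)
    return projector(fused)
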
def language_selector():
    languages = {'EN': '🇬🇧', 'FR': '🇫🇷'}
    selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector')
    return 'en' if selected_lang == 'EN' else 'fr'
left_column, right_column = st.columns([5, 1])

# Add a selector to the right column
with right_column:
    lang = language_selector()

# Add a title to the left column
with left_column:
    st.title(translations[lang]["title"])

st.success(translations[lang]["original_tweet"], icon="ℹ️")
st.markdown(""" """)
st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True)
st.markdown(""" """)
st.image("pages/NVEagle/image_1.jpg", use_container_width=True)
st.markdown(""" """)
st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True)
st.markdown(""" """)
st.image("pages/NVEagle/image_2.jpg", use_container_width=True)
st.markdown(""" """)
st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True)
st.markdown(""" """)
st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True)
st.markdown(""" """)
st.image("pages/NVEagle/image_3.jpg", use_container_width=True)
st.markdown(""" """)
st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True)
st.markdown(""" """)
st.image("pages/NVEagle/image_4.jpg", use_container_width=True)
st.markdown(""" """)
st.info(translations[lang]["ressources"], icon="📚")
st.markdown(""" """)
st.markdown(""" """)
st.markdown(""" """)
col1, col2, col3 = st.columns(3)
with col1:
    if lang == "en":
        if st.button('Previous paper', use_container_width=True):
            switch_page("SAMv2")
    else:
        if st.button('Papier précédent', use_container_width=True):
            switch_page("SAMv2")
with col2:
    if lang == "en":
        if st.button("Home", use_container_width=True):
            switch_page("Home")
    else:
        if st.button("Accueil", use_container_width=True):
            switch_page("Home")
with col3:
    if lang == "en":
        if st.button("Next paper", use_container_width=True):
            switch_page("NVLM")
    else:
        if st.button("Papier suivant", use_container_width=True):
            switch_page("NVLM")