import streamlit as st
from streamlit_extras.switch_page_button import switch_page


# User-facing page copy, keyed first by language code ('en' / 'fr') and then
# by section name. Both languages must expose the same section keys, because
# the rendering code below looks entries up as translations[lang][<section>].
#
# The values are Markdown (some embed raw HTML anchors / <br>). Two trailing
# spaces at the end of a line are intentional: they are Markdown hard line
# breaks, so do not let an editor strip trailing whitespace in this literal.
#
# NOTE: the section key 'ressources' keeps its historical spelling because it
# is referenced by the rendering code; only the displayed English heading is
# spelled "Resources".
translations = {
'en': {'title': 'NVEagle',
    'original_tweet':
       """
       [Original tweet](https://x.com/mervenoyann/status/1829144958101561681) (August 29, 2024)
       """,
    'tweet_1':
        """
        NVIDIA just dropped NVEagle 🦅  
        Super impressive vision language model that comes in 7B, 13B and 13B fine-tuned on chat, improved visual perception with MoE vision encoders 💬  
        Keep reading for details and links ⇓
        """,
    'tweet_2':
        """
        [Model repositories](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) | Try it [here](https://huggingface.co/spaces/NVEagle/Eagle-X5-13B-Chat) 💬 (works very well! 🤯)  
        """,
    'tweet_3':
        """
        This model essentially explores having different experts (MoE) and fusion strategies for image encoders.  
        I have been <a href='MiniGemini' target='_self'>talking</a> about how VLMs improve when using multiple encoders in parallel, so seeing this paper MoE made me happy! 🥲 
        """,
    'tweet_4':
        """
        How? 🧐  
        The authors concatenate the vision encoder output tokens together, and they apply "pre-alignment": essentially fine-tune experts with frozen text encoder.  
        Rest of the architecture is quite similar to <a href='LLaVA-NeXT' target='_self'>LlaVA</a>.
        """,
    'tweet_5':
        """
        Then they freeze both experts and the decoder and just train the projection layer, and finally, they unfreeze everything for supervised fine-tuning ✨  
        <br>
        They explore different fusion strategies and encoders, extending basic CLIP encoder, and find out that simply concatenating visual tokens works well 🥹  
        See below the performances of different experts ⇓⇓
        """,
    'ressources':
        """
        Resources:  
        [Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders](https://www.arxiv.org/abs/2408.15998) 
        by Min Shi, Fuxiao Liu, Shihao Wang, Shijia Liao, Subhashree Radhakrishnan, De-An Huang, Hongxu Yin, Karan Sapra, Yaser Yacoob, Humphrey Shi, Bryan Catanzaro, Andrew Tao, Jan Kautz, Zhiding Yu, Guilin Liu (2024)  
        [GitHub](https://github.com/NVlabs/Eagle)  
        [Models and Demos Collection](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26)
        """
      },
'fr': {
    'title': 'NVEagle',
    'original_tweet':
       """
       [Tweet de base](https://x.com/mervenoyann/status/1829144958101561681) (en anglais) (29 août 2024)
       """,
    'tweet_1':
        """
        NVIDIA vient de sortir NVEagle 🦅  
        Un modèle langage-vision très impressionnant disponible en taille 7B, 13B et 13B, finetuné sur des données de chat.  
        Il dispose d'une perception visuelle améliorée via un mélange d'experts (MoE) d'encodeurs de vision 💬  
        Continuez à lire pour plus de détails et des liens ⇓
        """,
    'tweet_2':
        """
        [Répertoire des modèles](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) | [Essayez-le ici](https://huggingface.co/spaces/NVEagle/Eagle-X5-13B-Chat) 💬 (fonctionne très bien ! 🤯)
        """,
    'tweet_3':
        """
        Ce modèle explore le fait d'avoir différents experts et des stratégies de fusion pour les encodeurs d'images.  
        J'ai <a href='MiniGemini' target='_self'>parlé</a> de la façon dont les VLM s'améliorent lors de l'utilisation de plusieurs encodeurs en parallèle. Ce papier m'a ainsi rendu heureuse ! 🥲
        """,
    'tweet_4':
        """
        Comment ? 🧐  
        Les auteurs concatènent les tokens de sortie de l'encodeur de vision ensemble, et ils appliquent un « pré-alignement » : ils finetunent les experts avec un encodeur de texte gelé. Le reste de l'architecture est assez similaire à <a href='LLaVA-NeXT' target='_self'>LlaVA</a>.
        """,
    'tweet_5':
        """
        Ensuite, ils gèlent les experts et le décodeur et entraînent simplement la couche de projection. Finalement, ils dégèlent le tout pour un finetuning supervisé ✨  
        <br>
        Ils explorent différentes stratégies de fusion et d'encodeurs, étendant l'encodeur CLIP de base, et découvrent que la simple concaténation de tokens visuels fonctionne bien 🥹  
        Voir ci-dessous les performances de différents experts ⇓⇓
        """,
    'ressources':
        """
        Ressources :  
        [Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders](https://www.arxiv.org/abs/2408.15998) 
        de Min Shi, Fuxiao Liu, Shihao Wang, Shijia Liao, Subhashree Radhakrishnan, De-An Huang, Hongxu Yin, Karan Sapra, Yaser Yacoob, Humphrey Shi, Bryan Catanzaro, Andrew Tao, Jan Kautz, Zhiding Yu, Guilin Liu (2024)  
        [GitHub](https://github.com/NVlabs/Eagle)  
        [Models and Demos Collection](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26)
        """
    }
}


def language_selector() -> str:
    """Render the language drop-down and return the chosen language code.

    The widget offers the options ``'EN'`` and ``'FR'``, displayed as flag
    emojis, and is registered under the widget key ``'lang_selector'``.

    Returns:
        ``'en'`` when ``'EN'`` is selected, ``'fr'`` otherwise.
    """
    # Option value -> text shown in the drop-down.
    languages = {'EN': '🇬🇧', 'FR': '🇫🇷'}
    selected_lang = st.selectbox(
        # Streamlit discourages empty labels (accessibility); give the widget
        # a real label and hide it instead. 'hidden' keeps the label spacer,
        # so the widget's vertical position should match the empty-label layout.
        'Language',
        options=list(languages.keys()),
        format_func=lambda x: languages[x],
        key='lang_selector',
        label_visibility='hidden',
    )
    return 'en' if selected_lang == 'EN' else 'fr'

# Header row: wide column for the title, narrow column for the language picker.
left_column, right_column = st.columns([5, 1])

# The picker is rendered first because its result decides which copy is shown.
with right_column:
    lang = language_selector()

# All text for the selected language.
page_text = translations[lang]

with left_column:
    st.title(page_text["title"])

st.success(page_text["original_tweet"], icon="ℹ️")
st.markdown(" ")

# Thread content in display order. A "text" entry names a section of the
# translations table; an "image" entry is the path of the picture to show.
# Every entry is followed by a one-space markdown element used as a spacer.
page_sequence = (
    ("text", "tweet_1"),
    ("image", "pages/NVEagle/image_1.jpg"),
    ("text", "tweet_2"),
    ("image", "pages/NVEagle/image_2.jpg"),
    ("text", "tweet_3"),
    ("text", "tweet_4"),
    ("image", "pages/NVEagle/image_3.jpg"),
    ("text", "tweet_5"),
    ("image", "pages/NVEagle/image_4.jpg"),
)
for entry_kind, entry_value in page_sequence:
    if entry_kind == "text":
        st.markdown(page_text[entry_value], unsafe_allow_html=True)
    else:
        st.image(entry_value, use_container_width=True)
    st.markdown(" ")

st.info(page_text["ressources"], icon="📚")

# Extra vertical space before the navigation row.
for _ in range(3):
    st.markdown(" ")

# Navigation row: (English label, French label, page to switch to),
# one button per column, left to right.
navigation = (
    ("Previous paper", "Papier précédent", "SAMv2"),
    ("Home", "Accueil", "Home"),
    ("Next paper", "Papier suivant", "NVLM"),
)
for nav_column, (label_en, label_fr, target_page) in zip(st.columns(3), navigation):
    with nav_column:
        button_label = label_en if lang == "en" else label_fr
        if st.button(button_label, use_container_width=True):
            switch_page(target_page)