Ilyas KHIAT committed f1342ba
Parent(s): c408d8a

multipage and UX ++
Browse files
- agents_page/catalogue.py +5 -0
- agents_page/recommended_agent.py +5 -0
- app.py +22 -199
- audit_page/audit.py +224 -0
- audit_page/knowledge_graph.py +10 -0
- chatbot_page/chatbot.py +3 -0
- doc_page/documentation.py +3 -0
- utils/audit/audit_audio.py +6 -2
- utils/audit/audit_doc.py +21 -1
agents_page/catalogue.py
ADDED
@@ -0,0 +1,5 @@
+import streamlit as st
+
+#st.set_page_config(page_title="Catalogue des agents (via bziiit.com)", page_icon="", layout="wide")
+
+st.title("Catalogue des agents (via bziiit.com)")
agents_page/recommended_agent.py
ADDED
@@ -0,0 +1,5 @@
+import streamlit as st
+
+#st.set_page_config(page_title="Agents recommandés", page_icon="", layout="wide")
+
+st.title("Agents recommandés")
app.py
CHANGED
@@ -1,210 +1,33 @@
 import streamlit as st
-import pymupdf as fitz
-import pyperclip
-from utils.audit.audit_doc import audit_descriptif_pdf,audit_text,audit_descriptif_word
 import dotenv
+import os
-
-from PIL import Image
-from io import BytesIO
 
-# Function to classify file type
-def classify_file(file):
-    if file.type.startswith("image/"):
-        return "image"
-    elif file.type == "application/pdf":
-        return "pdf"
-    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-        return "word"
-    elif file.type.startswith("audio/"):
-        return "audio"
-    elif file.type.startswith("text/"):
-        return "text"
-    else:
-        return "unknown"
-
-#display content
-def display_content_doc(content:dict):
-
-    number_of_pages = len(content)
-    st.info("si vous choisissez 0, vous verrez le contenu de toutes les pages")
-
-    number = st.number_input("Numéro de page", min_value=0, max_value=number_of_pages, value=0,key="number_page_content")
-    #0 means all pages
-    if number > 0:
-        page : dict = content[f"page_{number-1}"]
-    option = st.radio("Type de contenu",list(content[f"page_0"].keys()), index=0,horizontal=True)
-    if option == "images":
-        if number == 0:
-            images = [img for page in content.values() for img in page["images"]]
-        else:
-            images = page["images"]
-        col1,col2,col3 = st.columns(3)
-        for i, (img_bytes, img_width, img_height) in enumerate(images):
-            if i%3 == 0:
-                col1.image(Image.open(BytesIO(img_bytes)), caption=f'Image {i + 1}', width=img_width)
-            elif i%3 == 1:
-                col2.image(Image.open(BytesIO(img_bytes)), caption=f'Image {i + 1}', width=img_width)
-            else:
-                col3.image(Image.open(BytesIO(img_bytes)), caption=f'Image {i + 1}', width=img_width)
-
-    elif option == "texte":
-        if number == 0:
-            text = "-------------------\n".join([page["texte"] for page in content.values()])
-        else:
-            text = page["texte"]
-
-        st.text_area("Texte",text,height=200)
-
-    elif option == "liens":
-        if number == 0:
-            links = [link for page in content.values() for link in page["liens"]]
-        else:
-            links = page["liens"]
-        for i, link in enumerate(links):
-            st.markdown(f"- {i+1}: {link['uri']} (page {link['page']})")
-
-
-
-
-def display_audit_pdf(uploaded_file):
-    if st.session_state.name_file != uploaded_file.name:
-        st.session_state.name_file = uploaded_file.name
-        with st.spinner("Analyse du document..."):
-            st.session_state.audit = audit_descriptif_pdf(uploaded_file,200)
-    audit = st.session_state.audit["audit"]
-    content = st.session_state.audit["content"]
-    #global audit
-    audit_simplified = {
-        "Nombre de pages": audit["number_of_pages"],
-        "Nombre d'images": audit["number_of_images"],
-        "Nombre de liens": audit["number_of_links"],
-        "Nombre de tableaux": audit["number_of_tables"],
-        "Nombre de tokens": audit["number_of_tokens"],
-        "Nombre de mots": audit["number_of_words"],
-        "Mots clés": audit["key_words"]
-    }
-
-    well_formatted_audit = "Contenus audités\n"
-    for key, value in audit_simplified.items():
-        well_formatted_audit += f"- {key}: {value}\n"
-
-    st.code(well_formatted_audit)
-
-    #audit par page
-    with st.expander("Audit par page"):
-        number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1,key="number_page_audit")
-        audit_page = audit[f"page_{number-1}"]
-        audit_page = {
-            "Nombre d'images": audit_page["number_of_images"],
-            "Nombre de liens": audit_page["number_of_links"],
-            "Nombre de tableaux": audit_page["number_of_tables"],
-            "Nombre de tokens": audit_page["number_of_tokens"],
-            "Nombre de mots": audit_page["number_of_words"],
-        }
-        well_formatted_audit_page = "Audit descriptif\n"
-        for key, value in audit_page.items():
-            well_formatted_audit_page += f"- {key}: {value}\n"
-
-        st.code(well_formatted_audit_page)
-
-    with st.expander("Cliquer ici pour voir le contenu du document"):
-        display_content_doc(content)
 
 
 def main():
-    dotenv.load_dotenv()
-    # Streamlit app
-    st.title("AUDIT DES DOCUMENTS")
-
-    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
 
-    [removed lines 121-140 not captured in this view]
-        elif type == "audio":
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse de l'audio..."):
-                    st.session_state.audit = evaluate_audio_quality(uploaded_file)
-            audit = st.session_state.audit
-
-            #audit global simplifié
-            audit_simplified = {
-                "Volume": f"{audit['volume']:0.2f} dBFS",
-                "SNR": f"{max(audit['SNR'],0):0.2f} dB",
-                "Durée": f"{audit['duration']:0.2f} minutes",
-                "Nombre de tokens": audit["number_of_tokens"]
-            }
-
-            well_formatted_audit = "Contenus audités\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
-
-            st.code(well_formatted_audit)
-
-            with st.expander("Transcription"):
-                st.write(audit["transcription"])
-
-        elif type == "text":
-            text = uploaded_file.read().decode("utf-8")
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse du texte..."):
-                    st.session_state.audit = audit_text(text)
-            audit = st.session_state.audit
-
-            #audit global simplifié
-            audit_simplified = {
-                "Nombre de tokens": audit["number_of_tokens"],
-                "Nombre de mots": audit["number_of_words"]
-            }
-
-            well_formatted_audit = "Audit descriptif\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
-
-            st.code(well_formatted_audit)
-
-        elif type == "word":
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse du document..."):
-                    st.session_state.audit = audit_descriptif_word(uploaded_file)
-            audit = st.session_state.audit
-
-            #global audit
-            audit_simplified = {
-                "Nombre de pages": audit["number_of_paragraphs"],
-                "Nombre d'images": audit["number_of_images"],
-                "Nombre de liens": audit["number_of_links"],
-                "Nombre de tableaux": audit["number_of_tables"],
-                "Nombre de tokens": audit["number_of_tokens"],
-                "Nombre de mots": audit["number_of_words"]
-            }
-
-            well_formatted_audit = "Contenus audités\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
+    dotenv.load_dotenv(dotenv_path=os.path.join('.streamlit', '.env'))
+
+    st.set_page_config(page_title="RAG Agent", page_icon="🤖", layout="wide")
+
+    audit_page = st.Page("audit_page/audit.py", title="Audit", icon="📋", default=True)
+    kg_page = st.Page("audit_page/knowledge_graph.py", title="Graphe de connaissance", icon="🧠")
+    agents_page = st.Page("agents_page/catalogue.py", title="Catalogue des agents", icon="📇")
+    recommended_agents = st.Page("agents_page/recommended_agent.py", title="Agents recommandés", icon="⭐")
+    chatbot = st.Page("chatbot_page/chatbot.py", title="Chatbot", icon="💬")
+    documentation = st.Page("doc_page/documentation.py", title="Documentation", icon="📚")
+
+    pg = st.navigation(
+        {
+            "Audit de contenus": [audit_page, kg_page],
+            "Equipe d'agents IA": [agents_page, recommended_agents],
+            "Chatbot": [chatbot],
+            "Documentation": [documentation]
+        }
+    )
+
+    pg.run()
 
 
 if __name__ == "__main__":
-    main()
+    main()
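Note on the new entry point: app.py now only declares the multipage navigation with st.Page / st.navigation and delegates the audit UI to audit_page/audit.py. As a minimal sketch of the same pattern, here is how one further page could be registered; the extra_page/hello.py path, its title and icon are invented for illustration and are not part of this commit:

import streamlit as st

# Hypothetical extra page registered with the same API the commit uses.
hello = st.Page("extra_page/hello.py", title="Hello", icon="👋")

pg = st.navigation({
    "Demo": [hello],   # section label -> list of st.Page objects
})
pg.run()               # renders whichever page is selected, as app.py does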
audit_page/audit.py
ADDED
@@ -0,0 +1,224 @@
+import streamlit as st
+import pymupdf as fitz
+import pyperclip
+from utils.audit.audit_doc import audit_descriptif_pdf,audit_text,audit_descriptif_word
+import dotenv
+from utils.audit.audit_audio import evaluate_audio_quality
+from PIL import Image
+from io import BytesIO
+import os
+
+
+# Function to classify file type
+def classify_file(file):
+    if file.type.startswith("image/"):
+        return "image"
+    elif file.type == "application/pdf":
+        return "pdf"
+    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        return "word"
+    elif file.type.startswith("audio/"):
+        return "audio"
+    elif file.type.startswith("text/"):
+        return "text"
+    else:
+        return "unknown"
+
+#display content
+def display_content_doc(content:dict,col:st):
+
+    number_of_pages = len(content)
+    col.info("si vous choisissez 0, vous verrez le contenu de toutes les pages")
+
+    number = col.number_input("Numéro de page", min_value=0, max_value=number_of_pages, value=0,key="number_page_content")
+    #0 means all pages
+    if number > 0:
+        page : dict = content[f"page_{number-1}"]
+    option = col.radio("Type de contenu",list(content[f"page_0"].keys()), index=0,horizontal=True)
+    if option == "images":
+        if number == 0:
+            images = [img for page in content.values() for img in page["images"]]
+        else:
+            images = page["images"]
+        col1,col2,col3 = col.columns(3)
+        for i, (img_bytes, img_width, img_height) in enumerate(images):
+            if i%3 == 0:
+                col1.image(Image.open(BytesIO(img_bytes)), caption=f'', width=img_width)
+            elif i%3 == 1:
+                col2.image(Image.open(BytesIO(img_bytes)), caption=f'', width=img_width)
+            else:
+                col3.image(Image.open(BytesIO(img_bytes)), caption=f'', width=img_width)
+
+    elif option == "texte":
+        if number == 0:
+            text = "-------------------\n".join([page["texte"] for page in content.values()])
+        else:
+            text = page["texte"]
+
+        col.text_area("Texte",text,height=200)
+
+    elif option == "liens":
+        if number == 0:
+            links = [link for page in content.values() for link in page["liens"]]
+        else:
+            links = page["liens"]
+        for i, link in enumerate(links):
+            col.markdown(f"- {i+1}: {link['uri']} (page {link['page']})")
+
+
+
+
+def display_audit_pdf(uploaded_file,col:st):
+    if st.session_state.name_file != uploaded_file.name:
+        st.session_state.name_file = uploaded_file.name
+        with st.spinner("Analyse du document..."):
+            st.session_state.audit = audit_descriptif_pdf(uploaded_file,200)
+    audit = st.session_state.audit["audit"]
+    content = st.session_state.audit["content"]
+    #global audit
+    audit_simplified = {
+        "Nombre de pages": audit["number_of_pages"],
+        "Nombre d'images": audit["number_of_images"],
+        "Nombre de liens": audit["number_of_links"],
+        "Nombre de tableaux": audit["number_of_tables"],
+        "Nombre de tokens": audit["number_of_tokens"],
+        "Nombre de mots": audit["number_of_words"],
+        "Mots clés": audit["key_words"]
+    }
+
+    well_formatted_audit = "Contenus audités\n"
+    for key, value in audit_simplified.items():
+        well_formatted_audit += f"- {key}: {value}\n"
+
+
+    col.code(well_formatted_audit)
+
+    #audit par page
+    with col.expander("Audit par page"):
+        number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1,key="number_page_audit")
+        audit_page = audit[f"page_{number-1}"]
+        audit_page = {
+            "Nombre d'images": audit_page["number_of_images"],
+            "Nombre de liens": audit_page["number_of_links"],
+            "Nombre de tableaux": audit_page["number_of_tables"],
+            "Nombre de tokens": audit_page["number_of_tokens"],
+            "Nombre de mots": audit_page["number_of_words"],
+        }
+        well_formatted_audit_page = "Audit descriptif\n"
+        for key, value in audit_page.items():
+            well_formatted_audit_page += f"- {key}: {value}\n"
+
+        st.code(well_formatted_audit_page)
+
+    return content
+
+
+def audit_main():
+
+    #st.set_page_config(page_title="Audit des documents", page_icon=":page_with_curl:", layout="wide")
+    # Streamlit app
+    st.title("Audit des documents")
+
+    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
+
+    col1, col2 = st.columns([4, 3])
+    col1.markdown(notice)
+
+    if "audit" not in st.session_state:
+        st.session_state.audit = {}
+    if "name_file" not in st.session_state:
+        st.session_state.name_file = ""
+
+    # File uploader
+    uploaded_file = col1.file_uploader("Télécharger un ou plusieurs documents")
+
+    if uploaded_file is not None:
+        type = classify_file(uploaded_file)
+
+        col1.write(f"Type de fichier: {type}")
+
+        col1.write("### Synthèse audit du ou des document(s) téléchargé(s)")
+
+
+
+        if type == "pdf":
+            content = display_audit_pdf(uploaded_file,col1)
+            with col2.expander("Contenu"):
+                display_content_doc(content,st)
+
+        elif type == "audio":
+            if st.session_state.name_file != uploaded_file.name:
+                st.session_state.name_file = uploaded_file.name
+                with st.spinner("Analyse de l'audio..."):
+                    st.session_state.audit = evaluate_audio_quality(uploaded_file)
+            audit = st.session_state.audit
+
+            #audit global simplifié
+            audit_simplified = {
+                "Durée": f"{audit['duration']:0.2f} minutes",
+                "Nombre de mots": audit["number_of_words"],
+                "Nombre de tokens": audit["number_of_tokens"],
+                "Volume": f"{audit['volume']:0.2f} dBFS (déciBels Full Scale)",
+                "SNR": f"{max(audit['SNR'],0):0.2f} dB (Ratio Signal / Bruit)",
+            }
+
+            well_formatted_audit = "Contenus audités\n"
+            for key, value in audit_simplified.items():
+                well_formatted_audit += f"- {key}: {value}\n"
+
+            col1.code(well_formatted_audit)
+
+            with col2.expander("Transcription"):
+                st.write(audit["transcription"])
+                if st.button("📋",key="copy_transcription"):
+                    pyperclip.copy(audit["transcription"])
+                    st.success("Transcription copiée dans le presse-papier")
+
+        elif type == "text":
+            text = uploaded_file.read().decode("utf-8")
+            if st.session_state.name_file != uploaded_file.name:
+                st.session_state.name_file = uploaded_file.name
+                with st.spinner("Analyse du texte..."):
+                    st.session_state.audit = audit_text(text)
+            audit = st.session_state.audit
+
+            #audit global simplifié
+            audit_simplified = {
+                "Nombre de tokens": audit["number_of_tokens"],
+                "Nombre de mots": audit["number_of_words"]
+            }
+
+            well_formatted_audit = "Audit descriptif\n"
+            for key, value in audit_simplified.items():
+                well_formatted_audit += f"- {key}: {value}\n"
+
+            col1.code(well_formatted_audit)
+
+            with col2.expander("Texte"):
+                st.text_area("Texte",text,height=200)
+
+        elif type == "word":
+            if st.session_state.name_file != uploaded_file.name:
+                st.session_state.name_file = uploaded_file.name
+                with st.spinner("Analyse du document..."):
+                    st.session_state.audit = audit_descriptif_word(uploaded_file)
+            audit = st.session_state.audit
+
+            #global audit
+            audit_simplified = {
+                "Nombre de pages": audit["number_of_paragraphs"],
+                "Nombre d'images": audit["number_of_images"],
+                "Nombre de liens": audit["number_of_links"],
+                "Nombre de tableaux": audit["number_of_tables"],
+                "Nombre de tokens": audit["number_of_tokens"],
+                "Nombre de mots": audit["number_of_words"]
+            }
+
+            well_formatted_audit = "Contenus audités\n"
+            for key, value in audit_simplified.items():
+                well_formatted_audit += f"- {key}: {value}\n"
+
+            st.code(well_formatted_audit)
+
+
+audit_main()
audit_page/knowledge_graph.py
ADDED
@@ -0,0 +1,10 @@
+import streamlit as st
+
+
+def kg_main():
+    #st.set_page_config(page_title="Graphe de connaissance", page_icon="", layout="wide")
+
+    st.title("Graphe de connaissance")
+
+
+kg_main()
chatbot_page/chatbot.py
ADDED
@@ -0,0 +1,3 @@
+import streamlit as st
+
+st.title("Chatbot")
doc_page/documentation.py
ADDED
@@ -0,0 +1,3 @@
+import streamlit as st
+
+st.title("Documentation")
utils/audit/audit_audio.py
CHANGED
@@ -31,7 +31,11 @@ def calculate_snr(audio_data):
 
 # Function to evaluate audio quality
 def evaluate_audio_quality(file) -> dict:
-    [removed line 34 not captured in this view]
+    try:
+        audio = AudioSegment.from_file(file)
+    except:
+        audio = AudioSegment.from_file(io.BytesIO(file.read()))
+
     audio_data = np.array(audio.get_array_of_samples())
 
     #number of minutes
@@ -46,5 +50,5 @@ def evaluate_audio_quality(file) -> dict:
     #get the transcription of the audio
     transcription = transcript_audio_func(file)
 
-    return {"volume": volume, "SNR": snr,"transcription": transcription,"number_of_tokens": count_tokens(transcription),"duration": duration}
+    return {"volume": volume, "SNR": snr,"transcription": transcription,"number_of_tokens": count_tokens(transcription),"duration": duration, "number_of_words": len(transcription.split())}
 
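The audio change does two things: evaluate_audio_quality now retries AudioSegment.from_file with an in-memory io.BytesIO buffer when the uploaded object cannot be read directly, and the returned report gains a number_of_words entry computed from the transcription. A rough offline check, assuming a local sample.wav exists and that transcript_audio_func accepts the same file-like argument as AudioSegment:

from utils.audit.audit_audio import evaluate_audio_quality

# Hypothetical local run; audit_page/audit.py passes a Streamlit UploadedFile,
# but any readable binary file-like object should hit the same code path.
with open("sample.wav", "rb") as f:
    report = evaluate_audio_quality(f)

print(f"{report['duration']:.2f} min, {report['number_of_words']} words, "
      f"volume {report['volume']:.2f} dBFS, SNR {report['SNR']:.2f} dB")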
utils/audit/audit_doc.py
CHANGED
@@ -7,11 +7,23 @@ import io
 from rake_nltk import Rake
 import nltk
 from nltk.corpus import stopwords
+from openai import OpenAI
 
 # Download NLTK stopwords
 nltk.download('stopwords')
 nltk.download('punkt')
 
+#function to use gpt4o-mini
+def extract_relevant_keywords(prompt: str) -> str:
+    client = OpenAI()
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "user", "content": prompt}
+        ]
+    )
+    return response.choices[0].message.content
+
 
 def evaluate_text_quality(text: str) -> dict:
     # Calculate readability metrics
@@ -153,7 +165,15 @@ def audit_descriptif_pdf(file,max_img_width) -> dict:
     # Extract key words from the document
     text = " ".join([page["texte"] for page in doc_content.values()])
     key_words = extract_keywords(text)
-
+    list_key_words_text = "\n".join(key_words[:10])
+    prompt = f'''Voici une liste de mots et phrases provenant d'un document :
+    - {list_key_words_text}
+    Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.
+
+    REPONSE:
+    '''
+    key_words_extracted = extract_relevant_keywords(prompt)
+    audit_dict_doc["key_words"] = "\n" + key_words_extracted
 
     #merge 2 dicts
     global_audit = {
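The new extract_relevant_keywords helper builds the OpenAI client with no arguments, so it expects OPENAI_API_KEY to be available in the environment (app.py now loads .streamlit/.env through dotenv). A standalone sketch of calling it outside audit_descriptif_pdf; the candidate keywords below are invented, whereas in the commit they come from extract_keywords(text)[:10]:

from utils.audit.audit_doc import extract_relevant_keywords

# Hypothetical candidates standing in for the RAKE output used in the commit.
candidates = ["audit de contenus", "agents IA", "transcription audio"]
prompt = (
    "Voici une liste de mots et phrases provenant d'un document :\n- "
    + "\n- ".join(candidates)
    + "\nVeuillez extraire les cinq mots clés les plus pertinents de cette liste. "
    "Chaque mot clé doit contenir au maximum deux mots.\n\nREPONSE:\n"
)
print(extract_relevant_keywords(prompt))  # needs OPENAI_API_KEY set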
|