import streamlit as st
import pymupdf as fitz
import pyperclip
import clipboard
from utils.audit.audit_doc import audit_descriptif_pdf, audit_text
from utils.audit.rag import setup_rag
import dotenv
from utils.audit.audit_audio import evaluate_audio_quality
from PIL import Image
from io import BytesIO
import st_copy_to_clipboard
import os


# MIME types that identify a file kind by exact match.
_EXACT_MIME_KINDS = {
    "application/pdf": "pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "word",
}

# MIME type prefixes that identify a file kind.
_PREFIX_MIME_KINDS = (
    ("image/", "image"),
    ("audio/", "audio"),
    ("text/", "text"),
)

# Separator inserted between pages when the text of every page is shown.
_PAGE_SEPARATOR = "-------------------\n"


def classify_file(file):
    """Return the kind of an uploaded file based on its MIME type.

    Args:
        file: Object exposing a ``type`` attribute holding a MIME type string.

    Returns:
        One of ``"image"``, ``"pdf"``, ``"word"``, ``"audio"``, ``"text"`` or
        ``"unknown"``.
    """
    mime = file.type or ""  # tolerate a missing/empty MIME type
    if mime in _EXACT_MIME_KINDS:
        return _EXACT_MIME_KINDS[mime]
    for prefix, kind in _PREFIX_MIME_KINDS:
        if mime.startswith(prefix):
            return kind
    return "unknown"


def _pages_to_show(content: dict, number: int) -> list:
    """Return the page dicts selected by ``number`` (0 selects every page)."""
    if number == 0:
        return list(content.values())
    return [content[f"page_{number-1}"]]


def display_content_doc(content: dict, col):
    """Render the extracted content of a paginated document.

    Args:
        content: Mapping of ``"page_<index>"`` keys (0-based) to per-page
            dicts. The keys of ``page_0`` are offered as content types; the
            ones handled here are ``"images"``, ``"texte"``, ``"liens"`` and
            ``"tableaux"``.
        col: Streamlit container the widgets are created on (the ``st``
            module itself or a column/container object).
    """
    if not content:
        # Nothing was extracted: there is no "page_0" to read the types from.
        return

    number_of_pages = len(content)
    col.info("Note : Si vous choisissez 0, vous verrez le contenu de toutes les pages")
    # 0 means "all pages"; the widget is 1-based, the page keys are 0-based.
    number = col.number_input(
        "Numéro de page",
        min_value=0,
        max_value=number_of_pages,
        value=0,
        key="number_page_content",
    )
    pages = _pages_to_show(content, number)

    option = col.radio(
        "Type de contenu",
        list(content["page_0"].keys()),
        index=0,
        horizontal=True,
    )

    if option == "images":
        images = [img for page in pages for img in page["images"]]
        columns = col.columns(3)
        # Each image is a (bytes, width, height) triple; the height is unused.
        for i, (img_bytes, img_width, _img_height) in enumerate(images):
            try:
                columns[i % 3].image(
                    Image.open(BytesIO(img_bytes)),
                    caption="",
                    width=img_width,
                )
            except Exception:
                # Best effort: skip an image that cannot be decoded or shown.
                # Narrower than a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                continue
    elif option == "texte":
        text = _PAGE_SEPARATOR.join(page["texte"] for page in pages)
        col.code(text, language="text")
    elif option == "liens":
        links = [link for page in pages for link in page["liens"]]
        for i, link in enumerate(links):
            col.markdown(f"- {i+1}: [{link['uri']}]({link['uri']}) (page {link['page']})")
    elif option == "tableaux":
        tables = [table for page in pages for table in page["tableaux"]]
        for i, table in enumerate(tables):
            col.write(f"Tableau {i+1}")
            col.write(table)


def display_content_audio(content: dict, col):
    """Render the transcription, a copy button and an audio player.

    Args:
        content: Dict providing ``"transcription"``, ``"audio_data"`` and
            ``"frame_rate"``.
        col: Unused; everything is written through ``st`` so it lands in the
            container that is active when this function is called.
    """
    st.write("##### Transcription")
    st.write(content["transcription"])
    # ``st_copy_to_clipboard`` is imported as a module, so the component
    # function has to be reached through it; calling the module directly
    # raises "TypeError: 'module' object is not callable".
    st_copy_to_clipboard.st_copy_to_clipboard(content["transcription"])
    # NOTE(review): the player is given twice the stored frame rate; the
    # reason is not visible in this file - confirm against the audio audit.
    st.audio(content["audio_data"], sample_rate=content["frame_rate"] * 2)


def display_content_text(content, col):
    """Show the audited text in a text area.

    Args:
        content: Text to display.
        col: Unused; the widget is written through ``st``.
    """
    st.text_area("Texte", content, height=200)


def handle_display_content(col):
    """Render the audited content in a "Contenu" expander of ``col``.

    The renderer is selected from the file kind stored in
    ``st.session_state.audit_simplified``; other kinds render nothing.

    Args:
        col: Streamlit container that receives the expander.
    """
    audit = st.session_state.audit
    file_type = st.session_state.audit_simplified["type de fichier"]

    # The PDF renderer is handed ``st`` rather than ``col`` so that its
    # widgets are created inside the expander opened by the ``with`` block.
    renderers = {
        "pdf": (display_content_doc, st),
        "audio": (display_content_audio, col),
        "text": (display_content_text, col),
    }
    if file_type not in renderers:
        return
    render, target = renderers[file_type]
    with col.expander("Contenu"):
        render(audit["content"], target)


def _decode_text(raw: bytes) -> str:
    """Decode uploaded bytes as UTF-8, falling back to latin-1."""
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # latin-1 maps every byte, so this fallback cannot fail.
        return raw.decode("latin-1")


def _refresh_rag_state(kind: str):
    """Rebuild the vector store from the current audit and reset graph/cr."""
    with st.spinner("Préparation de la DB..."):
        vectorstore = setup_rag(kind, st.session_state.audit["content"])
        st.session_state.vectorstore = vectorstore
    st.session_state.graph = None
    st.session_state.cr = ""


def _analyse_upload(uploaded_file, kind: str):
    """Run the audit matching ``kind`` and store it in the session state."""
    if kind == "pdf":
        with st.spinner("Analyse du document..."):
            # Clear first so a failing audit does not leave the old result.
            st.session_state.audit = {}
            # NOTE(review): the meaning of the second argument (100) is
            # defined by audit_descriptif_pdf, not visible here.
            st.session_state.audit = audit_descriptif_pdf(uploaded_file, 100)
        _refresh_rag_state(kind)
    elif kind == "audio":
        with st.spinner("Analyse de l'audio..."):
            st.session_state.audit = {}
            st.session_state.audit = evaluate_audio_quality(uploaded_file)
        _refresh_rag_state(kind)
    elif kind == "text":
        text = _decode_text(uploaded_file.read())
        with st.spinner("Analyse du texte..."):
            st.session_state.audit = {}
            st.session_state.audit = audit_text(text)
        # NOTE(review): unlike pdf/audio, a text upload leaves vectorstore,
        # graph and cr untouched (possibly stale) - confirm this is intended.


def _summarize_pdf(kind: str, audit: dict) -> dict:
    """Build the simplified global audit of a PDF document."""
    return {
        "type de fichier": kind,
        "Nombre de pages": audit["number_of_pages"],
        "Nombre d'images": audit["number_of_images"],
        "Nombre de liens": audit["number_of_links"],
        "Nombre de tableaux": audit["number_of_tables"],
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
        "Mots clés": audit["key_words"],
    }


def _summarize_audio(kind: str, audit: dict) -> dict:
    """Build the simplified global audit of an audio file."""
    return {
        "type de fichier": kind,
        "Durée": f"{audit['duration']:0.2f} minutes",
        "Nombre de mots": audit["number_of_words"],
        "Nombre de tokens": audit["number_of_tokens"],
        "Volume": f"{audit['volume']:0.2f} dBFS (déciBels Full Scale)",
        # A negative SNR is displayed as 0.
        "SNR": f"{max(audit['SNR'],0):0.2f} dB (Ratio Signal / Bruit)",
    }


def _summarize_text(kind: str, audit: dict) -> dict:
    """Build the simplified global audit of a plain-text file."""
    return {
        "type de fichier": kind,
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
        "Mots clés": audit["key_words"],
    }


_SUMMARY_BUILDERS = {
    "pdf": _summarize_pdf,
    "audio": _summarize_audio,
    "text": _summarize_text,
}


def handle_audit(uploaded_file, type: str):
    """Audit an uploaded file and publish the result in the session state.

    The analysis only runs when the file name differs from
    ``st.session_state.name_file``. The simplified summary
    (``st.session_state.audit_simplified``) is rebuilt from the stored audit
    on every call. Kinds other than ``"pdf"``, ``"audio"`` and ``"text"``
    are ignored.

    Args:
        uploaded_file: Uploaded file object exposing ``name`` (and ``read()``
            for text files).
        type: File kind as returned by ``classify_file``. The name shadows
            the builtin but is kept for backward compatibility.

    Raises:
        Exception: Whatever the underlying audit or RAG setup raises; the
            remembered file name is restored first so the analysis is
            retried on the next run instead of being skipped.
    """
    if type not in _SUMMARY_BUILDERS:
        return

    if st.session_state.name_file != uploaded_file.name:
        previous_name = st.session_state.name_file
        st.session_state.name_file = uploaded_file.name
        try:
            _analyse_upload(uploaded_file, type)
        except Exception:
            st.session_state.name_file = previous_name
            raise

    audit = st.session_state.audit["audit"]
    st.session_state.audit_simplified = _SUMMARY_BUILDERS[type](type, audit)


def _format_bullets(title: str, items: dict) -> str:
    """Format ``items`` as a titled list with one ``- key: value`` per line."""
    lines = "".join(f"- {key}: {value}\n" for key, value in items.items())
    return f"{title}\n{lines}"


def display_audit(col):
    """Display the global audit summary and, for PDFs, a per-page audit.

    Args:
        col: Streamlit container that receives the summary and the expander.
    """
    audit_simplified = st.session_state.audit_simplified
    audit = st.session_state.audit["audit"]

    col.code(_format_bullets("Contenus audités", audit_simplified))

    # Only PDFs carry per-page figures.
    if audit_simplified.get("type de fichier") != "pdf":
        return

    with col.expander("Audit par page"):
        number = st.number_input(
            "Numéro de page",
            min_value=1,
            max_value=audit["number_of_pages"],
            value=1,
            key="number_page_audit",
        )
        # The widget is 1-based while the audit keys are 0-based.
        page_audit = audit[f"page_{number-1}"]
        page_summary = {
            "Nombre d'images": page_audit["number_of_images"],
            "Nombre de liens": page_audit["number_of_links"],
            "Nombre de tableaux": page_audit["number_of_tables"],
            "Nombre de tokens": page_audit["number_of_tokens"],
            "Nombre de mots": page_audit["number_of_words"],
        }
        st.code(_format_bullets("Audit descriptif", page_summary))


def audit_main():
    """Render the document audit page.

    Initialises the session keys used by the page, shows the uploader, runs
    the audit for an uploaded file and displays the stored audit, if any.
    """
    st.title("Audit des documents")
    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"

    col1, col2 = st.columns([4, 3])
    col1.markdown(notice)

    # Built inside the function so every session gets fresh mutable defaults.
    session_defaults = {
        "audit": {},
        "name_file": "",
        "audit_simplified": {},
        "vectorstore": None,
        "cr": "",
        "graph": None,
    }
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    uploaded_file = col1.file_uploader("Télécharger un ou plusieurs documents")

    if uploaded_file is not None:
        file_type = classify_file(uploaded_file)
        handle_audit(uploaded_file, file_type)

        col1.write(f"Type de fichier: {file_type}")
        col1.write("### Synthèse audit de(s) document(s) téléchargé(s)")

    # NOTE(review): shown whenever an audit is stored in the session, even
    # when no file is currently selected - confirm this is the intended flow.
    if "audit" in st.session_state and st.session_state.audit != {}:
        display_audit(col1)
        handle_display_content(col2)


audit_main()