# NOTE: stray non-code header text preserved as a comment: "Spaces: Running Running"
from io import BytesIO

import dotenv
import pymupdf as fitz
import pyperclip
import streamlit as st
from PIL import Image

from utils.audit.audit_audio import evaluate_audio_quality
from utils.audit.audit_doc import audit_descriptif_pdf, audit_descriptif_word, audit_text
def classify_file(file):
    """Classify an uploaded file into a coarse category from its MIME type.

    Args:
        file: Any object exposing a ``type`` attribute holding a MIME type
            string (``main`` passes the object returned by
            ``st.file_uploader``).

    Returns:
        One of ``"image"``, ``"pdf"``, ``"word"``, ``"audio"``, ``"text"`` or
        ``"unknown"``.
    """
    # A missing or empty MIME type must fall through to "unknown" instead of
    # raising AttributeError on ``None.startswith``; MIME types are
    # case-insensitive, so normalise before comparing.
    mime = (getattr(file, "type", None) or "").strip().lower()

    if mime.startswith("image/"):
        return "image"
    if mime == "application/pdf":
        return "pdf"
    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return "word"
    if mime.startswith("audio/"):
        return "audio"
    if mime.startswith("text/"):
        return "text"
    return "unknown"
def display_content_doc(content: dict):
    """Render the extracted content of a document, for one page or all pages.

    Args:
        content: Mapping of ``"page_<index>"`` (0-based) to a per-page dict.
            The keys of ``content["page_0"]`` are offered as content types;
            this function knows how to render ``"images"`` (an iterable of
            ``(bytes, width, height)`` tuples), ``"texte"`` (a string) and
            ``"liens"`` (an iterable of dicts with ``"uri"`` and ``"page"``).
    """
    # Without any page there is no "page_0" to read the content types from.
    if not content:
        st.warning("Aucun contenu à afficher.")
        return

    number_of_pages = len(content)
    st.info("si vous choisissez 0, vous verrez le contenu de toutes les pages")
    number = st.number_input(
        "Numéro de page",
        min_value=0,
        max_value=number_of_pages,
        value=0,
        key="number_page_content",
    )
    option = st.radio(
        "Type de contenu",
        list(content["page_0"].keys()),
        index=0,
        horizontal=True,
    )

    # 0 means "all pages"; otherwise the widget is 1-based while keys are 0-based.
    if number == 0:
        pages = list(content.values())
    else:
        pages = [content[f"page_{number - 1}"]]

    if option == "images":
        images = [img for page in pages for img in page["images"]]
        columns = st.columns(3)
        # Distribute the images round-robin over the three columns.
        for i, (img_bytes, img_width, _img_height) in enumerate(images):
            columns[i % 3].image(
                Image.open(BytesIO(img_bytes)),
                caption=f"Image {i + 1}",
                width=img_width,
            )
    elif option == "texte":
        text = "-------------------\n".join(page["texte"] for page in pages)
        st.text_area("Texte", text, height=200)
    elif option == "liens":
        links = [link for page in pages for link in page["liens"]]
        for i, link in enumerate(links):
            st.markdown(f"- {i+1}: {link['uri']} (page {link['page']})")
def display_audit_pdf(uploaded_file):
    """Audit an uploaded PDF and display the global, per-page and content views.

    The audit result is kept in ``st.session_state.audit`` and is recomputed
    only when the uploaded file's name differs from
    ``st.session_state.name_file``. Both session keys must already exist
    (``main`` initialises them).

    Args:
        uploaded_file: The uploaded PDF; its ``name`` attribute is used as the
            cache key and the object itself is handed to
            ``audit_descriptif_pdf``.
    """
    if st.session_state.name_file != uploaded_file.name:
        with st.spinner("Analyse du document..."):
            # NOTE(review): the meaning of the second argument (200) is defined
            # by audit_descriptif_pdf and is not visible here - confirm.
            st.session_state.audit = audit_descriptif_pdf(uploaded_file, 200)
        # Record the name only after the audit succeeded; otherwise a failed
        # audit would leave a stale result that the next rerun treats as valid.
        st.session_state.name_file = uploaded_file.name

    audit = st.session_state.audit["audit"]
    content = st.session_state.audit["content"]

    # Global audit
    audit_simplified = {
        "Nombre de pages": audit["number_of_pages"],
        "Nombre d'images": audit["number_of_images"],
        "Nombre de liens": audit["number_of_links"],
        "Nombre de tableaux": audit["number_of_tables"],
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
        "Mots clés": audit["key_words"],
    }
    well_formatted_audit = "Contenus audités\n" + "".join(
        f"- {key}: {value}\n" for key, value in audit_simplified.items()
    )
    st.code(well_formatted_audit)

    # Per-page audit. Skipped for a document without pages, because
    # number_input cannot be built with min_value=1 and max_value=0.
    if audit["number_of_pages"] > 0:
        with st.expander("Audit par page"):
            number = st.number_input(
                "Numéro de page",
                min_value=1,
                max_value=audit["number_of_pages"],
                value=1,
                key="number_page_audit",
            )
            # The widget is 1-based while the audit keys are 0-based.
            raw_page_audit = audit[f"page_{number - 1}"]
            audit_page = {
                "Nombre d'images": raw_page_audit["number_of_images"],
                "Nombre de liens": raw_page_audit["number_of_links"],
                "Nombre de tableaux": raw_page_audit["number_of_tables"],
                "Nombre de tokens": raw_page_audit["number_of_tokens"],
                "Nombre de mots": raw_page_audit["number_of_words"],
            }
            well_formatted_audit_page = "Audit descriptif\n" + "".join(
                f"- {key}: {value}\n" for key, value in audit_page.items()
            )
            st.code(well_formatted_audit_page)

    with st.expander("Cliquer ici pour voir le contenu du document"):
        display_content_doc(content)
def _format_audit(title: str, items: dict) -> str:
    """Return ``title`` on its own line followed by one ``- key: value`` line per item."""
    return f"{title}\n" + "".join(f"- {key}: {value}\n" for key, value in items.items())


def _cached_audit(uploaded_file, spinner_text: str, compute):
    """Return the session-cached audit, recomputing it when the file name changes.

    ``compute`` is a zero-argument callable producing the audit. The file name
    is recorded only after ``compute`` succeeded, so a failed audit is retried
    on the next rerun instead of exposing the previous file's result.
    """
    if st.session_state.name_file != uploaded_file.name:
        with st.spinner(spinner_text):
            st.session_state.audit = compute()
        st.session_state.name_file = uploaded_file.name
    return st.session_state.audit


def main():
    """Run the Streamlit page: upload one file, audit it and show a summary.

    The file is dispatched on the category returned by ``classify_file``:
    PDF, audio, plain text and Word documents each get their own audit;
    any other category gets a warning.
    """
    dotenv.load_dotenv()

    # Streamlit app
    st.title("AUDIT DES DOCUMENTS")
    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
    st.markdown(notice)

    # Session keys shared by every audit branch (and by display_audit_pdf).
    if "audit" not in st.session_state:
        st.session_state.audit = {}
    if "name_file" not in st.session_state:
        st.session_state.name_file = ""

    # File uploader
    uploaded_file = st.file_uploader("Télécharger un ou plusieurs documents")
    if uploaded_file is None:
        return

    # Named file_type rather than type to avoid shadowing the builtin.
    file_type = classify_file(uploaded_file)
    st.write(f"Type de fichier: {file_type}")
    st.write("### Synthèse audit du ou des document(s) téléchargé(s)")

    if file_type == "pdf":
        display_audit_pdf(uploaded_file)

    elif file_type == "audio":
        audit = _cached_audit(
            uploaded_file,
            "Analyse de l'audio...",
            lambda: evaluate_audio_quality(uploaded_file),
        )
        # Simplified global audit. A negative SNR is displayed as 0.
        # NOTE(review): the label says "minutes"; confirm the unit of
        # audit["duration"] against evaluate_audio_quality.
        audit_simplified = {
            "Volume": f"{audit['volume']:0.2f} dBFS",
            "SNR": f"{max(audit['SNR'], 0):0.2f} dB",
            "Durée": f"{audit['duration']:0.2f} minutes",
            "Nombre de tokens": audit["number_of_tokens"],
        }
        st.code(_format_audit("Contenus audités", audit_simplified))
        with st.expander("Transcription"):
            st.write(audit["transcription"])

    elif file_type == "text":
        # Decode only when the audit is actually recomputed. getvalue() does
        # not depend on the stream position, and undecodable bytes are
        # replaced rather than crashing the page on non-UTF-8 files.
        audit = _cached_audit(
            uploaded_file,
            "Analyse du texte...",
            lambda: audit_text(uploaded_file.getvalue().decode("utf-8", errors="replace")),
        )
        # Simplified global audit
        audit_simplified = {
            "Nombre de tokens": audit["number_of_tokens"],
            "Nombre de mots": audit["number_of_words"],
        }
        st.code(_format_audit("Audit descriptif", audit_simplified))

    elif file_type == "word":
        audit = _cached_audit(
            uploaded_file,
            "Analyse du document...",
            lambda: audit_descriptif_word(uploaded_file),
        )
        # Global audit. The value shown is the paragraph count, so the label
        # says paragraphs (it previously read "Nombre de pages").
        audit_simplified = {
            "Nombre de paragraphes": audit["number_of_paragraphs"],
            "Nombre d'images": audit["number_of_images"],
            "Nombre de liens": audit["number_of_links"],
            "Nombre de tableaux": audit["number_of_tables"],
            "Nombre de tokens": audit["number_of_tokens"],
            "Nombre de mots": audit["number_of_words"],
        }
        st.code(_format_audit("Contenus audités", audit_simplified))

    else:
        # "image" and "unknown" have no audit implemented; say so instead of
        # leaving an empty summary section.
        st.warning("Aucun audit n'est disponible pour ce type de fichier.")


if __name__ == "__main__":
    main()