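# NOTE: the lines below look like file-viewer page residue captured with the
# source (presumably author, commit message, commit hash, viewer buttons and
# file size); they are kept verbatim as comments so the module still parses.
# Ilyas KHIAT
# more details content
# bc557f4
# raw
# history blame
# 7.99 kB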
import streamlit as st
import pymupdf as fitz
import pyperclip
from utils.audit.audit_doc import audit_descriptif_pdf,audit_text,audit_descriptif_word
import dotenv
from utils.audit.audit_audio import evaluate_audio_quality
from PIL import Image
from io import BytesIO
# Function to classify file type
def classify_file(file):
    """Map an uploaded file's MIME type to a coarse category.

    Args:
        file: Object exposing a ``type`` attribute that holds the MIME type
            string.

    Returns:
        ``"image"``, ``"pdf"``, ``"word"``, ``"audio"`` or ``"text"`` when the
        MIME type matches the corresponding rule, ``"unknown"`` otherwise.
    """
    mime = file.type
    # Ordered (matched, category) pairs; the first matching rule wins.
    rules = (
        (mime.startswith("image/"), "image"),
        (mime == "application/pdf", "pdf"),
        (
            mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "word",
        ),
        (mime.startswith("audio/"), "audio"),
        (mime.startswith("text/"), "text"),
    )
    return next((category for matched, category in rules if matched), "unknown")
#display content
def display_content_doc(content: dict):
    """Render the extracted content of a document, one page or all pages.

    Shows a page selector (0 selects every page) and a radio button listing
    the content kinds found on the first page, then renders the chosen kind:
    images in three columns, text in a text area, or links as a bullet list.

    Args:
        content: Mapping of ``"page_<i>"`` (0-based index) to a per-page dict.
            The keys read here are ``"images"`` (iterable of
            ``(bytes, width, height)`` triples), ``"texte"`` (joined as
            strings) and ``"liens"`` (dicts with ``"uri"`` and ``"page"``).
    """
    # Nothing to select or render; also avoids a KeyError on "page_0" below.
    if not content:
        st.info("Aucun contenu à afficher")
        return

    number_of_pages = len(content)
    st.info("si vous choisissez 0, vous verrez le contenu de toutes les pages")
    number = st.number_input(
        "Numéro de page",
        min_value=0,
        max_value=number_of_pages,
        value=0,
        key="number_page_content",
    )

    # 0 means all pages; otherwise the selector is 1-based while keys are 0-based.
    if number == 0:
        pages = list(content.values())
    else:
        pages = [content[f"page_{number-1}"]]

    option = st.radio(
        "Type de contenu",
        list(content["page_0"].keys()),
        index=0,
        horizontal=True,
    )

    if option == "images":
        images = [img for page in pages for img in page["images"]]
        columns = st.columns(3)
        # Distribute the images round-robin over the three columns.
        for i, (img_bytes, img_width, _img_height) in enumerate(images):
            columns[i % 3].image(
                Image.open(BytesIO(img_bytes)),
                caption=f'Image {i + 1}',
                width=img_width,
            )
    elif option == "texte":
        # With a single page the join returns that page's text unchanged.
        text = "-------------------\n".join([page["texte"] for page in pages])
        st.text_area("Texte", text, height=200)
    elif option == "liens":
        links = [link for page in pages for link in page["liens"]]
        for i, link in enumerate(links):
            st.markdown(f"- {i+1}: {link['uri']} (page {link['page']})")
def display_audit_pdf(uploaded_file):
    """Audit an uploaded PDF and display the global and per-page summaries.

    The audit result is cached in ``st.session_state.audit`` and recomputed
    only when the uploaded file name differs from
    ``st.session_state.name_file``.

    Args:
        uploaded_file: The uploaded PDF; its ``name`` attribute is used as the
            cache key and the object itself is passed to
            ``audit_descriptif_pdf``.
    """
    if st.session_state.name_file != uploaded_file.name:
        with st.spinner("Analyse du document..."):
            # NOTE(review): the meaning of the second argument (200) is not
            # visible from this file — confirm against audit_descriptif_pdf.
            st.session_state.audit = audit_descriptif_pdf(uploaded_file, 200)
        # Record the name only after the audit succeeded; otherwise a failed
        # audit would leave a stale result cached under the new file name.
        st.session_state.name_file = uploaded_file.name

    audit = st.session_state.audit["audit"]
    content = st.session_state.audit["content"]

    # Global audit
    audit_simplified = {
        "Nombre de pages": audit["number_of_pages"],
        "Nombre d'images": audit["number_of_images"],
        "Nombre de liens": audit["number_of_links"],
        "Nombre de tableaux": audit["number_of_tables"],
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
        "Mots clés": audit["key_words"],
    }
    well_formatted_audit = "Contenus audités\n" + "".join(
        f"- {key}: {value}\n" for key, value in audit_simplified.items()
    )
    st.code(well_formatted_audit)

    # Per-page audit. Skipped for a zero-page document: the page selector
    # below requires min_value (1) <= max_value (number of pages).
    if audit["number_of_pages"] >= 1:
        with st.expander("Audit par page"):
            number = st.number_input(
                "Numéro de page",
                min_value=1,
                max_value=audit["number_of_pages"],
                value=1,
                key="number_page_audit",
            )
            # The selector is 1-based while the audit keys are 0-based.
            page_audit = audit[f"page_{number-1}"]
            page_summary = {
                "Nombre d'images": page_audit["number_of_images"],
                "Nombre de liens": page_audit["number_of_links"],
                "Nombre de tableaux": page_audit["number_of_tables"],
                "Nombre de tokens": page_audit["number_of_tokens"],
                "Nombre de mots": page_audit["number_of_words"],
            }
            well_formatted_audit_page = "Audit descriptif\n" + "".join(
                f"- {key}: {value}\n" for key, value in page_summary.items()
            )
            st.code(well_formatted_audit_page)

    with st.expander("Cliquer ici pour voir le contenu du document"):
        display_content_doc(content)
def _format_audit(title, items):
    """Return ``title`` followed by one ``- key: value`` line per item."""
    return f"{title}\n" + "".join(
        f"- {key}: {value}\n" for key, value in items.items()
    )


def _cached_audit(uploaded_file, spinner_label, compute):
    """Return the session audit, recomputing it when the file name changed."""
    if st.session_state.name_file != uploaded_file.name:
        with st.spinner(spinner_label):
            st.session_state.audit = compute()
        # Record the name only after the audit succeeded; otherwise a failed
        # audit would leave a stale result cached under the new file name.
        st.session_state.name_file = uploaded_file.name
    return st.session_state.audit


def _display_audit_audio(uploaded_file):
    """Audit an uploaded audio file and show its summary and transcription."""
    audit = _cached_audit(
        uploaded_file,
        "Analyse de l'audio...",
        lambda: evaluate_audio_quality(uploaded_file),
    )
    audit_simplified = {
        "Volume": f"{audit['volume']:0.2f} dBFS",
        # Negative SNR values are displayed as 0.
        "SNR": f"{max(audit['SNR'],0):0.2f} dB",
        "Durée": f"{audit['duration']:0.2f} minutes",
        "Nombre de tokens": audit["number_of_tokens"],
    }
    st.code(_format_audit("Contenus audités", audit_simplified))
    with st.expander("Transcription"):
        st.write(audit["transcription"])


def _display_audit_text(uploaded_file):
    """Audit an uploaded plain-text file and show its token and word counts."""
    audit = _cached_audit(
        uploaded_file,
        "Analyse du texte...",
        # Decode only when the audit is recomputed. getvalue() does not
        # depend on the stream position, and errors="replace" keeps a
        # non-UTF-8 file from crashing the app.
        lambda: audit_text(
            uploaded_file.getvalue().decode("utf-8", errors="replace")
        ),
    )
    audit_simplified = {
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
    }
    st.code(_format_audit("Audit descriptif", audit_simplified))


def _display_audit_word(uploaded_file):
    """Audit an uploaded Word document and show its summary."""
    audit = _cached_audit(
        uploaded_file,
        "Analyse du document...",
        lambda: audit_descriptif_word(uploaded_file),
    )
    audit_simplified = {
        # The value is a paragraph count, so it is labelled as such
        # (it was previously shown as a page count).
        "Nombre de paragraphes": audit["number_of_paragraphs"],
        "Nombre d'images": audit["number_of_images"],
        "Nombre de liens": audit["number_of_links"],
        "Nombre de tableaux": audit["number_of_tables"],
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
    }
    st.code(_format_audit("Contenus audités", audit_simplified))


def main():
    """Run the Streamlit page: upload a document and display its audit.

    Loads environment variables, initialises the ``audit`` and ``name_file``
    session entries, then dispatches the uploaded file to the audit view
    matching its detected type (pdf, audio, text or word).
    """
    dotenv.load_dotenv()

    # Streamlit app
    st.title("AUDIT DES DOCUMENTS")
    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
    st.markdown(notice)

    # Session entries shared by every audit view.
    if "audit" not in st.session_state:
        st.session_state.audit = {}
    if "name_file" not in st.session_state:
        st.session_state.name_file = ""

    # File uploader
    uploaded_file = st.file_uploader("Télécharger un ou plusieurs documents")
    if uploaded_file is None:
        return

    # Named file_type rather than type so the builtin is not shadowed.
    file_type = classify_file(uploaded_file)
    st.write(f"Type de fichier: {file_type}")
    st.write("### Synthèse audit du ou des document(s) téléchargé(s)")

    # "image" and "unknown" files have no audit view: nothing more is shown.
    if file_type == "pdf":
        display_audit_pdf(uploaded_file)
    elif file_type == "audio":
        _display_audit_audio(uploaded_file)
    elif file_type == "text":
        _display_audit_text(uploaded_file)
    elif file_type == "word":
        _display_audit_word(uploaded_file)
# Entry point: build the page when this file is executed as a script.
if __name__ == "__main__":
    main()