# Ilyas KHIAT
# first push
# 56a3465
import streamlit as st
import pymupdf as fitz
import pyperclip
import clipboard
from utils.audit.audit_doc import audit_descriptif_pdf,audit_text
from utils.audit.rag import setup_rag
import dotenv
from utils.audit.audit_audio import evaluate_audio_quality
from PIL import Image
from io import BytesIO
import st_copy_to_clipboard
import os
# Function to classify file type
def classify_file(file):
    """Return a coarse category name for an uploaded file.

    The category is derived from ``file.type`` (a MIME type string) and is
    one of ``"image"``, ``"pdf"``, ``"word"``, ``"audio"``, ``"text"`` or
    ``"unknown"``.
    """
    mime_type = file.type

    # MIME types that must match exactly.
    exact_categories = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "word",
    }
    if mime_type in exact_categories:
        return exact_categories[mime_type]

    # MIME families recognised by their top-level type.
    prefix_categories = (
        ("image/", "image"),
        ("audio/", "audio"),
        ("text/", "text"),
    )
    for prefix, category in prefix_categories:
        if mime_type.startswith(prefix):
            return category

    return "unknown"
#display content
def display_content_doc(content:dict,col:st):
    """Render the extracted content of a paged document.

    The user picks a page number (0 meaning "all pages") and a content type;
    the matching images, text, links or tables are then displayed.

    Args:
        content: Mapping of ``"page_<i>"`` keys (0-based) to per-page dicts.
            The keys of ``content["page_0"]`` are offered as content types;
            ``"images"``, ``"texte"``, ``"liens"`` and ``"tableaux"`` are the
            ones rendered here. Images are ``(bytes, width, height)`` tuples
            and links are dicts with ``"uri"`` and ``"page"`` keys.
        col: Object exposing the Streamlit element API (a container, or the
            ``st`` module itself) on which everything is drawn.
    """
    if not content:
        # Without any page there is no "page_0" to read content types from.
        col.info("Aucun contenu à afficher")
        return

    number_of_pages = len(content)
    col.info("Note : Si vous choisissez 0, vous verrez le contenu de toutes les pages")
    number = col.number_input("Numéro de page", min_value=0, max_value=number_of_pages, value=0,key="number_page_content")

    # 0 means all pages; otherwise the widget is 1-based while keys are 0-based.
    # Working on a list of pages in both cases removes the conditionally
    # defined `page` variable of the previous implementation.
    if number == 0:
        pages = list(content.values())
    else:
        pages = [content[f"page_{number-1}"]]

    option = col.radio("Type de contenu", list(content["page_0"]), index=0, horizontal=True)

    if option == "images":
        images = [img for page in pages for img in page["images"]]
        columns = col.columns(3)
        # Images are distributed round-robin over three columns.
        for i, (img_bytes, img_width, _img_height) in enumerate(images):
            try:
                columns[i % 3].image(Image.open(BytesIO(img_bytes)), caption="", width=img_width)
            except Exception:
                # Best effort: skip images that cannot be decoded or drawn.
                # `Exception` (not a bare except) lets KeyboardInterrupt,
                # SystemExit and Streamlit's script-control exceptions
                # propagate instead of being silently swallowed.
                continue
    elif option == "texte":
        # Joining a single page yields that page's text unchanged.
        text = "-------------------\n".join(page["texte"] for page in pages)
        col.code(text, language="text")
    elif option == "liens":
        links = [link for page in pages for link in page["liens"]]
        for position, link in enumerate(links, start=1):
            col.markdown(f"- {position}: [{link['uri']}]({link['uri']}) (page {link['page']})")
    elif option == "tableaux":
        tables = [table for page in pages for table in page["tableaux"]]
        for position, table in enumerate(tables, start=1):
            col.write(f"Tableau {position}")
            col.write(table)
def display_content_audio(content:dict,col:st):
    """Show an audio transcription, a copy-to-clipboard button and a player.

    Args:
        content: Dict providing ``"transcription"``, ``"audio_data"`` and
            ``"frame_rate"``.
        col: Unused; output goes through ``st`` and therefore lands in
            whichever Streamlit container is active when this is called.
    """
    transcription = content["transcription"]

    st.write("##### Transcription")
    st.write(transcription)

    # The file imports `st_copy_to_clipboard` as a *module*; the widget is the
    # function of the same name inside it. Calling the module object itself
    # raises "TypeError: 'module' object is not callable".
    st_copy_to_clipboard.st_copy_to_clipboard(transcription)

    # NOTE(review): the frame rate is doubled before being passed on; the
    # reason is not visible here (presumably tied to how `audio_data` is
    # produced upstream) — confirm.
    st.audio(content["audio_data"], sample_rate=content["frame_rate"]*2)
def display_content_text(content,col:st):
    """Display the audited text inside a fixed-height text area.

    ``col`` is not used: the widget is created through ``st`` and so appears
    in the Streamlit container that is active at call time.
    """
    box_height = 200
    st.text_area("Texte", value=content, height=box_height)
def handle_display_content(col:st):
    """Show the audited content in a "Contenu" expander inside ``col``.

    The renderer is chosen from the file type stored in
    ``st.session_state.audit_simplified``; unsupported types draw nothing.
    """
    audit = st.session_state.audit
    file_kind = st.session_state.audit_simplified["type de fichier"]

    # Renderer and the drawing target it receives, per file type. The PDF
    # renderer is handed `st` so that it draws into the active expander.
    renderers = {
        "pdf": (display_content_doc, st),
        "audio": (display_content_audio, col),
        "text": (display_content_text, col),
    }
    if file_kind not in renderers:
        return

    render, target = renderers[file_kind]
    with col.expander("Contenu"):
        render(audit["content"], target)
def _prepare_rag_state(file_kind: str) -> None:
    """Index the freshly audited content and reset state tied to the previous file."""
    with st.spinner("Préparation de la DB..."):
        st.session_state.vectorstore = setup_rag(file_kind, st.session_state.audit["content"])
        st.session_state.graph = None
        st.session_state.cr = ""


def handle_audit(uploaded_file,type:str):
    """Audit an uploaded file and publish the results in the session state.

    A file is (re)analysed only when its name differs from
    ``st.session_state.name_file``. The full result goes to
    ``st.session_state.audit`` and a display-oriented summary to
    ``st.session_state.audit_simplified``. Types other than ``"pdf"``,
    ``"audio"`` and ``"text"`` are ignored.

    ``name_file`` is updated only after the whole analysis succeeded. It was
    previously set first, so a failing analysis left an empty audit that was
    never retried and later reruns crashed on ``audit["audit"]``.

    NOTE(review): unlike pdf/audio, the text branch does not rebuild
    ``vectorstore`` nor reset ``graph``/``cr`` — confirm this is intended.

    Args:
        uploaded_file: Uploaded file object exposing ``name`` (and ``read()``
            for text files).
        type: Category as returned by ``classify_file``.
    """
    is_new_file = st.session_state.name_file != uploaded_file.name

    if type == "pdf":
        if is_new_file:
            with st.spinner("Analyse du document..."):
                # Clear first so a failure does not leave the previous audit.
                st.session_state.audit = {}
                # The meaning of the second argument (100) is defined by
                # audit_descriptif_pdf, which is not visible in this file.
                st.session_state.audit = audit_descriptif_pdf(uploaded_file, 100)
            _prepare_rag_state(type)
            st.session_state.name_file = uploaded_file.name

        audit = st.session_state.audit["audit"]
        # Global simplified audit
        st.session_state.audit_simplified = {
            "type de fichier": type,
            "Nombre de pages": audit["number_of_pages"],
            "Nombre d'images": audit["number_of_images"],
            "Nombre de liens": audit["number_of_links"],
            "Nombre de tableaux": audit["number_of_tables"],
            "Nombre de tokens": audit["number_of_tokens"],
            "Nombre de mots": audit["number_of_words"],
            "Mots clés": audit["key_words"],
        }

    elif type == "audio":
        if is_new_file:
            with st.spinner("Analyse de l'audio..."):
                st.session_state.audit = {}
                st.session_state.audit = evaluate_audio_quality(uploaded_file)
            _prepare_rag_state(type)
            st.session_state.name_file = uploaded_file.name

        audit = st.session_state.audit["audit"]
        # Global simplified audit
        st.session_state.audit_simplified = {
            "type de fichier": type,
            "Durée": f"{audit['duration']:0.2f} minutes",
            "Nombre de mots": audit["number_of_words"],
            "Nombre de tokens": audit["number_of_tokens"],
            "Volume": f"{audit['volume']:0.2f} dBFS (déciBels Full Scale)",
            # Negative SNR values are displayed as 0.
            "SNR": f"{max(audit['SNR'],0):0.2f} dB (Ratio Signal / Bruit)",
        }

    elif type == "text":
        if is_new_file:
            # Read and decode only when the text is actually analysed, rather
            # than on every rerun.
            text = uploaded_file.read().decode("utf-8")
            with st.spinner("Analyse du texte..."):
                st.session_state.audit = {}
                st.session_state.audit = audit_text(text)
            st.session_state.name_file = uploaded_file.name

        audit = st.session_state.audit["audit"]
        # Global simplified audit
        st.session_state.audit_simplified = {
            "type de fichier": type,
            "Nombre de tokens": audit["number_of_tokens"],
            "Nombre de mots": audit["number_of_words"],
            "Mots clés": audit["key_words"],
        }
def display_audit(col:st):
    """Print the audit summary in ``col`` and, for PDFs, a per-page audit.

    Reads ``st.session_state.audit_simplified`` for the global summary and
    ``st.session_state.audit["audit"]`` for the per-page figures.
    """
    summary = st.session_state.audit_simplified
    audit = st.session_state.audit["audit"]

    # Global summary, one "- key: value" line per entry.
    summary_lines = [f"- {label}: {value}\n" for label, value in summary.items()]
    col.code("Contenus audités\n" + "".join(summary_lines))

    # Only PDFs are paged; every other type stops here.
    if summary["type de fichier"] != "pdf":
        return

    with col.expander("Audit par page"):
        page_number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1,key="number_page_audit")
        # The widget is 1-based, the per-page keys are 0-based.
        raw_page_audit = audit[f"page_{page_number-1}"]
        page_summary = {
            "Nombre d'images": raw_page_audit["number_of_images"],
            "Nombre de liens": raw_page_audit["number_of_links"],
            "Nombre de tableaux": raw_page_audit["number_of_tables"],
            "Nombre de tokens": raw_page_audit["number_of_tokens"],
            "Nombre de mots": raw_page_audit["number_of_words"],
        }
        page_lines = [f"- {label}: {value}\n" for label, value in page_summary.items()]
        st.code("Audit descriptif\n" + "".join(page_lines))
def audit_main():
    """Build the "Audit des documents" page.

    Lays out two columns, seeds the session state, lets the user upload a
    file and, once a file is present, runs the audit and shows its summary
    (left column) and content (right column).
    """
    #st.set_page_config(page_title="Audit des documents", page_icon=":page_with_curl:", layout="wide")
    st.title("Audit des documents")

    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
    col1, col2 = st.columns([4, 3])
    col1.markdown(notice)

    # Seed every session key this page relies on, without overwriting
    # values left by a previous run.
    session_defaults = {
        "audit": {},
        "name_file": "",
        "audit_simplified": {},
        "vectorstore": None,
        "cr": "",
        "graph": None,
    }
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    # File uploader
    uploaded_file = col1.file_uploader("Télécharger un ou plusieurs documents")
    if uploaded_file is None:
        return

    file_kind = classify_file(uploaded_file)
    handle_audit(uploaded_file, file_kind)

    col1.write(f"Type de fichier: {file_kind}")
    col1.write("### Synthèse audit de(s) document(s) téléchargé(s)")

    # "audit" is always present after the seeding above; only its emptiness matters.
    if st.session_state.audit != {}:
        display_audit(col1)
        handle_display_content(col2)
#init graph and cr
# Module-level entry point: the page is built whenever this script is executed.
audit_main()