Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pymupdf as fitz | |
import pyperclip | |
import clipboard | |
from utils.audit.audit_doc import audit_descriptif_pdf,audit_text | |
from utils.audit.rag import setup_rag | |
import dotenv | |
from utils.audit.audit_audio import evaluate_audio_quality | |
from PIL import Image | |
from io import BytesIO | |
import st_copy_to_clipboard | |
import os | |
def classify_file(file):
    """Map an uploaded file's MIME type to a coarse category.

    Args:
        file: Any object exposing a ``type`` attribute that holds a MIME
            type string.

    Returns:
        One of ``"image"``, ``"pdf"``, ``"word"``, ``"audio"``, ``"text"``,
        or ``"unknown"`` when the MIME type matches none of them.
    """
    mime = file.type

    # MIME types that must match in full.
    exact_categories = {
        "application/pdf": "pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "word",
    }
    if mime in exact_categories:
        return exact_categories[mime]

    # MIME families recognised by their prefix only.
    prefix_categories = (
        ("image/", "image"),
        ("audio/", "audio"),
        ("text/", "text"),
    )
    for prefix, category in prefix_categories:
        if mime.startswith(prefix):
            return category

    return "unknown"
def display_content_doc(content: dict, col):
    """Render the extracted content of a paginated document.

    Draws a page selector and a content-type selector on ``col``, then shows
    the images, text, links or tables of the chosen page, or of every page
    when page ``0`` is selected.

    Args:
        content: Mapping of ``"page_<index>"`` (0-based index) to a per-page
            dict. The keys of ``content["page_0"]`` become the selectable
            content types; the branches below handle ``"images"`` (tuples of
            ``(bytes, width, height)``), ``"texte"`` (a string), ``"liens"``
            (dicts with ``"uri"`` and ``"page"``) and ``"tableaux"``.
        col: Object exposing the Streamlit drawing API (``info``,
            ``number_input``, ``radio``, ``columns``, ``code``, ``markdown``,
            ``write``). Left unannotated on purpose: the caller in this file
            passes the ``streamlit`` module itself, not a column object.
    """
    if not content:
        # Without any page there is no "page_0" to derive the content types
        # from; bail out instead of raising KeyError.
        col.info("Aucun contenu à afficher")
        return

    number_of_pages = len(content)
    col.info("Note : Si vous choisissez 0, vous verrez le contenu de toutes les pages")
    number = col.number_input(
        "Numéro de page",
        min_value=0,
        max_value=number_of_pages,
        value=0,
        key="number_page_content",
    )

    # 0 means "all pages"; any other value is a 1-based page number.
    if number > 0:
        pages = [content[f"page_{number - 1}"]]
    else:
        pages = list(content.values())

    option = col.radio(
        "Type de contenu",
        list(content["page_0"].keys()),
        index=0,
        horizontal=True,
    )

    if option == "images":
        images = [img for page in pages for img in page["images"]]
        # Images are dealt round-robin into three side-by-side columns.
        columns = col.columns(3)
        for i, (img_bytes, img_width, _img_height) in enumerate(images):
            try:
                columns[i % 3].image(
                    Image.open(BytesIO(img_bytes)), caption="", width=img_width
                )
            except Exception:
                # Best effort: skip images that cannot be decoded or drawn.
                # Catching Exception rather than using a bare ``except`` lets
                # BaseException-derived signals (KeyboardInterrupt, SystemExit
                # and, presumably, Streamlit's own script-control exceptions)
                # propagate instead of being silently swallowed.
                continue
    elif option == "texte":
        # With a single page the join simply yields that page's text.
        text = "-------------------\n".join(page["texte"] for page in pages)
        col.code(text, language="text")
    elif option == "liens":
        links = [link for page in pages for link in page["liens"]]
        for position, link in enumerate(links, start=1):
            col.markdown(
                f"- {position}: [{link['uri']}]({link['uri']}) (page {link['page']})"
            )
    elif option == "tableaux":
        tables = [table for page in pages for table in page["tableaux"]]
        for position, table in enumerate(tables, start=1):
            col.write(f"Tableau {position}")
            col.write(table)
def display_content_audio(content: dict, col: st):
    """Show an audio transcription, a copy-to-clipboard button and a player.

    Everything is drawn through the global ``st`` API rather than ``col``;
    the caller in this file invokes the function inside a
    ``with col.expander(...)`` block.

    Args:
        content: Dict providing ``"transcription"``, ``"audio_data"`` and
            ``"frame_rate"``.
        col: Accepted for signature parity with the other ``display_content_*``
            helpers; not used by this function.
    """
    transcription = content["transcription"]

    st.write("##### Transcription")
    st.write(transcription)

    # The file does ``import st_copy_to_clipboard``, which binds the *module*.
    # Calling the module object raises TypeError; the widget is the function
    # of the same name that lives inside it.
    st_copy_to_clipboard.st_copy_to_clipboard(transcription)

    # NOTE(review): the sample rate is doubled relative to ``frame_rate``;
    # the reason is not visible here (possibly related to how the audio
    # samples were produced upstream) — confirm against the audio audit code.
    st.audio(content["audio_data"], sample_rate=content["frame_rate"] * 2)
def display_content_text(content, col: st):
    """Show ``content`` in a 200-pixel-high text area labelled "Texte".

    The widget is created through the global ``st`` API; ``col`` is accepted
    for signature parity with the other ``display_content_*`` helpers and is
    not used.
    """
    st.text_area(label="Texte", value=content, height=200)
def handle_display_content(col: st):
    """Show the audited file's content inside a "Contenu" expander on ``col``.

    The renderer is picked from the file type stored in
    ``st.session_state.audit_simplified``. File types without a renderer
    draw nothing.
    """
    audit = st.session_state.audit
    file_type = st.session_state.audit_simplified["type de fichier"]

    # Renderer and the drawing target handed to it. The document renderer
    # receives the ``st`` module so that its widgets are drawn in the active
    # ``with`` container; the other two receive ``col``.
    renderers = {
        "pdf": (display_content_doc, st),
        "audio": (display_content_audio, col),
        "text": (display_content_text, col),
    }
    if file_type not in renderers:
        return

    render, target = renderers[file_type]
    with col.expander("Contenu"):
        render(audit["content"], target)
def _build_audit_summary(type: str, audit: dict) -> dict:
    """Flatten raw audit metrics into the labelled summary shown in the UI."""
    if type == "pdf":
        return {
            "type de fichier": type,
            "Nombre de pages": audit["number_of_pages"],
            "Nombre d'images": audit["number_of_images"],
            "Nombre de liens": audit["number_of_links"],
            "Nombre de tableaux": audit["number_of_tables"],
            "Nombre de tokens": audit["number_of_tokens"],
            "Nombre de mots": audit["number_of_words"],
            "Mots clés": audit["key_words"],
        }
    if type == "audio":
        return {
            "type de fichier": type,
            "Durée": f"{audit['duration']:0.2f} minutes",
            "Nombre de mots": audit["number_of_words"],
            "Nombre de tokens": audit["number_of_tokens"],
            "Volume": f"{audit['volume']:0.2f} dBFS (déciBels Full Scale)",
            # Negative SNR values are displayed as 0.
            "SNR": f"{max(audit['SNR'], 0):0.2f} dB (Ratio Signal / Bruit)",
        }
    # Remaining supported type: "text".
    return {
        "type de fichier": type,
        "Nombre de tokens": audit["number_of_tokens"],
        "Nombre de mots": audit["number_of_words"],
        "Mots clés": audit["key_words"],
    }


def _run_audit(uploaded_file, type: str) -> None:
    """Audit ``uploaded_file`` and store the results in the session state."""
    state = st.session_state
    # Drop the previous result first so a failure cannot leave the old
    # file's audit paired with the new file.
    state.audit = {}

    if type == "text":
        with st.spinner("Analyse du texte..."):
            # ``getvalue()`` does not depend on the stream position, so a
            # retry after a failed run still sees the whole file. Undecodable
            # bytes are replaced instead of aborting the page on files that
            # are not valid UTF-8.
            text = uploaded_file.getvalue().decode("utf-8", errors="replace")
            state.audit = audit_text(text)
        # NOTE(review): unlike pdf/audio, the text path neither rebuilds the
        # vector store nor resets ``graph``/``cr``, so values from a previous
        # file stay in the session — confirm whether that is intended.
        return

    if type == "pdf":
        with st.spinner("Analyse du document..."):
            # NOTE(review): the meaning of the literal 100 is defined by
            # ``audit_descriptif_pdf`` and is not visible from this file.
            state.audit = audit_descriptif_pdf(uploaded_file, 100)
    else:
        with st.spinner("Analyse de l'audio..."):
            state.audit = evaluate_audio_quality(uploaded_file)

    with st.spinner("Préparation de la DB..."):
        state.vectorstore = setup_rag(type, state.audit["content"])
    # Outputs derived from the previous file are no longer valid.
    state.graph = None
    state.cr = ""


def handle_audit(uploaded_file, type: str):
    """Audit an uploaded file once and publish its summary in session state.

    The audit runs when the file name differs from the last audited one, or
    when no audit is stored. On every call the labelled summary is rebuilt
    from the stored audit and written to
    ``st.session_state.audit_simplified``.

    Args:
        uploaded_file: The uploaded file; must expose ``name`` and, for text
            files, ``getvalue()``.
        type: File category; only ``"pdf"``, ``"audio"`` and ``"text"`` are
            audited, any other value is a no-op. (The name shadows the
            builtin but is kept for caller compatibility.)
    """
    if type not in ("pdf", "audio", "text"):
        return

    state = st.session_state
    if state.name_file != uploaded_file.name or not state.audit:
        _run_audit(uploaded_file, type)
        # Record the name only after a successful audit: if the audit raises,
        # the next rerun retries instead of failing on the empty audit.
        state.name_file = uploaded_file.name

    state.audit_simplified = _build_audit_summary(type, state.audit["audit"])
def display_audit(col: st):
    """Show the audit summary on ``col`` and, for PDFs, a per-page audit.

    Reads the labelled summary from ``st.session_state.audit_simplified`` and
    the raw metrics from ``st.session_state.audit["audit"]``.
    """
    summary = st.session_state.audit_simplified
    audit = st.session_state.audit["audit"]

    # Global summary, one "- label: value" line per entry.
    summary_lines = [f"- {label}: {value}\n" for label, value in summary.items()]
    col.code("Contenus audités\n" + "".join(summary_lines))

    # Only PDFs (i.e. a file type that has pages) get the per-page section.
    if summary["type de fichier"] != "pdf":
        return

    # Display label -> key in the raw per-page audit, in display order.
    page_fields = (
        ("Nombre d'images", "number_of_images"),
        ("Nombre de liens", "number_of_links"),
        ("Nombre de tableaux", "number_of_tables"),
        ("Nombre de tokens", "number_of_tokens"),
        ("Nombre de mots", "number_of_words"),
    )
    with col.expander("Audit par page"):
        number = st.number_input(
            "Numéro de page",
            min_value=1,
            max_value=audit["number_of_pages"],
            value=1,
            key="number_page_audit",
        )
        # The selector is 1-based, the stored page keys are 0-based.
        raw_page = audit[f"page_{number-1}"]
        page_summary = {label: raw_page[field] for label, field in page_fields}
        page_lines = [f"- {label}: {value}\n" for label, value in page_summary.items()]
        st.code("Audit descriptif\n" + "".join(page_lines))
def audit_main():
    """Render the document-audit page.

    Draws the title and the list of accepted formats, makes sure the session
    state keys used by this page exist, then audits the uploaded file and
    shows its summary (left column) and content (right column).
    """
    # NOTE(review): ``st.set_page_config`` was commented out in the original
    # and is still not called here — presumably the hosting app sets it;
    # verify.
    st.title("Audit des documents")
    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
    col1, col2 = st.columns([4, 3])
    col1.markdown(notice)

    # Create each session key on first run only. The dict literal is rebuilt
    # on every call, so sessions never share the mutable defaults.
    session_defaults = {
        "audit": {},
        "name_file": "",
        "audit_simplified": {},
        "vectorstore": None,
        "cr": "",
        "graph": None,
    }
    for key, default in session_defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default

    # NOTE(review): the label mentions several documents, but the uploader is
    # created without ``accept_multiple_files`` and the code below handles a
    # single file.
    uploaded_file = col1.file_uploader("Télécharger un ou plusieurs documents")
    if uploaded_file is None:
        return

    file_type = classify_file(uploaded_file)
    handle_audit(uploaded_file, file_type)
    col1.write(f"Type de fichier: {file_type}")
    col1.write("### Synthèse audit de(s) document(s) téléchargé(s)")

    # ``handle_audit`` ignores unsupported types (word, image, unknown) and
    # leaves the previous file's audit in the session. Only display the audit
    # when it belongs to the file currently uploaded.
    audit_is_current = st.session_state.name_file == uploaded_file.name
    if st.session_state.audit and audit_is_current:
        display_audit(col1)
        handle_display_content(col2)
# Script entry point: render the audit page each time this module is executed.
audit_main()