Ilyas KHIAT committed f1342ba
Parent(s): c408d8a

multipage and UX ++
Browse files
- agents_page/catalogue.py +5 -0
- agents_page/recommended_agent.py +5 -0
- app.py +22 -199
- audit_page/audit.py +224 -0
- audit_page/knowledge_graph.py +10 -0
- chatbot_page/chatbot.py +3 -0
- doc_page/documentation.py +3 -0
- utils/audit/audit_audio.py +6 -2
- utils/audit/audit_doc.py +21 -1
agents_page/catalogue.py
ADDED
@@ -0,0 +1,5 @@
+import streamlit as st
+
+#st.set_page_config(page_title="Catalogue des agents (via bziiit.com)", page_icon="", layout="wide")
+
+st.title("Catalogue des agents (via bziiit.com)")
agents_page/recommended_agent.py
ADDED
@@ -0,0 +1,5 @@
+import streamlit as st
+
+#st.set_page_config(page_title="Agents recommandés", page_icon="", layout="wide")
+
+st.title("Agents recommandés")
app.py
CHANGED
@@ -1,210 +1,33 @@
 import streamlit as st
-import pymupdf as fitz
-import pyperclip
-from utils.audit.audit_doc import audit_descriptif_pdf,audit_text,audit_descriptif_word
 import dotenv
+import os
-
-from PIL import Image
-from io import BytesIO
 
-# Function to classify file type
-def classify_file(file):
-    if file.type.startswith("image/"):
-        return "image"
-    elif file.type == "application/pdf":
-        return "pdf"
-    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
-        return "word"
-    elif file.type.startswith("audio/"):
-        return "audio"
-    elif file.type.startswith("text/"):
-        return "text"
-    else:
-        return "unknown"
-
-#display content
-def display_content_doc(content:dict):
-
-    number_of_pages = len(content)
-    st.info("si vous choisissez 0, vous verrez le contenu de toutes les pages")
-
-    number = st.number_input("Numéro de page", min_value=0, max_value=number_of_pages, value=0,key="number_page_content")
-    #0 means all pages
-    if number > 0:
-        page : dict = content[f"page_{number-1}"]
-    option = st.radio("Type de contenu",list(content[f"page_0"].keys()), index=0,horizontal=True)
-    if option == "images":
-        if number == 0:
-            images = [img for page in content.values() for img in page["images"]]
-        else:
-            images = page["images"]
-        col1,col2,col3 = st.columns(3)
-        for i, (img_bytes, img_width, img_height) in enumerate(images):
-            if i%3 == 0:
-                col1.image(Image.open(BytesIO(img_bytes)), caption=f'Image {i + 1}', width=img_width)
-            elif i%3 == 1:
-                col2.image(Image.open(BytesIO(img_bytes)), caption=f'Image {i + 1}', width=img_width)
-            else:
-                col3.image(Image.open(BytesIO(img_bytes)), caption=f'Image {i + 1}', width=img_width)
-
-    elif option == "texte":
-        if number == 0:
-            text = "-------------------\n".join([page["texte"] for page in content.values()])
-        else:
-            text = page["texte"]
-
-        st.text_area("Texte",text,height=200)
-
-    elif option == "liens":
-        if number == 0:
-            links = [link for page in content.values() for link in page["liens"]]
-        else:
-            links = page["liens"]
-        for i, link in enumerate(links):
-            st.markdown(f"- {i+1}: {link['uri']} (page {link['page']})")
-
-
-
-
-def display_audit_pdf(uploaded_file):
-    if st.session_state.name_file != uploaded_file.name:
-        st.session_state.name_file = uploaded_file.name
-        with st.spinner("Analyse du document..."):
-            st.session_state.audit = audit_descriptif_pdf(uploaded_file,200)
-    audit = st.session_state.audit["audit"]
-    content = st.session_state.audit["content"]
-    #global audit
-    audit_simplified = {
-        "Nombre de pages": audit["number_of_pages"],
-        "Nombre d'images": audit["number_of_images"],
-        "Nombre de liens": audit["number_of_links"],
-        "Nombre de tableaux": audit["number_of_tables"],
-        "Nombre de tokens": audit["number_of_tokens"],
-        "Nombre de mots": audit["number_of_words"],
-        "Mots clés": audit["key_words"]
-    }
-
-    well_formatted_audit = "Contenus audités\n"
-    for key, value in audit_simplified.items():
-        well_formatted_audit += f"- {key}: {value}\n"
-
-    st.code(well_formatted_audit)
-
-    #audit par page
-    with st.expander("Audit par page"):
-        number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1,key="number_page_audit")
-        audit_page = audit[f"page_{number-1}"]
-        audit_page = {
-            "Nombre d'images": audit_page["number_of_images"],
-            "Nombre de liens": audit_page["number_of_links"],
-            "Nombre de tableaux": audit_page["number_of_tables"],
-            "Nombre de tokens": audit_page["number_of_tokens"],
-            "Nombre de mots": audit_page["number_of_words"],
-        }
-        well_formatted_audit_page = "Audit descriptif\n"
-        for key, value in audit_page.items():
-            well_formatted_audit_page += f"- {key}: {value}\n"
-
-        st.code(well_formatted_audit_page)
-
-    with st.expander("Cliquer ici pour voir le contenu du document"):
-        display_content_doc(content)
 
 
 def main():
-    dotenv.load_dotenv()
-    # Streamlit app
-    st.title("AUDIT DES DOCUMENTS")
-
-    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
 
-    [removed lines 121-140 not captured in this view]
-        elif type == "audio":
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse de l'audio..."):
-                    st.session_state.audit = evaluate_audio_quality(uploaded_file)
-            audit = st.session_state.audit
-
-            #audit global simplifié
-            audit_simplified = {
-                "Volume": f"{audit['volume']:0.2f} dBFS",
-                "SNR": f"{max(audit['SNR'],0):0.2f} dB",
-                "Durée": f"{audit['duration']:0.2f} minutes",
-                "Nombre de tokens": audit["number_of_tokens"]
-            }
-
-            well_formatted_audit = "Contenus audités\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
-
-            st.code(well_formatted_audit)
-
-            with st.expander("Transcription"):
-                st.write(audit["transcription"])
-
-        elif type == "text":
-            text = uploaded_file.read().decode("utf-8")
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse du texte..."):
-                    st.session_state.audit = audit_text(text)
-            audit = st.session_state.audit
-
-            #audit global simplifié
-            audit_simplified = {
-                "Nombre de tokens": audit["number_of_tokens"],
-                "Nombre de mots": audit["number_of_words"]
-            }
-
-            well_formatted_audit = "Audit descriptif\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
-
-            st.code(well_formatted_audit)
-
-        elif type == "word":
-            if st.session_state.name_file != uploaded_file.name:
-                st.session_state.name_file = uploaded_file.name
-                with st.spinner("Analyse du document..."):
-                    st.session_state.audit = audit_descriptif_word(uploaded_file)
-            audit = st.session_state.audit
-
-            #global audit
-            audit_simplified = {
-                "Nombre de pages": audit["number_of_paragraphs"],
-                "Nombre d'images": audit["number_of_images"],
-                "Nombre de liens": audit["number_of_links"],
-                "Nombre de tableaux": audit["number_of_tables"],
-                "Nombre de tokens": audit["number_of_tokens"],
-                "Nombre de mots": audit["number_of_words"]
-            }
-
-            well_formatted_audit = "Contenus audités\n"
-            for key, value in audit_simplified.items():
-                well_formatted_audit += f"- {key}: {value}\n"
+    dotenv.load_dotenv(dotenv_path=os.path.join('.streamlit', '.env'))
+
+    st.set_page_config(page_title="RAG Agent", page_icon="🤖", layout="wide")
+
+    audit_page = st.Page("audit_page/audit.py", title="Audit", icon="📋", default=True)
+    kg_page = st.Page("audit_page/knowledge_graph.py", title="Graphe de connaissance", icon="🧠")
+    agents_page = st.Page("agents_page/catalogue.py", title="Catalogue des agents", icon="📇")
+    recommended_agents = st.Page("agents_page/recommended_agent.py", title="Agents recommandés", icon="⭐")
+    chatbot = st.Page("chatbot_page/chatbot.py", title="Chatbot", icon="💬")
+    documentation = st.Page("doc_page/documentation.py", title="Documentation", icon="📚")
+
+    pg = st.navigation(
+        {
+            "Audit de contenus": [audit_page, kg_page],
+            "Equipe d'agents IA": [agents_page, recommended_agents],
+            "Chatbot": [chatbot],
+            "Documentation": [documentation]
+        }
+    )
+
+    pg.run()
 
 
 if __name__ == "__main__":
-    main()
+    main()
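Note on the new entry point: app.py now only declares the multipage navigation with st.Page / st.navigation and delegates the audit UI to audit_page/audit.py. As a minimal sketch of the same pattern, here is how one further page could be registered; the extra_page/hello.py path, its title and icon are invented for illustration and are not part of this commit:

import streamlit as st

# Hypothetical extra page registered with the same API the commit uses.
hello = st.Page("extra_page/hello.py", title="Hello", icon="👋")

pg = st.navigation({
    "Demo": [hello],   # section label -> list of st.Page objects
})
pg.run()               # renders whichever page is selected, as app.py does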
audit_page/audit.py
ADDED
@@ -0,0 +1,224 @@
+import streamlit as st
+import pymupdf as fitz
+import pyperclip
+from utils.audit.audit_doc import audit_descriptif_pdf,audit_text,audit_descriptif_word
+import dotenv
+from utils.audit.audit_audio import evaluate_audio_quality
+from PIL import Image
+from io import BytesIO
+import os
+
+
+# Function to classify file type
+def classify_file(file):
+    if file.type.startswith("image/"):
+        return "image"
+    elif file.type == "application/pdf":
+        return "pdf"
+    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        return "word"
+    elif file.type.startswith("audio/"):
+        return "audio"
+    elif file.type.startswith("text/"):
+        return "text"
+    else:
+        return "unknown"
+
+#display content
+def display_content_doc(content:dict,col:st):
+
+    number_of_pages = len(content)
+    col.info("si vous choisissez 0, vous verrez le contenu de toutes les pages")
+
+    number = col.number_input("Numéro de page", min_value=0, max_value=number_of_pages, value=0,key="number_page_content")
+    #0 means all pages
+    if number > 0:
+        page : dict = content[f"page_{number-1}"]
+    option = col.radio("Type de contenu",list(content[f"page_0"].keys()), index=0,horizontal=True)
+    if option == "images":
+        if number == 0:
+            images = [img for page in content.values() for img in page["images"]]
+        else:
+            images = page["images"]
+        col1,col2,col3 = col.columns(3)
+        for i, (img_bytes, img_width, img_height) in enumerate(images):
+            if i%3 == 0:
+                col1.image(Image.open(BytesIO(img_bytes)), caption=f'', width=img_width)
+            elif i%3 == 1:
+                col2.image(Image.open(BytesIO(img_bytes)), caption=f'', width=img_width)
+            else:
+                col3.image(Image.open(BytesIO(img_bytes)), caption=f'', width=img_width)
+
+    elif option == "texte":
+        if number == 0:
+            text = "-------------------\n".join([page["texte"] for page in content.values()])
+        else:
+            text = page["texte"]
+
+        col.text_area("Texte",text,height=200)
+
+    elif option == "liens":
+        if number == 0:
+            links = [link for page in content.values() for link in page["liens"]]
+        else:
+            links = page["liens"]
+        for i, link in enumerate(links):
+            col.markdown(f"- {i+1}: {link['uri']} (page {link['page']})")
+
+
+
+
+def display_audit_pdf(uploaded_file,col:st):
+    if st.session_state.name_file != uploaded_file.name:
+        st.session_state.name_file = uploaded_file.name
+        with st.spinner("Analyse du document..."):
+            st.session_state.audit = audit_descriptif_pdf(uploaded_file,200)
+    audit = st.session_state.audit["audit"]
+    content = st.session_state.audit["content"]
+    #global audit
+    audit_simplified = {
+        "Nombre de pages": audit["number_of_pages"],
+        "Nombre d'images": audit["number_of_images"],
+        "Nombre de liens": audit["number_of_links"],
+        "Nombre de tableaux": audit["number_of_tables"],
+        "Nombre de tokens": audit["number_of_tokens"],
+        "Nombre de mots": audit["number_of_words"],
+        "Mots clés": audit["key_words"]
+    }
+
+    well_formatted_audit = "Contenus audités\n"
+    for key, value in audit_simplified.items():
+        well_formatted_audit += f"- {key}: {value}\n"
+
+
+    col.code(well_formatted_audit)
+
+    #audit par page
+    with col.expander("Audit par page"):
+        number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1,key="number_page_audit")
+        audit_page = audit[f"page_{number-1}"]
+        audit_page = {
+            "Nombre d'images": audit_page["number_of_images"],
+            "Nombre de liens": audit_page["number_of_links"],
+            "Nombre de tableaux": audit_page["number_of_tables"],
+            "Nombre de tokens": audit_page["number_of_tokens"],
+            "Nombre de mots": audit_page["number_of_words"],
+        }
+        well_formatted_audit_page = "Audit descriptif\n"
+        for key, value in audit_page.items():
+            well_formatted_audit_page += f"- {key}: {value}\n"
+
+        st.code(well_formatted_audit_page)
+
+    return content
+
+
+def audit_main():
+
+    #st.set_page_config(page_title="Audit des documents", page_icon=":page_with_curl:", layout="wide")
+    # Streamlit app
+    st.title("Audit des documents")
+
+    notice = "Les formats autorisés sont les suivants :\n- **format texte** : txt, word, pdf\n- **format image** : png, jpg\n- **format audio** : wav, MP3"
+
+    col1, col2 = st.columns([4, 3])
+    col1.markdown(notice)
+
+    if "audit" not in st.session_state:
+        st.session_state.audit = {}
+    if "name_file" not in st.session_state:
+        st.session_state.name_file = ""
+
+    # File uploader
+    uploaded_file = col1.file_uploader("Télécharger un ou plusieurs documents")
+
+    if uploaded_file is not None:
+        type = classify_file(uploaded_file)
+
+        col1.write(f"Type de fichier: {type}")
+
+        col1.write("### Synthèse audit du ou des document(s) téléchargé(s)")
+
+
+
+        if type == "pdf":
+            content = display_audit_pdf(uploaded_file,col1)
+            with col2.expander("Contenu"):
+                display_content_doc(content,st)
+
+        elif type == "audio":
+            if st.session_state.name_file != uploaded_file.name:
+                st.session_state.name_file = uploaded_file.name
+                with st.spinner("Analyse de l'audio..."):
+                    st.session_state.audit = evaluate_audio_quality(uploaded_file)
+            audit = st.session_state.audit
+
+            #audit global simplifié
+            audit_simplified = {
+                "Durée": f"{audit['duration']:0.2f} minutes",
+                "Nombre de mots": audit["number_of_words"],
+                "Nombre de tokens": audit["number_of_tokens"],
+                "Volume": f"{audit['volume']:0.2f} dBFS (déciBels Full Scale)",
+                "SNR": f"{max(audit['SNR'],0):0.2f} dB (Ratio Signal / Bruit)",
+            }
+
+            well_formatted_audit = "Contenus audités\n"
+            for key, value in audit_simplified.items():
+                well_formatted_audit += f"- {key}: {value}\n"
+
+            col1.code(well_formatted_audit)
+
+            with col2.expander("Transcription"):
+                st.write(audit["transcription"])
+                if st.button("📋",key="copy_transcription"):
+                    pyperclip.copy(audit["transcription"])
+                    st.success("Transcription copiée dans le presse-papier")
+
+        elif type == "text":
+            text = uploaded_file.read().decode("utf-8")
+            if st.session_state.name_file != uploaded_file.name:
+                st.session_state.name_file = uploaded_file.name
+                with st.spinner("Analyse du texte..."):
+                    st.session_state.audit = audit_text(text)
+            audit = st.session_state.audit
+
+            #audit global simplifié
+            audit_simplified = {
+                "Nombre de tokens": audit["number_of_tokens"],
+                "Nombre de mots": audit["number_of_words"]
+            }
+
+            well_formatted_audit = "Audit descriptif\n"
+            for key, value in audit_simplified.items():
+                well_formatted_audit += f"- {key}: {value}\n"
+
+            col1.code(well_formatted_audit)
+
+            with col2.expander("Texte"):
+                st.text_area("Texte",text,height=200)
+
+        elif type == "word":
+            if st.session_state.name_file != uploaded_file.name:
+                st.session_state.name_file = uploaded_file.name
+                with st.spinner("Analyse du document..."):
+                    st.session_state.audit = audit_descriptif_word(uploaded_file)
+            audit = st.session_state.audit
+
+            #global audit
+            audit_simplified = {
+                "Nombre de pages": audit["number_of_paragraphs"],
+                "Nombre d'images": audit["number_of_images"],
+                "Nombre de liens": audit["number_of_links"],
+                "Nombre de tableaux": audit["number_of_tables"],
+                "Nombre de tokens": audit["number_of_tokens"],
+                "Nombre de mots": audit["number_of_words"]
+            }
+
+            well_formatted_audit = "Contenus audités\n"
+            for key, value in audit_simplified.items():
+                well_formatted_audit += f"- {key}: {value}\n"
+
+            st.code(well_formatted_audit)
+
+
+audit_main()
audit_page/knowledge_graph.py
ADDED
@@ -0,0 +1,10 @@
+import streamlit as st
+
+
+def kg_main():
+    #st.set_page_config(page_title="Graphe de connaissance", page_icon="", layout="wide")
+
+    st.title("Graphe de connaissance")
+
+
+kg_main()
chatbot_page/chatbot.py
ADDED
@@ -0,0 +1,3 @@
+import streamlit as st
+
+st.title("Chatbot")
doc_page/documentation.py
ADDED
@@ -0,0 +1,3 @@
+import streamlit as st
+
+st.title("Documentation")
utils/audit/audit_audio.py
CHANGED
@@ -31,7 +31,11 @@ def calculate_snr(audio_data):
 
 # Function to evaluate audio quality
 def evaluate_audio_quality(file) -> dict:
-    [removed line 34 not captured in this view]
+    try:
+        audio = AudioSegment.from_file(file)
+    except:
+        audio = AudioSegment.from_file(io.BytesIO(file.read()))
+
     audio_data = np.array(audio.get_array_of_samples())
 
     #number of minutes
@@ -46,5 +50,5 @@ def evaluate_audio_quality(file) -> dict:
     #get the transcription of the audio
     transcription = transcript_audio_func(file)
 
-    return {"volume": volume, "SNR": snr,"transcription": transcription,"number_of_tokens": count_tokens(transcription),"duration": duration}
+    return {"volume": volume, "SNR": snr,"transcription": transcription,"number_of_tokens": count_tokens(transcription),"duration": duration, "number_of_words": len(transcription.split())}
 
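The audio change does two things: evaluate_audio_quality now retries AudioSegment.from_file with an in-memory io.BytesIO buffer when the uploaded object cannot be read directly, and the returned report gains a number_of_words entry computed from the transcription. A rough offline check, assuming a local sample.wav exists and that transcript_audio_func accepts the same file-like argument as AudioSegment:

from utils.audit.audit_audio import evaluate_audio_quality

# Hypothetical local run; audit_page/audit.py passes a Streamlit UploadedFile,
# but any readable binary file-like object should hit the same code path.
with open("sample.wav", "rb") as f:
    report = evaluate_audio_quality(f)

print(f"{report['duration']:.2f} min, {report['number_of_words']} words, "
      f"volume {report['volume']:.2f} dBFS, SNR {report['SNR']:.2f} dB")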
utils/audit/audit_doc.py
CHANGED
@@ -7,11 +7,23 @@ import io
 from rake_nltk import Rake
 import nltk
 from nltk.corpus import stopwords
+from openai import OpenAI
 
 # Download NLTK stopwords
 nltk.download('stopwords')
 nltk.download('punkt')
 
+#function to use gpt4o-mini
+def extract_relevant_keywords(prompt: str) -> str:
+    client = OpenAI()
+    response = client.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {"role": "user", "content": prompt}
+        ]
+    )
+    return response.choices[0].message.content
+
 
 def evaluate_text_quality(text: str) -> dict:
     # Calculate readability metrics
@@ -153,7 +165,15 @@ def audit_descriptif_pdf(file,max_img_width) -> dict:
     # Extract key words from the document
     text = " ".join([page["texte"] for page in doc_content.values()])
     key_words = extract_keywords(text)
-
+    list_key_words_text = "\n".join(key_words[:10])
+    prompt = f'''Voici une liste de mots et phrases provenant d'un document :
+    - {list_key_words_text}
+    Veuillez extraire les cinq mots clés les plus pertinents de cette liste. Chaque mot clé doit contenir au maximum deux mots.
+
+    REPONSE:
+    '''
+    key_words_extracted = extract_relevant_keywords(prompt)
+    audit_dict_doc["key_words"] = "\n" + key_words_extracted
 
     #merge 2 dicts
     global_audit = {
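The new extract_relevant_keywords helper builds the OpenAI client with no arguments, so it expects OPENAI_API_KEY to be available in the environment (app.py now loads .streamlit/.env through dotenv). A standalone sketch of calling it outside audit_descriptif_pdf; the candidate keywords below are invented, whereas in the commit they come from extract_keywords(text)[:10]:

from utils.audit.audit_doc import extract_relevant_keywords

# Hypothetical candidates standing in for the RAKE output used in the commit.
candidates = ["audit de contenus", "agents IA", "transcription audio"]
prompt = (
    "Voici une liste de mots et phrases provenant d'un document :\n- "
    + "\n- ".join(candidates)
    + "\nVeuillez extraire les cinq mots clés les plus pertinents de cette liste. "
    "Chaque mot clé doit contenir au maximum deux mots.\n\nREPONSE:\n"
)
print(extract_relevant_keywords(prompt))  # needs OPENAI_API_KEY set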
|