Ilyas KHIAT committed on
Commit
e1bcbc6
·
1 Parent(s): 38b5db9

more details content

Browse files
Files changed (3) hide show
  1. app.py +141 -58
  2. requirements.txt +6 -1
  3. utils/audit/audit_doc.py +126 -8
app.py CHANGED
@@ -1,20 +1,11 @@
1
  import streamlit as st
2
  import pymupdf as fitz
3
  import pyperclip
4
- from utils.audit.audit_doc import audit_descriptif
5
  import dotenv
6
  from utils.audit.audit_audio import evaluate_audio_quality
7
-
8
-
9
- # Function to extract text from PDF
10
- def extract_text_from_pdf(file):
11
- document = fitz.open(stream=file.read(), filetype="pdf")
12
- full_text = ""
13
- for page_num in range(len(document)):
14
- page = document.load_page(page_num)
15
- text = page.get_text("text")
16
- full_text += text
17
- return full_text
18
 
19
  # Function to classify file type
20
  def classify_file(file):
@@ -30,6 +21,95 @@ def classify_file(file):
30
  return "text"
31
  else:
32
  return "unknown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def main():
35
  dotenv.load_dotenv()
@@ -42,25 +122,53 @@ def main():
42
  st.session_state.name_file = ""
43
 
44
  # File uploader
45
- uploaded_file = st.file_uploader("Télécharger un documents")
46
 
47
  if uploaded_file is not None:
48
  type = classify_file(uploaded_file)
 
49
  st.write(f"Type de fichier: {type}")
 
 
 
50
  if type == "pdf":
 
51
 
 
 
52
  if st.session_state.name_file != uploaded_file.name:
53
  st.session_state.name_file = uploaded_file.name
54
- with st.spinner("Analyse du document..."):
55
- st.session_state.audit = audit_descriptif(uploaded_file)
56
  audit = st.session_state.audit
57
 
58
- #global audit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  audit_simplified = {
60
- "Nombre de pages": audit["number_of_pages"],
61
- "Nombre d'images": audit["number_of_images"],
62
- "Nombre de liens": audit["number_of_links"],
63
- "Nombre de tableaux": audit["number_of_tables"],
64
  "Nombre de tokens": audit["number_of_tokens"],
65
  "Nombre de mots": audit["number_of_words"]
66
  }
@@ -69,56 +177,31 @@ def main():
69
  for key, value in audit_simplified.items():
70
  well_formatted_audit += f"- {key}: {value}\n"
71
 
72
- st.write("### Audit de tout le document")
73
  st.code(well_formatted_audit)
74
-
75
- #audit par page
76
- with st.expander("Audit par page"):
77
- number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1)
78
- audit_page = audit[f"page_{number-1}"]
79
- audit_page = {
80
- "Nombre d'images": audit_page["number_of_images"],
81
- "Nombre de liens": audit_page["number_of_links"],
82
- "Nombre de tableaux": audit_page["number_of_tables"],
83
- "Nombre de tokens": audit_page["number_of_tokens"],
84
- "Nombre de mots": audit_page["number_of_words"]
85
- }
86
- well_formatted_audit_page = "Audit descriptif\n"
87
- for key, value in audit_page.items():
88
- well_formatted_audit_page += f"- {key}: {value}\n"
89
-
90
- st.code(well_formatted_audit_page)
91
-
92
- # # Button to copy text to clipboard
93
- # if st.button("Copy to Clipboard"):
94
- # pyperclip.copy(audit)
95
- # st.success("Text copied to clipboard successfully!")
96
- # else:
97
- # st.info("Please upload a PDF file to extract text.")
98
 
99
- elif type == "audio":
100
  if st.session_state.name_file != uploaded_file.name:
101
  st.session_state.name_file = uploaded_file.name
102
- with st.spinner("Analyse de l'audio..."):
103
- st.session_state.audit = evaluate_audio_quality(uploaded_file)
104
  audit = st.session_state.audit
105
 
106
- #audit global simplifié
107
  audit_simplified = {
108
- "Volume": f"{audit['volume']:0.2f} dBFS",
109
- "SNR": f"{max(audit['SNR'],0):0.2f} dB",
110
- "Durée": f"{audit['duration']:0.2f} minutes",
111
- "Nombre de tokens": audit["number_of_tokens"]
 
 
112
  }
113
 
114
- well_formatted_audit = "Audit descriptif\n"
115
  for key, value in audit_simplified.items():
116
  well_formatted_audit += f"- {key}: {value}\n"
117
-
118
  st.code(well_formatted_audit)
119
-
120
- with st.expander("Transcription"):
121
- st.write(audit["transcription"])
122
 
123
  if __name__ == "__main__":
124
  main()
 
1
  import streamlit as st
2
  import pymupdf as fitz
3
  import pyperclip
4
+ from utils.audit.audit_doc import audit_descriptif_pdf,audit_text,audit_descriptif_word
5
  import dotenv
6
  from utils.audit.audit_audio import evaluate_audio_quality
7
+ from PIL import Image
8
+ from io import BytesIO
 
 
 
 
 
 
 
 
 
9
 
10
  # Function to classify file type
11
  def classify_file(file):
 
21
  return "text"
22
  else:
23
  return "unknown"
24
+
25
+ #display content
26
def display_content_doc(content:dict):
    """Render the extracted content of a document, for one page or all pages.

    ``content`` maps ``"page_<i>"`` keys (``i`` starting at 0) to per-page
    dicts. The entries read here are ``"images"`` (3-tuples of image bytes,
    width, height), ``"texte"`` (a string) and ``"liens"`` (dicts carrying
    ``"uri"`` and ``"page"``). The user picks a page number (0 = every page)
    and a content type; the matching content is drawn with Streamlit widgets.
    """
    # Nothing was extracted: the "page_0" lookup below would raise KeyError.
    if not content:
        st.info("Aucun contenu à afficher")
        return

    number_of_pages = len(content)
    st.info("si vous choisissez 0, vous verrez le contenu de toutes les pages")

    number = st.number_input("Numéro de page", min_value=0, max_value=number_of_pages, value=0,key="number_page_content")
    # 0 means all pages; otherwise the widget value is 1-based while the
    # dictionary keys are 0-based.
    if number == 0:
        pages = list(content.values())
    else:
        pages = [content[f"page_{number-1}"]]

    # The available content types are taken from the first page's keys.
    option = st.radio("Type de contenu",list(content["page_0"].keys()), index=0,horizontal=True)

    if option == "images":
        images = [img for page in pages for img in page["images"]]
        columns = st.columns(3)
        # Distribute the images round-robin over the three columns.
        for i, (img_bytes, img_width, _img_height) in enumerate(images):
            columns[i % 3].image(Image.open(BytesIO(img_bytes)), caption=f'Image {i + 1}', width=img_width)

    elif option == "texte":
        text = "-------------------\n".join(page["texte"] for page in pages)
        st.text_area("Texte",text,height=200)

    elif option == "liens":
        links = [link for page in pages for link in page["liens"]]
        for i, link in enumerate(links):
            # NOTE(review): link["page"] is assumed to be the zero-based page
            # index stored by the PDF audit; it is shown 1-based so it matches
            # the page selector above.
            st.markdown(f"- {i+1}: {link['uri']} (page {link['page'] + 1})")
65
+
66
+
67
+
68
+
69
def display_audit_pdf(uploaded_file):
    """Audit an uploaded PDF and display the document-level and per-page results.

    The audit is stored in ``st.session_state.audit`` and is recomputed only
    when the uploaded file's name differs from ``st.session_state.name_file``.
    The stored value is expected to hold an ``"audit"`` dict (counters) and a
    ``"content"`` dict (extracted content), both read below.
    """
    state = st.session_state
    if state.name_file != uploaded_file.name:
        state.name_file = uploaded_file.name
        with st.spinner("Analyse du document..."):
            state.audit = audit_descriptif_pdf(uploaded_file,200)

    audit = state.audit["audit"]
    content = state.audit["content"]

    # Document-level summary: (displayed label, key in the audit dict).
    summary_fields = (
        ("Nombre de pages", "number_of_pages"),
        ("Nombre d'images", "number_of_images"),
        ("Nombre de liens", "number_of_links"),
        ("Nombre de tableaux", "number_of_tables"),
        ("Nombre de tokens", "number_of_tokens"),
        ("Nombre de mots", "number_of_words"),
        ("Mots clés", "key_words"),
    )
    summary_lines = [f"- {label}: {audit[field]}\n" for label, field in summary_fields]
    st.code("Contenus audités\n" + "".join(summary_lines))

    # Per-page audit: the widget is 1-based, the "page_<i>" keys are 0-based.
    with st.expander("Audit par page"):
        number = st.number_input("Numéro de page", min_value=1, max_value=audit["number_of_pages"], value=1,key="number_page_audit")
        page_audit = audit[f"page_{number-1}"]
        page_fields = (
            ("Nombre d'images", "number_of_images"),
            ("Nombre de liens", "number_of_links"),
            ("Nombre de tableaux", "number_of_tables"),
            ("Nombre de tokens", "number_of_tokens"),
            ("Nombre de mots", "number_of_words"),
        )
        page_lines = [f"- {label}: {page_audit[field]}\n" for label, field in page_fields]
        st.code("Audit descriptif\n" + "".join(page_lines))

    with st.expander("Cliquer ici pour voir le contenu du document"):
        display_content_doc(content)
112
+
113
 
114
  def main():
115
  dotenv.load_dotenv()
 
122
  st.session_state.name_file = ""
123
 
124
  # File uploader
125
+ uploaded_file = st.file_uploader("Télécharger un ou plusieurs des documents")
126
 
127
  if uploaded_file is not None:
128
  type = classify_file(uploaded_file)
129
+
130
  st.write(f"Type de fichier: {type}")
131
+
132
+ st.write("### Audit de tout le document")
133
+
134
  if type == "pdf":
135
+ display_audit_pdf(uploaded_file)
136
 
137
+
138
+ elif type == "audio":
139
  if st.session_state.name_file != uploaded_file.name:
140
  st.session_state.name_file = uploaded_file.name
141
+ with st.spinner("Analyse de l'audio..."):
142
+ st.session_state.audit = evaluate_audio_quality(uploaded_file)
143
  audit = st.session_state.audit
144
 
145
+ #audit global simplifié
146
+ audit_simplified = {
147
+ "Volume": f"{audit['volume']:0.2f} dBFS",
148
+ "SNR": f"{max(audit['SNR'],0):0.2f} dB",
149
+ "Durée": f"{audit['duration']:0.2f} minutes",
150
+ "Nombre de tokens": audit["number_of_tokens"]
151
+ }
152
+
153
+ well_formatted_audit = "Contenus audités\n"
154
+ for key, value in audit_simplified.items():
155
+ well_formatted_audit += f"- {key}: {value}\n"
156
+
157
+ st.code(well_formatted_audit)
158
+
159
+ with st.expander("Transcription"):
160
+ st.write(audit["transcription"])
161
+
162
+ elif type == "text":
163
+ text = uploaded_file.read().decode("utf-8")
164
+ if st.session_state.name_file != uploaded_file.name:
165
+ st.session_state.name_file = uploaded_file.name
166
+ with st.spinner("Analyse du texte..."):
167
+ st.session_state.audit = audit_text(text)
168
+ audit = st.session_state.audit
169
+
170
+ #audit global simplifié
171
  audit_simplified = {
 
 
 
 
172
  "Nombre de tokens": audit["number_of_tokens"],
173
  "Nombre de mots": audit["number_of_words"]
174
  }
 
177
  for key, value in audit_simplified.items():
178
  well_formatted_audit += f"- {key}: {value}\n"
179
 
 
180
  st.code(well_formatted_audit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
+ elif type == "word":
183
  if st.session_state.name_file != uploaded_file.name:
184
  st.session_state.name_file = uploaded_file.name
185
+ with st.spinner("Analyse du document..."):
186
+ st.session_state.audit = audit_descriptif_word(uploaded_file)
187
  audit = st.session_state.audit
188
 
189
+ #global audit
190
  audit_simplified = {
191
+ "Nombre de pages": audit["number_of_paragraphs"],
192
+ "Nombre d'images": audit["number_of_images"],
193
+ "Nombre de liens": audit["number_of_links"],
194
+ "Nombre de tableaux": audit["number_of_tables"],
195
+ "Nombre de tokens": audit["number_of_tokens"],
196
+ "Nombre de mots": audit["number_of_words"]
197
  }
198
 
199
+ well_formatted_audit = "Contenus audités\n"
200
  for key, value in audit_simplified.items():
201
  well_formatted_audit += f"- {key}: {value}\n"
202
+
203
  st.code(well_formatted_audit)
204
+
 
 
205
 
206
  if __name__ == "__main__":
207
  main()
requirements.txt CHANGED
@@ -6,4 +6,9 @@ numpy
6
  scipy
7
  textstat
8
  pymupdf
9
- openai
 
 
 
 
 
 
6
  scipy
7
  textstat
8
  pymupdf
9
+ openai
10
+ nltk
11
+ rake_nltk
12
+ python-docx
13
+ io
14
+ pillow
utils/audit/audit_doc.py CHANGED
@@ -2,6 +2,15 @@
2
  import pymupdf
3
  import tiktoken
4
  import textstat
 
 
 
 
 
 
 
 
 
5
 
6
 
7
  def evaluate_text_quality(text: str) -> dict:
@@ -44,12 +53,17 @@ def evaluate_text_quality(text: str) -> dict:
44
  # Scale the global score to 0-5
45
  global_score_0_5 = global_score * 5
46
 
 
 
 
 
 
47
  def count_tokens(input_string: str) -> int:
48
  tokenizer = tiktoken.get_encoding("cl100k_base")
49
  tokens = tokenizer.encode(input_string)
50
  return len(tokens)
51
 
52
- def audit_descriptif(file) -> dict:
53
  document = pymupdf.open(stream=file.read())
54
 
55
  audit_dict_doc = {
@@ -58,20 +72,56 @@ def audit_descriptif(file) -> dict:
58
  "number_of_links": 0,
59
  "number_of_tables": 0,
60
  "number_of_tokens": 0,
61
- "number_of_words": 0
 
62
  }
63
 
 
 
64
  for page in document:
65
 
66
  audit_dict_page = {}
 
 
 
 
 
 
67
  #number of images
68
- number_images = len(page.get_images())
 
69
  audit_dict_page["number_of_images"] = number_images
70
  audit_dict_doc["number_of_images"] += number_images
71
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  #number of links
74
- number_links = len(page.get_links())
75
  audit_dict_page["number_of_links"] = number_links
76
  audit_dict_doc["number_of_links"] += number_links
77
 
@@ -85,12 +135,80 @@ def audit_descriptif(file) -> dict:
85
  number_tokens = count_tokens(text)
86
  number_words = len(text.split())
87
 
88
- audit_dict_page["number_of_tokens"] = count_tokens(text)
89
- audit_dict_page["number_of_words"] = len(text.split())
 
 
 
90
 
91
  audit_dict_doc["number_of_tokens"] += number_tokens
92
  audit_dict_doc["number_of_words"] += number_words
93
 
94
  audit_dict_doc[f"page_{page.number}"] = audit_dict_page
95
 
96
- return audit_dict_doc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pymupdf
3
  import tiktoken
4
  import textstat
5
+ from docx import Document
6
+ import io
7
+ from rake_nltk import Rake
8
+ import nltk
9
+ from nltk.corpus import stopwords
10
+
11
+ # Download NLTK stopwords
12
+ nltk.download('stopwords')
13
+ nltk.download('punkt')
14
 
15
 
16
  def evaluate_text_quality(text: str) -> dict:
 
53
  # Scale the global score to 0-5
54
  global_score_0_5 = global_score * 5
55
 
56
def extract_keywords(text, language="english"):
    """Extract key phrases from *text* with RAKE.

    Args:
        text: The text to analyse.
        language: Name of the NLTK stop-word list to use (for example
            ``"french"``). Defaults to ``"english"``, which is the list this
            function always used before the parameter existed.

    Returns:
        The ranked phrases with their scores, exactly as returned by
        ``Rake.get_ranked_phrases_with_scores()``.
    """
    rake = Rake(stopwords.words(language))
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases_with_scores()
60
+
61
  def count_tokens(input_string: str) -> int:
62
  tokenizer = tiktoken.get_encoding("cl100k_base")
63
  tokens = tokenizer.encode(input_string)
64
  return len(tokens)
65
 
66
+ def audit_descriptif_pdf(file,max_img_width) -> dict:
67
  document = pymupdf.open(stream=file.read())
68
 
69
  audit_dict_doc = {
 
72
  "number_of_links": 0,
73
  "number_of_tables": 0,
74
  "number_of_tokens": 0,
75
+ "number_of_words": 0,
76
+ "key_words": []
77
  }
78
 
79
+ doc_content = dict()
80
+
81
  for page in document:
82
 
83
  audit_dict_page = {}
84
+ page_content = {
85
+ "images": [],
86
+ "texte": "",
87
+ "liens": []
88
+ }
89
+
90
  #number of images
91
+ images = page.get_images()
92
+ number_images = len(images)
93
  audit_dict_page["number_of_images"] = number_images
94
  audit_dict_doc["number_of_images"] += number_images
95
+
96
+ #get images
97
+ for _, img in enumerate(images):
98
+ xref = img[0]
99
+ base_image = document.extract_image(xref)
100
+
101
+ image_bytes = base_image["image"]
102
+ image_width = base_image["width"]
103
+ image_height = base_image["height"]
104
+
105
+ # Adjust image size if it exceeds the maximum width
106
+ if image_width > max_img_width:
107
+ ratio = max_img_width / image_width
108
+ image_width = max_img_width
109
+ image_height = int(image_height * ratio)
110
+
111
+ page_content["images"].append((image_bytes, image_width, image_height))
112
+
113
+
114
+
115
+ #get links with uri
116
+ links = []
117
+ for link in page.get_links():
118
+ if link['kind'] == pymupdf.LINK_URI and 'uri' in link:
119
+ links.append({"uri": link["uri"], "page": page.number})
120
+
121
+ page_content["liens"] = links
122
 
123
  #number of links
124
+ number_links = len(links)
125
  audit_dict_page["number_of_links"] = number_links
126
  audit_dict_doc["number_of_links"] += number_links
127
 
 
135
  number_tokens = count_tokens(text)
136
  number_words = len(text.split())
137
 
138
+ audit_dict_page["number_of_tokens"] = number_tokens
139
+ audit_dict_page["number_of_words"] = number_words
140
+
141
+ #get text
142
+ page_content["texte"] = text
143
 
144
  audit_dict_doc["number_of_tokens"] += number_tokens
145
  audit_dict_doc["number_of_words"] += number_words
146
 
147
  audit_dict_doc[f"page_{page.number}"] = audit_dict_page
148
 
149
+ doc_content[f"page_{page.number}"] = page_content
150
+
151
+ # Extract key words from the document
152
+ text = " ".join([page["texte"] for page in doc_content.values()])
153
+ key_words = extract_keywords(text)
154
+ audit_dict_doc["key_words"] = key_words[:5]
155
+
156
+ #merge 2 dicts
157
+ global_audit = {
158
+ "audit": audit_dict_doc,
159
+ "content": doc_content
160
+ }
161
+
162
+ return global_audit
163
+
164
def audit_text(text: str) -> dict:
    """Return the token count and whitespace-separated word count of *text*.

    The result has two keys: ``"number_of_tokens"`` (from ``count_tokens``)
    and ``"number_of_words"`` (length of ``text.split()``).
    """
    token_count = count_tokens(text)
    word_count = len(text.split())
    return {"number_of_tokens": token_count, "number_of_words": word_count}
171
+
172
+
173
+
174
def count_tokens(text):
    """Return how many ``cl100k_base`` tokens *text* encodes to.

    This module defines ``count_tokens`` twice; because this definition comes
    last, it is the one every caller resolves at run time. It therefore has to
    count real tokenizer tokens: a plain ``len(text.split())`` here would make
    every reported token count identical to the word count.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))
177
+
178
def audit_descriptif_word(file) -> dict:
    """Compute descriptive counters for a Word (.docx) document.

    Args:
        file: A binary file-like object; its ``read()`` result is loaded as a
            .docx document.

    Returns:
        A dict with the keys ``number_of_paragraphs``, ``number_of_images``,
        ``number_of_links``, ``number_of_tables``, ``number_of_tokens`` and
        ``number_of_words``. Tokens and words are summed over the text of the
        paragraphs in ``document.paragraphs``.
    """
    document = Document(io.BytesIO(file.read()))

    number_of_tokens = 0
    number_of_words = 0
    number_of_links = 0

    paragraphs = document.paragraphs
    for para in paragraphs:
        text = para.text
        number_of_tokens += count_tokens(text)
        number_of_words += len(text.split())

        # python-docx runs have no ``link`` attribute (reading it raises
        # AttributeError), so hyperlinks are counted from the paragraph's
        # ``w:hyperlink`` XML elements instead.
        number_of_links += len(para._element.xpath(".//w:hyperlink"))

    return {
        "number_of_paragraphs": len(paragraphs),
        # Images are counted as the document's inline shapes.
        "number_of_images": len(document.inline_shapes),
        "number_of_links": number_of_links,
        "number_of_tables": len(document.tables),
        "number_of_tokens": number_of_tokens,
        "number_of_words": number_of_words,
    }
214
+