import pymupdf
import tiktoken
import textstat
from docx import Document
import io
# from rake_nltk import Rake
# import nltk
# from nltk.corpus import stopwords
from openai import OpenAI

# Download NLTK stopwords (only needed for the RAKE-based extraction kept below)
# nltk.download('stopwords')
# nltk.download('punkt')


# Call gpt-4o-mini with a single user prompt and return the raw completion text
def extract_relevant_keywords(prompt: str) -> str:
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content


def evaluate_text_quality(text: str) -> dict:
    # Calculate readability metrics
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    gunning_fog = textstat.gunning_fog(text)
    smog_index = textstat.smog_index(text)
    automated_readability_index = textstat.automated_readability_index(text)

    # Normalize a readability score to a 0-1 scale, clamped so that
    # out-of-range values (e.g. a negative Flesch score) cannot skew the aggregate
    def normalize_score(score, min_score, max_score):
        return min(max((score - min_score) / (max_score - min_score), 0.0), 1.0)

    # Normalize each readability score; the grade-level metrics are inverted
    # because a higher grade means the text is harder to read
    n_flesch_reading_ease = normalize_score(flesch_reading_ease, 0, 100)
    n_flesch_kincaid_grade = 1 - normalize_score(flesch_kincaid_grade, 0, 18)
    n_gunning_fog = 1 - normalize_score(gunning_fog, 0, 18)
    n_smog_index = 1 - normalize_score(smog_index, 0, 18)
    n_automated_readability_index = 1 - normalize_score(automated_readability_index, 0, 18)

    # Weights for each metric (adjust these as needed)
    weights = {
        "flesch_reading_ease": 0.25,
        "flesch_kincaid_grade": 0.25,
        "gunning_fog": 0.2,
        "smog_index": 0.15,
        "automated_readability_index": 0.15
    }

    # Weighted average of the normalized scores
    global_score = (
        n_flesch_reading_ease * weights["flesch_reading_ease"] +
        n_flesch_kincaid_grade * weights["flesch_kincaid_grade"] +
        n_gunning_fog * weights["gunning_fog"] +
        n_smog_index * weights["smog_index"] +
        n_automated_readability_index * weights["automated_readability_index"]
    )

    # Scale the global score to 0-5
    global_score_0_5 = global_score * 5

    # Return the raw metrics alongside the aggregated 0-5 score
    return {
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "gunning_fog": gunning_fog,
        "smog_index": smog_index,
        "automated_readability_index": automated_readability_index,
        "global_score_0_5": global_score_0_5
    }


# RAKE-based keyword extraction, kept for reference (superseded by gpt-4o-mini)
# def extract_keywords(text):
#     rake = Rake(stopwords.words('french'))
#     rake.extract_keywords_from_text(text)
#     return rake.get_ranked_phrases()


# Count tokens with the cl100k_base encoding used by OpenAI chat models
def count_tokens(input_string: str) -> int:
    tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(input_string)
    return len(tokens)
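
# Illustrative usage of the helpers above (a sketch only, left commented out so
# the module has no side effects on import; the sample sentence is a placeholder):
#
#   quality = evaluate_text_quality("The cat sleeps. The house is big.")
#   quality["global_score_0_5"]      # aggregate readability on a 0-5 scale
#   count_tokens("Bonjour tout le monde")  # number of cl100k_base tokens
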
def audit_descriptif_pdf(file, max_img_width: int) -> dict:
    # Read the uploaded file into memory; filetype is given explicitly so
    # pymupdf does not have to guess the format from the stream
    document = pymupdf.open(stream=file.read(), filetype="pdf")

    audit_dict_doc = {
        "number_of_pages": len(document),
        "number_of_images": 0,
        "number_of_links": 0,
        "number_of_tables": 0,
        "number_of_tokens": 0,
        "number_of_words": 0,
        "key_words": ""
    }

    doc_content = {}

    for page in document:
        audit_dict_page = {}
        page_content = {
            "images": [],
            "texte": "",
            "liens": [],
            "tableaux": []
        }

        # Number of images
        images = page.get_images()
        number_images = len(images)
        audit_dict_page["number_of_images"] = number_images
        audit_dict_doc["number_of_images"] += number_images

        # Extract each image, downscaling to max_img_width while keeping the aspect ratio
        for img in images:
            xref = img[0]
            base_image = document.extract_image(xref)
            image_bytes = base_image["image"]
            image_width = base_image["width"]
            image_height = base_image["height"]
            if image_width > max_img_width:
                ratio = max_img_width / image_width
                image_width = max_img_width
                image_height = int(image_height * ratio)
            page_content["images"].append((image_bytes, image_width, image_height))

        # Collect external (URI) links only
        links = []
        for link in page.get_links():
            if link["kind"] == pymupdf.LINK_URI and "uri" in link:
                links.append({"uri": link["uri"], "page": page.number})
        page_content["liens"] = links

        # Number of links
        number_links = len(links)
        audit_dict_page["number_of_links"] = number_links
        audit_dict_doc["number_of_links"] += number_links

        # Detect tables and keep each one as a pandas DataFrame
        tables = page.find_tables().tables
        number_tables = len(tables)
        for tab in tables:
            page_content["tableaux"].append(tab.to_pandas())
        audit_dict_page["number_of_tables"] = number_tables
        audit_dict_doc["number_of_tables"] += number_tables

        # Number of tokens and words
        text = page.get_text("text")
        number_tokens = count_tokens(text)
        number_words = len(text.split())
        audit_dict_page["number_of_tokens"] = number_tokens
        audit_dict_page["number_of_words"] = number_words

        # Page text
        page_content["texte"] = text

        audit_dict_doc["number_of_tokens"] += number_tokens
        audit_dict_doc["number_of_words"] += number_words

        audit_dict_doc[f"page_{page.number}"] = audit_dict_page
        doc_content[f"page_{page.number}"] = page_content

    # Extract keywords from the full document text with gpt-4o-mini
    text = " ".join([page["texte"] for page in doc_content.values()])
    # key_words = extract_keywords(text)
    # list_key_words_text = "\n".join(key_words[:10])
    # French prompt: extract the five most relevant keywords from the document,
    # two words each at most, returned as a comma-separated list
    prompt = f'''Voici le document:
    - {text}

    Veuillez extraire les cinq mots clés les plus pertinents de ce document. Chaque mot clé doit contenir au maximum deux mots.

    TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
    key_word1, key_word2, key_word3, key_word4, key_word5
    '''
    key_words_extracted = extract_relevant_keywords(prompt)
    audit_dict_doc["key_words"] = "\n" + key_words_extracted

    # Merge the audit metrics and the raw page contents
    global_audit = {
        "audit": audit_dict_doc,
        "content": doc_content
    }

    return global_audit


def audit_text(text: str) -> dict:
    # Same French keyword prompt as in audit_descriptif_pdf
    prompt = f'''Voici le document:
    - {text}

    Veuillez extraire les cinq mots clés les plus pertinents de ce document. Chaque mot clé doit contenir au maximum deux mots.

    TA REPONSE DOIT RESPECTER LE FORMAT SUIVANT :
    key_word1, key_word2, key_word3, key_word4, key_word5
    '''
    key_words_extracted = extract_relevant_keywords(prompt)

    audit_dict = {
        "number_of_tokens": count_tokens(text),
        "number_of_words": len(text.split()),
    }
    audit_dict["key_words"] = "\n" + key_words_extracted

    global_audit = {
        "audit": audit_dict,
        "content": text
    }

    return global_audit
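

# Minimal usage sketch, assuming a local file "exemple.pdf" exists, the
# OPENAI_API_KEY environment variable is set, and 600 px is an arbitrary
# maximum image width (all three are assumptions, not part of the module)
if __name__ == "__main__":
    with open("exemple.pdf", "rb") as f:
        report = audit_descriptif_pdf(f, max_img_width=600)
    print("Pages:", report["audit"]["number_of_pages"])
    print("Images:", report["audit"]["number_of_images"])
    print("Keywords:", report["audit"]["key_words"])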