import pdfplumber
import re


# Extract text as newline-delimited paragraphs, without tables and graphs.

# Lines starting with one of these markers are dropped as page headers.
_HEADER_RE = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
# Lines starting with one of these markers are dropped as footers/captions.
_FOOTER_RE = re.compile(
    r"^(Page \d+ of \d+|Footer|Document|Note:|Source:"
    r"|the 75th and 25th percentiles|\|\d+)",
    re.IGNORECASE,
)
# A footnote line starts with a number, a space, one digit and another space.
_FOOTNOTE_RE = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
# A removal section runs from a line starting with "Chart" up to and
# including the next line starting with "Source:" or "Note:".
_REMOVAL_START_RE = re.compile(r"^Chart", re.IGNORECASE)
_REMOVAL_END_RE = re.compile(r"^(Source:|Note:)", re.IGNORECASE)
# Known abbreviations whose trailing period must not count as a sentence end.
_ABBREVIATION_PERIOD_RE = re.compile(
    r"(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\."
)
_SENTENCE_END_RE = re.compile(r"[\.\!\?]\s*$")


def _strip_abbreviation_periods(text):
    """Remove the period that follows a known abbreviation (``Dr.`` -> ``Dr``)."""
    return _ABBREVIATION_PERIOD_RE.sub(r"\1", text)


def _is_end_of_sentence(text):
    """Return True if *text* ends with ``.``, ``!`` or ``?``.

    Surrounding whitespace is ignored, and the period after a known
    abbreviation is removed first so that text ending in ``e.g.`` is not
    mistaken for a finished sentence.
    """
    return bool(_SENTENCE_END_RE.search(_strip_abbreviation_periods(text.strip())))


def _clean_page_text(text):
    """Filter one page of raw text and re-flow its lines into paragraphs.

    Headers, footers, footnotes, blank lines and removal sections are dropped.
    A kept line is glued (with a single space) onto the previous paragraph
    when that paragraph does not yet end a sentence; otherwise it starts a
    new paragraph. Paragraphs are returned joined by newlines.
    """
    paragraphs = []
    in_removal_section = False
    in_footnote = False

    for line in text.split("\n"):
        if _REMOVAL_START_RE.match(line):
            in_removal_section = True
        if in_removal_section and _REMOVAL_END_RE.match(line):
            # The closing "Source:"/"Note:" line is itself discarded.
            in_removal_section = False
            continue

        if _FOOTNOTE_RE.match(line):
            # Keep skipping only while the footnote is still open. A footnote
            # that already ends its sentence on this line must not swallow
            # the body lines that follow it.
            in_footnote = not _is_end_of_sentence(line)
            continue
        if in_footnote:
            # Continuation of a footnote: dropped, and the line that ends the
            # sentence also ends the footnote.
            if _is_end_of_sentence(line):
                in_footnote = False
            continue

        if in_removal_section or _HEADER_RE.match(line) or _FOOTER_RE.match(line):
            continue

        stripped = line.strip()
        if not stripped:
            continue

        if paragraphs and not _is_end_of_sentence(paragraphs[-1]):
            paragraphs[-1] += " " + stripped
        else:
            paragraphs.append(stripped)

    return "\n".join(paragraphs)


def extract_and_format_paragraphs(pdf_path):
    """Extract the text of a PDF as newline-separated paragraphs.

    Every page is read with pdfplumber and cleaned: header and footer lines,
    footnotes, blank lines and sections running from a "Chart" line to the
    next "Source:"/"Note:" line are removed, and wrapped lines are joined
    into paragraphs. Text that does not end a sentence at the bottom of a
    page is continued with the text of the next page.

    Args:
        pdf_path: Value passed unchanged to ``pdfplumber.open``.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
        Pages for which ``extract_text()`` returns nothing are skipped.
    """
    finished_chunks = []
    pending = ""

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned = _clean_page_text(page_text)
            if pending and not _is_end_of_sentence(pending):
                # The previous page stopped mid-sentence: keep building it.
                pending += " " + cleaned
            else:
                if pending:
                    finished_chunks.append(pending)
                pending = cleaned

    if pending:
        finished_chunks.append(pending)

    return "\n".join(finished_chunks).strip()


# Cleaning: cut unnecessary information such as annex and
# intro

# Whitespace that directly follows sentence-ending punctuation.
_SENTENCE_BOUNDARY_RE = re.compile(r"(?<=[.!?])\s+")


def find_text_range(text, start_keyword, end_keywords):
    """Locate the span between a start keyword and the nearest end keyword.

    Matching is case-insensitive. End keywords are searched only after the
    start keyword, so an earlier mention (for example in a table of contents)
    cannot produce an end index that lies before the start index.

    Args:
        text: The text to search.
        start_keyword: Keyword whose first occurrence marks the start.
        end_keywords: Iterable of keywords; the earliest occurrence of any of
            them after the start keyword marks the end.

    Returns:
        A ``(start_index, end_index)`` tuple. ``end_index`` is ``len(text)``
        when no end keyword is found after the start keyword.

    Raises:
        ValueError: If ``start_keyword`` does not occur in ``text``.
    """
    # NOTE: indices are computed on the lower-cased copy and then applied to
    # the original text, which assumes lower-casing does not change its length.
    lowered = text.lower()
    needle = start_keyword.lower()

    start_index = lowered.find(needle)
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    search_from = start_index + len(needle)
    hits = (lowered.find(keyword.lower(), search_from) for keyword in end_keywords)
    end_index = min((hit for hit in hits if hit != -1), default=len(text))
    return start_index, end_index


def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with outer whitespace stripped."""
    return text[start_index:end_index].strip()


# Split paragraphs into list of paragraphs
def _count_sentences(text):
    """Count the pieces of *text* separated by sentence-ending punctuation.

    Text without any sentence-ending punctuation counts as one sentence.
    """
    return len(_SENTENCE_BOUNDARY_RE.split(text.strip()))


def _merge_single_sentence_paragraphs(paragraphs):
    """Join each single-sentence paragraph with the paragraph after it.

    The following paragraph is consumed by the merge and is not examined
    again. Blank paragraphs are dropped, and a single-sentence paragraph
    followed by a blank one is kept on its own.
    """
    merged = []
    index = 0
    total = len(paragraphs)

    while index < total:
        current = paragraphs[index].strip()
        index += 1
        if not current:
            continue

        if index < total and _count_sentences(current) == 1:
            following = paragraphs[index].strip()
            if following:
                merged.append(current + " " + following)
                index += 1  # The next paragraph was absorbed by the merge.
                continue

        merged.append(current)

    return merged


def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs and merge single-sentence paragraphs.

    The text is split on runs of newlines. Paragraphs whose stripped length
    is not greater than ``min_length`` are discarded, then every remaining
    single-sentence paragraph is merged with the paragraph that follows it.

    Args:
        extracted_text: Newline-delimited text.
        min_length: Paragraphs must be strictly longer than this to be kept.

    Returns:
        The list of resulting paragraph strings.
    """
    paragraphs = re.split(r"\n+", extracted_text.strip())
    long_enough = [p for p in paragraphs if len(p.strip()) > min_length]
    return _merge_single_sentence_paragraphs(long_enough)