Spaces:
Running
Running
import pdfplumber | |
import re | |
# Extract text as newline-delimited paragraphs, without tables and graphs
def extract_and_format_paragraphs(pdf_path):
    """Extract the text of a PDF as newline-separated paragraphs.

    Each page is read with ``pdfplumber``, filtered line by line, and then
    stitched to the previous page when that page stopped mid-sentence.

    The per-line filter drops:
      * lines matching the header / footer patterns below,
      * everything from a line starting with "Chart" up to and including the
        next line starting with "Source:" or "Note:",
      * footnotes (a line matching the footnote pattern, plus its continuation
        lines up to the line that ends the sentence),
      * blank lines.

    Surviving lines are glued to the preceding line with a single space unless
    the preceding line already ends a sentence.

    Args:
        pdf_path: Path (or any object accepted by ``pdfplumber.open``) of the
            PDF to read.

    Returns:
        The cleaned text, with paragraphs separated by ``"\\n"`` and no leading
        or trailing whitespace. Empty string if no page yields text.
    """
    # Line-level filters. All are anchored at the start of the line.
    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
    footer_pattern = re.compile(
        r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)",
        re.IGNORECASE,
    )
    # A footnote line is: digits, space, one digit, space.
    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)

    # Abbreviations whose trailing period must not count as a sentence end.
    # Raw string: the original built this with a non-raw f-string, where "\."
    # is an invalid escape sequence.
    abbreviation_period_re = re.compile(
        r'(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\.'
    )
    sentence_end_re = re.compile(r'[\.\!\?]\s*$')

    def remove_abbreviation_periods(text):
        """Drop the period that directly follows a known abbreviation."""
        return abbreviation_period_re.sub(r'\1', text)

    def is_end_of_sentence(text):
        """Return True if ``text`` ends with '.', '!' or '?' that is not
        merely the period of a known abbreviation."""
        text = remove_abbreviation_periods(text.strip())
        return bool(sentence_end_re.search(text))

    def clean_text(text):
        """Filter one page of raw text and join wrapped lines into paragraphs."""
        paragraph_lines = []
        in_removal_section = False   # inside a "Chart ... Source:/Note:" span
        skipping_footnote = False    # inside a multi-line footnote

        for line in text.split('\n'):
            # Chart sections: open on "Chart", close on (and drop) the
            # "Source:"/"Note:" line.
            if start_marker_pattern.match(line):
                in_removal_section = True
            if in_removal_section and end_marker_pattern.match(line):
                in_removal_section = False
                continue

            # Footnotes: skip the footnote line itself, and keep skipping only
            # if the footnote has not finished its sentence yet. (Previously a
            # footnote that ended on its own line still swallowed the lines
            # that followed it.)
            if footnote_pattern.match(line):
                skipping_footnote = not is_end_of_sentence(line)
                continue
            if skipping_footnote:
                if is_end_of_sentence(line):
                    skipping_footnote = False
                continue

            if (
                in_removal_section
                or header_pattern.match(line)
                or footer_pattern.match(line)
            ):
                continue

            stripped = line.strip()
            if not stripped:
                continue

            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
                # Previous line stopped mid-sentence: this is its continuation.
                paragraph_lines[-1] += ' ' + stripped
            else:
                paragraph_lines.append(stripped)

        return "\n".join(paragraph_lines)

    finished_chunks = []   # page texts that ended on a complete sentence
    pending = ""           # text still waiting for a possible continuation

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned = clean_text(page_text)
            if not cleaned:
                # Nothing survived filtering; do not pad `pending` with spaces.
                continue

            if not pending:
                pending = cleaned
            elif is_end_of_sentence(pending):
                finished_chunks.append(pending)
                pending = cleaned
            else:
                # Sentence runs across the page break: join the pages.
                pending += " " + cleaned

    if pending:
        finished_chunks.append(pending)

    return "\n".join(finished_chunks).strip()
# Cleaning: cut unnecessary information such as annex and intro
def find_text_range(text, start_keyword, end_keywords):
    """Locate the span between a start keyword and the nearest end keyword.

    Matching is case-insensitive (both sides are lower-cased).

    Args:
        text: The text to search.
        start_keyword: Keyword marking the start of the span; its first
            occurrence is used.
        end_keywords: Iterable of keywords, any of which may end the span.
            Empty keywords are ignored.

    Returns:
        ``(start_index, end_index)`` suitable for ``text[start_index:end_index]``.
        ``start_index`` is where the start keyword begins. ``end_index`` is the
        position of the earliest end keyword found *after* the start keyword,
        or ``len(text)`` if none is found.

    Raises:
        ValueError: If ``start_keyword`` does not occur in ``text``.
    """
    lowered_text = text.lower()
    lowered_start = start_keyword.lower()

    start_index = lowered_text.find(lowered_start)
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    # Only look for end keywords past the start keyword. Searching from the
    # beginning of the text could return an index before `start_index`, which
    # yields an empty slice.
    search_from = start_index + len(lowered_start)

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        if not end_keyword:
            # An empty needle matches everywhere and would collapse the range.
            continue
        keyword_index = lowered_text.find(end_keyword.lower(), search_from)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with surrounding whitespace removed.

    Args:
        text: The full text.
        start_index: Index of the first character to keep.
        end_index: Index one past the last character to keep.

    Returns:
        The selected slice, stripped of leading and trailing whitespace.
    """
    selected = text[start_index:end_index]
    return selected.strip()
# Split the text into a list of paragraphs
def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs, drop short ones, and merge lone sentences.

    Steps:
      1. Split ``extracted_text`` on runs of newlines.
      2. Discard paragraphs whose stripped length is not greater than
         ``min_length``.
      3. Walk the remaining paragraphs in order; a paragraph consisting of a
         single sentence is joined (with one space) to the paragraph that
         follows it, and that following paragraph is consumed.

    Args:
        extracted_text: Text with paragraphs separated by newlines.
        min_length: Paragraphs must be strictly longer than this many
            characters (after stripping) to be kept.

    Returns:
        List of paragraph strings, each stripped of surrounding whitespace.
    """
    # A sentence boundary is whitespace preceded by '.', '!' or '?'.
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')

    def count_sentences(text):
        """Count the sentences in ``text`` using ``sentence_boundary``."""
        return len(sentence_boundary.split(text.strip()))

    paragraphs = re.split(r'\n+', extracted_text.strip())

    # Filter out paragraphs that are too short.
    candidates = [p for p in paragraphs if len(p.strip()) > min_length]

    merged_paragraphs = []
    i = 0
    while i < len(candidates):
        para = candidates[i].strip()
        if not para:
            i += 1
            continue

        has_next = i + 1 < len(candidates)
        next_para = candidates[i + 1].strip() if has_next else ""

        if next_para and count_sentences(para) == 1:
            # Single-sentence paragraph: attach it to the one that follows.
            merged_paragraphs.append(para + ' ' + next_para)
            i += 2  # the next paragraph has been consumed
        else:
            # Multi-sentence paragraph, last paragraph, or empty successor.
            merged_paragraphs.append(para)
            i += 1

    return merged_paragraphs