Spaces:
Running
Running
File size: 8,387 Bytes
3e45198 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
import pdfplumber
import re
# Extract the text as newline-delimited paragraphs, without tables and graphs
def extract_and_format_paragraphs(pdf_path):
    """Extract the running text of a PDF as newline-separated paragraphs.

    Every page is read with ``pdfplumber``; its text is cleaned line by line
    (header/footer lines, footnote lines and chart sections are dropped) and
    wrapped lines are re-joined into paragraphs.  Text that does not end in
    sentence punctuation at a page break is continued with the next page.

    Args:
        pdf_path: Path (or whatever ``pdfplumber.open`` accepts) of the PDF.

    Returns:
        The cleaned text with one paragraph per line, stripped of leading and
        trailing whitespace.  Pages for which ``extract_text()`` returns
        nothing are ignored.
    """
    # Lines starting with one of these markers are dropped entirely.
    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
    footer_pattern = re.compile(
        r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)",
        re.IGNORECASE,
    )
    # Matches "<digits> <single digit> " at the start of a line.
    # NOTE(review): the original comment described this as "a number followed
    # by a space"; the pattern actually requires two numbers - confirm intent.
    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
    # A line starting with "Chart" opens a section that is removed up to and
    # including the next line starting with "Source:" or "Note:".
    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)

    # Abbreviations whose trailing period must not count as a sentence end.
    # Raw string: the original built this with a non-raw f-string containing
    # "\.", which is an invalid escape sequence (SyntaxWarning on 3.12+).
    abbreviation_period_re = re.compile(
        r'(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\.'
    )
    # Compiled once here instead of on every is_end_of_sentence() call.
    sentence_end_re = re.compile(r'[\.\!\?]\s*$')

    def remove_abbreviation_periods(text):
        """Drop the period that directly follows a known abbreviation."""
        return abbreviation_period_re.sub(r'\1', text)

    def is_end_of_sentence(text):
        """Return True if *text* ends with '.', '!' or '?' that is not an abbreviation period."""
        text = remove_abbreviation_periods(text.strip())
        return bool(sentence_end_re.search(text))

    def clean_text(text):
        """Filter one page of text and re-join wrapped lines into paragraphs."""
        paragraph_lines = []
        in_removal_section = False  # inside a "Chart ... Source:/Note:" block
        skip_line = False           # inside a multi-line footnote

        def append_line_to_paragraph(line):
            """Continue the last paragraph unless it already ended a sentence."""
            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
                paragraph_lines[-1] += ' ' + line.strip()
            else:
                paragraph_lines.append(line.strip())

        for line in text.split('\n'):
            # Chart sections: opened by "Chart", closed by "Source:"/"Note:".
            if start_marker_pattern.match(line):
                in_removal_section = True
            if in_removal_section and end_marker_pattern.match(line):
                in_removal_section = False
                continue

            # Footnotes: drop the matching line and keep skipping until a
            # later line ends a sentence.
            # NOTE(review): skipping continues even when the footnote's first
            # line already ends a sentence - confirm this is intended.
            if footnote_pattern.match(line):
                skip_line = True
                continue
            if skip_line:
                if is_end_of_sentence(line):
                    skip_line = False
                continue

            if in_removal_section:
                continue
            if header_pattern.match(line) or footer_pattern.match(line):
                continue
            if line.strip():
                append_line_to_paragraph(line)

        return "\n".join(paragraph_lines)

    finished_chunks = []       # completed text, each chunk ends with "\n"
    previous_page_text = ""    # text still open for continuation

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            cleaned_text = clean_text(page_text)

            if not previous_page_text:
                previous_page_text = cleaned_text
            elif not is_end_of_sentence(previous_page_text):
                # The sentence runs over the page break: glue the pages together.
                previous_page_text += " " + cleaned_text
            else:
                finished_chunks.append(previous_page_text + "\n")
                previous_page_text = cleaned_text

    # Flush whatever is left from the last page(s).
    if previous_page_text:
        finished_chunks.append(previous_page_text)

    return "".join(finished_chunks).strip()
# Cleaning: cut unnecessary information such as the annex and the intro
def find_text_range(text, start_keyword, end_keywords):
    """Find the index range between a start keyword and the nearest end keyword.

    Matching is case-insensitive (text and keywords are lower-cased before
    comparison).

    Args:
        text: The text to search.
        start_keyword: Keyword whose first occurrence marks the start.
        end_keywords: Iterable of keywords; the earliest occurrence of any of
            them *after* the start keyword marks the end.  Empty keywords are
            ignored.

    Returns:
        A ``(start_index, end_index)`` tuple.  ``end_index`` is ``len(text)``
        when no end keyword occurs after the start keyword.

    Raises:
        ValueError: If ``start_keyword`` does not occur in ``text``.
    """
    haystack = text.lower()  # lower-case once instead of once per keyword
    start_needle = start_keyword.lower()

    start_index = haystack.find(start_needle)
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    # Only look for end keywords behind the start keyword; an earlier hit
    # (e.g. in a table of contents) would otherwise yield end < start.
    search_from = start_index + len(start_needle)

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        if not end_keyword:
            # An empty keyword matches everywhere and would collapse the range.
            continue
        keyword_index = haystack.find(end_keyword.lower(), search_from)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index
def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with surrounding whitespace removed."""
    segment = text[slice(start_index, end_index)]
    return segment.strip()
# Split the extracted text into a list of paragraphs
def split_text_into_paragraphs(extracted_text, min_length):
    """Split text into paragraphs, drop short ones and merge one-sentence paragraphs.

    The text is split on runs of newlines.  Paragraphs whose stripped length
    is not greater than ``min_length`` are discarded.  Each remaining
    paragraph that consists of a single sentence is then joined (with one
    space) to the paragraph that follows it; the merged result is not
    examined again, so merges do not cascade.

    Args:
        extracted_text: Text with paragraphs separated by newlines.
        min_length: Paragraphs must be strictly longer than this many
            characters (after stripping) to be kept.

    Returns:
        A list of paragraph strings, each stripped of surrounding whitespace.
    """
    # Sentence boundary: whitespace that directly follows '.', '!' or '?'.
    sentence_boundary_re = re.compile(r'(?<=[.!?])\s+')

    def count_sentences(text):
        """Count the pieces obtained by splitting *text* at sentence boundaries."""
        return len(sentence_boundary_re.split(text.strip()))

    def merge_single_sentence_paragraphs(paragraphs):
        """Join every single-sentence paragraph with its successor, if it has one."""
        merged_paragraphs = []
        total = len(paragraphs)
        i = 0
        while i < total:
            para = paragraphs[i].strip()
            if not para:
                i += 1
                continue

            if count_sentences(para) == 1 and i + 1 < total:
                next_para = paragraphs[i + 1].strip()
                if next_para:
                    merged_paragraphs.append(para + ' ' + next_para)
                    i += 2  # the successor has been consumed by the merge
                    continue

            # Multi-sentence paragraph, last paragraph, or empty successor.
            merged_paragraphs.append(para)
            i += 1
        return merged_paragraphs

    paragraphs = re.split(r'\n+', extracted_text.strip())

    # Filter out paragraphs that are too short before merging.
    filtered_paragraphs = [p for p in paragraphs if len(p.strip()) > min_length]

    return merge_single_sentence_paragraphs(filtered_paragraphs)