File size: 8,387 Bytes
3e45198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import pdfplumber
import re

# Extract text as newline-delimited paragraphs, without tables and graphs
def extract_and_format_paragraphs(pdf_path):
    """Extract the text of a PDF as newline-separated paragraphs.

    Every page is read with ``pdfplumber``. Within a page, lines are filtered
    and then stitched together: a kept line is appended (after a space) to the
    previous kept line unless that previous line already ends a sentence.
    Pages are stitched the same way: when the text accumulated so far does not
    end a sentence, the next page's cleaned text is appended to it.

    A line is dropped when it

    * starts with one of the header or footer markers (case-insensitive,
      matched at the very start of the unstripped line);
    * lies in a chart section, i.e. from a line starting with "Chart" up to
      and including the next line starting with "Source:" or "Note:" (or to
      the end of the page when no such line follows; the state is reset for
      every page);
    * belongs to a footnote: a line starting with digits, a space, a single
      digit and a space, plus every following line up to and including the
      next line that ends a sentence;
    * is blank.

    A line "ends a sentence" when, after removing the period that follows a
    known abbreviation (e.g, i.e, a.m, p.m, U.S, J.R.R, Dr, Mr, Ms, Mrs, Jr,
    Sr), it ends with ".", "!" or "?".

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The cleaned text as a single string with leading and trailing
        whitespace stripped.
    """
    # All line patterns are applied with ``match`` (start of line) and ignore case.
    header_pattern = re.compile(r"^(ECB-PUBLIC|Title|Document|Header)", re.IGNORECASE)
    footer_pattern = re.compile(
        r"^(Page \d+ of \d+|Footer|Document|Note:|Source:|the 75th and 25th percentiles|\|\d+)",
        re.IGNORECASE,
    )
    footnote_pattern = re.compile(r"^\d+ \d{1} ", re.IGNORECASE)
    start_marker_pattern = re.compile(r"^Chart", re.IGNORECASE)
    end_marker_pattern = re.compile(r"^(Source:|Note:)", re.IGNORECASE)

    # Raw string on purpose: "\." inside a non-raw (f-)string is an invalid
    # escape sequence. The abbreviation list is case-sensitive.
    abbreviation_period_re = re.compile(
        r'(\b(?:e\.g|i\.e|a\.m|p\.m|U\.S|J\.R\.R|Dr|Mr|Ms|Mrs|Jr|Sr)\b)\.'
    )
    sentence_end_re = re.compile(r'[\.\!\?]\s*$')

    def remove_abbreviation_periods(text):
        """Drop the period that directly follows a known abbreviation, anywhere in ``text``."""
        return abbreviation_period_re.sub(r'\1', text)

    def is_end_of_sentence(text):
        """Return True when ``text`` ends with '.', '!' or '?' that is not an abbreviation period."""
        text = remove_abbreviation_periods(text.strip())
        return bool(sentence_end_re.search(text))

    def clean_text(text):
        """Filter one page's lines and join continuation lines into paragraphs."""
        paragraph_lines = []
        in_removal_section = False
        skip_line = False

        for line in text.split('\n'):
            # Chart sections: the marker checks run before every other filter,
            # so they are evaluated even while a footnote is being skipped.
            if start_marker_pattern.match(line):
                in_removal_section = True
            if in_removal_section and end_marker_pattern.match(line):
                in_removal_section = False
                continue

            # Footnotes: drop the opening line and keep dropping afterwards.
            # NOTE(review): skipping continues even when the opening line
            # itself already ends a sentence -- confirm this is intended.
            if footnote_pattern.match(line):
                skip_line = True
                continue

            if skip_line:
                # The line that finishes the footnote sentence is dropped too.
                if is_end_of_sentence(line):
                    skip_line = False
                continue

            if (
                in_removal_section
                or header_pattern.match(line)
                or footer_pattern.match(line)
            ):
                continue

            stripped = line.strip()
            if not stripped:
                continue

            if paragraph_lines and not is_end_of_sentence(paragraph_lines[-1]):
                # Previous kept line is an unfinished sentence: continue it.
                paragraph_lines[-1] += ' ' + stripped
            else:
                paragraph_lines.append(stripped)

        return "\n".join(paragraph_lines)

    finished_chunks = []  # page texts that are complete and ready for output
    pending_text = ""     # text that the next page may still continue

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue

            cleaned_text = clean_text(page_text)
            if pending_text and not is_end_of_sentence(pending_text):
                # The sentence runs across the page break: keep accumulating.
                pending_text += " " + cleaned_text
            else:
                if pending_text:
                    finished_chunks.append(pending_text)
                pending_text = cleaned_text

    # Flush whatever is left from the last page(s).
    if pending_text:
        finished_chunks.append(pending_text)

    return "\n".join(finished_chunks).strip()

# Cleaning: cut unnecessary information such as annex and intro
def find_text_range(text, start_keyword, end_keywords):
    """Locate the span from a start keyword to the nearest following end keyword.

    Matching is case-insensitive: the text and every keyword are compared in
    their ``str.lower()`` form.

    Args:
        text: Text to search.
        start_keyword: Keyword whose first occurrence marks the start of the
            range.
        end_keywords: Iterable of keywords. The earliest occurrence of any of
            them *after* the start keyword marks the end of the range.
            Occurrences before the start keyword (for example in a table of
            contents) and empty keywords are ignored.

    Returns:
        A ``(start_index, end_index)`` tuple suitable for slicing ``text``.
        ``end_index`` is ``len(text)`` when no end keyword follows the start
        keyword.

    Raises:
        ValueError: If ``start_keyword`` does not occur in ``text``.
    """
    lowered_text = text.lower()
    lowered_start = start_keyword.lower()

    start_index = lowered_text.find(lowered_start)
    if start_index == -1:
        raise ValueError(f"Start keyword '{start_keyword}' not found in the text.")

    # Only look for end keywords past the start keyword; otherwise an earlier
    # mention would yield end_index < start_index and an empty range.
    search_from = start_index + len(lowered_start)

    end_index = len(text)  # Default to end of text
    for end_keyword in end_keywords:
        if not end_keyword:
            # An empty keyword matches everywhere and would collapse the range.
            continue
        keyword_index = lowered_text.find(end_keyword.lower(), search_from)
        if keyword_index != -1 and keyword_index < end_index:
            end_index = keyword_index

    return start_index, end_index

def extract_relevant_text(text, start_index, end_index):
    """Return ``text[start_index:end_index]`` with surrounding whitespace removed."""
    selected = text[start_index:end_index]
    return selected.strip()
# Split the extracted text into a list of paragraphs
def split_text_into_paragraphs(extracted_text, min_length):
    """Break text into paragraphs and fold single-sentence ones into their successor.

    The text is split on runs of newlines. Pieces whose stripped length is not
    greater than ``min_length`` are discarded. Each remaining paragraph that
    consists of exactly one sentence is then joined (with a space) to the
    paragraph after it, and that following paragraph is consumed by the merge.

    Args:
        extracted_text: Text in which paragraphs are separated by newlines.
        min_length: A paragraph is kept only if its stripped length exceeds
            this value.

    Returns:
        List of stripped paragraph strings after filtering and merging.
    """
    # A sentence boundary is whitespace preceded by '.', '!' or '?'.
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')

    def has_single_sentence(chunk):
        """Tell whether ``chunk`` splits into exactly one sentence."""
        return len(sentence_boundary.split(chunk.strip())) == 1

    candidates = [
        piece
        for piece in re.split(r'\n+', extracted_text.strip())
        if len(piece.strip()) > min_length
    ]

    merged = []
    total = len(candidates)
    position = 0
    while position < total:
        current = candidates[position].strip()
        position += 1
        if not current:
            continue

        # Stripped successor, or "" when there is none.
        follower = candidates[position].strip() if position < total else ""
        if follower and has_single_sentence(current):
            merged.append(current + ' ' + follower)
            position += 1  # the successor was absorbed by the merge
        else:
            merged.append(current)

    return merged