import re
from io import StringIO

import docx2txt
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from PyPDF2 import PdfReader


# Pattern matching the common emoji and pictograph Unicode ranges.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)
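# Quick illustration of the pattern; the sample string is ours, not from the
# original source:
#
#   >>> emoji_pattern.sub("", "Launch day 🚀🎉")
#   'Launch day '

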
def clean_text(x):
    """Strip URLs, mentions, hashtags, emoji, and stray symbols from text."""
    # Drop non-ASCII characters (this also removes most emoji up front).
    x = x.encode("ascii", "ignore").decode()
    # Remove URLs, @mentions, and #hashtags.
    x = re.sub(r"https*\S+", " ", x)
    x = re.sub(r"@\S+", " ", x)
    x = re.sub(r"#\S+", " ", x)
    # Collapse runs of whitespace left behind by the removals.
    x = re.sub(r"\s{2,}", " ", x)
    # Remove any emoji that survived the ASCII round-trip.
    x = emoji_pattern.sub("", x)
    # Keep only letters, digits, and basic punctuation.
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)
    return x
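# Example call (the input is illustrative, not from the original app):
#
#   >>> clean_text("Check this out https://example.com @user #topic 🎉!!")
#   'Check this out !!'

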
def fetch_article_text(url: str):
    """Download an article and split it into chunks of at most 500 words."""
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, "html.parser")
    # Keep only headings and paragraphs; everything else is page chrome.
    results = soup.find_all(["h1", "p"])
    text = [result.text for result in results]
    ARTICLE = " ".join(text)

    # Mark sentence boundaries so the text can be split without losing the
    # terminating punctuation.
    ARTICLE = ARTICLE.replace(".", ".<eos>")
    ARTICLE = ARTICLE.replace("!", "!<eos>")
    ARTICLE = ARTICLE.replace("?", "?<eos>")
    sentences = ARTICLE.split("<eos>")

    # Greedily pack whole sentences into chunks of at most 500 words.
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            chunks.append(sentence.split(" "))

    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return ARTICLE, chunks
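# Example call (the URL is a placeholder):
#
#   article, chunks = fetch_article_text("https://example.com/some-article")
#
# `article` is the full text with <eos> markers still in place; each entry in
# `chunks` holds at most ~500 whitespace-separated words, so it can be fed to
# a summarization model in one pass.

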
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Split `text` into chunks that each fit within the tokenizer's limit."""
    # NLTK sentence splitting (requires the "punkt" data package).
    sentences = sent_tokenize(text)

    length = 0
    chunk = ""
    chunks = []
    for sentence in sentences:
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:
            # The sentence still fits: append it to the current chunk.
            chunk += sentence + " "
            length = combined_length
        else:
            # The chunk is full: flush it and start a new one with this
            # sentence.
            if chunk:
                chunks.append(chunk.strip())
            chunk = sentence + " "
            length = len(tokenizer.tokenize(sentence))

    # Flush the final chunk, which the loop above never appends on its own.
    if chunk:
        chunks.append(chunk.strip())

    return chunks
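# Example call, assuming a Hugging Face `transformers` tokenizer (our
# assumption; any object exposing `.tokenize()` and
# `.max_len_single_sentence` will work):
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
#   chunks = preprocess_text_for_abstractive_summarization(tok, long_text)

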
def read_pdf(file):
    """Concatenate the extracted text of every page in a PDF."""
    pdf_reader = PdfReader(file)
    all_page_text = ""
    for page in pdf_reader.pages:
        all_page_text += page.extract_text()
    return all_page_text
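# Example call ("report.pdf" is a placeholder path; PdfReader accepts both
# paths and file-like objects):
#
#   text = read_pdf("report.pdf")

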
def read_text_from_file(file):
    """Read an uploaded file (plain text, PDF, or DOCX) into a string.

    `file` is expected to expose `.type` and `.getvalue()`, as e.g.
    Streamlit's UploadedFile does.
    """
    if file.type == "text/plain":
        stringio = StringIO(file.getvalue().decode("utf-8"))
        file_content = stringio.read()
    elif file.type == "application/pdf":
        file_content = read_pdf(file)
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_content = docx2txt.process(file)
    else:
        # Fail loudly instead of raising an UnboundLocalError below.
        raise ValueError(f"Unsupported file type: {file.type}")

    return file_content
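

if __name__ == "__main__":
    # Minimal smoke test for the cleaning helper; the sample string is
    # illustrative and not part of the original app.
    sample = "Breaking 🚀: full story at https://example.com @newsbot #update!!"
    print(clean_text(sample))  # -> 'Breaking full story at '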