Spaces:
Running
Running
from bs4 import BeautifulSoup as bs | |
import re | |
def _collapse_whitespace(text):
    """Normalize whitespace in text extracted from an HTML body.

    Tabs are removed, lines consisting of a single word character are
    dropped, and runs of spaces / newlines are each collapsed to one.
    """
    text = text.replace("\t", "")
    # A lone word character sandwiched between two newlines is treated as noise.
    text = re.sub(r"\n\w\n", "\n", text)
    # Collapse runs of spaces first: leftover double spaces would produce
    # empty tokens and inflate the per-line word count used by the caller.
    text = re.sub(r" {2,}", " ", text)
    return re.sub(r"\n{2,}", "\n", text)


def extract_1(content):
    """Return the readable text lines found in the ``<body>`` of an HTML page.

    Args:
        content: Object exposing the HTML source as ``content.text``
            (presumably an HTTP response object -- confirm against callers).

    Returns:
        A newline-joined string of stripped lines. Only lines that split
        into more than three space-separated tokens are kept; an empty
        string is returned when nothing qualifies or no ``<body>`` exists.

    Side effects:
        Prints ``content Extracted`` to stdout once the page is processed.
    """
    # Elements whose contents are not human-readable page text.
    toremove = (
        'link', 'script', 'style', 'iframe', 'object', 'noscript',
        'param', 'embed', 'meta', 'base', 'canvas', 'svg',
    )
    content_soup = bs(content.text, 'html.parser')

    body_texts = []
    for soup_body in content_soup.find_all('body'):
        # Remove one tag name at a time so each search runs against the
        # already-pruned tree and never returns a destroyed element.
        for remove_tag in toremove:
            for trash_tag in soup_body.find_all(remove_tag):
                trash_tag.decompose()
        body_texts.append(_collapse_whitespace(soup_body.get_text()))

    finalcontent = "".join(body_texts)
    print('content Extracted')

    stripped_lines = (line.strip() for line in finalcontent.split("\n"))
    # Keep only lines with more than three tokens; shorter ones are dropped.
    return "\n".join(
        line for line in stripped_lines if len(line.split(" ")) > 3
    )