WebAPI / pattern_functions.py
mangoman7002's picture
Upload 6 files
ad06298 verified
from bs4 import BeautifulSoup as bs
import re
def extract_1(content):
    """Return the visible text of an HTML document, keeping only wordy lines.

    The HTML is read from ``content.text`` and parsed with the built-in
    ``html.parser`` backend.  For every ``<body>`` element found, non-textual
    tags (scripts, styles, embeds, ...) are removed, the remaining text is
    extracted and its whitespace is normalized.  The texts of all bodies are
    concatenated, and finally only lines made of more than three
    space-separated tokens are kept.

    Args:
        content: Any object exposing the HTML markup as a ``text`` attribute
            (presumably a ``requests.Response`` -- verify against callers).

    Returns:
        str: The kept lines, each stripped of surrounding whitespace and
        joined with ``"\\n"``.  Empty string when nothing qualifies or the
        document has no ``<body>``.

    Side effects:
        Prints ``content Extracted`` to stdout once per call.
    """
    # Tags whose contents are never human-readable page text.
    toremove = ['link', 'script', 'style', 'iframe', 'object', 'noscript',
                'param', 'embed', 'meta', 'base', 'canvas', 'svg']
    content_soup = bs(content.text, 'html.parser')

    body_texts = []
    for soup_body in content_soup.find_all('body'):
        for remove_tag in toremove:
            for trash_tag in soup_body.find_all(remove_tag):
                trash_tag.decompose()

        thisbody = soup_body.get_text().replace("\t", '')
        # Drop lines that consist of a single word character.
        thisbody = re.sub(r"\n\w\n", '\n', thisbody)
        # Collapse runs of spaces to one space.  This must happen before the
        # word-count filter below: that filter splits on a single space, so
        # every extra space would otherwise be counted as an additional word.
        thisbody = re.sub(r" {2,}", ' ', thisbody)
        # Collapse runs of newlines to a single newline.
        thisbody = re.sub(r"\n{2,}", '\n', thisbody)
        body_texts.append(thisbody)

    finalcontent = ''.join(body_texts)
    print('content Extracted')

    # Keep only lines with more than three space-separated tokens; shorter
    # lines are discarded.
    kept_lines = []
    for raw_line in finalcontent.split("\n"):
        line = raw_line.strip()
        if len(line.split(" ")) > 3:
            kept_lines.append(line)
    return "\n".join(kept_lines)