# Spaces:
# Sleeping
# Sleeping
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from transformers import pipeline
def tag_visible(element):
    """Tell whether a parsed text node is text a reader would see on the page.

    Args:
        element: A text node taken from a BeautifulSoup tree; it must expose
            ``parent.name``.

    Returns:
        False when the node's parent tag is one of style, script, head,
        title, meta or the ``[document]`` root, or when the node itself is
        an HTML comment; True otherwise.
    """
    # Parent check comes first: text inside these tags is never rendered.
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
def getTalkPage(wiki_page, timeout=10):
    """Return the visible text of the talk page linked from a Wikipedia article.

    Args:
        wiki_page: URL of the article. URLs that do not contain
            "wikipedia.org" are ignored.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The talk page's visible text fragments, each stripped and joined
        with single spaces, or None when the URL is not a Wikipedia URL,
        the article has no ``rel="discussion"`` link, or fetching/parsing
        the talk page fails (the error is printed in that case).

    Raises:
        Whatever ``requests.get`` raises for the article request itself;
        only the talk-page request is handled best-effort.
    """
    if "wikipedia.org" not in wiki_page:
        return None

    response = requests.get(wiki_page, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    talk_links = soup.find_all("a", {"rel": "discussion"})
    if not talk_links:
        return None

    # Resolve the link against the article URL instead of hard-coding the
    # English host, so articles on other language editions reach their own
    # talk page. Relative and absolute hrefs are both handled by urljoin.
    talk_url = urljoin(wiki_page, talk_links[0]["href"])
    try:
        talk_response = requests.get(talk_url, timeout=timeout)
        talk_soup = BeautifulSoup(talk_response.content, 'html.parser')
        talk_texts = talk_soup.find_all(string=True)
        visible_texts = filter(tag_visible, talk_texts)
        return u" ".join(t.strip() for t in visible_texts)
    except Exception as error:
        # Best-effort: report the failure and let the caller see "no text".
        print('Error occured: {}'.format(error))
        return None
# Shared tone classifier used by tone_talkpage. It is built once at import
# time so the model is not reloaded on every call. top_k=None asks the
# pipeline for a score per label rather than only the best one, which
# tone_talkpage relies on when it accumulates every label's score.
classifier = pipeline(model="amitkayal/bert-finetuned-sem_eval-english", top_k=None)
def tone_talkpage(url):
    """Estimate the dominant tones of a Wikipedia article's talk page.

    The talk page text is fetched with ``getTalkPage``, split into chunks of
    150 whitespace-separated words, each chunk is scored by the module-level
    ``classifier``, and the per-label scores are averaged over the chunks.

    Args:
        url: URL of the Wikipedia article whose talk page is analysed.

    Returns:
        A list of the three ``(label, average_score)`` tuples with the
        highest averages, best first. When no talk page text is available
        every score stays 0 and the first three labels are returned.
    """
    talk_content = getTalkPage(url)
    tone_labels = {'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 0, 'joy': 0, 'love': 0, 'optimism': 0, 'pessimism': 0, 'sadness': 0, 'surprise': 0, 'trust': 0}

    words = talk_content.split() if talk_content else []
    # 150 words per chunk is the original author's heuristic for staying
    # under the model's 512-token sequence limit.
    n = 150
    # Step over the word list, not the raw string: stepping over the
    # character count produced many empty trailing chunks that were
    # classified too and skewed the average.
    chunks = [' '.join(words[i:i + n]) for i in range(0, len(words), n)]

    for chunk in chunks:
        # truncation=True clips the rare chunk whose words tokenise to more
        # than the model accepts, instead of letting the model raise.
        for tone_res in classifier(chunk, truncation=True)[0]:
            tone_labels[tone_res["label"]] += tone_res["score"]

    # Whitespace-only text yields no chunks; skip averaging to avoid
    # dividing by zero and leave every score at 0.
    if chunks:
        chunk_count = len(chunks)
        tone_labels = {label: total / chunk_count for label, total in tone_labels.items()}

    sorted_tones = sorted(tone_labels.items(), key=lambda x: x[1], reverse=True)
    return sorted_tones[:3]