import requests
import pandas as pd
from bs4 import BeautifulSoup
import pysbd


def extract_div_contents_from_url(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['title', 'text_url', 'deletion_discussion', 'label',
                                     'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    div_classes = ['mw-heading mw-heading3',
                   'boilerplate afd vfd xfd-closed',
                   'boilerplate afd vfd xfd-closed archived mw-archivedtalk']
    divs = []
    for div_class in div_classes:
        divs.extend(soup.find_all('div', class_=div_class))
    url_fragment = url.split('#')[-1].replace('_', ' ')
    data = []
    for div in divs:
        try:
            title = None
            text_url = None
            # Extract title and text_url
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            deletion_discussion = div.prettify()
            # Extract label (the bolded closure, e.g. "keep" or "delete")
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            # Extract confirmation
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            # Split deletion_discussion into discussion and verdict. The split
            # marker below was lost in extraction; the heading-div opener is an
            # assumption based on the classes parsed above.
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = ('<div class="mw-heading mw-heading3">' + parts[1]) if len(parts) > 1 else ''
            data.append([title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    # Column order matches the append order above.
    df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label',
                                     'confirmation', 'verdict', 'discussion'])
    df = df[['title', 'discussion', 'verdict', 'label']]
    print(f"DataFrame created with {len(df)} rows")
    return df
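
# Hedged usage sketch (not part of the original script): the AfD log URL and
# fragment below are assumed placeholders for illustration only. The fragment
# after '#' must match the nominated article's title for a row to be kept.
def _demo_extract_div_contents():
    sample_url = ('https://en.wikipedia.org/wiki/'
                  'Wikipedia:Articles_for_deletion/Log/2023_January_1#Example_Article')
    df = extract_div_contents_from_url(sample_url)
    print(df[['title', 'label']].head())
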
def extract_div_contents_from_url_new(url):
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code} for URL: {url}")
        return pd.DataFrame(columns=['title', 'text_url', 'deletion_discussion', 'label',
                                     'confirmation', 'discussion', 'verdict'])
    soup = BeautifulSoup(response.content, 'html.parser')
    divs = soup.find_all('div', class_='mw-heading mw-heading3')
    url_fragment = url.split('#')[-1].replace('_', ' ')
    log_date = url.split('/')[-1]  # log date parsed from the URL; currently unused
    data = []
    for div in divs:
        try:
            title = None
            text_url = None
            title_tag = div.find('a')
            if title_tag:
                title_span = div.find('span', {'data-mw-comment-start': True})
                if title_span:
                    title_anchor = title_span.find_next_sibling('a')
                    if title_anchor:
                        title = title_anchor.text
                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
                else:
                    title = title_tag.text
                    text_url = 'https://en.wikipedia.org' + title_tag['href']
            if title == 'talk page' or title is None:
                heading_tag = div.find('div', class_='mw-heading mw-heading3')
                if heading_tag:
                    title_tag = heading_tag.find('a')
                    if title_tag:
                        title = title_tag.text
                        text_url = 'https://en.wikipedia.org' + title_tag['href']
            if not title:
                continue
            if title.lower() != url_fragment.lower():
                continue
            # Collect everything between this heading and the next heading as
            # the discussion body.
            next_div = div.find_next('div', class_='mw-heading mw-heading3')
            deletion_discussion = ''
            sibling = div.find_next_sibling()
            while sibling and sibling != next_div:
                deletion_discussion += str(sibling)
                sibling = sibling.find_next_sibling()
            label = ''
            verdict_tag = div.find('p')
            if verdict_tag:
                label_b_tag = verdict_tag.find('b')
                if label_b_tag:
                    label = label_b_tag.text.strip()
            confirmation = ''
            discussion_tag = div.find('dd')
            if discussion_tag:
                discussion_tag_i = discussion_tag.find('i')
                if discussion_tag_i:
                    confirmation_b_tag = discussion_tag_i.find('b')
                    if confirmation_b_tag:
                        confirmation = confirmation_b_tag.text.strip()
            # Same reconstructed split marker as in extract_div_contents_from_url.
            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
            discussion = parts[0] if len(parts) > 0 else ''
            verdict = ('<div class="mw-heading mw-heading3">' + parts[1]) if len(parts) > 1 else ''
            data.append([title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
        except Exception as e:
            print(f"Error processing div: {e}")
            continue
    # Columns are listed in append order; the original listed 'discussion' and
    # 'verdict' swapped, silently mislabeling the two columns.
    df = pd.DataFrame(data, columns=['title', 'text_url', 'deletion_discussion', 'label',
                                     'confirmation', 'verdict', 'discussion'])
    return df
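
# Hedged sketch (added, not from the original): the fallback parser above
# targets the newer heading-div page layout. The URL is an assumed placeholder.
def _demo_extract_new_layout():
    sample_url = ('https://en.wikipedia.org/wiki/'
                  'Wikipedia:Articles_for_deletion/Log/2024_June_1#Example_Article')
    df = extract_div_contents_from_url_new(sample_url)
    print(f"{len(df)} matching discussion(s) found")
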
def extract_post_links_text(discussion_html):
    # Keep only the text after the page-action links. The marker string was
    # lost in extraction; the plainlinks span is an assumption based on how
    # AfD entries wrap their (edit | talk | history | ...) links.
    split_point = '<span class="plainlinks">'
    if split_point in discussion_html:
        parts = discussion_html.split(split_point)
        if len(parts) > 1:
            return parts[1]
    return discussion_html


def process_discussion(df):
    df['discussion_cleaned'] = df['verdict'].apply(extract_post_links_text)
    return df


def html_to_plaintext(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    # Surround block-level tags with newlines so paragraph breaks survive.
    for tag in soup.find_all(['p', 'li', 'dd', 'dl']):
        tag.insert_before('\n')
        tag.insert_after('\n')
    for br in soup.find_all('br'):
        br.replace_with('\n')
    # Note: strip=True would discard the whitespace-only newline strings
    # inserted above, so strings are joined unstripped and cleaned per line.
    text = soup.get_text(separator=' ')
    text = '\n'.join(line.strip() for line in text.splitlines() if line.strip())
    return text


def process_html_to_plaintext(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(html_to_plaintext)
    df = df[['title', 'discussion_cleaned', 'label']]
    return df


def split_text_into_sentences(text):
    seg = pysbd.Segmenter(language="en", clean=False)
    sentences = seg.segment(text)
    # Drop the first sentence (the nomination heading line) and rejoin the rest.
    return ' '.join(sentences[1:])


def process_split_text_into_sentences(df):
    df['discussion_cleaned'] = df['discussion_cleaned'].apply(split_text_into_sentences)
    return df


def process_data(url):
    df = extract_div_contents_from_url(url)
    # Fall back to the newer page layout when the first parser finds nothing;
    # the empty-DataFrame guard prevents a KeyError on df.at[0, ...].
    if df.empty or df.at[0, 'discussion'] == '':
        df = extract_div_contents_from_url_new(url)
    if df.empty:
        return 'Empty DataFrame'
    df = process_discussion(df)
    df = process_html_to_plaintext(df)
    df = process_split_text_into_sentences(df)
    return df.at[0, 'title'] + ' : ' + df.at[0, 'discussion_cleaned']
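
# Hedged end-to-end example (added): process_data chains extraction, link
# stripping, HTML-to-text conversion, and sentence splitting into a single
# "title : discussion" string. The URL is an assumed placeholder, not one
# taken from the original code.
if __name__ == '__main__':
    url = ('https://en.wikipedia.org/wiki/'
           'Wikipedia:Articles_for_deletion/Log/2023_January_1#Example_Article')
    print(process_data(url))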