hsuvaskakoty committed on
Commit 997bc87
1 Parent(s): 13f6685

Upload data_prep.py

Files changed (1)
  1. data_prep.py +86 -0
data_prep.py CHANGED
@@ -82,6 +82,89 @@ def extract_div_contents_from_url(url):
    return df


+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+
+def extract_div_contents_from_url_new(url, date):
+    response = requests.get(url)
+    if response.status_code != 200:
+        print(f"Error: Received status code {response.status_code} for URL: {url}")
+        return pd.DataFrame(columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
+
+    soup = BeautifulSoup(response.content, 'html.parser')
+    div_classes = ["mw-heading mw-heading3"]
+    divs = []
+
+    for div_class in div_classes:
+        divs.extend(soup.find_all('div', class_=div_class))
+
+    url_fragment = url.split('#')[-1].replace('_', ' ')
+    log_date = url.split('/')[-1]
+
+    data = []
+    for i, div in enumerate(divs):
+        try:
+            title = None
+            text_url = None
+            title_tag = div.find('a')
+            if title_tag:
+                title_span = div.find('span', {'data-mw-comment-start': True})
+                if title_span:
+                    title_anchor = title_span.find_next_sibling('a')
+                    if title_anchor:
+                        title = title_anchor.text
+                        text_url = 'https://en.wikipedia.org' + title_anchor['href']
+                else:
+                    title = title_tag.text
+                    text_url = 'https://en.wikipedia.org' + title_tag['href']
+
+            if title == 'talk page' or title is None:
+                heading_tag = div.find('div', class_='mw-heading mw-heading3')
+                if heading_tag:
+                    title_tag = heading_tag.find('a')
+                    if title_tag:
+                        title = title_tag.text
+                        text_url = 'https://en.wikipedia.org' + title_tag['href']
+
+            if not title:
+                continue
+            if title.lower() != url_fragment.lower():
+                continue
+
+            next_div = div.find_next('div', class_='mw-heading mw-heading3')
+            deletion_discussion = ''
+            sibling = div.find_next_sibling()
+            while sibling and sibling != next_div:
+                deletion_discussion += str(sibling)
+                sibling = sibling.find_next_sibling()
+
+            label = ''
+            verdict_tag = div.find('p')
+            if verdict_tag:
+                label_b_tag = verdict_tag.find('b')
+                if label_b_tag:
+                    label = label_b_tag.text.strip()
+            confirmation = ''
+            discussion_tag = div.find('dd')
+            if discussion_tag:
+                discussion_tag_i = discussion_tag.find('i')
+                if discussion_tag_i:
+                    confirmation_b_tag = discussion_tag_i.find('b')
+                    if confirmation_b_tag:
+                        confirmation = confirmation_b_tag.text.strip()
+            parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
+            discussion = parts[-1] if len(parts) > 0 else ''
+            verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
+
+            data.append([date, title, text_url, deletion_discussion, label, confirmation, discussion, verdict])  # order matches the DataFrame columns below
+        except Exception as e:
+            print(f"Error processing div: {e}")
+            continue
+
+    df = pd.DataFrame(data, columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
+    return df
+
def extract_post_links_text(discussion_html):
    split_point = '<span class="plainlinks">'
    if split_point in discussion_html:
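For reference, a minimal sketch of how the new extractor might be invoked on its own. The log URL is hypothetical, inferred from the en.wikipedia.org hrefs the function builds and its '#fragment' title matching; the date argument mirrors the log_date the function itself parses from the URL:

    # Hypothetical AfD daily-log URL; the #fragment names the nominated article
    log_url = 'https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/Log/2024_June_1#Example_Article'
    log_date = log_url.split('/')[-1].split('#')[0]  # e.g. '2024_June_1'

    df = extract_div_contents_from_url_new(log_url, log_date)
    print(df[['title', 'label', 'confirmation']].head())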
@@ -123,6 +206,9 @@ def process_split_text_into_sentences(df):

def process_data(url):
    df = extract_div_contents_from_url(url)
+    # Fall back to the new extractor when the first pass returns an empty frame
+    if df.empty:
+        df = extract_div_contents_from_url_new(url, url.split('/')[-1])  # date taken from the log URL, mirroring log_date
    df = process_discussion(df)
    df = process_html_to_plaintext(df)
    df = process_split_text_into_sentences(df)
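And a short end-to-end sketch of the updated pipeline, assuming the same hypothetical log URL; process_data now tries the original extractor first and falls back to extract_div_contents_from_url_new whenever it comes back empty:

    url = 'https://en.wikipedia.org/wiki/Wikipedia:Articles_for_deletion/Log/2024_June_1#Example_Article'
    df = process_data(url)  # old extractor first, new one on an empty result
    print(df.shape)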