hsuvaskakoty committed on
Commit
8dbd54d
1 Parent(s): 997bc87

Upload data_prep.py

Browse files
Files changed (1) hide show
  1. data_prep.py +7 -11
data_prep.py CHANGED
@@ -82,11 +82,7 @@ def extract_div_contents_from_url(url):
82
  return df
83
 
84
 
85
- import requests
86
- import pandas as pd
87
- from bs4 import BeautifulSoup
88
-
89
- def extract_div_contents_from_url_new(url, date):
90
  response = requests.get(url)
91
  if response.status_code != 200:
92
  print(f"Error: Received status code {response.status_code} for URL: {url}")
@@ -154,15 +150,15 @@ def extract_div_contents_from_url_new(url, date):
154
  if confirmation_b_tag:
155
  confirmation = confirmation_b_tag.text.strip()
156
  parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
157
- discussion = parts[-1] if len(parts) > 0 else ''
158
  verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
159
 
160
- data.append([date, title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
161
  except Exception as e:
162
  print(f"Error processing div: {e}")
163
  continue
164
 
165
- df = pd.DataFrame(data, columns=['date', 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
166
  return df
167
 
168
  def extract_post_links_text(discussion_html):
@@ -206,13 +202,13 @@ def process_split_text_into_sentences(df):
206
 
207
  def process_data(url):
208
  df = extract_div_contents_from_url(url)
209
- #check if df is empty
210
- if df.empty:
211
  df = extract_div_contents_from_url_new(url)
 
212
  df = process_discussion(df)
 
213
  df = process_html_to_plaintext(df)
214
  df = process_split_text_into_sentences(df)
215
- #if not empty
216
  if not df.empty:
217
  return df.at[0,'title']+ ' : '+df.at[0, 'discussion_cleaned']
218
  else:
 
82
  return df
83
 
84
 
85
+ def extract_div_contents_from_url_new(url):
 
 
 
 
86
  response = requests.get(url)
87
  if response.status_code != 200:
88
  print(f"Error: Received status code {response.status_code} for URL: {url}")
 
150
  if confirmation_b_tag:
151
  confirmation = confirmation_b_tag.text.strip()
152
  parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
153
+ discussion = parts[0] if len(parts) > 0 else ''
154
  verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
155
 
156
+ data.append([ title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
157
  except Exception as e:
158
  print(f"Error processing div: {e}")
159
  continue
160
 
161
+ df = pd.DataFrame(data, columns=[ 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
162
  return df
163
 
164
  def extract_post_links_text(discussion_html):
 
202
 
203
  def process_data(url):
204
  df = extract_div_contents_from_url(url)
205
+ if df.at[0,'discussion'] == '':
 
206
  df = extract_div_contents_from_url_new(url)
207
+ #print(df.head())
208
  df = process_discussion(df)
209
+ print(df.at[0,'discussion'])
210
  df = process_html_to_plaintext(df)
211
  df = process_split_text_into_sentences(df)
 
212
  if not df.empty:
213
  return df.at[0,'title']+ ' : '+df.at[0, 'discussion_cleaned']
214
  else: