Spaces:
Sleeping
Sleeping
hsuvaskakoty
committed on
Commit
•
8dbd54d
1
Parent(s):
997bc87
Upload data_prep.py
Browse files
- data_prep.py +7 -11
data_prep.py
CHANGED
@@ -82,11 +82,7 @@ def extract_div_contents_from_url(url):
|
|
82 |
return df
|
83 |
|
84 |
|
85 |
-
|
86 |
-
import pandas as pd
|
87 |
-
from bs4 import BeautifulSoup
|
88 |
-
|
89 |
-
def extract_div_contents_from_url_new(url, date):
|
90 |
response = requests.get(url)
|
91 |
if response.status_code != 200:
|
92 |
print(f"Error: Received status code {response.status_code} for URL: {url}")
|
@@ -154,15 +150,15 @@ def extract_div_contents_from_url_new(url, date):
|
|
154 |
if confirmation_b_tag:
|
155 |
confirmation = confirmation_b_tag.text.strip()
|
156 |
parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
|
157 |
-
discussion = parts[
|
158 |
verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
|
159 |
|
160 |
-
data.append([
|
161 |
except Exception as e:
|
162 |
print(f"Error processing div: {e}")
|
163 |
continue
|
164 |
|
165 |
-
df = pd.DataFrame(data, columns=[
|
166 |
return df
|
167 |
|
168 |
def extract_post_links_text(discussion_html):
|
@@ -206,13 +202,13 @@ def process_split_text_into_sentences(df):
|
|
206 |
|
207 |
def process_data(url):
|
208 |
df = extract_div_contents_from_url(url)
|
209 |
-
|
210 |
-
if df.empty:
|
211 |
df = extract_div_contents_from_url_new(url)
|
|
|
212 |
df = process_discussion(df)
|
|
|
213 |
df = process_html_to_plaintext(df)
|
214 |
df = process_split_text_into_sentences(df)
|
215 |
-
#if not empty
|
216 |
if not df.empty:
|
217 |
return df.at[0,'title']+ ' : '+df.at[0, 'discussion_cleaned']
|
218 |
else:
|
|
|
82 |
return df
|
83 |
|
84 |
|
85 |
+
def extract_div_contents_from_url_new(url):
|
|
|
|
|
|
|
|
|
86 |
response = requests.get(url)
|
87 |
if response.status_code != 200:
|
88 |
print(f"Error: Received status code {response.status_code} for URL: {url}")
|
|
|
150 |
if confirmation_b_tag:
|
151 |
confirmation = confirmation_b_tag.text.strip()
|
152 |
parts = deletion_discussion.split('<div class="mw-heading mw-heading3">')
|
153 |
+
discussion = parts[0] if len(parts) > 0 else ''
|
154 |
verdict = '<div class="mw-heading mw-heading3">' + parts[1] if len(parts) > 1 else ''
|
155 |
|
156 |
+
data.append([ title, text_url, deletion_discussion, label, confirmation, verdict, discussion])
|
157 |
except Exception as e:
|
158 |
print(f"Error processing div: {e}")
|
159 |
continue
|
160 |
|
161 |
+
df = pd.DataFrame(data, columns=[ 'title', 'text_url', 'deletion_discussion', 'label', 'confirmation', 'discussion', 'verdict'])
|
162 |
return df
|
163 |
|
164 |
def extract_post_links_text(discussion_html):
|
|
|
202 |
|
203 |
def process_data(url):
|
204 |
df = extract_div_contents_from_url(url)
|
205 |
+
if df.at[0,'discussion'] == '':
|
|
|
206 |
df = extract_div_contents_from_url_new(url)
|
207 |
+
#print(df.head())
|
208 |
df = process_discussion(df)
|
209 |
+
print(df.at[0,'discussion'])
|
210 |
df = process_html_to_plaintext(df)
|
211 |
df = process_split_text_into_sentences(df)
|
|
|
212 |
if not df.empty:
|
213 |
return df.at[0,'title']+ ' : '+df.at[0, 'discussion_cleaned']
|
214 |
else:
|