Spaces:
Sleeping
Sleeping
Create preprocessor.py
Browse files- preprocessor.py +51 -0
preprocessor.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import pandas as pd
|
3 |
+
def prepos(data):
    """Turn raw chat-export text into a per-message DataFrame.

    Each entry in ``data`` is expected to start with a header of the form
    ``M/D/YY, H:MM AM - `` (or ``PM``; the year may have 2 or 4 digits).
    The text after a header, up to the next header, is one entry.

    Args:
        data: The full chat export as a single string.

    Returns:
        A pandas DataFrame with one row per entry and the columns
        ``dates``, ``users``, ``message``, ``only_date``, ``year``,
        ``month_num``, ``month``, ``'day '`` (trailing space, see below),
        ``day_name``, ``hour``, ``minute`` and ``period``.

    Raises:
        ValueError: If a matched header cannot be parsed as a date with
            either the 2-digit-year or the 4-digit-year format.
    """
    # Header that precedes every entry, e.g. "12/31/21, 9:05 PM - ".
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{1,2}\s(?:AM|PM)\s-\s'

    # split() yields the text *between* headers; element 0 is whatever came
    # before the first header, so it is dropped to stay aligned with `dates`.
    messages = re.split(pattern, data)[1:]

    # `\s` in the pattern also matches non-ASCII whitespace; normalise it to a
    # plain space so the literal spaces in the datetime format below match.
    dates = [re.sub(r'\s', ' ', header) for header in re.findall(pattern, data)]

    df = pd.DataFrame({'user_message': messages, 'dates': dates})

    # The header regex allows 2-4 digit years, so fall back to %Y when the
    # 2-digit-year format does not fit.
    try:
        df['dates'] = pd.to_datetime(df['dates'], format='%m/%d/%y, %I:%M %p - ')
    except ValueError:
        df['dates'] = pd.to_datetime(df['dates'], format='%m/%d/%Y, %I:%M %p - ')

    # Split "sender: text" into sender and text. maxsplit=1 is essential:
    # without it a message body that itself contains ": " is split again and
    # the stored message ends up empty or truncated.
    sender_re = re.compile(r'([\w\W]+?):\s')
    users = []
    message = []
    for entry in df['user_message']:
        parts = sender_re.split(entry, maxsplit=1)
        if parts[1:]:
            users.append(parts[1])
            message.append(parts[2])
        else:
            # No "sender: " prefix at all -> treated as a system notification.
            users.append('group_notification')
            message.append(parts[0])

    df['users'] = users
    df['message'] = message
    df.drop(columns=['user_message'], inplace=True)

    # Calendar breakdown of the timestamp.
    df['only_date'] = df['dates'].dt.date
    df['year'] = df['dates'].dt.year
    df['month_num'] = df['dates'].dt.month
    df['month'] = df['dates'].dt.month_name()
    # NOTE: the key really is 'day ' with a trailing space; it is kept as-is
    # because code outside this function may index the column by that name.
    df['day '] = df['dates'].dt.day
    df['day_name'] = df['dates'].dt.day_name()
    df['hour'] = df['dates'].dt.hour
    df['minute'] = df['dates'].dt.minute

    # Hour bucket label "<hour>-<next hour>"; the 23 and 0 cases keep their
    # historical spellings ("23-00" and "00-1").
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append(str(hour) + '-' + '00')
        elif hour == 0:
            period.append('00' + '-' + str(hour + 1))
        else:
            period.append(str(hour) + '-' + str(hour + 1))

    df['period'] = period
    return df
|