# Source path: cs189/hw/hw3/data/featurize.py
# Uploaded by briancpark ("Upload 670 files", commit 14fe3f7)
'''
**************** PLEASE READ ***************
Script that reads in spam and ham messages and converts each training example
into a feature vector
Code intended for UC Berkeley course CS 189/289A: Machine Learning
Requirements:
-scipy ('pip install scipy')
-numpy ('pip install numpy')
To add your own features, create a function that takes in the raw text and
word frequency dictionary and outputs an int or float. Then add your feature
to the function 'generate_feature_vector'.
The output of your file will be a .mat file. The data will be accessible using
the following keys:
-'training_data'
-'training_labels'
-'test_data'
Please direct any bugs to [email protected]
'''
from collections import defaultdict
import glob
import re
import os
import scipy.io
import numpy as np
# Dataset sizes. NUM_TEST_EXAMPLES drives the enumeration of test files
# further down; NUM_TRAINING_EXAMPLES is not referenced elsewhere in this file.
NUM_TRAINING_EXAMPLES = 5172
NUM_TEST_EXAMPLES = 5857

# All data directories live next to this script.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
SPAM_DIR, HAM_DIR, TEST_DIR = (
    os.path.join(BASE_DIR, subdir) for subdir in ('spam', 'ham', 'test')
)
# ************* Features *************
# Features that look for certain words
def freq_pain_feature(text, freq):
    """Return the count stored under 'pain' in ``freq``, as a float."""
    occurrences = freq['pain']
    return float(occurrences)
def freq_private_feature(text, freq):
    """Return the count stored under 'private' in ``freq``, as a float."""
    occurrences = freq['private']
    return float(occurrences)
def freq_bank_feature(text, freq):
    """Return the count stored under 'bank' in ``freq``, as a float."""
    occurrences = freq['bank']
    return float(occurrences)
def freq_money_feature(text, freq):
    """Return the count stored under 'money' in ``freq``, as a float."""
    occurrences = freq['money']
    return float(occurrences)
def freq_drug_feature(text, freq):
    """Return the count stored under 'drug' in ``freq``, as a float."""
    occurrences = freq['drug']
    return float(occurrences)
def freq_spam_feature(text, freq):
    """Return the count stored under 'spam' in ``freq``, as a float."""
    occurrences = freq['spam']
    return float(occurrences)
def freq_prescription_feature(text, freq):
    """Return the count stored under 'prescription' in ``freq``, as a float."""
    occurrences = freq['prescription']
    return float(occurrences)
def freq_creative_feature(text, freq):
    """Return the count stored under 'creative' in ``freq``, as a float."""
    occurrences = freq['creative']
    return float(occurrences)
def freq_height_feature(text, freq):
    """Return the count stored under 'height' in ``freq``, as a float."""
    occurrences = freq['height']
    return float(occurrences)
def freq_featured_feature(text, freq):
    """Return the count stored under 'featured' in ``freq``, as a float."""
    occurrences = freq['featured']
    return float(occurrences)
def freq_differ_feature(text, freq):
    """Return the count stored under 'differ' in ``freq``, as a float."""
    occurrences = freq['differ']
    return float(occurrences)
def freq_width_feature(text, freq):
    """Return the count stored under 'width' in ``freq``, as a float."""
    occurrences = freq['width']
    return float(occurrences)
def freq_other_feature(text, freq):
    """Return the count stored under 'other' in ``freq``, as a float."""
    occurrences = freq['other']
    return float(occurrences)
def freq_energy_feature(text, freq):
    """Return the count stored under 'energy' in ``freq``, as a float."""
    occurrences = freq['energy']
    return float(occurrences)
def freq_business_feature(text, freq):
    """Return the count stored under 'business' in ``freq``, as a float."""
    occurrences = freq['business']
    return float(occurrences)
def freq_message_feature(text, freq):
    """Return the count stored under 'message' in ``freq``, as a float."""
    occurrences = freq['message']
    return float(occurrences)
def freq_volumes_feature(text, freq):
    """Return the count stored under 'volumes' in ``freq``, as a float."""
    occurrences = freq['volumes']
    return float(occurrences)
def freq_revision_feature(text, freq):
    """Return the count stored under 'revision' in ``freq``, as a float."""
    occurrences = freq['revision']
    return float(occurrences)
def freq_path_feature(text, freq):
    """Return the count stored under 'path' in ``freq``, as a float."""
    occurrences = freq['path']
    return float(occurrences)
def freq_meter_feature(text, freq):
    """Return the count stored under 'meter' in ``freq``, as a float."""
    occurrences = freq['meter']
    return float(occurrences)
def freq_memo_feature(text, freq):
    """Return the count stored under 'memo' in ``freq``, as a float."""
    occurrences = freq['memo']
    return float(occurrences)
def freq_planning_feature(text, freq):
    """Return the count stored under 'planning' in ``freq``, as a float."""
    occurrences = freq['planning']
    return float(occurrences)
def freq_pleased_feature(text, freq):
    """Return the count stored under 'pleased' in ``freq``, as a float."""
    occurrences = freq['pleased']
    return float(occurrences)
def freq_record_feature(text, freq):
    """Return the count stored under 'record' in ``freq``, as a float."""
    occurrences = freq['record']
    return float(occurrences)
def freq_out_feature(text, freq):
    """Return the count stored under 'out' in ``freq``, as a float."""
    occurrences = freq['out']
    return float(occurrences)
# Features that look for certain characters
def freq_semicolon_feature(text, freq):
    """Return the number of ';' characters in the raw text."""
    semicolons = text.count(';')
    return semicolons
def freq_dollar_feature(text, freq):
    """Return the number of '$' characters in the raw text."""
    dollars = text.count('$')
    return dollars
def freq_sharp_feature(text, freq):
    """Return the number of '#' characters in the raw text."""
    sharps = text.count('#')
    return sharps
def freq_exclamation_feature(text, freq):
    """Return the number of '!' characters in the raw text."""
    exclamations = text.count('!')
    return exclamations
def freq_para_feature(text, freq):
    """Return the number of '(' characters in the raw text."""
    open_parens = text.count('(')
    return open_parens
def freq_bracket_feature(text, freq):
    """Return the number of '[' characters in the raw text."""
    open_brackets = text.count('[')
    return open_brackets
def freq_and_feature(text, freq):
    """Return the number of '&' characters in the raw text."""
    ampersands = text.count('&')
    return ampersands
# --------- Add your own feature methods ----------
def freq_free_feature(text, freq):
    """Return the count stored under 'free' in ``freq``, as a float."""
    occurrences = freq['free']
    return float(occurrences)
def freq_insurance_feature(text, freq):
    """Return the count stored under 'insurance' in ``freq``, as a float."""
    occurrences = freq['insurance']
    return float(occurrences)
def freq_porn_feature(text, freq):
    """Return the count stored under 'porn' in ``freq``, as a float."""
    occurrences = freq['porn']
    return float(occurrences)
def freq_fuck_feature(text, freq):
    """Return the count stored under 'fuck' in ``freq``, as a float."""
    occurrences = freq['fuck']
    return float(occurrences)
def freq_dick_feature(text, freq):
    """Return the combined count of 'dick', 'penis' and 'cock' as a float."""
    tracked_words = ('dick', 'penis', 'cock')
    return sum(float(freq[word]) for word in tracked_words)
def freq_viagra_feature(text, freq):
    """Return the count stored under 'viagra' in ``freq``, as a float."""
    occurrences = freq['viagra']
    return float(occurrences)
def freq_click_feature(text, freq):
    """Return the count stored under 'click' in ``freq``, as a float."""
    occurrences = freq['click']
    return float(occurrences)
def freq_send_feature(text, freq):
    """Return the count stored under 'send' in ``freq``, as a float."""
    occurrences = freq['send']
    return float(occurrences)
def freq_money_feature(text, freq):
    """Return the count stored under 'money' in ``freq``, as a float.

    NOTE: this redefines a function of the same name declared earlier in
    this file; both bodies compute the same value.
    """
    occurrences = freq['money']
    return float(occurrences)
def freq_sex_feature(text, freq):
    """Combine 'sex' substring hits with the 'hard' and 'adult' word counts.

    'sex' is counted as a substring of the raw text, while 'hard' and
    'adult' are looked up in ``freq``.
    """
    substring_hits = text.count('sex')
    hard_count = float(freq['hard'])
    adult_count = float(freq['adult'])
    return substring_hits + hard_count + adult_count
def freq_linux_feature(text, freq):
    """Return the count stored under 'linux' in ``freq``, as a float."""
    occurrences = freq['linux']
    return float(occurrences)
def freq_web_feature(text, freq):
    """Return how many times the substring 'http' appears in the raw text."""
    http_hits = text.count('http')
    return http_hits
def freq_period_feature(text, freq):
    """Return the number of '.' characters in the raw text.

    NOTE: a later definition with the same name in this file replaces this
    one when the module is loaded, so this version is never the one called.
    """
    periods = text.count('.')
    return periods
def len_feature(text, freq):
    """Return the length of the raw text in characters."""
    n_chars = len(text)
    return n_chars
def freq_forward_feature(text, freq):
    """Return how many times the substring 'forward' appears in the raw text."""
    forward_hits = text.count('forward')
    return forward_hits
def freq_career_feature(text, freq):
    """Return the count stored under 'career' in ``freq``, as a float."""
    occurrences = freq['career']
    return float(occurrences)
def freq_interview_feature(text, freq):
    """Return the count stored under 'interview' in ``freq``, as a float."""
    occurrences = freq['interview']
    return float(occurrences)
def freq_meeting_feature(text, freq):
    """Return how many times the substring 'meet' appears in the raw text."""
    meet_hits = text.count('meet')
    return meet_hits
def freq_files_feature(text, freq):
    """Count case-insensitive mentions of common file-type strings.

    The text is lowercased once and each of the substrings below is counted
    in it; the per-substring counts are summed.

    Args:
        text: Raw message text.
        freq: Word-frequency mapping (unused by this feature).

    Returns:
        The total number of substring matches, as a float.
    """
    file_markers = ('pdf', 'jpg', 'png', 'doc', 'html', 'xls', 'ods', 'ppt', 'txt')
    # Lowercase once instead of once per marker.
    lowered = text.lower()
    return sum(float(lowered.count(marker)) for marker in file_markers)
def freq_urgent_feature(text, freq):
    """Return the count stored under 'urgent' in ``freq``, as a float."""
    occurrences = freq['urgent']
    return float(occurrences)
def freq_ebay_feature(text, freq):
    """Return the count stored under 'ebay' in ``freq``, as a float."""
    occurrences = freq['ebay']
    return float(occurrences)
def freq_prince_feature(text, freq):
    """Return the count stored under 'prince' in ``freq``, as a float."""
    occurrences = freq['prince']
    return float(occurrences)
def freq_cialis_feature(text, freq):
    """Return the count stored under 'cialis' in ``freq``, as a float."""
    occurrences = freq['cialis']
    return float(occurrences)
def freq_visit_feature(text, freq):
    """Return the count stored under 'visit' in ``freq``, as a float."""
    occurrences = freq['visit']
    return float(occurrences)
def freq_pharm_feature(text, freq):
    """Return how many times the substring 'pharm' appears in the raw text."""
    pharm_hits = text.count('pharm')
    return pharm_hits
def freq_period_feature(text, freq):
    """Return the combined number of '.', '-' and '/' characters in the text."""
    periods = text.count('.')
    dashes = text.count('-')
    slashes = text.count('/')
    return periods + dashes + slashes
def freq_at_feature(text, freq):
    """Return the number of '@' characters in the raw text."""
    at_signs = text.count('@')
    return at_signs
def freq_common_feature(text, freq):
    """Return the combined count of 'the', 'and', 'to', 'for', 'in' as a float."""
    tracked_words = ('the', 'and', 'to', 'for', 'in')
    return sum(float(freq[word]) for word in tracked_words)
def freq_pronouns_feature(text, freq):
    """Return the combined count of the pronouns listed below, as a float."""
    tracked_words = ('me', 'he', 'she', 'they', 'them', 'we', 'ours', 'my')
    return sum(float(freq[word]) for word in tracked_words)
def freq_agg_pronoun_feature(text, freq):
    """Return the combined count of 'you', 'yours', 'your', 'name' as a float."""
    tracked_words = ('you', 'yours', 'your', 'name')
    return sum(float(freq[word]) for word in tracked_words)
def freq_verbs_feature(text, freq):
    """Return the combined count of the verbs listed below, as a float."""
    tracked_words = ('should', 'could', 'would', 'see', 'need', 'has', 'do')
    return sum(float(freq[word]) for word in tracked_words)
def freq_mail_feature(text, freq):
    """Return the count stored under 'mail' in ``freq``, as a float."""
    occurrences = freq['mail']
    return float(occurrences)
def freq_date_feature(text, freq):
    """Return the combined count of the twelve lowercase month names, as a float."""
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    return sum(float(freq[month]) for month in month_names)
def freq_schedule_feature(text, freq):
    """Return the combined count of the scheduling words listed below, as a float."""
    tracked_words = ('date', 'time', 'month', 'schedule', 'meeting', 'late', 'early')
    return sum(float(freq[word]) for word in tracked_words)
def freq_answer_feature(text, freq):
    """Return the combined count of the reply words listed below, as a float."""
    tracked_words = (
        'yes', 'no', 'sure', 'yep', 'nope',
        'sorry', 'apologies', 'ok', 'okay',
    )
    return sum(float(freq[word]) for word in tracked_words)
def freq_adj_feature(text, freq):
    """Return the combined count of the function words listed below, as a float."""
    tracked_words = ('this', 'that', 'here', 'there', 'in', 'with', 'be')
    return sum(float(freq[word]) for word in tracked_words)
def freq_jobs_feature(text, freq):
    """Return the combined count of the job-related words listed below, as a float."""
    tracked_words = (
        'company', 'job', 'hire', 'recruit',
        'professional', 'business', 'application',
    )
    return sum(float(freq[word]) for word in tracked_words)
# Generates a feature vector
def generate_feature_vector(text, freq):
    """Build the numeric feature vector for a single message.

    Every feature function is called as ``fn(text, freq)`` and the results
    are collected in the fixed order given below.

    Two slots previously repeated another column exactly (the 'money' count
    and the period/dash/slash count were each appended twice). Those slots
    now carry the 'free' word count and a plain '.' count, so the vector
    keeps its length and ordering but no longer has duplicate columns.

    Args:
        text: Raw message text, used by the character/substring features.
        freq: Mapping from word to its count in ``text``. Words that do not
            occur must read as 0 (for example a ``defaultdict(int)``).

    Returns:
        A list of 68 ints/floats, one per feature.
    """

    def period_only_feature(text, freq):
        # freq_period_feature is defined twice in this module and the later
        # definition (periods + dashes + slashes) wins, so the plain period
        # count is computed here directly.
        return text.count('.')

    feature_functions = (
        # Word-frequency features
        freq_pain_feature,
        freq_private_feature,
        freq_bank_feature,
        freq_money_feature,
        freq_drug_feature,
        freq_spam_feature,
        freq_prescription_feature,
        freq_creative_feature,
        freq_height_feature,
        freq_featured_feature,
        freq_differ_feature,
        freq_width_feature,
        freq_other_feature,
        freq_energy_feature,
        freq_business_feature,
        freq_message_feature,
        freq_volumes_feature,
        freq_revision_feature,
        freq_path_feature,
        freq_meter_feature,
        freq_memo_feature,
        freq_planning_feature,
        freq_pleased_feature,
        freq_record_feature,
        freq_out_feature,
        # Character-count features
        freq_semicolon_feature,
        freq_dollar_feature,
        freq_sharp_feature,
        freq_exclamation_feature,
        freq_para_feature,
        freq_bracket_feature,
        freq_and_feature,
        # Custom features
        freq_insurance_feature,
        freq_porn_feature,
        freq_fuck_feature,
        freq_dick_feature,
        freq_viagra_feature,
        freq_click_feature,
        freq_send_feature,
        freq_free_feature,      # slot formerly a second copy of 'money'
        freq_sex_feature,
        freq_linux_feature,
        freq_web_feature,
        period_only_feature,    # slot formerly a copy of freq_period_feature
        len_feature,
        freq_forward_feature,
        freq_career_feature,
        freq_interview_feature,
        freq_meeting_feature,
        freq_files_feature,
        freq_urgent_feature,
        freq_ebay_feature,
        freq_prince_feature,
        freq_cialis_feature,
        freq_visit_feature,
        freq_pharm_feature,
        freq_period_feature,
        freq_at_feature,
        freq_common_feature,
        freq_pronouns_feature,
        freq_agg_pronoun_feature,
        freq_verbs_feature,
        freq_mail_feature,
        freq_date_feature,
        freq_schedule_feature,
        freq_answer_feature,
        freq_adj_feature,
        freq_jobs_feature,
        # --------- Add your own features here ---------
        # Make sure type is int or float
    )
    feature = [compute(text, freq) for compute in feature_functions]
    return feature
# This method generates a design matrix with a list of filenames
# Each file is a single training example
def generate_design_matrix(filenames):
    """Turn a list of message files into a list of feature vectors.

    Each file is treated as one example: its text is read, split into
    ``\\w+`` tokens, counted, and passed to ``generate_feature_vector``.

    Args:
        filenames: Iterable of paths to text files.

    Returns:
        A list with one feature vector per file that could be read. A file
        whose ``read()`` raises is skipped, so the result may be shorter
        than ``filenames``.
    """
    rows = []
    for path in filenames:
        # Undecodable bytes are dropped rather than raising.
        with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
            try:
                text = handle.read()
            except Exception:
                # Skip files we have trouble reading.
                continue
        # NOTE(review): in text mode with default newline handling, '\r\n'
        # is normally already translated to '\n' by read(), so this replace
        # may never match - confirm whether '\n' should be replaced too.
        text = text.replace('\r\n', ' ')
        # Frequency of every word token; absent words read as 0.
        word_freq = defaultdict(int)
        for token in re.findall(r'\w+', text):
            word_freq[token] += 1
        rows.append(generate_feature_vector(text, word_freq))
    return rows
# ************** Script starts here **************
# DO NOT MODIFY ANYTHING BELOW
# Featurize every '*.txt' file found in SPAM_DIR.
spam_filenames = glob.glob(os.path.join(SPAM_DIR, '*.txt'))
spam_design_matrix = generate_design_matrix(spam_filenames)
# Featurize every '*.txt' file found in HAM_DIR.
ham_filenames = glob.glob(os.path.join(HAM_DIR, '*.txt'))
ham_design_matrix = generate_design_matrix(ham_filenames)
# Important: the test_filenames must be in reverse numerical order as that is the
# order we will be evaluating your classifier
# NOTE(review): generate_design_matrix skips any file whose read() raises, which
# would shift the remaining test rows relative to this ordering - confirm that
# cannot happen for the test set.
test_filenames = [os.path.join(TEST_DIR, '{}.txt'.format(x)) for x in range(NUM_TEST_EXAMPLES-1, -1, -1)]
test_design_matrix = generate_design_matrix(test_filenames)
# Training rows: all spam examples first, then all ham examples.
X = spam_design_matrix + ham_design_matrix
# Column vector of labels aligned with X: 1 for each spam row, 0 for each ham row.
Y = np.array([1]*len(spam_design_matrix) + [0]*len(ham_design_matrix)).reshape((-1, 1))
# Collect the outputs under the keys listed in the module docstring.
file_dict = {}
file_dict['training_data'] = X
file_dict['training_labels'] = Y
file_dict['test_data'] = test_design_matrix
# Write the .mat file next to this script.
outfile = os.path.join(BASE_DIR, 'spam_data.mat')
scipy.io.savemat(outfile, file_dict)