# Source: cs189/hw/hw1/data/featurize.py
# Uploaded by briancpark ("Upload 670 files", commit 14fe3f7)
'''
**************** PLEASE READ ***************
Script that reads in spam and ham messages and converts each training example
into a feature vector
Code intended for UC Berkeley course CS 189/289A: Machine Learning
Requirements:
-scipy ('pip install scipy')
-numpy ('pip install numpy')
To add your own features, create a function that takes in the raw text and
word frequency dictionary and outputs an int or float. Then add your feature
to the function 'generate_feature_vector'
The output of your file will be a .mat file. The data will be accessible using
the following keys:
-'training_data'
-'training_labels'
-'test_data'
Please direct any bugs to [email protected]
'''
from collections import defaultdict
import glob
import re
import scipy.io
import numpy as np
# Number of training examples; not referenced anywhere else in the visible code.
NUM_TRAINING_EXAMPLES = 5172
# Number of test files; the script reads test/0.txt .. test/<N-1>.txt in order.
NUM_TEST_EXAMPLES = 5857
# Prefix prepended to each of the data directories below.
BASE_DIR = './'
# Sub-directories (relative to BASE_DIR) holding spam, ham and test messages.
SPAM_DIR = 'spam/'
HAM_DIR = 'ham/'
TEST_DIR = 'test/'
# ************* Features *************
# Features that look for certain words
def freq_pain_feature(text, freq):
    """Return the count stored for the word 'pain', as a float.

    `text` is unused; the value is read from `freq['pain']`.
    """
    occurrences = freq['pain']
    return float(occurrences)
def freq_private_feature(text, freq):
    """Return the count stored for the word 'private', as a float.

    `text` is unused; the value is read from `freq['private']`.
    """
    occurrences = freq['private']
    return float(occurrences)
def freq_bank_feature(text, freq):
    """Return the count stored for the word 'bank', as a float.

    `text` is unused; the value is read from `freq['bank']`.
    """
    occurrences = freq['bank']
    return float(occurrences)
def freq_money_feature(text, freq):
    """Return the count stored for the word 'money', as a float.

    `text` is unused; the value is read from `freq['money']`.

    NOTE: a second function with this same name is defined later in the
    file (among the custom features), so at import time this definition is
    replaced by that one.
    """
    occurrences = freq['money']
    return float(occurrences)
def freq_drug_feature(text, freq):
    """Return the count stored for the word 'drug', as a float.

    `text` is unused; the value is read from `freq['drug']`.
    """
    occurrences = freq['drug']
    return float(occurrences)
def freq_spam_feature(text, freq):
    """Return the count stored for the word 'spam', as a float.

    `text` is unused; the value is read from `freq['spam']`.
    """
    occurrences = freq['spam']
    return float(occurrences)
def freq_prescription_feature(text, freq):
    """Return the count stored for the word 'prescription', as a float.

    `text` is unused; the value is read from `freq['prescription']`.
    """
    occurrences = freq['prescription']
    return float(occurrences)
def freq_creative_feature(text, freq):
    """Return the count stored for the word 'creative', as a float.

    `text` is unused; the value is read from `freq['creative']`.
    """
    occurrences = freq['creative']
    return float(occurrences)
def freq_height_feature(text, freq):
    """Return the count stored for the word 'height', as a float.

    `text` is unused; the value is read from `freq['height']`.
    """
    occurrences = freq['height']
    return float(occurrences)
def freq_featured_feature(text, freq):
    """Return the count stored for the word 'featured', as a float.

    `text` is unused; the value is read from `freq['featured']`.
    """
    occurrences = freq['featured']
    return float(occurrences)
def freq_differ_feature(text, freq):
    """Return the count stored for the word 'differ', as a float.

    `text` is unused; the value is read from `freq['differ']`.
    """
    occurrences = freq['differ']
    return float(occurrences)
def freq_width_feature(text, freq):
    """Return the count stored for the word 'width', as a float.

    `text` is unused; the value is read from `freq['width']`.
    """
    occurrences = freq['width']
    return float(occurrences)
def freq_other_feature(text, freq):
    """Return the count stored for the word 'other', as a float.

    `text` is unused; the value is read from `freq['other']`.
    """
    occurrences = freq['other']
    return float(occurrences)
def freq_energy_feature(text, freq):
    """Return the count stored for the word 'energy', as a float.

    `text` is unused; the value is read from `freq['energy']`.
    """
    occurrences = freq['energy']
    return float(occurrences)
def freq_business_feature(text, freq):
    """Return the count stored for the word 'business', as a float.

    `text` is unused; the value is read from `freq['business']`.
    """
    occurrences = freq['business']
    return float(occurrences)
def freq_message_feature(text, freq):
    """Return the count stored for the word 'message', as a float.

    `text` is unused; the value is read from `freq['message']`.
    """
    occurrences = freq['message']
    return float(occurrences)
def freq_volumes_feature(text, freq):
    """Return the count stored for the word 'volumes', as a float.

    `text` is unused; the value is read from `freq['volumes']`.
    """
    occurrences = freq['volumes']
    return float(occurrences)
def freq_revision_feature(text, freq):
    """Return the count stored for the word 'revision', as a float.

    `text` is unused; the value is read from `freq['revision']`.
    """
    occurrences = freq['revision']
    return float(occurrences)
def freq_path_feature(text, freq):
    """Return the count stored for the word 'path', as a float.

    `text` is unused; the value is read from `freq['path']`.
    """
    occurrences = freq['path']
    return float(occurrences)
def freq_meter_feature(text, freq):
    """Return the count stored for the word 'meter', as a float.

    `text` is unused; the value is read from `freq['meter']`.
    """
    occurrences = freq['meter']
    return float(occurrences)
def freq_memo_feature(text, freq):
    """Return the count stored for the word 'memo', as a float.

    `text` is unused; the value is read from `freq['memo']`.
    """
    occurrences = freq['memo']
    return float(occurrences)
def freq_planning_feature(text, freq):
    """Return the count stored for the word 'planning', as a float.

    `text` is unused; the value is read from `freq['planning']`.
    """
    occurrences = freq['planning']
    return float(occurrences)
def freq_pleased_feature(text, freq):
    """Return the count stored for the word 'pleased', as a float.

    `text` is unused; the value is read from `freq['pleased']`.
    """
    occurrences = freq['pleased']
    return float(occurrences)
def freq_record_feature(text, freq):
    """Return the count stored for the word 'record', as a float.

    `text` is unused; the value is read from `freq['record']`.
    """
    occurrences = freq['record']
    return float(occurrences)
def freq_out_feature(text, freq):
    """Return the count stored for the word 'out', as a float.

    `text` is unused; the value is read from `freq['out']`.
    """
    occurrences = freq['out']
    return float(occurrences)
# Features that look for certain characters
def freq_semicolon_feature(text, freq):
    """Return the number of ';' characters in `text` (`freq` is unused)."""
    symbol = ';'
    return text.count(symbol)
def freq_dollar_feature(text, freq):
    """Return the number of '$' characters in `text` (`freq` is unused)."""
    symbol = '$'
    return text.count(symbol)
def freq_sharp_feature(text, freq):
    """Return the number of '#' characters in `text` (`freq` is unused)."""
    symbol = '#'
    return text.count(symbol)
def freq_exclamation_feature(text, freq):
    """Return the number of '!' characters in `text` (`freq` is unused)."""
    symbol = '!'
    return text.count(symbol)
def freq_para_feature(text, freq):
    """Return the number of '(' characters in `text` (`freq` is unused).

    Only opening parentheses are counted.
    """
    symbol = '('
    return text.count(symbol)
def freq_bracket_feature(text, freq):
    """Return the number of '[' characters in `text` (`freq` is unused).

    Only opening square brackets are counted.
    """
    symbol = '['
    return text.count(symbol)
def freq_and_feature(text, freq):
    """Return the number of '&' characters in `text` (`freq` is unused)."""
    symbol = '&'
    return text.count(symbol)
# --------- Add your own feature methods ----------
def freq_free_feature(text, freq):
    """Return how many times the substring 'free' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'free'
    return text.count(needle)
def freq_insurance_feature(text, freq):
    """Return how many times the substring 'insurance' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'insurance'
    return text.count(needle)
def freq_porn_feature(text, freq):
    """Return how many times the substring 'porn' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'porn'
    return text.count(needle)
def freq_fuck_feature(text, freq):
    """Return how many times the substring 'fuck' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'fuck'
    return text.count(needle)
def freq_cock_feature(text, freq):
    """Return how many times the substring 'cock' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'cock'
    return text.count(needle)
def freq_dick_feature(text, freq):
    """Return how many times the substring 'dick' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'dick'
    return text.count(needle)
def freq_penis_feature(text, freq):
    """Return how many times the substring 'penis' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'penis'
    return text.count(needle)
def freq_viagra_feature(text, freq):
    """Return how many times the substring 'viagra' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'viagra'
    return text.count(needle)
def freq_click_feature(text, freq):
    """Return how many times the substring 'click' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'click'
    return text.count(needle)
def freq_adult_feature(text, freq):
    """Return how many times the substring 'adult' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'adult'
    return text.count(needle)
def freq_send_feature(text, freq):
    """Return how many times the substring 'send' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'send'
    return text.count(needle)
def freq_money_feature(text, freq):
    """Return how many times the substring 'money' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.

    NOTE: a function with this same name is defined earlier in the file
    (returning `float(freq['money'])`); this later definition replaces it.
    """
    needle = 'money'
    return text.count(needle)
def freq_sex_feature(text, freq):
    """Return how many times the substring 'sex' occurs in `text`.

    This is a case-sensitive substring count, so occurrences inside longer
    words such as 'sexy' or 'sexual' are counted too; `freq` is unused.
    """
    needle = 'sex'
    return text.count(needle)
def freq_sexual_feature(text, freq):
    """Return how many times the substring 'sexual' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'sexual'
    return text.count(needle)
def freq_sexy_feature(text, freq):
    """Return how many times the substring 'sexy' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'sexy'
    return text.count(needle)
def freq_hard_feature(text, freq):
    """Return how many times the substring 'hard' occurs in `text`.

    This is a case-sensitive substring count, so it also matches inside
    longer words; `freq` is unused.
    """
    needle = 'hard'
    return text.count(needle)
# Generates a feature vector
def generate_feature_vector(text, freq):
    """Build the feature vector for a single message.

    Args:
        text: Raw text of the message.
        freq: Mapping from each word to its number of occurrences in the
            message. Missing words must look up as 0 (the caller passes a
            `defaultdict(int)`).

    Returns:
        A list of 48 ints/floats, one entry per feature, always in the
        same order.
    """
    feature = [
        # Whole-word frequency features (looked up in `freq`).
        freq_pain_feature(text, freq),
        freq_private_feature(text, freq),
        freq_bank_feature(text, freq),
        # `freq_money_feature` is defined twice in this file and the later,
        # substring-counting definition wins. Calling it here as well as
        # further down emitted the same column twice, so this slot reads the
        # whole-word count directly.
        float(freq['money']),
        freq_drug_feature(text, freq),
        freq_spam_feature(text, freq),
        freq_prescription_feature(text, freq),
        freq_creative_feature(text, freq),
        freq_height_feature(text, freq),
        freq_featured_feature(text, freq),
        freq_differ_feature(text, freq),
        freq_width_feature(text, freq),
        freq_other_feature(text, freq),
        freq_energy_feature(text, freq),
        freq_business_feature(text, freq),
        freq_message_feature(text, freq),
        freq_volumes_feature(text, freq),
        freq_revision_feature(text, freq),
        freq_path_feature(text, freq),
        freq_meter_feature(text, freq),
        freq_memo_feature(text, freq),
        freq_planning_feature(text, freq),
        freq_pleased_feature(text, freq),
        freq_record_feature(text, freq),
        freq_out_feature(text, freq),
        # Single-character count features (counted in `text`).
        freq_semicolon_feature(text, freq),
        freq_dollar_feature(text, freq),
        freq_sharp_feature(text, freq),
        freq_exclamation_feature(text, freq),
        freq_para_feature(text, freq),
        freq_bracket_feature(text, freq),
        freq_and_feature(text, freq),
        # Custom substring count features (counted in `text`).
        freq_free_feature(text, freq),
        freq_insurance_feature(text, freq),
        freq_porn_feature(text, freq),
        freq_fuck_feature(text, freq),
        freq_cock_feature(text, freq),
        freq_dick_feature(text, freq),
        freq_penis_feature(text, freq),
        freq_viagra_feature(text, freq),
        freq_click_feature(text, freq),
        freq_adult_feature(text, freq),
        freq_send_feature(text, freq),
        freq_money_feature(text, freq),
        freq_sexy_feature(text, freq),
        freq_sex_feature(text, freq),
        freq_sexual_feature(text, freq),
        freq_hard_feature(text, freq),
    ]
    # --------- Add your own features here ---------
    # Make sure type is int or float
    return feature
# This method generates a design matrix with a list of filenames
# Each file is a single training example
def generate_design_matrix(filenames):
    """Turn a list of message files into a list of feature vectors.

    Each file is one example: its text is read, split into words, counted,
    and passed to `generate_feature_vector`.

    Args:
        filenames: Iterable of paths to the text files to featurize.

    Returns:
        A list with one feature vector per file that was read, in the
        order of `filenames`.

    Raises:
        OSError: If a file cannot be opened. Only failures while reading
            an already-open file are skipped.
    """
    design_matrix = []
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
            try:
                text = f.read()  # Read in text from file
            except Exception:
                # Best effort: skip files we have trouble reading.
                # NOTE: a skipped file contributes no row, so every later
                # row moves up one position relative to `filenames`.
                continue
        # Remove newline characters. The file is opened in text mode, which
        # already translates every line ending to '\n', so that is the
        # character to replace ('\r\n' can no longer occur here).
        text = text.replace('\n', ' ')
        word_freq = defaultdict(int)  # Frequency of all words
        for word in re.findall(r'\w+', text):
            word_freq[word] += 1
        # Create a feature vector
        design_matrix.append(generate_feature_vector(text, word_freq))
    return design_matrix
# ************** Script starts here **************
# DO NOT MODIFY ANYTHING BELOW
# Featurize every .txt file found under the spam directory.
spam_filenames = glob.glob(BASE_DIR + SPAM_DIR + '*.txt')
spam_design_matrix = generate_design_matrix(spam_filenames)
# Featurize every .txt file found under the ham directory.
ham_filenames = glob.glob(BASE_DIR + HAM_DIR + '*.txt')
ham_design_matrix = generate_design_matrix(ham_filenames)
# Important: the test_filenames must be in numerical order as that is the
# order we will be evaluating your classifier
# (built explicitly as 0.txt, 1.txt, ... rather than via glob)
test_filenames = [BASE_DIR + TEST_DIR + str(x) + '.txt' for x in range(NUM_TEST_EXAMPLES)]
test_design_matrix = generate_design_matrix(test_filenames)
# Training rows: all spam vectors followed by all ham vectors.
X = spam_design_matrix + ham_design_matrix
# Labels in the same row order, as an (n, 1) column: 1 = spam, 0 = ham.
Y = np.array([1]*len(spam_design_matrix) + [0]*len(ham_design_matrix)).reshape((-1, 1))
# Collect the three arrays under their output keys and write spam_data.mat.
file_dict = {}
file_dict['training_data'] = X
file_dict['training_labels'] = Y
file_dict['test_data'] = test_design_matrix
scipy.io.savemat('spam_data.mat', file_dict)