|
''' |
|
**************** PLEASE READ *************** |
|
|
|
Script that reads in spam and ham messages and converts each training example |
|
into a feature vector |
|
|
|
Code intended for UC Berkeley course CS 189/289A: Machine Learning |
|
|
|
Requirements:
-scipy ('pip install scipy')
-numpy ('pip install numpy')
|
|
|
To add your own features, create a function that takes in the raw text and
word frequency dictionary and outputs an int or float. Then add your feature
in the function 'def generate_feature_vector'
|
|
|
The output of your file will be a .mat file. The data will be accessible using |
|
the following keys: |
|
-'training_data' |
|
-'training_labels' |
|
-'test_data' |
|
|
|
Please direct any bugs to [email protected] |
|
''' |
|
|
|
from collections import defaultdict
import glob
import re
import warnings

import numpy as np
import scipy.io
|
|
|
# Directory layout: every data directory is resolved relative to BASE_DIR.
BASE_DIR = './'
SPAM_DIR = 'spam/'
HAM_DIR = 'ham/'
TEST_DIR = 'test/'

# Data set sizes. NUM_TEST_EXAMPLES determines which test file names
# (0.txt, 1.txt, ...) the script reads.
NUM_TRAINING_EXAMPLES = 5172
NUM_TEST_EXAMPLES = 5857
|
|
|
|
|
|
|
|
|
def freq_pain_feature(text, freq):
    """Return the number of occurrences of the word 'pain' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('pain', 0))
|
|
|
def freq_private_feature(text, freq):
    """Return the number of occurrences of the word 'private' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('private', 0))
|
|
|
def freq_bank_feature(text, freq):
    """Return the number of occurrences of the word 'bank' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('bank', 0))
|
|
|
def freq_money_feature(text, freq):
    """Return the number of occurrences of the word 'money' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).

    NOTE: a later function in this file reuses the name
    ``freq_money_feature`` for a substring count, so once the module has
    finished loading the name refers to that later definition.
    """
    return float(freq.get('money', 0))
|
|
|
def freq_drug_feature(text, freq):
    """Return the number of occurrences of the word 'drug' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('drug', 0))
|
|
|
def freq_spam_feature(text, freq):
    """Return the number of occurrences of the word 'spam' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('spam', 0))
|
|
|
def freq_prescription_feature(text, freq):
    """Return the number of occurrences of the word 'prescription' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('prescription', 0))
|
|
|
def freq_creative_feature(text, freq):
    """Return the number of occurrences of the word 'creative' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('creative', 0))
|
|
|
def freq_height_feature(text, freq):
    """Return the number of occurrences of the word 'height' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('height', 0))
|
|
|
def freq_featured_feature(text, freq):
    """Return the number of occurrences of the word 'featured' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('featured', 0))
|
|
|
def freq_differ_feature(text, freq):
    """Return the number of occurrences of the word 'differ' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('differ', 0))
|
|
|
def freq_width_feature(text, freq):
    """Return the number of occurrences of the word 'width' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('width', 0))
|
|
|
def freq_other_feature(text, freq):
    """Return the number of occurrences of the word 'other' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('other', 0))
|
|
|
def freq_energy_feature(text, freq):
    """Return the number of occurrences of the word 'energy' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('energy', 0))
|
|
|
def freq_business_feature(text, freq):
    """Return the number of occurrences of the word 'business' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('business', 0))
|
|
|
def freq_message_feature(text, freq):
    """Return the number of occurrences of the word 'message' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('message', 0))
|
|
|
def freq_volumes_feature(text, freq):
    """Return the number of occurrences of the word 'volumes' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('volumes', 0))
|
|
|
def freq_revision_feature(text, freq):
    """Return the number of occurrences of the word 'revision' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('revision', 0))
|
|
|
def freq_path_feature(text, freq):
    """Return the number of occurrences of the word 'path' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('path', 0))
|
|
|
def freq_meter_feature(text, freq):
    """Return the number of occurrences of the word 'meter' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('meter', 0))
|
|
|
def freq_memo_feature(text, freq):
    """Return the number of occurrences of the word 'memo' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('memo', 0))
|
|
|
def freq_planning_feature(text, freq):
    """Return the number of occurrences of the word 'planning' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('planning', 0))
|
|
|
def freq_pleased_feature(text, freq):
    """Return the number of occurrences of the word 'pleased' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('pleased', 0))
|
|
|
def freq_record_feature(text, freq):
    """Return the number of occurrences of the word 'record' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('record', 0))
|
|
|
def freq_out_feature(text, freq):
    """Return the number of occurrences of the word 'out' as a float.

    ``text`` is unused; ``freq`` maps each word to its count. The lookup
    goes through ``.get`` so a missing word yields 0.0 without inserting a
    new key into ``freq`` (indexing a defaultdict would add one).
    """
    return float(freq.get('out', 0))
|
|
|
|
|
def freq_semicolon_feature(text, freq):
    """Count the ';' characters in the raw message text.

    ``freq`` is not read; it is accepted so that every feature function
    shares the same ``(text, freq)`` signature.
    """
    semicolon_count = text.count(';')
    return semicolon_count
|
|
|
def freq_dollar_feature(text, freq):
    """Count the '$' characters in the raw message text.

    ``freq`` is not read; it is accepted so that every feature function
    shares the same ``(text, freq)`` signature.
    """
    dollar_count = text.count('$')
    return dollar_count
|
|
|
def freq_sharp_feature(text, freq):
    """Count the '#' characters in the raw message text.

    ``freq`` is not read; it is accepted so that every feature function
    shares the same ``(text, freq)`` signature.
    """
    sharp_count = text.count('#')
    return sharp_count
|
|
|
def freq_exclamation_feature(text, freq):
    """Count the '!' characters in the raw message text.

    ``freq`` is not read; it is accepted so that every feature function
    shares the same ``(text, freq)`` signature.
    """
    exclamation_count = text.count('!')
    return exclamation_count
|
|
|
def freq_para_feature(text, freq):
    """Count the '(' characters in the raw message text.

    Only opening parentheses are counted. ``freq`` is not read; it is
    accepted so that every feature function shares the same signature.
    """
    open_paren_count = text.count('(')
    return open_paren_count
|
|
|
def freq_bracket_feature(text, freq):
    """Count the '[' characters in the raw message text.

    Only opening square brackets are counted. ``freq`` is not read; it is
    accepted so that every feature function shares the same signature.
    """
    open_bracket_count = text.count('[')
    return open_bracket_count
|
|
|
def freq_and_feature(text, freq):
    """Count the '&' characters in the raw message text.

    ``freq`` is not read; it is accepted so that every feature function
    shares the same ``(text, freq)`` signature.
    """
    ampersand_count = text.count('&')
    return ampersand_count
|
|
|
|
|
def freq_free_feature(text, freq):
    """Return how many times the substring 'free' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'free'
    return text.count(needle)
|
|
|
def freq_insurance_feature(text, freq):
    """Return how many times the substring 'insurance' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'insurance'
    return text.count(needle)
|
|
|
def freq_porn_feature(text, freq):
    """Return how many times the substring 'porn' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'porn'
    return text.count(needle)
|
|
|
def freq_fuck_feature(text, freq):
    """Return how many times the substring 'fuck' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'fuck'
    return text.count(needle)
|
|
|
def freq_cock_feature(text, freq):
    """Return how many times the substring 'cock' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'cock'
    return text.count(needle)
|
|
|
def freq_dick_feature(text, freq):
    """Return how many times the substring 'dick' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'dick'
    return text.count(needle)
|
|
|
def freq_penis_feature(text, freq):
    """Return how many times the substring 'penis' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'penis'
    return text.count(needle)
|
|
|
def freq_viagra_feature(text, freq):
    """Return how many times the substring 'viagra' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'viagra'
    return text.count(needle)
|
|
|
def freq_click_feature(text, freq):
    """Return how many times the substring 'click' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'click'
    return text.count(needle)
|
|
|
def freq_adult_feature(text, freq):
    """Return how many times the substring 'adult' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'adult'
    return text.count(needle)
|
|
|
def freq_send_feature(text, freq):
    """Return how many times the substring 'send' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'send'
    return text.count(needle)
|
|
|
def freq_money_feature(text, freq):
    """Return how many times the substring 'money' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.

    NOTE: an earlier function in this file has the same name (a word
    frequency lookup); being defined later, this is the definition the
    name refers to once the module has finished loading.
    """
    needle = 'money'
    return text.count(needle)
|
|
|
def freq_sex_feature(text, freq):
    """Return how many times the substring 'sex' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words such as
    'sexy' or 'sexual'. ``freq`` is not read.
    """
    needle = 'sex'
    return text.count(needle)
|
|
|
def freq_sexual_feature(text, freq):
    """Return how many times the substring 'sexual' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'sexual'
    return text.count(needle)
|
|
|
def freq_sexy_feature(text, freq):
    """Return how many times the substring 'sexy' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'sexy'
    return text.count(needle)
|
|
|
def freq_hard_feature(text, freq):
    """Return how many times the substring 'hard' occurs in ``text``.

    Matching is case-sensitive and also hits inside longer words.
    ``freq`` is not read.
    """
    needle = 'hard'
    return text.count(needle)
|
|
|
|
|
def generate_feature_vector(text, freq):
    """Build the feature vector for a single message.

    Args:
        text: Raw message text.
        freq: Mapping from each word in ``text`` to its occurrence count.

    Returns:
        A list of 48 numbers in a fixed order: 25 word-frequency features,
        7 punctuation-character counts, then 16 substring counts.
    """
    return [
        # Word-frequency features (looked up in ``freq``).
        freq_pain_feature(text, freq),
        freq_private_feature(text, freq),
        freq_bank_feature(text, freq),
        # The file defines ``freq_money_feature`` twice and the later
        # substring-count version wins, so calling it here would only
        # duplicate the 'money' substring feature further down. Compute the
        # intended word frequency directly instead.
        float(freq.get('money', 0)),
        freq_drug_feature(text, freq),
        freq_spam_feature(text, freq),
        freq_prescription_feature(text, freq),
        freq_creative_feature(text, freq),
        freq_height_feature(text, freq),
        freq_featured_feature(text, freq),
        freq_differ_feature(text, freq),
        freq_width_feature(text, freq),
        freq_other_feature(text, freq),
        freq_energy_feature(text, freq),
        freq_business_feature(text, freq),
        freq_message_feature(text, freq),
        freq_volumes_feature(text, freq),
        freq_revision_feature(text, freq),
        freq_path_feature(text, freq),
        freq_meter_feature(text, freq),
        freq_memo_feature(text, freq),
        freq_planning_feature(text, freq),
        freq_pleased_feature(text, freq),
        freq_record_feature(text, freq),
        freq_out_feature(text, freq),
        # Punctuation-character counts (taken from the raw text).
        freq_semicolon_feature(text, freq),
        freq_dollar_feature(text, freq),
        freq_sharp_feature(text, freq),
        freq_exclamation_feature(text, freq),
        freq_para_feature(text, freq),
        freq_bracket_feature(text, freq),
        freq_and_feature(text, freq),
        # Substring counts (taken from the raw text).
        freq_free_feature(text, freq),
        freq_insurance_feature(text, freq),
        freq_porn_feature(text, freq),
        freq_fuck_feature(text, freq),
        freq_cock_feature(text, freq),
        freq_dick_feature(text, freq),
        freq_penis_feature(text, freq),
        freq_viagra_feature(text, freq),
        freq_click_feature(text, freq),
        freq_adult_feature(text, freq),
        freq_send_feature(text, freq),
        freq_money_feature(text, freq),
        freq_sexy_feature(text, freq),
        freq_sex_feature(text, freq),
        freq_sexual_feature(text, freq),
        freq_hard_feature(text, freq),
    ]
|
|
|
|
|
|
|
def generate_design_matrix(filenames):
    """Convert each message file into a feature vector.

    Args:
        filenames: Iterable of paths to text files, one message per file.

    Returns:
        A list holding one feature vector (as built by
        ``generate_feature_vector``) for every file that could be read, in
        the order given by ``filenames``.

    Raises:
        OSError: If a file cannot be opened.
    """
    design_matrix = []
    for filename in filenames:
        # newline='' disables universal-newline translation. Without it
        # every '\r\n' has already become '\n' when read() returns, so the
        # replace() below could never match anything.
        with open(filename, 'r', encoding='utf-8', errors='ignore',
                  newline='') as f:
            try:
                text = f.read()
            except Exception as exc:
                # Best-effort skip, but make it visible: a dropped file
                # shifts the row index of every message that follows it.
                warnings.warn(
                    'Skipping unreadable file {!r}: {}'.format(filename, exc))
                continue

        text = text.replace('\r\n', ' ')

        # Count how often each word (run of \w characters) occurs.
        word_freq = defaultdict(int)
        for word in re.findall(r'\w+', text):
            word_freq[word] += 1

        design_matrix.append(generate_feature_vector(text, word_freq))
    return design_matrix
|
|
|
|
|
|
|
|
|
# Training data: every spam message first, then every ham message.
# glob.glob() returns paths in arbitrary order, so the lists are sorted to
# keep the row order of the saved matrices reproducible between runs.
spam_filenames = sorted(glob.glob(BASE_DIR + SPAM_DIR + '*.txt'))
spam_design_matrix = generate_design_matrix(spam_filenames)
ham_filenames = sorted(glob.glob(BASE_DIR + HAM_DIR + '*.txt'))
ham_design_matrix = generate_design_matrix(ham_filenames)

# Test messages are read as 0.txt, 1.txt, ... in numeric order.
test_filenames = [
    BASE_DIR + TEST_DIR + str(x) + '.txt' for x in range(NUM_TEST_EXAMPLES)
]
test_design_matrix = generate_design_matrix(test_filenames)

# Labels are a column vector: 1 for each spam row, 0 for each ham row, in
# the same order as the rows of X.
X = spam_design_matrix + ham_design_matrix
Y = np.array(
    [1] * len(spam_design_matrix) + [0] * len(ham_design_matrix)
).reshape((-1, 1))

file_dict = {}
file_dict['training_data'] = X
file_dict['training_labels'] = Y
file_dict['test_data'] = test_design_matrix
scipy.io.savemat('spam_data.mat', file_dict)
|
|