|
''' |
|
**************** PLEASE READ *************** |
|
|
|
Script that reads in spam and ham messages and converts each training example |
|
into a feature vector |
|
|
|
Code intended for UC Berkeley course CS 189/289A: Machine Learning |
|
|
|
Requirements:
-scipy ('pip install scipy')
-numpy ('pip install numpy')
|
|
|
To add your own features, create a function that takes in the raw text and
word frequency dictionary and outputs an int or float. Then add your feature
in the function 'generate_feature_vector'.
|
|
|
The output of your file will be a .mat file. The data will be accessible using |
|
the following keys: |
|
-'training_data' |
|
-'training_labels' |
|
-'test_data' |
|
|
|
Please direct any bugs to [email protected] |
|
''' |
|
|
|
from collections import defaultdict |
|
import glob |
|
import re |
|
import os |
|
import scipy.io |
|
import numpy as np |
|
|
|
# Dataset sizes. NUM_TEST_EXAMPLES drives the enumeration of the numbered
# test files ('0.txt', '1.txt', ...); NUM_TRAINING_EXAMPLES is not referenced
# anywhere else in the visible part of this file.
NUM_TRAINING_EXAMPLES = 5172
NUM_TEST_EXAMPLES = 5857

# Input directories, resolved relative to the directory containing this
# script so the script can be launched from any working directory.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
SPAM_DIR = os.path.join(BASE_DIR, 'spam')
HAM_DIR = os.path.join(BASE_DIR, 'ham')
TEST_DIR = os.path.join(BASE_DIR, 'test')
|
|
|
|
|
|
|
|
|
def freq_pain_feature(text, freq):
    """Return the count stored under 'pain' in `freq`, converted to float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['pain']
    return float(occurrences)
|
|
|
def freq_private_feature(text, freq):
    """Return the count stored under 'private' in `freq`, converted to float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['private']
    return float(occurrences)
|
|
|
def freq_bank_feature(text, freq):
    """Return the count stored under 'bank' in `freq`, converted to float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['bank']
    return float(occurrences)
|
|
|
def freq_money_feature(text, freq):
    """Return the count stored under 'money' in `freq`, converted to float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.

    NOTE(review): a second, identical definition of this function appears
    later in this file and replaces this one when the module is loaded.
    """
    occurrences = freq['money']
    return float(occurrences)
|
|
|
def freq_drug_feature(text, freq):
    """Return the count stored under 'drug' in `freq`, converted to float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['drug']
    return float(occurrences)
|
|
|
def freq_spam_feature(text, freq):
    """Return the count stored under 'spam' in `freq`, converted to float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['spam']
    return float(occurrences)
|
|
|
def freq_prescription_feature(text, freq):
    """Return the count stored under 'prescription' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['prescription']
    return float(occurrences)
|
|
|
def freq_creative_feature(text, freq):
    """Return the count stored under 'creative' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['creative']
    return float(occurrences)
|
|
|
def freq_height_feature(text, freq):
    """Return the count stored under 'height' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['height']
    return float(occurrences)
|
|
|
def freq_featured_feature(text, freq):
    """Return the count stored under 'featured' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['featured']
    return float(occurrences)
|
|
|
def freq_differ_feature(text, freq):
    """Return the count stored under 'differ' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['differ']
    return float(occurrences)
|
|
|
def freq_width_feature(text, freq):
    """Return the count stored under 'width' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['width']
    return float(occurrences)
|
|
|
def freq_other_feature(text, freq):
    """Return the count stored under 'other' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['other']
    return float(occurrences)
|
|
|
def freq_energy_feature(text, freq):
    """Return the count stored under 'energy' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['energy']
    return float(occurrences)
|
|
|
def freq_business_feature(text, freq):
    """Return the count stored under 'business' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['business']
    return float(occurrences)
|
|
|
def freq_message_feature(text, freq):
    """Return the count stored under 'message' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['message']
    return float(occurrences)
|
|
|
def freq_volumes_feature(text, freq):
    """Return the count stored under 'volumes' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['volumes']
    return float(occurrences)
|
|
|
def freq_revision_feature(text, freq):
    """Return the count stored under 'revision' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['revision']
    return float(occurrences)
|
|
|
def freq_path_feature(text, freq):
    """Return the count stored under 'path' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['path']
    return float(occurrences)
|
|
|
def freq_meter_feature(text, freq):
    """Return the count stored under 'meter' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['meter']
    return float(occurrences)
|
|
|
def freq_memo_feature(text, freq):
    """Return the count stored under 'memo' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['memo']
    return float(occurrences)
|
|
|
def freq_planning_feature(text, freq):
    """Return the count stored under 'planning' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['planning']
    return float(occurrences)
|
|
|
def freq_pleased_feature(text, freq):
    """Return the count stored under 'pleased' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['pleased']
    return float(occurrences)
|
|
|
def freq_record_feature(text, freq):
    """Return the count stored under 'record' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['record']
    return float(occurrences)
|
|
|
def freq_out_feature(text, freq):
    """Return the count stored under 'out' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['out']
    return float(occurrences)
|
|
|
|
|
def freq_semicolon_feature(text, freq):
    """Return the number of ';' characters in `text` as an int.

    `freq` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    semicolons = text.count(';')
    return semicolons
|
|
|
def freq_dollar_feature(text, freq):
    """Return the number of '$' characters in `text` as an int.

    `freq` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    dollar_signs = text.count('$')
    return dollar_signs
|
|
|
def freq_sharp_feature(text, freq):
    """Return the number of '#' characters in `text` as an int.

    `freq` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    hash_marks = text.count('#')
    return hash_marks
|
|
|
def freq_exclamation_feature(text, freq):
    """Return the number of '!' characters in `text` as an int.

    `freq` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    exclamations = text.count('!')
    return exclamations
|
|
|
def freq_para_feature(text, freq):
    """Return the number of '(' characters in `text` as an int.

    Only opening parentheses are counted. `freq` is not read; it is
    accepted so that every feature function shares the same signature.
    """
    open_parens = text.count('(')
    return open_parens
|
|
|
def freq_bracket_feature(text, freq):
    """Return the number of '[' characters in `text` as an int.

    Only opening brackets are counted. `freq` is not read; it is
    accepted so that every feature function shares the same signature.
    """
    open_brackets = text.count('[')
    return open_brackets
|
|
|
def freq_and_feature(text, freq):
    """Return the number of '&' characters in `text` as an int.

    `freq` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    ampersands = text.count('&')
    return ampersands
|
|
|
|
|
|
|
def freq_free_feature(text, freq):
    """Return the count stored under 'free' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.

    NOTE(review): generate_feature_vector in this file never calls this
    function, so the value is not part of the emitted feature vector.
    """
    occurrences = freq['free']
    return float(occurrences)
|
|
|
def freq_insurance_feature(text, freq):
    """Return the count stored under 'insurance' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['insurance']
    return float(occurrences)
|
|
|
def freq_porn_feature(text, freq):
    """Return the count stored under 'porn' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['porn']
    return float(occurrences)
|
|
|
def freq_fuck_feature(text, freq):
    """Return the count stored under the profanity key in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['fuck']
    return float(occurrences)
|
|
|
def freq_dick_feature(text, freq):
    """Return the combined count of three explicit tokens, as a float.

    The counts stored in `freq` under each key in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    tokens = ('dick', 'penis', 'cock')
    return sum((float(freq[token]) for token in tokens), 0.0)
|
|
|
def freq_viagra_feature(text, freq):
    """Return the count stored under 'viagra' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['viagra']
    return float(occurrences)
|
|
|
def freq_click_feature(text, freq):
    """Return the count stored under 'click' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['click']
    return float(occurrences)
|
|
|
def freq_send_feature(text, freq):
    """Return the count stored under 'send' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['send']
    return float(occurrences)
|
|
|
def freq_money_feature(text, freq):
    """Return the count stored under 'money' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.

    NOTE(review): an identical function with this name is already defined
    earlier in this file; this later definition is the one bound at load
    time.
    """
    occurrences = freq['money']
    return float(occurrences)
|
|
|
def freq_sex_feature(text, freq):
    """Return a combined adult-content score as a float.

    Adds the number of times the substring 'sex' occurs in `text`
    (case-sensitive, so it also matches inside longer words) to the
    counts stored in `freq` under 'hard' and 'adult'.
    """
    substring_hits = text.count('sex')
    token_hits = float(freq['hard']) + float(freq['adult'])
    return substring_hits + token_hits
|
|
|
def freq_linux_feature(text, freq):
    """Return the count stored under 'linux' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['linux']
    return float(occurrences)
|
|
|
def freq_web_feature(text, freq):
    """Return how many times the substring 'http' occurs in `text`.

    The match is case-sensitive and also counts 'https'. `freq` is not
    read; it is accepted for signature uniformity.
    """
    link_markers = text.count('http')
    return link_markers
|
|
|
def freq_period_feature(text, freq):
    """Return the number of '.' characters in `text` as an int.

    `freq` is not read; it is accepted for signature uniformity.

    NOTE(review): a later function in this file reuses this name with a
    different body (it also counts '-' and '/'); that later definition
    replaces this one when the module is loaded.
    """
    periods = text.count('.')
    return periods
|
|
|
def len_feature(text, freq):
    """Return the length of `text` as reported by len().

    `freq` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    size = len(text)
    return size
|
|
|
def freq_forward_feature(text, freq):
    """Return how many times the substring 'forward' occurs in `text`.

    The match is case-sensitive and also hits longer words such as
    'forwarded'. `freq` is not read.
    """
    forward_hits = text.count('forward')
    return forward_hits
|
|
|
def freq_career_feature(text, freq):
    """Return the count stored under 'career' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['career']
    return float(occurrences)
|
|
|
def freq_interview_feature(text, freq):
    """Return the count stored under 'interview' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['interview']
    return float(occurrences)
|
|
|
def freq_meeting_feature(text, freq):
    """Return how many times the substring 'meet' occurs in `text`.

    The match is case-sensitive and also hits longer words such as
    'meeting'. `freq` is not read.
    """
    meet_hits = text.count('meet')
    return meet_hits
|
|
|
def freq_files_feature(text, freq):
    """Return the number of file-extension-like substrings in `text`.

    The text is lowercased once, then the occurrences of each extension
    string below are counted as plain substrings (so 'doc' also matches
    inside 'docx') and added together.

    Args:
        text: raw message text.
        freq: unused; accepted so that every feature function shares the
            same (text, freq) signature.

    Returns:
        The total number of matches as a float.
    """
    # Lowercase a single time instead of once per extension.
    lowered = text.lower()
    extensions = ('pdf', 'jpg', 'png', 'doc', 'html', 'xls', 'ods', 'ppt',
                  'txt')
    return sum((float(lowered.count(ext)) for ext in extensions), 0.0)
|
|
|
def freq_urgent_feature(text, freq):
    """Return the count stored under 'urgent' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['urgent']
    return float(occurrences)
|
|
|
def freq_ebay_feature(text, freq):
    """Return the count stored under 'ebay' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['ebay']
    return float(occurrences)
|
|
|
def freq_prince_feature(text, freq):
    """Return the count stored under 'prince' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['prince']
    return float(occurrences)
|
|
|
def freq_cialis_feature(text, freq):
    """Return the count stored under 'cialis' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['cialis']
    return float(occurrences)
|
|
|
def freq_visit_feature(text, freq):
    """Return the count stored under 'visit' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['visit']
    return float(occurrences)
|
|
|
def freq_pharm_feature(text, freq):
    """Return how many times the substring 'pharm' occurs in `text`.

    The match is case-sensitive and also hits longer words such as
    'pharmacy'. `freq` is not read.
    """
    pharm_hits = text.count('pharm')
    return pharm_hits
|
|
|
def freq_period_feature(text, freq):
    """Return the combined number of '.', '-' and '/' characters in `text`.

    `freq` is not read; it is accepted for signature uniformity.

    NOTE(review): this name is also defined earlier in this file with a
    body that counts only '.'; being the later definition, this one is
    what every call to freq_period_feature resolves to.
    """
    separators = ('.', '-', '/')
    return sum(text.count(mark) for mark in separators)
|
|
|
def freq_at_feature(text, freq):
    """Return the number of '@' characters in `text` as an int.

    `freq` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    at_signs = text.count('@')
    return at_signs
|
|
|
def freq_common_feature(text, freq):
    """Return the combined count of five common function words, as a float.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    words = ('the', 'and', 'to', 'for', 'in')
    total = 0.0
    for word in words:
        total += float(freq[word])
    return total
|
|
|
def freq_pronouns_feature(text, freq):
    """Return the combined count of a fixed set of pronouns, as a float.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    pronouns = ('me', 'he', 'she', 'they', 'them', 'we', 'ours', 'my')
    return sum((float(freq[word]) for word in pronouns), 0.0)
|
|
|
def freq_agg_pronoun_feature(text, freq):
    """Return the combined count of second-person words plus 'name'.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    words = ('you', 'yours', 'your', 'name')
    total = 0.0
    for word in words:
        total += float(freq[word])
    return total
|
|
|
def freq_verbs_feature(text, freq):
    """Return the combined count of a fixed set of verbs, as a float.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    verbs = ('should', 'could', 'would', 'see', 'need', 'has', 'do')
    return sum((float(freq[verb]) for verb in verbs), 0.0)
|
|
|
def freq_mail_feature(text, freq):
    """Return the count stored under 'mail' in `freq`, as a float.

    `text` is not read; it is accepted so that every feature function
    shares the same (text, freq) signature.
    """
    occurrences = freq['mail']
    return float(occurrences)
|
|
|
def freq_date_feature(text, freq):
    """Return the combined count of the twelve month names, as a float.

    Month names are looked up in lowercase, exactly as spelled in the
    tuple below. `text` is not read.
    """
    months = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    total = 0.0
    for month in months:
        total += float(freq[month])
    return total
|
|
|
def freq_schedule_feature(text, freq):
    """Return the combined count of scheduling-related words, as a float.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    words = ('date', 'time', 'month', 'schedule', 'meeting', 'late', 'early')
    return sum((float(freq[word]) for word in words), 0.0)
|
|
|
def freq_answer_feature(text, freq):
    """Return the combined count of short reply words, as a float.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    replies = ('yes', 'no', 'sure', 'yep', 'nope', 'sorry', 'apologies',
               'ok', 'okay')
    total = 0.0
    for reply in replies:
        total += float(freq[reply])
    return total
|
|
|
def freq_adj_feature(text, freq):
    """Return the combined count of seven small connective words, as a float.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    words = ('this', 'that', 'here', 'there', 'in', 'with', 'be')
    return sum((float(freq[word]) for word in words), 0.0)
|
|
|
def freq_jobs_feature(text, freq):
    """Return the combined count of employment-related words, as a float.

    The counts stored in `freq` under each word in the tuple below are
    converted to float and added in order. `text` is not read.
    """
    words = ('company', 'job', 'hire', 'recruit', 'professional',
             'business', 'application')
    total = 0.0
    for word in words:
        total += float(freq[word])
    return total
|
|
|
|
|
def generate_feature_vector(text, freq):
    """Build the feature vector for a single message.

    Every extractor listed below is called as extractor(text, freq), in
    order, and the results are collected into a list. The position of an
    extractor in the table is the column index of its feature.

    Args:
        text: raw message text.
        freq: word-count mapping for the same message.

    Returns:
        A list of ints/floats, one entry per table row.

    NOTE(review): freq_money_feature appears twice in the table, so its
    column is emitted twice. freq_period_feature also appears twice, and
    because that name is defined twice in this file (the later definition
    wins) both columns hold the same value. freq_free_feature is defined
    in this file but is not part of the table. The layout is left as is
    so the output keeps its existing shape.
    """
    extractors = (
        # Single-word frequencies.
        freq_pain_feature,
        freq_private_feature,
        freq_bank_feature,
        freq_money_feature,
        freq_drug_feature,
        freq_spam_feature,
        freq_prescription_feature,
        freq_creative_feature,
        freq_height_feature,
        freq_featured_feature,
        freq_differ_feature,
        freq_width_feature,
        freq_other_feature,
        freq_energy_feature,
        freq_business_feature,
        freq_message_feature,
        freq_volumes_feature,
        freq_revision_feature,
        freq_path_feature,
        freq_meter_feature,
        freq_memo_feature,
        freq_planning_feature,
        freq_pleased_feature,
        freq_record_feature,
        freq_out_feature,
        # Punctuation counts.
        freq_semicolon_feature,
        freq_dollar_feature,
        freq_sharp_feature,
        freq_exclamation_feature,
        freq_para_feature,
        freq_bracket_feature,
        freq_and_feature,
        # Additional word, substring and grouped-word features.
        freq_insurance_feature,
        freq_porn_feature,
        freq_fuck_feature,
        freq_dick_feature,
        freq_viagra_feature,
        freq_click_feature,
        freq_send_feature,
        freq_money_feature,
        freq_sex_feature,
        freq_linux_feature,
        freq_web_feature,
        freq_period_feature,
        len_feature,
        freq_forward_feature,
        freq_career_feature,
        freq_interview_feature,
        freq_meeting_feature,
        freq_files_feature,
        freq_urgent_feature,
        freq_ebay_feature,
        freq_prince_feature,
        freq_cialis_feature,
        freq_visit_feature,
        freq_pharm_feature,
        freq_period_feature,
        freq_at_feature,
        freq_common_feature,
        freq_pronouns_feature,
        freq_agg_pronoun_feature,
        freq_verbs_feature,
        freq_mail_feature,
        freq_date_feature,
        freq_schedule_feature,
        freq_answer_feature,
        freq_adj_feature,
        freq_jobs_feature,
    )
    return [extract(text, freq) for extract in extractors]
|
|
|
|
|
|
|
def _count_words(text):
    """Return a defaultdict(int) mapping each r'\\w+' token of `text` to its count."""
    word_freq = defaultdict(int)
    for word in re.findall(r'\w+', text):
        word_freq[word] += 1
    return word_freq


def generate_design_matrix(filenames):
    """Read each file and return one feature vector per readable file.

    Args:
        filenames: iterable of paths to text files. Files are decoded as
            UTF-8 and undecodable bytes are dropped (errors='ignore').

    Returns:
        A list of feature vectors (as produced by generate_feature_vector)
        in the order of `filenames`. A file whose contents cannot be read
        is skipped, so the result can be shorter than the input; a warning
        is emitted for every skipped file so that the resulting row/file
        misalignment does not go unnoticed.

    Raises:
        OSError: if a file cannot be opened (open() is not guarded, as in
            the previous implementation).
    """
    import warnings  # local import keeps this block self-contained

    design_matrix = []
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
            try:
                text = f.read()
            except (OSError, UnicodeError) as exc:
                # Best effort: keep going, but report the dropped example
                # instead of discarding it silently.
                warnings.warn(
                    'skipping unreadable file {}: {}'.format(filename, exc)
                )
                continue

        # NOTE(review): in text mode with default newline handling, open()
        # already translates '\r\n' to '\n' while reading, so this replace
        # normally finds nothing. Kept to preserve the existing features.
        text = text.replace('\r\n', ' ')
        word_freq = _count_words(text)
        design_matrix.append(generate_feature_vector(text, word_freq))
    return design_matrix
|
|
|
|
|
|
|
|
|
# Training inputs: every .txt file found in the spam and ham directories.
spam_filenames = glob.glob(os.path.join(SPAM_DIR, '*.txt'))
ham_filenames = glob.glob(os.path.join(HAM_DIR, '*.txt'))
spam_design_matrix = generate_design_matrix(spam_filenames)
ham_design_matrix = generate_design_matrix(ham_filenames)

# Test inputs are the numbered files '<index>.txt'.
# NOTE(review): the indices run from NUM_TEST_EXAMPLES - 1 down to 0, so
# row 0 of 'test_data' belongs to the highest-numbered file. Confirm that
# whatever consumes the .mat file expects this descending order.
test_filenames = [
    os.path.join(TEST_DIR, '{}.txt'.format(index))
    for index in reversed(range(NUM_TEST_EXAMPLES))
]
test_design_matrix = generate_design_matrix(test_filenames)

# Spam rows come first and are labelled 1; ham rows follow, labelled 0.
# The labels are stored as a single column.
X = spam_design_matrix + ham_design_matrix
Y = np.array(
    [1] * len(spam_design_matrix) + [0] * len(ham_design_matrix)
).reshape((-1, 1))

file_dict = {
    'training_data': X,
    'training_labels': Y,
    'test_data': test_design_matrix,
}

# The .mat file is written next to this script.
outfile = os.path.join(BASE_DIR, 'spam_data.mat')
scipy.io.savemat(outfile, file_dict)
|
|