Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 2,955 Bytes
e1c08c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
from __future__ import print_function
import pickle
import json
import csv
import sys
from io import open
# Allow us to import the torchmoji directory
from os.path import dirname, abspath
sys.path.insert(0, dirname(dirname(abspath(__file__))))
from torchmoji.sentence_tokenizer import SentenceTokenizer, coverage
try:
unicode # Python 2
except NameError:
unicode = str # Python 3
IS_PYTHON2 = int(sys.version[0]) == 2
OUTPUT_PATH = 'coverage.csv'
DATASET_PATHS = [
'../data/Olympic/raw.pickle',
'../data/PsychExp/raw.pickle',
'../data/SCv1/raw.pickle',
'../data/SCv2-GEN/raw.pickle',
'../data/SE0714/raw.pickle',
#'../data/SE1604/raw.pickle', # Excluded due to Twitter's ToS
'../data/SS-Twitter/raw.pickle',
'../data/SS-Youtube/raw.pickle',
]
with open('../model/vocabulary.json', 'r') as f:
vocab = json.load(f)
results = []
for p in DATASET_PATHS:
coverage_result = [p]
print('Calculating coverage for {}'.format(p))
with open(p, 'rb') as f:
if IS_PYTHON2:
s = pickle.load(f)
else:
s = pickle.load(f, fix_imports=True)
# Decode data
try:
s['texts'] = [unicode(x) for x in s['texts']]
except UnicodeDecodeError:
s['texts'] = [x.decode('utf-8') for x in s['texts']]
# Own
st = SentenceTokenizer({}, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
[s['train_ind'],
s['val_ind'],
s['test_ind']],
extend_with=10000)
coverage_result.append(coverage(tests[2]))
# Last
st = SentenceTokenizer(vocab, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
[s['train_ind'],
s['val_ind'],
s['test_ind']],
extend_with=0)
coverage_result.append(coverage(tests[2]))
# Full
st = SentenceTokenizer(vocab, 30)
tests, dicts, _ = st.split_train_val_test(s['texts'], s['info'],
[s['train_ind'],
s['val_ind'],
s['test_ind']],
extend_with=10000)
coverage_result.append(coverage(tests[2]))
results.append(coverage_result)
with open(OUTPUT_PATH, 'wb') as csvfile:
writer = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
writer.writerow(['Dataset', 'Own', 'Last', 'Full'])
for i, row in enumerate(results):
try:
writer.writerow(row)
except:
print("Exception at row {}!".format(i))
print('Saved to {}'.format(OUTPUT_PATH))
|