-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutil.py
149 lines (114 loc) · 3.54 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
The functions in this module transform and read the input files,
and write out the submission file.
project: Kaggle WISE 2014 Greek Media competition
author: David Thaler
"""
import numpy as np
import pandas as pd
import gzip
import cPickle
import re
import os.path
from sklearn.datasets import load_svmlight_file
import pdb
# Project root; '~' is expanded to the current user's home directory.
BASE = os.path.expanduser('~/Documents/Kaggle/GreekMedia/')
# Directory holding the input files and the pickled, gzipped rewrites of them.
DATA = os.path.join(BASE, 'data')
# Directory that submission files are written into.
SUBMIT = os.path.join(BASE, 'submissions')
# Sample submission file; writeSubmission reads it as the template to fill in.
SAMPLE = os.path.join(DATA, 'sampleSubmission.csv')
# Output path template; the %d is filled with the submission number.
SUBMISSION_PATH = os.path.join(SUBMIT, 'submission%d.csv.gz')
# Gzipped pickles written by rewrite_test / rewrite_train and read back by
# loadTest / loadTrain.
XTEST = os.path.join(DATA, 'xtest.pkl.gz')
XTRAIN = os.path.join(DATA, 'xtrain.pkl.gz')
YTRAIN = os.path.join(DATA, 'ytrain.pkl.gz')
# Row count of the training label matrix (presumably the number of
# instances in train.libsvm -- confirm against the data).
NROW_TRAIN = 64857
# Column count of the label matrix, i.e. the number of distinct labels.
NLABELS = 203
# Passed as n_features to load_svmlight_file for both train and test, so the
# two feature matrices get the same number of columns.
NFEATURES= 301561
def rewrite_train():
    """
    Rewrites train.libsvm into a gzipped, pickled sparse matrix for the
    features, and a gzipped, pickled numpy (0-1) array for the labels.
    Run this once.

    Both pickles are written with the highest available pickle protocol.
    The default protocol on Python 2 is the ASCII protocol 0, which is much
    bulkier and slower for large numeric arrays. cPickle.load detects the
    protocol on its own, so the files still load with loadTrain unchanged.

    Params: none

    Returns:
        nothing, but writes out the transformed input files at data/
    """
    inpath = os.path.join(DATA, 'train.libsvm')
    # multilabel=True makes the loader return one collection of labels per
    # instance instead of a flat label vector.
    x, ylist = load_svmlight_file(inpath,
                                  n_features=NFEATURES,
                                  multilabel=True,
                                  zero_based=False)
    with gzip.open(XTRAIN, 'wb') as fx:
        cPickle.dump(x, fx, cPickle.HIGHEST_PROTOCOL)

    y = list2matrix(ylist)
    with gzip.open(YTRAIN, 'wb') as fy:
        cPickle.dump(y, fy, cPickle.HIGHEST_PROTOCOL)
def list2matrix(ylist, nlabels=None):
    """
    Rewrites a list-of-lists of labels for the multilabel case into a
    0-1 label matrix. The matrix is a numpy array (dense data type),
    but fairly sparse in practice.

    Params:
        ylist - a sequence with one collection of 1-based labels per
            instance. Labels may be ints or integral floats (the svmlight
            loader yields floats in multilabel mode); they are converted
            with int() before being used as column indices.
        nlabels - number of columns (classes) in the result. Defaults to
            the module-level NLABELS when omitted.

    Returns:
        a 0-1 numpy array of size (# instances) x (# classes), where
        (# instances) is len(ylist)

    Raises:
        IndexError - if a label falls outside 1..nlabels. Without this
            check a label of 0 would wrap around to the last column.
    """
    if nlabels is None:
        nlabels = NLABELS
    # Size the matrix from the input itself so the function also works for
    # subsets of the training data.
    y = np.zeros((len(ylist), nlabels))
    for row, labels in enumerate(ylist):
        for label in labels:
            col = int(label) - 1  # labels are 1-based, columns 0-based
            if not 0 <= col < nlabels:
                raise IndexError('label %r in row %d is outside 1..%d'
                                 % (label, row, nlabels))
            y[row, col] = 1
    return y
def rewrite_test():
    """
    Rewrites the test set features from test.libsvm, into a gzipped,
    pickled sparse matrix.

    The pickle is written with the highest available pickle protocol. The
    default protocol on Python 2 is the ASCII protocol 0, which is bulkier
    and slower for large numeric data. cPickle.load detects the protocol on
    its own, so the file still loads with loadTest unchanged.

    Params: none

    Returns:
        nothing, but writes out the transformed input files at data/
    """
    inpath = os.path.join(DATA, 'test.libsvm')
    # The loader also returns a label vector; only the features are kept.
    x, _ = load_svmlight_file(inpath, n_features=NFEATURES, zero_based=False)
    with gzip.open(XTEST, 'wb') as f:
        cPickle.dump(x, f, cPickle.HIGHEST_PROTOCOL)
def loadTrain():
    """
    Reads the training features and labels back from their gzipped pickle
    files (XTRAIN and YTRAIN).

    Params: none

    Returns:
        2-tuple of (training set features, training set labels)
    """
    # NOTE: unpickling executes whatever the file contains; only point these
    # paths at files produced by rewrite_train.
    with gzip.open(XTRAIN) as feature_file:
        features = cPickle.load(feature_file)
    with gzip.open(YTRAIN) as label_file:
        labels = cPickle.load(label_file)
    return features, labels
def loadTest():
    """
    Reads the test features back from their gzipped pickle file (XTEST).

    Params: none

    Returns:
        the test features
    """
    # NOTE: unpickling executes whatever the file contains; only point this
    # path at a file produced by rewrite_test.
    with gzip.open(XTEST) as feature_file:
        features = cPickle.load(feature_file)
    return features
def _format_label_rows(pred):
    """
    Turns each row of a 0-1 prediction matrix into a string of the 1-based
    column numbers of its nonzero entries, separated by single spaces.
    A row with no nonzero entries becomes the empty string.
    """
    rows = []
    for k in range(pred.shape[0]):
        labels = pred[k].nonzero()[0] + 1  # columns are 0-based, labels 1-based
        rows.append(' '.join('%d' % label for label in labels))
    return rows


def writeSubmission(submit_num, pred):
    """
    Writes out the predictions in the correct form for submission to Kaggle.

    The sample submission file is used as the template: its Labels column
    is replaced by the predicted labels and every other column is kept.
    The label strings are built in a plain list and assigned to the column
    in one step. Writing into ss.Labels one row at a time is chained
    assignment, which pandas does not guarantee to modify the frame, and
    it is slow.

    Params:
        submit_num - the submission is named submission<submit_num>.csv.gz
        pred - a 0-1 numpy array of predictions of dimension
            (# test instances) x (# classes)

    Returns:
        nothing, but writes submission file into submissions/

    Raises:
        ValueError - if pred does not have exactly one row per row of the
            sample submission file
    """
    ss = pd.read_csv(SAMPLE)
    if pred.shape[0] != len(ss):
        raise ValueError('pred has %d rows but the sample submission has %d'
                         % (pred.shape[0], len(ss)))
    ss['Labels'] = _format_label_rows(pred)
    path = SUBMISSION_PATH % submit_num
    with gzip.open(path, 'wb') as f:
        ss.to_csv(f, index=False)