SVM.py
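# Trains one regression pipeline per point of interest (POI) and predicts
# taxi pickup counts for each test sample, writing one "<id> <count>" line
# per sample to OUTPUT_FILENAME. Targets Python 2 and a pre-0.18 scikit-learn.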
import time
from sklearn import svm, preprocessing, gaussian_process, neighbors, cross_decomposition, ensemble
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_regression, SelectKBest
from ParseData.Database import DB
from Lib import loadData, loadTestDataset, getPointsOfInterest, generateAllFeatures, TEST_DATASET_FINAL_FILENAME, TEST_DATASET_INITIAL_FILENAME

OUTPUT_FILENAME = 'out.txt'
FINAL = False  # True selects the final test dataset, False the initial one
K_BEST_FEATURES = 10  # number of features kept by the SelectKBest selectors below

# Various machine-learning pipelines. Each generator takes the training matrix
# x and returns an unfitted sklearn Pipeline; Pipeline.fit() (re)fits every
# step, so the scalers below do not need to be pre-fit on x.

def generateSVRPipeline(x):
    selector = SelectKBest(f_regression, k=K_BEST_FEATURES)
    scaler = preprocessing.StandardScaler()
    reg = svm.SVR(kernel='rbf')
    return Pipeline([('selector', selector), ('scaler', scaler), ('svr', reg)])
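
# The SVR above runs with sklearn's default C/gamma/epsilon. A hypothetical
# tuning sketch using the era-appropriate grid-search API (the parameter grid
# is illustrative, not taken from the original experiments):
#
#     from sklearn.grid_search import GridSearchCV
#     search = GridSearchCV(generateSVRPipeline(x),
#                           {'svr__C': [0.1, 1, 10], 'svr__gamma': [0.01, 0.1]},
#                           cv=5)
#     search.fit(x, y)
#     print search.best_params_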

def generateGaussianProcessPipeline(x):
    selector = SelectKBest(f_regression, k=K_BEST_FEATURES)
    scaler = preprocessing.StandardScaler()
    # GaussianProcess is the pre-0.18 sklearn API (later replaced by
    # GaussianProcessRegressor).
    gp = gaussian_process.GaussianProcess()
    return Pipeline([('selector', selector), ('scaler', scaler), ('gp', gp)])

def generateKNearestNeighborsPipeline(x):
    selector = SelectKBest(f_regression, k=K_BEST_FEATURES)
    scaler = preprocessing.StandardScaler()
    knn = neighbors.KNeighborsRegressor(n_neighbors=25, weights='uniform')
    return Pipeline([('selector', selector), ('scaler', scaler), ('knn', knn)])

def generateRadiusNearestNeighborsPipeline(x):
    selector = SelectKBest(f_regression, k=K_BEST_FEATURES)
    scaler = preprocessing.StandardScaler()
    rnn = neighbors.RadiusNeighborsRegressor(radius=1.5, weights='uniform')
    return Pipeline([('selector', selector), ('scaler', scaler), ('rnn', rnn)])

def generatePLSRegressionPipeline(x):
    scaler = preprocessing.StandardScaler()
    pls = cross_decomposition.PLSRegression(n_components=2)
    return Pipeline([('scaler', scaler), ('pls', pls)])

def generateRandomForestPipeline(x):
    scaler = preprocessing.StandardScaler()
    # min_samples_split=1 is only accepted by older sklearn releases; newer
    # versions require min_samples_split >= 2.
    rf = ensemble.RandomForestRegressor(n_estimators=100, min_samples_split=1, n_jobs=-1)
    return Pipeline([('scaler', scaler), ('rf', rf)])

def generateExtraTreesPipeline(x):
    scaler = preprocessing.StandardScaler()
    et = ensemble.ExtraTreesRegressor(n_estimators=100, n_jobs=-1)
    return Pipeline([('scaler', scaler), ('et', et)])

def generateAdaBoostPipeline(x):
    selector = SelectKBest(f_regression, k=K_BEST_FEATURES)
    scaler = preprocessing.StandardScaler()
    base_estimator = DecisionTreeRegressor(max_depth=3, min_samples_leaf=1)
    ada = ensemble.AdaBoostRegressor(base_estimator=base_estimator, n_estimators=50)
    return Pipeline([('selector', selector), ('scaler', scaler), ('ada', ada)])

def generateGradientBoostingPipeline(x):
    scaler = preprocessing.StandardScaler()
    grb = ensemble.GradientBoostingRegressor(n_estimators=100, max_depth=5)
    return Pipeline([('scaler', scaler), ('grb', grb)])

# Pipeline generator used for the run below; swap in any generator above.
GENERATE_PIPELINE = generateRandomForestPipeline

# Fitting and predicting

def fitPipeline(db, latitude, longitude, generatePipeline):
    print 'Loading Data'
    x, y = loadData(db, latitude, longitude, generateAllFeatures)
    print 'Generating pipeline'
    pipeline = generatePipeline(x)
    print 'Training model'
    start = time.clock()
    pipeline.fit(x, y)
    print 'Total Training time:', time.clock() - start
    return pipeline
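
# A quick way to compare the generators above is cross-validation. A minimal,
# hypothetical sketch using the era-appropriate sklearn.cross_validation API
# (not part of the original flow; scoring sign conventions vary by version):
#
#     from sklearn.cross_validation import cross_val_score
#     x, y = loadData(db, latitude, longitude, generateAllFeatures)
#     scores = cross_val_score(GENERATE_PIPELINE(x), x, y,
#                              scoring='mean_squared_error', cv=5)
#     print 'Mean CV MSE:', -scores.mean()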

def predict(db, pipeline, latitude, longitude, testDataset):
    print 'Begin Prediction'
    print 'Generating input vectors'
    # One feature vector per test sample at this POI, keyed by sample id.
    inputVectors = {sample['id']: generateAllFeatures(db, latitude, longitude, sample['start'])
                    for sample in testDataset
                    if sample['lat'] == latitude and sample['long'] == longitude}
    print 'Predicting %i samples' % len(inputVectors)
    try:
        # keys() and values() of an unmodified dict iterate in matching order.
        predictions = pipeline.predict(inputVectors.values())
    except ValueError:
        # E.g. no test samples fall at this POI; fall back to all zeros.
        predictions = [0] * len(inputVectors)
    # Ensure we don't predict any negative values, counting how many we clamp.
    finalPredictions = []
    numNegatives = 0
    for prediction in predictions:
        finalPrediction = 0
        if prediction > 0:
            finalPrediction = prediction
        elif prediction < 0:
            numNegatives += 1
        finalPredictions.append(finalPrediction)
    return zip(inputVectors.keys(), finalPredictions), numNegatives

if __name__ == '__main__':
    test_dataset_filename = TEST_DATASET_FINAL_FILENAME if FINAL else TEST_DATASET_INITIAL_FILENAME
    testDataset = loadTestDataset(test_dataset_filename)
    predictions = []
    start = time.clock()
    numNegatives = 0
    with DB() as db:
        # Train and predict separately for every point of interest.
        for POI in getPointsOfInterest():
            print 'POI', POI
            pipeline = fitPipeline(db, POI['LAT'], POI['LONG'], GENERATE_PIPELINE)
            POIPredictions, POINegatives = predict(db, pipeline, POI['LAT'], POI['LONG'], testDataset)
            predictions.extend(POIPredictions)
            numNegatives += POINegatives
    print 'Predicted a negative number of taxi pickups %i times' % numNegatives
    print 'All predictions took %s seconds' % (time.clock() - start)
    print 'Writing output'
    # One "<id> <prediction>" line per sample, sorted by id; idList tracks
    # coverage and assumes ids run from 0 to len(testDataset) - 1.
    idList = [False] * len(testDataset)
    outputList = []
    for locID, prediction in predictions:
        outputList.append((locID, '%i %i' % (locID, prediction)))
        idList[locID] = True
    outputList.sort()
    with open(OUTPUT_FILENAME, 'w') as f:
        f.write('\n'.join([out[1] for out in outputList]))
    if not all(idList):
        print 'ERROR: MISSING PREDICTIONS'