Skip to content

Commit 547a6e2

Browse files
committed
Added classify function and predicted results.
1 parent fc63dd2 commit 547a6e2

File tree

2 files changed

+4517
-9
lines changed

2 files changed

+4517
-9
lines changed

classify.py

+33-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import csv
22
import ast
3+
import time
34
import pandas as pd
45
from sklearn import preprocessing
56
from orderedset import OrderedSet
@@ -61,20 +62,44 @@ def gridify(filename, (min_lon, min_lat), cellSide): #cellSide in km
6162
else: #elif len(row) == 2
6263
outputWriter.writerow([tripID, ';'.join([x for x in cellsList])])
6364

64-
def classify(classifier):
65-
df=pd.read_csv('datasets/tripsClean_grid.csv',sep='!')
65+
def cross_validate(classifier, data_path='datasets/tripsClean_grid_v2.csv', folds=10):
    """Print and return the cross-validated accuracy of `classifier`.

    The '!'-separated file at `data_path` is loaded, its 'JourneyPatternID'
    column is label-encoded as the target, and its 'Trajectory' column is
    used as the input text. Each trajectory is split on ';' and hashed as
    unigrams and bigrams of the resulting tokens before reaching the
    classifier.

    Args:
        classifier: estimator placed as the final step of the pipeline.
        data_path: training file to read; the default is the path this
            function previously hard-coded.
        folds: value passed as `cv` to cross_val_score (default 10, the
            previously hard-coded value).

    Returns:
        The per-fold scores produced by cross_val_score.
    """
    trips = pd.read_csv(data_path, sep='!')

    # Encode the journey pattern IDs as integer class labels.
    le = preprocessing.LabelEncoder()
    Y_train = le.fit_transform(trips['JourneyPatternID'])
    X_train = trips['Trajectory']

    # Trajectories are ';'-separated tokens, so split on ';' rather than
    # using the vectorizer's default word tokenisation.
    vectorizer = HashingVectorizer(ngram_range=(1,2), tokenizer=lambda x: x.split(';'))
    pipeline = Pipeline([
        ('vect', vectorizer),
        ('classifier', classifier)
    ])

    scores = cross_val_score(pipeline, X_train, Y_train, cv=folds)
    # Report the mean with a +/- two-standard-deviation spread across folds.
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    return scores
7778

79+
def classify(classifier,
             train_path='datasets/tripsClean_grid.csv',
             test_path='datasets/test_set_grid.csv',
             output_path='datasets/testSet_JourneyPatternIDs.csv'):
    """Train `classifier` on the training trips and write test predictions.

    The '!'-separated training file supplies the 'Trajectory' column as
    input and the 'JourneyPatternID' column as the target. Each trajectory
    is split on ';' and hashed as unigrams and bigrams of the resulting
    tokens. The fitted pipeline then predicts a journey pattern for every
    'Trajectory' in the test file, and the predictions are written to a
    tab-separated file with the header
    'Test_Trip_ID', 'Predicted_JourneyPatternID'.

    Args:
        classifier: estimator placed as the final step of the pipeline.
        train_path: training file; defaults to the previously hard-coded path.
        test_path: test file; defaults to the previously hard-coded path.
        output_path: destination of the predictions file; defaults to the
            previously hard-coded path.
    """
    train_df = pd.read_csv(train_path, sep='!')

    # Encode the journey pattern IDs as integer class labels; the encoder is
    # kept so predictions can be mapped back to the original IDs.
    le = preprocessing.LabelEncoder()
    Y_train = le.fit_transform(train_df['JourneyPatternID'])
    X_train = train_df['Trajectory']

    # Trajectories are ';'-separated tokens, so split on ';' rather than
    # using the vectorizer's default word tokenisation.
    vectorizer = HashingVectorizer(ngram_range=(1,2), tokenizer=lambda x: x.split(';'))
    pipeline = Pipeline([
        ('vect', vectorizer),
        ('classifier', classifier)
    ])
    pipeline.fit(X_train, Y_train)

    test_df = pd.read_csv(test_path, sep='!')
    X_test = test_df['Trajectory']
    predicted_labels = le.inverse_transform(pipeline.predict(X_test))

    with open(output_path, 'w') as outFile:
        outputWriter = csv.writer(outFile, delimiter='\t')
        outputWriter.writerow(['Test_Trip_ID', 'Predicted_JourneyPatternID'])
        # Test trips are identified by their 0-based row position in the
        # test file.
        for trip_id, label in enumerate(predicted_labels):
            outputWriter.writerow([trip_id, label])
101+
102+
78103

79104
def regridify(filename):
80105
with open(filename, 'r') as inFile, open(filename[:-4]+'_v2.csv', 'w') as outFile:
@@ -113,12 +138,11 @@ def regridify(filename):
113138

114139
# Driver: grid the raw training and test trip files, then fit a classifier
# and write its predictions for the test set.

# Grid cell side in km. One value is shared by both files so the training
# and test sets are gridded with the same cell size and origin.
CELL_SIDE_KM = 0.2

# The grid origin is taken from the training data; the fixed origin below is
# kept for reference.
# min_lon, min_lat = (-6.61505, 53.07045)
min_lon, min_lat = find_min_border('datasets/tripsClean.csv')
for raw_file in ('datasets/tripsClean.csv', 'datasets/test_set.csv'):
    gridify(raw_file, (min_lon, min_lat), CELL_SIDE_KM)
# regridify('datasets/tripsClean_grid.csv')
# regridify('datasets/test_set_grid.csv')

# Alternative classifiers are left commented out for quick switching.
classify(classifier = KNeighborsClassifier(n_neighbors=1))
# classify(classifier = LogisticRegression())
# classify(classifier = RandomForestClassifier(n_estimators = 10, random_state = 1, n_jobs=-1))
124-

0 commit comments

Comments
 (0)