@@ -1,5 +1,6 @@
 import csv
 import ast
+import time
 import pandas as pd
 from sklearn import preprocessing
 from orderedset import OrderedSet
@@ -61,20 +62,44 @@ def gridify(filename, (min_lon, min_lat), cellSide): #cellSide in km
         else: #elif len(row) == 2
             outputWriter.writerow([tripID, ';'.join([x for x in cellsList])])

-def classify(classifier):
-    df=pd.read_csv('datasets/tripsClean_grid.csv',sep='!')
+def cross_validate(classifier):
+    df=pd.read_csv('datasets/tripsClean_grid_v2.csv',sep='!')
     le = preprocessing.LabelEncoder()
     le.fit(df['JourneyPatternID'])
     Y_train=le.transform(df['JourneyPatternID'])
     X_train=df['Trajectory']
-    vectorizer = HashingVectorizer(ngram_range = (2,2), tokenizer=lambda x: x.split(';'))
+    vectorizer = HashingVectorizer(ngram_range=(1,2), tokenizer=lambda x: x.split(';'))
     pipeline = Pipeline([
         ('vect', vectorizer),
         ('classifier', classifier)
     ])
     scores = cross_val_score(pipeline, X_train, Y_train, cv=10)
     print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

+def classify(classifier):
+    df=pd.read_csv('datasets/tripsClean_grid.csv',sep='!')
+    le = preprocessing.LabelEncoder()
+    le.fit(df['JourneyPatternID'])
+    Y_train=le.transform(df['JourneyPatternID'])
+    X_train=df['Trajectory']
+    vectorizer = HashingVectorizer(ngram_range=(1,2), tokenizer=lambda x: x.split(';'))
+    pipeline = Pipeline([
+        ('vect', vectorizer),
+        ('classifier', classifier)
+    ])
+    pipeline.fit(X_train, Y_train)
+    df=pd.read_csv('datasets/test_set_grid.csv',sep='!')
+    X_test = df['Trajectory']
+    predicted_labels = le.inverse_transform(pipeline.predict(X_test))
+    with open('datasets/testSet_JourneyPatternIDs.csv', 'w') as outFile:
+        outputWriter = csv.writer(outFile, delimiter='\t')
+        outputWriter.writerow(['Test_Trip_ID', 'Predicted_JourneyPatternID'])
+        trip_id = 0
+        for label in predicted_labels:
+            outputWriter.writerow([trip_id, label])
+            trip_id += 1
+
+

 def regridify(filename):
     with open(filename, 'r') as inFile, open(filename[:-4]+'_v2.csv', 'w') as outFile:
@@ -113,12 +138,11 @@ def regridify(filename):

 # min_lon, min_lat = (-6.61505, 53.07045)
 min_lon, min_lat = find_min_border('datasets/tripsClean.csv')
-print 'min: ', (min_lon, min_lat)
-# gridify('datasets/tripsClean.csv',(min_lon, min_lat), float(sys.argv[1]))
-gridify('datasets/tripsClean.csv',(min_lon, min_lat), 0.3)
-gridify('datasets/test_set.csv',(min_lon, min_lat), 0.3)
+gridify('datasets/tripsClean.csv',(min_lon, min_lat), 0.2)
+gridify('datasets/test_set.csv',(min_lon, min_lat), 0.2)
+# regridify('datasets/tripsClean_grid.csv')
+# regridify('datasets/test_set_grid.csv')

-classify(classifier = KNeighborsClassifier())
+classify(classifier = KNeighborsClassifier(n_neighbors=1))
 # classify(classifier = LogisticRegression())
 # classify(classifier = RandomForestClassifier(n_estimators = 10, random_state = 1, n_jobs=-1))
-
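As a side note on the classify() flow added in this commit, a minimal sketch of the LabelEncoder round trip it relies on: string JourneyPatternIDs are encoded to integers for training, and predictions are decoded back before being written to the TSV. The ID values here are made up for illustration.

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(['00010001', '00010002', '00150001'])  # hypothetical JourneyPatternIDs
y = le.transform(['00010002', '00010001'])    # -> array([1, 0])
print(le.inverse_transform(y))                # -> ['00010002' '00010001']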