Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
chengzhanhong committed Aug 7, 2024
1 parent dbd5c6e commit 7a481d3
Show file tree
Hide file tree
Showing 38 changed files with 986,299 additions and 14,119 deletions.
2 changes: 1 addition & 1 deletion .idea/forecastable.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 34 additions & 7 deletions analysis/anamoly_detection/anamoly detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np
import rpca.ealm
from datasets.datautils import *

plt.rcParams.update({'ytick.direction': 'in', 'xtick.direction': 'in'})

def drop_midnight(data, start_minute, end_minute):
"""
Expand Down Expand Up @@ -58,8 +58,32 @@ def get_station_abnormal_index(data, s=14, flow_type="inflow", neighbor=1, iqr_r
data_index = data_inflow.index[idx]

if fig:
plt.matshow(inflow, aspect='auto')
plt.plot(idx2[1],idx2[0], 'o',markerfacecolor='none', markeredgecolor='r', markersize=5)
fig0, ax0 = plt.subplots(figsize=(10, 5))
cax = ax0.matshow(inflow, aspect='auto', cmap='viridis')
ax0.xaxis.set_ticks_position('bottom')
# set x-grids on
n_weeks = inflow.shape[0]
day_length = inflow.shape[1]//7
ax0.set_xticks(np.arange(-0.5, 7*day_length+0.5, day_length))
ax0.set_xticklabels([])
ax0.set_yticks(np.arange(-0.5, n_weeks+0.5, 1))
ax0.set_yticklabels([])
x_ticks_midpoints = np.arange(0, 7 * day_length, day_length) + day_length / 2
ax0.set_xticks(x_ticks_midpoints, minor=True)
ax0.set_xticklabels(['Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri'], minor=True)
ax0.tick_params(which='minor', length=0)
ax0.grid(which='major', color='white', linestyle='--', linewidth=1, alpha=0.2)
y_ticks_midpoints = np.arange(0, n_weeks, 1)
ax0.set_yticks(y_ticks_midpoints, minor=True)
ax0.set_yticklabels(np.arange(1, n_weeks+1), minor=True)
ax0.set_ylabel('Weeks')
ax0.set_xlabel('Time in a week')
ax0.plot(idx2[1],idx2[0], 'o',markerfacecolor='none', markeredgecolor='#eb4034', markersize=6, markeredgewidth=1)
axc = fig0.colorbar(cax)
axc.set_label('Boardings / 15 min')
fig0.set_tight_layout(True)
fig0.savefig('inflow_anomaly.pdf')


fig, ax = plt.subplots(figsize=(35, 5))
outflow0 = data.loc[data.station == s, other_flow].values
Expand All @@ -69,6 +93,7 @@ def get_station_abnormal_index(data, s=14, flow_type="inflow", neighbor=1, iqr_r
ax.legend()
fig.set_tight_layout(True)
ax.set_xmargin(0)
return data_index, fig
return data_index


Expand Down Expand Up @@ -140,12 +165,14 @@ def read_data(args):
data = read_data(args)
data.head()
# save to csv
data.to_csv(data_info['data_path'] + 'data.csv',
index=False)
# data.to_csv(data_info['data_path'] + 'data.csv',
# index=False)
# data = pd.read_csv(data_info['data_path'] + 'data.csv')

#%% Using manual rules to detect abnormal data
# Guangzhou, s=14, 110, 117, 118, 18
# Seoul, s=18, 27
get_station_abnormal_index(data, s=27, neighbor=0, fig=True, flow_type='inflow', iqr_rate=1.5)
data.time_in_day.unique()
get_station_abnormal_index(data, s=14, neighbor=0, fig=True, flow_type='inflow', iqr_rate=1)
data.time_in_day.unique()
import seaborn as sns
sns.heatmap()
3,712 changes: 0 additions & 3,712 deletions data/HangzhouMetro/in_data.csv

This file was deleted.

3,712 changes: 0 additions & 3,712 deletions data/HangzhouMetro/out_data.csv

This file was deleted.

984,641 changes: 984,641 additions & 0 deletions data/SeoulMetro/data.csv

Large diffs are not rendered by default.

2,401 changes: 0 additions & 2,401 deletions data/SeoulMetro/seoul_in_data.csv

This file was deleted.

2,401 changes: 0 additions & 2,401 deletions data/SeoulMetro/seoul_out_data.csv

This file was deleted.

7 changes: 7 additions & 0 deletions data/SeoulMetro/stationID2stationName.txt
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,10 @@
270: 모란
271: 남위례
272: 충무로
273: 연신내
274: 신내
275: 삼산체육관
276: 신중동
277: 춘의
278: 까치울
279: 부천시청
132 changes: 114 additions & 18 deletions datasets/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
'n_layers' : 3,
'attn_mask_type' : 'time',
'pre_norm' : True,
'pe' : 'zeros', # intial values of positional encoding
'pe' : 'zeros', # intial values of positional encoding, 'rotary'
'learn_pe' : True, # learn positional encoding
'div_factor' : 1e4, # initial warmup learning rate : max_lr / div_factor
'final_div_factor' : 1, # final learning rate : initial_lr / final_div_factor
Expand All @@ -38,13 +38,14 @@
'start_minute': 360,
'end_minute': 1440,
't_resolution': '15T', # time resolution used in the model
'input_len': 72,
'target_len': 24, # 6 hours
'input_len': 144,
'target_len': 48,
'test_date_start': '2017-09-15', # equivalent to 2017-09-16
'test_date_end': '2017-10-01',
'val_date_start': '2017-09-08', # equivalent to 2017-09-09
'val_date_end': '2017-09-15',
'sample_interval':2, # The skip interval for fast testing
'sample_interval':1, # The skip interval for fast testing
'sample_divide':4,
'patch_len':1
},
'seoul': {'data_path': '../../data/SeoulMetro/',
Expand All @@ -53,14 +54,15 @@
'start_minute': 240,
'end_minute': 1381,
't_resolution': '60T', # time resolution used in the model
'input_len': 20,
'target_len': 6, # 6 hours
'input_len': 40,
'target_len': 12,
'test_date_start': '2023-06-09', # equivalent to 2023-06-10
'test_date_end': '2023-07-01',
'val_date_start': '2023-05-31', # equivalent to 2023-06-01
'val_date_end': '2023-06-09',
'sample_interval': 1, # The skip interval for fast testing
'patch_len':1
'patch_len':1,
'sample_divide':4,
},
}

Expand Down Expand Up @@ -136,7 +138,19 @@
'time_eb': True,
'seed': 0,
'max_run_time': 10,
}
},
'Nlinear':{
'station_eb': True,
'weekday_eb': True,
'time_eb': True,
'n_epochs': 20,
'patience': 5,
'max_lr': 0.001,
'seed': 0,
'max_run_time': 10,
'd_model': 128,
'dropout': 0.05,
}
}

head_infos = {'RMSE': {'standardization' : 'zscore', 'output_type': 'number', 'input_type': 'number'},
Expand All @@ -146,6 +160,7 @@
'TruncatedNormal': {'standardization' : 'zscore', 'output_type': 'number', 'input_type': 'number'},
'Normal': {'standardization' : 'zscore', 'output_type': 'number', 'input_type': 'number'},
'MixTruncatedNormal': {'standardization' : 'zscore', 'output_type': 'number', 'input_type': 'number', 'n_mixture': 2},
'MixTruncatedNormal2': {'standardization' : 'zscore', 'output_type': 'number', 'input_type': 'number', 'n_mixture': 2},
}

def reset_random_seeds(n=1):
Expand Down Expand Up @@ -272,6 +287,7 @@ def __init__(self, data, args, f_index=None):
self.bin_data = data.copy()
self.input_type = args.input_type # 'number' or "bins"
self.output_type = args.output_type # 'number' or "bins"
self.forecast_target = args.forecast_target

if args.output_type == 'bins' or args.input_type == 'bins':
self.bin_edges, self.num_per_bin, _ = get_quantile_edges(self.bin_data.loc[self.train_idx, ['inflow', 'outflow']].values.flatten(),
Expand All @@ -294,13 +310,17 @@ def __init__(self, data, args, f_index=None):
'DeepAR': MetroDataset_deepar,
'ABT_concat': MetroDataset_deepar, # ABT_concat uses the same data structure as DeepAR
'ABT_new': MetroDataset_deepar,
'Nlinear': MetroDataset_seq2seq,
}
child_dataset = datatype_dict[args.model]

# Get train, val, test Dataset
self.TrainDataset = child_dataset(self.raw_data, self.bin_data, args, np.intersect1d(self.train_idx, self.f_index))
self.ValDataset = child_dataset(self.raw_data, self.bin_data, args, np.intersect1d(self.val_idx, self.f_index))
self.TestDataset = child_dataset(self.raw_data, self.bin_data, args, np.intersect1d(self.test_idx, self.f_index))
self.TrainDataset = child_dataset(self.raw_data, self.bin_data, args,
np.intersect1d(self.train_idx, self.f_index), forecast_target=args.forecast_target)
self.ValDataset = child_dataset(self.raw_data, self.bin_data, args,
np.intersect1d(self.val_idx, self.f_index), forecast_target=args.forecast_target)
self.TestDataset = child_dataset(self.raw_data, self.bin_data, args,
np.intersect1d(self.test_idx, self.f_index), forecast_target=args.forecast_target)

def get_data_from_ts(self, time, station, method='target', mask_ratio=0.2):
"""Get input and target from a specific time and station."""
Expand All @@ -324,25 +344,36 @@ def get_data_from_ts(self, time, station, method='target', mask_ratio=0.2):


class MetroDataset_deepar(MetroDataset_base):
def __init__(self, raw_data, bin_data, args, f_index=None):
def __init__(self, raw_data, bin_data, args, f_index=None, forecast_target='both'):
    """Autoregressive (DeepAR-style) metro ridership dataset.

    Parameters
    ----------
    raw_data : DataFrame holding the raw ridership counts.
    bin_data : DataFrame holding the discretized (binned) ridership counts.
    args : configuration namespace; `input_type` / `output_type` select
        'number' (raw counts) or 'bins' (quantile-bin ids) per direction.
    f_index : optional array of valid sample start indices.
    forecast_target : 'inflow', 'outflow', or 'both' (default).
    """
    super().__init__(raw_data, args, f_index)
    # Keep both representations so __getitem__ can serve either input_type.
    self.raw_data = raw_data
    self.bin_data = bin_data
    self.forecast_target = forecast_target
    # 'number' feeds real counts; 'bins' feeds quantile-bin indices.
    self.input_type = args.input_type
    self.output_type = args.output_type

def __getitem__(self, index):
# the input data
if self.input_type == 'bins':
bin_data_piece = self.bin_data.iloc[self.f_index[index]:self.f_index[index] + self.sample_len, :]
bin_inflow = torch.from_numpy(bin_data_piece.inflow.values[:]).unfold(0, self.patch_len, self.patch_len)
bin_outflow = torch.from_numpy(bin_data_piece.outflow.values[:]).unfold(0, self.patch_len, self.patch_len)
input = torch.cat((bin_inflow[:-1, :], bin_outflow[:-1, :]), dim=1)
if self.forecast_target == 'inflow':
input = bin_inflow[:-1, :]
elif self.forecast_target == 'outflow':
input = bin_outflow[:-1, :]
else:
input = torch.cat((bin_inflow[:-1, :], bin_outflow[:-1, :]), dim=1)
elif self.input_type == 'number':
data_piece = self.raw_data.iloc[self.f_index[index]:self.f_index[index] + self.sample_len, :]
inflow = torch.from_numpy(data_piece.inflow.values[:]).unfold(0, self.patch_len, self.patch_len)
outflow = torch.from_numpy(data_piece.outflow.values[:]).unfold(0, self.patch_len, self.patch_len)
input = torch.cat((inflow[:-1, :], outflow[:-1, :]), dim=1)
if self.forecast_target == 'inflow':
input = inflow[:-1, :]
elif self.forecast_target == 'outflow':
input = outflow[:-1, :]
else:
input = torch.cat((inflow[:-1, :], outflow[:-1, :]), dim=1)
else:
raise ValueError('Invalid input type. input_type should be "number" or "bins"')

Expand All @@ -363,13 +394,23 @@ def __getitem__(self, index):
bin_data_piece = self.bin_data.iloc[self.f_index[index]:self.f_index[index] + self.sample_len, :]
bin_inflow = torch.from_numpy(bin_data_piece.inflow.values[:]).unfold(0, self.patch_len, self.patch_len)
bin_outflow = torch.from_numpy(bin_data_piece.outflow.values[:]).unfold(0, self.patch_len, self.patch_len)
target = torch.cat((bin_inflow[1:, :], bin_outflow[1:, :]), dim=1)
if self.forecast_target == 'inflow':
target = bin_inflow[1:, :]
elif self.forecast_target == 'outflow':
target = bin_outflow[1:, :]
else:
target = torch.cat((bin_inflow[1:, :], bin_outflow[1:, :]), dim=1)
elif self.output_type == 'number' or self.test_mode:
if "data_piece" not in locals():
data_piece = self.raw_data.iloc[self.f_index[index]:self.f_index[index] + self.sample_len, :]
inflow = torch.from_numpy(data_piece.inflow.values[:]).unfold(0, self.patch_len, self.patch_len)
outflow = torch.from_numpy(data_piece.outflow.values[:]).unfold(0, self.patch_len, self.patch_len)
target = torch.cat((inflow[1:, :], outflow[1:, :]), dim=1)
if self.forecast_target == 'inflow':
target = inflow[1:, :]
elif self.forecast_target == 'outflow':
target = outflow[1:, :]
else:
target = torch.cat((inflow[1:, :], outflow[1:, :]), dim=1)
else:
raise ValueError('Invalid output type. output_type should be "number" or "bins"')

Expand All @@ -378,7 +419,62 @@ def __getitem__(self, index):
data = data_piece if 'data_piece' in locals() else bin_data_piece
abnormal_in = torch.from_numpy(data.abnormal_in.values[:]).unfold(0, self.patch_len, self.patch_len)
abnormal_out = torch.from_numpy(data.abnormal_out.values[:]).unfold(0, self.patch_len, self.patch_len)
abnormal = torch.cat((abnormal_in[1:, :], abnormal_out[1:, :]), dim=1)
if self.forecast_target == 'inflow':
abnormal = abnormal_in[1:, :]
elif self.forecast_target == 'outflow':
abnormal = abnormal_out[1:, :]
else:
abnormal = torch.cat((abnormal_in[1:, :], abnormal_out[1:, :]), dim=1)
return tuple(inputs), target, abnormal
else:
return tuple(inputs), target
return tuple(inputs), target

class MetroDataset_seq2seq(MetroDataset_base):
    """Dataset for models that forecast all target steps jointly (seq2seq style).

    Unlike the autoregressive ``MetroDataset_deepar`` (shifted input/target
    pairs), each sample here is split into a fixed input window of
    ``input_len`` patches and a disjoint target window of ``target_len``
    patches taken from the end of the sample.
    """

    def __init__(self, raw_data, bin_data, args, f_index=None, forecast_target='both'):
        """Build the dataset.

        ``bin_data`` is accepted only to keep the constructor signature
        interchangeable with the other MetroDataset_* children; this dataset
        always reads from the raw (un-binned) counts.
        """
        super(MetroDataset_seq2seq, self).__init__(raw_data, args, f_index)
        self.raw_data = raw_data
        self.forecast_target = forecast_target

    def __getitem__(self, index):
        """Return ``(inputs, target)`` or ``(inputs, target, abnormal)`` in test mode.

        ``inputs`` is a tuple: the flow input window followed by the enabled
        categorical features (station / weekday / time-of-day) for the target
        window. ``target`` (and ``abnormal``) cover the last ``target_len``
        patches; for ``forecast_target='both'`` inflow and outflow patches are
        concatenated along dim 1.
        """
        # Hoist the start index instead of re-indexing f_index repeatedly.
        start = self.f_index[index]
        data_piece = self.raw_data.iloc[start:start + self.sample_len, :]
        # Non-overlapping patches of length patch_len along the time axis.
        inflow = torch.from_numpy(data_piece.inflow.values[:]).unfold(0, self.patch_len, self.patch_len)
        outflow = torch.from_numpy(data_piece.outflow.values[:]).unfold(0, self.patch_len, self.patch_len)
        if self.forecast_target == 'inflow':
            input = inflow[:self.input_len, :]
            target = inflow[-self.target_len:, :]
        elif self.forecast_target == 'outflow':
            input = outflow[:self.input_len, :]
            target = outflow[-self.target_len:, :]
        else:  # 'both': concatenate inflow and outflow along the patch dimension
            input = torch.cat((inflow[:self.input_len, :], outflow[:self.input_len, :]), dim=1)
            target = torch.cat((inflow[-self.target_len:, :], outflow[-self.target_len:, :]), dim=1)

        # Categorical features, one row per patch (taken from each patch's last step).
        inputs = [input]
        features = torch.from_numpy(
            data_piece[['station', 'weekday', 'time_in_day']].values[self.patch_len - 1::self.patch_len])
        # NOTE(review): only the target-window slice of the features is
        # appended — presumably the model conditions the forecast on the
        # target timestamps; confirm against the consuming model.
        if self.station_eb:
            inputs.append(features[-self.target_len:, 0])
        if self.weekday_eb:
            inputs.append(features[-self.target_len:, 1])
        if self.time_eb:
            inputs.append(features[-self.target_len:, 2])

        if not self.test_mode:
            return tuple(inputs), target

        # Test mode additionally returns the anomaly flags for the target window.
        abnormal_in = torch.from_numpy(data_piece.abnormal_in.values[:]).unfold(0, self.patch_len, self.patch_len)
        abnormal_out = torch.from_numpy(data_piece.abnormal_out.values[:]).unfold(0, self.patch_len, self.patch_len)
        if self.forecast_target == 'inflow':
            abnormal = abnormal_in[-self.target_len:, :]
        elif self.forecast_target == 'outflow':
            abnormal = abnormal_out[-self.target_len:, :]
        else:
            abnormal = torch.cat((abnormal_in[-self.target_len:, :], abnormal_out[-self.target_len:, :]), dim=1)
        return tuple(inputs), target, abnormal



Loading

0 comments on commit 7a481d3

Please sign in to comment.