-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil_functions.py
109 lines (87 loc) · 3.63 KB
/
util_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import numpy as np
import h5py
import glob
from collections import OrderedDict
# --- Global settings -------------------------------------------------------
SEED = 1
MAX_PKT = 100      # number of packets
TIME_WINDOW = 10
TRAIN_RATE = 0.80  # size of the training set wrt the total number of samples

# Protocol names; powers_of_two holds one distinct bit value per entry
# (1, 2, 4, ...), in the same order as the list.
protocols = [
    'arp', 'data', 'dns', 'ftp', 'http', 'icmp',
    'ip', 'ssdp', 'ssl', 'telnet', 'tcp', 'udp',
]
powers_of_two = np.array([1 << bit for bit in range(len(protocols))])

# Feature name -> [min, max] value bounds, in column order.
# The bounds are deliberately lists (not tuples): static_min_max() overwrites
# the upper bound of 'timestamp' in place.
feature_list = OrderedDict([
    ('timestamp', [0, 10]),
    ('packet_length', [0, 2 ** 16]),
    ('highest_layer', [0, 2 ** 32]),
    ('IP_flags', [0, 2 ** 16]),
    ('protocols', [0, 2 ** len(protocols)]),
    ('TCP_length', [0, 2 ** 16]),
    ('TCP_ack', [0, 2 ** 32]),
    ('TCP_flags', [0, 2 ** 16]),
    ('TCP_window_size', [0, 2 ** 16]),
    ('UDP_length', [0, 2 ** 16]),
    ('ICMP_type', [0, 2 ** 8]),
])
def load_dataset(path):
    """Load the feature and label arrays from the first HDF5 file matching *path*.

    Args:
        path: Glob pattern (or plain path) of the HDF5 file. Only the first
            match returned by ``glob.glob`` is read.

    Returns:
        A ``(X_train, Y_train)`` tuple. ``X_train`` is the ``set_x`` dataset
        reshaped from 3-D to 4-D by appending a trailing axis of size 1;
        ``Y_train`` is the ``set_y`` dataset, unchanged.

    Raises:
        IndexError: If no file matches *path*.
    """
    print("Loading dataset from:", path)
    matches = glob.glob(path)
    if not matches:
        # Same exception type the bare ``glob.glob(path)[0]`` used to raise,
        # but with a message that names the offending pattern.
        raise IndexError("no dataset file matches {!r}".format(path))

    # The context manager closes the file handle; both datasets are copied
    # into memory before the file is closed, so the arrays stay valid.
    with h5py.File(matches[0], "r") as dataset:
        set_x_orig = np.array(dataset["set_x"][:])  # features
        set_y_orig = np.array(dataset["set_y"][:])  # labels

    X_train = np.reshape(
        set_x_orig,
        (set_x_orig.shape[0], set_x_orig.shape[1], set_x_orig.shape[2], 1),
    )
    Y_train = set_y_orig
    return X_train, Y_train
def scale_linear_bycolumn(rawpoints, mins, maxs, high=1.0, low=0.0):
    """Linearly rescale *rawpoints* so that ``mins`` maps to *low* and ``maxs`` to *high*.

    Args:
        rawpoints: Values to rescale.
        mins: Lower bound(s) of the input range.
        maxs: Upper bound(s) of the input range.
        high: Output value assigned to inputs equal to ``maxs``.
        low: Output value assigned to inputs equal to ``mins``.

    Returns:
        The rescaled values. No guard is applied where ``maxs == mins``; the
        division by zero is left to the operand types (NaN/inf for NumPy arrays).
    """
    span = maxs - mins
    distance_from_top = (high - low) * (maxs - rawpoints)
    return high - (distance_from_top / span)
def count_packets_in_dataset(X_list):
    """Count, for each array in *X_list*, the rows whose values sum to non-zero.

    Each array is summed along axis 2 and the non-zero entries of the result
    are counted, so a row only counts when its sum differs from zero.

    Args:
        X_list: Iterable of arrays with at least three dimensions.

    Returns:
        A list with one count per input array, in input order.
    """
    return [np.count_nonzero(dataset.sum(axis=2)) for dataset in X_list]
def all_same(items):
    """Return True when every element of *items* equals the first one.

    An empty *items* yields True, because there is nothing to compare.
    """
    for candidate in items:
        # items[0] is only read once there is at least one element,
        # so an empty sequence never triggers an IndexError.
        if not candidate == items[0]:
            return False
    return True
def static_min_max(time_window=10):
    """Return the fixed per-feature minimum and maximum arrays from ``feature_list``.

    Side effect: the upper bound of the ``'timestamp'`` entry in the
    module-level ``feature_list`` is overwritten with *time_window* before
    the arrays are built.

    Args:
        time_window: Value stored as the maximum of the ``'timestamp'`` feature.

    Returns:
        A ``(min_array, max_array)`` tuple of 1-D float arrays with one entry
        per feature, in ``feature_list`` order.
    """
    feature_list['timestamp'][1] = time_window

    n_features = len(feature_list)
    min_array = np.zeros(n_features)
    max_array = np.zeros(n_features)
    for idx, bounds in enumerate(feature_list.values()):
        min_array[idx] = bounds[0]
        max_array[idx] = bounds[1]
    return min_array, max_array
def find_min_max(X, time_window=10):
    """Compute per-column minimum and maximum over all samples in *X*.

    The running maximum starts at zero and the running minimum at +inf, so
    the returned maximum is never below zero. After the scan, column 0 is
    forced to the range ``[0, time_window]`` regardless of the data.

    Args:
        X: Non-empty sequence of 2-D arrays that share the same number of
            columns (the column count is read from ``X[0]``).
        time_window: Maximum assigned to column 0.

    Returns:
        A ``(min_array, max_array)`` tuple of 1-D arrays, one entry per column.
    """
    n_columns = X[0].shape[1]
    upper = np.zeros((1, n_columns))
    lower = np.full((1, n_columns), np.inf)

    for sample in X:
        # Stack the running bound on top of the sample and reduce over rows;
        # the result is 1-D, which vstack accepts again on the next pass.
        upper = np.vstack((upper, sample)).max(axis=0)
        lower = np.vstack((lower, sample)).min(axis=0)

    # Column 0 is normalized against the fixed time window, not the data.
    upper[0] = time_window
    lower[0] = 0
    return lower, upper
def normalize_and_padding(X, mins, maxs, max_flow_len, padding=True):
    """Scale every sample into [0, 1] and optionally zero-pad it to a fixed length.

    Args:
        X: Iterable of 2-D arrays (rows x features).
        mins: Per-feature minimums passed to ``scale_linear_bycolumn``.
        maxs: Per-feature maximums passed to ``scale_linear_bycolumn``.
        max_flow_len: Maximum number of rows kept per sample; longer samples
            are truncated, and shorter ones are padded when *padding* is on.
        padding: When equal to ``True``, append all-zero rows until the
            sample has ``max_flow_len`` rows.

    Returns:
        A list with one processed array per input sample, in input order.
    """
    processed = []
    for flow in X:
        # Oversized samples are cut down to the first max_flow_len rows.
        if flow.shape[0] > max_flow_len:
            flow = flow[:max_flow_len, ...]
        n_rows = flow.shape[0]

        scaled = scale_linear_bycolumn(flow, mins, maxs, high=1.0, low=0.0)
        # In-place cleanup: NaN becomes 0 and +/-inf become large finite
        # numbers (both can appear where a feature has maxs == mins).
        np.nan_to_num(scaled, copy=False)

        # Kept as an equality test so the accepted values stay unchanged.
        if padding == True:  # noqa: E712
            missing_rows = max_flow_len - n_rows
            scaled = np.pad(
                scaled,
                ((0, missing_rows), (0, 0)),
                'constant',
                constant_values=(0, 0),
            )
        processed.append(scaled)
    return processed
def padding(X, max_flow_len):
    """Zero-pad every 2-D sample in *X* at the bottom up to ``max_flow_len`` rows.

    Args:
        X: Iterable of 2-D arrays.
        max_flow_len: Target number of rows. Samples are not truncated; a
            sample with more rows than this makes ``np.pad`` raise
            ``ValueError`` because the pad width would be negative.

    Returns:
        A list of padded arrays, in input order.
    """
    return [
        np.pad(
            flow,
            ((0, max_flow_len - flow.shape[0]), (0, 0)),
            'constant',
            constant_values=(0, 0),
        )
        for flow in X
    ]