#!/usr/bin/python
# This file comes from the EMBER project: https://github.com/elastic/ember
''' Extracts some basic features from PE files. Many of the features
implemented have been used in previously published works. For more information,
check out the following resources:
* Schultz, et al., 2001: http://128.59.14.66/sites/default/files/binaryeval-ieeesp01.pdf
* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf
It may be useful to do feature selection to reduce this set of features to a meaningful set
for your modeling problem.
'''
import re
import lief
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher
# Feature-detect LIEF API differences from the version string (e.g. "0.9.0-18d5b75")
LIEF_MAJOR, LIEF_MINOR = (int(v) for v in lief.__version__.split('.')[:2])
LIEF_EXPORT_OBJECT = (LIEF_MAJOR, LIEF_MINOR) >= (0, 10)  # exported_functions yields objects with a .name attribute
LIEF_HAS_SIGNATURE = (LIEF_MAJOR, LIEF_MINOR) >= (0, 11)  # has_signatures (plural) replaced has_signature
class FeatureType(object):
''' Base class from which each feature type may inherit '''
name = ''
dim = 0
def __repr__(self):
return '{}({})'.format(self.name, self.dim)
def raw_features(self, bytez, lief_binary):
''' Generate a JSON-able representation of the file '''
raise NotImplementedError
def process_raw_features(self, raw_obj):
''' Generate a feature vector from the raw features '''
raise NotImplementedError
def feature_vector(self, bytez, lief_binary):
''' Directly calculate the feature vector from the sample itself. This should only be implemented differently
if there are significant speedups to be gained from combining the two functions. '''
return self.process_raw_features(self.raw_features(bytez, lief_binary))
class ByteHistogram(FeatureType):
''' Byte histogram (count + non-normalized) over the entire binary file '''
name = 'histogram'
dim = 256
def __init__(self):
super().__init__()
def raw_features(self, bytez, lief_binary):
counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
return counts.tolist()
def process_raw_features(self, raw_obj):
counts = np.array(raw_obj, dtype=np.float32)
total = counts.sum()
normalized = counts / total
return normalized
class ByteEntropyHistogram(FeatureType):
''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).
This roughly approximates the joint probability of byte value and local entropy.
See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for more info.
'''
name = 'byteentropy'
dim = 256
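# dim = 256: a 16x16 (entropy bin x byte-value bin) joint histogram, flattened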
def __init__(self, step=1024, window=2048):
super().__init__()
self.window = window
self.step = step
def _entropy_bin_counts(self, block):
# coarse histogram: group byte values by high nibble, 16 byte values per bin
c = np.bincount(block >> 4, minlength=16) # 16-bin histogram
p = c.astype(np.float32) / self.window
wh = np.where(c)[0]
# x2 because we reduced information by half: 256 bins (8 bits) down to 16 bins (4 bits)
H = np.sum(-p[wh] * np.log2(p[wh])) * 2
Hbin = int(H * 2) # up to 16 bins (max entropy is 8 bits)
if Hbin == 16: # handle entropy = 8.0 bits
Hbin = 15
return Hbin, c
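# Illustrative examples (added note, not in the original EMBER source):
#   - an all-zero window occupies a single bin, so H == 0.0 and Hbin == 0
#   - a window of uniformly random bytes gives H ~= 2 * log2(16) == 8.0 bits,
#     which the clamp above maps to Hbin == 15
# Note that p is always divided by self.window, so on the short-input path in
# raw_features (len(bytez) < window) the probabilities sum to less than 1.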
def raw_features(self, bytez, lief_binary):
output = np.zeros((16, 16), dtype=int)
a = np.frombuffer(bytez, dtype=np.uint8)
if a.shape[0] < self.window:
Hbin, c = self._entropy_bin_counts(a)
output[Hbin, :] += c
else:
# strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
strides = a.strides + (a.strides[-1],)
blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]
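# e.g. with len(bytez) == 4096, window == 2048 and step == 1024, this selects
# three overlapping windows starting at offsets 0, 1024 and 2048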
# from the blocks, compute histogram
for block in blocks:
Hbin, c = self._entropy_bin_counts(block)
output[Hbin, :] += c
return output.flatten().tolist()
def process_raw_features(self, raw_obj):
counts = np.array(raw_obj, dtype=np.float32)
total = counts.sum()
normalized = counts / total
return normalized
class SectionInfo(FeatureType):
''' Information about section names, sizes and entropy. Uses hashing trick
to summarize all this section info into a feature vector.
'''
name = 'section'
dim = 5 + 50 + 50 + 50 + 50 + 50
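# dim = 255: 5 general statistics plus five 50-dim hashed blocks
# (sizes, entropies, virtual sizes, entry-section name, entry-section characteristics)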
def __init__(self):
super().__init__()
@staticmethod
def _properties(s):
return [str(c).split('.')[-1] for c in s.characteristics_lists]
def raw_features(self, bytez, lief_binary):
if lief_binary is None:
return {"entry": "", "sections": []}
# properties of entry point, or if invalid, the first executable section
try:
entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
except Exception:  # e.g. lief.not_found
# bad entry point: fall back to the first executable section
entry_section = ""
for s in lief_binary.sections:
if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
entry_section = s.name
break
raw_obj = {"entry": entry_section}
raw_obj["sections"] = [{
'name': s.name,
'size': s.size,
'entropy': s.entropy,
'vsize': s.virtual_size,
'props': self._properties(s)
} for s in lief_binary.sections]
return raw_obj
def process_raw_features(self, raw_obj):
sections = raw_obj['sections']
general = [
len(sections), # total number of sections
# number of sections with zero size
sum(1 for s in sections if s['size'] == 0),
# number of sections with an empty name
sum(1 for s in sections if s['name'] == ""),
# number of RX (readable and executable) sections
sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
# number of W (writable) sections
sum(1 for s in sections if 'MEM_WRITE' in s['props'])
]
# gross characteristics of each section
section_sizes = [(s['name'], s['size']) for s in sections]
section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
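# Illustrative note (hypothetical values, not in the original source): with
# input_type="pair", each (name, value) pair lands in one of 50 hash buckets,
# signed by the hash; e.g.
#   FeatureHasher(50, input_type="pair").transform([[(".text", 4096.0)]])
# yields a 50-dim vector that is zero except for +/-4096.0 in one slot.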
section_entropy = [(s['name'], s['entropy']) for s in sections]
section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
section_vsize = [(s['name'], s['vsize']) for s in sections]
section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0]
characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]
return np.hstack([
general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
characteristics_hashed
]).astype(np.float32)
class ImportsInfo(FeatureType):
''' Information about imported libraries and functions from the
import address table. Note that the total number of imported
functions is contained in GeneralFileInfo.
'''
name = 'imports'
dim = 1280
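# dim = 1280: 256 hashed library names + 1024 hashed "library:function" strings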
def __init__(self):
super().__init__()
def raw_features(self, bytez, lief_binary):
imports = {}
if lief_binary is None:
return imports
for lib in lief_binary.imports:
if lib.name not in imports:
imports[lib.name] = []  # libraries can appear multiple times in the listing; extend rather than overwrite
# Clipping assumes there are diminishing returns on the discriminatory power of imported functions
# beyond the first 10000 characters, and this will help limit the dataset size
for entry in lib.entries:
if entry.is_ordinal:
imports[lib.name].append("ordinal" + str(entry.ordinal))
else:
imports[lib.name].append(entry.name[:10000])
return imports
def process_raw_features(self, raw_obj):
# unique libraries
libraries = list(set([l.lower() for l in raw_obj.keys()]))
libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]
# A string like "kernel32.dll:CreateFileMappingA" for each imported function
imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]
# Two separate elements: libraries (alone) and fully-qualified names of imported functions
return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
class ExportsInfo(FeatureType):
''' Information about exported functions. Note that the total number of exported
functions is contained in GeneralFileInfo.
'''
name = 'exports'
dim = 128
def __init__(self):
super().__init__()
def raw_features(self, bytez, lief_binary):
if lief_binary is None:
return []
# Clipping assumes there are diminishing returns on the discriminatory power of exports beyond
# the first 10000 characters, and this will help limit the dataset size
if LIEF_EXPORT_OBJECT:
# export is an object with .name attribute (0.10.0 and later)
clipped_exports = [export.name[:10000] for export in lief_binary.exported_functions]
else:
# export is a string (LIEF 0.9.0 and earlier)
clipped_exports = [export[:10000] for export in lief_binary.exported_functions]
return clipped_exports
def process_raw_features(self, raw_obj):
exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
return exports_hashed.astype(np.float32)
class GeneralFileInfo(FeatureType):
''' General information about the file '''
name = 'general'
dim = 10
def __init__(self):
super().__init__()
def raw_features(self, bytez, lief_binary):
if lief_binary is None:
return {
'size': len(bytez),
'vsize': 0,
'has_debug': 0,
'exports': 0,
'imports': 0,
'has_relocations': 0,
'has_resources': 0,
'has_signature': 0,
'has_tls': 0,
'symbols': 0
}
return {
'size': len(bytez),
'vsize': lief_binary.virtual_size,
'has_debug': int(lief_binary.has_debug),
'exports': len(lief_binary.exported_functions),
'imports': len(lief_binary.imported_functions),
'has_relocations': int(lief_binary.has_relocations),
'has_resources': int(lief_binary.has_resources),
'has_signature': int(lief_binary.has_signatures) if LIEF_HAS_SIGNATURE else int(lief_binary.has_signature),
'has_tls': int(lief_binary.has_tls),
'symbols': len(lief_binary.symbols),
}
def process_raw_features(self, raw_obj):
return np.asarray([
raw_obj['size'], raw_obj['vsize'], raw_obj['has_debug'], raw_obj['exports'], raw_obj['imports'],
raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
raw_obj['symbols']
],
dtype=np.float32)
class HeaderFileInfo(FeatureType):
''' Machine, architecture, OS, linker and other information extracted from the PE header '''
name = 'header'
dim = 62
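# dim = 62: 1 timestamp + five 10-dim hashed categorical fields + 11 version/size scalars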
def __init__(self):
super().__init__()
def raw_features(self, bytez, lief_binary):
raw_obj = {}
raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
raw_obj['optional'] = {
'subsystem': "",
'dll_characteristics': [],
'magic': "",
'major_image_version': 0,
'minor_image_version': 0,
'major_linker_version': 0,
'minor_linker_version': 0,
'major_operating_system_version': 0,
'minor_operating_system_version': 0,
'major_subsystem_version': 0,
'minor_subsystem_version': 0,
'sizeof_code': 0,
'sizeof_headers': 0,
'sizeof_heap_commit': 0
}
if lief_binary is None:
return raw_obj
raw_obj['coff']['timestamp'] = lief_binary.header.time_date_stamps
raw_obj['coff']['machine'] = str(lief_binary.header.machine).split('.')[-1]
raw_obj['coff']['characteristics'] = [str(c).split('.')[-1] for c in lief_binary.header.characteristics_list]
raw_obj['optional']['subsystem'] = str(lief_binary.optional_header.subsystem).split('.')[-1]
raw_obj['optional']['dll_characteristics'] = [
str(c).split('.')[-1] for c in lief_binary.optional_header.dll_characteristics_lists
]
raw_obj['optional']['magic'] = str(lief_binary.optional_header.magic).split('.')[-1]
raw_obj['optional']['major_image_version'] = lief_binary.optional_header.major_image_version
raw_obj['optional']['minor_image_version'] = lief_binary.optional_header.minor_image_version
raw_obj['optional']['major_linker_version'] = lief_binary.optional_header.major_linker_version
raw_obj['optional']['minor_linker_version'] = lief_binary.optional_header.minor_linker_version
raw_obj['optional']['major_operating_system_version'] = lief_binary.optional_header.major_operating_system_version
raw_obj['optional']['minor_operating_system_version'] = lief_binary.optional_header.minor_operating_system_version
raw_obj['optional']['major_subsystem_version'] = lief_binary.optional_header.major_subsystem_version
raw_obj['optional']['minor_subsystem_version'] = lief_binary.optional_header.minor_subsystem_version
raw_obj['optional']['sizeof_code'] = lief_binary.optional_header.sizeof_code
raw_obj['optional']['sizeof_headers'] = lief_binary.optional_header.sizeof_headers
raw_obj['optional']['sizeof_heap_commit'] = lief_binary.optional_header.sizeof_heap_commit
return raw_obj
def process_raw_features(self, raw_obj):
return np.hstack([
raw_obj['coff']['timestamp'],
FeatureHasher(10, input_type="string").transform([[raw_obj['coff']['machine']]]).toarray()[0],
FeatureHasher(10, input_type="string").transform([raw_obj['coff']['characteristics']]).toarray()[0],
FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['subsystem']]]).toarray()[0],
FeatureHasher(10, input_type="string").transform([raw_obj['optional']['dll_characteristics']]).toarray()[0],
FeatureHasher(10, input_type="string").transform([[raw_obj['optional']['magic']]]).toarray()[0],
raw_obj['optional']['major_image_version'],
raw_obj['optional']['minor_image_version'],
raw_obj['optional']['major_linker_version'],
raw_obj['optional']['minor_linker_version'],
raw_obj['optional']['major_operating_system_version'],
raw_obj['optional']['minor_operating_system_version'],
raw_obj['optional']['major_subsystem_version'],
raw_obj['optional']['minor_subsystem_version'],
raw_obj['optional']['sizeof_code'],
raw_obj['optional']['sizeof_headers'],
raw_obj['optional']['sizeof_heap_commit'],
]).astype(np.float32)
class StringExtractor(FeatureType):
''' Extracts strings from raw byte stream '''
name = 'strings'
dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1
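# dim = 104: numstrings, avlength, printables, the 96-bin printable-character
# distribution, entropy, and the paths/urls/registry/MZ counts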
def __init__(self):
super().__init__()
# all consecutive runs of 0x20 - 0x7f that are 5+ characters
self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
# occurrences of the string 'C:\' (not actually extracting the paths)
self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
# occurrences of http:// or https:// (not actually extracting the URLs)
self._urls = re.compile(b'https?://', re.IGNORECASE)
# occurrences of the string prefix HKEY_ (not actually extracting registry key names)
self._registry = re.compile(b'HKEY_')
# crude evidence of an MZ header (dropper?) somewhere in the byte stream
self._mz = re.compile(b'MZ')
def raw_features(self, bytez, lief_binary):
allstrings = self._allstrings.findall(bytez)
if allstrings:
# statistics about strings:
string_lengths = [len(s) for s in allstrings]
avlength = sum(string_lengths) / len(string_lengths)
# map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
as_shifted_string = [b - ord(b'\x20') for b in b''.join(allstrings)]
c = np.bincount(as_shifted_string, minlength=96) # histogram count
# distribution of characters in printable strings
csum = c.sum()
p = c.astype(np.float32) / csum
wh = np.where(c)[0]
H = np.sum(-p[wh] * np.log2(p[wh])) # entropy
else:
avlength = 0
c = np.zeros((96,), dtype=np.float32)
H = 0
csum = 0
return {
'numstrings': len(allstrings),
'avlength': avlength,
'printabledist': c.tolist(), # store non-normalized histogram
'printables': int(csum),
'entropy': float(H),
'paths': len(self._paths.findall(bytez)),
'urls': len(self._urls.findall(bytez)),
'registry': len(self._registry.findall(bytez)),
'MZ': len(self._mz.findall(bytez))
}
def process_raw_features(self, raw_obj):
hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
return np.hstack([
raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
raw_obj['registry'], raw_obj['MZ']
]).astype(np.float32)
class DataDirectories(FeatureType):
''' Extracts size and virtual address of the first 15 data directories '''
name = 'datadirectories'
dim = 15 * 2
def __init__(self):
super().__init__()
self._name_order = [
"EXPORT_TABLE", "IMPORT_TABLE", "RESOURCE_TABLE", "EXCEPTION_TABLE", "CERTIFICATE_TABLE",
"BASE_RELOCATION_TABLE", "DEBUG", "ARCHITECTURE", "GLOBAL_PTR", "TLS_TABLE", "LOAD_CONFIG_TABLE",
"BOUND_IMPORT", "IAT", "DELAY_IMPORT_DESCRIPTOR", "CLR_RUNTIME_HEADER"
]
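# Only 15 of the PE format's 16 data directories are listed; the 16th slot is
# reserved and always zero, so it carries no signal.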
def raw_features(self, bytez, lief_binary):
output = []
if lief_binary is None:
return output
for data_directory in lief_binary.data_directories:
output.append({
"name": str(data_directory.type).replace("DATA_DIRECTORY.", ""),
"size": data_directory.size,
"virtual_address": data_directory.rva
})
return output
def process_raw_features(self, raw_obj):
features = np.zeros(2 * len(self._name_order), dtype=np.float32)
for i in range(len(self._name_order)):
if i < len(raw_obj):
features[2 * i] = raw_obj[i]["size"]
features[2 * i + 1] = raw_obj[i]["virtual_address"]
return features
class PEFeatureExtractor(object):
''' Extract useful features from a PE file, and return as a vector of fixed size. '''
def __init__(self, feature_version=2, print_feature_warning=False):
self.features = [
ByteHistogram(),
ByteEntropyHistogram(),
StringExtractor(),
GeneralFileInfo(),
HeaderFileInfo(),
SectionInfo(),
ImportsInfo(),
ExportsInfo()
]
if feature_version == 1:
if not lief.__version__.startswith("0.8.3"):
if print_feature_warning:
print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75")
print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(f"WARNING: in the feature calculations.")
elif feature_version == 2:
self.features.append(DataDirectories())
if not lief.__version__.startswith("0.9.0"):
if print_feature_warning:
print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-")
print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(f"WARNING: in the feature calculations.")
else:
raise ValueError(f"EMBER feature version must be 1 or 2, not {feature_version}")
self.dim = sum(fe.dim for fe in self.features)
def raw_features(self, bytez):
lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound,
RuntimeError)
try:
lief_binary = lief.PE.parse(list(bytez))
except lief_errors as e:
print("lief error: ", str(e))
lief_binary = None
except Exception:  # everything else is re-raised (KeyboardInterrupt and SystemExit bypass this handler anyway)
raise
features = {"sha256": hashlib.sha256(bytez).hexdigest()}
features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
return features
def process_raw_features(self, raw_obj):
feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
return np.hstack(feature_vectors).astype(np.float32)
def feature_vector(self, bytez):
return self.process_raw_features(self.raw_features(bytez))
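# Minimal usage sketch (not part of the original module; the command-line
# handling below is an illustrative assumption, not EMBER's own CLI):
if __name__ == "__main__":
    import sys
    extractor = PEFeatureExtractor(feature_version=2)
    # read the raw bytes of the PE file named on the command line
    with open(sys.argv[1], "rb") as f:
        bytez = f.read()
    vector = extractor.feature_vector(bytez)  # np.ndarray of shape (extractor.dim,)
    print(vector.shape)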