# MalwareDetection.py

import os
import re
import string
import json
from matplotlib import pyplot as plt
import pandas as pd
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
from collections import Counter
import pickle
import sys
from tqdm import tqdm


def get_static_data(filepath: str) -> tuple:
    """
    Read the raw static-analysis outputs stored in a sample directory.

    Args:
        filepath: full path to the directory containing "String.txt" and
            "Structure_Info.txt"

    Returns:
        A tuple ``(string_output, structure_info)`` holding the bytes of
        "String.txt" and "Structure_Info.txt" respectively, or None when
        `filepath` is not a directory.
    """
    if not os.path.isdir(filepath):
        return None

    # Both files are read in binary mode, in this fixed order.
    contents = []
    for file_name in ("String.txt", "Structure_Info.txt"):
        with open(os.path.join(filepath, file_name), 'rb') as handle:
            contents.append(handle.read())
    return tuple(contents)


def get_dynamic_data(filepath: str) -> dict:
    """
    Load the dynamic-analysis report of a sample.

    Args:
        filepath: full path to the file <hash>.json; may be empty/None when
            no report was found

    Returns:
        Dictionary parsed from the JSON file, or an empty dictionary when
        `filepath` is empty/None or does not exist on disk.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    # A missing report is treated the same way as "no path given" so that
    # the function always honours its declared dict return type.
    if not filepath or not os.path.exists(filepath):
        return {}
    with open(filepath, 'rb') as report:
        return json.load(report)


def find(name: str, path: str) -> str:
    """
    Locate a file or directory called `name` somewhere below `path`.

    Args:
        name: name of the <hash> directory or <hash>.json to be found
        path: directory under which the search is performed

    Returns:
        The joined path of the first directory level (in os.walk order) that
        contains `name` and `name` itself; None when nothing matches.
    """
    for current_dir, subdirs, filenames in os.walk(path):
        if name not in filenames and name not in subdirs:
            continue
        return os.path.join(current_dir, name)
    return None


def get_data(filename: str, basepath: str) -> tuple:
    """
    Collect the static and dynamic analysis data of one sample.

    The sample is identified by `filename` (its hash value): the dynamic
    report is looked up as "<filename>.json" and the static data as a
    directory called "<filename>", both searched under `basepath`.

    Args:
        filename: name of the sample, given as the hash value of the malware
        basepath: base directory which contains the analysis files

    Returns:
        A 3-tuple: the strings data (bytes), the structure info data (bytes)
        and the dynamic analysis data (dictionary).
    """
    dynamic_report_path = find(filename + '.json', basepath)
    static_dir_path = find(filename, basepath)

    dynamic_report = get_dynamic_data(dynamic_report_path)
    static_pair = get_static_data(static_dir_path)

    strings_blob = static_pair[0]
    structure_blob = static_pair[1]
    return strings_blob, structure_blob, dynamic_report


# Failures that simply mean "this header/field is absent or undecodable".
# Deliberately narrower than BaseException so Ctrl-C and SystemExit propagate.
_STRUCTURE_PARSE_ERRORS = (KeyError, IndexError, UnicodeDecodeError)


def _first_header_value(field: str, header_body: bytes) -> str:
    """Return the stripped, decoded text following the first ``field:`` in `header_body`."""
    return re.findall(field.encode() + b':(.*)', header_body)[0].strip().decode()


def _header_values(fields: tuple, header_name: bytes, headers: dict) -> dict:
    """Extract `fields` from one header; every field becomes [] if any lookup fails."""
    try:
        header_body = headers[header_name]
        return {field: _first_header_value(field, header_body)
                for field in fields}
    except _STRUCTURE_PARSE_ERRORS:
        return {field: [] for field in fields}


def parse_structure_info(data: bytes) -> dict:
    """
    Parses structure info to a dictionary

    Args:
        data: data (contents) of Structure_Info.txt read in binary mode

    Returns:
        filtered_data: A dictionary with selected fields namely
        - sections
          List of PE sections with information regarding
          - section name `name`
          - virtual size of section `virtual_size`
          - raw data size of section `raw_data_size`
          - `entropy` of section
        - dlls: list of imported symbols
        - OPTIONAL_HEADER information
          - SizeOfCode
          - SizeOfInitializedData
          - SizeOfUninitializedData
          - SizeOfStackReserve
          - SizeOfStackCommit
          - SizeOfHeapReserve
          - SizeOfHeapCommit
        - FILE_HEADER info
          - NumberOfSections
          - NumberOfSymbols

        All values are kept as the stripped text found in the file. When a
        header is missing or cannot be parsed, `sections`/`dlls` are empty
        lists and every scalar field of that header is set to [].
    """
    # re.split keeps the captured header names, producing
    # [preamble, name1, body1, name2, body2, ...].
    # NOTE: "(:?" looks like a typo for "(?:", but the group has to stay a
    # capturing one, otherwise the header names would be dropped by the split.
    pieces = re.split(b'----------(:?[a-zA-Z _]*)----------', data)
    headers = dict(zip(pieces[1::2], pieces[2::2]))

    filtered_data = {}

    # PE Sections
    section_list = []
    try:
        for name, virtual_size, raw_size, entropy in re.findall(
                rb'Name:(.*?)\n[\s\S]*?Misc_VirtualSize:(.*?)\n[\s\S]*?SizeOfRawData:(.*?)\n[\s\S]*?Entropy:(.*?)\(',
                headers[b'PE Sections']):
            section_list.append({
                'name': name.strip().decode(),
                'virtual_size': virtual_size.strip().decode(),
                'raw_data_size': raw_size.strip().decode(),
                'entropy': entropy.strip().decode(),
            })
    except _STRUCTURE_PARSE_ERRORS:
        # Best effort: keep the sections parsed before the failure.
        pass
    filtered_data['sections'] = section_list

    # dll calls Imported Symbols
    try:
        filtered_data['dlls'] = [
            symbol.decode() for symbol in re.findall(
                rb'(.*)Hint\[', headers[b'Imported symbols'])]
    except _STRUCTURE_PARSE_ERRORS:
        filtered_data['dlls'] = []

    # OPTIONAL HEADER
    filtered_data.update(_header_values(
        ('SizeOfCode',
         'SizeOfInitializedData',
         'SizeOfUninitializedData',
         'SizeOfStackReserve',
         'SizeOfStackCommit',
         'SizeOfHeapReserve',
         'SizeOfHeapCommit'),
        b'OPTIONAL_HEADER', headers))

    # FILE_HEADER
    filtered_data.update(_header_values(
        ('NumberOfSections', 'NumberOfSymbols'),
        b'FILE_HEADER', headers))
    return filtered_data


def parse_network(dyn_data: dict) -> dict:
    """
    Extract the network section of a dynamic-analysis report.

    Args:
        dyn_data: dictionary read from dynamic data

    Returns:
        network_dict: dictionary with the keys `udp`, `icmp`, `tcp`, `dns`,
        `domains`, `dns_servers`, `http` and `hosts`. `udp` and `tcp` hold
        `src`/`dst` lists of "address:port" strings, `icmp` holds `src`/`dst`
        address lists (each of the three is {} when the report lists no such
        traffic); the remaining keys are copied from the report unchanged.
        An empty `dyn_data` yields an empty dictionary.
    """
    if dyn_data == {}:
        return {}

    def endpoint_lists(packets):
        """Build the "address:port" source and destination lists."""
        sources, destinations = [], []
        for packet in packets:
            sources.append(packet['src'] + ':' + str(packet['sport']))
            destinations.append(packet['dst'] + ':' + str(packet['dport']))
        return {'src': sources, 'dst': destinations}

    # Pre-seed the protocol keys so that they always exist in the result.
    network_dict = {key: {}
                    for key in ('udp', 'icmp', 'tcp', 'dns', 'domains')}

    udp_packets = dyn_data['network']['udp']
    if udp_packets:
        network_dict['udp'] = endpoint_lists(udp_packets)

    network_dict['dns_servers'] = dyn_data['network']['dns_servers']
    network_dict['http'] = dyn_data['network']['http']

    icmp_packets = dyn_data['network']['icmp']
    if icmp_packets:
        network_dict['icmp'] = {
            'src': [packet['src'] for packet in icmp_packets],
            'dst': [packet['dst'] for packet in icmp_packets],
        }

    tcp_packets = dyn_data['network']['tcp']
    if tcp_packets:
        network_dict['tcp'] = endpoint_lists(tcp_packets)

    for copied_key in ('hosts', 'dns', 'domains'):
        network_dict[copied_key] = dyn_data['network'][copied_key]
    return network_dict


def parse_behavior(dyn_data: dict) -> dict:
    """
    parses out behavior information from dyn_data dictionary

    Args:
        dyn_data: dictionary read from dynamic data

    Returns:
        behavior_dict: dictionary with
        - apistats: API call counts, summed over all processes of the report
          (key absent when the report has no per-process statistics)
        - processes: list of process names
        - summary: one entry per tracked summary key, [] when the report
          does not contain that key
        An empty `dyn_data` yields an empty dictionary.

    Raises:
        KeyError: if a non-empty report lacks `behavior`, `apistats`
            or `processes`.
    """
    if dyn_data == {}:
        return {}
    behavior = dyn_data['behavior']
    behavior_dict = {}

    per_process_stats = list(behavior['apistats'].values())
    if len(per_process_stats) > 1:
        # Several processes: add up the call counts per API name.
        totals = Counter()
        for stats in per_process_stats:
            totals.update(stats)
        behavior_dict['apistats'] = dict(totals)
    elif per_process_stats:
        behavior_dict['apistats'] = per_process_stats[0]

    behavior_dict['processes'] = [
        process['process_name'] for process in behavior['processes']]

    # NOTE(review): 'file_create' may be meant to be 'file_created' (compare
    # 'directory_created') -- confirm against the report schema before renaming.
    summary_keys = (
        'file_create',
        'file_written',
        'file_exists',
        'directory_created',
        'dll_loaded',
        'file_opened',
        'regkey_opened',
        'guid',
        'file_read',
        'regkey_read',
        'regkey_deleted',
        'directory_enumerated',
        'directory_removed',  # was misspelled 'ddirectory_removed'
        'mutex',
        'command_line',
        'connects_ip')
    behavior_dict['summary'] = {}
    for key in summary_keys:
        try:
            behavior_dict['summary'][key] = behavior['summary'][key]
        except (KeyError, TypeError):
            # Summary missing/not a mapping, or the key was not reported.
            behavior_dict['summary'][key] = []
    return behavior_dict


def parse_virustotal(dyn_data: dict) -> dict:
    """
    parses out virustotal information from dyn_data dictionary

    Not used by the classifier, kept for data analysis only.

    Args:
        dyn_data: dictionary read from dynamic data

    Returns:
        virustotal_dict: dictionary with the `positives` and `total` values
        of the report's `virustotal` entry. Fields that cannot be read are
        left out, so the result may be empty or contain `positives` only.
    """
    virustotal_dict = {}
    try:
        vt = dyn_data['virustotal']
        virustotal_dict['positives'] = vt['positives']
        virustotal_dict['total'] = vt['total']
    except (KeyError, TypeError):
        # Report without (usable) virustotal data: return what was collected.
        pass
    return virustotal_dict


def parse_dynamic_data(data: dict) -> dict:
    """
    Reduce a dynamic-analysis report to the features of interest.

    Args:
        data: dictionary read from the json report

    Returns:
        dict: the network, behavior and virustotal dictionaries merged into
        one (later parsers win on duplicate keys). The virustotal results
        are carried along but not used anywhere.
    """
    merged = {}
    for parser in (parse_network, parse_behavior, parse_virustotal):
        merged.update(parser(data))
    return merged


# List of common passwords (from a list of 200) with the purely numeric ones
# stripped, since a lot of numeric strings may be found inside a binary.
PASS_LIST = ['password','qwerty','abc123','1q2w3e4r5t','q1w2e3r4t5y6','iloveyou','1q2w3e4r','qwertyuiop','yuantuo2012','qwerty123','1qaz2wsx3edc','password1','1qaz2wsx','dragon','ashley','princess','123qwe','monkey','q1w2e3r4','zxcvbnm','asdfghjkl','pokemon','football','killer','michael','shadow','daniel','asdasd','qazwsx','1234qwer','superman','123456a','azerty','qwe123','master','sunshine','N0=Acc3ss','1q2w3e','abcd1234','computer','fuckyou','aaaaaa','asdfgh','asd123','baseball','charlie','qwer1234','naruto','a123456','jessica','Status','soccer','jordan','liverpool','thomas','lol123','michelle','123abc','nicole','starwars','samsung','secret','joshua','andrew','q1w2e3r4t5','hunter','Password','qazwsxedc','lovely','jennifer','letmein','tigger','asdf1234','hannah','purple','justin','qwerty1','anthony','welcome','love','aa123456','qweasdzxc','internet','robert','minecraft','super123','batman','trustno1','matthew','chocolate','flower','cookie','D1lakiss','william','cheese','buster','pakistan','chelsea','alexander','andrea','hello','samantha','blink182','freedom','matrix','george','amanda','1qazxsw2','forever','martin','patrick','iloveu','babygirl','summer','friends','whatever','12qwaszx','pepper','zaq12wsx','butterfly','orange','jasmine','joseph','maggie','banana','arsenal','mustang','monster','passw0rd','jonathan','snoopy','family','changeme','123qweasd','ginger','angel','junior','diamond','asdfasdf','taylor','eminem','oliver','Exigent','basketball','sophie','loveme','mother','benjamin','silver','harley','Password1','spiderman','chicken','a123456789','asshole','qweasd','yellow','melissa','qwertyui','christian','nathan','anhyeuem','brandon','richard','nks230kjs82','rr123456rr','metallica','never','123hfjdk147','lovers','mercedes','123456abc','gabriel','password123','loveyou','mickey','bailey','hello123','sandra','london','qwerty12','zxcvbn','q1w2e3','slipknot','qwerty12345','prince','hockey','angels','peanut','victoria','asdf','angela','rainbow','abcdef','ferrari','google','cocacola','hahaha','carlos']

# List of packers to look for
PACKER_LIST =  ["UPX","Obsidium","PECompact","FSG","Themida","UPack","ASPack","NSPack","Armadillo","Enigma","UPC","Packman","PolyCrypt","Molebox","Morphine","ASProtect","Expressor","Petite","NSAnti","PEBundle","PE-Shield","PELock","PE-Armor","WWPack32","Polyene","PESpin","TeLock","PE-Pack","SoftComp","Ezip","Pingvin","Yoda","PCShrink","Pex","YodaProt","NTPacker","PE-Diminisher","NakedPack","SimplePack","Exe32Pack","ExeStealth","YodaProtect","Kcuf","RadPack","Crypt.Kcuf","SDProtector","JDPack","PE-Crypt.Sqr","Thunder","BJFnt","Neolite"]


def find_hex_strings(data: bytes) -> list:
    """
    Collect runs of characters that look like hex-encoded data.

    Args:
        data: strings data read from the "Strings.txt" file

    Returns:
        List (in order of appearance) of decoded runs of at least 20
        hex digits that are either all lower-case or all upper-case
    """
    # The minimum run length of 20 is a heuristic and can be tweaked.
    hex_run = b'[0-9a-f]{20,}|[0-9A-F]{20,}'
    return [found.group(0).decode() for found in re.finditer(hex_run, data)]


def find_base64_strings(data: bytes) -> list:
    """
    Collect substrings that could be base64 encoded data.

    Args:
        data: strings data read from the "Strings.txt" file

    Returns:
        List of decoded base64-looking matches longer than 20 characters
        (expect many false positives: plain alphanumeric text is
        syntactically valid base64 as well)
    """
    b64_run = b'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{4})'
    long_candidates = []
    for candidate in re.findall(b64_run, data):
        if len(candidate) > 20:
            long_candidates.append(candidate.decode())
    return long_candidates


def password_bruteforces(data: bytes) -> list:
    """
    Search the binary's strings for entries of a common-password list,
    which hints at an embedded brute-force dictionary.

    Args:
        data: strings data read from "Strings.txt" file

    Globals:
        PASS_LIST: list of alphanumeric or alphabetic most common passwords

    Returns:
        a list of the distinct PASS_LIST entries that were matched
    """
    alternation = "|".join(PASS_LIST).encode()
    distinct_hits = set(re.findall(alternation, data))
    return [hit.decode() for hit in distinct_hits]


def hardcoded_addresses(data: bytes) -> list:
    """
    Search the strings output for embedded network addresses.

    Args:
        data: strings data read from "Strings.txt" file

    Returns:
        list of the distinct ipv4/ipv6/http(s) addresses found in the
        "Strings.txt" file; matches of 6 characters or fewer are dropped
    """
    ipv4 = rb"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"
    # One alternative per position the "::" abbreviation can take.
    ipv6 = b'(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}|::(?:[0-9a-fA-F]{1,4}:){0,6}[0-9a-fA-F]{1,4}|[0-9a-fA-F]{1,4}::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}|[0-9a-fA-F]{1,4}:[0-9a-fA-F]{1,4}::(?:[0-9a-fA-F]{1,4}:){0,4}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){0,2}[0-9a-fA-F]{1,4}::(?:[0-9a-fA-F]{1,4}:){0,3}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){0,3}[0-9a-fA-F]{1,4}::(?:[0-9a-fA-F]{1,4}:){0,2}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){0,4}[0-9a-fA-F]{1,4}::(?:[0-9a-fA-F]{1,4}:)?[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}::[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){0,6}[0-9a-fA-F]{1,4}::'
    url = rb"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

    any_address = b"|".join((ipv4, ipv6, url))
    distinct_hits = set(re.findall(any_address, data))

    addresses = []
    for hit in distinct_hits:
        if len(hit) > 6:
            addresses.append(hit.decode())
    return addresses


def long_strings(data: bytes) -> list:
    """
    Pick out the unusually long lines of the strings output.

    Possibly a weak indicator, as benign programs contain long strings too.
    Length threshold: more than 50 bytes.

    Args:
        data: strings data read from "Strings.txt" file

    Returns:
        List of the decoded newline-separated entries longer than 50 bytes
    """
    lengthy = []
    for line in data.split(b"\n"):
        if len(line) > 50:
            lengthy.append(line.decode())
    return lengthy


def find_dlls(data: bytes) -> list:
    """
    Collect the dll names that appear in the strings output.

    Args:
        data: strings data from "Strings.txt" file

    Returns:
        List of the distinct matches ending in ".dll" or ".DLL"; a match
        starts at the beginning of its line and runs to the last such suffix
    """
    dll_name = rb".+\.dll|.+\.DLL"
    distinct_hits = set(re.findall(dll_name, data))
    return [hit.decode() for hit in distinct_hits]


def find_exes(data: bytes) -> list:
    """
    lists all .exe mentioned in data

    Args:
        data: strings data from "Strings.txt" file (bytes, like every other
            strings parser in this module; a str would not match the
            bytes pattern)

    Returns:
        list of the distinct matches ending in ".exe" or ".EXE"; a match
        starts at the beginning of its line and runs to the last such suffix
    """
    pattern = rb".+\.exe|.+\.EXE"
    return [match.decode() for match in set(re.findall(pattern, data))]


def num_strings(data: bytes) -> int:
    """
    Count the entries of the strings file.

    Args:
        data: strings data from "Strings.txt" file

    Returns:
        Number of newline bytes in the strings output
    """
    newline = b'\n'
    return data.count(newline)


def packer_strings(data: bytes) -> list:
    """
    return a list of packer names in the binary

    Every packer name is searched in its all-upper-case and its
    all-lower-case spelling.

    Args:
        data: Strings data from "Strings.txt" file

    Globals:
        PACKER_LIST: list of packer specific strings

    Returns:
        List of the distinct packer specific strings found in the
        "Strings.txt" (may contain false information sometimes)
    """
    # Names are escaped because some contain regex metacharacters ('.').
    alternatives = [re.escape(name.upper()) for name in PACKER_LIST]
    alternatives += [re.escape(name.lower()) for name in PACKER_LIST]
    # A single join keeps a "|" between the last upper-case and the first
    # lower-case name; joining the two halves separately fused them into one
    # bogus alternative.
    # NOTE(review): the number of matches feeds the `packer` feature of the
    # pretrained model -- re-validate the model against this corrected count.
    packer_pattern = "|".join(alternatives).encode()
    return [match.decode() for match in set(re.findall(packer_pattern, data))]


def char_count(data: bytes) -> int:
    """
    Size of the strings output.

    Args:
        data: Strings data from "Strings.txt" file

    Returns:
        Length of `data`, i.e. the number of bytes in "Strings.txt"
    """
    total_bytes = len(data)
    return total_bytes


def parse_strings(data: bytes) -> dict:
    """
    Run every strings-based detector over the strings file and gather the
    suspicious features they report.

    Args:
        data: Strings data from "Strings.txt" file

    Returns:
        parsed_data: Dictionary with following features
        - num_strings: number of strings in "Strings.txt"
        - passwords: list of hardcoded common passwords
        - addresses: hardcoded ipv4, ipv6 and urls
        - hex_str: seemingly long hex strings
        - b64_str: seemingly long base64 strings
        - packer: list of packer associated strings
        - exes: hardcoded exe names in binary
        - len: size of "Strings.txt"
    """
    # The literal keeps both the key order and the call order of the parsers.
    return {
        'num_strings': num_strings(data),
        'passwords': password_bruteforces(data),
        'addresses': hardcoded_addresses(data),
        'hex_str': find_hex_strings(data),
        'b64_str': find_base64_strings(data),
        'packer': packer_strings(data),
        'exes': find_exes(data),
        'len': char_count(data),
    }


def filter_file_data(
        filename: str,
        filepath: str,
        file_type: str = "") -> dict:
    """
    Filters all the static and dynamic data associated with a program and returns the data in parsed dictionary

    Parsing is best effort: a failure while loading or parsing is printed
    together with the file name and the affected part is left out of the
    result instead of aborting the run.

    Args:
        filename: name of malware specified as <hash>
        filepath: path under which the analysis files need to be looked
        file_type: Type of the file (Malware(further classified into malware classes) or benign)
            by default null string when called on testing data (string provided when parsing training data)

    Returns:
        dict: the `label` plus all useful information parsed from both the
        static and the dynamic analysis data; only `label` when the analysis
        files could not be loaded at all
    """
    filtered = {"label": file_type}
    try:
        string_data, structure_data, dynamic_data = get_data(
            filename, filepath)
    except Exception as e:
        # Without the raw data there is nothing to parse. Returning here
        # avoids a cascade of misleading "referenced before assignment"
        # messages from the parsing steps below.
        print(e, filename)
        return filtered

    parsing_steps = (
        (parse_strings, string_data),
        (parse_structure_info, structure_data),
        (parse_dynamic_data, dynamic_data),
    )
    for parser, raw_data in parsing_steps:
        try:
            filtered.update(parser(raw_data))
        except Exception as e:
            # Per-file boundary: report the problem and keep the other parts.
            print(e, filename)
    return filtered


def filter_data(
        file_list: list,
        filepath: str,
        output_dir: str,
        file_type: str = ""):
    """
    Produce the intermediate result for every name in file_list.

    Each sample is filtered and the outcome is stored as
    "<name>_filtered.json" inside `output_dir`.

    Args:
        file_list: list of file names
        filepath: path to the directory where to look for analysis files
        output_dir: directory that receives the intermediate json files
        file_type: Type of the file (Malware(further classified into malware classes) or benign)
            by default null string when called on testing data (string provided when parsing training data)
    """
    progress = tqdm(file_list, desc="Filtering", ascii=False, ncols=75)
    for sample_name in progress:
        target_path = os.path.join(output_dir, sample_name + '_filtered.json')
        record = filter_file_data(sample_name, filepath, file_type)
        with open(target_path, 'w') as target:
            json.dump(record, target)


def selected_features_to_csv(
        feature_list: list,
        field_names: list,
        filtered_files_path: str,
        csv_name: str):
    """
    Parse all the intermediate files and put the desired features in a csv

    The csv starts with a header row ("filename" followed by `field_names`)
    and gets one row per intermediate file. A file that cannot be read, is
    not valid json or for which a feature extractor raises is reported on
    stdout and left out of the csv.

    Args:
        feature_list: list of functions to extract out the desired values/fields from intermediate result files
        field_names: names of the filtered fields to be generated in the csv
        filtered_files_path: path to the directory where intermediate filtered results have been stored
        csv_name: name of the intermediate csv containing our desired features

    Returns:
        None (stores output in `csv_name` csv)
    """
    # The context manager closes the csv even if listing the directory fails.
    with open(csv_name, 'w') as output_csv:
        output_csv.write("filename," + ",".join(field_names) + '\n')
        filtered_files = [
            os.path.join(filtered_files_path, name)
            for name in os.listdir(filtered_files_path)]
        for file in tqdm(filtered_files, desc="Extracting features", ascii=False, ncols=75):
            try:
                with open(file, 'r') as json_file:
                    data = json.load(json_file)
                hashname = os.path.basename(file).split(
                    '_')[0]  # hash_filtered.json
                # Build the whole row first so a failing feature never
                # leaves a partial line in the csv.
                row = hashname + "," + ",".join(
                    str(feature(data)) for feature in feature_list) + "\n"
            except Exception as e:
                # Best effort per file, but no longer silent.
                print(e, file)
                continue
            output_csv.write(row)


def filter_csv(filtered_files_path: str, csv_name: str):
    """
    Helper function with desired features selected

    Calls `selected_features_to_csv` with custom feature_list and field_names

    Args:
        filtered_files_path: path to the directory holding the intermediate
            "<hash>_filtered.json" files
        csv_name: name of the intermediate csv to be generated

    Returns:
        None (saves csv file)
    """
    field_names = [
        'label',
        'num_strings',
        'len',
        'avg_entropy',
        'packer',
        'max_en',
        'min_en',
        'num_addr',
        'udp_dst']

    def avg(lst):
        """Arithmetic mean, 0 for an empty list."""
        n = len(lst)
        if n == 0:
            return 0
        return sum(lst) / n

    def max1(lst):
        """Largest element, -1 for an empty list."""
        if lst:
            return max(lst)
        return -1

    def min1(lst):
        """Smallest element, -1 for an empty list."""
        if lst:
            return min(lst)
        return -1

    def entropies(record):
        """Entropy of every PE section of the record as floats."""
        return [float(section['entropy'])
                for section in record.get('sections')]

    def udp_destinations(record):
        """Number of UDP destinations; 0 when there is no UDP traffic.

        `udp` is an empty dict (without a `dst` key) for reports that list
        no UDP packets, so a missing `dst` has to count as "none" instead of
        raising and getting the whole sample dropped from the csv.
        """
        udp = record.get('udp') or {}
        return len(udp.get('dst') or [])

    # One extractor per entry of field_names, in the same order.
    feature_list = [
        lambda x: x.get('label'),
        lambda x: x.get('num_strings'),
        lambda x: x.get('len'),
        lambda x: avg(entropies(x)),
        lambda x: len(x.get('packer')),
        lambda x: max1(entropies(x)),
        lambda x: min1(entropies(x)),
        lambda x: len(x.get('addresses')),
        udp_destinations,
    ]
    selected_features_to_csv(
        feature_list,
        field_names,
        filtered_files_path,
        csv_name)
    print("Data filtered, stored in {}".format(csv_name))


def main(temp_csv: str, output_csv: str, model_name: str):
    """
    Classify the samples described in temp_csv and write the verdicts to output_csv

    The model classifies in classes benign, trojan, trojandownloader,
    trojandropper ... which is tuned down here to the binary classes
    Malware and Benign.

    Args:
        temp_csv: temporary csv containing parsed features for test data
        output_csv: the csv file in which results will be stored
        model_name: name of the pretrained model

    Returns:
        None, generates output in the provided output_csv
    """
    feature_columns = ['num_strings', 'len', 'avg_entropy',
                       'packer', 'max_en', 'min_en', 'num_addr', 'udp_dst']
    frame = pd.read_csv(temp_csv)
    feature_matrix = frame[feature_columns]

    print("Reading model {}".format(model_name))
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that come from a trusted source.
    with open(model_name, 'rb') as model_file:
        classifier = pickle.load(model_file)

    sample_names = np.ravel(frame[['filename']])
    class_prediction = classifier.predict(feature_matrix)

    with open(output_csv, 'w') as output:
        output.write('filename,class\n')
        for index, name in enumerate(sample_names):
            # Everything that is not predicted as 'benign' counts as malware.
            verdict = 'Benign' if class_prediction[index] == 'benign' else 'Malware'
            output.write('{},{}\n'.format(name, verdict))
    print("Output stored in {}".format(output_csv))


def get_names(path: str) -> list:
    """
    gets the names of the file hashes from the provided path

    Every file and directory below `path` whose name is a 64 digit hex hash,
    optionally followed by ".json", contributes its hash.

    Args:
        path: path of the test directory
    Returns:
        sorted list of the distinct hashes found (empty when there are none)
    """
    hash_name = re.compile(r'([a-f0-9A-F]{64})(?:\.json)?')

    def valid_name(name):
        """checks if the name is of form <hash> or <hash>.json, if yes returns the <hash>"""
        matched = hash_name.fullmatch(name)
        if matched:
            return matched[1]
        return None

    file_names = set()
    for root, directories, files in os.walk(path):
        file_names.update(map(valid_name, files))
        file_names.update(map(valid_name, directories))
    # None marks names that are no hashes. discard() does not fail when every
    # name matched or when nothing was found at all.
    file_names.discard(None)
    return sorted(file_names)


# Script entry point: filter the analysis files found under the given test
# directory, extract the feature csv and classify every sample.
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("usage info: python3 MalwareDetection.py <path to testing data>")
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive interpreter and is not always available.
        sys.exit(1)
    test_path = sys.argv[1]
    temporary_file_path = 'temp_test_filtered'
    # Creates the directory only when it is missing, without a separate
    # exists-check that could race with another process.
    os.makedirs(temporary_file_path, exist_ok=True)
    temporary_csv = 'temp_features.csv'
    output_csv = 'output.csv'
    model_name = 'trained_model.pickle'
    test_file_names = get_names(test_path)
    filter_data(test_file_names, test_path, temporary_file_path, '')
    filter_csv(temporary_file_path, temporary_csv)
    main(temporary_csv, output_csv, model_name)