Skip to content

Commit 3c6c4ee

Browse files
committed
Add a new JSONCustom parser Bot
The JSON Custom Parser can translate JSON keys into harmonized fields and can create events from a list of dicts found within a key of the JSON document.
1 parent 7aa3c07 commit 3c6c4ee

File tree

11 files changed

+436
-0
lines changed

11 files changed

+436
-0
lines changed

docs/user/bots.rst

+32
Original file line numberDiff line numberDiff line change
@@ -1243,6 +1243,38 @@ HTML Table Parser
12431243
* `"type"`: set the `classification.type` statically, optional
12441244
* `"html_parser"`: The HTML parser to use, by default "html.parser", can also be e.g. "lxml", have a look at https://www.crummy.com/software/BeautifulSoup/bs4/doc/
12451245

1246+
JSON Custom Parser
1247+
^^^^^^^^^^^^^^^^^^
1248+
1249+
**Configuration parameters**
1250+
1251+
* `"json_data_format"`: Boolean, set to true if the list of data is located within a key of the JSON object, optional. Default: false.
1252+
* `"json_data_key"`: Key of the JSON object under which the data list is found. The string should be a flattened key (separator: `"."`), optional. To be used in conjunction with `"json_data_format"`. Default: `""`.
1253+
E.g.
1254+
1255+
.. code-block:: json
1256+
1257+
"json_data_format": true,
1258+
"json_data_key": "data.ipdata"
1259+
1260+
With the above configuration, a list of dicts will be created from the list present in json["data"]["ipdata"]. Each dict will then create at least one event.
1261+
* `"splitlines"`: Boolean, split multiline data into a list, optional. Default: `false`. Only one of `"json_data_format"` and `"splitlines"` is applied; `"json_data_format"` takes precedence.
1262+
* `"translate_fields"`: A dictionary mapping harmonized fields to flattened JSON keys (separator: `"."`). These flattened keys should be relative to `"json_data_key"`.
1263+
1264+
.. code-block:: json
1265+
1266+
"translate_fields": {
1267+
"source.url": "url",
1268+
"time.source": "lseen",
1269+
"extra.tags": "tags.str"
1270+
},
1271+
1272+
Above configuration will put value from "url" key to "source.url", "lseen" key to "time.source" and so on.
1273+
1274+
* `"default_url_protocol"`: For URLs you can give a default protocol which will be prepended to the data. Default: `"http://"`.
1275+
* `"time_format"`: Optional. If `"timestamp"`, `"windows_nt"`, `"epoch_millis"`, `"from_format"`, `"from_format_midnight"`, `"utc_isoformat"` or `"fuzzy"` the time will be converted first. With the default `"null"` fuzzy time parsing will be used.
1276+
* `"type"`: set the `classification.type` statically, optional. Default: "c2server".
1277+
12461278
Key-Value Parser
12471279
^^^^^^^^^^^^^^^^
12481280

intelmq/bots/BOTS

+13
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,19 @@
547547
"splitlines": false
548548
}
549549
},
550+
"JSON Custom": {
551+
"description": "JSON Custom Parser converts from a custom JSON-String into an Event",
552+
"module": "intelmq.bots.parsers.json_custom.parser",
553+
"parameters": {
554+
"splitlines": false,
555+
"multiple_msg_field": null,
556+
"json_data_format": false,
557+
"json_data_key": null,
558+
"time_format": null,
559+
"translate_fields": {},
560+
"type": "c2server"
561+
}
562+
},
550563
"Key-Value": {
551564
"description": "Parse key=value strings.",
552565
"module": "intelmq.bots.parsers.key_value.parser",

intelmq/bots/parsers/json_custom/__init__.py

Whitespace-only changes.
+102
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from dateutil.parser import parse
4+
5+
from intelmq.lib.bot import ParserBot
6+
from intelmq.lib.harmonization import DateTime
7+
from intelmq.lib.message import Message
8+
from intelmq.lib.utils import base64_decode
9+
from intelmq.lib.harmonization import DateTime
10+
11+
12+
class JSONCustomParserBot(ParserBot):
    """
    Parser bot that maps keys of custom JSON documents onto event fields.

    Every input record (the whole report, one line of it, or one element of
    a list found inside the JSON document) is flattened into dotted keys.
    The ``translate_fields`` parameter then names, for each event field, the
    flattened key its value is taken from.
    """

    # Event fields whose values are passed through ``DateTime.convert``.
    TIME_FIELDS = ("time.source", "time.destination")

    def init(self):
        """
        Load the bot parameters and validate ``time_format``.

        Raises:
            InvalidArgument: if the conversion name in ``time_format`` (the
                part before an optional ``|``) is not a key of
                ``DateTime.TIME_CONVERSIONS``.
        """
        # Imported here because the module header does not import it; without
        # this the ``raise`` below fails with NameError instead.
        from intelmq.lib.exceptions import InvalidArgument

        self.time_format = getattr(self.parameters, "time_format", None)
        if self.time_format and self.time_format.split('|')[0] not in DateTime.TIME_CONVERSIONS:
            raise InvalidArgument('time_format', got=self.time_format,
                                  expected=list(DateTime.TIME_CONVERSIONS),
                                  docs='https://intelmq.readthedocs.io/en/latest/user/Bots.html#json-custom-parser')

        self.json_data_format = getattr(self.parameters, 'json_data_format', False)
        # The shipped defaults use null for these; normalise to empty values.
        self.json_data_key = getattr(self.parameters, 'json_data_key', '') or ''
        self.multiple_msg_field = getattr(self.parameters, 'multiple_msg_field', None)
        self.translate_fields = getattr(self.parameters, 'translate_fields', {}) or {}
        self.split_lines = getattr(self.parameters, 'splitlines', False)
        self.default_url_protocol = getattr(self.parameters, 'default_url_protocol', 'http://')
        # "type" is documented as optional with this default; a missing
        # parameter must not abort initialisation with AttributeError.
        self.classification_type = getattr(self.parameters, 'type', 'c2server')

    def flatten_json(self, json_object):
        """
        Flatten nested dicts into a single dict with dot-joined keys.

        Only dicts are descended into; any other value (including lists) is
        stored as-is. Empty dicts contribute no key. A non-dict
        ``json_object`` is returned under the empty key ``''``.
        """
        flat = {}

        def walk(node, path):
            if isinstance(node, dict):
                for key, child in node.items():
                    walk(child, path + (key,))
            else:
                flat['.'.join(path)] = node

        walk(json_object, ())
        return flat

    def _lookup_data(self, document):
        """
        Return the value stored at ``json_data_key`` inside ``document``.

        The key is first tried literally (so existing single-key
        configurations keep working) and otherwise followed as a
        dot-separated path, e.g. ``data.ipdata`` -> document["data"]["ipdata"].

        Raises:
            KeyError: if a path component is missing.
        """
        key = self.json_data_key
        if key in document:
            return document[key]
        node = document
        for part in key.split('.'):
            node = node[part]
        return node

    def _split_report(self, raw_report):
        """Return the records of the decoded report according to the parameters."""
        if self.json_data_format:
            return self._lookup_data(Message.unserialize(raw_report))
        if self.split_lines:
            return raw_report.splitlines()
        return [raw_report]

    def _translate(self, flat_record):
        """
        Build the event fields for one flattened record.

        Time fields are converted with ``DateTime.convert`` and URL fields
        without a scheme get ``default_url_protocol`` prepended. Empty URL
        values and absent time values are left out of the result.
        """
        fields = {}
        for event_key, json_key in self.translate_fields.items():
            value = flat_record.get(json_key)

            if event_key in self.TIME_FIELDS:
                if value is None:
                    # Key absent from this record: nothing to convert.
                    continue
                try:
                    value = int(value)
                except (TypeError, ValueError):
                    # Not an integer timestamp; hand the value on unchanged.
                    pass
                value = DateTime.convert(value, format=self.time_format)

            elif event_key.endswith('.url'):
                if not value:
                    continue
                if '://' not in value:
                    value = self.default_url_protocol + value

            fields[event_key] = value
        return fields

    def _expand(self, fields):
        """
        Fan one set of event fields out over the ``multiple_msg_field`` list.

        When that field holds a list, one copy of ``fields`` is returned per
        list element, with the field set to that element. NOTE: an empty list
        therefore yields no event at all. Otherwise ``fields`` is returned as
        the only entry.
        """
        key = self.multiple_msg_field
        if key in fields and isinstance(fields[key], list):
            expanded = []
            for value in fields[key]:
                single = fields.copy()
                single[key] = value
                expanded.append(single)
            return expanded
        return [fields]

    def process(self):
        """
        Convert the received report into events and send them.

        The report's raw data is decoded, split into records, and each record
        is translated into one or more events. The message is acknowledged
        after all records have been handled.
        """
        report = self.receive_message()
        raw_report = base64_decode(report["raw"])

        for record in self._split_report(raw_report):
            if not record:
                continue

            # Records taken from inside a JSON document are already decoded.
            msg = record if self.json_data_format else Message.unserialize(record)
            fields = self._translate(self.flatten_json(msg))
            # The raw payload is identical for every event of this record.
            raw = Message.serialize(record) if self.json_data_format else record

            for event_fields in self._expand(fields):
                event = self.new_event(report)
                event.update(event_fields)

                if self.classification_type and "classification.type" not in event:
                    event.add('classification.type', self.classification_type)
                event['raw'] = raw

                self.send_message(event)

        self.acknowledge_message()


BOT = JSONCustomParserBot

intelmq/tests/bots/parsers/json_custom/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
{
2+
"complete": false,
3+
"count": 9632240,
4+
"data": [
5+
{
6+
"ip": "179.124.36.196",
7+
"seen": true,
8+
"classification": "malicious",
9+
"spoofable": false,
10+
"first_seen": "2020-01-13",
11+
"last_seen": "2020-12-14",
12+
"actor": "unknown",
13+
"tags": [
14+
"SSH Scanner",
15+
"SSH Worm"
16+
],
17+
"cve": [],
18+
"metadata": {
19+
"country": "Brazil",
20+
"country_code": "BR",
21+
"city": "São Paulo",
22+
"organization": "EQUINIX BRASIL",
23+
"region": "São Paulo",
24+
"rdns": "196.36.124.179.static.sp2.alog.com.br",
25+
"asn": "AS16397",
26+
"tor": false,
27+
"os": "Linux 3.1-3.10",
28+
"category": "isp",
29+
"vpn": false,
30+
"vpn_service": ""
31+
},
32+
"raw_data": {
33+
"scan": [
34+
{
35+
"port": 22,
36+
"protocol": "TCP"
37+
},
38+
{
39+
"port": 2222,
40+
"protocol": "TCP"
41+
}
42+
],
43+
"web": {},
44+
"ja3": []
45+
}
46+
},
47+
{
48+
"ip": "189.86.227.150",
49+
"seen": true,
50+
"classification": "malicious",
51+
"spoofable": false,
52+
"first_seen": "2019-01-17",
53+
"last_seen": "2020-12-14",
54+
"actor": "unknown",
55+
"tags": [
56+
"Eternalblue",
57+
"SMB Scanner"
58+
],
59+
"cve": [
60+
"CVE-2017-0144"
61+
],
62+
"metadata": {
63+
"country": "Brazil",
64+
"country_code": "BR",
65+
"city": "Sorocaba",
66+
"organization": "CLARO S.A.",
67+
"region": "São Paulo",
68+
"rdns": "bkbrasil-g2-0-0-15122-iacc02.gna.embratel.net.br",
69+
"asn": "AS4230",
70+
"tor": false,
71+
"os": "Windows 7/8",
72+
"category": "isp",
73+
"vpn": false,
74+
"vpn_service": ""
75+
},
76+
"raw_data": {
77+
"scan": [
78+
{
79+
"port": 445,
80+
"protocol": "TCP"
81+
}
82+
],
83+
"web": {},
84+
"ja3": []
85+
}
86+
}
87+
],
88+
"message": "ok"
89+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"domain": "kreditohneschufa48.de", "fseen": 1576368000, "lseen": 1607731200, "collect": 1607817600, "tags": {"str": ["spam"], "codes": [2]}, "resolved": {"ip": {"a": ["23.60.91.225", "23.200.237.225"], "alias": [], "cname": []}, "whois": {"created": "1970-01-01 00:00:00", "updated": "1970-01-01 00:00:00", "expires": "1970-01-01 00:00:00", "age": 0, "registrar": "unknown", "registrant": "unknown", "havedata": "false"}}, "score": {"total": 3, "src": 60.2, "tags": 0.75, "frequency": 0.07}, "fp": {"alarm": "false", "descr": ""}, "threat": [], "id": "d267c60f-5709-3698-9523-f727f42ab5c7", "title": "RST Threat feed. IOC: kreditohneschufa48.de", "description": "IOC with tags: spam"}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"url": "114.234.166.255:39436/mozi.a", "fseen": 1598918400, "lseen": 1601942400, "collect": 1602028800, "tags": {"str": ["malware"], "codes": [10]}, "score": {"total": 10, "src": 73.06, "tags": 0.89, "frequency": 0.58}, "resolved": {"status": 503}, "fp": {"alarm": "true", "descr": "Resource unavailable"}, "threat": [], "id": "987f5038-298f-37eb-a1d5-a17105f6b4b5", "title": "RST Threat feed. IOC: 114.234.166.255:39436/mozi.a", "description": "IOC with tags: malware"}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# -*- coding: utf-8 -*-
2+
import base64
3+
import os
4+
import unittest
5+
6+
import intelmq.lib.test as test
7+
from intelmq.bots.parsers.json_custom.parser import JSONCustomParserBot
8+
9+
# Read the sample file next to this module and base64-encode its bytes.
with open(os.path.join(os.path.dirname(__file__), 'json_key_data.json'), 'rb') as fh:
    RAW = base64.b64encode(fh.read()).decode()

# Report used as the bot's input message; "raw" carries the encoded sample.
REPORT = {"feed.name": "Test Feed",
          "raw": RAW,
          "__type": "Report",
          }
# Event compared against output message 0 in test_sample.
# The 'raw' literal is one base64 string split over adjacent string literals.
EVENT = {'__type': 'Event',
         'classification.type': 'malware',
         'extra.tags': ['SSH Scanner', 'SSH Worm'],
         'feed.name': 'Test Feed',
         'raw': 'eyJpcCI6ICIxNzkuMTI0LjM2LjE5NiIsICJzZWVuIjogdHJ1ZSwgImNsYXNzaWZpY2F0aW9'
                'uIjogIm1hbGljaW91cyIsICJzcG9vZmFibGUiOiBmYWxzZSwgImZpcnN0X3NlZW4iOiAiMj'
                'AyMC0wMS0xMyIsICJsYXN0X3NlZW4iOiAiMjAyMC0xMi0xNCIsICJhY3RvciI6ICJ1bmtub'
                '3duIiwgInRhZ3MiOiBbIlNTSCBTY2FubmVyIiwgIlNTSCBXb3JtIl0sICJjdmUiOiBbXSwg'
                'Im1ldGFkYXRhIjogeyJjb3VudHJ5IjogIkJyYXppbCIsICJjb3VudHJ5X2NvZGUiOiAiQlI'
                'iLCAiY2l0eSI6ICJTXHUwMGUzbyBQYXVsbyIsICJvcmdhbml6YXRpb24iOiAiRVFVSU5JWC'
                'BCUkFTSUwiLCAicmVnaW9uIjogIlNcdTAwZTNvIFBhdWxvIiwgInJkbnMiOiAiMTk2LjM2L'
                'jEyNC4xNzkuc3RhdGljLnNwMi5hbG9nLmNvbS5iciIsICJhc24iOiAiQVMxNjM5NyIsICJ0'
                'b3IiOiBmYWxzZSwgIm9zIjogIkxpbnV4IDMuMS0zLjEwIiwgImNhdGVnb3J5IjogImlzcCI'
                'sICJ2cG4iOiBmYWxzZSwgInZwbl9zZXJ2aWNlIjogIiJ9LCAicmF3X2RhdGEiOiB7InNjYW'
                '4iOiBbeyJwb3J0IjogMjIsICJwcm90b2NvbCI6ICJUQ1AifSwgeyJwb3J0IjogMjIyMiwgI'
                'nByb3RvY29sIjogIlRDUCJ9XSwgIndlYiI6IHt9LCAiamEzIjogW119LCAiX190eXBlIjog'
                'ImRpY3QifQ==',
         'time.source': '2020-12-14T00:00:00+00:00',
         'source.ip': '179.124.36.196'
         }
36+
37+
38+
class TestJSONCustomParserBot(test.BotTestCase, unittest.TestCase):
    """
    Runs JSONCustomParserBot on a report whose records sit under a JSON key.
    """

    @classmethod
    def set_bot(cls):
        cls.bot_reference = JSONCustomParserBot

    def test_sample(self):
        """ The first output message must equal the expected event. """
        self.input_message = REPORT
        # Event field -> key inside each record of the "data" list.
        field_map = {
            "source.ip": "ip",
            "time.source": "last_seen",
            "extra.tags": "tags",
        }
        self.sysconfig = dict(
            json_data_format=True,
            json_data_key="data",
            type="malware",
            time_format="from_format_midnight|%Y-%m-%d",
            translate_fields=field_map,
        )
        self.run_bot()
        self.assertMessageEqual(0, EVENT)


if __name__ == '__main__':  # pragma: no cover
    unittest.main()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# -*- coding: utf-8 -*-
2+
import base64
3+
import os
4+
import unittest
5+
6+
import intelmq.lib.test as test
7+
from intelmq.bots.parsers.json_custom.parser import JSONCustomParserBot
8+
9+
# Read the sample file next to this module and base64-encode its bytes.
with open(os.path.join(os.path.dirname(__file__), 'multiple_msg.json'), 'rb') as fh:
    RAW1 = base64.b64encode(fh.read()).decode()

# Report used as the bot's input message; "raw" carries the encoded sample.
MULTILINE_REPORT = {"feed.name": "RSTThreats Domain Feed",
                    "raw": RAW1,
                    "__type": "Report",
                    }

# Event compared against output message 0 in test_multiple_msg.
# The 'raw' literal is one base64 string split over adjacent string literals.
MULTIPLE_EVENT1 = {'__type': 'Event',
                   'classification.type': 'malware',
                   'extra.tags': ['spam'],
                   'extra.threat_info': [],
                   'feed.name': 'RSTThreats Domain Feed',
                   'raw': 'eyJkb21haW4iOiAia3JlZGl0b2huZXNjaHVmYTQ4LmRlIiwgImZzZWVuIjogMTU3NjM2O'
                          'DAwMCwgImxzZWVuIjogMTYwNzczMTIwMCwgImNvbGxlY3QiOiAxNjA3ODE3NjAwLCAidG'
                          'FncyI6IHsic3RyIjogWyJzcGFtIl0sICJjb2RlcyI6IFsyXX0sICJyZXNvbHZlZCI6IHs'
                          'iaXAiOiB7ImEiOiBbIjIzLjYwLjkxLjIyNSIsICIyMy4yMDAuMjM3LjIyNSJdLCAiYWxp'
                          'YXMiOiBbXSwgImNuYW1lIjogW119LCAid2hvaXMiOiB7ImNyZWF0ZWQiOiAiMTk3MC0wM'
                          'S0wMSAwMDowMDowMCIsICJ1cGRhdGVkIjogIjE5NzAtMDEtMDEgMDA6MDA6MDAiLCAiZX'
                          'hwaXJlcyI6ICIxOTcwLTAxLTAxIDAwOjAwOjAwIiwgImFnZSI6IDAsICJyZWdpc3RyYXI'
                          'iOiAidW5rbm93biIsICJyZWdpc3RyYW50IjogInVua25vd24iLCAiaGF2ZWRhdGEiOiAi'
                          'ZmFsc2UifX0sICJzY29yZSI6IHsidG90YWwiOiAzLCAic3JjIjogNjAuMiwgInRhZ3MiO'
                          'iAwLjc1LCAiZnJlcXVlbmN5IjogMC4wN30sICJmcCI6IHsiYWxhcm0iOiAiZmFsc2UiLC'
                          'AiZGVzY3IiOiAiIn0sICJ0aHJlYXQiOiBbXSwgImlkIjogImQyNjdjNjBmLTU3MDktMzY'
                          '5OC05NTIzLWY3MjdmNDJhYjVjNyIsICJ0aXRsZSI6ICJSU1QgVGhyZWF0IGZlZWQuIElP'
                          'Qzoga3JlZGl0b2huZXNjaHVmYTQ4LmRlIiwgImRlc2NyaXB0aW9uIjogIklPQyB3aXRoI'
                          'HRhZ3M6IHNwYW0ifQ==',
                   'source.fqdn': 'kreditohneschufa48.de',
                   'source.ip': '23.60.91.225',
                   'time.source': '2020-12-12T00:00:00+00:00'
                   }

# Second expected event: a shallow copy of the first that differs only in source.ip.
MULTIPLE_EVENT2 = MULTIPLE_EVENT1.copy()
MULTIPLE_EVENT2["source.ip"] = "23.200.237.225"
43+
44+
45+
class TestJSONCustomParserBot(test.BotTestCase, unittest.TestCase):
    """
    Runs JSONCustomParserBot on a line-based report that yields several events.
    """

    @classmethod
    def set_bot(cls):
        cls.bot_reference = JSONCustomParserBot

    def test_multiple_msg(self):
        """ Output messages 0 and 1 must equal the two expected events. """
        self.input_message = MULTILINE_REPORT
        # Event field -> flattened key inside each JSON line.
        field_map = {
            "source.fqdn": "domain",
            "time.source": "lseen",
            "extra.tags": "tags.str",
            "extra.threat_info": "threat",
            "source.ip": "resolved.ip.a",
        }
        self.sysconfig = dict(
            splitlines=True,
            type="malware",
            time_format="epoch_millis",
            multiple_msg_field="source.ip",
            translate_fields=field_map,
        )
        self.run_bot()
        for position, expected in enumerate((MULTIPLE_EVENT1, MULTIPLE_EVENT2)):
            self.assertMessageEqual(position, expected)


if __name__ == '__main__':  # pragma: no cover
    unittest.main()

0 commit comments

Comments
 (0)