From aa6e81a71aa9f3f96f83394553927106bc67c337 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Fri, 15 Apr 2022 13:48:36 -0400 Subject: [PATCH 01/35] Improved autocompletion, plus unit test --- src/kestrel/session.py | 41 ++++++++++++++++---- src/kestrel/syntax/kestrel.lark | 34 +++++++++------- src/kestrel/syntax/parser.py | 4 ++ tests/test_completion.py | 69 +++++++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 22 deletions(-) create mode 100644 tests/test_completion.py diff --git a/src/kestrel/session.py b/src/kestrel/session.py index 14aaf9e7..5a05962b 100644 --- a/src/kestrel/session.py +++ b/src/kestrel/session.py @@ -368,7 +368,9 @@ def do_complete(self, code, cursor_pos): A list of suggested strings to complete the code. """ prefix = code[:cursor_pos] - last_word = prefix.split(" ")[-1] + words = prefix.split(" ") + last_word = words[-1] + _logger.debug('code="%s" prefix="%s" last_word="%s"', code, prefix, last_word) if "START" in prefix or "STOP" in prefix: return self._get_complete_timestamp(last_word) @@ -397,19 +399,30 @@ def do_complete(self, code, cursor_pos): _logger.debug("standard auto-complete") try: - self.parse(prefix) + stmt = self.parse(prefix) + _logger.debug("first parse: %s", stmt) + last_stmt = stmt[-1] + if last_stmt["command"] == "assign" and last_stmt["output"] == "_": + # Special case for a varname alone on a line + allnames = [ + v for v in self.get_variable_names() if v.startswith(prefix) + ] + if not allnames: + return ["=", "+"] if prefix.endswith(" ") else [] # If it parses successfully, add something so it will fail self.parse(prefix + " @autocompletions@") except KestrelSyntaxError as e: + _logger.debug("exception: %s", e) + varnames = self.get_variable_names() tmp = [] for token in e.expected: if token == "VARIABLE": - tmp.extend(self.get_variable_names()) + tmp.extend(varnames) elif token == "DATASRC": schemes = self.data_source_manager.schemes() tmp.extend([f"{scheme}://" for scheme in schemes]) - 
tmp.extend(self.get_variable_names()) + tmp.extend(varnames) elif token == "ANALYTICS": schemes = self.analytics_manager.schemes() tmp.extend([f"{scheme}://" for scheme in schemes]) @@ -418,12 +431,23 @@ def do_complete(self, code, cursor_pos): elif token.startswith("STIXPATH"): # TODO: figure out the varname and get its attrs continue + elif token.startswith("STIXPATTERNBODY"): + # TODO: figure out how to complete STIX patterns + continue elif token == "RELATION": - tmp.extend(all_relations) + if last_word: + tmp.extend(get_entity_types()) + else: + tmp.extend(all_relations) elif token == "REVERSED": tmp.append("BY") - varnames = self.get_variable_names() - if last_word not in varnames: + prev_word = words[-2] if len(words) >= 2 else "" + _logger.debug("prev_word = %s", prev_word) + if prev_word in all_relations: + pass + elif prev_word in varnames: + pass + elif last_word not in varnames: # Must be FIND and not GROUP tmp.extend(all_relations) elif token == "FUNCNAME": @@ -434,6 +458,8 @@ def do_complete(self, code, cursor_pos): continue elif token.startswith("__ANON"): continue + elif token == "EQUAL": + tmp.append("=") else: tmp.append(token) allnames = sorted(tmp) @@ -441,6 +467,7 @@ def do_complete(self, code, cursor_pos): suggestions = [ name[len(last_word) :] for name in allnames if name.startswith(last_word) ] + _logger.debug("%s -> %s", allnames, suggestions) return suggestions def close(self): diff --git a/src/kestrel/syntax/kestrel.lark b/src/kestrel/syntax/kestrel.lark index e70ce221..1c6fc68d 100644 --- a/src/kestrel/syntax/kestrel.lark +++ b/src/kestrel/syntax/kestrel.lark @@ -8,23 +8,27 @@ start: statement* // If no VARIABLE is given, default to _ in post-parsing // -statement: VARIABLE "=" command - | command +statement: assignment + | command_no_result +assignment: VARIABLE "=" command_with_result + | command_with_result + // "?" 
at the beginning will inline command -?command: get - | find - | disp - | info - | apply - | join - | sort - | group - | load - | save - | new - | merge - | assign +?command_with_result: get + | find + | join + | sort + | group + | load + | new + | merge + | assign + +?command_no_result: disp + | info + | apply + | save assign: expression diff --git a/src/kestrel/syntax/parser.py b/src/kestrel/syntax/parser.py index 4d116315..f776fdac 100644 --- a/src/kestrel/syntax/parser.py +++ b/src/kestrel/syntax/parser.py @@ -40,6 +40,10 @@ def start(self, args): def statement(self, args): # Kestrel syntax: a statement can only has one command stmt = args.pop() + return stmt + + def assignment(self, args): + stmt = args[1] if len(args) == 2 else args[0] stmt["output"] = _extract_var(args, self.default_variable) return stmt diff --git a/tests/test_completion.py b/tests/test_completion.py new file mode 100644 index 00000000..2d51ceb7 --- /dev/null +++ b/tests/test_completion.py @@ -0,0 +1,69 @@ +import os +import pytest +from kestrel.codegen.relations import all_relations +from kestrel.session import Session +from kestrel.syntax.utils import ( + LITERALS, + AGG_FUNCS, + TRANSFORMS, +) + + +KNOWN_ETYPES = { + 'artifact', 'autonomous-system', 'directory', 'domain-name', + 'email-addr', 'email-message', 'file', 'ipv4-addr', 'ipv6-addr', + 'mac-addr', 'mutex', 'network-traffic', 'process', 'software', + 'url', 'user-account', 'windows-registry-key', 'x-ibm-finding', + 'x-oca-asset', 'x-oca-event' +} + + +@pytest.fixture +def a_session(): + cwd = os.path.dirname(os.path.abspath(__file__)) + bundle = os.path.join(cwd, "test_bundle.json") + session = Session(debug_mode=True) + stmt = ("conns = get network-traffic" + f" from file://{bundle}" + " where [network-traffic:dst_port < 10000]") + session.execute(stmt) + return session + + +@pytest.mark.parametrize( + "code, expected", + [ + ("x", []), # No suggestions + ("x ", {"=", "+"}), + ("c", {"onns"}), + ("conns", ['']), # Empty string 
means word is complete + ("conns ", {"=", "+"}), + ("disp ", {"conns", "_"} | TRANSFORMS), + ("procs = ", {"GET", "FIND", "JOIN", "SORT", "GROUP", "LOAD", "NEW", "conns", "_"} | TRANSFORMS), + ("procs = G", {"ET", "ROUP"}), + ("procs = F", {"IND"}), + ("procs = FI", {"ND"}), + ("procs = FIN", {"D"}), + ("procs = FIND", []), + ("procs = FIND ", KNOWN_ETYPES), + ("procs = FIND p", ["rocess"]), + ("procs = FIND process", ['']), + #("procs = FIND process ", {"created", "loaded", "linked"}), + ("procs = FIND process ", all_relations), + ("procs = FIND process l", {"oaded", "inked"}), + ("procs = FIND process c", {"reated", "ontained", "onns"}), # FIXME: shouldn't suggest var here + ("procs = FIND process created ", {"conns", "_", "BY"}), + ("procs = FIND process created BY ", {"conns", "_"}), + ("grps = GR", {"OUP"}), + ("grps = GROUP ", {"conns", "_"}), + ("grps = GROUP conns ", {"BY"}), + ("grps = GROUP conns by ", []), # TODO: we don't suggest attrs yet + ("urls = get ", KNOWN_ETYPES), + ("urls = get url ", ["FROM", "WHERE"]), + ("urls = get url from ", ["_", "conns", "file://", "http://", "https://", "stixshifter://"]), + ("urls = get url where ", []), + ] +) +def test_do_complete_after_get(a_session, code, expected): + result = a_session.do_complete(code, len(code)) + assert set(result) == set(expected) From 979e6b4cdae67d93b38918d9fa2d33abc6e148d0 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Sun, 17 Apr 2022 00:15:35 -0400 Subject: [PATCH 02/35] Update GOVERNANCE.rst --- GOVERNANCE.rst | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/GOVERNANCE.rst b/GOVERNANCE.rst index e40f16a0..b3510b60 100644 --- a/GOVERNANCE.rst +++ b/GOVERNANCE.rst @@ -46,43 +46,44 @@ Release Procedure A maintainer should release a new Kestrel runtime (PyPI package name: ``kestre-lang``) following the procedure: -1. Update version and changelog +#. 
Update version and changelog - - Sync the local git repo to the latest of the ``develop`` branch. - - Update the ``version`` field in ``setup.cfg``. - - Add changes in ``CHANGELOG.rst`` under a new version section. - - Add new contributors to ``AUTHORS.rst`` if any. - - Commit the updates with the new version number as the message. - - Push the local ``develop`` branch to remote. + #. Sync the local git repo to the latest of the ``develop`` branch. + #. Update the ``version`` field in ``setup.cfg``. + #. Add changes in ``CHANGELOG.rst`` under a new version section. + #. Add new contributors to ``AUTHORS.rst`` if any. + #. Commit the updates with the new version number as the message. + #. Push the local ``develop`` branch to remote. -2. Graduate code to the ``release`` branch +#. Graduate code to the ``release`` branch - - Open a PR to merge the ``develop`` branch to the ``release`` branch. Use the version number as the PR title. + #. Open a PR to merge the ``develop`` branch to the ``release`` branch. Use the version number as the PR title. - - Merge the PR. + #. Merge the PR. -3. Create a new release +#. Create a new release - - Go to the release page and click *Draft a new release*. + #. Go to the release page and click *Draft a new release*. - - Type the version number as the new tag to create. + #. Type the version number as the new tag to create. - - Choose ``release`` branch as the *Target*. + #. Choose ``release`` branch as the *Target*. - - Specify a release title. Use the version number for ordinary release. + #. Specify a release title. Use the version number for ordinary release. - - Write a summary of the release. + #. Write a summary of the release. - Patch number release: copy the CHANGELOG entries. - Minor number release: may have a TLDR at the beginning highlighting the most important new feature. - - Hit the *Publish release* button. + #. Hit the *Publish release* button. -4. After release check +#. 
After release check - Check `kestrel-lang on PyPI`_ after a few minutes to confirm new package built and released. - May activate/pin the released version of Kestrel documentation at `readthedocs version control`_. + - Announce the release at OCA Kestrel channel. Vulnerability Disclosure ------------------------ From 13ba0f4342eaa33d2709526352050a664a4f6906 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Mon, 18 Apr 2022 14:23:30 -0400 Subject: [PATCH 03/35] Add DisplayWarning --- src/kestrel/codegen/commands.py | 17 ++++++++++++++--- src/kestrel/codegen/display.py | 17 +++++++++++++++++ tests/test_command_get.py | 21 ++++++++++++++++++++- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/src/kestrel/codegen/commands.py b/src/kestrel/codegen/commands.py index 5128181f..0d0a2cbf 100644 --- a/src/kestrel/codegen/commands.py +++ b/src/kestrel/codegen/commands.py @@ -31,8 +31,9 @@ from kestrel.semantics import get_entity_table, get_entity_type from kestrel.symboltable import new_var from kestrel.syntax.parser import get_all_input_var_names +from kestrel.syntax.utils import get_entity_types from kestrel.codegen.data import load_data, load_data_file, dump_data_to_file -from kestrel.codegen.display import DisplayDataframe, DisplayDict +from kestrel.codegen.display import DisplayDataframe, DisplayDict, DisplayWarning from kestrel.codegen.pattern import build_pattern, build_pattern_from_ids from kestrel.codegen.queries import ( compile_specific_relation_to_query, @@ -226,6 +227,7 @@ def get(stmt, session): return_type = stmt["type"] start_offset = session.config["stixquery"]["timerange_start_offset"] end_offset = session.config["stixquery"]["timerange_stop_offset"] + display = None pattern = build_pattern( stmt["patternbody"], @@ -237,10 +239,14 @@ def get(stmt, session): ) if "variablesource" in stmt: + input_type = get_entity_table(stmt["variablesource"], session.symtable) + output_type = stmt["type"] + if input_type != output_type: + pass # TODO: new 
exception type? session.store.filter( stmt["output"], stmt["type"], - get_entity_table(stmt["variablesource"], session.symtable), + input_type, pattern, ) output = new_var(session.store, return_var_table, [], stmt, session.symtable) @@ -325,10 +331,15 @@ def get(stmt, session): output = new_var(session.store, return_var_table, [], stmt, session.symtable) + if not len(output): + if not return_type.startswith("x-") and return_type not in ( + set(session.store.types()) | set(get_entity_types()) + ): + display = DisplayWarning(f'unknown entity type "{return_type}"') else: raise KestrelInternalError(f"unknown type of source in {str(stmt)}") - return output, None + return output, display @_debug_logger diff --git a/src/kestrel/codegen/display.py b/src/kestrel/codegen/display.py index 33ca1e0d..e05d468f 100644 --- a/src/kestrel/codegen/display.py +++ b/src/kestrel/codegen/display.py @@ -163,3 +163,20 @@ def __init__(self, figure): figure.savefig(vfile, format="svg") svg = vfile.getvalue() super().__init__(svg) + + +class DisplayWarning(AbstractDisplay): + def __init__(self, text): + self.text = text + + def to_string(self): + return self.text + + def to_html(self): + return f'
[WARNING] {self.text}
' + + def to_json(self): + return json.dumps(self.to_dict()) + + def to_dict(self): + return {"display": "warning", "data": self.text} diff --git a/tests/test_command_get.py b/tests/test_command_get.py index 63bb8ef8..b42e367e 100644 --- a/tests/test_command_get.py +++ b/tests/test_command_get.py @@ -1,7 +1,9 @@ -import pytest import json import os +import pytest + +from kestrel.codegen.display import DisplayWarning from kestrel.session import Session @@ -80,3 +82,20 @@ def test_get_multiple_stixshifter_stix_bundles(set_stixshifter_stix_bundles): "teamviewer_service.exe", "teamviewer.exe", "vmware.exe", "dashost.exe", "applemobiledeviceservice.exe", "svctest.exe", "vmware-hostd.exe"] + +def test_get_wrong_type(file_stix_bundles): + with Session() as s: + stmt = f"var = GET foo FROM file://{file_stix_bundles[0]} WHERE [process:name='compattelrunner.exe']" + + output = s.execute(stmt) + warnings = [] + for o in output: + print(json.dumps(o.to_dict(), indent=4)) + if isinstance(o, DisplayWarning): + warnings.append(o) + assert len(warnings) == 1 + assert "foo" in warnings[0].to_string() + v = s.get_variable("var") + print(json.dumps(v, indent=4)) + assert len(v) == 0 + From a6187961b761ff41032b56507540d3e1e6ecf4a7 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 18 Apr 2022 23:31:52 -0400 Subject: [PATCH 04/35] fix #205 with firepit query gen (4 cases) for refs --- src/kestrel/codegen/queries.py | 51 ++++++++++++++++++++---- tests/kestrel_python_analytics.py | 1 + tests/test_command_disp.py | 4 +- tests/test_command_find.py | 65 +++++++++++++++++++------------ tests/test_command_get.py | 45 +++++++++++++-------- tests/test_command_group.py | 30 +++++++++----- tests/test_parser.py | 40 +++++-------------- tests/test_timestamped.py | 18 +++++---- 8 files changed, 157 insertions(+), 97 deletions(-) diff --git a/src/kestrel/codegen/queries.py b/src/kestrel/codegen/queries.py index 8557ca01..9e12e015 100644 --- a/src/kestrel/codegen/queries.py +++ 
b/src/kestrel/codegen/queries.py @@ -28,29 +28,63 @@ def compile_specific_relation_to_query( stix_src_refs, stix_tgt_refs = stix_2_0_ref_mapping[(entity_x, relation, entity_y)] for ref_name in stix_src_refs: + # e.g., # STIX: ("process", "created", "network-traffic"): (["opened_connection_refs"], []) + # # type(p) == process; is_reversed == True + # nt = FIND network-traffic CREATED BY p + # # type(nt) == network-traffic; is_reversed == False + # p = FIND process CREATED nt + # + # It is just aligned that is_reversed == whether input_var is + # - EntityX in stix_2_0_ref_mapping + # - the source_ref in the __reflist table of firepit v2.0 + var_is_source = is_reversed + + (var_attr, ret_attr) = (ref_name, "id") if var_is_source else ("id", ref_name) + # if there are multiple options, use first one found in DB - (var_attr, ret_attr) = (ref_name, "id") if is_reversed else ("id", ref_name) if ref_name.endswith("_refs"): - query = _generate_reflist_query(input_var_name, ref_name, entity_y) + query = _generate_reflist_query( + input_var_name, var_is_source, ref_name, return_type + ) + elif var_attr in input_var_attrs and ret_attr in return_type_attrs: query = _generate_ref_query( input_var_name, input_type, var_attr, return_type, ret_attr ) + else: continue + return query for ref_name in stix_tgt_refs: + # e.g., # STIX: ("autonomous-system", "owned", "ipv4-addr"): ([], ["belongs_to_refs"]) + # # type(a) == autonomous-system; is_reversed == True + # ip = FIND ipv4-addr OWNED BY a + # # type(ip) == ipv4-addr; is_reversed == False + # a = FIND autonomous-system OWNED ip + # + # It is just aligned that (not is_reversed) == whether input_var is + # - EntityX in stix_2_0_ref_mapping + # - the source_ref in the __reflist table of firepit v2.0 + var_is_source = not is_reversed + + (var_attr, ret_attr) = (ref_name, "id") if var_is_source else ("id", ref_name) + # if there are multiple options, use first one found in DB - (var_attr, ret_attr) = ("id", ref_name) if is_reversed else 
(ref_name, "id") if ref_name.endswith("_refs"): - query = _generate_reflist_query(input_var_name, ref_name, entity_x) + query = _generate_reflist_query( + input_var_name, var_is_source, ref_name, return_type + ) + elif var_attr in input_var_attrs and ret_attr in return_type_attrs: query = _generate_ref_query( input_var_name, input_type, var_attr, return_type, ret_attr ) + else: continue + return query return None @@ -111,12 +145,15 @@ def _generate_ref_query(input_var_name, input_type, var_attr, ret_type, ret_attr ) -def _generate_reflist_query(input_var_name, ref_name, entity_y): +def _generate_reflist_query(input_var_name, var_is_source, ref_name, entity_y): + var_ref_pos, y_ref_pos = ( + ("source_ref", "target_ref") if var_is_source else ("target_ref", "source_ref") + ) return Query( [ Table(input_var_name), - Join("__reflist", "id", "=", "source_ref"), - Join(entity_y, "target_ref", "=", "id"), + Join("__reflist", "id", "=", var_ref_pos), + Join(entity_y, y_ref_pos, "=", "id"), Filter([Predicate("ref_name", "=", ref_name)]), Projection([Column("*", entity_y)]), # All columns from entity_y Unique(), diff --git a/tests/kestrel_python_analytics.py b/tests/kestrel_python_analytics.py index 667980ac..1129d89f 100644 --- a/tests/kestrel_python_analytics.py +++ b/tests/kestrel_python_analytics.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 + def enrich_one_variable(dataframe): newattr = ["newval" + str(i) for i in range(dataframe.shape[0])] dataframe["x_new_attr"] = newattr diff --git a/tests/test_command_disp.py b/tests/test_command_disp.py index fbc71264..69d6f307 100644 --- a/tests/test_command_disp.py +++ b/tests/test_command_disp.py @@ -33,7 +33,7 @@ def test_disp_grouped_procs(): s.execute(stmt) s.execute("grpvar = group newvar by name") out = s.execute("DISP grpvar") - data = out[0].to_dict()['data'] + data = out[0].to_dict()["data"] assert len(data) == 2 @@ -48,5 +48,5 @@ def test_disp_grouped_conns(): s.execute(stmt) s.execute("grpvar = group newvar by 
dst_ref.value") out = s.execute("DISP grpvar") - data = out[0].to_dict()['data'] + data = out[0].to_dict()["data"] assert len(data) == 2 diff --git a/tests/test_command_find.py b/tests/test_command_find.py index d9a86247..21e6a3df 100644 --- a/tests/test_command_find.py +++ b/tests/test_command_find.py @@ -66,28 +66,10 @@ def test_find_srcs(fake_bundle_file): srcs = FIND ipv4-addr CREATED conns """ s.execute(stmt) - srcs = s.get_variable('srcs') + srcs = s.get_variable("srcs") assert len(srcs) == 24 -def test_find_procs(proc_bundle_file): - with Session() as s: - stmt = f""" -procs = get process - from file://{proc_bundle_file} - where [process:name LIKE '%'] -conns = FIND network-traffic CREATED BY procs -""" - s.execute(stmt) - conns = s.get_variable('conns') - assert len(conns) == 853 # FIXME: should be 948, I think (id collisions for network-traffic) - - # DISP with a ref (parent_ref) and ambiguous column (command_line) - disp_out = s.execute("DISP procs ATTR name, parent_ref.name, command_line") - data = disp_out[0].to_dict()["data"] - print(json.dumps(data, indent=4)) - - def test_find_file_linked_to_process(proc_bundle_file): with Session() as s: stmt = f""" @@ -97,12 +79,12 @@ def test_find_file_linked_to_process(proc_bundle_file): files = FIND file LINKED procs """ s.execute(stmt) - procs = s.get_variable('procs') + procs = s.get_variable("procs") print(json.dumps(procs, indent=4)) assert len(procs) == 7 * 3 # TEMP: 3 records per entity - files = s.get_variable('files') + files = s.get_variable("files") print(json.dumps(files, indent=4)) - assert len(files) == 6 #TODO: double check this count + assert len(files) == 6 # TODO: double check this count def test_find_file_loaded_by_process(proc_bundle_file): @@ -114,10 +96,10 @@ def test_find_file_loaded_by_process(proc_bundle_file): files = FIND file LOADED BY procs """ s.execute(stmt) - procs = s.get_variable('procs') + procs = s.get_variable("procs") print(json.dumps(procs, indent=4)) assert len(procs) == 7 
* 3 # TEMP: 3 records per entity - files = s.get_variable('files') + files = s.get_variable("files") print(json.dumps(files, indent=4)) assert len(files) == 1 @@ -131,6 +113,39 @@ def test_find_process_created_process(proc_bundle_file): parents = FIND process CREATED procs """ s.execute(stmt) - data = s.get_variable('parents') + data = s.get_variable("parents") print(json.dumps(data, indent=4)) assert len(data) + + +def test_find_refs_resolution_not_reversed_src_ref(proc_bundle_file): + with Session() as s: + stmt = f""" +nt = get network-traffic + from file://{proc_bundle_file} + where [network-traffic:src_port > 0] +p = FIND process CREATED nt +""" + s.execute(stmt) + p = s.get_variable("p") + assert len(p) == 1897 + + +def test_find_refs_resolution_reversed_src_ref(proc_bundle_file): + with Session() as s: + stmt = f""" +procs = get process + from file://{proc_bundle_file} + where [process:name LIKE '%'] +conns = FIND network-traffic CREATED BY procs +""" + s.execute(stmt) + conns = s.get_variable("conns") + assert ( + len(conns) == 853 + ) # FIXME: should be 948, I think (id collisions for network-traffic) + + # DISP with a ref (parent_ref) and ambiguous column (command_line) + disp_out = s.execute("DISP procs ATTR name, parent_ref.name, command_line") + data = disp_out[0].to_dict()["data"] + print(json.dumps(data, indent=4)) diff --git a/tests/test_command_get.py b/tests/test_command_get.py index 63bb8ef8..5dc286fd 100644 --- a/tests/test_command_get.py +++ b/tests/test_command_get.py @@ -8,24 +8,26 @@ @pytest.fixture() def file_stix_bundles(): cwd = os.path.dirname(os.path.abspath(__file__)) - return [os.path.join(cwd, "test_bundle_4.json"), - os.path.join(cwd, "test_bundle_5.json")] + return [ + os.path.join(cwd, "test_bundle_4.json"), + os.path.join(cwd, "test_bundle_5.json"), + ] @pytest.fixture() def set_stixshifter_stix_bundles(): cfg = '{"auth": {"username": "","password": ""}}' - connector = 'stix_bundle' - stixshifter_data_url = 
'https://raw.githubusercontent.com/opencybersecurityalliance/stix-shifter/develop/data/cybox' + connector = "stix_bundle" + stixshifter_data_url = "https://raw.githubusercontent.com/opencybersecurityalliance/stix-shifter/develop/data/cybox" host1 = f"{stixshifter_data_url}/carbon_black/cb_observed_156.json" host2 = f"{stixshifter_data_url}/qradar/qradar_custom_process_observable.json" - os.environ['STIXSHIFTER_HOST1_CONNECTION'] = json.dumps({"host": host1}) - os.environ['STIXSHIFTER_HOST1_CONNECTOR'] = connector - os.environ['STIXSHIFTER_HOST1_CONFIG'] = cfg - os.environ['STIXSHIFTER_HOST2_CONNECTION'] = json.dumps({"host": host2}) - os.environ['STIXSHIFTER_HOST2_CONNECTOR'] = connector - os.environ['STIXSHIFTER_HOST2_CONFIG'] = cfg + os.environ["STIXSHIFTER_HOST1_CONNECTION"] = json.dumps({"host": host1}) + os.environ["STIXSHIFTER_HOST1_CONNECTOR"] = connector + os.environ["STIXSHIFTER_HOST1_CONFIG"] = cfg + os.environ["STIXSHIFTER_HOST2_CONNECTION"] = json.dumps({"host": host2}) + os.environ["STIXSHIFTER_HOST2_CONNECTOR"] = connector + os.environ["STIXSHIFTER_HOST2_CONFIG"] = cfg def test_get_single_file(file_stix_bundles): @@ -42,7 +44,7 @@ def test_get_single_file(file_stix_bundles): def test_get_multiple_file_stix_bundles(file_stix_bundles): with Session() as s: - file_bundles = ','.join(file_stix_bundles) + file_bundles = ",".join(file_stix_bundles) stmt = f"var = GET process FROM file://{file_bundles} WHERE [process:name='compattelrunner.exe']" s.execute(stmt) @@ -68,7 +70,9 @@ def test_get_single_stixshifter_stix_bundle(set_stixshifter_stix_bundles): def test_get_multiple_stixshifter_stix_bundles(set_stixshifter_stix_bundles): with Session() as s: # default data source schema is stixshifter - stmt = "var = GET process FROM HOST1,HOST2 WHERE [ipv4-addr:value = '127.0.0.1']" + stmt = ( + "var = GET process FROM HOST1,HOST2 WHERE [ipv4-addr:value = '127.0.0.1']" + ) s.execute(stmt) v = s.get_variable("var") @@ -76,7 +80,16 @@ def 
test_get_multiple_stixshifter_stix_bundles(set_stixshifter_stix_bundles): for i in range(len(v)): assert v[i]["type"] == "process" assert v[i]["name"] in [ - "powershell.exe", "(unknown)", "explorer.exe", "firefox.exe", "ntoskrnl.exe", - "teamviewer_service.exe", "teamviewer.exe", "vmware.exe", "dashost.exe", - "applemobiledeviceservice.exe", "svctest.exe", "vmware-hostd.exe"] - + "powershell.exe", + "(unknown)", + "explorer.exe", + "firefox.exe", + "ntoskrnl.exe", + "teamviewer_service.exe", + "teamviewer.exe", + "vmware.exe", + "dashost.exe", + "applemobiledeviceservice.exe", + "svctest.exe", + "vmware-hostd.exe", + ] diff --git a/tests/test_command_group.py b/tests/test_command_group.py index 9df9f982..b5881661 100644 --- a/tests/test_command_group.py +++ b/tests/test_command_group.py @@ -38,9 +38,13 @@ def test_group_src_dst(fake_bundle_file): where [network-traffic:dst_port > 0]""", ) - session.execute(("grps = group conns by " - "network-traffic:src_ref.value," - "network-traffic:dst_ref.value")) + session.execute( + ( + "grps = group conns by " + "network-traffic:src_ref.value," + "network-traffic:dst_ref.value" + ) + ) assert "grps" in session.get_variable_names() grps = session.get_variable("grps") assert grps is not None @@ -53,7 +57,7 @@ def test_group_src_dst(fake_bundle_file): ("max", "dst_ref.value", "max_dst_ref.value"), ("count", "dst_ref.value", "count_dst_ref.value"), ("nunique", "dst_ref.value", "nunique_dst_ref.value"), - ] + ], ) def test_group_srcref_agg(fake_bundle_file, agg_func, attr, expected): with Session(debug_mode=True) as session: @@ -63,8 +67,12 @@ def test_group_srcref_agg(fake_bundle_file, agg_func, attr, expected): where [network-traffic:dst_port > 0]""", ) - session.execute(("src_grps = group conns by network-traffic:src_ref.value" - f" with {agg_func}({attr})")) + session.execute( + ( + "src_grps = group conns by network-traffic:src_ref.value" + f" with {agg_func}({attr})" + ) + ) assert "src_grps" in 
session.get_variable_names() src_grps = session.get_variable("src_grps") assert src_grps is not None @@ -77,7 +85,7 @@ def test_group_srcref_agg(fake_bundle_file, agg_func, attr, expected): ("max", "dst_ref.value", "rand_value"), ("count", "dst_ref.value", "whatever"), ("nunique", "dst_ref.value", "unique_dests"), - ] + ], ) def test_group_srcref_agg_alias(fake_bundle_file, agg_func, attr, alias): with Session(debug_mode=True) as session: @@ -87,8 +95,12 @@ def test_group_srcref_agg_alias(fake_bundle_file, agg_func, attr, alias): where [network-traffic:dst_port > 0]""", ) - session.execute(("src_grps = group conns by network-traffic:src_ref.value" - f" with {agg_func}({attr}) as {alias}")) + session.execute( + ( + "src_grps = group conns by network-traffic:src_ref.value" + f" with {agg_func}({attr}) as {alias}" + ) + ) assert "src_grps" in session.get_variable_names() src_grps = session.get_variable("src_grps") assert src_grps is not None diff --git a/tests/test_parser.py b/tests/test_parser.py index aa800df8..8b84f2f4 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -109,11 +109,7 @@ def test_grouping_1(): assert result["input"] == "x" assert result["paths"] == ["foo"] assert result["aggregations"] == [ - { - 'attr': 'baz', - 'func': 'sum', - 'alias': 'sum_baz' - }, + {"attr": "baz", "func": "sum", "alias": "sum_baz"}, ] @@ -125,40 +121,22 @@ def test_grouping_2(): assert result["input"] == "x" assert result["paths"] == ["foo", "bar"] assert result["aggregations"] == [ - { - 'attr': 'baz', - 'func': 'max', - 'alias': 'biggest' - }, - { - 'attr': 'blah', - 'func': 'min', - 'alias': 'min_blah' - }, + {"attr": "baz", "func": "max", "alias": "biggest"}, + {"attr": "blah", "func": "min", "alias": "min_blah"}, ] def test_grouping_3(): - results = parse("y = group x by foo with avg(bar), count(baz), max(blah) as whatever") + results = parse( + "y = group x by foo with avg(bar), count(baz), max(blah) as whatever" + ) result = results[0] print(result) assert 
result["command"] == "group" assert result["input"] == "x" assert result["paths"] == ["foo"] assert result["aggregations"] == [ - { - 'attr': 'bar', - 'func': 'avg', - 'alias': 'avg_bar' - }, - { - 'attr': 'baz', - 'func': 'count', - 'alias': 'count_baz' - }, - { - 'attr': 'blah', - 'func': 'max', - 'alias': 'whatever' - }, + {"attr": "bar", "func": "avg", "alias": "avg_bar"}, + {"attr": "baz", "func": "count", "alias": "count_baz"}, + {"attr": "blah", "func": "max", "alias": "whatever"}, ] diff --git a/tests/test_timestamped.py b/tests/test_timestamped.py index f66d7ad9..25d31793 100644 --- a/tests/test_timestamped.py +++ b/tests/test_timestamped.py @@ -21,27 +21,31 @@ def test_timestamped_disp(fake_bundle_file): """ s.execute(stmt) out = s.execute("DISP conns") - data = out[0].to_dict()['data'] + data = out[0].to_dict()["data"] assert len(data) == 29 assert "first_observed" not in data[0] out = s.execute("DISP TIMESTAMPED(conns)") - data = out[0].to_dict()['data'] + data = out[0].to_dict()["data"] assert len(data) == 29 assert "first_observed" in data[0] out = s.execute("DISP TIMESTAMPED(conns) LIMIT 5") - data = out[0].to_dict()['data'] + data = out[0].to_dict()["data"] assert len(data) == 5 assert "first_observed" in data[0] - out = s.execute("DISP TIMESTAMPED(conns) ATTR first_observed, src_ref.value, src_port") - data = out[0].to_dict()['data'] + out = s.execute( + "DISP TIMESTAMPED(conns) ATTR first_observed, src_ref.value, src_port" + ) + data = out[0].to_dict()["data"] assert len(data) == 29 assert "first_observed" in data[0] assert "src_ref.value" in data[0] assert "src_port" in data[0] assert "dst_ref.value" not in data[0] assert "dst_port" not in data[0] - out = s.execute("DISP TIMESTAMPED(conns) ATTR first_observed, src_ref.value, src_port LIMIT 5") - data = out[0].to_dict()['data'] + out = s.execute( + "DISP TIMESTAMPED(conns) ATTR first_observed, src_ref.value, src_port LIMIT 5" + ) + data = out[0].to_dict()["data"] assert len(data) == 5 assert 
"first_observed" in data[0] assert "src_ref.value" in data[0] From e85b72d7ee415a3c862f70cf731d1c8edaa15eef Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 18 Apr 2022 23:48:21 -0400 Subject: [PATCH 05/35] minor comment fix --- src/kestrel/codegen/queries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kestrel/codegen/queries.py b/src/kestrel/codegen/queries.py index 9e12e015..044cdc14 100644 --- a/src/kestrel/codegen/queries.py +++ b/src/kestrel/codegen/queries.py @@ -65,7 +65,7 @@ def compile_specific_relation_to_query( # a = FIND autonomous-system OWNED ip # # It is just aligned that (not is_reversed) == whether input_var is - # - EntityX in stix_2_0_ref_mapping + # - EntityY in stix_2_0_ref_mapping # - the source_ref in the __reflist table of firepit v2.0 var_is_source = not is_reversed From a282680022b49bb1acf413b1d9c3f0da0eca1f0a Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 19 Apr 2022 00:01:17 -0400 Subject: [PATCH 06/35] minor update: better var name --- src/kestrel/codegen/queries.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kestrel/codegen/queries.py b/src/kestrel/codegen/queries.py index 044cdc14..783713c3 100644 --- a/src/kestrel/codegen/queries.py +++ b/src/kestrel/codegen/queries.py @@ -145,17 +145,17 @@ def _generate_ref_query(input_var_name, input_type, var_attr, ret_type, ret_attr ) -def _generate_reflist_query(input_var_name, var_is_source, ref_name, entity_y): - var_ref_pos, y_ref_pos = ( +def _generate_reflist_query(input_var_name, var_is_source, ref_name, ret_type): + var_ref_pos, ret_ref_pos = ( ("source_ref", "target_ref") if var_is_source else ("target_ref", "source_ref") ) return Query( [ Table(input_var_name), Join("__reflist", "id", "=", var_ref_pos), - Join(entity_y, y_ref_pos, "=", "id"), + Join(ret_type, ret_ref_pos, "=", "id"), Filter([Predicate("ref_name", "=", ref_name)]), - Projection([Column("*", entity_y)]), # All columns from entity_y + 
Projection([Column("*", ret_type)]), # All columns from ret_type Unique(), ] ) From 5dfcc8410e3420fd74cbca90af36c5b49576ed95 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 19 Apr 2022 07:51:03 -0400 Subject: [PATCH 07/35] Don't try to deref a path if there's already a column by that name --- src/kestrel/codegen/commands.py | 2 +- tests/test_session.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/kestrel/codegen/commands.py b/src/kestrel/codegen/commands.py index 5128181f..9153aaf3 100644 --- a/src/kestrel/codegen/commands.py +++ b/src/kestrel/codegen/commands.py @@ -616,7 +616,7 @@ def _set_projection(store, entity_table, query, paths): for path in paths: if path == "*": return - if "_ref" in path: # This seems like a hack + if "_ref" in path and path not in cols: # This seems like a hack joins, table, column = store.path_joins(entity_table, None, path) if table not in joined: query.extend(joins) diff --git a/tests/test_session.py b/tests/test_session.py index eb754f36..2a34c9ea 100644 --- a/tests/test_session.py +++ b/tests/test_session.py @@ -299,3 +299,16 @@ def test_sha256_attr_name(cbcloud_powershell_bundle): df["binary_ref.hashes.'SHA-256'"][0] == "de96a6e69944335375dc1ac238336066889d9ffc7d73628ef4fe1b1b160ab32c" ) + + +def test_disp_after_group(fake_bundle_file): + with Session(debug_mode=True) as session: + session.execute( + f""" +conns = get network-traffic from file://{fake_bundle_file} + where [network-traffic:dst_port < 10000] +grouped = group conns by src_ref.value, dst_ref.value with count(src_ref.value) as count +""") + out = session.execute("DISP grouped ATTR src_ref.value, dst_ref.value, count") + df = out[0].dataframe + assert list(df.columns) == ["src_ref.value", "dst_ref.value", "count"] From a12a9d64b0af60f5e823c9687e8caa5e54b7566f Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 19 Apr 2022 16:38:15 -0400 Subject: [PATCH 08/35] Change FUNCNAME from a terminal to an inlined rule --- 
src/kestrel/syntax/kestrel.lark | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kestrel/syntax/kestrel.lark b/src/kestrel/syntax/kestrel.lark index 1c6fc68d..8a9071fd 100644 --- a/src/kestrel/syntax/kestrel.lark +++ b/src/kestrel/syntax/kestrel.lark @@ -107,7 +107,7 @@ path_list: STIXPATH ("," STIXPATH)* -> valuelist agg_list: agg ("," agg)* -agg: FUNCNAME "(" STIXPATH ")" ("AS"i alias)? +agg: funcname "(" STIXPATH ")" ("AS"i alias)? ?alias: CNAME @@ -128,7 +128,7 @@ REVERSED: "by"i COMMENT: /#.*/ URI: PATH -FUNCNAME: (MIN|MAX|SUM|AVG|COUNT|NUNIQUE) +?funcname: (MIN|MAX|SUM|AVG|COUNT|NUNIQUE) MIN: "min"i MAX: "max"i SUM: "sum"i From 9ec8baa781cef158d3bfde6aa225b230f44d5c71 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 19 Apr 2022 17:15:22 -0400 Subject: [PATCH 09/35] fix two issues for complete FIND command 1. the terminal "by"i is now differentiated between FIND and SORT/GROUP 2. now variables such as `conns` are not in the completion of 'FIND process c' --- src/kestrel/session.py | 20 ++++++++++---------- src/kestrel/syntax/kestrel.lark | 9 +++++---- tests/test_completion.py | 3 ++- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/kestrel/session.py b/src/kestrel/session.py index 5a05962b..0886f4dc 100644 --- a/src/kestrel/session.py +++ b/src/kestrel/session.py @@ -370,6 +370,7 @@ def do_complete(self, code, cursor_pos): prefix = code[:cursor_pos] words = prefix.split(" ") last_word = words[-1] + last_char = prefix[-1] _logger.debug('code="%s" prefix="%s" last_word="%s"', code, prefix, last_word) if "START" in prefix or "STOP" in prefix: @@ -439,17 +440,16 @@ def do_complete(self, code, cursor_pos): tmp.extend(get_entity_types()) else: tmp.extend(all_relations) - elif token == "REVERSED": + elif token == "BY": tmp.append("BY") - prev_word = words[-2] if len(words) >= 2 else "" - _logger.debug("prev_word = %s", prev_word) - if prev_word in all_relations: - pass - elif prev_word in varnames: - pass - elif 
last_word not in varnames: - # Must be FIND and not GROUP - tmp.extend(all_relations) + elif token == "REVERSED": + if last_char == " ": + tmp.append("BY") + else: + # "procs = FIND process l" will expect ['REVERSED', 'VARIABLE'] + # override results from the case of VARIABLE + tmp = all_relations + break elif token == "FUNCNAME": tmp.extend(AGG_FUNCS) elif token == "TRANSFORM": diff --git a/src/kestrel/syntax/kestrel.lark b/src/kestrel/syntax/kestrel.lark index 1c6fc68d..34b78008 100644 --- a/src/kestrel/syntax/kestrel.lark +++ b/src/kestrel/syntax/kestrel.lark @@ -44,11 +44,11 @@ find: "find"i ENTITY_TYPE RELATION (REVERSED)? VARIABLE (starttime endtime)? apply: "apply"i ANALYTICS "on"i variables ("with"i anaparams)? -join: "join"i VARIABLE "," VARIABLE ("by"i STIXPATH "," STIXPATH)? +join: "join"i VARIABLE "," VARIABLE (BY STIXPATH "," STIXPATH)? -sort: "sort"i VARIABLE "by"i STIXPATH (ASC|DESC)? +sort: "sort"i VARIABLE BY STIXPATH (ASC|DESC)? -group: "group"i VARIABLE "by"i path_list ("with"i agg_list)? +group: "group"i VARIABLE BY path_list ("with"i agg_list)? load: "load"i DUMPPATH ("as"i ENTITY_TYPE)? @@ -63,7 +63,7 @@ expression: transform where_clause? attr_clause? sort_clause? limit_clause? offs where_clause: "where"i condition attr_clause: "attr"i STIXPATHS -sort_clause: "sort"i "by"i STIXPATH (ASC|DESC)? +sort_clause: "sort"i BY STIXPATH (ASC|DESC)? 
limit_clause: "limit"i INT offset_clause: "offset"i INT @@ -125,6 +125,7 @@ DUMPPATH: PATH ASC: "asc"i DESC: "desc"i REVERSED: "by"i +BY: "by"i COMMENT: /#.*/ URI: PATH diff --git a/tests/test_completion.py b/tests/test_completion.py index 2d51ceb7..0d1268be 100644 --- a/tests/test_completion.py +++ b/tests/test_completion.py @@ -51,9 +51,10 @@ def a_session(): #("procs = FIND process ", {"created", "loaded", "linked"}), ("procs = FIND process ", all_relations), ("procs = FIND process l", {"oaded", "inked"}), - ("procs = FIND process c", {"reated", "ontained", "onns"}), # FIXME: shouldn't suggest var here + ("procs = FIND process c", {"reated", "ontained"}), ("procs = FIND process created ", {"conns", "_", "BY"}), ("procs = FIND process created BY ", {"conns", "_"}), + ("procs2 = SORT procs ", {"BY"}), ("grps = GR", {"OUP"}), ("grps = GROUP ", {"conns", "_"}), ("grps = GROUP conns ", {"BY"}), From 07dc9ced79cce018f891d6beac606ea84e4b535a Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 19 Apr 2022 17:22:44 -0400 Subject: [PATCH 10/35] add logo to readthedocs --- docs/_static/css/logo.css | 8 ++++++++ docs/conf.py | 9 +++++++++ logo/logo_w_text_white.svg | 20 ++++++++++++++++++++ logo/logo_white.svg | 18 ++++++++++++++++++ 4 files changed, 55 insertions(+) create mode 100644 docs/_static/css/logo.css create mode 100644 logo/logo_w_text_white.svg create mode 100644 logo/logo_white.svg diff --git a/docs/_static/css/logo.css b/docs/_static/css/logo.css new file mode 100644 index 00000000..9db57eb2 --- /dev/null +++ b/docs/_static/css/logo.css @@ -0,0 +1,8 @@ +/* +`width:auto` was rendering 0px wide for .svg files +https://stackoverflow.com/questions/59215996/how-to-add-a-logo-to-my-readthedocs-logo-rendering-at-0px-wide +*/ +.wy-side-nav-search .wy-dropdown > a img.logo, .wy-side-nav-search > a img.logo { + width: 241px; + margin-top: 15px; +} diff --git a/docs/conf.py b/docs/conf.py index 14ce02f1..dde78eb3 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -35,3 
+35,12 @@ def get_version(): html_title = project html_theme = "sphinx_rtd_theme" highlight_language = "none" +html_logo = "../logo/logo_w_text_white.svg" +html_theme_options = { + 'logo_only': True, + 'display_version': False, +} +html_static_path = ['_static'] +html_css_files = [ + 'css/logo.css', +] diff --git a/logo/logo_w_text_white.svg b/logo/logo_w_text_white.svg new file mode 100644 index 00000000..79894964 --- /dev/null +++ b/logo/logo_w_text_white.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + Kestrel + diff --git a/logo/logo_white.svg b/logo/logo_white.svg new file mode 100644 index 00000000..4c543f50 --- /dev/null +++ b/logo/logo_white.svg @@ -0,0 +1,18 @@ + + + + + + + + + + + + + From 9650425e33b2badea9cfbad9b79b8b4fa370ebb6 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 19 Apr 2022 17:43:49 -0400 Subject: [PATCH 11/35] hotfix: broken index in parser after using BY --- src/kestrel/syntax/parser.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/kestrel/syntax/parser.py b/src/kestrel/syntax/parser.py index f776fdac..aca44896 100644 --- a/src/kestrel/syntax/parser.py +++ b/src/kestrel/syntax/parser.py @@ -119,21 +119,21 @@ def join(self, args): "command": "join", "input": _first(args), "input_2": _second(args), - "path": _third(args), - "path_2": _fourth(args), + "path": _fourth(args), + "path_2": _fifth(args), } else: return {"command": "join", "input": _first(args), "input_2": _second(args)} def group(self, args): # args[1] was already transformed by path_list/valuelist - cols = _normalize_paths(args[1]) + cols = _normalize_paths(args[2]) result = { "command": "group", "paths": cols, "input": _extract_var(args, self.default_variable), } - aggregations = args[2] if len(args) > 2 else None + aggregations = args[3] if len(args) > 3 else None if aggregations: result["aggregations"] = aggregations return result @@ -306,6 +306,8 @@ def _third(args): def _fourth(args): return args[3].value +def _fifth(args): + return 
args[3].value def _last(args): return args[-1].value From 0005abb639d727b3cb73912b61ceaffef259c086 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 19 Apr 2022 17:52:12 -0400 Subject: [PATCH 12/35] style check --- src/kestrel/syntax/parser.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/kestrel/syntax/parser.py b/src/kestrel/syntax/parser.py index aca44896..85b569d4 100644 --- a/src/kestrel/syntax/parser.py +++ b/src/kestrel/syntax/parser.py @@ -306,9 +306,11 @@ def _third(args): def _fourth(args): return args[3].value + def _fifth(args): return args[3].value + def _last(args): return args[-1].value From cfcae3bb9baa96957c44def9955da025065bc7a3 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Wed, 20 Apr 2022 07:27:43 -0400 Subject: [PATCH 13/35] Make auto-completion case-insensitive --- src/kestrel/session.py | 7 +++++++ tests/test_completion.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/src/kestrel/session.py b/src/kestrel/session.py index 5a05962b..fe64c3aa 100644 --- a/src/kestrel/session.py +++ b/src/kestrel/session.py @@ -66,6 +66,7 @@ from kestrel.syntax.parser import parse from kestrel.syntax.utils import ( get_entity_types, + get_keywords, all_relations, LITERALS, AGG_FUNCS, @@ -415,8 +416,11 @@ def do_complete(self, code, cursor_pos): except KestrelSyntaxError as e: _logger.debug("exception: %s", e) varnames = self.get_variable_names() + keywords = set(get_keywords()) + _logger.debug("keywords: %s", keywords) tmp = [] for token in e.expected: + _logger.debug("token: %s", token) if token == "VARIABLE": tmp.extend(varnames) elif token == "DATASRC": @@ -460,6 +464,9 @@ def do_complete(self, code, cursor_pos): continue elif token == "EQUAL": tmp.append("=") + elif token in keywords and last_word.islower(): + # keywords has both upper and lower case + tmp.append(token.lower()) else: tmp.append(token) allnames = sorted(tmp) diff --git a/tests/test_completion.py b/tests/test_completion.py index 2d51ceb7..14d75d11 100644 --- 
a/tests/test_completion.py +++ b/tests/test_completion.py @@ -58,6 +58,8 @@ def a_session(): ("grps = GROUP ", {"conns", "_"}), ("grps = GROUP conns ", {"BY"}), ("grps = GROUP conns by ", []), # TODO: we don't suggest attrs yet + ("urls = g", ["et", "roup"]), + ("urls = ge", ["t"]), ("urls = get ", KNOWN_ETYPES), ("urls = get url ", ["FROM", "WHERE"]), ("urls = get url from ", ["_", "conns", "file://", "http://", "https://", "stixshifter://"]), From 5b306094126cb56245b8f5c9981f84f32bc45cfa Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 11:57:49 -0400 Subject: [PATCH 14/35] add testing coverage report in GitHub Action --- .github/workflows/unit-testing.yml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 5dcd27b7..ad453df0 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -29,12 +29,18 @@ jobs: uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - - name: Install Kestrel package + - name: Install Kestrel run: | python -m pip install --upgrade pip python -m pip install --upgrade setuptools - python -m pip install pytest python -m pip install . 
- python -m pip install stix-shifter-modules-stix_bundle - - name: Unit testing - run: pytest -vv + - name: Unit testing with coverage report + run: | + python -m pip install pytest + python -m pip install pytest-cov + python -m pytest -vv --cov=./ --cov-report=xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v2 + with: + fail_ci_if_error: true + verbose: true From b8950c056625b4607d59fe6afc50dc970adf63c6 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 14:29:17 -0400 Subject: [PATCH 15/35] update codecov action version --- .github/workflows/unit-testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index ad453df0..2c72e887 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,7 +40,7 @@ jobs: python -m pip install pytest-cov python -m pytest -vv --cov=./ --cov-report=xml - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: fail_ci_if_error: true verbose: true From 62387f290ce660ca0651aca3700d35e4820d9201 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Thu, 21 Apr 2022 15:59:35 -0400 Subject: [PATCH 16/35] Fix JOIN; add unit test w/CSV data --- src/kestrel/syntax/parser.py | 4 ++-- tests/test_command_join.py | 21 +++++++++++++++++++++ tests/test_input_data_ips.csv | 3 +++ 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 tests/test_command_join.py create mode 100644 tests/test_input_data_ips.csv diff --git a/src/kestrel/syntax/parser.py b/src/kestrel/syntax/parser.py index 85b569d4..ed426437 100644 --- a/src/kestrel/syntax/parser.py +++ b/src/kestrel/syntax/parser.py @@ -114,7 +114,7 @@ def find(self, args): return packet def join(self, args): - if len(args) == 4: + if len(args) == 5: return { "command": "join", "input": _first(args), @@ -308,7 +308,7 @@ def _fourth(args): def _fifth(args): - return args[3].value + 
return args[4].value def _last(args): diff --git a/tests/test_command_join.py b/tests/test_command_join.py new file mode 100644 index 00000000..be10d624 --- /dev/null +++ b/tests/test_command_join.py @@ -0,0 +1,21 @@ +import os + +from kestrel.session import Session + + +def test_join_csv_data(): + data_file_path = os.path.join( + os.path.dirname(__file__), "test_input_data_ips.csv" + ) + with Session() as s: + s.execute(f"assets = LOAD {data_file_path} AS ipv4-addr") + s.execute(""" +ips = NEW [{"type": "ipv4-addr", "value": "192.168.1.2"}, + {"type": "ipv4-addr", "value": "192.168.1.3"}] +""") + s.execute("risk_ips = JOIN ips, assets by value, value") + v = s.get_variable("risk_ips") + assert len(v) == 1 + assert v[0]["type"] == "ipv4-addr" + assert v[0]["value"] == "192.168.1.2" + assert v[0]["risk"] == 2 diff --git a/tests/test_input_data_ips.csv b/tests/test_input_data_ips.csv new file mode 100644 index 00000000..cf9974f4 --- /dev/null +++ b/tests/test_input_data_ips.csv @@ -0,0 +1,3 @@ +"value","risk" +"192.168.1.1",1 +"192.168.1.2",2 \ No newline at end of file From 17ad885fc854c1de1c5dd9498f3262a2a1c17e2f Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 16:08:07 -0400 Subject: [PATCH 17/35] use tmp dir for testing SAVE command --- tests/test_command_save.py | 32 +++++++++++++++++++------------- tests/test_session.py | 3 ++- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/tests/test_command_save.py b/tests/test_command_save.py index 3fbb0648..69b7923a 100644 --- a/tests/test_command_save.py +++ b/tests/test_command_save.py @@ -21,11 +21,13 @@ def test_save_parquet_gz(tmp_path): data_file_path = os.path.join( os.path.dirname(__file__), "test_input_data_procs.parquet.gz" ) + stmt_save = f"newvar = LOAD {data_file_path} SAVE newvar TO {save_path}" + stmt_load = f"newload = LOAD {save_path}" + with Session() as s: - stmt_save = f"newvar = LOAD {data_file_path} SAVE newvar TO {save_path}" s.execute(stmt_save) - assert 
save_path.exists() - stmt_load = f"newload = LOAD {save_path}" + assert save_path.exists() + with Session() as s: s.execute(stmt_load) v = s.get_variable("newload") @@ -34,21 +36,25 @@ def test_save_parquet_gz(tmp_path): assert v[0]["name"] == "reg.exe" -def test_save_network_traffic_v4(fake_bundle_file): +def test_save_network_traffic_v4(tmp_path, fake_bundle_file): + save_path = tmp_path / "conns.csv" with Session(debug_mode=True) as session: session.execute( - f"""conns = get network-traffic - from file://{fake_bundle_file} - where [network-traffic:dst_port > 0]""", + f"""conns = GET network-traffic + FROM file://{fake_bundle_file} + WHERE [network-traffic:dst_port > 0]""", ) - session.execute("save conns to conns.csv") + session.execute(f"SAVE conns TO {save_path}") + assert save_path.exists() -def test_save_network_traffic_v4_v6(proc_bundle_file): +def test_save_network_traffic_v4_v6(tmp_path, proc_bundle_file): + save_path = tmp_path / "conns.csv" with Session(debug_mode=True) as session: session.execute( - f"""conns = get network-traffic - from file://{proc_bundle_file} - where [network-traffic:dst_port > 0]""", + f"""conns = GET network-traffic + FROM file://{proc_bundle_file} + WHERE [network-traffic:dst_port > 0]""", ) - session.execute("save conns to conns.csv") + session.execute(f"SAVE conns TO {save_path}") + assert save_path.exists() diff --git a/tests/test_session.py b/tests/test_session.py index 2a34c9ea..c8fdfedd 100644 --- a/tests/test_session.py +++ b/tests/test_session.py @@ -308,7 +308,8 @@ def test_disp_after_group(fake_bundle_file): conns = get network-traffic from file://{fake_bundle_file} where [network-traffic:dst_port < 10000] grouped = group conns by src_ref.value, dst_ref.value with count(src_ref.value) as count -""") +""" + ) out = session.execute("DISP grouped ATTR src_ref.value, dst_ref.value, count") df = out[0].dataframe assert list(df.columns) == ["src_ref.value", "dst_ref.value", "count"] From 
5a655c9c04fd52ec208ec9b700c4aab993b846f7 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Thu, 21 Apr 2022 16:43:35 -0400 Subject: [PATCH 18/35] Fix for auto-dereferencing with mixed IPv4/IPv6 --- src/kestrel/codegen/commands.py | 44 +++++++++++++++------------------ src/kestrel/syntax/parser.py | 14 +++++------ tests/test_command_disp.py | 28 +++++++++++++++++++++ 3 files changed, 55 insertions(+), 31 deletions(-) diff --git a/src/kestrel/codegen/commands.py b/src/kestrel/codegen/commands.py index 3a46df12..acb0a25d 100644 --- a/src/kestrel/codegen/commands.py +++ b/src/kestrel/codegen/commands.py @@ -23,7 +23,8 @@ import itertools from collections import OrderedDict -from firepit.query import Column, Limit, Offset, Order, Projection, Query +from firepit.deref import auto_deref +from firepit.query import Limit, Offset, Order, Projection, Query from firepit.stix20 import summarize_pattern from kestrel.utils import remove_empty_dicts, dedup_ordered_dicts @@ -613,7 +614,7 @@ def _filter_prefetched_process( id_pattern = build_pattern_from_ids(return_type, entity_ids) if id_pattern: session.store.extract(prefetch_filtered_var_name, return_type, None, id_pattern) - _logger.debug(f"filter successful.") + _logger.debug("filter successful.") return prefetch_filtered_var_name else: _logger.info("no prefetched process found after filtering.") @@ -621,35 +622,30 @@ def _filter_prefetched_process( def _set_projection(store, entity_table, query, paths): - proj = [] - cols = store.columns(entity_table) - joined = set() - for path in paths: - if path == "*": - return - if "_ref" in path and path not in cols: # This seems like a hack - joins, table, column = store.path_joins(entity_table, None, path) - if table not in joined: - query.extend(joins) - joined.add(table) - proj.append(Column(column, table, path)) - elif path in cols: - # Prevent any ambiguity - proj.append(Column(path, entity_table)) - else: - # Not sure where it came from - proj.append(path) - 
query.append(Projection(proj)) + joins, proj = auto_deref(store, entity_table, paths=paths) + query.joins.extend(joins) + if query.proj: + # Need to merge projections? More-specific overrides less-specific ("*") + new_cols = [] + for p in query.proj.cols: + if not (hasattr(p, "table") and p.table == entity_table and p.name == "*"): + new_cols.append(p) + for p in proj.cols: + if not (hasattr(p, "table") and p.table == entity_table and p.name == "*"): + new_cols.append(p) + query.proj = Projection(new_cols) + else: + query.proj = proj def _build_query(store, entity_table, qry, stmt): where = stmt.get("where") if where: + where.set_table(entity_table) qry.append(where) attrs = stmt.get("attrs", "*") - if attrs != "*": - cols = attrs.split(",") - _set_projection(store, entity_table, qry, cols) + cols = attrs.split(",") + _set_projection(store, entity_table, qry, cols) sort_by = stmt.get("path") if sort_by: direction = "ASC" if stmt["ascending"] else "DESC" diff --git a/src/kestrel/syntax/parser.py b/src/kestrel/syntax/parser.py index ed426437..046b24cc 100644 --- a/src/kestrel/syntax/parser.py +++ b/src/kestrel/syntax/parser.py @@ -2,7 +2,7 @@ from pkgutil import get_data from firepit.query import Filter, Predicate -from lark import Lark, Transformer, Tree +from lark import Lark, Token, Transformer, Tree def parse(stmts, default_variable="_", default_sort_order="desc"): @@ -253,19 +253,19 @@ def agg(self, args): return {"func": func, "attr": args[1].value, "alias": alias} def disj(self, args): - lhs = str(args[0]) - rhs = str(args[2]) + lhs = str(args[0]) if isinstance(args, Token) else args[0] + rhs = str(args[1]) if isinstance(args, Token) else args[1] return Predicate(lhs, "OR", rhs) def conj(self, args): - lhs = str(args[0]) - rhs = str(args[2]) + lhs = str(args[0]) if isinstance(args, Token) else args[0] + rhs = str(args[1]) if isinstance(args, Token) else args[1] return Predicate(lhs, "AND", rhs) def comp(self, args): - lhs = str(args[0]) + lhs = str(args[0]) if 
isinstance(args, Token) else args[0] op = str(args[1]) - rhs = str(args[2]) + rhs = str(args[2]) if isinstance(args, Token) else args[2] return Predicate(lhs, op, rhs) def null_comp(self, args): diff --git a/tests/test_command_disp.py b/tests/test_command_disp.py index 69d6f307..b881c217 100644 --- a/tests/test_command_disp.py +++ b/tests/test_command_disp.py @@ -1,9 +1,17 @@ +import os +import pandas as pd import pytest from kestrel.exceptions import VariableNotExist from kestrel.session import Session +@pytest.fixture +def proc_bundle_file(): + cwd = os.path.dirname(os.path.abspath(__file__)) + return os.path.join(cwd, "doctored-1k.json") + + def test_disp(): with Session() as s: stmt = """ @@ -50,3 +58,23 @@ def test_disp_grouped_conns(): out = s.execute("DISP grpvar") data = out[0].to_dict()["data"] assert len(data) == 2 + + +def test_disp_mixed_v4_v6(proc_bundle_file): + with Session() as s: + stmt = f""" +conns = GET network-traffic + FROM file://{proc_bundle_file} + WHERE [network-traffic:dst_port > 0] +""" + s.execute(stmt) + + out = s.execute("DISP conns ATTR src_ref.value, src_port") + data = out[0].to_dict()["data"] + df = pd.DataFrame.from_records(data) + assert df.columns.tolist() == ["src_ref.value", "src_port"] + + out = s.execute("DISP TIMESTAMPED(conns) ATTR src_ref.value, src_port") + data = out[0].to_dict()["data"] + df = pd.DataFrame.from_records(data) + assert df.columns.tolist() == ["first_observed", "src_ref.value", "src_port"] From 67dfac1ba6cb6f9ad764b7ac4f4f052ee4a204d5 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Thu, 21 Apr 2022 16:53:33 -0400 Subject: [PATCH 19/35] Add some unit tests I forget to include --- tests/test_command_assign.py | 57 +++++++++++++++++++++ tests/test_expressions.py | 96 ++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 tests/test_command_assign.py create mode 100644 tests/test_expressions.py diff --git a/tests/test_command_assign.py b/tests/test_command_assign.py new 
file mode 100644 index 00000000..db966ea0 --- /dev/null +++ b/tests/test_command_assign.py @@ -0,0 +1,57 @@ +import os +import pytest + +from kestrel.session import Session + + +NEW_PROCS = """ +p = NEW [ + {"type": "process", "name": "cmd.exe", "command_line": "cmd -c dir"}, + {"type": "process", "name": "explorer.exe", "pid": "99"} + ] +""" + +@pytest.fixture +def proc_bundle_file(): + cwd = os.path.dirname(os.path.abspath(__file__)) + return os.path.join(cwd, "doctored-1k.json") + + +@pytest.mark.parametrize( + "stmt, expected", + [ + ("x = p", 2), + ("x = p WHERE pid = 99", 1), + ("x = p WHERE command_line IS NULL", 1), + ("x = p WHERE command_line IS NOT NULL", 1), + ("x = p WHERE command_line LIKE '%cmd%'", 1), + ], +) +def test_assign_after_new(stmt, expected): + with Session() as s: + s.execute(NEW_PROCS) + s.execute(stmt) + x = s.get_variable("x") + assert len(x) == expected, f"ASSIGN error: f{stmt}" + + +@pytest.mark.parametrize( + "stmt, expected", + [ + ("x = p", 2000), + ("x = p WHERE pid = 1380", 106 * 2), #FIXME: doubled due to prefetch + ("x = p WHERE command_line IS NULL", 948 * 2), + ("x = p WHERE command_line IS NOT NULL", 104), + ("x = p WHERE command_line LIKE '%/node%'", 1 * 2), + ("x = p WHERE pid = 5960 OR name = 'taskeng.exe'", 4), + ("x = p WHERE (pid = 5960 OR name = 'taskeng.exe') AND command_line IS NULL", 0), + ], +) +def test_assign_after_get(proc_bundle_file, stmt, expected): + with Session() as s: + s.execute(("p = GET process" + f" FROM file://{proc_bundle_file}" + " WHERE [process:pid > 0]")) + s.execute(stmt) + x = s.get_variable("x") + assert len(x) == expected, f"ASSIGN error: {stmt}" diff --git a/tests/test_expressions.py b/tests/test_expressions.py new file mode 100644 index 00000000..3330f005 --- /dev/null +++ b/tests/test_expressions.py @@ -0,0 +1,96 @@ +import json + +import pytest + +from kestrel.session import Session + + +NEW_PROCS = """ +procs = NEW [ {"type": "process", "name": "cmd.exe", "pid": 123, "x_foo": "bar"} + 
, {"type": "process", "name": "explorer.exe", "pid": 99}] +""" + + +@pytest.mark.parametrize( + "attrs, unexpected", [ + ("pid", {"name"}), + ("name", {"pid"}), + ("pid,name", set()), + ] +) +def test_expr_attr(attrs, unexpected): + with Session() as s: + s.execute(NEW_PROCS) + out = s.execute(f"DISP procs ATTR {attrs}") + data = out[0].to_dict()["data"] + print(json.dumps(data, indent=4)) + actual = set(data[0].keys()) + expected = set(attrs.split(",")) + assert expected == actual + assert len(unexpected & actual) == 0 + + +@pytest.mark.parametrize( + "prop, direction, expected", [ + ("pid", "asc", [99, 123]), + ("pid", "desc", [123, 99]), + ("name", "asc", ["cmd.exe", "explorer.exe"]), + ("name", "desc", ["explorer.exe", "cmd.exe"]), + ] +) +def test_expr_sort(prop, direction, expected): + with Session() as s: + s.execute(NEW_PROCS) + out = s.execute(f"DISP procs sort by {prop} {direction}") + data = out[0].to_dict()["data"] + print(json.dumps(data, indent=4)) + actual = [p[prop] for p in data] + assert actual == expected + + +@pytest.mark.parametrize( + "limit, offset, expected", [ + (5, 0, [99, 123]), + (1, 0, [99]), + (2, 1, [123]), + (1, 1, [123]), + ] +) +def test_expr_limit_offset(limit, offset, expected): + with Session() as s: + s.execute(NEW_PROCS) + out = s.execute(f"DISP procs SORT BY pid ASC LIMIT {limit} OFFSET {offset}") + data = out[0].to_dict()["data"] + print(json.dumps(data, indent=4)) + actual = [p["pid"] for p in data] + assert actual == expected + + +@pytest.mark.parametrize( + "col, op, val, expected", [ + ("pid", "=", 99, [99]), + ("pid", "<", 100, [99]), + ("pid", ">=", 100, [123]), + ("x_foo", "IS NULL", "", [99]), + ("x_foo", "IS NOT NULL", "", [123]), + ("x_foo", "=", "'bar'", [123]), + ] +) +def test_expr_where(col, op, val, expected): + with Session() as s: + s.execute(NEW_PROCS) + out = s.execute(f"DISP procs WHERE {col} {op} {val}") + data = out[0].to_dict()["data"] + print(json.dumps(data, indent=4)) + actual = [p["pid"] for p in 
data] + assert actual == expected + + +def test_expr_assign_where(): + with Session() as s: + s.execute(NEW_PROCS) + out = s.execute("x = procs WHERE pid > 100") + data = out[0].to_dict()["data"] + print(json.dumps(data, indent=4)) + vars_updated = data["variables updated"] + assert vars_updated[0]["#(ENTITIES)"] == 1 From e1bdf7809b9e2fa9fb98ae1993fb68a59ee33c55 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Thu, 21 Apr 2022 17:08:19 -0400 Subject: [PATCH 20/35] Require firepit>=2.0.1 --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index ec960fed..a9d50c78 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = docker>=5.0.0 stix-shifter>=3.6.0 stix-shifter-utils>=3.6.0 - firepit>=2.0.0 + firepit>=2.0.1 tests_require = pytest From 2b6dc569ace237a12fceb64bf29c6655880c2392 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 17:45:04 -0400 Subject: [PATCH 21/35] hotfix on undefined-variables --- src/kestrel/absinterface/manager.py | 4 ++-- src/kestrel/exceptions.py | 2 +- src/kestrel/semantics.py | 7 ++++++- src/kestrel/syntax/parser.py | 3 --- src/kestrel_analytics_python/interface.py | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/kestrel/absinterface/manager.py b/src/kestrel/absinterface/manager.py index f07c979a..6dacd5f4 100644 --- a/src/kestrel/absinterface/manager.py +++ b/src/kestrel/absinterface/manager.py @@ -54,9 +54,9 @@ def _parse_and_complete_uri(self, uri): def _get_interface_with_config(self, scheme): scheme = scheme.lower() if scheme not in self.scheme_to_interface: - raise nonexist_interface_exception(scheme) + raise self.nonexist_interface_exception(scheme) if scheme not in self.scheme_to_interface_name: - raise nonexist_interface_exception(scheme) + raise self.nonexist_interface_exception(scheme) interface_name = self.scheme_to_interface_name[scheme] interface_config = self.config[self.config_root_key][interface_name] interface = 
self.scheme_to_interface[scheme] diff --git a/src/kestrel/exceptions.py b/src/kestrel/exceptions.py index d0924082..1d1d2ec4 100644 --- a/src/kestrel/exceptions.py +++ b/src/kestrel/exceptions.py @@ -258,7 +258,7 @@ def __init__(self, type_received, types_expected): class InvalidAnalyticsOutput(KestrelException): - def __init__(self, analytcs_name, return_type): + def __init__(self, analytics_name, return_type): super().__init__( f"unsupported return type {return_type} from analytics: {analytics_name}" ) diff --git a/src/kestrel/semantics.py b/src/kestrel/semantics.py index e456640e..4af69c3e 100644 --- a/src/kestrel/semantics.py +++ b/src/kestrel/semantics.py @@ -1,7 +1,12 @@ import logging import re -from kestrel.exceptions import InvalidAttribute, VariableNotExist, UnsupportedRelation +from kestrel.exceptions import ( + InvalidAttribute, + VariableNotExist, + UnsupportedRelation, + KestrelInternalError, +) from kestrel.codegen.relations import stix_2_0_ref_mapping, generic_relations _logger = logging.getLogger(__name__) diff --git a/src/kestrel/syntax/parser.py b/src/kestrel/syntax/parser.py index ed426437..c51bcfa1 100644 --- a/src/kestrel/syntax/parser.py +++ b/src/kestrel/syntax/parser.py @@ -228,9 +228,6 @@ def endtime(self, args): def variables(self, args): return {"variables": _extract_vars(args, self.default_variable)} - def variables(self, args): - return {"variables": _extract_vars(args, self.default_variable)} - def localargs(self, args): return {args[0].value: args[1]} diff --git a/src/kestrel_analytics_python/interface.py b/src/kestrel_analytics_python/interface.py index fd15c78f..c866b9fc 100644 --- a/src/kestrel_analytics_python/interface.py +++ b/src/kestrel_analytics_python/interface.py @@ -230,7 +230,7 @@ def _execute(self, arg_variables): input_dataframes = [DataFrame(v.get_entities()) for v in arg_variables] if len(input_dataframes) != self._get_var_count(): raise InvalidAnalyticsArgumentCount( - profile, len(input_dataframes), 
self._get_var_count() + self.name, len(input_dataframes), self._get_var_count() ) else: try: From 193e4eeb1a9d90515a3fd9941cee3d051c515127 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 22:21:09 -0400 Subject: [PATCH 22/35] focus code-cov on real source code --- .github/workflows/unit-testing.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 2c72e887..cfb6a5e6 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -38,7 +38,9 @@ jobs: run: | python -m pip install pytest python -m pip install pytest-cov - python -m pytest -vv --cov=./ --cov-report=xml + # pytest-cov does not support automatic sub-package recognition in src/ + # pass the sub-package names in through the --cov argument + python -m pytest -vv --cov-report=xml $(ls --ignore='*.egg-info' src | xargs | sed 's/^\| / --cov=/g') - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: From e9d36b9456cbbdc650bb29c0dbeb95b8fa78fcbe Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 22:30:48 -0400 Subject: [PATCH 23/35] fix github workflow for macOS --- .github/workflows/unit-testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index cfb6a5e6..0944a6da 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,7 +40,7 @@ jobs: python -m pip install pytest-cov # pytest-cov does not support automatic sub-package recognition in src/ # pass the sub-package names in through the --cov argument - python -m pytest -vv --cov-report=xml $(ls --ignore='*.egg-info' src | xargs | sed 's/^\| / --cov=/g') + python -m pytest -vv --cov-report=xml $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: From 
94f2b84dcb51726e47a9407a0e2d8ebb2a5700e0 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 22:36:51 -0400 Subject: [PATCH 24/35] avoid pytest-cov misfire for testing code --- .../{kestrel_python_analytics.py => python_analytics_mockup.py} | 0 tests/test_python_analytics.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/{kestrel_python_analytics.py => python_analytics_mockup.py} (100%) diff --git a/tests/kestrel_python_analytics.py b/tests/python_analytics_mockup.py similarity index 100% rename from tests/kestrel_python_analytics.py rename to tests/python_analytics_mockup.py diff --git a/tests/test_python_analytics.py b/tests/test_python_analytics.py index df8dcda5..df9caf14 100644 --- a/tests/test_python_analytics.py +++ b/tests/test_python_analytics.py @@ -22,7 +22,7 @@ def fake_bundle_4(): def env_setup(tmp_path): analytics_module_path = str( - pathlib.Path(__file__).resolve().parent / "kestrel_python_analytics.py" + pathlib.Path(__file__).resolve().parent / "python_analytics_mockup.py" ) profiles = f"""profiles: From 571e98f8ca9f493b8c2857d1bf15b715b2063cf4 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 22:43:47 -0400 Subject: [PATCH 25/35] try to fix pytest-cov on macOS --- .github/workflows/unit-testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 0944a6da..2f7b222e 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,7 +40,7 @@ jobs: python -m pip install pytest-cov # pytest-cov does not support automatic sub-package recognition in src/ # pass the sub-package names in through the --cov argument - python -m pytest -vv --cov-report=xml $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') + python -m pytest -vv --cov-report=xml $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') tests/ - name: Upload coverage to Codecov uses: 
codecov/codecov-action@v3 with: From 045215b5e4b7fba951fc1952b188d51a9e1cf4b1 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 22:51:53 -0400 Subject: [PATCH 26/35] try to fix pytest-cov on mac OS --- .github/workflows/unit-testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 2f7b222e..b249fd1b 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,7 +40,7 @@ jobs: python -m pip install pytest-cov # pytest-cov does not support automatic sub-package recognition in src/ # pass the sub-package names in through the --cov argument - python -m pytest -vv --cov-report=xml $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') tests/ + pytest -vv $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: From 763aab2cbea4e27a914b248138991d4a47ef543c Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 22:54:12 -0400 Subject: [PATCH 27/35] try fix pytest-cov on macOS --- .github/workflows/unit-testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index b249fd1b..e1b8b0c9 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,7 +40,7 @@ jobs: python -m pip install pytest-cov # pytest-cov does not support automatic sub-package recognition in src/ # pass the sub-package names in through the --cov argument - pytest -vv $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') --cov-report=xml + python -m pytest -vv $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') --cov-report=xml tests/test_*.py - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: From 12383b9b44cea62639385dd46b1009b7971fc8ce Mon Sep 17 00:00:00 2001 From: 
Xiaokui Shu Date: Thu, 21 Apr 2022 22:57:39 -0400 Subject: [PATCH 28/35] debug pytest-cov on macOS --- .github/workflows/unit-testing.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index e1b8b0c9..cd611659 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,7 +40,9 @@ jobs: python -m pip install pytest-cov # pytest-cov does not support automatic sub-package recognition in src/ # pass the sub-package names in through the --cov argument - python -m pytest -vv $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') --cov-report=xml tests/test_*.py + echo $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') + echo $(ls tests) + python -m pytest -vv $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: From eb85671bdc86493970b5b9191a5c745763d7173f Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 23:09:47 -0400 Subject: [PATCH 29/35] fix sed bug on macOS in pytest --- .github/workflows/unit-testing.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index cd611659..b117b9f8 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -40,9 +40,8 @@ jobs: python -m pip install pytest-cov # pytest-cov does not support automatic sub-package recognition in src/ # pass the sub-package names in through the --cov argument - echo $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') - echo $(ls tests) - python -m pytest -vv $(ls src | grep -v '.egg-info' | xargs | sed 's/^\| / --cov=/g') --cov-report=xml + echo $(ls src | grep -v '.egg-info' | xargs | sed -r 's/^| / --cov=/g') + python -m pytest -vv --cov-report=xml $(ls src | grep -v '.egg-info' | xargs | sed -r 's/^| 
/ --cov=/g') - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: From 2590774ba11a3d12b34ef536ee75c65841715ba8 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 23:19:37 -0400 Subject: [PATCH 30/35] finally fix pytest-cov on macOS --- .github/workflows/code-style.yml | 2 +- .github/workflows/stixshifter-module-verification.yml | 2 +- .github/workflows/unit-testing.yml | 4 ++-- .github/workflows/unused-import.yml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index 31886830..16e14a3f 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -32,4 +32,4 @@ jobs: python -m pip install black python -m pip install . - name: Code style check (please black your code) - run: black --check src/ + run: python -m black --check src/ diff --git a/.github/workflows/stixshifter-module-verification.yml b/.github/workflows/stixshifter-module-verification.yml index 39b68c44..df40f291 100644 --- a/.github/workflows/stixshifter-module-verification.yml +++ b/.github/workflows/stixshifter-module-verification.yml @@ -23,4 +23,4 @@ jobs: python -m pip install pytest python -m pip install . 
- name: Sample STIX-shifter Connector Package Verification on PyPI - run: pytest -vv tests/test_stixshifter.py -k test_verify_package_origin + run: python -m pytest -vv tests/test_stixshifter.py -k test_verify_package_origin diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index b117b9f8..0526983d 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -39,8 +39,8 @@ jobs: python -m pip install pytest python -m pip install pytest-cov # pytest-cov does not support automatic sub-package recognition in src/ - # pass the sub-package names in through the --cov argument - echo $(ls src | grep -v '.egg-info' | xargs | sed -r 's/^| / --cov=/g') + # pass in all sub-package names through multiple --cov arguments + # use ls, xargs, and sed carefully regarding diff between mac/BSD and linux python -m pytest -vv --cov-report=xml $(ls src | grep -v '.egg-info' | xargs | sed -r 's/^| / --cov=/g') - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/unused-import.yml b/.github/workflows/unused-import.yml index 0e10bff6..dd723fe5 100644 --- a/.github/workflows/unused-import.yml +++ b/.github/workflows/unused-import.yml @@ -32,4 +32,4 @@ jobs: python -m pip install unimport python -m pip install . 
- name: Check - run: unimport --check --exclude __init__.py src/ + run: python -m unimport --check --exclude __init__.py src/ From 6d5ff7ff299e53054c6c0b032d430861b76308f1 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 23:27:35 -0400 Subject: [PATCH 31/35] ignore code in tests/ for codecov --- codecov.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..e6b99e6b --- /dev/null +++ b/codecov.yml @@ -0,0 +1,2 @@ +ignore: + - "tests" From b0b4746af6bde3c90ca44954bfb9ccd89925ed23 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 23:33:43 -0400 Subject: [PATCH 32/35] add codecov badge to README --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index 1cf01f24..bc57af30 100644 --- a/README.rst +++ b/README.rst @@ -10,6 +10,10 @@ :target: https://github.com/psf/black :alt: Code Style: Black +.. image:: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang/branch/develop/graph/badge.svg?token=HM4ax10IW3 + :target: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang + :alt: Code Coverage + .. 
image:: https://img.shields.io/pypi/v/kestrel-lang :target: https://pypi.python.org/pypi/kestrel-lang :alt: Latest Version From 8f69a33aa04b50c0d7353f50dd25a0f7d6a86d86 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 23:34:34 -0400 Subject: [PATCH 33/35] minor README update --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index bc57af30..44c7b9af 100644 --- a/README.rst +++ b/README.rst @@ -118,7 +118,7 @@ Kestrel Hunting Blogs Talks And Demos =============== -- 2022/04 `SC eSummit on Threat Hunting & Offense Security`_ (register to watch for free) +- 2022/04 `SC eSummit on Threat Hunting & Offense Security`_ (free to register/playback) - 2021/12 `Infosec Jupyterthon 2021`_ [`IJ'21 live hunt recording`_] - 2021/11 `BlackHat Europe 2021`_ - 2021/10 `SANS Threat Hunting Summit 2021`_: [`SANS'21 session recording`_] From d8f15b7a20f0123430bf819d590f6af53eebc94e Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 21 Apr 2022 23:37:09 -0400 Subject: [PATCH 34/35] black tests to test codecov --- tests/test_completion.py | 52 +++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/tests/test_completion.py b/tests/test_completion.py index 010b96c2..35218b97 100644 --- a/tests/test_completion.py +++ b/tests/test_completion.py @@ -10,11 +10,26 @@ KNOWN_ETYPES = { - 'artifact', 'autonomous-system', 'directory', 'domain-name', - 'email-addr', 'email-message', 'file', 'ipv4-addr', 'ipv6-addr', - 'mac-addr', 'mutex', 'network-traffic', 'process', 'software', - 'url', 'user-account', 'windows-registry-key', 'x-ibm-finding', - 'x-oca-asset', 'x-oca-event' + "artifact", + "autonomous-system", + "directory", + "domain-name", + "email-addr", + "email-message", + "file", + "ipv4-addr", + "ipv6-addr", + "mac-addr", + "mutex", + "network-traffic", + "process", + "software", + "url", + "user-account", + "windows-registry-key", + "x-ibm-finding", + "x-oca-asset", + 
"x-oca-event", } @@ -23,9 +38,11 @@ def a_session(): cwd = os.path.dirname(os.path.abspath(__file__)) bundle = os.path.join(cwd, "test_bundle.json") session = Session(debug_mode=True) - stmt = ("conns = get network-traffic" - f" from file://{bundle}" - " where [network-traffic:dst_port < 10000]") + stmt = ( + "conns = get network-traffic" + f" from file://{bundle}" + " where [network-traffic:dst_port < 10000]" + ) session.execute(stmt) return session @@ -36,10 +53,14 @@ def a_session(): ("x", []), # No suggestions ("x ", {"=", "+"}), ("c", {"onns"}), - ("conns", ['']), # Empty string means word is complete + ("conns", [""]), # Empty string means word is complete ("conns ", {"=", "+"}), ("disp ", {"conns", "_"} | TRANSFORMS), - ("procs = ", {"GET", "FIND", "JOIN", "SORT", "GROUP", "LOAD", "NEW", "conns", "_"} | TRANSFORMS), + ( + "procs = ", + {"GET", "FIND", "JOIN", "SORT", "GROUP", "LOAD", "NEW", "conns", "_"} + | TRANSFORMS, + ), ("procs = G", {"ET", "ROUP"}), ("procs = F", {"IND"}), ("procs = FI", {"ND"}), @@ -47,8 +68,8 @@ def a_session(): ("procs = FIND", []), ("procs = FIND ", KNOWN_ETYPES), ("procs = FIND p", ["rocess"]), - ("procs = FIND process", ['']), - #("procs = FIND process ", {"created", "loaded", "linked"}), + ("procs = FIND process", [""]), + # ("procs = FIND process ", {"created", "loaded", "linked"}), ("procs = FIND process ", all_relations), ("procs = FIND process l", {"oaded", "inked"}), ("procs = FIND process c", {"reated", "ontained"}), @@ -63,9 +84,12 @@ def a_session(): ("urls = ge", ["t"]), ("urls = get ", KNOWN_ETYPES), ("urls = get url ", ["FROM", "WHERE"]), - ("urls = get url from ", ["_", "conns", "file://", "http://", "https://", "stixshifter://"]), + ( + "urls = get url from ", + ["_", "conns", "file://", "http://", "https://", "stixshifter://"], + ), ("urls = get url where ", []), - ] + ], ) def test_do_complete_after_get(a_session, code, expected): result = a_session.do_complete(code, len(code)) From 
fc6e1e17d88525babffcbbc692956af2de84ba07 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 22 Apr 2022 12:25:22 -0400 Subject: [PATCH 35/35] v1.3.2 --- CHANGELOG.rst | 35 +++++++++++++++++++++++++++++++++++ setup.cfg | 2 +- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 60337f81..a2227df9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,41 @@ All notable changes to this project will be documented in this file. The format is based on `Keep a Changelog`_. +1.3.2 (2022-04-22) +================== + +Added +----- + +- runtime warning generation for invalid entity type #200 +- auto-complete relation in FIND +- auto-complete BY and variable in FIND +- add logo to readthedocs +- upgrade auto-complete keywords to be case sensitive #213 +- add testing coverage into github workflows +- add codecov badge to README +- 31 unit tests for auto-completion +- the first unit test for JOIN +- two unit tests for ASSIGN +- five unit tests for EXPRESSION +- use tmp dir for generated testing data +- auto-deref with mixed ipv4/ipv6 in network-traffic + +Fixed +----- + +- missing ``_refs`` handling for 2 cases out of 4 #205 +- incorrectly dereferencing attributes after GROUP BY +- incorrectly yielding variable when auto-completing relation in FIND +- pylint errors about undefined-variables + +Changed +------- + +- update grammar to separate commands yielding (or not) a variable +- change FUNCNAME from a terminal to an inlined rule +- differentiate the terminal "by"i between FIND and SORT/GROUP + 1.3.1 (2022-04-16) ================== diff --git a/setup.cfg b/setup.cfg index a9d50c78..1b0c3d7f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = kestrel-lang -version = 1.3.1 +version = 1.3.2 description = Kestrel Threat Hunting Language long_description = file:README.rst long_description_content_type = text/x-rst