Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dynamic: add sequence scope #2532

Draft
wants to merge 12 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

### New Features

- add dynamic sequence scope for matching nearby calls within a thread #2532 @williballenthin
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naming alternatives to "sequence", given that matching occurs in any order: span, ngram, group/cluster.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 cluster

Copy link
Collaborator Author

@williballenthin williballenthin Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"window", "slice", "range"

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From math: multiset (also called bag or mset) - https://en.wikipedia.org/wiki/Multiset

  • allows multiple instances of the same object
  • order doesn't matter

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optionally, prefix the name with "call", e.g., callbag or callcluster?


### Breaking Changes

### New Rules (0)
Expand Down
54 changes: 42 additions & 12 deletions capa/capabilities/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
logger = logging.getLogger(__name__)


# The maximum number of consecutive calls that make up a sequence (a sliding window over a thread's calls).
SEQUENCE_SIZE = 5


def find_call_capabilities(
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
) -> tuple[FeatureSet, MatchResults]:
Expand Down Expand Up @@ -51,11 +55,11 @@ def find_call_capabilities(

def find_thread_capabilities(
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle
) -> tuple[FeatureSet, MatchResults, MatchResults]:
) -> tuple[FeatureSet, MatchResults, MatchResults, MatchResults]:
mr-tz marked this conversation as resolved.
Show resolved Hide resolved
"""
find matches for the given rules within the given thread.

returns: tuple containing (features for thread, match results for thread, match results for calls)
returns: tuple containing (features for thread, match results for thread, match results for sequences, match results for calls)
"""
# all features found within this thread,
# includes features found within calls.
Expand All @@ -65,14 +69,29 @@ def find_thread_capabilities(
# might be found at different calls, that's ok.
call_matches: MatchResults = collections.defaultdict(list)

# matches found at the sequence scope.
sequence_matches: MatchResults = collections.defaultdict(list)

sequence: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE)
williballenthin marked this conversation as resolved.
Show resolved Hide resolved

for ch in extractor.get_calls(ph, th):
ifeatures, imatches = find_call_capabilities(ruleset, extractor, ph, th, ch)
for feature, vas in ifeatures.items():
cfeatures, cmatches = find_call_capabilities(ruleset, extractor, ph, th, ch)
for feature, vas in cfeatures.items():
features[feature].update(vas)

for rule_name, res in imatches.items():
for rule_name, res in cmatches.items():
call_matches[rule_name].extend(res)

sequence.append(cfeatures)
sfeatures: FeatureSet = collections.defaultdict(set)
for call in sequence:
for feature, vas in call.items():
sfeatures[feature].update(vas)
williballenthin marked this conversation as resolved.
Show resolved Hide resolved

_, smatches = ruleset.match(Scope.SEQUENCE, sfeatures, ch.address)
for rule_name, res in smatches.items():
sequence_matches[rule_name].extend(res)

for feature, va in itertools.chain(extractor.extract_thread_features(ph, th), extractor.extract_global_features()):
features[feature].add(va)

Expand All @@ -84,12 +103,12 @@ def find_thread_capabilities(
for va, _ in res:
capa.engine.index_rule_matches(features, rule, [va])

return features, matches, call_matches
return features, matches, sequence_matches, call_matches


def find_process_capabilities(
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle
) -> tuple[MatchResults, MatchResults, MatchResults, int]:
) -> tuple[MatchResults, MatchResults, MatchResults, MatchResults, int]:
"""
find matches for the given rules within the given process.

Expand All @@ -103,33 +122,41 @@ def find_process_capabilities(
# might be found at different threads, that's ok.
thread_matches: MatchResults = collections.defaultdict(list)

# matches found at the sequence scope.
# might be found at different sequences, that's ok.
sequence_matches: MatchResults = collections.defaultdict(list)

# matches found at the call scope.
# might be found at different calls, that's ok.
call_matches: MatchResults = collections.defaultdict(list)

for th in extractor.get_threads(ph):
features, tmatches, cmatches = find_thread_capabilities(ruleset, extractor, ph, th)
features, tmatches, smatches, cmatches = find_thread_capabilities(ruleset, extractor, ph, th)
for feature, vas in features.items():
process_features[feature].update(vas)

for rule_name, res in tmatches.items():
thread_matches[rule_name].extend(res)

for rule_name, res in smatches.items():
sequence_matches[rule_name].extend(res)

for rule_name, res in cmatches.items():
call_matches[rule_name].extend(res)

for feature, va in itertools.chain(extractor.extract_process_features(ph), extractor.extract_global_features()):
process_features[feature].add(va)

_, process_matches = ruleset.match(Scope.PROCESS, process_features, ph.address)
return process_matches, thread_matches, call_matches, len(process_features)
return process_matches, thread_matches, sequence_matches, call_matches, len(process_features)


def find_dynamic_capabilities(
ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None
) -> tuple[MatchResults, Any]:
all_process_matches: MatchResults = collections.defaultdict(list)
all_thread_matches: MatchResults = collections.defaultdict(list)
all_sequence_matches: MatchResults = collections.defaultdict(list)
all_call_matches: MatchResults = collections.defaultdict(list)

feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())
Expand All @@ -143,7 +170,7 @@ def find_dynamic_capabilities(
) as pbar:
task = pbar.add_task("matching", total=n_processes, unit="processes")
for p in processes:
process_matches, thread_matches, call_matches, feature_count = find_process_capabilities(
process_matches, thread_matches, sequence_matches, call_matches, feature_count = find_process_capabilities(
ruleset, extractor, p
)
feature_counts.processes += (
Expand All @@ -155,6 +182,8 @@ def find_dynamic_capabilities(
all_process_matches[rule_name].extend(res)
for rule_name, res in thread_matches.items():
all_thread_matches[rule_name].extend(res)
for rule_name, res in sequence_matches.items():
all_sequence_matches[rule_name].extend(res)
for rule_name, res in call_matches.items():
all_call_matches[rule_name].extend(res)

Expand All @@ -164,7 +193,7 @@ def find_dynamic_capabilities(
# mapping from feature (matched rule) to set of addresses at which it matched.
process_and_lower_features: FeatureSet = collections.defaultdict(set)
for rule_name, results in itertools.chain(
all_process_matches.items(), all_thread_matches.items(), all_call_matches.items()
all_process_matches.items(), all_thread_matches.items(), all_sequence_matches.items(), all_call_matches.items()
):
locations = {p[0] for p in results}
rule = ruleset[rule_name]
Expand All @@ -178,9 +207,10 @@ def find_dynamic_capabilities(
# each rule exists in exactly one scope,
# so there won't be any overlap among these following MatchResults,
# and we can merge the dictionaries naively.
all_call_matches.items(),
all_sequence_matches.items(),
all_thread_matches.items(),
all_process_matches.items(),
all_call_matches.items(),
all_file_matches.items(),
)
)
Expand Down
12 changes: 12 additions & 0 deletions capa/features/extractors/base_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,4 +497,16 @@ def filtered_get_processes(self):
return new_extractor


def ThreadFilter(extractor: DynamicFeatureExtractor, threads: set) -> DynamicFeatureExtractor:
    """
    create a copy of the given extractor that only yields selected threads.

    args:
      extractor: the dynamic feature extractor to wrap.
      threads: the thread addresses to keep; a thread is yielded only
        when its `address` is a member of this set.

    returns: a shallow copy of `extractor` whose `get_threads` is filtered.
      `get_threads` on the original extractor is not replaced.
    """
    # capture the unfiltered bound method before shadowing it on the copy,
    # so the filter delegates to the original enumeration.
    original_get_threads = extractor.get_threads

    def filtered_get_threads(self, ph: ProcessHandle):
        for thread in original_get_threads(ph):
            if thread.address in threads:
                yield thread

    new_extractor = copy(extractor)
    # bind the replacement to the copy that it is installed on,
    # so that `self` inside the method is the filtered extractor.
    new_extractor.get_threads = MethodType(filtered_get_threads, new_extractor)  # type: ignore

    return new_extractor


FeatureExtractor: TypeAlias = Union[StaticFeatureExtractor, DynamicFeatureExtractor]
4 changes: 4 additions & 0 deletions capa/render/proto/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ def scope_to_pb2(scope: capa.rules.Scope) -> capa_pb2.Scope.ValueType:
return capa_pb2.Scope.SCOPE_PROCESS
elif scope == capa.rules.Scope.THREAD:
return capa_pb2.Scope.SCOPE_THREAD
elif scope == capa.rules.Scope.SEQUENCE:
return capa_pb2.Scope.SCOPE_SEQUENCE
elif scope == capa.rules.Scope.CALL:
return capa_pb2.Scope.SCOPE_CALL
else:
Expand Down Expand Up @@ -648,6 +650,8 @@ def scope_from_pb2(scope: capa_pb2.Scope.ValueType) -> capa.rules.Scope:
return capa.rules.Scope.PROCESS
elif scope == capa_pb2.Scope.SCOPE_THREAD:
return capa.rules.Scope.THREAD
elif scope == capa_pb2.Scope.SCOPE_SEQUENCE:
return capa.rules.Scope.SEQUENCE
elif scope == capa_pb2.Scope.SCOPE_CALL:
return capa.rules.Scope.CALL
else:
Expand Down
1 change: 1 addition & 0 deletions capa/render/proto/capa.proto
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ enum Scope {
SCOPE_PROCESS = 5;
SCOPE_THREAD = 6;
SCOPE_CALL = 7;
SCOPE_SEQUENCE = 8;
}

message Scopes {
Expand Down
306 changes: 153 additions & 153 deletions capa/render/proto/capa_pb2.py

Large diffs are not rendered by default.

Loading
Loading