Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Support automatic cleanup jobs for nodes without a sub resource #1399

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 46 additions & 4 deletions cartography/graph/cleanupbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,17 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]:
"""
Generates queries to clean up stale nodes and relationships from the given CartographyNodeSchema.
Note that auto-cleanups for a node with no relationships is not currently supported.

Algorithm:
1. If node_schema has no relationships at all, return empty.

Otherwise,

1. If node_schema doesn't have a sub_resource relationship, generate queries only to clean up its other
relationships. No nodes will be cleaned up.

Otherwise,

1. First delete all stale nodes attached to the node_schema's sub resource
2. Delete all stale node to sub resource relationships
- We don't expect this to be very common (never for AWS resources, at least), but in case it is possible for an
Expand All @@ -25,11 +35,15 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]:
:param node_schema: The given CartographyNodeSchema
:return: A list of Neo4j queries to clean up nodes and relationships.
"""
if not node_schema.sub_resource_relationship and not node_schema.other_relationships:
return []

if not node_schema.sub_resource_relationship:
raise ValueError(
"Auto-creating a cleanup job for a node_schema without a sub resource relationship is not supported. "
f'Please check the class definition of "{node_schema.__class__.__name__}".',
)
queries = []
for rel in node_schema.other_relationships.rels:
query = _build_cleanup_rel_query_no_sub_resource(node_schema, rel)
queries.append(query)
return queries

result = _build_cleanup_node_and_rel_queries(node_schema, node_schema.sub_resource_relationship)
if node_schema.other_relationships:
Expand All @@ -41,6 +55,34 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]:
return result


def _build_cleanup_rel_query_no_sub_resource(
node_schema: CartographyNodeSchema,
selected_relationship: CartographyRelSchema,
) -> str:
"""
Helper function to delete stale relationships for node_schemas that have no sub resource relationship defined.
"""
if node_schema.sub_resource_relationship:
raise ValueError(
f'Expected {node_schema.label} to not exist. '
'This function is intended for node_schemas without sub_resource_relationships.'
)
# Ensure the node is attached to the sub resource and delete the node
query_template = Template(
"""
MATCH (n:$node_label)
$selected_rel_clause
WHERE r.lastupdated <> $UPDATE_TAG
WITH r LIMIT $LIMIT_SIZE
DELETE r;
""",
)
return query_template.safe_substitute(
node_label=node_schema.label,
selected_rel_clause=_build_selected_rel_clause(selected_relationship)
)


def _build_cleanup_node_and_rel_queries(
node_schema: CartographyNodeSchema,
selected_relationship: CartographyRelSchema,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from dataclasses import dataclass

from cartography.models.core.common import PropertyRef
from cartography.models.core.nodes import CartographyNodeSchema, CartographyNodeProperties
from cartography.models.core.relationships import CartographyRelProperties, CartographyRelSchema, TargetNodeMatcher, \
LinkDirection, make_target_node_matcher, OtherRelationships


# Test defining a node that has a relationship to another node but no sub resource relationship.
@dataclass(frozen=True)
class NodeAProperties(CartographyNodeProperties):
    """Property definitions for the NodeA sample node used in tests."""
    id: PropertyRef = PropertyRef('Id')
    # lastupdated is the only property here declared with set_in_kwargs=True.
    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)
    property1: PropertyRef = PropertyRef('property1')
    property2: PropertyRef = PropertyRef('property2')


# Test defining a simple node attached to another node
@dataclass(frozen=True)
class NodeAToNodeBProps(CartographyRelProperties):
    """Properties set on the NodeAToNodeB relationship; only `lastupdated` is defined."""
    lastupdated: PropertyRef = PropertyRef('lastupdated', set_in_kwargs=True)


@dataclass(frozen=True)
class NodeAToNodeB(CartographyRelSchema):
    """
    Sample relationship schema with label POINTS_TO, declared INWARD, targeting nodes labeled `SimpleNode`.
    The target node is matched on its `id` using the `sub_resource_id` kwarg.
    """
    target_node_label: str = 'SimpleNode'
    target_node_matcher: TargetNodeMatcher = make_target_node_matcher(
        {
            'id': PropertyRef('sub_resource_id', set_in_kwargs=True),
        },
    )
    direction: LinkDirection = LinkDirection.INWARD
    rel_label: str = "POINTS_TO"
    properties: NodeAToNodeBProps = NodeAToNodeBProps()


@dataclass(frozen=True)
class NodeA(CartographyNodeSchema):
    """
    Sample node schema labeled `NodeA`. It sets `other_relationships` (a single NodeAToNodeB) and does not set
    a sub resource relationship.
    """
    label: str = 'NodeA'
    properties: NodeAProperties = NodeAProperties()
    other_relationships: OtherRelationships = OtherRelationships(
        [NodeAToNodeB()],
    )
Empty file added tests/data/util/__init__.py
Empty file.
54 changes: 54 additions & 0 deletions tests/data/util/fake_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from dataclasses import asdict, fields
from typing import Any

from cartography.models.core.common import PropertyRef
from cartography.models.core.nodes import CartographyNodeProperties, CartographyNodeSchema
from cartography.models.core.relationships import OtherRelationships, TargetNodeMatcher, CartographyRelSchema


def _get_propref_keys_from_node_props(node_props: type[CartographyNodeProperties]) -> list[str]:
result = []
for field in fields(node_props):
prop_ref: PropertyRef = field.default
if prop_ref and prop_ref.set_in_kwargs is False:
result.append(
str(prop_ref).split('.')[1]
)
return result


def _get_propref_keys_from_rel(rel: type[CartographyRelSchema]) -> list[str]:
result = []
tgm: TargetNodeMatcher = rel.target_node_matcher
for field in fields(tgm):
prop_ref: PropertyRef = field.default
if prop_ref and prop_ref.set_in_kwargs is False:
result.append(
str(prop_ref).split('.')[1]
)
return result




def generate_fake_data(count: int, node_schema: type[CartographyNodeSchema]) -> list[dict[str, Any]]:
    """
    Generate `count` fake data dicts shaped for the given node schema.

    The keys are gathered from the node's properties and from the target node matchers of each of its
    other relationships (see the two `_get_propref_keys_from_*` helpers). In the i-th dict, every key maps to
    the string str(i).

    :param count: Number of dicts to generate.
    :param node_schema: The node schema to generate data for; its `properties` and `other_relationships`
        attributes are read.
    :return: A list of `count` dicts.
    """
    props = _get_propref_keys_from_node_props(node_schema.properties)

    other_rels: OtherRelationships = node_schema.other_relationships
    if other_rels:
        for rel in other_rels.rels:
            props.extend(_get_propref_keys_from_rel(rel))

    # Duplicate keys collected from several relationships collapse naturally in the dict comprehension.
    return [{prop: str(i) for prop in props} for i in range(count)]
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from dataclasses import fields

from cartography.client.core.tx import load_graph_data
from cartography.graph.cleanupbuilder import build_cleanup_queries
from cartography.graph.job import GraphJob
from cartography.graph.querybuilder import build_ingestion_query
from tests.data.graph.querybuilder.sample_models.node_without_sub_resource import NodeA, NodeAProperties
from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeSchema, SimpleNodeProperties
from tests.data.util.fake_data import generate_fake_data


def test_build_cleanup_queries_no_sub_resource(neo4j_session):
    """
    Load SimpleNode and NodeA test data, run the auto-generated cleanup job for NodeA (a node schema without a
    sub resource relationship), and check the cleanup queries generated for it.
    """
    # Imported locally because this module does not import the helper at file level.
    from tests.unit.cartography.graph.helpers import clean_query_list

    # Arrange
    # generate_fake_data() reads `.properties` from what it is given, so it needs the node schema rather than
    # the properties class.
    data = generate_fake_data(10, SimpleNodeSchema)
    query = build_ingestion_query(SimpleNodeSchema())
    load_graph_data(
        neo4j_session,
        query,
        data,
        lastupdated=1,
    )

    data = generate_fake_data(10, NodeA)
    query = build_ingestion_query(NodeA())
    load_graph_data(
        neo4j_session,
        query,
        data,
        lastupdated=1,
        sub_resource_id=3,
    )

    # Act
    # NOTE(review): UPDATE_TAG equals the lastupdated value loaded above, so this run is not expected to find
    # anything stale — confirm whether a later update tag was intended.
    common_job_parameters = {'UPDATE_TAG': 1}
    cleanup_job = GraphJob.from_node_schema(NodeA(), common_job_parameters)
    cleanup_job.run(neo4j_session)
    actual_queries = build_cleanup_queries(NodeA())

    # Assert
    # The target label must agree with NodeAToNodeB.target_node_label, which is 'SimpleNode'.
    expected_queries = [
        """
        MATCH (n:NodeA)
        MATCH (n)<-[r:POINTS_TO]-(:SimpleNode)
        WHERE r.lastupdated <> $UPDATE_TAG
        WITH r LIMIT $LIMIT_SIZE
        DELETE r;
        """,
    ]
    assert clean_query_list(actual_queries) == clean_query_list(expected_queries)
29 changes: 21 additions & 8 deletions tests/unit/cartography/graph/test_cleanupbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema
from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetToHelloAssetRel
from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetToSubResourceRel
from tests.data.graph.querybuilder.sample_models.node_without_sub_resource import NodeA
from tests.data.graph.querybuilder.sample_models.simple_node import SimpleNodeSchema
from tests.unit.cartography.graph.helpers import clean_query_list

Expand Down Expand Up @@ -120,14 +121,26 @@ def test_get_params_from_queries():
assert set(get_parameters(queries)) == {'UPDATE_TAG', 'sub_resource_id', 'LIMIT_SIZE'}


def test_build_cleanup_queries_selected_rels_no_sub_res_raises_exc():
    """
    Expect build_cleanup_queries() to raise a ValueError for SimpleNodeSchema, with a message saying that a
    node_schema without a sub resource relationship is not supported.

    NOTE(review): test_build_cleanup_queries_no_rels in this file expects an empty list for the same input —
    confirm which of the two behaviors is intended.
    """
    expected_message = 'node_schema without a sub resource relationship is not supported'
    with pytest.raises(ValueError, match=expected_message):
        build_cleanup_queries(SimpleNodeSchema())


def test_build_cleanup_node_and_rel_queries_sub_res_tgm_not_validated_raises_exc():
    """
    Expect _build_cleanup_node_and_rel_queries() to raise a ValueError mentioning 'must have set_in_kwargs=True'
    when called with FakeEC2InstanceSchema and FakeEC2InstanceToAWSAccount.
    """
    with pytest.raises(ValueError, match='must have set_in_kwargs=True'):
        node_schema = FakeEC2InstanceSchema()
        sub_resource_rel = FakeEC2InstanceToAWSAccount()
        _build_cleanup_node_and_rel_queries(node_schema, sub_resource_rel)


def test_build_cleanup_queries_no_sub_resource():
    """
    A node schema with other relationships but no sub resource relationship should yield one
    relationship-cleanup query per relationship and no node-cleanup queries.
    """
    actual_queries: list[str] = build_cleanup_queries(NodeA())
    # The target label must agree with NodeAToNodeB.target_node_label, which is 'SimpleNode'.
    expected_queries = [
        """
        MATCH (n:NodeA)
        MATCH (n)<-[r:POINTS_TO]-(:SimpleNode)
        WHERE r.lastupdated <> $UPDATE_TAG
        WITH r LIMIT $LIMIT_SIZE
        DELETE r;
        """,
    ]
    assert clean_query_list(actual_queries) == clean_query_list(expected_queries)


def test_build_cleanup_queries_no_rels():
    """
    build_cleanup_queries() is expected to return no queries for SimpleNodeSchema (presumably a schema that
    defines no relationships — verify against its definition).
    """
    generated: list[str] = build_cleanup_queries(SimpleNodeSchema())
    nothing_expected = []
    assert clean_query_list(generated) == clean_query_list(nothing_expected)
Loading