Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#1413: Support automatic cleanups for NodeSchemas without a sub resource #1419

Merged
merged 3 commits into from
Dec 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions cartography/data/indexes.cypher
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,8 @@ CREATE INDEX IF NOT EXISTS FOR (n:GCPSubnet) ON (n.id);
CREATE INDEX IF NOT EXISTS FOR (n:GCPSubnet) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.id);
CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:GitHubOrganization) ON (n.id);
Copy link
Contributor Author

@achantavy achantavy Dec 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No longer needed anymore because GitHubOrganizations and GitHubUsers have been migrated to use the data model. The data model automatically ensures the required indexes are created.

CREATE INDEX IF NOT EXISTS FOR (n:GitHubOrganization) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.id);
CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:GitHubUser) ON (n.id);
CREATE INDEX IF NOT EXISTS FOR (n:GitHubUser) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:GKECluster) ON (n.id);
CREATE INDEX IF NOT EXISTS FOR (n:GKECluster) ON (n.lastupdated);
CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.email);
Expand Down
28 changes: 0 additions & 28 deletions cartography/data/jobs/cleanup/github_org_and_users_cleanup.json

This file was deleted.

51 changes: 47 additions & 4 deletions cartography/graph/cleanupbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,17 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]:
"""
Generates queries to clean up stale nodes and relationships from the given CartographyNodeSchema.
Note that auto-cleanups for a node with no relationships is not currently supported.

Algorithm:
1. If node_schema has no relationships at all, return empty.

Otherwise,

1. If node_schema doesn't have a sub_resource relationship, generate queries only to clean up its other
relationships. No nodes will be cleaned up.

Otherwise,

1. First delete all stale nodes attached to the node_schema's sub resource
2. Delete all stale node to sub resource relationships
- We don't expect this to be very common (never for AWS resources, at least), but in case it is possible for an
Expand All @@ -25,11 +35,16 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]:
:param node_schema: The given CartographyNodeSchema
:return: A list of Neo4j queries to clean up nodes and relationships.
"""
if not node_schema.sub_resource_relationship and not node_schema.other_relationships:
return []

if not node_schema.sub_resource_relationship:
raise ValueError(
"Auto-creating a cleanup job for a node_schema without a sub resource relationship is not supported. "
f'Please check the class definition of "{node_schema.__class__.__name__}".',
)
queries = []
other_rels = node_schema.other_relationships.rels if node_schema.other_relationships else []
for rel in other_rels:
query = _build_cleanup_rel_query_no_sub_resource(node_schema, rel)
queries.append(query)
return queries

result = _build_cleanup_node_and_rel_queries(node_schema, node_schema.sub_resource_relationship)
if node_schema.other_relationships:
Expand All @@ -41,6 +56,34 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]:
return result


def _build_cleanup_rel_query_no_sub_resource(
node_schema: CartographyNodeSchema,
selected_relationship: CartographyRelSchema,
) -> str:
"""
Helper function to delete stale relationships for node_schemas that have no sub resource relationship defined.
"""
if node_schema.sub_resource_relationship:
raise ValueError(
f'Expected {node_schema.label} to not exist. '
'This function is intended for node_schemas without sub_resource_relationships.',
)
# Ensure the node is attached to the sub resource and delete the node
query_template = Template(
"""
MATCH (n:$node_label)
$selected_rel_clause
WHERE r.lastupdated <> $UPDATE_TAG
WITH r LIMIT $LIMIT_SIZE
DELETE r;
""",
)
return query_template.safe_substitute(
node_label=node_schema.label,
selected_rel_clause=_build_selected_rel_clause(selected_relationship),
)


def _build_cleanup_node_and_rel_queries(
node_schema: CartographyNodeSchema,
selected_relationship: CartographyRelSchema,
Expand Down
13 changes: 10 additions & 3 deletions cartography/intel/github/users.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
import neo4j

from cartography.client.core.tx import load
from cartography.graph.job import GraphJob
from cartography.intel.github.util import fetch_all
from cartography.models.github.orgs import GitHubOrganizationSchema
from cartography.models.github.users import GitHubOrganizationUserSchema
from cartography.models.github.users import GitHubUnaffiliatedUserSchema
from cartography.stats import get_stats_client
from cartography.util import merge_module_sync_metadata
from cartography.util import run_cleanup_job
from cartography.util import timeit

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -210,6 +210,13 @@ def load_organization(
)


@timeit
def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict[str, Any]) -> None:
logger.info("Cleaning up GitHub users")
GraphJob.from_node_schema(GitHubOrganizationUserSchema(), common_job_parameters).run(neo4j_session)
GraphJob.from_node_schema(GitHubUnaffiliatedUserSchema(), common_job_parameters).run(neo4j_session)


@timeit
def sync(
neo4j_session: neo4j.Session,
Expand All @@ -236,8 +243,8 @@ def sync(
neo4j_session, GitHubUnaffiliatedUserSchema(), processed_unaffiliated_user_data, org_data,
common_job_parameters['UPDATE_TAG'],
)
# no automated cleanup job for users because user node has no sub_resource_relationship
run_cleanup_job('github_org_and_users_cleanup.json', neo4j_session, common_job_parameters)
cleanup(neo4j_session, common_job_parameters)

merge_module_sync_metadata(
neo4j_session,
group_type='GitHubOrganization',
Expand Down
44 changes: 44 additions & 0 deletions tests/data/github/users.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,50 @@
GITHUB_ORG_DATA,
)

GITHUB_USER_DATA_AT_TIMESTAMP_2 = (
[
{
'hasTwoFactorEnabled': None,
'node': {
'url': 'https://example.com/hjsimpson',
'login': 'hjsimpson',
'name': 'Homer Simpson',
'isSiteAdmin': False,
'email': 'hjsimpson@example.com',
'company': 'Springfield Nuclear Power Plant',
},
# In timestamp 2, Homer is now an admin and no longer a member.
# This is used to test that stale relationships are removed.
'role': 'ADMIN',
}, {
'hasTwoFactorEnabled': None,
'node': {
'url': 'https://example.com/lmsimpson',
'login': 'lmsimpson',
'name': 'Lisa Simpson',
'isSiteAdmin': False,
'email': 'lmsimpson@example.com',
'company': 'Simpson Residence',
},
'role': 'MEMBER',
}, {
'hasTwoFactorEnabled': True,
'node': {
'url': 'https://example.com/mbsimpson',
'login': 'mbsimpson',
'name': 'Marge Simpson',
'isSiteAdmin': False,
'email': 'mbsimpson@example.com',
'company': 'Simpson Residence',
},
# In timestamp 2, Marge is no longer an ADMIN and is now a MEMBER.
'role': 'MEMBER',
},
],
GITHUB_ORG_DATA,
)


# Subtle differences between owner data and user data:
# 1. owner data does not include a `hasTwoFactorEnabled` field (it in unavailable in the GraphQL query for these owners)
# 2. an `organizationRole` field instead of a `role` field. In owner data, membership within an org is not assumed, so
Expand Down
37 changes: 37 additions & 0 deletions tests/integration/cartography/intel/github/test_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from tests.data.github.users import GITHUB_ENTERPRISE_OWNER_DATA
from tests.data.github.users import GITHUB_ORG_DATA
from tests.data.github.users import GITHUB_USER_DATA
from tests.data.github.users import GITHUB_USER_DATA_AT_TIMESTAMP_2
from tests.integration.util import check_rels

TEST_UPDATE_TAG = 123456789
TEST_JOB_PARAMS = {'UPDATE_TAG': TEST_UPDATE_TAG}
Expand Down Expand Up @@ -145,3 +147,38 @@ def test_sync(mock_owners, mock_users, neo4j_session):
) for n in nodes
}
assert actual_nodes == expected_nodes


@patch.object(
cartography.intel.github.users,
'get_users',
side_effect=[GITHUB_USER_DATA, GITHUB_USER_DATA_AT_TIMESTAMP_2],
)
@patch.object(cartography.intel.github.users, 'get_enterprise_owners', return_value=GITHUB_ENTERPRISE_OWNER_DATA)
def test_sync_with_cleanups(mock_owners, mock_users, neo4j_session):
# Act
# Sync once
cartography.intel.github.users.sync(
neo4j_session,
{'UPDATE_TAG': 100},
FAKE_API_KEY,
TEST_GITHUB_URL,
TEST_GITHUB_ORG,
)
# Assert that the only admin is marge
assert check_rels(neo4j_session, 'GitHubUser', 'id', 'GitHubOrganization', 'id', 'ADMIN_OF') == {
('https://example.com/mbsimpson', 'https://example.com/my_org'),
}

# Act: Sync a second time
cartography.intel.github.users.sync(
neo4j_session,
{'UPDATE_TAG': 200},
FAKE_API_KEY,
TEST_GITHUB_URL,
TEST_GITHUB_ORG,
)
# Assert that Marge is no longer an ADMIN of the GitHub org and the admin is now Homer
assert check_rels(neo4j_session, 'GitHubUser', 'id', 'GitHubOrganization', 'id', 'ADMIN_OF') == {
('https://example.com/hjsimpson', 'https://example.com/my_org'),
}
53 changes: 45 additions & 8 deletions tests/unit/cartography/graph/test_cleanupbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import pytest

from cartography.graph.cleanupbuilder import _build_cleanup_node_and_rel_queries
from cartography.graph.cleanupbuilder import _build_cleanup_rel_query_no_sub_resource
from cartography.graph.cleanupbuilder import build_cleanup_queries
from cartography.graph.job import get_parameters
from cartography.models.aws.emr import EMRClusterToAWSAccount
from cartography.models.github.users import GitHubOrganizationUserSchema
from tests.data.graph.querybuilder.sample_models.asset_with_non_kwargs_tgm import FakeEC2InstanceSchema
from tests.data.graph.querybuilder.sample_models.asset_with_non_kwargs_tgm import FakeEC2InstanceToAWSAccount
from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema
Expand Down Expand Up @@ -120,14 +122,49 @@ def test_get_params_from_queries():
assert set(get_parameters(queries)) == {'UPDATE_TAG', 'sub_resource_id', 'LIMIT_SIZE'}


def test_build_cleanup_queries_selected_rels_no_sub_res_raises_exc():
"""
Test that not specifying the sub resource rel as a selected_relationship in build_cleanup_queries raises exception
"""
with pytest.raises(ValueError, match='node_schema without a sub resource relationship is not supported'):
build_cleanup_queries(SimpleNodeSchema())


def test_build_cleanup_node_and_rel_queries_sub_res_tgm_not_validated_raises_exc():
with pytest.raises(ValueError, match='must have set_in_kwargs=True'):
_build_cleanup_node_and_rel_queries(FakeEC2InstanceSchema(), FakeEC2InstanceToAWSAccount())


def test_build_cleanup_queries_no_sub_resource():
# Ensure that we only clean up stale relationships for node schemas that don't have a sub resource defined.
# That is, we do not delete stale nodes.
actual_queries: list[str] = build_cleanup_queries(GitHubOrganizationUserSchema())
expected_queries = [
'''
MATCH (n:GitHubUser)
MATCH (n)-[r:MEMBER_OF]->(:GitHubOrganization)
WHERE r.lastupdated <> $UPDATE_TAG
WITH r LIMIT $LIMIT_SIZE
DELETE r;
''',
'''
MATCH (n:GitHubUser)
MATCH (n)-[r:ADMIN_OF]->(:GitHubOrganization)
WHERE r.lastupdated <> $UPDATE_TAG
WITH r LIMIT $LIMIT_SIZE
DELETE r;
''',
]
assert clean_query_list(actual_queries) == clean_query_list(expected_queries)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: fodder for a different PR - might be better to compare sets here instead of lists



def test_build_cleanup_queries_no_rels():
# Ensure that no queries are generated if we try to clean up a node with no defined relationships
actual_queries: list[str] = build_cleanup_queries(SimpleNodeSchema())
expected_queries = []
assert clean_query_list(actual_queries) == clean_query_list(expected_queries)


def test_build_cleanup_rel_query_no_sub_resource_raises_on_sub_resource():
"""
Test that _build_cleanup_rel_query_no_sub_resource raises ValueError when given a node schema
that has a sub_resource_relationship defined.
"""
# InterestingAssetSchema has a sub_resource_relationship defined
node_schema = InterestingAssetSchema()
rel_schema = InterestingAssetToHelloAssetRel()

with pytest.raises(ValueError, match="Expected InterestingAsset to not exist"):
_build_cleanup_rel_query_no_sub_resource(node_schema, rel_schema)
Loading