diff --git a/cartography/data/indexes.cypher b/cartography/data/indexes.cypher index 6cca3839e4..e70f534a87 100644 --- a/cartography/data/indexes.cypher +++ b/cartography/data/indexes.cypher @@ -161,12 +161,8 @@ CREATE INDEX IF NOT EXISTS FOR (n:GCPSubnet) ON (n.id); CREATE INDEX IF NOT EXISTS FOR (n:GCPSubnet) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.id); CREATE INDEX IF NOT EXISTS FOR (n:GCPVpc) ON (n.lastupdated); -CREATE INDEX IF NOT EXISTS FOR (n:GitHubOrganization) ON (n.id); -CREATE INDEX IF NOT EXISTS FOR (n:GitHubOrganization) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.id); CREATE INDEX IF NOT EXISTS FOR (n:GitHubRepository) ON (n.lastupdated); -CREATE INDEX IF NOT EXISTS FOR (n:GitHubUser) ON (n.id); -CREATE INDEX IF NOT EXISTS FOR (n:GitHubUser) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:GKECluster) ON (n.id); CREATE INDEX IF NOT EXISTS FOR (n:GKECluster) ON (n.lastupdated); CREATE INDEX IF NOT EXISTS FOR (n:GSuiteGroup) ON (n.email); diff --git a/cartography/data/jobs/cleanup/github_org_and_users_cleanup.json b/cartography/data/jobs/cleanup/github_org_and_users_cleanup.json deleted file mode 100644 index 7c7f6ff403..0000000000 --- a/cartography/data/jobs/cleanup/github_org_and_users_cleanup.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "statements": [{ - "query": "MATCH (n:GitHubUser) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (n:GitHubOrganization) WHERE n.lastupdated <> $UPDATE_TAG WITH n LIMIT $LIMIT_SIZE DETACH DELETE (n)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:OWNER]->(:GitHubRepository) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:MEMBER_OF]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }, - { - "query": "MATCH (:GitHubUser)-[r:UNAFFILIATED]->(:GitHubOrganization) WHERE r.lastupdated <> $UPDATE_TAG WITH r LIMIT $LIMIT_SIZE DELETE (r)", - "iterative": true, - "iterationsize": 100 - }], - "name": "cleanup GitHub users data" -} diff --git a/cartography/graph/cleanupbuilder.py b/cartography/graph/cleanupbuilder.py index bee0fa6519..227e8c1412 100644 --- a/cartography/graph/cleanupbuilder.py +++ b/cartography/graph/cleanupbuilder.py @@ -16,7 +16,17 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]: """ Generates queries to clean up stale nodes and relationships from the given CartographyNodeSchema. Note that auto-cleanups for a node with no relationships is not currently supported. + Algorithm: + 1. If node_schema has no relationships at all, return empty. + + Otherwise, + + 1. If node_schema doesn't have a sub_resource relationship, generate queries only to clean up its other + relationships. No nodes will be cleaned up. + + Otherwise, + 1. First delete all stale nodes attached to the node_schema's sub resource 2. Delete all stale node to sub resource relationships - We don't expect this to be very common (never for AWS resources, at least), but in case it is possible for an @@ -25,11 +35,16 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]: :param node_schema: The given CartographyNodeSchema :return: A list of Neo4j queries to clean up nodes and relationships. """ + if not node_schema.sub_resource_relationship and not node_schema.other_relationships: + return [] + if not node_schema.sub_resource_relationship: - raise ValueError( - "Auto-creating a cleanup job for a node_schema without a sub resource relationship is not supported. " - f'Please check the class definition of "{node_schema.__class__.__name__}".', - ) + queries = [] + other_rels = node_schema.other_relationships.rels if node_schema.other_relationships else [] + for rel in other_rels: + query = _build_cleanup_rel_query_no_sub_resource(node_schema, rel) + queries.append(query) + return queries result = _build_cleanup_node_and_rel_queries(node_schema, node_schema.sub_resource_relationship) if node_schema.other_relationships: @@ -41,6 +56,34 @@ def build_cleanup_queries(node_schema: CartographyNodeSchema) -> List[str]: return result +def _build_cleanup_rel_query_no_sub_resource( + node_schema: CartographyNodeSchema, + selected_relationship: CartographyRelSchema, +) -> str: + """ + Helper function to delete stale relationships for node_schemas that have no sub resource relationship defined. + """ + if node_schema.sub_resource_relationship: + raise ValueError( + f'Expected {node_schema.label} to not exist. ' + 'This function is intended for node_schemas without sub_resource_relationships.', + ) + # Ensure the node is attached to the sub resource and delete the node + query_template = Template( + """ + MATCH (n:$node_label) + $selected_rel_clause + WHERE r.lastupdated <> $UPDATE_TAG + WITH r LIMIT $LIMIT_SIZE + DELETE r; + """, + ) + return query_template.safe_substitute( + node_label=node_schema.label, + selected_rel_clause=_build_selected_rel_clause(selected_relationship), + ) + + def _build_cleanup_node_and_rel_queries( node_schema: CartographyNodeSchema, selected_relationship: CartographyRelSchema, diff --git a/cartography/intel/github/users.py b/cartography/intel/github/users.py index 4fd50a7258..1fa7876638 100644 --- a/cartography/intel/github/users.py +++ b/cartography/intel/github/users.py @@ -8,13 +8,13 @@ import neo4j from cartography.client.core.tx import load +from cartography.graph.job import GraphJob from cartography.intel.github.util import fetch_all from cartography.models.github.orgs import GitHubOrganizationSchema from cartography.models.github.users import GitHubOrganizationUserSchema from cartography.models.github.users import GitHubUnaffiliatedUserSchema from cartography.stats import get_stats_client from cartography.util import merge_module_sync_metadata -from cartography.util import run_cleanup_job from cartography.util import timeit logger = logging.getLogger(__name__) @@ -210,6 +210,13 @@ def load_organization( ) +@timeit +def cleanup(neo4j_session: neo4j.Session, common_job_parameters: dict[str, Any]) -> None: + logger.info("Cleaning up GitHub users") + GraphJob.from_node_schema(GitHubOrganizationUserSchema(), common_job_parameters).run(neo4j_session) + GraphJob.from_node_schema(GitHubUnaffiliatedUserSchema(), common_job_parameters).run(neo4j_session) + + @timeit def sync( neo4j_session: neo4j.Session, @@ -236,8 +243,8 @@ def sync( neo4j_session, GitHubUnaffiliatedUserSchema(), processed_unaffiliated_user_data, org_data, common_job_parameters['UPDATE_TAG'], ) - # no automated cleanup job for users because user node has no sub_resource_relationship - run_cleanup_job('github_org_and_users_cleanup.json', neo4j_session, common_job_parameters) + cleanup(neo4j_session, common_job_parameters) + merge_module_sync_metadata( neo4j_session, group_type='GitHubOrganization', diff --git a/tests/data/github/users.py b/tests/data/github/users.py index c1a1842932..6cfff47265 100644 --- a/tests/data/github/users.py +++ b/tests/data/github/users.py @@ -44,6 +44,50 @@ GITHUB_ORG_DATA, ) +GITHUB_USER_DATA_AT_TIMESTAMP_2 = ( + [ + { + 'hasTwoFactorEnabled': None, + 'node': { + 'url': 'https://example.com/hjsimpson', + 'login': 'hjsimpson', + 'name': 'Homer Simpson', + 'isSiteAdmin': False, + 'email': 'hjsimpson@example.com', + 'company': 'Springfield Nuclear Power Plant', + }, + # In timestamp 2, Homer is now an admin and no longer a member. + # This is used to test that stale relationships are removed. + 'role': 'ADMIN', + }, { + 'hasTwoFactorEnabled': None, + 'node': { + 'url': 'https://example.com/lmsimpson', + 'login': 'lmsimpson', + 'name': 'Lisa Simpson', + 'isSiteAdmin': False, + 'email': 'lmsimpson@example.com', + 'company': 'Simpson Residence', + }, + 'role': 'MEMBER', + }, { + 'hasTwoFactorEnabled': True, + 'node': { + 'url': 'https://example.com/mbsimpson', + 'login': 'mbsimpson', + 'name': 'Marge Simpson', + 'isSiteAdmin': False, + 'email': 'mbsimpson@example.com', + 'company': 'Simpson Residence', + }, + # In timestamp 2, Marge is no longer an ADMIN and is now a MEMBER. + 'role': 'MEMBER', + }, + ], + GITHUB_ORG_DATA, +) + + # Subtle differences between owner data and user data: # 1. owner data does not include a `hasTwoFactorEnabled` field (it in unavailable in the GraphQL query for these owners) # 2. an `organizationRole` field instead of a `role` field. In owner data, membership within an org is not assumed, so diff --git a/tests/integration/cartography/intel/github/test_users.py b/tests/integration/cartography/intel/github/test_users.py index 653f98190f..8c2a755758 100644 --- a/tests/integration/cartography/intel/github/test_users.py +++ b/tests/integration/cartography/intel/github/test_users.py @@ -5,6 +5,8 @@ from tests.data.github.users import GITHUB_ENTERPRISE_OWNER_DATA from tests.data.github.users import GITHUB_ORG_DATA from tests.data.github.users import GITHUB_USER_DATA +from tests.data.github.users import GITHUB_USER_DATA_AT_TIMESTAMP_2 +from tests.integration.util import check_rels TEST_UPDATE_TAG = 123456789 TEST_JOB_PARAMS = {'UPDATE_TAG': TEST_UPDATE_TAG} @@ -145,3 +147,38 @@ def test_sync(mock_owners, mock_users, neo4j_session): ) for n in nodes } assert actual_nodes == expected_nodes + + +@patch.object( + cartography.intel.github.users, + 'get_users', + side_effect=[GITHUB_USER_DATA, GITHUB_USER_DATA_AT_TIMESTAMP_2], +) +@patch.object(cartography.intel.github.users, 'get_enterprise_owners', return_value=GITHUB_ENTERPRISE_OWNER_DATA) +def test_sync_with_cleanups(mock_owners, mock_users, neo4j_session): + # Act + # Sync once + cartography.intel.github.users.sync( + neo4j_session, + {'UPDATE_TAG': 100}, + FAKE_API_KEY, + TEST_GITHUB_URL, + TEST_GITHUB_ORG, + ) + # Assert that the only admin is marge + assert check_rels(neo4j_session, 'GitHubUser', 'id', 'GitHubOrganization', 'id', 'ADMIN_OF') == { + ('https://example.com/mbsimpson', 'https://example.com/my_org'), + } + + # Act: Sync a second time + cartography.intel.github.users.sync( + neo4j_session, + {'UPDATE_TAG': 200}, + FAKE_API_KEY, + TEST_GITHUB_URL, + TEST_GITHUB_ORG, + ) + # Assert that Marge is no longer an ADMIN of the GitHub org and the admin is now Homer + assert check_rels(neo4j_session, 'GitHubUser', 'id', 'GitHubOrganization', 'id', 'ADMIN_OF') == { + ('https://example.com/hjsimpson', 'https://example.com/my_org'), + } diff --git a/tests/unit/cartography/graph/test_cleanupbuilder.py b/tests/unit/cartography/graph/test_cleanupbuilder.py index 3d9a01dd6d..8bb9e09a68 100644 --- a/tests/unit/cartography/graph/test_cleanupbuilder.py +++ b/tests/unit/cartography/graph/test_cleanupbuilder.py @@ -3,9 +3,11 @@ import pytest from cartography.graph.cleanupbuilder import _build_cleanup_node_and_rel_queries +from cartography.graph.cleanupbuilder import _build_cleanup_rel_query_no_sub_resource from cartography.graph.cleanupbuilder import build_cleanup_queries from cartography.graph.job import get_parameters from cartography.models.aws.emr import EMRClusterToAWSAccount +from cartography.models.github.users import GitHubOrganizationUserSchema from tests.data.graph.querybuilder.sample_models.asset_with_non_kwargs_tgm import FakeEC2InstanceSchema from tests.data.graph.querybuilder.sample_models.asset_with_non_kwargs_tgm import FakeEC2InstanceToAWSAccount from tests.data.graph.querybuilder.sample_models.interesting_asset import InterestingAssetSchema @@ -120,14 +122,49 @@ def test_get_params_from_queries(): assert set(get_parameters(queries)) == {'UPDATE_TAG', 'sub_resource_id', 'LIMIT_SIZE'} -def test_build_cleanup_queries_selected_rels_no_sub_res_raises_exc(): - """ - Test that not specifying the sub resource rel as a selected_relationship in build_cleanup_queries raises exception - """ - with pytest.raises(ValueError, match='node_schema without a sub resource relationship is not supported'): - build_cleanup_queries(SimpleNodeSchema()) - - def test_build_cleanup_node_and_rel_queries_sub_res_tgm_not_validated_raises_exc(): with pytest.raises(ValueError, match='must have set_in_kwargs=True'): _build_cleanup_node_and_rel_queries(FakeEC2InstanceSchema(), FakeEC2InstanceToAWSAccount()) + + +def test_build_cleanup_queries_no_sub_resource(): + # Ensure that we only clean up stale relationships for node schemas that don't have a sub resource defined. + # That is, we do not delete stale nodes. + actual_queries: list[str] = build_cleanup_queries(GitHubOrganizationUserSchema()) + expected_queries = [ + ''' + MATCH (n:GitHubUser) + MATCH (n)-[r:MEMBER_OF]->(:GitHubOrganization) + WHERE r.lastupdated <> $UPDATE_TAG + WITH r LIMIT $LIMIT_SIZE + DELETE r; + ''', + ''' + MATCH (n:GitHubUser) + MATCH (n)-[r:ADMIN_OF]->(:GitHubOrganization) + WHERE r.lastupdated <> $UPDATE_TAG + WITH r LIMIT $LIMIT_SIZE + DELETE r; + ''', + ] + assert clean_query_list(actual_queries) == clean_query_list(expected_queries) + + +def test_build_cleanup_queries_no_rels(): + # Ensure that no queries are generated if we try to clean up a node with no defined relationships + actual_queries: list[str] = build_cleanup_queries(SimpleNodeSchema()) + expected_queries = [] + assert clean_query_list(actual_queries) == clean_query_list(expected_queries) + + +def test_build_cleanup_rel_query_no_sub_resource_raises_on_sub_resource(): + """ + Test that _build_cleanup_rel_query_no_sub_resource raises ValueError when given a node schema + that has a sub_resource_relationship defined. + """ + # InterestingAssetSchema has a sub_resource_relationship defined + node_schema = InterestingAssetSchema() + rel_schema = InterestingAssetToHelloAssetRel() + + with pytest.raises(ValueError, match="Expected InterestingAsset to not exist"): + _build_cleanup_rel_query_no_sub_resource(node_schema, rel_schema)