From 704120cf3f00d15b8bbaf8cdb76914306eba3f46 Mon Sep 17 00:00:00 2001 From: Noah Holm <32292420+noppaz@users.noreply.github.com> Date: Tue, 24 Dec 2024 17:08:39 +0100 Subject: [PATCH 1/2] Reapply changes from #7125 Co-authored by: Noah Holm <32292420+noppaz@users.noreply.github.com> Co-authored by: Jeremy Cohen --- .../unreleased/Features-20230307-134838.yaml | 6 +++++ core/dbt/artifacts/resources/base.py | 22 +++++++++++++++++++ core/dbt/cli/main.py | 1 + core/dbt/cli/params.py | 8 +++++++ core/dbt/constants.py | 3 --- core/dbt/contracts/files.py | 6 ++--- core/dbt/contracts/project.py | 1 + core/dbt/events/types.py | 17 +++++++++----- core/dbt/flags.py | 1 + core/dbt/parser/read_files.py | 9 +++++--- tests/unit/graph/test_selector_methods.py | 8 +++---- 11 files changed, 62 insertions(+), 20 deletions(-) create mode 100644 .changes/unreleased/Features-20230307-134838.yaml diff --git a/.changes/unreleased/Features-20230307-134838.yaml b/.changes/unreleased/Features-20230307-134838.yaml new file mode 100644 index 00000000000..0e6f769f863 --- /dev/null +++ b/.changes/unreleased/Features-20230307-134838.yaml @@ -0,0 +1,6 @@ +kind: Features +body: Make MAXIMUM_SEED_SIZE_MIB configurable +time: 2023-03-07T13:48:38.792321024Z +custom: + Author: noppaz acurtis-evi + Issue: 7117 7124 diff --git a/core/dbt/artifacts/resources/base.py b/core/dbt/artifacts/resources/base.py index 0911a997c19..0c3f052c46d 100644 --- a/core/dbt/artifacts/resources/base.py +++ b/core/dbt/artifacts/resources/base.py @@ -3,6 +3,7 @@ from typing import List, Optional from dbt.artifacts.resources.types import NodeType +from dbt_common.clients.system import convert_path from dbt_common.dataclass_schema import dbtClassMixin @@ -60,6 +61,27 @@ def from_contents(cls, contents: str, name="sha256") -> "FileHash": checksum = hashlib.new(name, data).hexdigest() return cls(name=name, checksum=checksum) + @classmethod + def from_path(cls, path: str, name="sha256") -> "FileHash": + """Create a file hash from 
the file at the given path. The hash is always taken over the + utf-8 encoding of the contents, which are stripped to give similar hashes + as `FileHash.from_contents`. + """ + path = convert_path(path) + chunk_size = 1 * 1024 * 1024 + file_hash = hashlib.new(name) + with open(path, "r") as handle: + # Strip the start of the first chunk and the end of the last chunk to give + # the same results as the seed hashing implementation with from_contents + chunk = handle.read(chunk_size).lstrip() + while chunk: + next_chunk = handle.read(chunk_size) + if not next_chunk: + chunk = chunk.rstrip() + file_hash.update(chunk.encode("utf-8")) + chunk = next_chunk + return cls(name=name, checksum=file_hash.hexdigest()) + @dataclass class Docs(dbtClassMixin): diff --git a/core/dbt/cli/main.py b/core/dbt/cli/main.py index 11cc81ef70e..96192c17fd3 100644 --- a/core/dbt/cli/main.py +++ b/core/dbt/cli/main.py @@ -118,6 +118,7 @@ def global_flags(func): @p.log_level_file @p.log_path @p.macro_debugging + @p.maximum_seed_size_mib @p.partial_parse @p.partial_parse_file_path @p.partial_parse_file_diff diff --git a/core/dbt/cli/params.py b/core/dbt/cli/params.py index 612728de222..08df2b6463b 100644 --- a/core/dbt/cli/params.py +++ b/core/dbt/cli/params.py @@ -167,6 +167,14 @@ default="eager", ) +maximum_seed_size_mib = click.option( + "--maximum-seed-size-mib", + envvar="DBT_MAXIMUM_SEED_SIZE_MIB", + help="Specify max size (MiB) for seed files that will be hashed for state comparison.", + type=click.INT, + default=1, +) + lock = click.option( "--lock", envvar=None, diff --git a/core/dbt/constants.py b/core/dbt/constants.py index 0ff538910d5..acb4d234f74 100644 --- a/core/dbt/constants.py +++ b/core/dbt/constants.py @@ -4,9 +4,6 @@ SECRET_PLACEHOLDER = "$$$DBT_SECRET_START$$${}$$$DBT_SECRET_END$$$" -MAXIMUM_SEED_SIZE = 1 * 1024 * 1024 -MAXIMUM_SEED_SIZE_NAME = "1MB" - PIN_PACKAGE_URL = ( "https://docs.getdbt.com/docs/package-management#section-specifying-package-versions" ) diff --git a/core/dbt/contracts/files.py
b/core/dbt/contracts/files.py index 15e951e026c..a477c619f0d 100644 --- a/core/dbt/contracts/files.py +++ b/core/dbt/contracts/files.py @@ -5,7 +5,6 @@ from mashumaro.types import SerializableType from dbt.artifacts.resources.base import FileHash -from dbt.constants import MAXIMUM_SEED_SIZE from dbt_common.dataclass_schema import StrEnum, dbtClassMixin from .util import SourceKey @@ -65,9 +64,8 @@ def absolute_path(self) -> str: def original_file_path(self) -> str: return os.path.join(self.searched_path, self.relative_path) - def seed_too_large(self) -> bool: - """Return whether the file this represents is over the seed size limit""" - return os.stat(self.full_path).st_size > MAXIMUM_SEED_SIZE + def file_size(self) -> int: + return os.stat(self.full_path).st_size @dataclass diff --git a/core/dbt/contracts/project.py b/core/dbt/contracts/project.py index 25fb19b4f58..25bd1f7a9df 100644 --- a/core/dbt/contracts/project.py +++ b/core/dbt/contracts/project.py @@ -324,6 +324,7 @@ class ProjectFlags(ExtensibleDbtClassMixin): log_format_file: Optional[str] = None log_level: Optional[str] = None log_level_file: Optional[str] = None + maximum_seed_size_mib: Optional[int] = None partial_parse: Optional[bool] = None populate_cache: Optional[bool] = None printer_width: Optional[int] = None diff --git a/core/dbt/events/types.py b/core/dbt/events/types.py index a2ae8a4d54b..242c6b429c5 100644 --- a/core/dbt/events/types.py +++ b/core/dbt/events/types.py @@ -1,6 +1,6 @@ import json -from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL +from dbt.constants import PIN_PACKAGE_URL from dbt.events.base_types import ( DebugLevel, DynamicLevel, @@ -8,6 +8,7 @@ InfoLevel, WarnLevel, ) +from dbt.flags import get_flags from dbt_common.events.base_types import EventLevel from dbt_common.events.format import ( format_fancy_output_line, @@ -675,10 +676,11 @@ def code(self) -> str: return "I052" def message(self) -> str: + maximum_seed_size_name = 
str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB" msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was " - f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed" + f">{maximum_seed_size_name} in size. The previous file was " + f"<={maximum_seed_size_name}, so it has changed" ) return msg @@ -688,9 +690,10 @@ def code(self) -> str: return "I053" def message(self) -> str: + maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB" msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt " + f">{maximum_seed_size_name} in size at the same path, dbt " f"cannot tell if it has changed: assuming they are the same" ) return msg @@ -701,9 +704,10 @@ def code(self) -> str: return "I054" def message(self) -> str: + maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB" msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in " + f">{maximum_seed_size_name} in size. The previous file was in " f"a different location, assuming it has changed" ) return msg @@ -714,9 +718,10 @@ def code(self) -> str: return "I055" def message(self) -> str: + maximum_seed_size_name = str(get_flags().MAXIMUM_SEED_SIZE_MIB) + "MiB" msg = ( f"Found a seed ({self.package_name}.{self.name}) " - f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a " + f">{maximum_seed_size_name} in size. 
The previous file had a " f"checksum type of {self.checksum_name}, so it has changed" ) return msg diff --git a/core/dbt/flags.py b/core/dbt/flags.py index 97ee9598f2f..48bc0c90249 100644 --- a/core/dbt/flags.py +++ b/core/dbt/flags.py @@ -69,6 +69,7 @@ def get_flag_dict(): "log_path", "invocation_command", "empty", + "maximum_seed_size_mib", } return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr} diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index d0ce1a551e3..c29a9c2e925 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -17,6 +17,7 @@ ) from dbt.events.types import InputFileDiffError from dbt.exceptions import ParsingError +from dbt.flags import get_flags from dbt.parser.common import schema_file_keys from dbt.parser.schemas import yaml_from_file from dbt.parser.search import filesystem_search @@ -123,12 +124,14 @@ def validate_yaml(file_path, dct): # Special processing for big seed files def load_seed_source_file(match: FilePath, project_name) -> SourceFile: - if match.seed_too_large(): + # Users can configure the maximum seed size (MiB) that will be hashed for state comparison + maximum_seed_size = get_flags().MAXIMUM_SEED_SIZE_MIB * 1024 * 1024 + # maximum_seed_size = 0 means no limit + if match.file_size() > maximum_seed_size and maximum_seed_size != 0: # We don't want to calculate a hash of this file. Use the path. 
source_file = SourceFile.big_seed(match) else: - file_contents = load_file_contents(match.absolute_path, strip=True) - checksum = FileHash.from_contents(file_contents) + checksum = FileHash.from_path(match.absolute_path) source_file = SourceFile(path=match, checksum=checksum) source_file.contents = "" source_file.parse_file_type = ParseFileType.Seed diff --git a/tests/unit/graph/test_selector_methods.py b/tests/unit/graph/test_selector_methods.py index d500c631a1b..86bfa247b3d 100644 --- a/tests/unit/graph/test_selector_methods.py +++ b/tests/unit/graph/test_selector_methods.py @@ -780,7 +780,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat event = warn_or_error_patch.call_args[0][0] assert type(event).__name__ == "SeedExceedsLimitSamePath" msg = event.message() - assert msg.startswith("Found a seed (pkg.seed) >1MB in size") + assert msg.startswith("Found a seed (pkg.seed) >1MiB in size") with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch: assert not search_manifest_using_method(manifest, method, "new") warn_or_error_patch.assert_not_called() @@ -793,7 +793,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat event = warn_or_error_patch.call_args[0][0] assert type(event).__name__ == "SeedExceedsLimitSamePath" msg = event.message() - assert msg.startswith("Found a seed (pkg.seed) >1MB in size") + assert msg.startswith("Found a seed (pkg.seed) >1MiB in size") def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state, seed): @@ -807,7 +807,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state event = warn_or_error_patch.call_args[0][0] assert type(event).__name__ == "SeedIncreased" msg = event.message() - assert msg.startswith("Found a seed (pkg.seed) >1MB in size") + assert msg.startswith("Found a seed (pkg.seed) >1MiB in size") with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch: 
assert not search_manifest_using_method(manifest, method, "new") warn_or_error_patch.assert_not_called() @@ -820,7 +820,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state event = warn_or_error_patch.call_args[0][0] assert type(event).__name__ == "SeedIncreased" msg = event.message() - assert msg.startswith("Found a seed (pkg.seed) >1MB in size") + assert msg.startswith("Found a seed (pkg.seed) >1MiB in size") def test_select_state_changed_seed_checksum_path_to_sha(manifest, previous_state, seed): From 20be92525ac2e9171c8a0d56f4c04d5c8df14099 Mon Sep 17 00:00:00 2001 From: Jeremy Cohen Date: Tue, 24 Dec 2024 17:44:51 +0100 Subject: [PATCH 2/2] Update MB -> MiB in functional test --- tests/functional/defer_state/test_modified_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/functional/defer_state/test_modified_state.py b/tests/functional/defer_state/test_modified_state.py index 91fa0d4c45c..19131a559f4 100644 --- a/tests/functional/defer_state/test_modified_state.py +++ b/tests/functional/defer_state/test_modified_state.py @@ -228,7 +228,7 @@ def test_changed_seed_contents_state(self, project): "./state", ] ) - assert ">1MB" in str(exc.value) + assert ">1MiB" in str(exc.value) # now check if unmodified returns none results = run_dbt(