From 46ab871b269f73d31dd45103d3f8a07caafa7cc8 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 21 May 2024 23:55:39 -0500 Subject: [PATCH] add path-too-long check (#244) --- docs/_static/defaults.toml | 7 +++++ docs/check-reference.rst | 52 ++++++++++++++++++++++++++++++++ src/pydistcheck/checks.py | 21 +++++++++++++ src/pydistcheck/cli.py | 61 ++++++++++++++++++++++---------------- src/pydistcheck/config.py | 10 ++++--- tests/test_cli.py | 47 ++++++++++++++++++++++++++++- tests/test_config.py | 20 ++++++++----- 7 files changed, 180 insertions(+), 38 deletions(-) diff --git a/docs/_static/defaults.toml b/docs/_static/defaults.toml index 38aa2ce..b8a8ad1 100644 --- a/docs/_static/defaults.toml +++ b/docs/_static/defaults.toml @@ -25,6 +25,13 @@ max_allowed_size_compressed = '50M' # See 'pydistcheck --help' for available units. max_allowed_size_uncompressed = '75M' +# If any file or directory in the distribution has a path longer +# than this many characters, pydistcheck reports a 'path-too-long' check failure. +# +# For help choosing a value, see +# https://pydistcheck.readthedocs.io/en/latest/check-reference.html#path-too-long +max_path_length = 200 + # List of fnmatch.fnmatchcase() patterns to be compared to directories # in the distribution. expected_directories = [ diff --git a/docs/check-reference.rst b/docs/check-reference.rst index f2d4143..40e9af6 100644 --- a/docs/check-reference.rst +++ b/docs/check-reference.rst @@ -121,6 +121,58 @@ For more information, see: * `"Don't use spaces or underscores in file paths" (blog post) `_ * `"What technical reasons exist for not using space characters in file names?" (Stack Overflow) `_ +path-too-long +************* + +A file or directory in the distribution has a path that has too many characters. + +Some operating systems have limits on path lengths, and distributions with longer paths +might not be installable on those systems. + +By default, ``pydistcheck`` reports this check failure if it detects any paths longer than ``200`` characters. +This is primarily informed by the following limitations: + +* many Windows systems limit the total filepath length (excluding drive specifiers like ``C://``) to 256 characters +* some older ``tar`` implementations will not support paths longer than 256 characters + +See below for details. + +> *Tarballs are only required to store paths of up to 100 bytes and cannot store those of more than 256 bytes*. + +`R CMD check source code `__ + +> *...packages are normally distributed as tarballs, and these have a limit on path lengths: for maximal portability 100 bytes.* + +`"Package Structure" (Writing R Extensions) `__ + +> *Windows historically has limited path lengths to 260 characters.* +> *This meant that paths longer than this would not resolve and errors would result.* +> +> *In the latest versions of Windows, this limitation can be expanded to approximately 32,000 characters.* +> *Your administrator will need to activate the ``"Enable Win32 long paths"`` group policy, or set ``LongPathsEnabled`` to 1 in the registry key ``HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem``.* +> +> *This allows the open() function, the os module and most other path functionality to accept and return paths longer than 260 characters.* + +`"Removing the Max Path Limitation" (Python Windows docs) `__ + +> *Git has a limit of 4096 characters for a filename, except on Windows when Git is compiled with msys.* +> *It uses an older version of the Windows API and there's a limit of 260 characters for a filename.* +> +> *You can circumvent this by using another Git client on Windows or set ``core.longpaths`` to ``true``...* + +`Filename too long in Git for Windows (Stack Overflow answer) `__ + +Other relevant discussions: + +* `"Maximum Path Length" (Windows docs) `__ +* `"Comparison of Filesystems: Limits" (Wikipedia) `__ +* `"Could the 100 byte path length limit be lifted?" (r-pkg-devel, 2023) `__ +* `"R CMD check NOTE - Long paths in package" (r-pkg-devel, 2015) `__ +* `"Filename length limits on linux?" (serverfault answer, 2009-2016) `__ +* `"Command prompt (Cmd. exe) command-line string limitation" (Windows docs, 2023) `__ +* `conda-build discussion about 255-character prefix limit (conda/conda-build#1482) `__ +* `discussion about paths lengths (Python Discourse, 2023) `__ + too-many-files ************** diff --git a/src/pydistcheck/checks.py b/src/pydistcheck/checks.py index cc976de..534d8eb 100644 --- a/src/pydistcheck/checks.py +++ b/src/pydistcheck/checks.py @@ -27,6 +27,7 @@ "mixed-file-extensions", "path-contains-non-ascii-characters", "path-contains-spaces", + "path-too-long", "unexpected-files", } @@ -203,6 +204,26 @@ def __call__(self, distro_summary: _DistributionSummary) -> List[str]: return out +class _PathTooLongCheck(_CheckProtocol): + check_name = "path-too-long" + + def __init__(self, max_path_length: int): + self.max_path_length = max_path_length + + def __call__(self, distro_summary: _DistributionSummary) -> List[str]: + out: List[str] = [] + bad_paths = [ + p for p in distro_summary.all_paths if len(p) > self.max_path_length + ] + for file_path in bad_paths: + msg = ( + f"[{self.check_name}] Path too long ({len(file_path)} > {self.max_path_length}): " + f"'{file_path}'" + ) + out.append(msg) + return out + + class _SpacesInPathCheck(_CheckProtocol): check_name = "path-contains-spaces" diff --git a/src/pydistcheck/cli.py b/src/pydistcheck/cli.py index 7b28bcb..b4c61d6 100644 --- a/src/pydistcheck/cli.py +++ b/src/pydistcheck/cli.py @@ -19,6 +19,7 @@ _FilesOnlyDifferByCaseCheck, _MixedFileExtensionCheck, _NonAsciiCharacterCheck, + _PathTooLongCheck, _SpacesInPathCheck, _UnexpectedFilesCheck, ) @@ -74,6 +75,32 @@ class ExitCodes: "Print a summary of the distribution, like its total size and largest files." ), ) +@click.option( # type: ignore[misc] + "--expected-directories", + default=_Config.expected_directories, + show_default=True, + type=str, + help=( + "comma-delimited list of patterns matching directories that are expected " + "to be found in the distribution. Prefix with '!' to indicate a pattern which " + "should NOT match any of the distribution's contents. Patterns should be in " + "the format understood by ``fnmatch.fnmatchcase()``. " + "See https://docs.python.org/3/library/fnmatch.html." + ), +) +@click.option( # type: ignore[misc] + "--expected-files", + default=_Config.expected_files, + show_default=True, + type=str, + help=( + "comma-delimited list of patterns matching files that are expected " + "to be found in the distribution. Prefix with '!' to indicate a pattern which " + "should NOT match any of the distribution's contents. Patterns should be in " + "the format understood by ``fnmatch.fnmatchcase()``. " + "See https://docs.python.org/3/library/fnmatch.html." + ), +) @click.option( # type: ignore[misc] "--max-allowed-files", default=_Config.max_allowed_files, @@ -110,43 +137,25 @@ class ExitCodes: ), ) @click.option( # type: ignore[misc] - "--expected-directories", - default=_Config.expected_directories, - show_default=True, - type=str, - help=( - "comma-delimited list of patterns matching directories that are expected " - "to be found in the distribution. Prefix with '!' to indicate a pattern which " - "should NOT match any of the distribution's contents. Patterns should be in " - "the format understood by ``fnmatch.fnmatchcase()``. " - "See https://docs.python.org/3/library/fnmatch.html." - ), -) -@click.option( # type: ignore[misc] - "--expected-files", - default=_Config.expected_files, + "--max-path-length", + default=_Config.max_path_length, show_default=True, - type=str, - help=( - "comma-delimited list of patterns matching files that are expected " - "to be found in the distribution. Prefix with '!' to indicate a pattern which " - "should NOT match any of the distribution's contents. Patterns should be in " - "the format understood by ``fnmatch.fnmatchcase()``. " - "See https://docs.python.org/3/library/fnmatch.html." - ), + type=int, + help="Maximum allowed filepath length for files or directories in the distribution.", ) def check( # noqa: PLR0913 *, filepaths: str, version: bool, config: str, + expected_directories: str, + expected_files: str, ignore: str, inspect: bool, max_allowed_files: int, max_allowed_size_compressed: str, max_allowed_size_uncompressed: str, - expected_directories: str, - expected_files: str, + max_path_length: int, ) -> None: """ Run the contents of a distribution through a set of checks, and warn about @@ -170,6 +179,7 @@ def check( # noqa: PLR0913 "max_allowed_files": max_allowed_files, "max_allowed_size_compressed": max_allowed_size_compressed, "max_allowed_size_uncompressed": max_allowed_size_uncompressed, + "max_path_length": max_path_length, "expected_directories": expected_directories, "expected_files": expected_files, } @@ -213,6 +223,7 @@ def check( # noqa: PLR0913 _FileCountCheck(max_allowed_files=conf.max_allowed_files), _FilesOnlyDifferByCaseCheck(), _MixedFileExtensionCheck(), + _PathTooLongCheck(max_path_length=conf.max_path_length), _SpacesInPathCheck(), _UnexpectedFilesCheck( directory_patterns=expected_directories.split(","), diff --git a/src/pydistcheck/config.py b/src/pydistcheck/config.py index 29836da..25ef9a5 100644 --- a/src/pydistcheck/config.py +++ b/src/pydistcheck/config.py @@ -16,13 +16,14 @@ # # unit tests confirm that it matches the `_Config` class, so it shouldn't ever drift from that class _ALLOWED_CONFIG_VALUES = { + "expected_directories", + "expected_files", "ignore", "inspect", "max_allowed_files", "max_allowed_size_compressed", "max_allowed_size_uncompressed", - "expected_directories", - "expected_files", + "max_path_length", } _EXPECTED_DIRECTORIES = ",".join( @@ -62,13 +63,14 @@ @dataclass class _Config: + expected_directories: str = _EXPECTED_DIRECTORIES + expected_files: str = _EXPECTED_FILES ignore: str = "" inspect: bool = False max_allowed_files: int = 2000 max_allowed_size_compressed: str = "50M" max_allowed_size_uncompressed: str = "75M" - expected_directories: str = _EXPECTED_DIRECTORIES - expected_files: str = _EXPECTED_FILES + max_path_length: int = 200 def __setattr__(self, name: str, value: Any) -> None: attr_name = name.replace("-", "_") diff --git a/tests/test_cli.py b/tests/test_cli.py index 84e75d9..5621bac 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -17,12 +17,15 @@ ] MACOS_SUFFIX = "macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_5_x86_64.whl" MANYLINUX_SUFFIX = "manylinux_2_28_x86_64.manylinux_2_5_x86_64.manylinux1_x86_64.whl" -BASEBALL_PACKAGES = [ +BASEBALL_CONDA_PACKAGES = [ "osx-64-baseballmetrics-0.1.0-0.conda", "osx-64-baseballmetrics-0.1.0-0.tar.bz2", +] +BASEBALL_WHEELS = [ f"baseballmetrics-0.1.0-py3-none-{MACOS_SUFFIX}", f"baseballmetrics-0.1.0-py3-none-{MANYLINUX_SUFFIX}", ] +BASEBALL_PACKAGES = BASEBALL_CONDA_PACKAGES + BASEBALL_WHEELS # NOTE: .bz2 and .tar.gz packages here are just unzipped # and re-tarred Python wheels... to avoid pydistcheck # implicitly assuming that, for example, '*.tar.gz' must @@ -671,6 +674,48 @@ def test_expected_files_does_not_raise_check_failure_if_directory_pattern_matche ) +@pytest.mark.parametrize("distro_file", BASEBALL_CONDA_PACKAGES) +def test_path_too_long_check_works_for_conda_packages(distro_file): + runner = CliRunner() + result = runner.invoke( + check, + [ + "--max-path-length=5", + os.path.join(TEST_DATA_DIR, distro_file), + ], + ) + assert result.exit_code == 1 + + _assert_log_matches_pattern( + result=result, + pattern=( + r"\[path\-too\-long\] Path too long \(78 > 5\)\: " + r"'lib/python3\.9/site\-packages/baseballmetrics/__pycache__/metrics\.cpython\-39\.pyc'" + ), + ) + + +@pytest.mark.parametrize("distro_file", BASEBALL_WHEELS) +def test_path_too_long_check_works_for_wheels(distro_file): + runner = CliRunner() + result = runner.invoke( + check, + [ + "--max-path-length=5", + os.path.join(TEST_DATA_DIR, distro_file), + ], + ) + assert result.exit_code == 1 + + _assert_log_matches_pattern( + result=result, + pattern=( + r"\[path\-too\-long\] Path too long \(30 > 5\)\: " + r"'baseballmetrics/_shared_lib\.py'" + ), + ) + + @pytest.mark.parametrize("distro_file", PROBLEMATIC_PACKAGES) def test_cli_raises_exactly_the_expected_number_of_errors_for_the_problematic_package( distro_file, diff --git a/tests/test_config.py b/tests/test_config.py index 54cda27..8e24f1b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -59,24 +59,26 @@ def test_update_from_dict_works_when_changing_all_values(base_config): assert base_config.max_allowed_size_compressed == "1G" assert base_config.max_allowed_size_uncompressed == "18K" patch_dict = { + "expected_directories": "!*/tests", + "expected_files": "!*.xlsx,!data/*.csv", "ignore": "path-contains-spaces,too-many-files", "inspect": True, "max_allowed_files": 8, "max_allowed_size_compressed": "2G", "max_allowed_size_uncompressed": "141K", - "expected_directories": "!*/tests", - "expected_files": "!*.xlsx,!data/*.csv", + "max_path_length": 600, } assert ( set(patch_dict.keys()) == _ALLOWED_CONFIG_VALUES ), "this test needs to be updated" base_config.update_from_dict(patch_dict) + assert base_config.expected_directories == "!*/tests" + assert base_config.expected_files == "!*.xlsx,!data/*.csv" assert base_config.inspect is True assert base_config.max_allowed_files == 8 assert base_config.max_allowed_size_compressed == "2G" assert base_config.max_allowed_size_uncompressed == "141K" - assert base_config.expected_directories == "!*/tests" - assert base_config.expected_files == "!*.xlsx,!data/*.csv" + assert base_config.max_path_length == 600 def test_update_from_toml_silently_returns_self_if_file_does_not_exist(base_config): @@ -126,13 +128,14 @@ def test_update_from_toml_works_with_all_config_values( ): temp_file = os.path.join(tmpdir, f"{uuid.uuid4().hex}.toml") patch_dict = { + "expected_directories": "[\n'!tests/*'\n]", + "expected_files": "[\n'!*.pq',\n'!*/tests/data/*.csv']", "ignore": "[\n'path-contains-spaces',\n'too-many-files'\n]", "inspect": "true", "max_allowed_files": 8, "max_allowed_size_compressed": "'3G'", "max_allowed_size_uncompressed": "'4.12G'", - "expected_directories": "[\n'!tests/*'\n]", - "expected_files": "[\n'!*.pq',\n'!*/tests/data/*.csv']", + "max_path_length": 25, } assert ( set(patch_dict.keys()) == _ALLOWED_CONFIG_VALUES @@ -145,13 +148,14 @@ def test_update_from_toml_works_with_all_config_values( ] f.write("\n".join(lines)) base_config.update_from_toml(toml_file=temp_file) + assert base_config.expected_directories == "!tests/*" + assert base_config.expected_files == "!*.pq,!*/tests/data/*.csv" assert base_config.ignore == "path-contains-spaces,too-many-files" assert base_config.inspect is True assert base_config.max_allowed_files == 8 assert base_config.max_allowed_size_compressed == "3G" assert base_config.max_allowed_size_uncompressed == "4.12G" - assert base_config.expected_directories == "!tests/*" - assert base_config.expected_files == "!*.pq,!*/tests/data/*.csv" + assert base_config.max_path_length == 25 def test_update_from_toml_converts_lists_to_comma_delimited_string(base_config, tmpdir):