Skip to content

Commit

Permalink
add path-too-long check (#244)
Browse files Browse the repository at this point in the history
  • Loading branch information
jameslamb authored May 22, 2024
1 parent 26b0a20 commit 46ab871
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 38 deletions.
7 changes: 7 additions & 0 deletions docs/_static/defaults.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ max_allowed_size_compressed = '50M'
# See 'pydistcheck --help' for available units.
max_allowed_size_uncompressed = '75M'

# If any file or directory in the distribution has a path longer
# than this many characters, pydistcheck reports a 'path-too-long' check failure.
#
# For help choosing a value, see
# https://pydistcheck.readthedocs.io/en/latest/check-reference.html#path-too-long
max_path_length = 200

# List of fnmatch.fnmatchcase() patterns to be compared to directories
# in the distribution.
expected_directories = [
Expand Down
52 changes: 52 additions & 0 deletions docs/check-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,58 @@ For more information, see:
* `"Don't use spaces or underscores in file paths" (blog post) <https://yihui.org/en/2018/03/space-pain/>`_
* `"What technical reasons exist for not using space characters in file names?" (Stack Overflow) <https://superuser.com/questions/29111/what-technical-reasons-exist-for-not-using-space-characters-in-file-names>`_

path-too-long
*************

A file or directory in the distribution has a path that has too many characters.

Some operating systems have limits on path lengths, and distributions with longer paths
might not be installable on those systems.

By default, ``pydistcheck`` reports this check failure if it detects any paths longer than ``200`` characters.
This is primarily informed by the following limitations:

* many Windows systems limit the total filepath length (excluding drive specifiers like ``C://``) to 256 characters
* some older ``tar`` implementations will not support paths longer than 256 characters

See below for details.

> *Tarballs are only required to store paths of up to 100 bytes and cannot store those of more than 256 bytes*.

`R CMD check source code <https://github.com/wch/r-source/blob/29559f9bf4df2c55ef5eace203cbe335bbd03f2f/src/library/tools/R/check.R#L839>`__

> *...packages are normally distributed as tarballs, and these have a limit on path lengths: for maximal portability 100 bytes.*

`"Package Structure" (Writing R Extensions) <https://cran.r-project.org/doc/manuals/R-exts.html#Package-structure>`__

> *Windows historically has limited path lengths to 260 characters.*
> *This meant that paths longer than this would not resolve and errors would result.*
>
> *In the latest versions of Windows, this limitation can be expanded to approximately 32,000 characters.*
> *Your administrator will need to activate the ``"Enable Win32 long paths"`` group policy, or set ``LongPathsEnabled`` to 1 in the registry key ``HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem``.*
>
> *This allows the open() function, the os module and most other path functionality to accept and return paths longer than 260 characters.*

`"Removing the Max Path Limitation" (Python Windows docs) <https://docs.python.org/3/using/windows.html#removing-the-max-path-limitation>`__

> *Git has a limit of 4096 characters for a filename, except on Windows when Git is compiled with msys.*
> *It uses an older version of the Windows API and there's a limit of 260 characters for a filename.*
>
> *You can circumvent this by using another Git client on Windows or set ``core.longpaths`` to ``true``...*

`Filename too long in Git for Windows (Stack Overflow answer) <https://stackoverflow.com/a/22575737/3986677>`__

Other relevant discussions:

* `"Maximum Path Length" (Windows docs) <https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry>`__
* `"Comparison of Filesystems: Limits" (Wikipedia) <https://en.wikipedia.org/wiki/Comparison_of_file_systems#Limits>`__
* `"Could the 100 byte path length limit be lifted?" (r-pkg-devel, 2023) <https://stat.ethz.ch/pipermail/r-package-devel/2023q4/010203.html>`__
* `"R CMD check NOTE - Long paths in package" (r-pkg-devel, 2015) <https://stat.ethz.ch/pipermail/r-package-devel/2015q4/000511.html>`__
* `"Filename length limits on linux?" (serverfault answer, 2009-2016) <https://serverfault.com/a/9548>`__
* `"Command prompt (Cmd. exe) command-line string limitation" (Windows docs, 2023) <https://learn.microsoft.com/en-us/troubleshoot/windows-client/shell-experience/command-line-string-limitation>`__
* `conda-build discussion about 255-character prefix limit (conda/conda-build#1482) <https://github.com/conda/conda-build/issues/1482#issuecomment-256530225>`__
* `discussion about paths lengths (Python Discourse, 2023) <https://discuss.python.org/t/you-can-now-download-pypi-locally/32662/8>`__

too-many-files
**************

Expand Down
21 changes: 21 additions & 0 deletions src/pydistcheck/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"mixed-file-extensions",
"path-contains-non-ascii-characters",
"path-contains-spaces",
"path-too-long",
"unexpected-files",
}

Expand Down Expand Up @@ -203,6 +204,26 @@ def __call__(self, distro_summary: _DistributionSummary) -> List[str]:
return out


class _PathTooLongCheck(_CheckProtocol):
check_name = "path-too-long"

def __init__(self, max_path_length: int):
self.max_path_length = max_path_length

def __call__(self, distro_summary: _DistributionSummary) -> List[str]:
out: List[str] = []
bad_paths = [
p for p in distro_summary.all_paths if len(p) > self.max_path_length
]
for file_path in bad_paths:
msg = (
f"[{self.check_name}] Path too long ({len(file_path)} > {self.max_path_length}): "
f"'{file_path}'"
)
out.append(msg)
return out


class _SpacesInPathCheck(_CheckProtocol):
check_name = "path-contains-spaces"

Expand Down
61 changes: 36 additions & 25 deletions src/pydistcheck/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
_FilesOnlyDifferByCaseCheck,
_MixedFileExtensionCheck,
_NonAsciiCharacterCheck,
_PathTooLongCheck,
_SpacesInPathCheck,
_UnexpectedFilesCheck,
)
Expand Down Expand Up @@ -74,6 +75,32 @@ class ExitCodes:
"Print a summary of the distribution, like its total size and largest files."
),
)
@click.option( # type: ignore[misc]
"--expected-directories",
default=_Config.expected_directories,
show_default=True,
type=str,
help=(
"comma-delimited list of patterns matching directories that are expected "
"to be found in the distribution. Prefix with '!' to indicate a pattern which "
"should NOT match any of the distribution's contents. Patterns should be in "
"the format understood by ``fnmatch.fnmatchcase()``. "
"See https://docs.python.org/3/library/fnmatch.html."
),
)
@click.option( # type: ignore[misc]
"--expected-files",
default=_Config.expected_files,
show_default=True,
type=str,
help=(
"comma-delimited list of patterns matching files that are expected "
"to be found in the distribution. Prefix with '!' to indicate a pattern which "
"should NOT match any of the distribution's contents. Patterns should be in "
"the format understood by ``fnmatch.fnmatchcase()``. "
"See https://docs.python.org/3/library/fnmatch.html."
),
)
@click.option( # type: ignore[misc]
"--max-allowed-files",
default=_Config.max_allowed_files,
Expand Down Expand Up @@ -110,43 +137,25 @@ class ExitCodes:
),
)
@click.option( # type: ignore[misc]
"--expected-directories",
default=_Config.expected_directories,
show_default=True,
type=str,
help=(
"comma-delimited list of patterns matching directories that are expected "
"to be found in the distribution. Prefix with '!' to indicate a pattern which "
"should NOT match any of the distribution's contents. Patterns should be in "
"the format understood by ``fnmatch.fnmatchcase()``. "
"See https://docs.python.org/3/library/fnmatch.html."
),
)
@click.option( # type: ignore[misc]
"--expected-files",
default=_Config.expected_files,
"--max-path-length",
default=_Config.max_path_length,
show_default=True,
type=str,
help=(
"comma-delimited list of patterns matching files that are expected "
"to be found in the distribution. Prefix with '!' to indicate a pattern which "
"should NOT match any of the distribution's contents. Patterns should be in "
"the format understood by ``fnmatch.fnmatchcase()``. "
"See https://docs.python.org/3/library/fnmatch.html."
),
type=int,
help="Maximum allowed filepath length for files or directories in the distribution.",
)
def check( # noqa: PLR0913
*,
filepaths: str,
version: bool,
config: str,
expected_directories: str,
expected_files: str,
ignore: str,
inspect: bool,
max_allowed_files: int,
max_allowed_size_compressed: str,
max_allowed_size_uncompressed: str,
expected_directories: str,
expected_files: str,
max_path_length: int,
) -> None:
"""
Run the contents of a distribution through a set of checks, and warn about
Expand All @@ -170,6 +179,7 @@ def check( # noqa: PLR0913
"max_allowed_files": max_allowed_files,
"max_allowed_size_compressed": max_allowed_size_compressed,
"max_allowed_size_uncompressed": max_allowed_size_uncompressed,
"max_path_length": max_path_length,
"expected_directories": expected_directories,
"expected_files": expected_files,
}
Expand Down Expand Up @@ -213,6 +223,7 @@ def check( # noqa: PLR0913
_FileCountCheck(max_allowed_files=conf.max_allowed_files),
_FilesOnlyDifferByCaseCheck(),
_MixedFileExtensionCheck(),
_PathTooLongCheck(max_path_length=conf.max_path_length),
_SpacesInPathCheck(),
_UnexpectedFilesCheck(
directory_patterns=expected_directories.split(","),
Expand Down
10 changes: 6 additions & 4 deletions src/pydistcheck/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@
#
# unit tests confirm that it matches the `_Config` class, so it shouldn't ever drift from that class
_ALLOWED_CONFIG_VALUES = {
"expected_directories",
"expected_files",
"ignore",
"inspect",
"max_allowed_files",
"max_allowed_size_compressed",
"max_allowed_size_uncompressed",
"expected_directories",
"expected_files",
"max_path_length",
}

_EXPECTED_DIRECTORIES = ",".join(
Expand Down Expand Up @@ -62,13 +63,14 @@

@dataclass
class _Config:
expected_directories: str = _EXPECTED_DIRECTORIES
expected_files: str = _EXPECTED_FILES
ignore: str = ""
inspect: bool = False
max_allowed_files: int = 2000
max_allowed_size_compressed: str = "50M"
max_allowed_size_uncompressed: str = "75M"
expected_directories: str = _EXPECTED_DIRECTORIES
expected_files: str = _EXPECTED_FILES
max_path_length: int = 200

def __setattr__(self, name: str, value: Any) -> None:
attr_name = name.replace("-", "_")
Expand Down
47 changes: 46 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@
]
MACOS_SUFFIX = "macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_5_x86_64.whl"
MANYLINUX_SUFFIX = "manylinux_2_28_x86_64.manylinux_2_5_x86_64.manylinux1_x86_64.whl"
BASEBALL_PACKAGES = [
BASEBALL_CONDA_PACKAGES = [
"osx-64-baseballmetrics-0.1.0-0.conda",
"osx-64-baseballmetrics-0.1.0-0.tar.bz2",
]
BASEBALL_WHEELS = [
f"baseballmetrics-0.1.0-py3-none-{MACOS_SUFFIX}",
f"baseballmetrics-0.1.0-py3-none-{MANYLINUX_SUFFIX}",
]
BASEBALL_PACKAGES = BASEBALL_CONDA_PACKAGES + BASEBALL_WHEELS
# NOTE: .bz2 and .tar.gz packages here are just unzipped
# and re-tarred Python wheels... to avoid pydistcheck
# implicitly assuming that, for example, '*.tar.gz' must
Expand Down Expand Up @@ -671,6 +674,48 @@ def test_expected_files_does_not_raise_check_failure_if_directory_pattern_matche
)


@pytest.mark.parametrize("distro_file", BASEBALL_CONDA_PACKAGES)
def test_path_too_long_check_works_for_conda_packages(distro_file):
runner = CliRunner()
result = runner.invoke(
check,
[
"--max-path-length=5",
os.path.join(TEST_DATA_DIR, distro_file),
],
)
assert result.exit_code == 1

_assert_log_matches_pattern(
result=result,
pattern=(
r"\[path\-too\-long\] Path too long \(78 > 5\)\: "
r"'lib/python3\.9/site\-packages/baseballmetrics/__pycache__/metrics\.cpython\-39\.pyc'"
),
)


@pytest.mark.parametrize("distro_file", BASEBALL_WHEELS)
def test_path_too_long_check_works_for_wheels(distro_file):
runner = CliRunner()
result = runner.invoke(
check,
[
"--max-path-length=5",
os.path.join(TEST_DATA_DIR, distro_file),
],
)
assert result.exit_code == 1

_assert_log_matches_pattern(
result=result,
pattern=(
r"\[path\-too\-long\] Path too long \(30 > 5\)\: "
r"'baseballmetrics/_shared_lib\.py'"
),
)


@pytest.mark.parametrize("distro_file", PROBLEMATIC_PACKAGES)
def test_cli_raises_exactly_the_expected_number_of_errors_for_the_problematic_package(
distro_file,
Expand Down
20 changes: 12 additions & 8 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,24 +59,26 @@ def test_update_from_dict_works_when_changing_all_values(base_config):
assert base_config.max_allowed_size_compressed == "1G"
assert base_config.max_allowed_size_uncompressed == "18K"
patch_dict = {
"expected_directories": "!*/tests",
"expected_files": "!*.xlsx,!data/*.csv",
"ignore": "path-contains-spaces,too-many-files",
"inspect": True,
"max_allowed_files": 8,
"max_allowed_size_compressed": "2G",
"max_allowed_size_uncompressed": "141K",
"expected_directories": "!*/tests",
"expected_files": "!*.xlsx,!data/*.csv",
"max_path_length": 600,
}
assert (
set(patch_dict.keys()) == _ALLOWED_CONFIG_VALUES
), "this test needs to be updated"
base_config.update_from_dict(patch_dict)
assert base_config.expected_directories == "!*/tests"
assert base_config.expected_files == "!*.xlsx,!data/*.csv"
assert base_config.inspect is True
assert base_config.max_allowed_files == 8
assert base_config.max_allowed_size_compressed == "2G"
assert base_config.max_allowed_size_uncompressed == "141K"
assert base_config.expected_directories == "!*/tests"
assert base_config.expected_files == "!*.xlsx,!data/*.csv"
assert base_config.max_path_length == 600


def test_update_from_toml_silently_returns_self_if_file_does_not_exist(base_config):
Expand Down Expand Up @@ -126,13 +128,14 @@ def test_update_from_toml_works_with_all_config_values(
):
temp_file = os.path.join(tmpdir, f"{uuid.uuid4().hex}.toml")
patch_dict = {
"expected_directories": "[\n'!tests/*'\n]",
"expected_files": "[\n'!*.pq',\n'!*/tests/data/*.csv']",
"ignore": "[\n'path-contains-spaces',\n'too-many-files'\n]",
"inspect": "true",
"max_allowed_files": 8,
"max_allowed_size_compressed": "'3G'",
"max_allowed_size_uncompressed": "'4.12G'",
"expected_directories": "[\n'!tests/*'\n]",
"expected_files": "[\n'!*.pq',\n'!*/tests/data/*.csv']",
"max_path_length": 25,
}
assert (
set(patch_dict.keys()) == _ALLOWED_CONFIG_VALUES
Expand All @@ -145,13 +148,14 @@ def test_update_from_toml_works_with_all_config_values(
]
f.write("\n".join(lines))
base_config.update_from_toml(toml_file=temp_file)
assert base_config.expected_directories == "!tests/*"
assert base_config.expected_files == "!*.pq,!*/tests/data/*.csv"
assert base_config.ignore == "path-contains-spaces,too-many-files"
assert base_config.inspect is True
assert base_config.max_allowed_files == 8
assert base_config.max_allowed_size_compressed == "3G"
assert base_config.max_allowed_size_uncompressed == "4.12G"
assert base_config.expected_directories == "!tests/*"
assert base_config.expected_files == "!*.pq,!*/tests/data/*.csv"
assert base_config.max_path_length == 25


def test_update_from_toml_converts_lists_to_comma_delimited_string(base_config, tmpdir):
Expand Down

0 comments on commit 46ab871

Please sign in to comment.