From fcee822a1454c27b1d1c45f117a14bff4f5df5e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matthias=20Ri=C3=9Fe?= <m.risse@fz-juelich.de>
Date: Thu, 16 Feb 2023 14:27:55 +0100
Subject: [PATCH] Add better handling of symlinks

---
 src/reuse/project.py | 84 ++++++++++++++++++++++++++++----------------
 src/reuse/report.py  |  7 ++--
 src/reuse/vcs.py     |  4 +--
 3 files changed, 60 insertions(+), 35 deletions(-)

diff --git a/src/reuse/project.py b/src/reuse/project.py
index 9f192f9e7..d0089639f 100644
--- a/src/reuse/project.py
+++ b/src/reuse/project.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
 # SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
 # SPDX-FileCopyrightText: 2023 DB Systel GmbH
+# SPDX-FileCopyrightText: 2023 Matthias Riße
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -136,8 +137,27 @@ def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]:
                     _LOGGER.debug("ignoring '%s'", the_file)
                     continue
                 if the_file.is_symlink():
-                    _LOGGER.debug("skipping symlink '%s'", the_file)
-                    continue
+                    # Needs to use os.path.abspath instead of Path.absolute
+                    # since the former normalizes the path, i.e. resolves "..".
+                    # There is no method in pathlib for this which doesn't also
+                    # resolve symlinks recursively, like Path.resolve.
+                    target_file = Path(
+                        os.path.abspath(the_file.readlink())  # type: ignore
+                    )
+                    _LOGGER.debug(
+                        "'%s' is a symlink pointing to '%s'",
+                        the_file,
+                        target_file,
+                    )
+                    if (
+                        target_file.is_relative_to(  # type: ignore # pylint: disable=E1101
+                            self.root.resolve()
+                        )
+                        and (target_file.exists() or target_file.is_symlink())
+                        and not self._is_path_ignored(target_file)
+                    ):
+                        _LOGGER.debug("skipping symlink '%s'", the_file)
+                        continue
                 # Suppressing this error because I simply don't want to deal
                 # with that here.
                 with contextlib.suppress(OSError):
@@ -184,35 +204,39 @@ def reuse_info_of(self, path: StrPath) -> ReuseInfo:
                 dep5_path = source_path
 
         # Search the file for REUSE information.
-        with path.open("rb") as fp:
-            try:
-                # Completely read the file once to search for possible snippets
-                if _contains_snippet(fp):
-                    _LOGGER.debug(f"'{path}' seems to contain a SPDX Snippet")
-                    read_limit = None
-                else:
-                    read_limit = _HEADER_BYTES
-                # Reset read position
-                fp.seek(0)
-                # Scan the file for REUSE info, possible limiting the read
-                # length
-                file_result = extract_reuse_info(
-                    decoded_text_from_binary(fp, size=read_limit)
-                )
-                if file_result:
-                    source_path = str(path)
-                    if path.suffix == ".license":
-                        source_type = SourceType.DOT_LICENSE_FILE
+        if not path.is_symlink():
+            with path.open("rb") as fp:
+                try:
+                    # Completely read the file once to search for possible
+                    # snippets
+                    if _contains_snippet(fp):
+                        _LOGGER.debug(
+                            f"'{path}' seems to contain a SPDX Snippet"
+                        )
+                        read_limit = None
                     else:
-                        source_type = SourceType.FILE_HEADER
-
-            except (ExpressionError, ParseError):
-                _LOGGER.error(
-                    _(
-                        "'{path}' holds an SPDX expression that cannot be"
-                        " parsed, skipping the file"
-                    ).format(path=path)
-                )
+                        read_limit = _HEADER_BYTES
+                    # Reset read position
+                    fp.seek(0)
+                    # Scan the file for REUSE info, possibly limiting the read
+                    # length
+                    file_result = extract_reuse_info(
+                        decoded_text_from_binary(fp, size=read_limit)
+                    )
+                    if file_result:
+                        source_path = str(path)
+                        if path.suffix == ".license":
+                            source_type = SourceType.DOT_LICENSE_FILE
+                        else:
+                            source_type = SourceType.FILE_HEADER
+
+                except (ExpressionError, ParseError):
+                    _LOGGER.error(
+                        _(
+                            "'{path}' holds an SPDX expression that cannot be"
+                            " parsed, skipping the file"
+                        ).format(path=path)
+                    )
 
         # There is both information in a .dep5 file and in the file header
         if (
diff --git a/src/reuse/report.py b/src/reuse/report.py
index 268e48430..297a8e0a8 100644
--- a/src/reuse/report.py
+++ b/src/reuse/report.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
 # SPDX-FileCopyrightText: 2022 Florian Snow <florian@familysnow.net>
 # SPDX-FileCopyrightText: 2022 Pietro Albini <pietro.albini@ferrous-systems.com>
+# SPDX-FileCopyrightText: 2023 Matthias Riße
 #
 # SPDX-License-Identifier: GPL-3.0-or-later
 
@@ -445,14 +446,14 @@ def generate(
     ) -> "FileReport":
         """Generate a FileReport from a path in a Project."""
         path = Path(path)
-        if not path.is_file():
-            raise OSError(f"{path} is not a file")
+        if not path.is_file() and not path.is_symlink():
+            raise OSError(f"{path} is not supported")
 
         relative = project.relative_from_root(path)
         report = cls("./" + str(relative), path, do_checksum=do_checksum)
 
         # Checksum and ID
-        if report.do_checksum:
+        if report.do_checksum and not path.is_symlink():
             report.spdxfile.chk_sum = _checksum(path)
         else:
             # This path avoids a lot of heavy computation, which is handy for
diff --git a/src/reuse/vcs.py b/src/reuse/vcs.py
index ec760abea..a22e9de6d 100644
--- a/src/reuse/vcs.py
+++ b/src/reuse/vcs.py
@@ -100,7 +100,7 @@ def _find_all_ignored_files(self) -> Set[Path]:
         ]
         result = execute_command(command, _LOGGER, cwd=self.project.root)
         all_files = result.stdout.decode("utf-8").split("\0")
-        return {Path(file_) for file_ in all_files[:-1]}
+        return {Path(file_) for file_ in all_files[:-1]}.union({Path(".git")})
 
     def is_ignored(self, path: StrPath) -> bool:
         path = self.project.relative_from_root(path)
@@ -168,7 +168,7 @@ def _find_all_ignored_files(self) -> Set[Path]:
         ]
         result = execute_command(command, _LOGGER, cwd=self.project.root)
         all_files = result.stdout.decode("utf-8").split("\0")
-        return {Path(file_) for file_ in all_files[:-1]}
+        return {Path(file_) for file_ in all_files[:-1]}.union({Path(".hg")})
 
     def is_ignored(self, path: StrPath) -> bool:
         path = self.project.relative_from_root(path)