Skip to content

Commit

Permalink
Merge pull request #3681 from nexB/update-referenced-files
Browse files Browse the repository at this point in the history
Refine referenced filenames #3547
  • Loading branch information
AyanSinhaMahapatra authored Mar 19, 2024
2 parents bfd88b6 + 5f28d5c commit 6c15ebf
Show file tree
Hide file tree
Showing 9 changed files with 482 additions and 35 deletions.
74 changes: 74 additions & 0 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,25 @@ def identifier_with_expression(self):
id_safe_expression = python_safe_name(s=str(self.license_expression))
return "{}-{}".format(id_safe_expression, self._identifier)

@property
def is_unknown(self):
    """
    Return True if there are unknown license keys in the license expression
    for this detection, return False otherwise.

    A detection without a license expression is never reported as unknown.
    """
    if not self.license_expression:
        # Nothing to inspect: avoid a TypeError from ``in`` on None.
        return False

    # Coerce to text first, as done in ``identifier_with_expression``, so
    # that the substring tests below also work on non-string expressions.
    expression = str(self.license_expression)

    # NOTE: these are substring tests against the whole expression text and
    # not exact license key comparisons: every key listed here contains
    # "unknown", so any expression containing that text is reported.
    unknown_license_keys = (
        "unknown-license-reference",
        "unknown-spdx",
        "unknown",
        "free-unknown",
    )
    return any(
        license_key in expression
        for license_key in unknown_license_keys
    )

def get_start_end_line(self):
"""
Return start and end line for a license detection issue, from the
Expand Down Expand Up @@ -1356,6 +1375,61 @@ def has_references_to_local_files(license_matches):
)


def use_referenced_license_expression(referenced_license_expression, license_detection, licensing=None):
    """
    Return True if the ``license_detection`` LicenseDetection should include
    the matches represented by the ``referenced_license_expression`` string.
    Return False otherwise.

    Used when we have a ``license_detection`` with a match to a license rule like
    "See license in COPYING" and where the ``referenced_license_expression`` is the
    expression found in the "COPYING" file, which is the combined expression from
    all license detections found in "COPYING" (or multiple referenced files).

    ``licensing`` is the object used to list the license keys of an expression
    through its ``license_keys(expression=...)`` method. A new ``Licensing()``
    is created when it is not provided, and only if the keys need to be compared.

    Reference: https://github.com/nexB/scancode-toolkit/issues/3547
    """
    # TODO: Also determine if referenced matches could be added but
    # resulting license expression should not be modified.

    if not referenced_license_expression or not license_detection:
        return False

    # We should always include referenced license matches to resolve an unknown
    # license reference
    if license_detection.is_unknown:
        return True

    # We should always include referenced license matches when the license
    # expression from the referenced license matches match the license
    # expression for the detection
    if referenced_license_expression == license_detection.license_expression:
        return True

    # Created here rather than as a default argument value so that no instance
    # is built at import time and shared across all calls.
    if licensing is None:
        licensing = Licensing()

    license_keys = set(
        licensing.license_keys(expression=license_detection.license_expression)
    )
    referenced_license_keys = set(
        licensing.license_keys(expression=referenced_license_expression)
    )

    # The expressions are known to differ at this point (equal expressions
    # returned above). If we have the same license keys but not the same
    # license expression then the reference could merely be pointing to
    # notices, combining which produces a different expression, and the
    # original detection is correct
    if license_keys == referenced_license_keys:
        return False

    # when there are many license keys in an expression, and there are no
    # unknown or other cases, we cannot safely conclude that we should
    # follow the license in the referenced filenames. This is likely
    # a case where we have larger notices and several combined expressions,
    max_referenced_license_keys = 5
    if len(referenced_license_keys) > max_referenced_license_keys:
        return False

    return True


def get_detected_license_expression(
analysis,
license_matches=None,
Expand Down
70 changes: 55 additions & 15 deletions src/licensedcode/plugin_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import SCAN_GROUP
from commoncode.cliutils import SCAN_OPTIONS_GROUP
from license_expression import combine_expressions
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl

Expand All @@ -30,10 +31,12 @@
from licensedcode.detection import LicenseDetectionFromResult
from licensedcode.detection import sort_unique_detections
from licensedcode.detection import UniqueDetection
from licensedcode.detection import use_referenced_license_expression
from packagedcode.utils import combine_expressions
from scancode.api import SCANCODE_LICENSEDB_URL

TRACE = os.environ.get('SCANCODE_DEBUG_PLUGIN_LICENSE', False)
TRACE_REFERENCE = os.environ.get('SCANCODE_DEBUG_PLUGIN_LICENSE_REFERENCE', False)


def logger_debug(*args):
Expand All @@ -42,7 +45,7 @@ def logger_debug(*args):

logger = logging.getLogger(__name__)

if TRACE:
if TRACE or TRACE_REFERENCE:
import sys
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -214,6 +217,8 @@ def process_codebase(self, codebase, license_text=False, license_diagnostics=Fal
f'before: {license_expressions_before}\n'
f'after : {license_expressions_after}'
)

#raise Exception()

license_detections = collect_license_detections(
codebase=codebase,
Expand Down Expand Up @@ -259,20 +264,28 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):

modified = False

if TRACE_REFERENCE:
logger_debug(
f'add_referenced_license_matches: resource_path: {resource.path}',
)

for license_detection_mapping in license_detection_mappings:

license_detection = LicenseDetectionFromResult.from_license_detection_mapping(
license_detection_mapping=license_detection_mapping,
file_path=resource.path,
)
detection_modified = False
detections_added = []
license_match_mappings = license_detection_mapping["matches"]
referenced_filenames = get_referenced_filenames(license_detection.matches)

if not referenced_filenames:
if TRACE_REFERENCE:
logger_debug(
f'No references at license detection with expression: {license_detection.license_expression}',
)
continue

referenced_detections = []
for referenced_filename in referenced_filenames:
referenced_resource = find_referenced_resource(
referenced_filename=referenced_filename,
Expand All @@ -281,26 +294,53 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
)

if referenced_resource and referenced_resource.license_detections:
modified = True
detection_modified = True
detections_added.extend(referenced_resource.license_detections)
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_resource.license_detections
referenced_detections.extend(
referenced_resource.license_detections
)
populate_matches_with_path(
matches=matches_to_extend,
path=referenced_resource.path
)
license_match_mappings.extend(matches_to_extend)

if not detection_modified:
for detection in referenced_resource.license_detections:
populate_matches_with_path(
matches=detection["matches"],
path=referenced_resource.path
)

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in referenced_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection,
):
if TRACE_REFERENCE:
logger_debug(
f'use_referenced_license_expression: False for '
f'resource: {referenced_resource.path} and '
f'license_expression: {referenced_license_expression}',
)
continue

if TRACE_REFERENCE:
logger_debug(
f'use_referenced_license_expression: True for '
f'resource: {referenced_resource.path} and '
f'license_expression: {referenced_license_expression}',
)

modified = True
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_detections
)
license_match_mappings.extend(matches_to_extend)

detection_log, license_expression = get_detected_license_expression(
license_match_mappings=license_match_mappings,
analysis=DetectionCategory.UNKNOWN_FILE_REFERENCE_LOCAL.value,
post_scan=True,
)

license_expression_spdx = build_spdx_license_expression(
license_expression=str(license_expression),
licensing=get_cache().licensing,
Expand All @@ -310,7 +350,7 @@ def add_referenced_filenames_license_matches_for_detections(resource, codebase):
license_detection_mapping["detection_log"] = detection_log
license_detection_mapping["identifier"] = get_new_identifier_from_detections(
initial_detection=license_detection_mapping,
detections_added=detections_added,
detections_added=referenced_detections,
license_expression=license_expression,
)

Expand Down
78 changes: 59 additions & 19 deletions src/packagedcode/licensing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from licensedcode.detection import detect_licenses
from licensedcode.detection import LicenseDetectionFromResult
from licensedcode.detection import populate_matches_with_path
from licensedcode.detection import use_referenced_license_expression
from licensedcode.spans import Span
from licensedcode import query

Expand Down Expand Up @@ -93,41 +94,52 @@ def add_referenced_license_matches_for_package(resource, codebase):
file_path=resource.path,
)

detection_modified = False
detections_added = []
license_match_mappings = license_detection_mapping["matches"]
referenced_filenames = get_referenced_filenames(license_detection_object.matches)
if not referenced_filenames:
continue

referenced_detections = []
for referenced_filename in referenced_filenames:
referenced_resource = find_referenced_resource(
referenced_filename=referenced_filename,
resource=resource,
codebase=codebase,
)

if not referenced_resource:
continue

referenced_license_detections = referenced_resource.license_detections

if referenced_license_detections:
modified = True
detection_modified = True
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_license_detections
if referenced_resource and referenced_resource.license_detections:
referenced_detections.extend(
referenced_resource.license_detections
)

# For LicenseMatches with different resources as origin, add the
# resource path to these matches as origin info
populate_matches_with_path(
matches=matches_to_extend,
path=referenced_resource.path
)
license_match_mappings.extend(matches_to_extend)

if not detection_modified:
for detection in referenced_resource.license_detections:
populate_matches_with_path(
matches=detection["matches"],
path=referenced_resource.path
)

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in referenced_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection_object,
):
continue

modified = True
detections_added.extend(referenced_resource.license_detections)
matches_to_extend = get_matches_from_detection_mappings(
license_detections=referenced_resource.license_detections,
)
license_match_mappings.extend(matches_to_extend)

detection_log, license_expression = get_detected_license_expression(
license_match_mappings=license_match_mappings,
analysis=DetectionCategory.PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL.value,
Expand All @@ -142,7 +154,7 @@ def add_referenced_license_matches_for_package(resource, codebase):
license_detection_mapping["detection_log"] = detection_log
license_detection_mapping["identifier"] = get_new_identifier_from_detections(
initial_detection=license_detection_mapping,
detections_added=referenced_license_detections,
detections_added=detections_added,
license_expression=license_expression,
)

Expand Down Expand Up @@ -223,7 +235,20 @@ def add_referenced_license_detection_from_package(resource, codebase):
f'sibling_license_detections: {sibling_license_detections}'
)

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in sibling_license_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection_object,
):
continue

for sibling_detection in sibling_license_detections:

modified = True
detection_modified = True
license_match_mappings.extend(sibling_detection["matches"])
Expand All @@ -239,6 +264,21 @@ def add_referenced_license_detection_from_package(resource, codebase):
break

pkg_detections = codebase_package["license_detections"]
if not pkg_detections:
continue

referenced_license_expression = combine_expressions(
expressions=[
detection["license_expression"]
for detection in pkg_detections
],
)
if not use_referenced_license_expression(
referenced_license_expression=referenced_license_expression,
license_detection=license_detection_object,
):
continue

for pkg_detection in pkg_detections:
modified = True
detection_modified = True
Expand Down
Loading

0 comments on commit 6c15ebf

Please sign in to comment.