From 89938c49688ca8be34f51164e12c1b8b9fd02d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Sun, 15 Sep 2024 10:02:39 +0300 Subject: [PATCH] Add --multiple option to the CLI The --multiple option uses ScanMode::TopDown to detect multiple licenses in the same file. The confidence score is still zero for the whole file but having a list of licenses in the file is still useful. An improvement would be to use the individual confidence scores in the reported total score. Added a test case that uses the python-zeep license file from issue #40, which contains one MIT and two BSD-3-Clause licenses. --- cli/src/commands.rs | 4 ++ cli/src/crawl.rs | 2 +- cli/src/identify.rs | 9 ++- cli/src/main.rs | 3 +- cli/tests/cli.rs | 27 +++++++++ cli/tests/data/python-zeep.LICENSE | 92 ++++++++++++++++++++++++++++++ 6 files changed, 132 insertions(+), 5 deletions(-) create mode 100644 cli/tests/data/python-zeep.LICENSE diff --git a/cli/src/commands.rs b/cli/src/commands.rs index 5d0c8bb..7de4312 100644 --- a/cli/src/commands.rs +++ b/cli/src/commands.rs @@ -50,6 +50,10 @@ pub enum Subcommand { /// Read in filenames on stdin for batch identification #[structopt(long = "batch", short = "b")] batch: bool, + + /// Detect multiple licenses in the same file + #[structopt(long = "multiple", short = "m")] + topdown: bool, }, /// Crawl a directory identifying license files diff --git a/cli/src/crawl.rs b/cli/src/crawl.rs index 1976909..4e5c700 100644 --- a/cli/src/crawl.rs +++ b/cli/src/crawl.rs @@ -56,7 +56,7 @@ pub fn crawl( match read_to_string(path) { Ok(content) => { let data = TextData::new(&content); - let idres = identify_data(&store, &data, false, false); + let idres = identify_data(&store, &data, false, false, false); let fileres = FileResult::from_identification_result(&path_lossy, &idres); fileres.print_as(output_format, true); } diff --git a/cli/src/identify.rs b/cli/src/identify.rs index bbb1294..a460ab0 100644 --- 
a/cli/src/identify.rs +++ b/cli/src/identify.rs @@ -23,6 +23,7 @@ pub fn identify( optimize: bool, want_diff: bool, batch: bool, + topdown: bool, ) -> Result<(), Error> { // load the cache from disk or embedded data let cache_inst = Instant::now(); @@ -44,7 +45,7 @@ pub fn identify( read_to_string(&filename)? }; - let idres = identify_data(&store, &content.into(), optimize, want_diff); + let idres = identify_data(&store, &content.into(), optimize, want_diff, topdown); let file_lossy = filename.to_string_lossy(); let fileres = FileResult::from_identification_result(&file_lossy, &idres); fileres.print_as(output_format, false); @@ -74,7 +75,7 @@ pub fn identify( } }; - let idres = identify_data(&store, &content.into(), optimize, want_diff); + let idres = identify_data(&store, &content.into(), optimize, want_diff, topdown); let fileres = FileResult::from_identification_result(&buf, &idres); fileres.print_as(output_format, false); } @@ -87,11 +88,13 @@ pub fn identify_data( text_data: &TextData, optimize: bool, want_diff: bool, + topdown: bool, ) -> Result { let inst = Instant::now(); + let scan_mode = if topdown {ScanMode::TopDown} else {ScanMode::Elimination}; let strategy = ScanStrategy::new(store) - .mode(ScanMode::Elimination) + .mode(scan_mode) .confidence_threshold(MIN_SCORE) .optimize(optimize) .max_passes(1); diff --git a/cli/src/main.rs b/cli/src/main.rs index 4cc83a6..f06265d 100755 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -34,7 +34,8 @@ fn main() { optimize, diff, batch, - } => identify::identify(&cache_file, &output_format, filename, optimize, diff, batch), + topdown, + } => identify::identify(&cache_file, &output_format, filename, optimize, diff, batch, topdown), Subcommand::Crawl { directory, follow_links, diff --git a/cli/tests/cli.rs b/cli/tests/cli.rs index 0370eb0..3c7a0e5 100644 --- a/cli/tests/cli.rs +++ b/cli/tests/cli.rs @@ -69,3 +69,30 @@ fn output_json() { .len() ); } + +#[test] +fn multiple_licenses() { + let out = run(&["id", 
"./tests/data/python-zeep.LICENSE"]); + assert!(!out.status.success()); + + let json = run_json(&["id", "-m", "./tests/data/python-zeep.LICENSE"]); + + assert_eq!("./tests/data/python-zeep.LICENSE", json["path"]); + + // The score is currently zero for any file with multiple licenses in it + assert!( + json["result"]["score"] + .as_f64() + .expect("score must be a number") + == 0.0 + ); + + assert_eq!("MIT", json["result"]["containing"][0]["license"]["name"]); + assert_eq!("original", json["result"]["containing"][0]["license"]["kind"]); + + assert_eq!("BSD-3-Clause", json["result"]["containing"][1]["license"]["name"]); + assert_eq!("original", json["result"]["containing"][1]["license"]["kind"]); + + assert_eq!("BSD-3-Clause", json["result"]["containing"][2]["license"]["name"]); + assert_eq!("original", json["result"]["containing"][2]["license"]["kind"]); +} diff --git a/cli/tests/data/python-zeep.LICENSE b/cli/tests/data/python-zeep.LICENSE new file mode 100644 index 0000000..db6c7c6 --- /dev/null +++ b/cli/tests/data/python-zeep.LICENSE @@ -0,0 +1,92 @@ +The MIT License (MIT) + +Copyright (c) 2016-2021 Michael van Tellingen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + +-- + +Parts of the XSD handling are heavily inspired by soapfish, see: +https://github.com/soapteam/soapfish + +Copyright (c) 2011-2021, Soapfish Contributors +All rights reserved. +For the exact contribution history, see the git revision log. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +OF SUCH DAMAGE. 
+ + +-- + +The support for BinarySecurityToken is from py-wsse, see: +https://github.com/orcasgit/py-wsse + + +Copyright (c) 2015 ORCAS, Inc +Some portions from py-soap-wsse (c) Michael van Tellingen +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of the author nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.