From 9022a871b53be8494e4456e5eb6e6e8a5b27cb37 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Fri, 4 Sep 2020 10:27:10 -0700 Subject: [PATCH 01/15] empty list should be initialized *before* it might be populated --- link_checker/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index 2e6b629..8b8001a 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -459,9 +459,9 @@ def main(): run_full_inspection = ( no_parser_args or all_parser_args_but_no_subparser_args ) + exit_status_list = [] if run_sub_command: exit_status_list = args.func(args) - exit_status_list = [] if args.legalcode and not all_parser_args_but_no_subparser_args: exit_status_list = check_legalcode(args) if args.deeds and not all_parser_args_but_no_subparser_args: From 259e9dc1f39261aea5146b832a917c91cf4d663b Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Fri, 4 Sep 2020 17:37:45 -0700 Subject: [PATCH 02/15] refactored argument parser to only use subcommands and added additional subcommands --- link_checker/__main__.py | 505 ++++++++++++++++++++++----------------- link_checker/utils.py | 9 +- 2 files changed, 298 insertions(+), 216 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index 8b8001a..60e97e7 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -27,7 +27,6 @@ INFO, DEBUG, ) - from link_checker.utils import ( CheckerError, get_legalcode, @@ -47,129 +46,165 @@ ) -def parse_argument(arguments): +def parse_arguments(): """parse arguments from cli Args: args (list): list of arguments parsed from command line """ - # Setup argument parser + + # Primary argument parser and sub-parser (for subcommands) parser = argparse.ArgumentParser( - prog="link_checker.py", description=__doc__ - ) - parser.add_argument( - "--legalcode", - help="Runs link_checker for legalcode only. (Note: --licenses is" - " deprecated and will be dropped from a future release. Please use" - " --legalcode instead.)", - action="store_true", - ) - parser.add_argument( - "--deeds", - help="Runs link_checker for deeds only (the legalcode files will still" - " be scraped, but not checked for broken links)", - action="store_true", - ) - parser.add_argument( - "--rdf", help="Runs link_checker for rdf only", action="store_true" + prog="link_checker", + description=__doc__, + formatter_class=argparse.RawTextHelpFormatter, + epilog="Also see the help output each subcommand", ) - parser.add_argument( - "--index", - help="Runs link_checker for index.rdf only", - action="store_true", - ) - parser.add_argument( - "--local", - help="Scrapes legalcode files from local file system", - action="store_true", - ) - parser.add_argument( - "--output-errors", - help="Outputs all link errors to file (default: errorlog.txt) and" - " creates junit-xml type summary(test-summary/junit-xml-report.xml)", - metavar="output_file", - const="errorlog.txt", - nargs="?", - type=argparse.FileType("w", encoding="utf-8"), + subparsers = parser.add_subparsers( + title="subcommands (a single subcomamnd is required)", + dest="subcommand", + required=True, ) - parser.add_argument( + + # Shared Parsers + + # Shared parser (optional arguments used by all subcommands) + parser_shared = argparse.ArgumentParser() + parser_shared.add_argument( "-q", "--quiet", action="append_const", const=10, + help="decrease verbosity (can be specified multiple times)", dest="verbosity", - help="Decrease verbosity. Can be specified multiple times.", - ) - parser.add_argument( - "--root-url", help=f"Set root URL (default: {DEFAULT_ROOT_URL})", ) - parser.add_argument( + parser_shared.add_argument( "-v", "--verbose", action="append_const", const=-10, + help="increase verbosity (can be specified multiple times)", dest="verbosity", - help="Increase verbosity. Can be specified multiple times.", ) - # Sub-Parser Section - subparsers = parser.add_subparsers(help="sub-command help") - # legalcode section: link_checker legalcode -h - parser_legalcode = subparsers.add_parser( - "legalcode", help="legalcode help" + parser_shared.add_argument( + "--root-url", + default=DEFAULT_ROOT_URL, + help=f"set root URL (default: '{DEFAULT_ROOT_URL}')", ) - parser_legalcode.add_argument( + + # Shared licenses parser (optional arguments used by all license + # subcommands) + parser_shared_licenses = argparse.ArgumentParser(add_help=False) + parser_shared_licenses.add_argument( "--local", - help=( - "Scrapes legalcode files from local file system.\n" - "Add 'LICENSE_LOCAL_PATH' to your environment,\n" - "otherwise this tool will search for legalcode files\n" - f"in '{LICENSES_DIR}'." - ), action="store_true", + help="process local filesystem legalcode files to determine valid" + " license paths (uses LICENSE_LOCAL_PATH environment variable and" + f" falls back to default: '{LICENSES_DIR}')", ) - parser_legalcode.set_defaults(func=check_legalcode) - # deeds section: link_checker deeds -h - parser_deeds = subparsers.add_parser("deeds", help="deeds help") - parser_deeds.add_argument( - "--local", - help=( - "Scrapes deed files based on the legalcode files " - "found on the local file system.\n" - "Add 'LICENSE_LOCAL_PATH' to your environment,\n" - "otherwise this tool will search for legalcode files\n" - f"in '{LICENSES_DIR}'." - ), + + # Shared reporting parser (optional arguments used by all reporting + # subcommands) + parser_shared_reporting = argparse.ArgumentParser(add_help=False) + parser_shared_reporting.add_argument( + "--output-errors", + nargs="?", + const="errorlog.txt", + type=argparse.FileType("w", encoding="utf-8"), + help="output all link errors to file (default: errorlog.txt) and" + " create junit-xml type summary (test-summary/junit-xml-report.xml)", + metavar="output_file", + ) + + # Shared RDF parser (optional arguments used by all RDF subcommands) + parser_shared_rdf = argparse.ArgumentParser(add_help=False) + parser_shared_rdf.add_argument( + "--local-index", action="store_true", + help="process local filesystem index.rdf (uses INDEX_RDF_LOCAL_PATH" + " environment variable and falls back to default: './index.rdf')", + ) + + # Subcommands + + # Deeds subcommand: link_checker deeds -h + parser_deeds = subparsers.add_parser( + "deeds", + add_help=False, + help="check the links for each license's deed", + parents=[ + parser_shared, + parser_shared_licenses, + parser_shared_reporting, + ], ) parser_deeds.set_defaults(func=check_deeds) - # rdf section: link_checker rdf -h - parser_rdf = subparsers.add_parser("rdf", help="rdf help") - parser_rdf.add_argument( - "--local", - help=( - "Scrapes rdf files based on the legalcode files " - "found on the local file system.\n" - "Add 'LICENSE_LOCAL_PATH' to your environment,\n" - "otherwise this tool will search for legalcode files\n" - f"in '{LICENSES_DIR}'." - ), - action="store_true", + + # Legalcode subcommand: link_checker legalcode -h + parser_legalcode = subparsers.add_parser( + "legalcode", + add_help=False, + help="check the links for each license's legalcode", + parents=[ + parser_shared, + parser_shared_licenses, + parser_shared_reporting, + ], ) - parser_rdf.add_argument( - "--index", - help=( - "Checks index.rdf file instead of checking rdf files.\n" - "If you want to check the index.rdf file locally add\n" - "'INDEX_RDF_LOCAL_PATH' to your environment; otherwise this\n" - "variable defaults to './index.rdf'." - ), - action="store_true", + parser_legalcode.set_defaults(func=check_legalcode) + + # RDF subcommand: link_checker rdf -h + parser_rdf = subparsers.add_parser( + "rdf", + add_help=False, + help="check the links for each license's RDF", + parents=[ + parser_shared, + parser_shared_licenses, + parser_shared_rdf, + parser_shared_reporting, + ], ) parser_rdf.set_defaults(func=check_rdfs) - args = parser.parse_args(arguments) - if args.root_url is None: - args.root_url = DEFAULT_ROOT_URL + # index.rdf subcommand: link_checker index -h + parser_index = subparsers.add_parser( + "index", + add_help=False, + help="check the links within index.rdf", + parents=[parser_shared, parser_shared_rdf, parser_shared_reporting], + ) + parser_index.set_defaults(func=check_index_rdf) + + # complete subcommand: link_checker complete -h + parser_complete = subparsers.add_parser( + "complete", + add_help=False, + help="Complete check (deeds, legalcode, rdf, and index)", + parents=[ + parser_shared, + parser_shared_licenses, + parser_shared_rdf, + parser_shared_reporting, + ], + ) + parser_complete.set_defaults(func=check_complete) + + # Canonical License URLs subcommand: link_checker canonical -h + parser_canonical = subparsers.add_parser( + "canonical", + add_help=False, + help="print canonical license URLs", + parents=[parser_shared, parser_shared_licenses], + ) + parser_canonical.set_defaults(func=print_canonical) + parser_canonical.add_argument( + "--include-gnu", + action="store_true", + help="include GNU licenses in addition to Creative Commons licenses", + ) + + args = parser.parse_args() args.log_level = WARNING if args.verbosity: for v in args.verbosity: @@ -178,91 +213,14 @@ def parse_argument(arguments): args.log_level = DEBUG elif args.log_level > CRITICAL: args.log_level = CRITICAL - if not args.output_errors: + del args.verbosity + if "output_errors" not in args or not args.output_errors: args.output_errors = None - return args - - -def check_legalcode(args): - print("\n\nChecking LegalCode License...\n\n") - license_names = get_legalcode(args) - if args.log_level <= INFO: - print("Number of files to be checked:", len(license_names)) - errors_total = 0 - exit_status = 0 - for license_name in license_names: - caught_errors = 0 - context_printed = False - filename = license_name[: -len(".html")] - base_url = create_base_link(args, filename) - context = f"\n\nChecking: {license_name}\nURL: {base_url}" - if args.local: - source_html = request_local_text(LICENSE_LOCAL_PATH, license_name) - else: - page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name) - source_html = request_text(page_url) - license_soup = BeautifulSoup(source_html, "lxml") - links_found = license_soup.find_all("a") - link_count = len(links_found) - if args.log_level <= INFO: - print(f"{context}\nNumber of links found: {link_count}") - context_printed = True - valid_anchors, valid_links, context_printed = get_scrapable_links( - args, base_url, links_found, context, context_printed - ) - if valid_links: - memoized_results = get_memoized_result(valid_links, valid_anchors) - stored_links = memoized_results[0] - stored_anchors = memoized_results[1] - stored_result = memoized_results[2] - check_links = memoized_results[3] - check_anchors = memoized_results[4] - if check_links: - rs = ( - # Since we're only checking for validity, we can retreive - # only the headers/metadata - grequests.head(link, timeout=REQUESTS_TIMEOUT) - for link in check_links - ) - responses = list() - # Explicitly close connections to free up file handles and - # avoid Connection Errors per: - # https://stackoverflow.com/a/22839550 - for response in grequests.map( - rs, exception_handler=exception_handler - ): - try: - responses.append(response.status_code) - response.close() - except AttributeError: - responses.append(response) - memoize_result(check_links, responses) - stored_anchors += check_anchors - stored_result += responses - stored_links += check_links - caught_errors = write_response( - args, - stored_links, - stored_result, - base_url, - license_name, - stored_anchors, - context, - context_printed, - ) - - if caught_errors: - errors_total += caught_errors - exit_status = 1 - - print("\nCompleted in: {}".format(time.time() - START_TIME)) - if args.output_errors: - output_summary(args, license_names, errors_total) - print("\nError file present at: ", args.output_errors.name) - output_test_summary(errors_total) + if args.log_level == DEBUG: + print(f"DEBUG: args: {args}") - return [exit_status, 0, 0] + return args def check_deeds(args): @@ -354,8 +312,90 @@ def check_deeds(args): return [0, exit_status, 0] -def check_rdfs(args): - if args.index: +def check_legalcode(args): + print("\n\nChecking LegalCode License...\n\n") + license_names = get_legalcode(args) + if args.log_level <= INFO: + print("Number of files to be checked:", len(license_names)) + errors_total = 0 + exit_status = 0 + for license_name in license_names: + caught_errors = 0 + context_printed = False + filename = license_name[: -len(".html")] + base_url = create_base_link(args, filename) + context = f"\n\nChecking: {license_name}\nURL: {base_url}" + if args.local: + source_html = request_local_text(LICENSE_LOCAL_PATH, license_name) + else: + page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name) + source_html = request_text(page_url) + license_soup = BeautifulSoup(source_html, "lxml") + links_found = license_soup.find_all("a") + link_count = len(links_found) + if args.log_level <= INFO: + print(f"{context}\nNumber of links found: {link_count}") + context_printed = True + valid_anchors, valid_links, context_printed = get_scrapable_links( + args, base_url, links_found, context, context_printed + ) + if valid_links: + memoized_results = get_memoized_result(valid_links, valid_anchors) + stored_links = memoized_results[0] + stored_anchors = memoized_results[1] + stored_result = memoized_results[2] + check_links = memoized_results[3] + check_anchors = memoized_results[4] + if check_links: + rs = ( + # Since we're only checking for validity, we can retreive + # only the headers/metadata + grequests.head(link, timeout=REQUESTS_TIMEOUT) + for link in check_links + ) + responses = list() + # Explicitly close connections to free up file handles and + # avoid Connection Errors per: + # https://stackoverflow.com/a/22839550 + for response in grequests.map( + rs, exception_handler=exception_handler + ): + try: + responses.append(response.status_code) + response.close() + except AttributeError: + responses.append(response) + memoize_result(check_links, responses) + stored_anchors += check_anchors + stored_result += responses + stored_links += check_links + caught_errors = write_response( + args, + stored_links, + stored_result, + base_url, + license_name, + stored_anchors, + context, + context_printed, + ) + + if caught_errors: + errors_total += caught_errors + exit_status = 1 + + print("\nCompleted in: {}".format(time.time() - START_TIME)) + + if args.output_errors: + output_summary(args, license_names, errors_total) + print("\nError file present at: ", args.output_errors.name) + output_test_summary(errors_total) + + return [exit_status, 0, 0] + + +def check_rdfs(args, index=False): + if index: print("\n\nChecking index.rdf...\n\n") rdf_obj_list = get_index_rdf(args) else: @@ -446,44 +486,81 @@ def check_rdfs(args): return [0, 0, exit_status] -def main(): - args = parse_argument(sys.argv[1:]) - args_dict = vars(args) - run_sub_command = args_dict.get("func", False) - no_parser_args = not any( - [args.legalcode, args.deeds, args.rdf, run_sub_command] - ) - all_parser_args_but_no_subparser_args = ( - all([args.legalcode, args.deeds, args.rdf]) and not run_sub_command - ) - run_full_inspection = ( - no_parser_args or all_parser_args_but_no_subparser_args +def check_index_rdf(args): + exit_status_list = check_rdfs(args, index=True) + return exit_status_list + + +def check_complete(args): + print( + "Running Full Inspection:" + " Checking links for LegalCode, Deeds, RDF, and index.rdf" ) - exit_status_list = [] - if run_sub_command: - exit_status_list = args.func(args) - if args.legalcode and not all_parser_args_but_no_subparser_args: - exit_status_list = check_legalcode(args) - if args.deeds and not all_parser_args_but_no_subparser_args: - exit_status_list = check_deeds(args) - if args.rdf and not all_parser_args_but_no_subparser_args: - exit_status_list = check_rdfs(args) - if run_full_inspection: - print( - "\nRunning Full Inspection:" - " Checking Links for LegalCode, Deed, RDF, and index.rdf files" - ) - exit_status_legalcode, y, z = check_legalcode(args) - x, exit_status_deeds, z = check_deeds(args) - x, y, exit_status_rdf = check_rdfs(args) - args.index = True - x, y, exit_status_index_rdf = check_rdfs(args) - exit_status_list = [ - exit_status_legalcode, - exit_status_deeds, - exit_status_rdf, - exit_status_index_rdf, - ] + exit_status_legalcode, _, _ = check_legalcode(args) + _, exit_status_deeds, _ = check_deeds(args) + _, _, exit_status_rdf = check_rdfs(args) + _, _, exit_status_index_rdf = check_rdfs(args, index=True) + exit_status_list = [ + exit_status_legalcode, + exit_status_deeds, + exit_status_rdf, + exit_status_index_rdf, + ] + return exit_status_list + + +def print_canonical(args): + license_names = get_legalcode(args) + grouped = [ + set(), # 0: by* 4.0 licenses + set(), # 1: by* 3.0 licenses + set(), # 2: by* 2.5 licenses + set(), # 3: by* 2.1 licenses + set(), # 4: by* 2.0 licenses + set(), # 5: by* 1.x licenes + set(), # 6: miscellanious licenses + set(), # 7: zero 1.0 public domain + set(), # 8: miscellanious public domain + ] + for license_name in license_names: + if not args.include_gnu: + testname = license_name.lower() + if testname.startswith("gpl") or testname.startswith("lgpl"): + continue + filename = license_name[: -len(".html")] + url = create_base_link(args, filename, for_canonical=True) + parts = url.split("/") + bystar_starts = ("by", "nc", "nd", "sa") + if parts[3] == "licenses" and parts[4].startswith(bystar_starts): + if parts[5].startswith("4"): + grouped[0].add(url) + elif parts[5].startswith("3"): + grouped[1].add(url) + elif parts[5] == "2.5": + grouped[2].add(url) + elif parts[5] == "2.1": + grouped[3].add(url) + elif parts[5] == "2.0": + grouped[4].add(url) + elif parts[5].startswith("1"): + grouped[5].add(url) + else: + grouped[6].add(url) + elif parts[3] == "publicdomain" and parts[4] == "zero": + grouped[7].add(url) + else: + grouped[8].add(url) + for urls in grouped: + urls = list(urls) + urls.sort() + for url in urls: + print(url) + return [0, 0, 0] + + +def main(): + args = parse_arguments() + exit_status_list = args.func(args) if 1 in exit_status_list: return sys.exit(1) return sys.exit(0) diff --git a/link_checker/utils.py b/link_checker/utils.py index 36e8a81..64a6886 100644 --- a/link_checker/utils.py +++ b/link_checker/utils.py @@ -397,7 +397,9 @@ def get_scrapable_links( return (valid_anchors, valid_links, context_printed) -def create_base_link(args, filename, for_deeds=False, for_rdfs=False): +def create_base_link( + args, filename, for_deeds=False, for_rdfs=False, for_canonical=False +): """Generates base URL on which the license file will be displayed Args: @@ -437,7 +439,10 @@ def create_base_link(args, filename, for_deeds=False, for_rdfs=False): if jurisdiction: url = posixpath.join(url, jurisdiction) - url = posixpath.join(url, legalcode) + if for_canonical: + url = posixpath.join(url, "") + else: + url = posixpath.join(url, legalcode) if for_deeds: url = get_url_from_legalcode_url(url) if for_rdfs: From 20772ebe037405111235cf4e985b71b1ac0ad230 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Fri, 4 Sep 2020 21:44:55 -0700 Subject: [PATCH 03/15] updated variables for refactor and improve comments --- link_checker/__main__.py | 10 ++++------ link_checker/utils.py | 35 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index 60e97e7..94745fd 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -325,7 +325,7 @@ def check_legalcode(args): filename = license_name[: -len(".html")] base_url = create_base_link(args, filename) context = f"\n\nChecking: {license_name}\nURL: {base_url}" - if args.local: + if args.local_index: source_html = request_local_text(LICENSE_LOCAL_PATH, license_name) else: page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name) @@ -402,7 +402,7 @@ def check_rdfs(args, index=False): print("\n\nChecking RDFs...\n\n") rdf_obj_list = get_rdf(args) if args.log_level <= INFO: - if not args.index: + if not index: print("Number of rdf files to be checked:", len(rdf_obj_list)) else: print( @@ -415,12 +415,10 @@ def check_rdfs(args, index=False): caught_errors = 0 context_printed = False rdf_url = ( - rdf_obj["rdf:about"] - if args.index - else f'{rdf_obj["rdf:about"]}rdf' + rdf_obj["rdf:about"] if index else f"{rdf_obj['rdf:about']}rdf" ) links_found = get_links_from_rdf(rdf_obj) - checking = "URL" if not args.index else "RDF_ABOUT" + checking = "URL" if not index else "RDF_ABOUT" context = f"\n\nChecking: \n{checking}: {rdf_url}" link_count = len(links_found) if args.log_level <= INFO: diff --git a/link_checker/utils.py b/link_checker/utils.py index 64a6886..671a4ec 100644 --- a/link_checker/utils.py +++ b/link_checker/utils.py @@ -162,7 +162,7 @@ def get_local_legalcode(): def get_rdf(args): - """Helper function that determines rdf urls + """Helper function that determines RDF urls from license_names found locally or on github and returns a list of valid rdf objects. @@ -188,13 +188,13 @@ def get_rdf(args): def get_index_rdf(args, local_path=""): - """Determine if local rdf files or remote rdf files - should be parsed and call the appropriate function. + """Determine if local index.rdf file or remote index.rdf file + should be parsed and then call the appropriate function. Returns: - rdf_obj_list: list of rdf objects found in index.rdf + rdf_obj_list: list of RDF objects found in index.rdf """ - if args.local: + if args.local_index: rdf_obj_list = get_local_index_rdf(local_path) else: rdf_obj_list = get_remote_index_rdf() @@ -202,7 +202,7 @@ def get_index_rdf(args, local_path=""): def get_remote_index_rdf(): - """This function reads rdfs found at + """This function reads RDFs found at https://creativecommons.org/licenses/index.rdf Returns: @@ -220,12 +220,12 @@ def get_local_index_rdf(local_path=""): """This function reads from index.rdf stored locally Parameters: - local_path: path to rdf file. If not supplied - the INDEX_RDF_LOCAL_PATH constant is used - (which uses your environment or defaults to - "./index.rdf"; see constants.py) + local_path: path to index.rdf file. If not supplied + the INDEX_RDF_LOCAL_PATH constant is used + (which uses your environment or defaults to + "./index.rdf"; see constants.py) Returns: - rdf_obj_list: list of rdf objects found in index.rdf + rdf_obj_list: list of RDF objects found in index.rdf """ try: local_path = local_path or INDEX_RDF_LOCAL_PATH @@ -244,11 +244,11 @@ def get_local_index_rdf(local_path=""): def get_links_from_rdf(rdf_obj): - """This function parses an rdf and returns links found + """This function parses an RDF and returns links found Parameters: rdf_obj: soup object Returns: - links_found: list of link dictionaries found in rdf soup object + links_found: list of link dictionaries found in RDF soup object """ tags = rdf_obj.findChildren() links_found = [] @@ -326,8 +326,9 @@ def get_scrapable_links( links_found (list): List of all the links found in file Returns: - set: valid_anchors - list of all scrapable anchor tags - valid_links - list of all absolute scrapable links + list: valid_anchors - list of all scrapable anchor tags + list: valid_links - list of all absolute scrapable links + bool: context_printed """ valid_links = [] valid_anchors = [] @@ -344,7 +345,7 @@ def get_scrapable_links( # " {:<24}{}".format("Skipping internal link ", link) # ) continue - if href.startswith("mailto:"): + elif href.startswith("mailto:"): # mailto links are valid, but out of scope # No need to report non-issue (not actionable) # warnings.append @@ -375,7 +376,7 @@ def get_scrapable_links( # " {:<24}{}".format("Skipping internal link ", link) # ) continue - if href.startswith("mailto:"): + elif href.startswith("mailto:"): # mailto links are valid, but out of scope # No need to report non-issue (not actionable) # warnings.append From 3349907167c43d374e77a1ad3a209c2e6d355a81 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Fri, 4 Sep 2020 21:57:23 -0700 Subject: [PATCH 04/15] replaced duplicate print statements with single INFO print statement --- link_checker/__main__.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index 94745fd..ba48181 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -302,7 +302,6 @@ def check_deeds(args): errors_total += caught_errors exit_status = 1 - print("\nCompleted in: {}".format(time.time() - START_TIME)) if args.output_errors: output_summary(args, license_names, errors_total) @@ -384,8 +383,6 @@ def check_legalcode(args): errors_total += caught_errors exit_status = 1 - print("\nCompleted in: {}".format(time.time() - START_TIME)) - if args.output_errors: output_summary(args, license_names, errors_total) print("\nError file present at: ", args.output_errors.name) @@ -474,8 +471,6 @@ def check_rdfs(args, index=False): errors_total += caught_errors exit_status = 1 - print("\nCompleted in: {}".format(time.time() - START_TIME)) - if args.output_errors: output_summary(args, rdf_obj_list, errors_total) print("\nError file present at: ", args.output_errors.name) @@ -559,6 +554,9 @@ def print_canonical(args): def main(): args = parse_arguments() exit_status_list = args.func(args) + if args.log_level <= INFO: + print() + print(f"Completed in: {time.time() - START_TIME:.2f} seconds") if 1 in exit_status_list: return sys.exit(1) return sys.exit(0) From fd3be17208dabafad49781effe1992c327a9ed2f Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 09:48:43 -0700 Subject: [PATCH 05/15] refactored output and exit status handling - output summaries now handled in a single location - exit status handling refactored and simplified to match subcommand organization --- link_checker/__main__.py | 76 ++++++++++++++++++++-------------------- link_checker/utils.py | 37 +++++++++++++------ 2 files changed, 64 insertions(+), 49 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index ba48181..79d3e55 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -41,8 +41,7 @@ exception_handler, memoize_result, write_response, - output_summary, - output_test_summary, + output_summaries, ) @@ -176,11 +175,11 @@ def parse_arguments(): ) parser_index.set_defaults(func=check_index_rdf) - # complete subcommand: link_checker complete -h - parser_complete = subparsers.add_parser( - "complete", + # combined subcommand: link_checker combined -h + parser_combined = subparsers.add_parser( + "combined", add_help=False, - help="Complete check (deeds, legalcode, rdf, and index)", + help="Combined check (deeds, legalcode, rdf, and index)", parents=[ parser_shared, parser_shared_licenses, @@ -188,7 +187,7 @@ def parse_arguments(): parser_shared_reporting, ], ) - parser_complete.set_defaults(func=check_complete) + parser_combined.set_defaults(func=check_combined) # Canonical License URLs subcommand: link_checker canonical -h parser_canonical = subparsers.add_parser( @@ -302,13 +301,7 @@ def check_deeds(args): errors_total += caught_errors exit_status = 1 - - if args.output_errors: - output_summary(args, license_names, errors_total) - print("\nError file present at: ", args.output_errors.name) - output_test_summary(errors_total) - - return [0, exit_status, 0] + return license_names, errors_total, exit_status def check_legalcode(args): @@ -324,7 +317,7 @@ def check_legalcode(args): filename = license_name[: -len(".html")] base_url = create_base_link(args, filename) context = f"\n\nChecking: {license_name}\nURL: {base_url}" - if args.local_index: + if args.local: source_html = request_local_text(LICENSE_LOCAL_PATH, license_name) else: page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name) @@ -383,12 +376,7 @@ def check_legalcode(args): errors_total += caught_errors exit_status = 1 - if args.output_errors: - output_summary(args, license_names, errors_total) - print("\nError file present at: ", args.output_errors.name) - output_test_summary(errors_total) - - return [exit_status, 0, 0] + return license_names, errors_total, exit_status def check_rdfs(args, index=False): @@ -471,17 +459,12 @@ def check_rdfs(args, index=False): errors_total += caught_errors exit_status = 1 - if args.output_errors: - output_summary(args, rdf_obj_list, errors_total) - print("\nError file present at: ", args.output_errors.name) - output_test_summary(errors_total) - - return [0, 0, exit_status] + return rdf_obj_list, errors_total, exit_status def check_index_rdf(args): exit_status_list = check_rdfs(args, index=True) - return exit_status_list + return license_names, errors_total, exit_status_list def check_complete(args): @@ -489,17 +472,35 @@ def check_complete(args): "Running Full Inspection:" " Checking links for LegalCode, Deeds, RDF, and index.rdf" ) - exit_status_legalcode, _, _ = check_legalcode(args) - _, exit_status_deeds, _ = check_deeds(args) - _, _, exit_status_rdf = check_rdfs(args) - _, _, exit_status_index_rdf = check_rdfs(args, index=True) + license_names = [] + errors_total = 0 + exit_status = 0 + + names, total, exit_status_legalcode = check_legalcode(args) + license_names += names + errors_total += total + + names, total, exit_status_deeds = check_deeds(args) + license_names += names + errors_total += total + + names, total, exit_status_rdf = check_rdfs(args) + license_names += names + errors_total += total + + names, total, exit_status_index_rdf = check_rdfs(args, index=True) + license_names += names + errors_total += total + exit_status_list = [ exit_status_legalcode, exit_status_deeds, exit_status_rdf, exit_status_index_rdf, ] - return exit_status_list + if 1 in exit_status_list: + exit_status = 1 + return license_names, errors_total, exit_status def print_canonical(args): @@ -548,18 +549,17 @@ def print_canonical(args): urls.sort() for url in urls: print(url) - return [0, 0, 0] + return [], 0, 0 def main(): args = parse_arguments() - exit_status_list = args.func(args) + license_names, errors_total, exit_status = args.func(args) + output_summaries(args, license_names, errors_total) if args.log_level <= INFO: print() print(f"Completed in: {time.time() - START_TIME:.2f} seconds") - if 1 in exit_status_list: - return sys.exit(1) - return sys.exit(0) + return sys.exit(exit_status) if __name__ == "__main__": diff --git a/link_checker/utils.py b/link_checker/utils.py index 671a4ec..a3514d7 100644 --- a/link_checker/utils.py +++ b/link_checker/utils.py @@ -338,7 +338,7 @@ def get_scrapable_links( try: href = link["href"] except KeyError: - if href[0] == "#": + if href.startswith("#"): # anchor links are valid, but out of scope # No need to report non-issue (not actionable) # warnings.append( @@ -362,14 +362,22 @@ def get_scrapable_links( try: assert link["name"] warnings.append( - " {:<24}{}".format("Anchor uses name", link) + " {:<24}{}".format("Anchor uses name", + str(link).replace("\n", "")) ) except: warnings.append( - " {:<24}{}".format("Anchor w/o href or id", link) + " {:<24}{}".format("Anchor w/o href or id", + str(link).replace("\n", "")) ) continue - if href != "" and href[0] == "#": + if href == "": + warnings.append( + " {:<24}{}".format("Empty href", + str(link).replace("\n", "")) + ) + continue + elif href.startswith("#"): # anchor links are valid, but out of scope # No need to report non-issue (not actionable) # warnings.append( @@ -586,12 +594,10 @@ def write_response( if not context_printed: print(context) print("Errors:") - output_write( - args, "\n{}\nURL: {}".format(license_name, base_url) - ) - result = " {:<24}{}\n{}{}".format( - str(status), all_links[idx], " " * 26, valid_anchors[idx] - ) + output_write(args, f"\n{license_name}\nURL: {base_url}") + link = all_links[idx] + anchor = str(valid_anchors[idx]).replace("\n", "").strip() + result = f" {str(status):<24}{link}\n{'':<26}{anchor}" if args.log_level <= ERROR: print(result) output_write(args, result) @@ -619,7 +625,7 @@ def output_write(args, *args_, **kwargs): print(*args_, **kwargs) -def output_summary(args, license_names, num_errors): +def output_issues_summary(args, license_names, num_errors): """Prints short summary of broken links in the output error file Args: @@ -661,3 +667,12 @@ def output_test_summary(errors_total): ) ts = TestSuite("cc-link-checker", [test_case]) to_xml_report_file(test_summary, [ts]) + + +def output_summaries(args, license_names, errors_total): + if not args.output_errors: + return + output_issues_summary(args, license_names, errors_total) + if args.log_level <= INFO: + print("\nOutput to error file: ", args.output_errors.name) + output_test_summary(errors_total) From eae0a3a16d52dcd19fae49a132cbe1c78f101b3d Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 11:12:02 -0700 Subject: [PATCH 06/15] added --limit option and improved output - added --limit option to reduce time required for testing and iteration - replaced format strings with fstrings - added additional DEBUG statements (seen with -vv) --- link_checker/__main__.py | 30 ++++++++++++++++++------------ link_checker/utils.py | 28 +++++++++++++++++++--------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index 79d3e55..678f329 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -46,10 +46,10 @@ def parse_arguments(): - """parse arguments from cli + """parse arguments from CLI Args: - args (list): list of arguments parsed from command line + args (list): list of arguments parsed from command line interface """ # Primary argument parser and sub-parser (for subcommands) @@ -77,6 +77,17 @@ def parse_arguments(): help="decrease verbosity (can be specified multiple times)", dest="verbosity", ) + parser_shared.add_argument( + "--root-url", + default=DEFAULT_ROOT_URL, + help=f"set root URL (default: '{DEFAULT_ROOT_URL}')", + ) + parser_shared.add_argument( + "--limit", + default=10, + type=int, + help=f"Limit check lists to specified integer (default: 10)", + ) parser_shared.add_argument( "-v", "--verbose", @@ -85,11 +96,6 @@ def parse_arguments(): help="increase verbosity (can be specified multiple times)", dest="verbosity", ) - parser_shared.add_argument( - "--root-url", - default=DEFAULT_ROOT_URL, - help=f"set root URL (default: '{DEFAULT_ROOT_URL}')", - ) # Shared licenses parser (optional arguments used by all license # subcommands) @@ -239,7 +245,7 @@ def check_deeds(args): # cc/engine/templates/legalcode/standard_deed.html # Scrapping the html found on the active site if deed_base_url: - context = f"\n\nChecking: \nURL: {deed_base_url}" + context = f"\n\nChecking: deed\nURL: {deed_base_url}" page_url = deed_base_url source_html = request_text(page_url) license_soup = BeautifulSoup(source_html, "lxml") @@ -316,7 +322,7 @@ def check_legalcode(args): context_printed = False filename = license_name[: -len(".html")] base_url = create_base_link(args, filename) - context = f"\n\nChecking: {license_name}\nURL: {base_url}" + context = f"\n\nChecking: legalcode\nURL: {base_url}" if args.local: source_html = request_local_text(LICENSE_LOCAL_PATH, license_name) else: @@ -388,10 +394,10 @@ def check_rdfs(args, index=False): rdf_obj_list = get_rdf(args) if args.log_level <= INFO: if not index: - print("Number of rdf files to be checked:", len(rdf_obj_list)) + print("Number of RDF files to be checked:", len(rdf_obj_list)) else: print( - "Number of rdf objects/sections to be checked in index.rdf:", + "Number of RDF objects/sections to be checked in index.rdf:", len(rdf_obj_list), ) errors_total = 0 @@ -467,7 +473,7 @@ def check_index_rdf(args): return license_names, errors_total, exit_status_list -def check_complete(args): +def check_combined(args): print( "Running Full Inspection:" " Checking links for LegalCode, Deeds, RDF, and index.rdf" diff --git a/link_checker/utils.py b/link_checker/utils.py index a3514d7..3756ae7 100644 --- a/link_checker/utils.py +++ b/link_checker/utils.py @@ -27,6 +27,8 @@ TEST_ORDER, ERROR, WARNING, + INFO, + DEBUG, ) @@ -92,9 +94,15 @@ def get_legalcode(args): str[]: The list of license/deeds files found in the repository """ if args.local: + if args.log_level == DEBUG: + print("DEBUG: processing local legacode files") license_names = get_local_legalcode() else: + if args.log_level == DEBUG: + print("DEBUG: processing GitHub legacode files") license_names = get_github_legalcode() + if args.limit and args.subcommand != "rdf": + license_names = license_names[0:args.limit] return license_names @@ -175,8 +183,12 @@ def get_rdf(args): for license_name in license_names: filename = license_name[: -len(".html")] rdf_base_url = create_base_link(args, filename, for_rdfs=True) + if not rdf_base_url: + continue rdf_urls.append(rdf_base_url) unique_rdf_urls = list(set(rdf_urls)) + if args.limit: + unique_rdf_urls = unique_rdf_urls[0:args.limit] for url in unique_rdf_urls: if url: page_text = request_text(url) @@ -198,6 +210,8 @@ def get_index_rdf(args, local_path=""): rdf_obj_list = get_local_index_rdf(local_path) else: rdf_obj_list = get_remote_index_rdf() + if args.limit: + rdf_obj_list = rdf_obj_list[0:args.limit] return rdf_obj_list @@ -353,6 +367,7 @@ def get_scrapable_links( # ) continue else: + link_text = str(link).replace("\n", "") try: href = link["href"] except KeyError: @@ -362,20 +377,15 @@ def get_scrapable_links( try: assert link["name"] warnings.append( - " {:<24}{}".format("Anchor uses name", - str(link).replace("\n", "")) + f" {'Anchor uses name':<24}{link_text}" ) except: warnings.append( - " {:<24}{}".format("Anchor w/o href or id", - str(link).replace("\n", "")) + f" {'Anchor w/o href or id':<24}{link_text}" ) continue if href == "": - warnings.append( - " {:<24}{}".format("Empty href", - str(link).replace("\n", "")) - ) + warnings.append(f" {'Empty href':<24}{link_text}") continue elif href.startswith("#"): # anchor links are valid, but out of scope @@ -674,5 +684,5 @@ def output_summaries(args, license_names, errors_total): return output_issues_summary(args, license_names, errors_total) if args.log_level <= INFO: - print("\nOutput to error file: ", args.output_errors.name) + print("\nOutput to error file:", args.output_errors.name) output_test_summary(errors_total) From 2ba2e755e133782381fe5c62fa9cadd7253233c6 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 11:17:32 -0700 Subject: [PATCH 07/15] black formatted and ignore flake8 E203 where it conflicts with black --- link_checker/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/link_checker/utils.py b/link_checker/utils.py index 3756ae7..934c371 100644 --- a/link_checker/utils.py +++ b/link_checker/utils.py @@ -102,7 +102,7 @@ def get_legalcode(args): print("DEBUG: processing GitHub legacode files") license_names = get_github_legalcode() if args.limit and args.subcommand != "rdf": - license_names = license_names[0:args.limit] + license_names = license_names[0 : args.limit] # noqa: E203 return license_names @@ -188,7 +188,7 @@ def get_rdf(args): rdf_urls.append(rdf_base_url) unique_rdf_urls = list(set(rdf_urls)) if args.limit: - unique_rdf_urls = unique_rdf_urls[0:args.limit] + unique_rdf_urls = unique_rdf_urls[0 : args.limit] # noqa: E203 for url in unique_rdf_urls: if url: page_text = request_text(url) @@ -211,7 +211,7 @@ def get_index_rdf(args, local_path=""): else: rdf_obj_list = get_remote_index_rdf() if args.limit: - rdf_obj_list = rdf_obj_list[0:args.limit] + rdf_obj_list = rdf_obj_list[0 : args.limit] # noqa: E203 return rdf_obj_list From a2851a86ade4ede3e30606f9382b4ad97ba5cfd2 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 11:20:03 -0700 Subject: [PATCH 08/15] fixed flake8 errors and fixed check_index_rdf function --- link_checker/__main__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index 678f329..cea7c08 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -86,7 +86,7 @@ def parse_arguments(): "--limit", default=10, type=int, - help=f"Limit check lists to specified integer (default: 10)", + help="Limit check lists to specified integer (default: 10)", ) parser_shared.add_argument( "-v", @@ -469,8 +469,7 @@ def check_rdfs(args, index=False): def check_index_rdf(args): - exit_status_list = check_rdfs(args, index=True) - return license_names, errors_total, exit_status_list + return check_rdfs(args, index=True) def check_combined(args): From 7664b65e0835a416af4723a07f473872fa0cc245 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 11:27:57 -0700 Subject: [PATCH 09/15] updated help text and TOC --- README.md | 202 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 147 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index e3591e5..563b5c4 100644 --- a/README.md +++ b/README.md @@ -13,12 +13,12 @@ - [User](#User) - [Development](#Development) - [Usage](#Usage) - - [`-h` or `--help`](#-h-or---help) - - [Default mode](#default-mode) - - [`-q` or `--quiet`](#-q-or---quiet) - - [`-v` or `--verbose`](#-v-or---verbose) - - [`--output-error`](#--output-error) - - [`--local`](#--local) + - [deeds](#deeds) + - [legalcode](#legalcode) + - [rdf](#rdf) + - [index](#index) + - [combined](#combined) + - [canonical](#canonical) - [Integrating with CI](#Integrating-with-CI) - [Unit Testing](#Unit-Testing) - [Troubleshooting](#Troubleshooting) @@ -84,92 +84,184 @@ environment and install dependencies ```shell pipenv run link_checker -h ``` - ``` -usage: link_checker.py [-h] [--legalcode] [--deeds] [--rdf] [--index] [--local] - [--output-errors [output_file]] [-q] [--root-url ROOT_URL] - [-v] - {legalcode,deeds,rdf} ... +usage: link_checker [-h] {deeds,legalcode,rdf,index,combined,canonical} ... Check for broken links in Creative Commons license deeds, legalcode, and rdf -positional arguments: - {legalcode,deeds,rdf} - sub-command help - legalcode legalcode help - deeds deeds help - rdf rdf help +optional arguments: + -h, --help show this help message and exit + +subcommands (a single subcomamnd is required): + {deeds,legalcode,rdf,index,combined,canonical} + deeds check the links for each license's deed + legalcode check the links for each license's legalcode + rdf check the links for each license's RDF + index check the links within index.rdf + combined Combined check (deeds, legalcode, rdf, and index) + canonical print canonical license URLs + +Also see the help output each subcommand +``` + + +### deeds + +```shell +pipenv run link_checker deeds -h +``` +``` +usage: link_checker deeds [-h] [-q] [--root-url ROOT_URL] [--limit LIMIT] [-v] + [--local] [--output-errors [output_file]] optional arguments: -h, --help show this help message and exit - --legalcode Runs link_checker for legalcode only. (Note: --licenses is - deprecated and will be dropped from a future release. - Please use --legalcode instead.) - --deeds Runs link_checker for deeds only (the legalcode files will - still be scraped, but not checked for broken links) - --rdf Runs link_checker for rdf only - --index Runs link_checker for index.rdf only - --local Scrapes legalcode files from local file system + -q, --quiet decrease verbosity (can be specified multiple times) + --root-url ROOT_URL set root URL (default: 'https://creativecommons.org') + --limit LIMIT Limit check lists to specified integer (default: 10) + -v, --verbose increase verbosity (can be specified multiple times) + --local process local filesystem legalcode files to determine + valid license paths (uses LICENSE_LOCAL_PATH environment + variable and falls back to default: + '../creativecommons.org/docroot/legalcode') --output-errors [output_file] - Outputs all link errors to file (default: errorlog.txt) - and creates junit-xml type summary(test-summary/junit-xml- + output all link errors to file (default: errorlog.txt) and + create junit-xml type summary (test-summary/junit-xml- report.xml) - -q, --quiet Decrease verbosity. Can be specified multiple times. - --root-url ROOT_URL Set root URL (default: https://creativecommons.org) - -v, --verbose Increase verbosity. Can be specified multiple times. ``` + ### legalcode ```shell pipenv run link_checker legalcode -h ``` ``` -usage: link_checker.py legalcode [-h] [--local] +usage: link_checker legalcode [-h] [-q] [--root-url ROOT_URL] [--limit LIMIT] [-v] + [--local] [--output-errors [output_file]] optional arguments: - -h, --help show this help message and exit - --local Scrapes legalcode files from local file system. Add - 'LICENSE_LOCAL_PATH' to your environment, otherwise this tool will - search for legalcode files in - '../creativecommons.org/docroot/legalcode'. + -h, --help show this help message and exit + -q, --quiet decrease verbosity (can be specified multiple times) + --root-url ROOT_URL set root URL (default: 'https://creativecommons.org') + --limit LIMIT Limit check lists to specified integer (default: 10) + -v, --verbose increase verbosity (can be specified multiple times) + --local process local filesystem legalcode files to determine + valid license paths (uses LICENSE_LOCAL_PATH environment + variable and falls back to default: + '../creativecommons.org/docroot/legalcode') + --output-errors [output_file] + output all link errors to file (default: errorlog.txt) and + create junit-xml type summary (test-summary/junit-xml- + report.xml) ``` -### deeds +### rdf ```shell -pipenv run link_checker deeds -h +pipenv run link_checker rdf -h ``` ``` -usage: link_checker.py deeds [-h] [--local] +usage: link_checker rdf [-h] [-q] [--root-url ROOT_URL] [--limit LIMIT] [-v] + [--local] [--local-index] [--output-errors [output_file]] optional arguments: - -h, --help show this help message and exit - --local Scrapes deed files based on the legalcode files found on the local - file system. Add 'LICENSE_LOCAL_PATH' to your environment, otherwise - this tool will search for legalcode files in - '../creativecommons.org/docroot/legalcode'. + -h, --help show this help message and exit + -q, --quiet decrease verbosity (can be specified multiple times) + --root-url ROOT_URL set root URL (default: 'https://creativecommons.org') + --limit LIMIT Limit check lists to specified integer (default: 10) + -v, --verbose increase verbosity (can be specified multiple times) + --local process local filesystem legalcode files to determine + valid license paths (uses LICENSE_LOCAL_PATH environment + variable and falls back to default: + '../creativecommons.org/docroot/legalcode') + --local-index process local filesystem index.rdf (uses + INDEX_RDF_LOCAL_PATH environment variable and falls back + to default: './index.rdf') + --output-errors [output_file] + output all link errors to file (default: errorlog.txt) and + create junit-xml type summary (test-summary/junit-xml- + report.xml) ``` -### rdf +### index ```shell -pipenv run link_checker rdf -h +pipenv run link_checker index -h +``` +``` +usage: link_checker index [-h] [-q] [--root-url ROOT_URL] [--limit LIMIT] [-v] + [--local-index] [--output-errors [output_file]] + +optional arguments: + -h, --help show this help message and exit + -q, --quiet decrease verbosity (can be specified multiple times) + --root-url ROOT_URL set root URL (default: 'https://creativecommons.org') + --limit LIMIT Limit check lists to specified integer (default: 10) + -v, --verbose increase verbosity (can be specified multiple times) + --local-index process local filesystem index.rdf (uses + INDEX_RDF_LOCAL_PATH environment variable and falls back + to default: './index.rdf') + --output-errors [output_file] + output all link errors to file (default: errorlog.txt) and + create junit-xml type summary (test-summary/junit-xml- + report.xml) +``` + + +### combined + +```shell +pipenv run link_checker combined -h +``` +``` +usage: link_checker combined [-h] [-q] [--root-url ROOT_URL] [--limit LIMIT] [-v] + [--local] [--local-index] + [--output-errors [output_file]] + +optional arguments: + -h, --help show this help message and exit + -q, --quiet decrease verbosity (can be specified multiple times) + --root-url ROOT_URL set root URL (default: 'https://creativecommons.org') + --limit LIMIT Limit check lists to specified integer (default: 10) + -v, --verbose increase verbosity (can be specified multiple times) + --local process local filesystem legalcode files to determine + valid license paths (uses LICENSE_LOCAL_PATH environment + variable and falls back to default: + '../creativecommons.org/docroot/legalcode') + --local-index process local filesystem index.rdf (uses + INDEX_RDF_LOCAL_PATH environment variable and falls back + to default: './index.rdf') + --output-errors [output_file] + output all link errors to file (default: errorlog.txt) and + create junit-xml type summary (test-summary/junit-xml- + report.xml) +``` + + +### canonical + +```shell +pipenv run link_checker canonical -h ``` ``` -usage: link_checker.py rdf [-h] [--local] [--index] +usage: link_checker canonical [-h] [-q] [--root-url ROOT_URL] [--limit LIMIT] [-v] + [--local] [--include-gnu] optional arguments: - -h, --help show this help message and exit - --local Scrapes rdf files based on the legalcode files found on the local - file system. Add 'LICENSE_LOCAL_PATH' to your environment, otherwise - this tool will search for legalcode files in - '../creativecommons.org/docroot/legalcode'. - --index Checks index.rdf file instead of checking rdf files. If you want to - check the index.rdf file locally add 'INDEX_RDF_LOCAL_PATH' to your - environment; otherwise this variable defaults to './index.rdf'. + -h, --help show this help message and exit + -q, --quiet decrease verbosity (can be specified multiple times) + --root-url ROOT_URL set root URL (default: 'https://creativecommons.org') + --limit LIMIT Limit check lists to specified integer (default: 10) + -v, --verbose increase verbosity (can be specified multiple times) + --local process local filesystem legalcode files to determine valid + license paths (uses LICENSE_LOCAL_PATH environment variable + and falls back to default: + '../creativecommons.org/docroot/legalcode') + --include-gnu include GNU licenses in addition to Creative Commons + licenses ``` From d7bd9eca5ec597c70b3be40b8b8d589d8cf31048 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 11:29:47 -0700 Subject: [PATCH 10/15] fixed --limit behavior (0 disables limit) --- link_checker/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index cea7c08..9e40098 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -84,9 +84,9 @@ def parse_arguments(): ) parser_shared.add_argument( "--limit", - default=10, + default=0, type=int, - help="Limit check lists to specified integer (default: 10)", + help="Limit check lists to specified integer", ) parser_shared.add_argument( "-v", From 7cf54dc85f7478891174ad6e7eb8c7a3ecd9ef72 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 11:30:28 -0700 Subject: [PATCH 11/15] updated cannonical help text --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 563b5c4..03e47d7 100644 --- a/README.md +++ b/README.md @@ -254,7 +254,7 @@ optional arguments: -h, --help show this help message and exit -q, --quiet decrease verbosity (can be specified multiple times) --root-url ROOT_URL set root URL (default: 'https://creativecommons.org') - --limit LIMIT Limit check lists to specified integer (default: 10) + --limit LIMIT Limit check lists to specified integer -v, --verbose increase verbosity (can be specified multiple times) --local process local filesystem legalcode files to determine valid license paths (uses LICENSE_LOCAL_PATH environment variable From 0d52a57a2cd82892bfd766685168e3477a74ae84 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 12:15:54 -0700 Subject: [PATCH 12/15] refactored link_checker tests to use subcommands and parsers --- link_checker/__main__.py | 6 +- link_checker/tests/test_link_checker.py | 172 ++++++++++++++---------- 2 files changed, 103 insertions(+), 75 deletions(-) diff --git a/link_checker/__main__.py b/link_checker/__main__.py index 9e40098..38d5d89 100755 --- a/link_checker/__main__.py +++ b/link_checker/__main__.py @@ -45,7 +45,7 @@ ) -def parse_arguments(): +def parse_arguments(arguments): """parse arguments from CLI Args: @@ -209,7 +209,7 @@ def parse_arguments(): help="include GNU licenses in addition to Creative Commons licenses", ) - args = parser.parse_args() + args = parser.parse_args(arguments) args.log_level = WARNING if args.verbosity: for v in args.verbosity: @@ -558,7 +558,7 @@ def print_canonical(args): def main(): - args = parse_arguments() + args = parse_arguments(sys.argv[1:]) license_names, errors_total, exit_status = args.func(args) output_summaries(args, license_names, errors_total) if args.log_level <= INFO: diff --git a/link_checker/tests/test_link_checker.py b/link_checker/tests/test_link_checker.py index d133d30..5ba5221 100644 --- a/link_checker/tests/test_link_checker.py +++ b/link_checker/tests/test_link_checker.py @@ -2,75 +2,103 @@ from link_checker import __main__ as link_checker -def test_parse_argument(tmpdir): - # Test default options - args = link_checker.parse_argument([]) - assert args.log_level == 30 - assert bool(args.output_errors) is False - assert args.local is False - assert args.root_url == "https://creativecommons.org" - # Test --licenses - args = link_checker.parse_argument(["--legalcode"]) - assert args.legalcode is True - args = link_checker.parse_argument(["legalcode"]) - assert args.func.__name__ == "check_legalcode" - args = link_checker.parse_argument(["legalcode", "--local"]) - assert args.local is True - # Test --deeds - args = link_checker.parse_argument(["--deeds"]) - assert args.deeds is True - args = link_checker.parse_argument(["deeds"]) - assert args.func.__name__ == "check_deeds" - args = link_checker.parse_argument(["deeds", "--local"]) - assert args.local is True - # Test --rdf - args = link_checker.parse_argument(["--rdf"]) - assert args.rdf is True - args = link_checker.parse_argument(["rdf"]) - assert args.func.__name__ == "check_rdfs" - args = link_checker.parse_argument(["rdf", "--index"]) - assert args.index is True - args = link_checker.parse_argument(["rdf", "--local"]) - assert args.local is True - # Test --index - args = link_checker.parse_argument(["--index"]) - assert args.index is True - # Test --local - args = link_checker.parse_argument(["--local"]) - assert args.local is True - # Test Logging Levels -q/--quiet - args = link_checker.parse_argument(["-q"]) - assert args.log_level == 40 - args = link_checker.parse_argument(["-qq"]) - assert args.log_level == 50 - args = link_checker.parse_argument(["-qqq"]) - assert args.log_level == 50 - args = link_checker.parse_argument(["-q", "--quiet"]) - assert args.log_level == 50 - # Test Logging Levels -v/--verbose - args = link_checker.parse_argument(["-v"]) - assert args.log_level == 20 - args = link_checker.parse_argument(["-vv"]) - assert args.log_level == 10 - args = link_checker.parse_argument(["-vvv"]) - assert args.log_level == 10 - args = link_checker.parse_argument(["-v", "--verbose"]) - assert args.log_level == 10 - # Test Logging Levels with both -v and -q - args = link_checker.parse_argument(["-vq"]) - assert args.log_level == 30 - args = link_checker.parse_argument(["-vvq"]) - assert args.log_level == 20 - args = link_checker.parse_argument(["-vqq"]) - assert args.log_level == 40 - # Test default value of --output-errors - args = link_checker.parse_argument(["--output-errors"]) - assert bool(args.output_errors) is True - assert args.output_errors.name == "errorlog.txt" - # Test custom value of --output-errors - output_file = tmpdir.join("errorlog.txt") - args = link_checker.parse_argument( - ["--output-errors", output_file.strpath] - ) - assert bool(args.output_errors) is True - assert args.output_errors.name == output_file.strpath +def test_parser_shared(): + subcmds = ["deeds", "legalcode", "rdf", "index", "combined", "canonical"] + + # Test defaults + for subcmd in subcmds: + args = link_checker.parse_arguments([subcmd]) + assert args.limit == 0 + assert args.log_level == 30 + assert args.root_url == "https://creativecommons.org" + + # Test arguments + for subcmd in subcmds: + # Test --limit + args = link_checker.parse_arguments([subcmd, "--limit", "10"]) + assert args.limit == 10 + args = link_checker.parse_arguments([subcmd, "--limit=100"]) + assert args.limit == 100 + # Test Logging Levels -q/--quiet + args = link_checker.parse_arguments([subcmd, "-q"]) + assert args.log_level == 40 + args = link_checker.parse_arguments([subcmd, "-qq"]) + assert args.log_level == 50 + args = link_checker.parse_arguments([subcmd, "-qqq"]) + assert args.log_level == 50 + args = link_checker.parse_arguments([subcmd, "-q", "--quiet"]) + assert args.log_level == 50 + # Test Logging Levels -v/--verbose + args = link_checker.parse_arguments([subcmd, "-v"]) + assert args.log_level == 20 + args = link_checker.parse_arguments([subcmd, "-vv"]) + assert args.log_level == 10 + args = link_checker.parse_arguments([subcmd, "-vvv"]) + assert args.log_level == 10 + args = link_checker.parse_arguments([subcmd, "-v", "--verbose"]) + assert args.log_level == 10 + # Test Logging Levels with both -v and -q + args = link_checker.parse_arguments([subcmd, "-vq"]) + assert args.log_level == 30 + args = link_checker.parse_arguments([subcmd, "-vvq"]) + assert args.log_level == 20 + args = link_checker.parse_arguments([subcmd, "-vqq"]) + assert args.log_level == 40 + # Test --root-url + args = link_checker.parse_arguments( + [subcmd, "--root-url", "https://pytest.creativecommons.org"] + ) + assert args.root_url == "https://pytest.creativecommons.org" + + +def test_parser_shared_licenses(): + subcmds = ["deeds", "legalcode", "rdf", "combined", "canonical"] + + # Test defaults + for subcmd in subcmds: + args = link_checker.parse_arguments([subcmd]) + assert args.local is False + + # Test argumetns + for subcmd in subcmds: + # Test --local + args = link_checker.parse_arguments([subcmd, "--local"]) + assert args.local is True + + +def test_parser_shared_rdf(): + subcmds = ["rdf", "index"] + + # Test defaults + for subcmd in subcmds: + args = link_checker.parse_arguments([subcmd]) + assert args.local_index is False + + # Test argumetns + for subcmd in subcmds: + # Test --local + args = link_checker.parse_arguments([subcmd, "--local-index"]) + assert args.local_index is True + + +def test_parser_shared_reporting(tmpdir): + subcmds = ["deeds", "legalcode", "rdf", "index", "combined"] + + # Test defaults + for subcmd in subcmds: + args = link_checker.parse_arguments([subcmd]) + assert bool(args.output_errors) is False + + # Test argumetns + for subcmd in subcmds: + # Test --output-errors with default value + args = link_checker.parse_arguments([subcmd, "--output-errors"]) + assert bool(args.output_errors) is True + assert args.output_errors.name == "errorlog.txt" + # Test --output-errors with custom value + output_file = tmpdir.join("errorlog.txt") + args = link_checker.parse_arguments( + [subcmd, "--output-errors", output_file.strpath] + ) + assert bool(args.output_errors) is True + assert args.output_errors.name == output_file.strpath From f1c69c9666ec192f59a9ba60b5c70816f7add303 Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Sat, 5 Sep 2020 13:00:47 -0700 Subject: [PATCH 13/15] updated utils tests to use subcommands --- link_checker/tests/test_utils.py | 125 +++++++++++++++++-------------- 1 file changed, 67 insertions(+), 58 deletions(-) diff --git a/link_checker/tests/test_utils.py b/link_checker/tests/test_utils.py index deb020a..5687d0a 100644 --- a/link_checker/tests/test_utils.py +++ b/link_checker/tests/test_utils.py @@ -24,7 +24,7 @@ map_links_file, memoize_result, write_response, - output_summary, + output_issues_summary, output_write, output_test_summary, ) @@ -43,70 +43,75 @@ def test_get_github_legalcode(): assert len(all_links) > 0 +def id_generator(data): + id_list = [] + for license in data: + id_list.append(license[0]) + return id_list + + license_url_data = [ # 2 part URL - ( + [ "by-nc-nd_2.0", - "https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode", "https://creativecommons.org/licenses/by-nc-nd/2.0/", + "https://creativecommons.org/licenses/by-nc-nd/2.0/legalcode", "https://creativecommons.org/licenses/by-nc-nd/2.0/rdf", - ), + ], # 3 part URL - ( + [ "by-nc-nd_4.0_cs", - "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode.cs", "https://creativecommons.org/licenses/by-nc-nd/4.0/deed.cs", + "https://creativecommons.org/licenses/by-nc-nd/4.0/legalcode.cs", "https://creativecommons.org/licenses/by-nc-nd/4.0/rdf", - ), + ], # 4 part URL - ( + [ "by-nc-nd_3.0_rs_sr-Latn", + "https://creativecommons.org/licenses/by-nc-nd/3.0/rs/", "https://creativecommons.org/licenses/by-nc-nd/3.0/rs/" "legalcode.sr-Latn", - "https://creativecommons.org/licenses/by-nc-nd/3.0/rs/", "https://creativecommons.org/licenses/by-nc-nd/3.0/rs/rdf", - ), + ], # Special case - samplingplus - ( + [ "samplingplus_1.0", - "https://creativecommons.org/licenses/sampling+/1.0/legalcode", "https://creativecommons.org/licenses/sampling+/1.0/", + "https://creativecommons.org/licenses/sampling+/1.0/legalcode", "https://creativecommons.org/licenses/sampling+/1.0/rdf", - ), - ( + ], + [ "samplingplus_1.0_br", - "https://creativecommons.org/licenses/sampling+/1.0/br/legalcode", "https://creativecommons.org/licenses/sampling+/1.0/br/", + "https://creativecommons.org/licenses/sampling+/1.0/br/legalcode", "https://creativecommons.org/licenses/sampling+/1.0/br/rdf", - ), + ], # Special case - CC0 - ( + [ "zero_1.0", - "https://creativecommons.org/publicdomain/zero/1.0/legalcode", "https://creativecommons.org/publicdomain/zero/1.0/", + "https://creativecommons.org/publicdomain/zero/1.0/legalcode", "https://creativecommons.org/publicdomain/zero/1.0/rdf", - ), + ], ] -def id_generator(data): - id_list = [] - for license in data: - id_list.append(license[0]) - return id_list - - @pytest.mark.parametrize( - "filename, result, deed_result, rdf_result", + "filename, deed_result, legalcode_result, rdf_result", license_url_data, ids=id_generator(license_url_data), ) -def test_create_base_link(filename, result, deed_result, rdf_result): - args = link_checker.parse_argument([]) - baseURL = create_base_link(args, filename) - assert baseURL == result +def test_create_base_link(filename, deed_result, legalcode_result, rdf_result): + # deeds + args = link_checker.parse_arguments(["deeds"]) baseURL = create_base_link(args, filename, for_deeds=True) assert baseURL == deed_result + # legalcode + args = link_checker.parse_arguments(["legalcode"]) + baseURL = create_base_link(args, filename) + assert baseURL == legalcode_result + # rdf + args = link_checker.parse_arguments(["rdf"]) baseURL = create_base_link(args, filename, for_rdfs=True) assert baseURL == rdf_result @@ -114,19 +119,19 @@ def test_create_base_link(filename, result, deed_result, rdf_result): def test_output_write(tmpdir): # output_errors is set and written to output_file = tmpdir.join("errorlog.txt") - args = link_checker.parse_argument( - ["--output-errors", output_file.strpath] + args = link_checker.parse_arguments( + ["deeds", "--output-errors", output_file.strpath] ) output_write(args, "Output enabled") args.output_errors.flush() assert output_file.read() == "Output enabled\n" -def test_output_summary(reset_global, tmpdir): +def test_output_issues_summary(reset_global, tmpdir): # output_errors is set and written to output_file = tmpdir.join("errorlog.txt") - args = link_checker.parse_argument( - ["--output-errors", output_file.strpath] + args = link_checker.parse_arguments( + ["deeds", "--output-errors", output_file.strpath] ) utils.MAP_BROKEN_LINKS = { "https://link1.demo": [ @@ -136,7 +141,7 @@ def test_output_summary(reset_global, tmpdir): "https://link2.demo": ["https://file4.url/here"], } all_links = ["some link"] * 5 - output_summary(args, all_links, 3) + output_issues_summary(args, all_links, 3) args.output_errors.flush() lines = output_file.readlines() i = 0 @@ -198,7 +203,7 @@ def test_create_absolute_link(link, result): def test_get_scrapable_links(): - args = link_checker.parse_argument([]) + args = link_checker.parse_arguments(["deeds"]) test_file = ( "without href," " internal link," @@ -221,7 +226,7 @@ def test_get_scrapable_links(): == "['https://creativecommons.ca', 'https://www.demourl.com/index']" ) # Testing RDF - args = link_checker.parse_argument(["--local"]) + args = link_checker.parse_arguments(["index", "--local-index"]) rdf_obj_list = get_index_rdf( args, local_path=constants.TEST_RDF_LOCAL_PATH ) @@ -231,7 +236,7 @@ def test_get_scrapable_links(): valid_anchors, valid_links, _ = get_scrapable_links( args, base_url, links_found, None, False, rdf=True, ) - assert str(valid_anchors) == ( + expected_anchors = ( "[, ' "]' ) - assert str(valid_links) == ( - "['http://creativecommons.org/ns#DerivativeWorks', " - "'http://creativecommons.org/ns#Reproduction', " - "'http://creativecommons.org/ns#Distribution', " - "'http://creativecommons.org/international/ch/', " - "'https://i.creativecommons.org/l/by-nc-sa/2.5/ch/88x31.png', " - "'https://i.creativecommons.org/l/by-nc-sa/2.5/ch/80x15.png', " - "'http://creativecommons.org/licenses/by-nc-sa/2.5/ch/legalcode.de', " - "'http://creativecommons.org/licenses/by-nc-sa/2.5/', " - "'http://creativecommons.org', " - "'http://creativecommons.org/ns#CommercialUse', " - "'http://creativecommons.org/license/', " - "'http://creativecommons.org/ns#ShareAlike', " - "'http://creativecommons.org/ns#Attribution', " - "'http://creativecommons.org/ns#Notice']" - ) + assert str(valid_anchors) == expected_anchors + valid_links.sort() + expected_links = [ + "http://creativecommons.org", + "http://creativecommons.org/international/ch/", + "http://creativecommons.org/license/", + "http://creativecommons.org/licenses/by-nc-sa/2.5/", + "http://creativecommons.org/licenses/by-nc-sa/2.5/ch/legalcode.de", + "http://creativecommons.org/ns#Attribution", + "http://creativecommons.org/ns#CommercialUse", + "http://creativecommons.org/ns#DerivativeWorks", + "http://creativecommons.org/ns#Distribution", + "http://creativecommons.org/ns#Notice", + "http://creativecommons.org/ns#Reproduction", + "http://creativecommons.org/ns#ShareAlike", + "https://i.creativecommons.org/l/by-nc-sa/2.5/ch/80x15.png", + "https://i.creativecommons.org/l/by-nc-sa/2.5/ch/88x31.png", + ] + expected_links.sort() + assert valid_links == expected_links def test_exception_handler(): @@ -307,8 +316,8 @@ def test_map_links_file(reset_global): def test_write_response(tmpdir): # Set config output_file = tmpdir.join("errorlog.txt") - args = link_checker.parse_argument( - ["--output-errors", output_file.strpath] + args = link_checker.parse_arguments( + ["deeds", "--output-errors", output_file.strpath] ) # Text to extract valid_anchors From 3bfca6a66bbc535d4ff9ec8f21c0575ea5460b0b Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Tue, 8 Sep 2020 07:44:03 -0700 Subject: [PATCH 14/15] fixed typo/spelling error --- link_checker/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/link_checker/utils.py b/link_checker/utils.py index 934c371..63d598e 100644 --- a/link_checker/utils.py +++ b/link_checker/utils.py @@ -95,11 +95,11 @@ def get_legalcode(args): """ if args.local: if args.log_level == DEBUG: - print("DEBUG: processing local legacode files") + print("DEBUG: processing local legalcode files") license_names = get_local_legalcode() else: if args.log_level == DEBUG: - print("DEBUG: processing GitHub legacode files") + print("DEBUG: processing GitHub legalcode files") license_names = get_github_legalcode() if args.limit and args.subcommand != "rdf": license_names = license_names[0 : args.limit] # noqa: E203 From 24261736b24dd338f4b63625c663cd1da47b3e4c Mon Sep 17 00:00:00 2001 From: Timid Robot Zehta Date: Wed, 9 Sep 2020 08:35:54 -0700 Subject: [PATCH 15/15] Update link_checker/tests/test_link_checker.py spelling correction Co-authored-by: Alden S Page --- link_checker/tests/test_link_checker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/link_checker/tests/test_link_checker.py b/link_checker/tests/test_link_checker.py index 5ba5221..810dadf 100644 --- a/link_checker/tests/test_link_checker.py +++ b/link_checker/tests/test_link_checker.py @@ -89,7 +89,7 @@ def test_parser_shared_reporting(tmpdir): args = link_checker.parse_arguments([subcmd]) assert bool(args.output_errors) is False - # Test argumetns + # Test arguments for subcmd in subcmds: # Test --output-errors with default value args = link_checker.parse_arguments([subcmd, "--output-errors"])