From 59b698d7b5193f3a99434b7a2fd504fa39f4042b Mon Sep 17 00:00:00 2001 From: Trask Stalnaker Date: Sat, 30 Nov 2024 10:17:40 -0800 Subject: [PATCH] Support both options at the same time --- lychee-bin/src/client.rs | 10 +--- lychee-bin/src/main.rs | 9 +--- lychee-lib/src/collector.rs | 50 +++++++++++++------- lychee-lib/src/types/base.rs | 14 +----- lychee-lib/src/utils/request.rs | 84 +++++++++++++++++++++------------ 5 files changed, 90 insertions(+), 77 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index ce9d841d55..d1b982dc26 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -2,7 +2,7 @@ use crate::options::Config; use crate::parse::{parse_duration_secs, parse_headers, parse_remaps}; use anyhow::{Context, Result}; use http::StatusCode; -use lychee_lib::{Base, Client, ClientBuilder}; +use lychee_lib::{Client, ClientBuilder}; use regex::RegexSet; use reqwest_cookie_store::CookieStoreMutex; use std::sync::Arc; @@ -53,15 +53,9 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.include_mail }; - let base = if let Some(root_path) = &cfg.root_path { - Some(Base::create_root_path(&root_path)) - } else { - cfg.base.clone() - }; - ClientBuilder::builder() .remaps(remaps) - .base(base) + .base(cfg.base.clone()) .includes(includes) .excludes(excludes) .exclude_all_private(cfg.exclude_all_private) diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 1912f4950d..3250bdf41b 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -75,7 +75,6 @@ use openssl_sys as _; // required for vendored-openssl feature use options::LYCHEE_CONFIG_FILE; use ring as _; // required for apple silicon -use lychee_lib::Base; use lychee_lib::BasicAuthExtractor; use lychee_lib::Collector; use lychee_lib::CookieJar; @@ -289,13 +288,7 @@ fn underlying_io_error_kind(error: &Error) -> Option { async fn run(opts: &LycheeOptions) -> Result { let inputs = opts.inputs()?; - let base = if let Some(root_path) = &opts.config.root_path { - Some(Base::create_root_path(&root_path)) - } else { - opts.config.base.clone() - }; - - let mut collector = Collector::new(base) + let mut collector = Collector::new(opts.config.root_path.clone(), opts.config.base.clone()) .skip_missing_inputs(opts.config.skip_missing) .skip_hidden(!opts.config.hidden) .skip_ignored(!opts.config.no_ignore) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 8b91851871..6210c4b24e 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -21,13 +21,14 @@ pub struct Collector { skip_hidden: bool, include_verbatim: bool, use_html5ever: bool, + root_path: Option, base: Option, } impl Collector { /// Create a new collector with an empty cache #[must_use] - pub const fn new(base: Option) -> Self { + pub const fn new(root_path: Option, base: Option) -> Self { Collector { basic_auth_extractor: None, skip_missing_inputs: false, @@ -35,6 +36,7 @@ impl Collector { use_html5ever: false, skip_hidden: true, skip_ignored: true, + root_path, base, } } @@ -119,13 +121,19 @@ impl Collector { }) .flatten() .par_then_unordered(None, move |(content, base)| { + let root_path = self.root_path.clone(); let basic_auth_extractor = self.basic_auth_extractor.clone(); async move { let content = content?; let extractor = Extractor::new(self.use_html5ever, self.include_verbatim); let uris: Vec = extractor.extract(&content); - let requests = - request::create(uris, &content.source, &base, &basic_auth_extractor); + let requests = request::create( + uris, + &content.source, + &root_path, + &base, + &basic_auth_extractor, + ); Result::Ok(stream::iter(requests.into_iter().map(Ok))) } }) @@ -149,14 +157,22 @@ mod tests { }; // Helper function to run the collector on the given inputs - async fn collect(inputs: Vec, base: Option) -> HashSet { - let responses = Collector::new(base).collect_links(inputs); + async fn collect( + inputs: Vec, + root_path: Option, + base: Option, + ) -> HashSet { + let responses = Collector::new(root_path, base).collect_links(inputs); responses.map(|r| r.unwrap().uri).collect().await } // Helper function for collecting verbatim links - async fn collect_verbatim(inputs: Vec, base: Option) -> HashSet { - let responses = Collector::new(base) + async fn collect_verbatim( + inputs: Vec, + root_path: Option, + base: Option, + ) -> HashSet { + let responses = Collector::new(root_path, base) .include_verbatim(true) .collect_links(inputs); responses.map(|r| r.unwrap().uri).collect().await @@ -247,7 +263,7 @@ mod tests { }, ]; - let links = collect_verbatim(inputs, None).await; + let links = collect_verbatim(inputs, None, None).await; let expected_links = HashSet::from_iter([ website(TEST_STRING), @@ -270,7 +286,7 @@ mod tests { file_type_hint: Some(FileType::Markdown), excluded_paths: None, }; - let links = collect(vec![input], Some(base)).await; + let links = collect(vec![input], None, Some(base)).await; let expected_links = HashSet::from_iter([ website("https://endler.dev"), @@ -296,7 +312,7 @@ mod tests { file_type_hint: Some(FileType::Html), excluded_paths: None, }; - let links = collect(vec![input], Some(base)).await; + let links = collect(vec![input], None, Some(base)).await; let expected_links = HashSet::from_iter([ website("https://github.com/lycheeverse/lychee/"), @@ -325,7 +341,7 @@ mod tests { file_type_hint: Some(FileType::Html), excluded_paths: None, }; - let links = collect(vec![input], Some(base)).await; + let links = collect(vec![input], None, Some(base)).await; let expected_links = HashSet::from_iter([ website("https://example.com/static/image.png"), @@ -352,7 +368,7 @@ mod tests { excluded_paths: None, }; - let links = collect(vec![input], Some(base)).await; + let links = collect(vec![input], None, Some(base)).await; let expected = HashSet::from_iter([ website("https://localhost.com/@/internal.md"), @@ -374,7 +390,7 @@ mod tests { file_type_hint: Some(FileType::Html), excluded_paths: None, }; - let links = collect(vec![input], Some(base)).await; + let links = collect(vec![input], None, Some(base)).await; let expected_links = HashSet::from_iter([ // the body links wouldn't be present if the file was parsed strictly as XML @@ -407,7 +423,7 @@ mod tests { excluded_paths: None, }; - let links = collect(vec![input], None).await; + let links = collect(vec![input], None, None).await; let expected_urls = HashSet::from_iter([ website("https://github.com/lycheeverse/lychee/"), @@ -426,7 +442,7 @@ mod tests { file_type_hint: None, excluded_paths: None, }; - let links = collect(vec![input], None).await; + let links = collect(vec![input], None, None).await; let expected_links = HashSet::from_iter([mail("user@example.com")]); @@ -469,7 +485,7 @@ mod tests { }, ]; - let links = collect(inputs, None).await; + let links = collect(inputs, None, None).await; let expected_links = HashSet::from_iter([ website(&format!( @@ -503,7 +519,7 @@ mod tests { excluded_paths: None, }; - let links = collect(vec![input], Some(base)).await; + let links = collect(vec![input], None, Some(base)).await; let expected_links = HashSet::from_iter([ path("/path/to/root/index.html"), diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index aa716b3f26..fe21429326 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -15,8 +15,6 @@ pub enum Base { Local(PathBuf), /// Remote URL pointing to a website homepage Remote(Url), - /// Root path for checking absolute local links - RootPath(PathBuf), } impl Base { @@ -29,10 +27,6 @@ impl Base { let full_path = path.join(link); Url::from_file_path(full_path).ok() } - Self::RootPath(_path) => { - // this is unused currently because joining on RootPath is handled by create_uri_from_file_path - unreachable!() - } } } @@ -41,16 +35,10 @@ impl Base { pub(crate) fn dir(&self) -> Option { match self { Self::Remote(_) => None, - Self::Local(d) | Self::RootPath(d) => Some(d.clone()), + Self::Local(d) => Some(d.clone()), } } - /// Create a root path base - #[must_use] - pub fn create_root_path(value: &str) -> Base { - Self::RootPath(PathBuf::from(value)) - } - pub(crate) fn from_source(source: &InputSource) -> Option { match &source { InputSource::RemoteUrl(url) => { diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 72a59b701f..536d3fa0b7 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -25,10 +25,11 @@ fn extract_credentials( fn create_request( raw_uri: &RawUri, source: &InputSource, + root_path: &Option, base: &Option, extractor: &Option, ) -> Result { - let uri = try_parse_into_uri(raw_uri, source, base)?; + let uri = try_parse_into_uri(raw_uri, source, root_path, base)?; let source = truncate_source(source); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); @@ -48,19 +49,30 @@ fn create_request( /// to create a valid URI. /// - If a URI cannot be created from the file path. /// - If the source is not a file path (i.e. the URI type is not supported). -fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result { - let text = raw_uri.text.clone(); +fn try_parse_into_uri( + raw_uri: &RawUri, + source: &InputSource, + root_path: &Option, + base: &Option, +) -> Result { + let mut text = raw_uri.text.clone(); + if text.starts_with('/') { + if let Some(path) = root_path { + // TODO (trask) should PathBuf be used to handle Windows root paths? + text = path.to_owned() + &text; + } + } let uri = match Uri::try_from(raw_uri.clone()) { Ok(uri) => uri, Err(_) => match base { - Some(Base::RootPath(_)) | None => match source { - InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?, - _ => return Err(ErrorKind::UnsupportedUriType(text)), - }, Some(base_url) => match base_url.join(&text) { Some(url) => Uri { url }, None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), }, + None => match source { + InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?, + _ => return Err(ErrorKind::UnsupportedUriType(text)), + }, }, }; Ok(uri) @@ -126,21 +138,22 @@ fn truncate_source(source: &InputSource) -> InputSource { pub(crate) fn create( uris: Vec, source: &InputSource, + root_path: &Option, base: &Option, extractor: &Option, ) -> HashSet { let base = base.clone().or_else(|| Base::from_source(&source)); uris.into_iter() - .filter_map( - |raw_uri| match create_request(&raw_uri, &source, &base, extractor) { + .filter_map(|raw_uri| { + match create_request(&raw_uri, &source, &root_path, &base, extractor) { Ok(request) => Some(request), Err(e) => { warn!("Error creating request: {:?}", e); None } - }, - ) + } + }) .collect() } @@ -202,7 +215,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &None, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -216,7 +229,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &None, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -230,7 +243,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &None, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -244,7 +257,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &None, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -258,7 +271,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &None, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -268,11 +281,13 @@ mod tests { #[test] fn test_relative_url_resolution_from_root_path() { - let base = Some(Base::create_root_path("/tmp/lychee")); + let root_path = Some("/tmp/lychee".to_string()); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &root_path, &None, &None); + + println!("{:?}", requests); assert_eq!(requests.len(), 1); assert!(requests @@ -282,11 +297,11 @@ mod tests { #[test] fn test_absolute_url_resolution_from_root_path() { - let base = Some(Base::create_root_path("/tmp/lychee")); + let root_path = Some("/tmp/lychee".to_string()); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &root_path, &None, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -296,11 +311,11 @@ mod tests { #[test] fn test_root_relative_url_resolution_from_root_path() { - let base = Some(Base::create_root_path("/tmp/lychee")); + let root_path = Some("/tmp/lychee".to_string()); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &root_path, &None, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -310,11 +325,11 @@ mod tests { #[test] fn test_parent_directory_url_resolution_from_root_path() { - let base = Some(Base::create_root_path("/tmp/lychee")); + let root_path = Some("/tmp/lychee".to_string()); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &root_path, &None, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -324,11 +339,11 @@ mod tests { #[test] fn test_fragment_url_resolution_from_root_path() { - let base = Some(Base::create_root_path("/tmp/lychee")); + let root_path = Some("/tmp/lychee".to_string()); let source = InputSource::FsPath(PathBuf::from("/some/page.html")); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &root_path, &None, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -342,7 +357,7 @@ mod tests { let source = InputSource::String(String::new()); let uris = vec![RawUri::from("https://example.com/page")]; - let requests = create(uris, &source, &base, &None); + let requests = create(uris, &source, &None, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -355,8 +370,14 @@ mod tests { let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); let input_source = InputSource::FsPath(PathBuf::from("page.html")); - let actual = - create_request(&RawUri::from("file.html"), &input_source, &base, &None).unwrap(); + let actual = create_request( + &RawUri::from("file.html"), + &input_source, + &None, + &base, + &None, + ) + .unwrap(); assert_eq!( actual, @@ -381,6 +402,7 @@ mod tests { let actual = create_request( &RawUri::from("/usr/local/share/doc/example.html"), &input_source, + &None, &base, &None, ) @@ -406,7 +428,7 @@ mod tests { let source = InputSource::String(String::new()); let raw_uri = RawUri::from("relative.html"); - let uri = try_parse_into_uri(&raw_uri, &source, &base).unwrap(); + let uri = try_parse_into_uri(&raw_uri, &source, &None, &base).unwrap(); assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html"); } @@ -417,7 +439,7 @@ mod tests { let source = InputSource::String(String::new()); let raw_uri = RawUri::from("absolute.html"); - let uri = try_parse_into_uri(&raw_uri, &source, &base).unwrap(); + let uri = try_parse_into_uri(&raw_uri, &source, &None, &base).unwrap(); assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html"); }