diff --git a/fixtures/TEST_STYLESHEET_LINK.md b/fixtures/TEST_STYLESHEET_LINK.md new file mode 100644 index 0000000000..90ad264547 --- /dev/null +++ b/fixtures/TEST_STYLESHEET_LINK.md @@ -0,0 +1 @@ + diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index fcb7530cb7..637a59d8f0 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -231,6 +231,17 @@ mod cli { Ok(()) } + #[test] + fn test_stylesheet_misinterpreted_as_email() -> Result<()> { + test_json_output!( + "TEST_STYLESHEET_LINK.md", + MockResponseStats { + total: 0, + ..MockResponseStats::default() + } + ) + } + /// Test that a GitHub link can be checked without specifying the token. #[test] fn test_check_github_no_token() -> Result<()> { diff --git a/lychee-lib/src/extract/html/html5ever.rs b/lychee-lib/src/extract/html/html5ever.rs index 0a1414855b..1d30888a21 100644 --- a/lychee-lib/src/extract/html/html5ever.rs +++ b/lychee-lib/src/extract/html/html5ever.rs @@ -92,7 +92,7 @@ impl TokenSink for LinkExtractor { return TokenSinkResult::Continue; } - for attr in attrs { + for attr in &attrs { let urls = LinkExtractor::extract_urls_from_elem_attr( &attr.name.local, &name, @@ -104,8 +104,11 @@ impl TokenSink for LinkExtractor { Some(urls) => urls .into_iter() .filter(|url| { - // Only accept email addresses, which occur in `href` attributes - // and start with `mailto:`. Technically, email addresses could + // Only accept email addresses which + // - occur in `href` attributes + // - start with `mailto:` + // + // Technically, email addresses could // also occur in plain text, but we don't want to extract those // because of the high false positive rate. // @@ -115,6 +118,18 @@ impl TokenSink for LinkExtractor { let is_phone = url.starts_with("tel:"); let is_href = attr.name.local.as_ref() == "href"; + if attrs.iter().any(|attr| { + &attr.name.local == "rel" && attr.value.contains("stylesheet") + }) { + // Skip virtual/framework-specific stylesheet paths that start with /@ or @ + // These are typically resolved by dev servers or build tools rather than being real URLs + // Examples: /@global/style.css, @tailwind/base.css as in + // `` + if url.starts_with("/@") || url.starts_with('@') { + return false; + } + } + !is_email || (is_mailto && is_href) || (is_phone && is_href) }) .map(|url| RawUri { @@ -466,4 +481,14 @@ mod tests { let uris = extract_html(input, false); assert!(uris.is_empty()); } + + #[test] + fn test_skip_emails_in_stylesheets() { + let input = r#" + + "#; + + let uris = extract_html(input, false); + assert!(uris.is_empty()); + } } diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index 5fb41be69f..d28007af00 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -183,6 +183,22 @@ impl LinkExtractor { return; } + // Skip virtual/framework-specific stylesheet paths that start with /@ or @ + // These are typically resolved by dev servers or build tools rather than being real URLs + // Examples: /@global/style.css, @tailwind/base.css + if self + .current_attributes + .get("rel") + .map_or(false, |rel| rel.contains("stylesheet")) + { + if let Some(href) = self.current_attributes.get("href") { + if href.starts_with("/@") || href.starts_with('@') { + self.current_attributes.clear(); + return; + } + } + } + let new_urls = self .extract_urls_from_elem_attr() .into_iter() @@ -662,4 +678,14 @@ mod tests { let uris = extract_html(input, false); assert!(uris.is_empty()); } + + #[test] + fn test_skip_emails_in_stylesheets() { + let input = r#" + + "#; + + let uris = extract_html(input, false); + assert!(uris.is_empty()); + } }