Skip to content

Commit

Permalink
Fix skipping of email addresses in stylesheets (#1546)
Browse files Browse the repository at this point in the history
  • Loading branch information
mre authored Oct 26, 2024
1 parent 3094bbc commit e43086c
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 3 deletions.
1 change: 1 addition & 0 deletions fixtures/TEST_STYLESHEET_LINK.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<link href="/@global/global.css" rel="stylesheet">
11 changes: 11 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,17 @@ mod cli {
Ok(())
}

/// Regression test for #1546: the `TEST_STYLESHEET_LINK.md` fixture holds a
/// single `<link href="/@global/global.css" rel="stylesheet">` element, and the
/// `@` in that stylesheet path must not be misinterpreted as an email address.
/// The expected stats therefore have `total: 0`, with every other field left
/// at its default.
#[test]
fn test_stylesheet_misinterpreted_as_email() -> Result<()> {
test_json_output!(
"TEST_STYLESHEET_LINK.md",
MockResponseStats {
total: 0,
..MockResponseStats::default()
}
)
}

/// Test that a GitHub link can be checked without specifying the token.
#[test]
fn test_check_github_no_token() -> Result<()> {
Expand Down
31 changes: 28 additions & 3 deletions lychee-lib/src/extract/html/html5ever.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ impl TokenSink for LinkExtractor {
return TokenSinkResult::Continue;
}

for attr in attrs {
for attr in &attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
&attr.name.local,
&name,
Expand All @@ -104,8 +104,11 @@ impl TokenSink for LinkExtractor {
Some(urls) => urls
.into_iter()
.filter(|url| {
// Only accept email addresses, which occur in `href` attributes
// and start with `mailto:`. Technically, email addresses could
// Only accept email addresses which
// - occur in `href` attributes
// - start with `mailto:`
//
// Technically, email addresses could
// also occur in plain text, but we don't want to extract those
// because of the high false positive rate.
//
Expand All @@ -115,6 +118,18 @@ impl TokenSink for LinkExtractor {
let is_phone = url.starts_with("tel:");
let is_href = attr.name.local.as_ref() == "href";

if attrs.iter().any(|attr| {
&attr.name.local == "rel" && attr.value.contains("stylesheet")
}) {
// Skip virtual/framework-specific stylesheet paths that start with /@ or @
// These are typically resolved by dev servers or build tools rather than being real URLs
// Examples: /@global/style.css, @tailwind/base.css as in
// `<link href="/@global/style.css" rel="stylesheet">`
if url.starts_with("/@") || url.starts_with('@') {
return false;
}
}

!is_email || (is_mailto && is_href) || (is_phone && is_href)
})
.map(|url| RawUri {
Expand Down Expand Up @@ -466,4 +481,14 @@ mod tests {
let uris = extract_html(input, false);
assert!(uris.is_empty());
}

/// A stylesheet `<link>` whose `href` starts with `/@` must not yield any
/// extracted URIs (in particular, it must not be treated as an email address).
#[test]
fn test_skip_emails_in_stylesheets() {
    let html = r#"
<link href="/@global/global.css" rel="stylesheet">
"#;

    assert!(extract_html(html, false).is_empty());
}
}
26 changes: 26 additions & 0 deletions lychee-lib/src/extract/html/html5gum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,22 @@ impl LinkExtractor {
return;
}

// Skip virtual/framework-specific stylesheet paths that start with /@ or @
// These are typically resolved by dev servers or build tools rather than being real URLs
// Examples: /@global/style.css, @tailwind/base.css
if self
.current_attributes
.get("rel")
.map_or(false, |rel| rel.contains("stylesheet"))
{
if let Some(href) = self.current_attributes.get("href") {
if href.starts_with("/@") || href.starts_with('@') {
self.current_attributes.clear();
return;
}
}
}

let new_urls = self
.extract_urls_from_elem_attr()
.into_iter()
Expand Down Expand Up @@ -662,4 +678,14 @@ mod tests {
let uris = extract_html(input, false);
assert!(uris.is_empty());
}

/// Extracting from a `<link rel="stylesheet">` element whose `href` begins
/// with `/@` must produce an empty result: the path is neither a link to
/// check nor an email address.
#[test]
fn test_skip_emails_in_stylesheets() {
    let markup = r#"
<link href="/@global/global.css" rel="stylesheet">
"#;

    let extracted = extract_html(markup, false);
    assert!(extracted.is_empty());
}
}

0 comments on commit e43086c

Please sign in to comment.