lychee_lib/extract/html/
html5ever.rs

1use std::cell::RefCell;
2
3use html5ever::{
4    buffer_queue::BufferQueue,
5    tendril::{StrTendril, Tendril, fmt::UTF8},
6    tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
7};
8
9use super::{
10    super::{css::extract_css_with_default_span, plaintext::extract_raw_uri_from_plaintext},
11    is_email_link, is_verbatim_elem, srcset,
12};
13use crate::types::uri::raw::{RawUri, RawUriSpan, SourceSpanProvider, SpanProvider};
14
15/// A [`SpanProvider`] which applies a given line offset.
16struct LineOffsetSpanProvider<'a> {
17    /// The number of lines each span will be offset by.
18    lines_before: usize,
19    /// The inner [`SpanProvider`] which will be responsible for computing the spans.
20    inner: &'a SourceSpanProvider<'a>,
21}
22
23impl SpanProvider for LineOffsetSpanProvider<'_> {
24    fn span(&self, offset: usize) -> RawUriSpan {
25        let mut span = self.inner.span(offset);
26        // if we stay in the same line the column information is wrong, since we didn't know the
27        // column beforehand and likely did not start at a linebreak.
28        // This can be improved in the future by using the computed length of lines.
29        if span.line.get() == 1 {
30            span.column = None;
31        }
32        span.line = span
33            .line
34            .saturating_add(self.lines_before.saturating_sub(1));
35        span
36    }
37}
38
39#[derive(Clone)]
40struct LinkExtractor {
41    links: RefCell<Vec<RawUri>>,
42    include_verbatim: bool,
43    current_verbatim_element_name: RefCell<Option<String>>,
44    /// Whether we're currently inside a `<style>` tag.
45    in_style_tag: RefCell<bool>,
46    /// Accumulated CSS content from within a `<style>` tag.
47    style_content: RefCell<String>,
48}
49
50impl TokenSink for LinkExtractor {
51    type Handle = ();
52
53    #[allow(clippy::match_same_arms)]
54    fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<()> {
55        debug_assert_ne!(line_number, 0);
56        let line_number =
57            usize::try_from(line_number).expect("Unable to convert u64 line_number to usize");
58
59        match token {
60            Token::CharacterTokens(raw) => {
61                // If we're inside a style tag, accumulate the CSS content
62                if *self.in_style_tag.borrow() {
63                    self.style_content.borrow_mut().push_str(&raw);
64                    return TokenSinkResult::Continue;
65                }
66
67                if self.current_verbatim_element_name.borrow().is_some() {
68                    return TokenSinkResult::Continue;
69                }
70                if self.include_verbatim {
71                    self.links
72                        .borrow_mut()
73                        .extend(extract_raw_uri_from_plaintext(
74                            &raw,
75                            &LineOffsetSpanProvider {
76                                lines_before: respect_multiline_tendril(line_number, &raw),
77                                inner: &SourceSpanProvider::from_input(&raw),
78                            },
79                        ));
80                }
81            }
82            Token::TagToken(tag) => return self.process_tag(tag, line_number),
83            Token::ParseError(_err) => {
84                // Silently ignore parse errors
85            }
86            Token::CommentToken(_raw) => (),
87            Token::NullCharacterToken => (),
88            Token::DoctypeToken(_doctype) => (),
89            Token::EOFToken => (),
90        }
91        TokenSinkResult::Continue
92    }
93}
94
95/// Offset line number by line breaks included in the raw text.
96/// This is necessary since html5ever version 0.35.0.
97/// Previously html5ever did not supply us with multiline `Tendril`s.
98fn respect_multiline_tendril(line_number: usize, raw: &Tendril<UTF8>) -> usize {
99    line_number.saturating_sub(raw.chars().filter(|c| *c == '\n').count())
100}
101
102impl LinkExtractor {
103    pub(crate) const fn new(include_verbatim: bool) -> Self {
104        Self {
105            links: RefCell::new(Vec::new()),
106            include_verbatim,
107            current_verbatim_element_name: RefCell::new(None),
108            in_style_tag: RefCell::new(false),
109            style_content: RefCell::new(String::new()),
110        }
111    }
112
113    fn process_tag(
114        &self,
115        Tag {
116            kind,
117            name,
118            self_closing: _,
119            attrs,
120        }: Tag,
121        line_number: usize,
122    ) -> TokenSinkResult<()> {
123        // Handle style tags for CSS URL extraction
124        if &name == "style" {
125            match kind {
126                TagKind::StartTag => {
127                    *self.in_style_tag.borrow_mut() = true;
128                    self.style_content.borrow_mut().clear();
129                }
130                TagKind::EndTag => {
131                    *self.in_style_tag.borrow_mut() = false;
132                    // Extract CSS URLs from the accumulated style content
133                    let css_content = self.style_content.borrow();
134                    let css_urls = extract_css_with_default_span(&css_content);
135                    self.links.borrow_mut().extend(css_urls);
136                    self.style_content.borrow_mut().clear();
137                }
138            }
139        }
140
141        // Check if this is a verbatim element, which we want to skip.
142        if !self.include_verbatim && is_verbatim_elem(&name) {
143            // Check if we're currently inside a verbatim block
144            let mut curr_verbatim_elem = self.current_verbatim_element_name.borrow_mut();
145
146            if curr_verbatim_elem.is_some() {
147                // Inside a verbatim block. Check if the verbatim
148                // element name matches with the current element name.
149                if curr_verbatim_elem.as_ref() == Some(&name.to_string()) {
150                    // If so, we're done with the verbatim block,
151                    // -- but only if this is an end tag.
152                    if matches!(kind, TagKind::EndTag) {
153                        *curr_verbatim_elem = None;
154                    }
155                }
156            } else if matches!(kind, TagKind::StartTag) {
157                // We're not inside a verbatim block, but we just
158                // encountered a verbatim element. Remember the name
159                // of the element.
160                *curr_verbatim_elem = Some(name.to_string());
161            }
162        }
163        if self.current_verbatim_element_name.borrow().is_some() {
164            // We want to skip the content of this element
165            // as we're inside a verbatim block.
166            return TokenSinkResult::Continue;
167        }
168
169        // Check for rel=nofollow. We only extract the first `rel` attribute.
170        // This is correct as per https://html.spec.whatwg.org/multipage/syntax.html#attributes-0, which states
171        // "There must never be two or more attributes on the same start tag whose names are an ASCII case-insensitive match for each other."
172        if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel")
173            && rel.value.contains("nofollow")
174        {
175            return TokenSinkResult::Continue;
176        }
177
178        // Check and exclude `rel=preconnect` and `rel=dns-prefetch`. Unlike `prefetch` and `preload`,
179        // `preconnect` and `dns-prefetch` only perform DNS lookups and do not necessarily link to a resource
180        if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel")
181            && (rel.value.contains("preconnect") || rel.value.contains("dns-prefetch"))
182        {
183            return TokenSinkResult::Continue;
184        }
185
186        // Check and exclude `prefix` attribute. This attribute is used to define a prefix
187        // for the current element. It is not used to link to a resource.
188        if let Some(_prefix) = attrs.iter().find(|attr| &attr.name.local == "prefix") {
189            return TokenSinkResult::Continue;
190        }
191
192        for attr in &attrs {
193            let urls =
194                LinkExtractor::extract_urls_from_elem_attr(&attr.name.local, &name, &attr.value);
195
196            let new_urls = match urls {
197                None => extract_raw_uri_from_plaintext(
198                    &attr.value,
199                    &LineOffsetSpanProvider {
200                        lines_before: line_number,
201                        inner: &SourceSpanProvider::from_input(&attr.value),
202                    },
203                ),
204                Some(urls) => urls
205                    .into_iter()
206                    .filter(|url| {
207                        // Only accept email addresses which
208                        // - occur in `href` attributes
209                        // - start with `mailto:`
210                        //
211                        // Technically, email addresses could
212                        // also occur in plain text, but we don't want to extract those
213                        // because of the high false positive rate.
214                        //
215                        // This ignores links like `<img srcset="v2@1.5x.png">`
216                        let is_email = is_email_link(url);
217                        let is_mailto = url.starts_with("mailto:");
218                        let is_phone = url.starts_with("tel:");
219                        let is_href = attr.name.local.as_ref() == "href";
220
221                        if attrs.iter().any(|attr| {
222                            &attr.name.local == "rel" && attr.value.contains("stylesheet")
223                        }) {
224                            // Skip virtual/framework-specific stylesheet paths that start with /@ or @
225                            // These are typically resolved by dev servers or build tools rather than being real URLs
226                            // Examples: /@global/style.css, @tailwind/base.css as in
227                            // `<link href="/@global/style.css" rel="stylesheet">`
228                            if url.starts_with("/@") || url.starts_with('@') {
229                                return false;
230                            }
231                            // Skip disabled stylesheets
232                            // Ref: https://developer.mozilla.org/en-US/docs/Web/API/HTMLLinkElement/disabled
233                            if attrs.iter().any(|attr| &attr.name.local == "disabled") {
234                                return false;
235                            }
236                        }
237
238                        !is_email || (is_mailto && is_href) || (is_phone && is_href)
239                    })
240                    .map(|url| RawUri {
241                        text: url.to_string(),
242                        element: Some(name.to_string()),
243                        attribute: Some(attr.name.local.to_string()),
244                        span: RawUriSpan {
245                            line: line_number
246                                .try_into()
247                                .expect("checked above that `line_number != 0`"),
248                            column: None,
249                        },
250                    })
251                    .collect::<Vec<_>>(),
252            };
253            self.links.borrow_mut().extend(new_urls);
254        }
255        TokenSinkResult::Continue
256    }
257
258    /// Extract all semantically known links from a given HTML attribute.
259    #[allow(clippy::unnested_or_patterns)]
260    pub(crate) fn extract_urls_from_elem_attr<'a>(
261        attr_name: &str,
262        elem_name: &str,
263        attr_value: &'a str,
264    ) -> Option<impl Iterator<Item = &'a str> + use<'a>> {
265        // For a comprehensive list of elements that might contain URLs/URIs
266        // see https://www.w3.org/TR/REC-html40/index/attributes.html
267        // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
268
269        match (elem_name, attr_name) {
270
271            // Common element/attribute combinations for links
272            (_, "href" | "src" | "cite" | "usemap")
273            // Less common (but still valid!) combinations
274            | ("applet", "codebase")
275            | ("body", "background")
276            | ("button", "formaction")
277            | ("command", "icon")
278            | ("form", "action")
279            | ("frame", "longdesc")
280            | ("head", "profile")
281            | ("html", "manifest")
282            | ("iframe", "longdesc")
283            | ("img", "longdesc")
284            | ("input", "formaction")
285            | ("object", "classid")
286            | ("object", "codebase")
287            | ("object", "data")
288            | ("video", "poster") => {
289                Some(vec![attr_value].into_iter())
290            }
291            (_, "srcset") => {
292                Some(srcset::parse(attr_value).into_iter())
293            }
294            _ => None,
295        }
296    }
297}
298
299/// Extract unparsed URL strings from an HTML string.
300pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
301    let input = BufferQueue::default();
302    input.push_back(StrTendril::from(buf));
303
304    let tokenizer = Tokenizer::new(
305        LinkExtractor::new(include_verbatim),
306        TokenizerOpts::default(),
307    );
308    let _handle = tokenizer.feed(&input);
309    tokenizer.end();
310
311    tokenizer.sink.links.into_inner()
312}
313
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::{span, span_line};

    use super::*;

    /// Shared fixture mixing inline code, a `<pre>` block and regular links.
    const HTML_INPUT: &str = r#"
<html>
    <body>
        <p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
        <pre>
        Some random text
        https://foo.com and http://bar.com/some/path
        Something else
        <a href="https://baz.org">example link inside pre</a>
        </pre>
        <p><b>bold</b></p>
    </body>
</html>"#;

    /// Expected `RawUri` for a link taken from `attribute` of `element`.
    fn attr_uri(text: &str, element: &str, attribute: &str, span: RawUriSpan) -> RawUri {
        RawUri {
            text: text.to_string(),
            element: Some(element.to_string()),
            attribute: Some(attribute.to_string()),
            span,
        }
    }

    /// Expected `RawUri` for a link found in plain text.
    fn text_uri(text: &str, span: RawUriSpan) -> RawUri {
        RawUri {
            text: text.to_string(),
            element: None,
            attribute: None,
            span,
        }
    }

    #[test]
    fn test_skip_verbatim() {
        let expected = vec![attr_uri("https://example.org", "a", "href", span_line(4))];

        assert_eq!(extract_html(HTML_INPUT, false), expected);
    }

    #[test]
    fn test_include_verbatim() {
        let expected = vec![
            text_uri("https://example.com", span_line(4)),
            attr_uri("https://example.org", "a", "href", span_line(4)),
            text_uri("https://foo.com", span(7, 9)),
            text_uri("http://bar.com/some/path", span(7, 29)),
            attr_uri("https://baz.org", "a", "href", span_line(9)),
        ];

        assert_eq!(extract_html(HTML_INPUT, true), expected);
    }

    #[test]
    fn test_include_verbatim_recursive() {
        const NESTED_VERBATIM: &str = r#"
        <a href="https://example.com/">valid link</a>
        <code>
            <pre>
                <span>https://example.org</span>
            </pre>
        </code>
        "#;

        let expected = vec![attr_uri("https://example.com/", "a", "href", span_line(2))];

        assert_eq!(extract_html(NESTED_VERBATIM, false), expected);
    }

    #[test]
    fn test_include_nofollow() {
        let input = r#"
        <a rel="nofollow" href="https://foo.com">do not follow me</a>
        <a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
        <a href="https://example.org">do not follow me</a>
        "#;
        let expected = vec![attr_uri("https://example.org", "a", "href", span_line(4))];

        assert_eq!(extract_html(input, false), expected);
    }

    #[test]
    fn test_exclude_script_tags() {
        let input = r#"
        <script>
        var foo = "https://example.com";
        </script>
        <a href="https://example.org">i'm fine</a>
        "#;
        let expected = vec![attr_uri("https://example.org", "a", "href", span_line(5))];

        assert_eq!(extract_html(input, false), expected);
    }

    #[test]
    fn test_exclude_disabled_stylesheet() {
        let input = r#"
        <link rel="stylesheet" href="https://disabled.com" disabled>
        <link rel="stylesheet" href="https://disabled.com" disabled="disabled">
        <a href="https://example.org">i'm fine</a>
        "#;
        let expected = vec![attr_uri("https://example.org", "a", "href", span_line(4))];

        assert_eq!(extract_html(input, false), expected);
    }

    #[test]
    fn test_valid_email() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="mailto:foo@bar.com">
          </body>
        </html>"#;

        let expected = vec![attr_uri("mailto:foo@bar.com", "a", "href", span_line(8))];

        assert_eq!(extract_html(input, false), expected);
    }

    #[test]
    fn test_valid_tel() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="tel:1234567890">
          </body>
        </html>"#;

        let expected = vec![attr_uri("tel:1234567890", "a", "href", span_line(8))];

        assert_eq!(extract_html(input, false), expected);
    }

    #[test]
    fn test_exclude_email_without_mailto() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="foo@bar.com">
          </body>
        </html>"#;

        assert_eq!(extract_html(input, false), Vec::<RawUri>::new());
    }

    #[test]
    fn test_email_false_positive() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
          </body>
        </html>"#;

        assert_eq!(extract_html(input, false), Vec::<RawUri>::new());
    }

    #[test]
    fn test_skip_preconnect() {
        let input = r#"
            <link rel="preconnect" href="https://example.com">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_preconnect_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="preconnect">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_prefix() {
        let input = r#"
            <html lang="en-EN" prefix="og: https://ogp.me/ns#">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_ignore_text_content_links() {
        let input = r#"
            <a href="https://example.com">https://ignoreme.com</a>
        "#;
        let expected = vec![attr_uri("https://example.com", "a", "href", span_line(2))];

        assert_eq!(extract_html(input, false), expected);
    }

    #[test]
    fn test_skip_dns_prefetch() {
        let input = r#"
            <link rel="dns-prefetch" href="https://example.com">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_dns_prefetch_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="dns-prefetch">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_emails_in_stylesheets() {
        let input = r#"
            <link href="/@global/global.css" rel="stylesheet">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_extract_links_after_empty_verbatim_block() {
        // Links following an empty `<pre><code>` block must still be extracted.
        let input = r#"
        <body>
            <div>
                See <a href="https://example.com/1">First</a>
            </div>
            <pre>
                <code></code>
            </pre>
            <div>
                See <a href="https://example.com/2">Second</a>
            </div>
        </body>
        "#;

        let expected = vec![
            attr_uri("https://example.com/1", "a", "href", span_line(4)),
            attr_uri("https://example.com/2", "a", "href", span_line(10)),
        ];

        assert_eq!(extract_html(input, false), expected);
    }
}
lychee_lib/extract/html/html5ever.rs

lychee_lib/extract/html/
html5ever.rs