Skip to main content

lychee_lib/extract/html/
html5gum.rs

1use html5gum::{
2    Spanned, Tokenizer,
3    emitters::callback::{Callback, CallbackEmitter, CallbackEvent},
4};
5use std::collections::{HashMap, HashSet};
6
7use super::{is_email_link, is_verbatim_elem, srcset};
8use crate::{
9    extract::{css::extract_css, plaintext::extract_raw_uri_from_plaintext},
10    types::uri::raw::{OffsetSpanProvider, RawUri, SourceSpanProvider, SpanProvider},
11};
12
13/// Extract links from HTML documents.
14///
15/// This is the main driver for the html5gum tokenizer.
16/// It implements the `Emitter` trait, which is used by the tokenizer to
17/// communicate with the caller.
18///
19/// The `LinkExtractor` keeps track of the current element being processed,
20/// the current attribute being processed, and a bunch of plain characters
21/// currently being processed.
22///
23/// The `links` vector contains all links extracted from the HTML document and
24/// the `fragments` set contains all fragments extracted from the HTML document.
25#[derive(Clone, Debug)]
26struct LinkExtractor<S: SpanProvider> {
27    /// The [`SpanProvider`] which will be used to compute spans for URIs.
28    ///
29    /// This is generic, since e.g. the markdown parser has already started, so we have to compute
30    /// the span location in relation to the offset in the outer document.
31    span_provider: S,
32    /// Links extracted from the HTML document.
33    links: Vec<RawUri>,
34    /// Fragments extracted from the HTML document.
35    fragments: HashSet<String>,
36    /// Whether to include verbatim elements in the output.
37    include_verbatim: bool,
38    /// Current element name being processed.
39    /// This is called a tag in html5gum.
40    current_element: String,
41    /// Current attributes being processed.
42    /// This is a list of key-value pairs (in order of appearance), where the key is the attribute name
43    /// and the value is the attribute value.
44    current_attributes: HashMap<String, Spanned<String>>,
45    /// Current attribute name being processed.
46    current_attribute_name: String,
47    /// Element name of the current verbatim block.
48    /// Used to keep track of nested verbatim blocks.
49    verbatim_stack: Vec<String>,
50    /// Whether we're currently inside a `<style>` tag.
51    in_style_tag: bool,
52    /// Accumulated CSS content from within a `<style>` tag.
53    style_content: String,
54    /// Start offset of the style tag content (for span calculation).
55    style_content_offset: usize,
56}
57
58impl<S: SpanProvider> LinkExtractor<S> {
59    /// Create a new `LinkExtractor`.
60    ///
61    /// Set `include_verbatim` to `true` if you want to include verbatim
62    /// elements in the output.
63    fn new(span_provider: S, include_verbatim: bool) -> Self {
64        Self {
65            span_provider,
66            include_verbatim,
67            links: Vec::default(),
68            fragments: HashSet::default(),
69            current_element: String::default(),
70            current_attributes: HashMap::default(),
71            current_attribute_name: String::default(),
72            verbatim_stack: Vec::default(),
73            in_style_tag: false,
74            style_content: String::default(),
75            style_content_offset: 0,
76        }
77    }
78
79    /// Extract all semantically known links from a given HTML attribute.
80    // For a comprehensive list of elements that might contain URLs/URIs
81    // see https://www.w3.org/TR/REC-html40/index/attributes.html
82    // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1
83    fn extract_urls_from_elem_attr(&self) -> Vec<RawUri> {
84        let mut urls = Vec::new();
85
86        // Process 'srcset' attribute first
87        if let Some(srcset) = self.current_attributes.get("srcset") {
88            let span = srcset.span;
89            urls.extend(srcset::parse(srcset).into_iter().map(|url| RawUri {
90                text: url.to_string(),
91                element: Some(self.current_element.clone()),
92                attribute: Some("srcset".to_string()),
93                span: self.span_provider.span(span.start),
94            }));
95        }
96
97        // Process other attributes
98        for (attr_name, attr_value) in &self.current_attributes {
99            #[allow(clippy::unnested_or_patterns)]
100            match (self.current_element.as_str(), attr_name.as_str()) {
101                // Common element/attribute combinations for links
102                (_, "href" | "src" | "cite" | "usemap") |
103                // Less common (but still valid!) combinations
104                ("applet", "codebase") |
105                ("body", "background") |
106                ("button", "formaction") |
107                ("command", "icon") |
108                ("form", "action") |
109                ("frame", "longdesc") |
110                ("head", "profile") |
111                ("html", "manifest") |
112                ("iframe", "longdesc") |
113                ("img", "longdesc") |
114                ("input", "formaction") |
115                ("object", "classid" | "codebase" | "data") |
116                ("video", "poster") => {
117                    urls.push(RawUri {
118                        text: attr_value.to_string(),
119                        element: Some(self.current_element.clone()),
120                        attribute: Some(attr_name.clone()),
121                        span: self.span_provider.span(attr_value.span.start),
122                    });
123                }
124                _ => {}
125            }
126        }
127
128        urls
129    }
130
131    /// Check if we should filter out links in the current context due to being
132    /// inside a verbatim element.
133    fn filter_verbatim_here(&self) -> bool {
134        !self.include_verbatim
135            && (is_verbatim_elem(&self.current_element) || !self.verbatim_stack.is_empty())
136    }
137
138    /// Flush the current element and attribute values to the links vector.
139    ///
140    /// This function is called whenever a new element is encountered or when the
141    /// current element is closing. It extracts URLs from the current attribute value
142    /// and adds them to the links vector.
143    ///
144    /// Here are the rules for extracting links:
145    /// - If the current element has a `rel=nofollow` attribute, the current attribute
146    ///   value is ignored.
147    /// - If the current element has a `rel=preconnect` or `rel=dns-prefetch`
148    ///   attribute, the current attribute value is ignored.
149    /// - If the current attribute value is not a URL, it is treated as plain text and
150    ///   added to the links vector.
151    /// - If the current attribute name is `id`, the current attribute value is added
152    ///   to the fragments set.
153    ///
154    /// The current attribute name and value are cleared after processing.
155    fn flush_links(&mut self) {
156        if self.filter_verbatim_here() {
157            self.current_attributes.clear();
158            return;
159        }
160
161        if self.current_attributes.get("rel").is_some_and(|rel| {
162            rel.split(',').any(|r| {
163                r.trim() == "nofollow" || r.trim() == "preconnect" || r.trim() == "dns-prefetch"
164            })
165        }) {
166            self.current_attributes.clear();
167            return;
168        }
169
170        if self.current_attributes.contains_key("prefix") {
171            self.current_attributes.clear();
172            return;
173        }
174
175        // Skip virtual/framework-specific stylesheet paths that start with /@ or @
176        // These are typically resolved by dev servers or build tools rather than being real URLs
177        // Examples: /@global/style.css, @tailwind/base.css
178        if self
179            .current_attributes
180            .get("rel")
181            .is_some_and(|rel| rel.contains("stylesheet"))
182        {
183            if let Some(href) = self.current_attributes.get("href")
184                && (href.starts_with("/@") || href.starts_with('@'))
185            {
186                self.current_attributes.clear();
187                return;
188            }
189            // Skip disabled stylesheets
190            // Ref: https://developer.mozilla.org/en-US/docs/Web/API/HTMLLinkElement/disabled
191            if self.current_attribute_name == "disabled"
192                || self.current_attributes.contains_key("disabled")
193            {
194                self.current_attributes.clear();
195                return;
196            }
197        }
198
199        let new_urls = self
200            .extract_urls_from_elem_attr()
201            .into_iter()
202            .filter(|url| {
203                // Only accept email addresses or phone numbers, which
204                // occur in `href` attributes and start with `mailto:`
205                // or `tel:`, respectively
206                //
207                // Technically, email addresses could also occur in
208                // plain text, but we don't want to extract those
209                // because of the high false-positive rate.
210                //
211                // This skips links like `<img srcset="v2@1.5x.png">`
212                let is_email = is_email_link(&url.text);
213                let is_mailto = url.text.starts_with("mailto:");
214                let is_phone = url.text.starts_with("tel:");
215                let is_href = url.attribute.as_deref() == Some("href");
216
217                !is_email || (is_mailto && is_href) || (is_phone && is_href)
218            })
219            .collect::<Vec<_>>();
220
221        self.links.extend(new_urls);
222
223        if let Some(id) = self.current_attributes.get("id") {
224            self.fragments.insert(id.to_string());
225        }
226
227        // Also check for 'name' attributes for backward compatibility with older HTML
228        // standards. In HTML 4.01, both id and name could be used. This is not valid HTML 5,
229        // but it's still used by some widely deployed tools, for example:
230        //
231        // - JavaDoc - Oracle's tool generates <a name="anchor"> for method signatures and classes
232        //   (see https://docs.oracle.com/javase/8/docs/technotes/tools/windows/javadoc.html)
233        // - Doxygen - C++ documentation generator supports <A NAME="..."> in HTML commands
234        //   (see https://www.doxygen.nl/manual/htmlcmds.html)
235        //
236        // See https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements/a#name
237        // See https://stackoverflow.com/a/484781
238        if let Some(name) = self.current_attributes.get("name") {
239            self.fragments.insert(name.to_string());
240        }
241        self.current_attributes.clear();
242    }
243}
244
245impl<S: SpanProvider> Callback<(), usize> for &mut LinkExtractor<S> {
246    fn handle_event(
247        &mut self,
248        event: CallbackEvent<'_>,
249        span: html5gum::Span<usize>,
250    ) -> Option<()> {
251        match event {
252            CallbackEvent::OpenStartTag { name } => {
253                self.current_element = String::from_utf8_lossy(name).into_owned();
254
255                // Check if we're entering a style tag
256                if self.current_element == "style" {
257                    self.in_style_tag = true;
258                    self.style_content.clear();
259                }
260
261                // Update the current verbatim element name.
262                //
263                // Keeps track of the last verbatim element name, so that we can
264                // properly handle nested verbatim blocks.
265                if self.filter_verbatim_here() && is_verbatim_elem(&self.current_element) {
266                    self.verbatim_stack.push(self.current_element.clone());
267                }
268            }
269            CallbackEvent::AttributeName { name } => {
270                self.current_attribute_name = String::from_utf8_lossy(name).into_owned();
271            }
272            CallbackEvent::AttributeValue { value } => {
273                let value = String::from_utf8_lossy(value);
274                self.current_attributes
275                    .entry(self.current_attribute_name.clone())
276                    .and_modify(|v| v.push_str(&value))
277                    .or_insert_with(|| Spanned {
278                        value: value.into_owned(),
279                        span,
280                    });
281            }
282            CallbackEvent::CloseStartTag { self_closing } => {
283                self.flush_links();
284
285                // Update the current verbatim element name.
286                //
287                // Keeps track of the last verbatim element name, so that we can
288                // properly handle nested verbatim blocks.
289                if self_closing
290                    && self.filter_verbatim_here()
291                    && let Some(last_verbatim) = self.verbatim_stack.last()
292                    && last_verbatim == &self.current_element
293                {
294                    self.verbatim_stack.pop();
295                }
296            }
297            CallbackEvent::EndTag { name } => {
298                let tag_name = String::from_utf8_lossy(name);
299
300                // Extract CSS URLs when closing a style tag
301                if tag_name == "style" && self.in_style_tag {
302                    self.in_style_tag = false;
303                    let css_urls = extract_css(
304                        &self.style_content,
305                        &OffsetSpanProvider {
306                            offset: self.style_content_offset,
307                            inner: &self.span_provider,
308                        },
309                    );
310                    self.links.extend(css_urls);
311                    self.style_content.clear();
312                }
313
314                // Update the current verbatim element name.
315                //
316                // Keeps track of the last verbatim element name, so that we can
317                // properly handle nested verbatim blocks.
318                if !self.include_verbatim
319                    && let Some(last_verbatim) = self.verbatim_stack.last()
320                    && last_verbatim == tag_name.as_ref()
321                {
322                    self.verbatim_stack.pop();
323                }
324            }
325            CallbackEvent::String { value } => {
326                // If we're inside a style tag, accumulate the CSS content
327                if self.in_style_tag {
328                    if self.style_content.is_empty() {
329                        // Record the start offset of the style content
330                        self.style_content_offset = span.start;
331                    }
332                    self.style_content.push_str(&String::from_utf8_lossy(value));
333                    return None;
334                }
335
336                if !self.filter_verbatim_here() {
337                    // Extract links from the current string and add them to the links vector.
338                    self.links.extend(extract_raw_uri_from_plaintext(
339                        &String::from_utf8_lossy(value),
340                        &OffsetSpanProvider {
341                            offset: span.start,
342                            inner: &self.span_provider,
343                        },
344                    ));
345                }
346            }
347            CallbackEvent::Comment { .. }
348            | CallbackEvent::Doctype { .. }
349            | CallbackEvent::Error(_) => {}
350        }
351        None
352    }
353}
354
355/// Extract unparsed URL strings from an HTML string.
356pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
357    extract_html_with_span(buf, include_verbatim, SourceSpanProvider::from_input(buf))
358}
359
360pub(crate) fn extract_html_with_span<S: SpanProvider>(
361    buf: &str,
362    include_verbatim: bool,
363    span_provider: S,
364) -> Vec<RawUri> {
365    let mut extractor = LinkExtractor::new(span_provider, include_verbatim);
366    let mut tokenizer = Tokenizer::new_with_emitter(buf, CallbackEmitter::new(&mut extractor));
367    assert!(tokenizer.next().is_none());
368    extractor
369        .links
370        .into_iter()
371        .filter(|link| link.attribute.is_some() || include_verbatim)
372        .collect()
373}
374
375/// Extract fragments from id attributes within a HTML string.
376pub(crate) fn extract_html_fragments(buf: &str) -> HashSet<String> {
377    let span_provider = SourceSpanProvider::from_input(buf);
378    let mut extractor = LinkExtractor::new(span_provider, true);
379    let mut tokenizer = Tokenizer::new_with_emitter(buf, CallbackEmitter::new(&mut extractor));
380    assert!(tokenizer.next().is_none());
381    extractor.fragments
382}
383
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::span;

    use super::*;

    // Shared fixture. The expected spans below depend on its exact layout.
    const HTML_INPUT: &str = r#"
<html>
    <body id="content">
        <p>This is a paragraph with some inline <code id="inline-code">https://example.com</code> and a normal <a href="https://example.org">example</a></p>
        <pre>
        Some random text
        https://foo.com and http://bar.com/some/path
        Something else
        <a href="https://baz.org">example link inside pre</a>
        </pre>
        <p id="emphasis"><b>bold</b></p>
    </body>
</html>"#;

    #[test]
    fn test_extract_fragments() {
        let want: HashSet<String> = ["content", "inline-code", "emphasis"]
            .into_iter()
            .map(str::to_string)
            .collect();
        assert_eq!(extract_html_fragments(HTML_INPUT), want);
    }

    #[test]
    fn test_skip_verbatim() {
        let want = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 121),
        }];

        assert_eq!(extract_html(HTML_INPUT, false), want);
    }

    #[test]
    fn test_include_verbatim() {
        let want = vec![
            RawUri {
                text: "https://example.com".to_string(),
                element: None,
                attribute: None,
                span: span(4, 72),
            },
            RawUri {
                text: "https://example.org".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 121),
            },
            RawUri {
                text: "https://foo.com".to_string(),
                element: None,
                attribute: None,
                span: span(7, 9),
            },
            RawUri {
                text: "http://bar.com/some/path".to_string(),
                element: None,
                attribute: None,
                span: span(7, 29),
            },
            RawUri {
                text: "https://baz.org".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(9, 18),
            },
        ];

        assert_eq!(extract_html(HTML_INPUT, true), want);
    }

    #[test]
    fn test_include_verbatim_nested() {
        const HTML_INPUT: &str = r#"
        <a href="https://example.com/">valid link</a>
        <code>
            <pre>
                <span>https://example.org</span>
            </pre>
        </code>
        "#;

        let want = vec![RawUri {
            text: "https://example.com/".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(2, 18),
        }];

        assert_eq!(extract_html(HTML_INPUT, false), want);
    }

    #[test]
    fn test_include_verbatim_nested_identical() {
        const HTML_INPUT: &str = r#"
        <pre>
            <pre>
            </pre>
            <a href="https://example.org">invalid link</a>
        </pre>
        "#;

        assert!(extract_html(HTML_INPUT, false).is_empty());
    }

    #[test]
    fn test_exclude_nofollow() {
        let input = r#"
        <a rel="nofollow" href="https://foo.com">do not follow me</a>
        <a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
        <a href="https://example.org">i'm fine</a>
        "#;
        let want = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 18),
        }];

        assert_eq!(extract_html(input, false), want);
    }

    #[test]
    fn test_exclude_nofollow_change_order() {
        let input = r#"
        <a href="https://foo.com" rel="nofollow">do not follow me</a>
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_exclude_script_tags() {
        let input = r#"
        <script>
        var foo = "https://example.com";
        </script>
        <a href="https://example.org">i'm fine</a>
        "#;
        let want = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(5, 18),
        }];

        assert_eq!(extract_html(input, false), want);
    }

    #[test]
    fn test_exclude_disabled_stylesheet() {
        let input = r#"
        <link rel="stylesheet" href="https://disabled.com" disabled>
        <link rel="stylesheet" href="https://disabled.com" disabled="disabled">
        <a href="https://example.org">i'm fine</a>
        "#;
        let want = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 18),
        }];

        assert_eq!(extract_html(input, false), want);
    }

    #[test]
    fn test_valid_tel() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="tel:1234567890">
          </body>
        </html>"#;

        let want = vec![RawUri {
            text: "tel:1234567890".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(8, 22),
        }];

        assert_eq!(extract_html(input, false), want);
    }

    #[test]
    fn test_valid_email() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="mailto:foo@bar.com">
          </body>
        </html>"#;

        let want = vec![RawUri {
            text: "mailto:foo@bar.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(8, 22),
        }];

        assert_eq!(extract_html(input, false), want);
    }

    #[test]
    fn test_exclude_email_without_mailto() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="foo@bar.com">
          </body>
        </html>"#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_email_false_positive() {
        let input = r#"<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">"#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_extract_srcset() {
        let input = r#"
            <img srcset="/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w" src="/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg">
        "#;

        let want = vec![
            RawUri {
                text: "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
                element: Some("img".to_string()),
                attribute: Some("srcset".to_string()),
                span: span(2, 26),
            },
            RawUri {
                text: "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
                element: Some("img".to_string()),
                attribute: Some("srcset".to_string()),
                span: span(2, 26),
            },
            RawUri {
                text: "/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
                element: Some("img".to_string()),
                attribute: Some("src".to_string()),
                span: span(2, 231),
            },
        ];

        assert_eq!(extract_html(input, false), want);
    }

    #[test]
    fn test_skip_preconnect() {
        let input = r#"
            <link rel="preconnect" href="https://example.com">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_preconnect_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="preconnect">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_prefix() {
        let input = r#"
            <html lang="en-EN" prefix="og: https://ogp.me/ns#">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_ignore_text_content_links() {
        let input = r#"
            <a href="https://example.com">https://ignoreme.com</a>
        "#;
        let want = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(2, 22),
        }];

        assert_eq!(extract_html(input, false), want);
    }

    #[test]
    fn test_skip_dns_prefetch() {
        let input = r#"
            <link rel="dns-prefetch" href="https://example.com">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_dns_prefetch_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="dns-prefetch">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_emails_in_stylesheets() {
        let input = r#"
            <link href="/@global/global.css" rel="stylesheet">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_extract_fragments_with_name_attributes() {
        // JavaDoc-style `name` attributes are used as anchors.
        let input = r#"
        <html>
        <body>
            <h1 id="title">Title</h1>
            <a name="skip.navbar.top"></a>
            <a name="method.summary"></a>
            <div>
                <a name="clear--"></a>
                <h2 id="section">Section</h2>
                <a name="method.detail"></a>
            </div>
            <a name="skip.navbar.bottom"></a>
        </body>
        </html>
        "#;

        let want: HashSet<String> = [
            "title",
            "section",
            "skip.navbar.top",
            "method.summary",
            "clear--",
            "method.detail",
            "skip.navbar.bottom",
        ]
        .into_iter()
        .map(str::to_string)
        .collect();

        assert_eq!(extract_html_fragments(input), want);
    }

    #[test]
    fn test_extract_links_after_empty_verbatim_block() {
        // Links following an empty <pre><code> block must still be extracted.
        let input = r#"
        <body>
            <div>
                See <a href="https://example.com/1">First</a>
            </div>
            <pre>
                <code></code>
            </pre>
            <div>
                See <a href="https://example.com/2">Second</a>
            </div>
        </body>
        "#;

        let want = vec![
            RawUri {
                text: "https://example.com/1".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 30),
            },
            RawUri {
                text: "https://example.com/2".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(10, 30),
            },
        ];

        assert_eq!(extract_html(input, false), want);
    }
}