1use html5gum::{
2 Spanned, Tokenizer,
3 emitters::callback::{Callback, CallbackEmitter, CallbackEvent},
4};
5use std::collections::{HashMap, HashSet};
6
7use super::{is_email_link, is_verbatim_elem, srcset};
8use crate::{
9 extract::{css::extract_css, plaintext::extract_raw_uri_from_plaintext},
10 types::uri::raw::{OffsetSpanProvider, RawUri, SourceSpanProvider, SpanProvider},
11};
12
13#[derive(Clone, Debug)]
26struct LinkExtractor<S: SpanProvider> {
27 span_provider: S,
32 links: Vec<RawUri>,
34 fragments: HashSet<String>,
36 include_verbatim: bool,
38 current_element: String,
41 current_attributes: HashMap<String, Spanned<String>>,
45 current_attribute_name: String,
47 verbatim_stack: Vec<String>,
50 in_style_tag: bool,
52 style_content: String,
54 style_content_offset: usize,
56}
57
58impl<S: SpanProvider> LinkExtractor<S> {
59 fn new(span_provider: S, include_verbatim: bool) -> Self {
64 Self {
65 span_provider,
66 include_verbatim,
67 links: Vec::default(),
68 fragments: HashSet::default(),
69 current_element: String::default(),
70 current_attributes: HashMap::default(),
71 current_attribute_name: String::default(),
72 verbatim_stack: Vec::default(),
73 in_style_tag: false,
74 style_content: String::default(),
75 style_content_offset: 0,
76 }
77 }
78
79 fn extract_urls_from_elem_attr(&self) -> Vec<RawUri> {
84 let mut urls = Vec::new();
85
86 if let Some(srcset) = self.current_attributes.get("srcset") {
88 let span = srcset.span;
89 urls.extend(srcset::parse(srcset).into_iter().map(|url| RawUri {
90 text: url.to_string(),
91 element: Some(self.current_element.clone()),
92 attribute: Some("srcset".to_string()),
93 span: self.span_provider.span(span.start),
94 }));
95 }
96
97 for (attr_name, attr_value) in &self.current_attributes {
99 #[allow(clippy::unnested_or_patterns)]
100 match (self.current_element.as_str(), attr_name.as_str()) {
101 (_, "href" | "src" | "cite" | "usemap") |
103 ("applet", "codebase") |
105 ("body", "background") |
106 ("button", "formaction") |
107 ("command", "icon") |
108 ("form", "action") |
109 ("frame", "longdesc") |
110 ("head", "profile") |
111 ("html", "manifest") |
112 ("iframe", "longdesc") |
113 ("img", "longdesc") |
114 ("input", "formaction") |
115 ("object", "classid" | "codebase" | "data") |
116 ("video", "poster") => {
117 urls.push(RawUri {
118 text: attr_value.to_string(),
119 element: Some(self.current_element.clone()),
120 attribute: Some(attr_name.clone()),
121 span: self.span_provider.span(attr_value.span.start),
122 });
123 }
124 _ => {}
125 }
126 }
127
128 urls
129 }
130
131 fn filter_verbatim_here(&self) -> bool {
134 !self.include_verbatim
135 && (is_verbatim_elem(&self.current_element) || !self.verbatim_stack.is_empty())
136 }
137
138 fn flush_links(&mut self) {
156 if self.filter_verbatim_here() {
157 self.current_attributes.clear();
158 return;
159 }
160
161 if self.current_attributes.get("rel").is_some_and(|rel| {
162 rel.split(',').any(|r| {
163 r.trim() == "nofollow" || r.trim() == "preconnect" || r.trim() == "dns-prefetch"
164 })
165 }) {
166 self.current_attributes.clear();
167 return;
168 }
169
170 if self.current_attributes.contains_key("prefix") {
171 self.current_attributes.clear();
172 return;
173 }
174
175 if self
179 .current_attributes
180 .get("rel")
181 .is_some_and(|rel| rel.contains("stylesheet"))
182 {
183 if let Some(href) = self.current_attributes.get("href")
184 && (href.starts_with("/@") || href.starts_with('@'))
185 {
186 self.current_attributes.clear();
187 return;
188 }
189 if self.current_attribute_name == "disabled"
192 || self.current_attributes.contains_key("disabled")
193 {
194 self.current_attributes.clear();
195 return;
196 }
197 }
198
199 let new_urls = self
200 .extract_urls_from_elem_attr()
201 .into_iter()
202 .filter(|url| {
203 let is_email = is_email_link(&url.text);
213 let is_mailto = url.text.starts_with("mailto:");
214 let is_phone = url.text.starts_with("tel:");
215 let is_href = url.attribute.as_deref() == Some("href");
216
217 !is_email || (is_mailto && is_href) || (is_phone && is_href)
218 })
219 .collect::<Vec<_>>();
220
221 self.links.extend(new_urls);
222
223 if let Some(id) = self.current_attributes.get("id") {
224 self.fragments.insert(id.to_string());
225 }
226
227 if let Some(name) = self.current_attributes.get("name") {
239 self.fragments.insert(name.to_string());
240 }
241 self.current_attributes.clear();
242 }
243}
244
245impl<S: SpanProvider> Callback<(), usize> for &mut LinkExtractor<S> {
246 fn handle_event(
247 &mut self,
248 event: CallbackEvent<'_>,
249 span: html5gum::Span<usize>,
250 ) -> Option<()> {
251 match event {
252 CallbackEvent::OpenStartTag { name } => {
253 self.current_element = String::from_utf8_lossy(name).into_owned();
254
255 if self.current_element == "style" {
257 self.in_style_tag = true;
258 self.style_content.clear();
259 }
260
261 if self.filter_verbatim_here() && is_verbatim_elem(&self.current_element) {
266 self.verbatim_stack.push(self.current_element.clone());
267 }
268 }
269 CallbackEvent::AttributeName { name } => {
270 self.current_attribute_name = String::from_utf8_lossy(name).into_owned();
271 }
272 CallbackEvent::AttributeValue { value } => {
273 let value = String::from_utf8_lossy(value);
274 self.current_attributes
275 .entry(self.current_attribute_name.clone())
276 .and_modify(|v| v.push_str(&value))
277 .or_insert_with(|| Spanned {
278 value: value.into_owned(),
279 span,
280 });
281 }
282 CallbackEvent::CloseStartTag { self_closing } => {
283 self.flush_links();
284
285 if self_closing
290 && self.filter_verbatim_here()
291 && let Some(last_verbatim) = self.verbatim_stack.last()
292 && last_verbatim == &self.current_element
293 {
294 self.verbatim_stack.pop();
295 }
296 }
297 CallbackEvent::EndTag { name } => {
298 let tag_name = String::from_utf8_lossy(name);
299
300 if tag_name == "style" && self.in_style_tag {
302 self.in_style_tag = false;
303 let css_urls = extract_css(
304 &self.style_content,
305 &OffsetSpanProvider {
306 offset: self.style_content_offset,
307 inner: &self.span_provider,
308 },
309 );
310 self.links.extend(css_urls);
311 self.style_content.clear();
312 }
313
314 if !self.include_verbatim
319 && let Some(last_verbatim) = self.verbatim_stack.last()
320 && last_verbatim == tag_name.as_ref()
321 {
322 self.verbatim_stack.pop();
323 }
324 }
325 CallbackEvent::String { value } => {
326 if self.in_style_tag {
328 if self.style_content.is_empty() {
329 self.style_content_offset = span.start;
331 }
332 self.style_content.push_str(&String::from_utf8_lossy(value));
333 return None;
334 }
335
336 if !self.filter_verbatim_here() {
337 self.links.extend(extract_raw_uri_from_plaintext(
339 &String::from_utf8_lossy(value),
340 &OffsetSpanProvider {
341 offset: span.start,
342 inner: &self.span_provider,
343 },
344 ));
345 }
346 }
347 CallbackEvent::Comment { .. }
348 | CallbackEvent::Doctype { .. }
349 | CallbackEvent::Error(_) => {}
350 }
351 None
352 }
353}
354
355pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
357 extract_html_with_span(buf, include_verbatim, SourceSpanProvider::from_input(buf))
358}
359
360pub(crate) fn extract_html_with_span<S: SpanProvider>(
361 buf: &str,
362 include_verbatim: bool,
363 span_provider: S,
364) -> Vec<RawUri> {
365 let mut extractor = LinkExtractor::new(span_provider, include_verbatim);
366 let mut tokenizer = Tokenizer::new_with_emitter(buf, CallbackEmitter::new(&mut extractor));
367 assert!(tokenizer.next().is_none());
368 extractor
369 .links
370 .into_iter()
371 .filter(|link| link.attribute.is_some() || include_verbatim)
372 .collect()
373}
374
375pub(crate) fn extract_html_fragments(buf: &str) -> HashSet<String> {
377 let span_provider = SourceSpanProvider::from_input(buf);
378 let mut extractor = LinkExtractor::new(span_provider, true);
379 let mut tokenizer = Tokenizer::new_with_emitter(buf, CallbackEmitter::new(&mut extractor));
380 assert!(tokenizer.next().is_none());
381 extractor.fragments
382}
383
// Unit tests for the HTML extractor.
//
// The expected `span(line, column)` values depend on the exact layout of the
// raw-string fixtures (the first line of most fixtures is the empty remainder
// of the `r#"` line), so the fixtures must not be re-indented.
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::span;

    use super::*;

    // Shared fixture: one link in normal markup, one URL inside inline
    // `<code>`, and plaintext URLs plus a link inside a `<pre>` block.
    const HTML_INPUT: &str = r#"
<html>
    <body id="content">
        <p>This is a paragraph with some inline <code id="inline-code">https://example.com</code> and a normal <a href="https://example.org">example</a></p>
        <pre>
        Some random text
        https://foo.com and http://bar.com/some/path
        Something else
        <a href="https://baz.org">example link inside pre</a>
        </pre>
        <p id="emphasis"><b>bold</b></p>
    </body>
</html>"#;

    /// Every `id` value of the shared fixture is reported as a fragment.
    #[test]
    fn test_extract_fragments() {
        let expected = HashSet::from([
            "content".to_string(),
            "inline-code".to_string(),
            "emphasis".to_string(),
        ]);
        let actual = extract_html_fragments(HTML_INPUT);
        assert_eq!(actual, expected);
    }

    /// Without verbatim content only the link outside `<code>`/`<pre>` remains.
    #[test]
    fn test_skip_verbatim() {
        let expected = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 121),
        }];

        let uris = extract_html(HTML_INPUT, false);
        assert_eq!(uris, expected);
    }

    /// With verbatim content, plaintext URLs and the link inside `<pre>` are
    /// reported too; plaintext URLs carry neither element nor attribute.
    #[test]
    fn test_include_verbatim() {
        let expected = vec![
            RawUri {
                text: "https://example.com".to_string(),
                element: None,
                attribute: None,
                span: span(4, 72),
            },
            RawUri {
                text: "https://example.org".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 121),
            },
            RawUri {
                text: "https://foo.com".to_string(),
                element: None,
                attribute: None,
                span: span(7, 9),
            },
            RawUri {
                text: "http://bar.com/some/path".to_string(),
                element: None,
                attribute: None,
                span: span(7, 29),
            },
            RawUri {
                text: "https://baz.org".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(9, 18),
            },
        ];

        let uris = extract_html(HTML_INPUT, true);
        assert_eq!(uris, expected);
    }

    /// Content nested inside several verbatim elements is skipped.
    #[test]
    fn test_include_verbatim_nested() {
        const HTML_INPUT: &str = r#"
        <a href="https://example.com/">valid link</a>
        <code>
            <pre>
                <span>https://example.org</span>
            </pre>
        </code>
        "#;

        let expected = vec![RawUri {
            text: "https://example.com/".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(2, 18),
        }];

        let uris = extract_html(HTML_INPUT, false);
        assert_eq!(uris, expected);
    }

    /// Closing the inner of two identical nested verbatim elements must not
    /// end the outer verbatim block.
    #[test]
    fn test_include_verbatim_nested_identical() {
        const HTML_INPUT: &str = r#"
        <pre>
            <pre>
            </pre>
            <a href="https://example.org">invalid link</a>
        </pre>
        "#;

        let uris = extract_html(HTML_INPUT, false);
        assert!(uris.is_empty());
    }

    /// Links whose `rel` contains `nofollow` are not reported.
    #[test]
    fn test_exclude_nofollow() {
        let input = r#"
        <a rel="nofollow" href="https://foo.com">do not follow me</a>
        <a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
        <a href="https://example.org">i'm fine</a>
        "#;
        let expected = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 18),
        }];
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    /// `rel="nofollow"` is honoured when it follows the `href` attribute.
    #[test]
    fn test_exclude_nofollow_change_order() {
        let input = r#"
        <a href="https://foo.com" rel="nofollow">do not follow me</a>
        "#;
        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// URLs inside `<script>` content are not reported.
    #[test]
    fn test_exclude_script_tags() {
        let input = r#"
        <script>
        var foo = "https://example.com";
        </script>
        <a href="https://example.org">i'm fine</a>
        "#;
        let expected = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(5, 18),
        }];
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    /// Disabled stylesheets are skipped, with and without an attribute value.
    #[test]
    fn test_exclude_disabled_stylesheet() {
        let input = r#"
        <link rel="stylesheet" href="https://disabled.com" disabled>
        <link rel="stylesheet" href="https://disabled.com" disabled="disabled">
        <a href="https://example.org">i'm fine</a>
        "#;
        let expected = vec![RawUri {
            text: "https://example.org".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 18),
        }];
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    /// A `tel:` link in an `href` attribute is reported.
    #[test]
    fn test_valid_tel() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="tel:1234567890">
          </body>
        </html>"#;

        let expected = vec![RawUri {
            text: "tel:1234567890".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(8, 22),
        }];
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    /// A `mailto:` link in an `href` attribute is reported.
    #[test]
    fn test_valid_email() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="mailto:foo@bar.com">
          </body>
        </html>"#;

        let expected = vec![RawUri {
            text: "mailto:foo@bar.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(8, 22),
        }];
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    /// An e-mail address without the `mailto:` scheme is not reported.
    #[test]
    fn test_exclude_email_without_mailto() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="foo@bar.com">
          </body>
        </html>"#;

        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// No link is reported for a `srcset` value that contains an `@`.
    #[test]
    fn test_email_false_positive() {
        let input = r#"<img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">"#;
        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// Each `srcset` candidate is reported with the span of the attribute
    /// value, followed by the `src` link.
    #[test]
    fn test_extract_srcset() {
        let input = r#"
            <img srcset="/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 640w, /cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg 750w" src="/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg">
        "#;

        let expected = vec![RawUri {
            text: "/cdn-cgi/image/format=webp,width=640/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
            element: Some("img".to_string()),
            attribute: Some("srcset".to_string()),
            span: span(2, 26),
        },
        RawUri {
            text: "/cdn-cgi/image/format=webp,width=750/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
            element: Some("img".to_string()),
            attribute: Some("srcset".to_string()),
            span: span(2, 26),
        },
        RawUri {
            text: "/cdn-cgi/image/format=webp,width=3840/https://img.youtube.com/vi/hVBl8_pgQf0/maxresdefault.jpg".to_string(),
            element: Some("img".to_string()),
            attribute: Some("src".to_string()),
            span: span(2, 231),
        }

        ];
        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    /// `rel="preconnect"` links are not reported.
    #[test]
    fn test_skip_preconnect() {
        let input = r#"
            <link rel="preconnect" href="https://example.com">
        "#;

        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// `rel="preconnect"` is honoured when it follows the `href` attribute.
    #[test]
    fn test_skip_preconnect_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="preconnect">
        "#;

        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// The URL inside a `prefix` attribute is not reported.
    #[test]
    fn test_skip_prefix() {
        let input = r#"
            <html lang="en-EN" prefix="og: https://ogp.me/ns#">
        "#;

        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// A URL in the text content of a link is not reported when verbatim
    /// content is excluded; only the `href` is.
    #[test]
    fn test_ignore_text_content_links() {
        let input = r#"
            <a href="https://example.com">https://ignoreme.com</a>
        "#;
        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(2, 22),
        }];

        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }

    /// `rel="dns-prefetch"` links are not reported.
    #[test]
    fn test_skip_dns_prefetch() {
        let input = r#"
            <link rel="dns-prefetch" href="https://example.com">
        "#;

        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// `rel="dns-prefetch"` is honoured when it follows the `href` attribute.
    #[test]
    fn test_skip_dns_prefetch_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="dns-prefetch">
        "#;

        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// A stylesheet `href` starting with `/@` is not reported.
    #[test]
    fn test_skip_emails_in_stylesheets() {
        let input = r#"
            <link href="/@global/global.css" rel="stylesheet">
        "#;

        let uris = extract_html(input, false);
        assert!(uris.is_empty());
    }

    /// Values of `name` attributes are reported as fragments alongside `id`
    /// values.
    #[test]
    fn test_extract_fragments_with_name_attributes() {
        let input = r#"
        <html>
        <body>
            <h1 id="title">Title</h1>
            <a name="skip.navbar.top"></a>
            <a name="method.summary"></a>
            <div>
                <a name="clear--"></a>
                <h2 id="section">Section</h2>
                <a name="method.detail"></a>
            </div>
            <a name="skip.navbar.bottom"></a>
        </body>
        </html>
        "#;

        let expected = HashSet::from([
            "title".to_string(),
            "section".to_string(),
            "skip.navbar.top".to_string(),
            "method.summary".to_string(),
            "clear--".to_string(),
            "method.detail".to_string(),
            "skip.navbar.bottom".to_string(),
        ]);
        let actual = extract_html_fragments(input);
        assert_eq!(actual, expected);
    }

    /// Links that follow a `<pre>` block containing an empty `<code>` element
    /// are still reported, i.e. verbatim tracking ends with the block.
    #[test]
    fn test_extract_links_after_empty_verbatim_block() {
        let input = r#"
        <body>
            <div>
                See <a href="https://example.com/1">First</a>
            </div>
            <pre>
                <code></code>
            </pre>
            <div>
                See <a href="https://example.com/2">Second</a>
            </div>
        </body>
        "#;

        let expected = vec![
            RawUri {
                text: "https://example.com/1".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 30),
            },
            RawUri {
                text: "https://example.com/2".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(10, 30),
            },
        ];

        let uris = extract_html(input, false);
        assert_eq!(uris, expected);
    }
}