1use std::cell::RefCell;
2
3use html5ever::{
4 buffer_queue::BufferQueue,
5 tendril::{StrTendril, Tendril, fmt::UTF8},
6 tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
7};
8
9use super::{
10 super::{css::extract_css_with_default_span, plaintext::extract_raw_uri_from_plaintext},
11 is_email_link, is_verbatim_elem, srcset,
12};
13use crate::types::uri::raw::{RawUri, RawUriSpan, SourceSpanProvider, SpanProvider};
14
15struct LineOffsetSpanProvider<'a> {
17 lines_before: usize,
19 inner: &'a SourceSpanProvider<'a>,
21}
22
23impl SpanProvider for LineOffsetSpanProvider<'_> {
24 fn span(&self, offset: usize) -> RawUriSpan {
25 let mut span = self.inner.span(offset);
26 if span.line.get() == 1 {
30 span.column = None;
31 }
32 span.line = span
33 .line
34 .saturating_add(self.lines_before.saturating_sub(1));
35 span
36 }
37}
38
39#[derive(Clone)]
40struct LinkExtractor {
41 links: RefCell<Vec<RawUri>>,
42 include_verbatim: bool,
43 current_verbatim_element_name: RefCell<Option<String>>,
44 in_style_tag: RefCell<bool>,
46 style_content: RefCell<String>,
48}
49
50impl TokenSink for LinkExtractor {
51 type Handle = ();
52
53 #[allow(clippy::match_same_arms)]
54 fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult<()> {
55 debug_assert_ne!(line_number, 0);
56 let line_number =
57 usize::try_from(line_number).expect("Unable to convert u64 line_number to usize");
58
59 match token {
60 Token::CharacterTokens(raw) => {
61 if *self.in_style_tag.borrow() {
63 self.style_content.borrow_mut().push_str(&raw);
64 return TokenSinkResult::Continue;
65 }
66
67 if self.current_verbatim_element_name.borrow().is_some() {
68 return TokenSinkResult::Continue;
69 }
70 if self.include_verbatim {
71 self.links
72 .borrow_mut()
73 .extend(extract_raw_uri_from_plaintext(
74 &raw,
75 &LineOffsetSpanProvider {
76 lines_before: respect_multiline_tendril(line_number, &raw),
77 inner: &SourceSpanProvider::from_input(&raw),
78 },
79 ));
80 }
81 }
82 Token::TagToken(tag) => return self.process_tag(tag, line_number),
83 Token::ParseError(_err) => {
84 }
86 Token::CommentToken(_raw) => (),
87 Token::NullCharacterToken => (),
88 Token::DoctypeToken(_doctype) => (),
89 Token::EOFToken => (),
90 }
91 TokenSinkResult::Continue
92 }
93}
94
95fn respect_multiline_tendril(line_number: usize, raw: &Tendril<UTF8>) -> usize {
99 line_number.saturating_sub(raw.chars().filter(|c| *c == '\n').count())
100}
101
102impl LinkExtractor {
103 pub(crate) const fn new(include_verbatim: bool) -> Self {
104 Self {
105 links: RefCell::new(Vec::new()),
106 include_verbatim,
107 current_verbatim_element_name: RefCell::new(None),
108 in_style_tag: RefCell::new(false),
109 style_content: RefCell::new(String::new()),
110 }
111 }
112
113 fn process_tag(
114 &self,
115 Tag {
116 kind,
117 name,
118 self_closing: _,
119 attrs,
120 }: Tag,
121 line_number: usize,
122 ) -> TokenSinkResult<()> {
123 if &name == "style" {
125 match kind {
126 TagKind::StartTag => {
127 *self.in_style_tag.borrow_mut() = true;
128 self.style_content.borrow_mut().clear();
129 }
130 TagKind::EndTag => {
131 *self.in_style_tag.borrow_mut() = false;
132 let css_content = self.style_content.borrow();
134 let css_urls = extract_css_with_default_span(&css_content);
135 self.links.borrow_mut().extend(css_urls);
136 self.style_content.borrow_mut().clear();
137 }
138 }
139 }
140
141 if !self.include_verbatim && is_verbatim_elem(&name) {
143 let mut curr_verbatim_elem = self.current_verbatim_element_name.borrow_mut();
145
146 if curr_verbatim_elem.is_some() {
147 if curr_verbatim_elem.as_ref() == Some(&name.to_string()) {
150 if matches!(kind, TagKind::EndTag) {
153 *curr_verbatim_elem = None;
154 }
155 }
156 } else if matches!(kind, TagKind::StartTag) {
157 *curr_verbatim_elem = Some(name.to_string());
161 }
162 }
163 if self.current_verbatim_element_name.borrow().is_some() {
164 return TokenSinkResult::Continue;
167 }
168
169 if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel")
173 && rel.value.contains("nofollow")
174 {
175 return TokenSinkResult::Continue;
176 }
177
178 if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel")
181 && (rel.value.contains("preconnect") || rel.value.contains("dns-prefetch"))
182 {
183 return TokenSinkResult::Continue;
184 }
185
186 if let Some(_prefix) = attrs.iter().find(|attr| &attr.name.local == "prefix") {
189 return TokenSinkResult::Continue;
190 }
191
192 for attr in &attrs {
193 let urls =
194 LinkExtractor::extract_urls_from_elem_attr(&attr.name.local, &name, &attr.value);
195
196 let new_urls = match urls {
197 None => extract_raw_uri_from_plaintext(
198 &attr.value,
199 &LineOffsetSpanProvider {
200 lines_before: line_number,
201 inner: &SourceSpanProvider::from_input(&attr.value),
202 },
203 ),
204 Some(urls) => urls
205 .into_iter()
206 .filter(|url| {
207 let is_email = is_email_link(url);
217 let is_mailto = url.starts_with("mailto:");
218 let is_phone = url.starts_with("tel:");
219 let is_href = attr.name.local.as_ref() == "href";
220
221 if attrs.iter().any(|attr| {
222 &attr.name.local == "rel" && attr.value.contains("stylesheet")
223 }) {
224 if url.starts_with("/@") || url.starts_with('@') {
229 return false;
230 }
231 if attrs.iter().any(|attr| &attr.name.local == "disabled") {
234 return false;
235 }
236 }
237
238 !is_email || (is_mailto && is_href) || (is_phone && is_href)
239 })
240 .map(|url| RawUri {
241 text: url.to_string(),
242 element: Some(name.to_string()),
243 attribute: Some(attr.name.local.to_string()),
244 span: RawUriSpan {
245 line: line_number
246 .try_into()
247 .expect("checked above that `line_number != 0`"),
248 column: None,
249 },
250 })
251 .collect::<Vec<_>>(),
252 };
253 self.links.borrow_mut().extend(new_urls);
254 }
255 TokenSinkResult::Continue
256 }
257
258 #[allow(clippy::unnested_or_patterns)]
260 pub(crate) fn extract_urls_from_elem_attr<'a>(
261 attr_name: &str,
262 elem_name: &str,
263 attr_value: &'a str,
264 ) -> Option<impl Iterator<Item = &'a str> + use<'a>> {
265 match (elem_name, attr_name) {
270
271 (_, "href" | "src" | "cite" | "usemap")
273 | ("applet", "codebase")
275 | ("body", "background")
276 | ("button", "formaction")
277 | ("command", "icon")
278 | ("form", "action")
279 | ("frame", "longdesc")
280 | ("head", "profile")
281 | ("html", "manifest")
282 | ("iframe", "longdesc")
283 | ("img", "longdesc")
284 | ("input", "formaction")
285 | ("object", "classid")
286 | ("object", "codebase")
287 | ("object", "data")
288 | ("video", "poster") => {
289 Some(vec![attr_value].into_iter())
290 }
291 (_, "srcset") => {
292 Some(srcset::parse(attr_value).into_iter())
293 }
294 _ => None,
295 }
296 }
297}
298
299pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
301 let input = BufferQueue::default();
302 input.push_back(StrTendril::from(buf));
303
304 let tokenizer = Tokenizer::new(
305 LinkExtractor::new(include_verbatim),
306 TokenizerOpts::default(),
307 );
308 let _handle = tokenizer.feed(&input);
309 tokenizer.end();
310
311 tokenizer.sink.links.into_inner()
312}
313
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::{span, span_line};

    use super::*;

    /// Fixture mixing a regular anchor with links inside verbatim `<code>` / `<pre>` elements.
    const HTML_INPUT: &str = r#"
<html>
    <body>
        <p>This is a paragraph with some inline <code>https://example.com</code> and a normal <a href="https://example.org">example</a></p>
        <pre>
        Some random text
        https://foo.com and http://bar.com/some/path
        Something else
        <a href="https://baz.org">example link inside pre</a>
        </pre>
        <p><b>bold</b></p>
    </body>
</html>"#;

    /// Expected result for a link found in the `href` attribute of an `<a>` element.
    fn anchor_href(url: &str, span: RawUriSpan) -> RawUri {
        RawUri {
            text: url.to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span,
        }
    }

    /// Expected result for a link found in plain text, outside of any attribute.
    fn plaintext(url: &str, span: RawUriSpan) -> RawUri {
        RawUri {
            text: url.to_string(),
            element: None,
            attribute: None,
            span,
        }
    }

    #[test]
    fn test_skip_verbatim() {
        let uris = extract_html(HTML_INPUT, false);

        assert_eq!(
            uris,
            vec![anchor_href("https://example.org", span_line(4))]
        );
    }

    #[test]
    fn test_include_verbatim() {
        let uris = extract_html(HTML_INPUT, true);

        assert_eq!(
            uris,
            vec![
                plaintext("https://example.com", span_line(4)),
                anchor_href("https://example.org", span_line(4)),
                plaintext("https://foo.com", span(7, 9)),
                plaintext("http://bar.com/some/path", span(7, 29)),
                anchor_href("https://baz.org", span_line(9)),
            ]
        );
    }

    #[test]
    fn test_include_verbatim_recursive() {
        const HTML_INPUT: &str = r#"
        <a href="https://example.com/">valid link</a>
        <code>
            <pre>
                <span>https://example.org</span>
            </pre>
        </code>
        "#;

        let uris = extract_html(HTML_INPUT, false);

        assert_eq!(
            uris,
            vec![anchor_href("https://example.com/", span_line(2))]
        );
    }

    #[test]
    fn test_include_nofollow() {
        let input = r#"
        <a rel="nofollow" href="https://foo.com">do not follow me</a>
        <a rel="canonical,nofollow,dns-prefetch" href="https://example.com">do not follow me</a>
        <a href="https://example.org">do not follow me</a>
        "#;

        let uris = extract_html(input, false);

        assert_eq!(
            uris,
            vec![anchor_href("https://example.org", span_line(4))]
        );
    }

    #[test]
    fn test_exclude_script_tags() {
        let input = r#"
        <script>
        var foo = "https://example.com";
        </script>
        <a href="https://example.org">i'm fine</a>
        "#;

        let uris = extract_html(input, false);

        assert_eq!(
            uris,
            vec![anchor_href("https://example.org", span_line(5))]
        );
    }

    #[test]
    fn test_exclude_disabled_stylesheet() {
        let input = r#"
        <link rel="stylesheet" href="https://disabled.com" disabled>
        <link rel="stylesheet" href="https://disabled.com" disabled="disabled">
        <a href="https://example.org">i'm fine</a>
        "#;

        let uris = extract_html(input, false);

        assert_eq!(
            uris,
            vec![anchor_href("https://example.org", span_line(4))]
        );
    }

    #[test]
    fn test_valid_email() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="mailto:foo@bar.com">
          </body>
        </html>"#;

        let uris = extract_html(input, false);

        assert_eq!(
            uris,
            vec![anchor_href("mailto:foo@bar.com", span_line(8))]
        );
    }

    #[test]
    fn test_valid_tel() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="tel:1234567890">
          </body>
        </html>"#;

        let uris = extract_html(input, false);

        assert_eq!(uris, vec![anchor_href("tel:1234567890", span_line(8))]);
    }

    #[test]
    fn test_exclude_email_without_mailto() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <a href="foo@bar.com">
          </body>
        </html>"#;

        let uris = extract_html(input, false);

        assert_eq!(uris, Vec::<RawUri>::new());
    }

    #[test]
    fn test_email_false_positive() {
        let input = r#"<!DOCTYPE html>
        <html lang="en-US">
          <head>
            <meta charset="utf-8">
            <title>Test</title>
          </head>
          <body>
            <img srcset="v2@1.5x.png" alt="Wikipedia" width="200" height="183">
          </body>
        </html>"#;

        let uris = extract_html(input, false);

        assert_eq!(uris, Vec::<RawUri>::new());
    }

    #[test]
    fn test_skip_preconnect() {
        let input = r#"
            <link rel="preconnect" href="https://example.com">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_preconnect_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="preconnect">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_prefix() {
        let input = r#"
            <html lang="en-EN" prefix="og: https://ogp.me/ns#">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_ignore_text_content_links() {
        let input = r#"
            <a href="https://example.com">https://ignoreme.com</a>
        "#;

        let uris = extract_html(input, false);

        assert_eq!(
            uris,
            vec![anchor_href("https://example.com", span_line(2))]
        );
    }

    #[test]
    fn test_skip_dns_prefetch() {
        let input = r#"
            <link rel="dns-prefetch" href="https://example.com">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_dns_prefetch_reverse_order() {
        let input = r#"
            <link href="https://example.com" rel="dns-prefetch">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_skip_emails_in_stylesheets() {
        let input = r#"
            <link href="/@global/global.css" rel="stylesheet">
        "#;

        assert!(extract_html(input, false).is_empty());
    }

    #[test]
    fn test_extract_links_after_empty_verbatim_block() {
        let input = r#"
        <body>
            <div>
                See <a href="https://example.com/1">First</a>
            </div>
            <pre>
                <code></code>
            </pre>
            <div>
                See <a href="https://example.com/2">Second</a>
            </div>
        </body>
        "#;

        let uris = extract_html(input, false);

        assert_eq!(
            uris,
            vec![
                anchor_href("https://example.com/1", span_line(4)),
                anchor_href("https://example.com/2", span_line(10)),
            ]
        );
    }
}