1use std::collections::{HashMap, HashSet};
3
4use log::warn;
5use pulldown_cmark::{CowStr, Event, LinkType, Options, Parser, Tag, TagEnd, TextMergeWithOffset};
6
7use crate::{
8 checker::wikilink::wikilink,
9 extract::{html::html5gum::extract_html_with_span, plaintext::extract_raw_uri_from_plaintext},
10 types::uri::raw::{
11 OffsetSpanProvider, RawUri, RawUriSpan, SourceSpanProvider, SpanProvider as _,
12 },
13};
14
15use super::html::html5gum::extract_html_fragments;
16
17fn md_extensions() -> Options {
20 Options::ENABLE_HEADING_ATTRIBUTES
21 | Options::ENABLE_MATH
22 | Options::ENABLE_WIKILINKS
23 | Options::ENABLE_FOOTNOTES
24}
25
26#[allow(clippy::too_many_lines)]
29pub(crate) fn extract_markdown(
30 input: &str,
31 include_verbatim: bool,
32 include_wikilinks: bool,
33) -> Vec<RawUri> {
34 let mut inside_code_block = false;
37 let mut inside_link_block = false;
38 let mut inside_wikilink_block = false;
39
40 let mut inside_html_block = false;
42 let mut html_block_buffer = String::new();
43 let mut html_block_start_offset = 0;
44
45 let span_provider = SourceSpanProvider::from_input(input);
46 let parser =
47 TextMergeWithOffset::new(Parser::new_ext(input, md_extensions()).into_offset_iter());
48 parser
49 .filter_map(|(event, span)| match event {
50 Event::Start(Tag::Link {
52 link_type,
53 dest_url,
54 ..
55 }) => {
56 #[allow(clippy::match_same_arms)]
59 match link_type {
60 LinkType::Inline => {
63 inside_link_block = true;
64 Some(raw_uri(&dest_url, span_provider.span(span.start)))
65 }
66 LinkType::Reference |
68 LinkType::ReferenceUnknown |
70 LinkType::Collapsed|
72 LinkType::CollapsedUnknown |
74 LinkType::Shortcut |
76 LinkType::ShortcutUnknown => {
78 inside_link_block = true;
79 Some(raw_uri(&dest_url, span_provider.span(span.start)))
82 },
83 LinkType::Autolink |
85 LinkType::Email => {
87 let span_provider = get_email_span_provider(&span_provider, &span, link_type);
88 Some(extract_raw_uri_from_plaintext(&dest_url, &span_provider))
89 }
90 LinkType::WikiLink { has_pothole } => {
92 if !include_wikilinks {
94 return None;
95 }
96 inside_wikilink_block = true;
97 if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) {
99 return None;
100 }
101
102 if let Ok(wikilink) = wikilink(&dest_url, has_pothole) {
103 Some(vec![RawUri {
104 text: wikilink.to_string(),
105 element: Some("a".to_string()),
106 attribute: Some("wikilink".to_string()),
107 span: span_provider.span(span.start + 2)
109 }])
110 } else {
111 warn!("The wikilink destination url {dest_url} could not be cleaned by removing potholes and fragments");
112 None
113 }
114 }
115 }
116 }
117
118 Event::Start(Tag::Image { dest_url, .. }) => Some(extract_image(&dest_url, span_provider.span(span.start))),
119
120 Event::Start(Tag::CodeBlock(_)) => {
122 inside_code_block = true;
123 None
124 }
125 Event::End(TagEnd::CodeBlock) => {
126 inside_code_block = false;
127 None
128 }
129
130 Event::Text(txt) => {
132 if inside_wikilink_block
133 || (inside_link_block && !include_verbatim)
134 || (inside_code_block && !include_verbatim) {
135 None
136 } else {
137 Some(extract_raw_uri_from_plaintext(
138 &txt,
139 &OffsetSpanProvider { offset: span.start, inner: &span_provider }
140 ))
141 }
142 }
143
144 Event::Start(Tag::HtmlBlock) => {
146 inside_html_block = true;
147 html_block_buffer.clear();
148 html_block_start_offset = span.start;
149 None
150 }
151
152 Event::End(TagEnd::HtmlBlock) => {
154 inside_html_block = false;
155 if html_block_buffer.is_empty() {
156 None
157 } else {
158 Some(extract_html_with_span(
159 &html_block_buffer,
160 include_verbatim,
161 OffsetSpanProvider {
162 offset: html_block_start_offset,
163 inner: &span_provider
164 }
165 ))
166 }
167 }
168
169 Event::Html(html) => {
171 if inside_html_block {
172 html_block_buffer.push_str(&html);
174 None
175 } else {
176 Some(extract_html_with_span(
178 &html,
179 include_verbatim,
180 OffsetSpanProvider { offset: span.start, inner: &span_provider }
181 ))
182 }
183 }
184
185 Event::InlineHtml(html) => {
187 Some(extract_html_with_span(
188 &html,
189 include_verbatim,
190 OffsetSpanProvider { offset: span.start, inner: &span_provider }
191 ))
192 }
193
194 Event::Code(code) => {
196 if include_verbatim {
197 Some(extract_raw_uri_from_plaintext(
199 &code,
200 &OffsetSpanProvider { offset: span.start + 1, inner: &span_provider }
201 ))
202 } else {
203 None
204 }
205 }
206
207 Event::End(TagEnd::Link) => {
208 inside_link_block = false;
209 inside_wikilink_block = false;
210 None
211 }
212
213 #[allow(clippy::match_same_arms)]
215 Event::FootnoteReference(_) | Event::Start(Tag::FootnoteDefinition(_)) | Event::End(TagEnd::FootnoteDefinition) => None,
216
217 _ => None,
219 })
220 .flatten()
221 .collect()
222}
223
224fn get_email_span_provider<'a>(
225 span_provider: &'a SourceSpanProvider<'_>,
226 span: &std::ops::Range<usize>,
227 link_type: LinkType,
228) -> OffsetSpanProvider<'a> {
229 let offset = match link_type {
230 LinkType::Reference | LinkType::CollapsedUnknown | LinkType::ShortcutUnknown => 0,
232 LinkType::ReferenceUnknown
234 | LinkType::Collapsed
235 | LinkType::Shortcut
236 | LinkType::Autolink
237 | LinkType::Email => 1,
238 _ => {
239 debug_assert!(false, "Unexpected email link type: {link_type:?}");
240 0
241 }
242 };
243
244 OffsetSpanProvider {
245 offset: span.start + offset,
246 inner: span_provider,
247 }
248}
249
250fn extract_image(dest_url: &CowStr<'_>, span: RawUriSpan) -> Vec<RawUri> {
253 vec![RawUri {
254 text: dest_url.to_string(),
255 element: Some("img".to_string()),
256 attribute: Some("src".to_string()),
257 span,
258 }]
259}
260
261fn raw_uri(dest_url: &CowStr<'_>, span: RawUriSpan) -> Vec<RawUri> {
264 vec![RawUri {
265 text: dest_url.to_string(),
266 element: Some("a".to_string()),
267 attribute: Some("href".to_string()),
268 span,
271 }]
272}
273
274pub(crate) fn extract_markdown_fragments(input: &str) -> HashSet<String> {
282 let mut in_heading = false;
283 let mut heading_text = String::new();
284 let mut heading_id: Option<CowStr<'_>> = None;
285 let mut id_generator = HeadingIdGenerator::default();
286
287 let mut out = HashSet::new();
288
289 for event in Parser::new_ext(input, md_extensions()) {
290 match event {
291 Event::Start(Tag::Heading { id, .. }) => {
292 heading_id = id;
293 in_heading = true;
294 }
295 Event::End(TagEnd::Heading(_)) => {
296 if let Some(frag) = heading_id.take() {
297 out.insert(frag.to_string());
298 }
299
300 if !heading_text.is_empty() {
301 let id = id_generator.generate(&heading_text);
302 out.insert(id);
303 heading_text.clear();
304 }
305
306 in_heading = false;
307 }
308 Event::Text(text) | Event::Code(text) => {
309 if in_heading {
310 heading_text.push_str(&text);
311 }
312 }
313
314 Event::Html(html) | Event::InlineHtml(html) => {
316 out.extend(extract_html_fragments(&html));
317 }
318
319 _ => (),
321 }
322 }
323 out
324}
325
/// Generates heading ids that are unique within one document.
///
/// Each heading is reduced to a slug; the first occurrence of a slug is used
/// as is, and every later occurrence gets `-N` appended, where `N` is the
/// number of times that slug has been seen before.
#[derive(Default)]
struct HeadingIdGenerator {
    /// How many times each slug has been handed out so far.
    counter: HashMap<String, usize>,
}

impl HeadingIdGenerator {
    /// Returns the id for `heading` and records that its slug was used.
    fn generate(&mut self, heading: &str) -> String {
        let slug = Self::into_kebab_case(heading);
        let seen = self.counter.entry(slug.clone()).or_default();
        let id = match *seen {
            0 => slug,
            earlier => format!("{slug}-{earlier}"),
        };
        *seen += 1;
        id
    }

    /// Turns heading text into a slug.
    ///
    /// The text is lowercased; alphanumeric characters, `_` and `-` are kept,
    /// each whitespace character becomes one `-`, and everything else is
    /// dropped. Runs of whitespace are not collapsed.
    #[must_use]
    fn into_kebab_case(text: &str) -> String {
        let lowered = text.to_lowercase();
        let mut slug = String::new();
        for ch in lowered.chars() {
            if ch.is_alphanumeric() || matches!(ch, '_' | '-') {
                slug.push(ch);
            } else if ch.is_whitespace() {
                slug.push('-');
            }
        }
        slug
    }
}
360
/// Unit tests for the Markdown extractor.
///
/// Expected spans are written as `span(line, column)`; judging by the
/// fixtures below both numbers are 1-based. Several fixtures are raw strings
/// whose exact whitespace determines the expected line and column numbers.
#[cfg(test)]
mod tests {
    use crate::types::uri::raw::span;

    use super::*;

    /// Shared fixture containing headings, an inline link, a fenced code
    /// block, inline code and a line of embedded HTML.
    const MD_INPUT: &str = r#"
# A Test

Some link in text [here](https://foo.com)

## A test {#well-still-the-same-test}

Code:

```bash
https://bar.com/123
```

or inline like `https://bar.org` for instance.

### Some `code` in a heading.

[example](http://example.com)

<span id="the-end">The End</span>
 "#;

    /// Explicit ids, generated ids (with a suffix for the repeated heading)
    /// and HTML ids are all reported as fragments.
    #[test]
    fn test_extract_fragments() {
        let expected = HashSet::from([
            "a-test".to_string(),
            "a-test-1".to_string(),
            "well-still-the-same-test".to_string(),
            "some-code-in-a-heading".to_string(),
            "the-end".to_string(),
        ]);
        let actual = extract_markdown_fragments(MD_INPUT);
        assert_eq!(actual, expected);
    }

    /// Without `include_verbatim`, URLs in code blocks and inline code are
    /// not extracted.
    #[test]
    fn test_skip_verbatim() {
        let expected = vec![
            RawUri {
                text: "https://foo.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 19),
            },
            RawUri {
                text: "http://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(18, 1),
            },
        ];

        let uris = extract_markdown(MD_INPUT, false, false);
        assert_eq!(uris, expected);
    }

    /// With `include_verbatim`, URLs in code blocks and inline code are
    /// extracted as plain text (no element or attribute).
    #[test]
    fn test_include_verbatim() {
        let expected = vec![
            RawUri {
                text: "https://foo.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 19),
            },
            RawUri {
                text: "https://bar.com/123".to_string(),
                element: None,
                attribute: None,
                span: span(11, 1),
            },
            RawUri {
                text: "https://bar.org".to_string(),
                element: None,
                attribute: None,
                span: span(14, 17),
            },
            RawUri {
                text: "http://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(18, 1),
            },
        ];

        let uris = extract_markdown(MD_INPUT, true, false);
        assert_eq!(uris, expected);
    }

    /// URLs inside HTML `<code>` and `<pre>` elements are skipped when
    /// verbatim content is excluded.
    #[test]
    fn test_skip_verbatim_html() {
        let input = "
<code>
http://link.com
</code>
<pre>
Some pre-formatted http://pre.com
</pre>";

        let expected = vec![];

        let uris = extract_markdown(input, false, false);
        assert_eq!(uris, expected);
    }

    /// Slug generation: lowercasing, one hyphen per whitespace character,
    /// punctuation dropped, `_` and `-` kept.
    #[test]
    fn test_kebab_case() {
        let check = |input, expected| {
            let actual = HeadingIdGenerator::into_kebab_case(input);
            assert_eq!(actual, expected);
        };
        check("A Heading", "a-heading");
        check(
            "This header has a :thumbsup: in it",
            "this-header-has-a-thumbsup-in-it",
        );
        check(
            "Header with 한글 characters (using unicode)",
            "header-with-한글-characters-using-unicode",
        );
        check(
            "Underscores foo_bar_, dots . and numbers 1.7e-3",
            "underscores-foo_bar_-dots--and-numbers-17e-3",
        );
        // Whitespace runs are not collapsed: ten spaces in, ten hyphens out.
        check("Many          spaces", "many----------spaces");
    }

    /// Link-like syntax inside a math block is not reported.
    #[test]
    fn test_markdown_math() {
        let input = r"
$$
[\psi](\mathbf{L})
$$
";
        let uris = extract_markdown(input, true, false);
        assert!(uris.is_empty());
    }

    #[test]
    fn test_single_word_footnote_is_not_detected_as_link() {
        let markdown = "This footnote is[^actually] a link.\n\n[^actually]: not";
        let expected = vec![];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    /// An underscore path segment in the middle of a plain URL is kept.
    #[test]
    fn test_underscore_in_urls_middle() {
        let markdown = r"https://example.com/_/foo";
        let expected = vec![RawUri {
            text: "https://example.com/_/foo".to_string(),
            element: None,
            attribute: None,
            span: span(1, 1),
        }];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    /// A trailing underscore at the end of a plain URL is kept.
    #[test]
    fn test_underscore_in_urls_end() {
        let markdown = r"https://example.com/_";
        let expected = vec![RawUri {
            text: "https://example.com/_".to_string(),
            element: None,
            attribute: None,
            span: span(1, 1),
        }];
        let uris = extract_markdown(markdown, true, false);
        assert_eq!(uris, expected);
    }

    /// A wikilink is reported with the `wikilink` attribute and a span that
    /// starts after the opening `[[`.
    #[test]
    fn test_wiki_link() {
        let markdown = r"[[https://example.com/destination]]";
        let expected = vec![RawUri {
            text: "https://example.com/destination".to_string(),
            element: Some("a".to_string()),
            attribute: Some("wikilink".to_string()),
            span: span(1, 3),
        }];
        let uris = extract_markdown(markdown, true, true);
        assert_eq!(uris, expected);
    }

    #[test]
    fn test_multiple_wiki_links() {
        let markdown = r"[[https://example.com/destination]][[https://example.com/source]]";
        let expected = vec![
            RawUri {
                text: "https://example.com/destination".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(1, 3),
            },
            RawUri {
                text: "https://example.com/source".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(1, 38),
            },
        ];
        let uris = extract_markdown(markdown, true, true);
        assert_eq!(uris, expected);
    }

    /// Table-of-contents markers are not treated as links.
    #[test]
    fn test_ignore_gitlab_toc() {
        let markdown = r"[[_TOC_]][TOC]";
        let uris = extract_markdown(markdown, true, true);
        assert!(uris.is_empty());
    }

    /// A URL used as link text is ignored when verbatim content is excluded.
    #[test]
    fn test_link_text_not_checked() {
        let markdown =
            r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
        let uris = extract_markdown(markdown, false, false);

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("a".to_string()),
            attribute: Some("href".to_string()),
            span: span(1, 1),
        }];

        assert_eq!(uris, expected);
        assert_eq!(
            uris.len(),
            1,
            "Should only find destination URL, not link text"
        );
    }

    /// A URL used as link text is extracted when verbatim content is included.
    #[test]
    fn test_link_text_checked_with_include_verbatim() {
        let markdown =
            r"[https://lycheerepublic.gov/notexist (archive.org link)](https://example.com)";
        let uris = extract_markdown(markdown, true, false);

        let expected = vec![
            RawUri {
                text: "https://example.com".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(1, 1),
            },
            RawUri {
                text: "https://lycheerepublic.gov/notexist".to_string(),
                element: None,
                attribute: None,
                span: span(1, 2),
            },
        ];

        assert_eq!(
            uris.len(),
            2,
            "Should find both destination URL and link text"
        );
        // Order is not asserted, only membership.
        for expected_uri in expected {
            assert!(
                uris.contains(&expected_uri),
                "Missing expected URI: {expected_uri:?}"
            );
        }
    }

    /// Inline, reference, collapsed and shortcut links are all extracted.
    #[test]
    fn test_reference_links_extraction() {
        let markdown = r"
Inline link: [link1](target1.md)

Reference link: [link2][ref2]
Collapsed link: [link3][]
Shortcut link: [link4]

[ref2]: target2.md
[link3]: target3.md
[link4]: target4.md
";
        let uris = extract_markdown(markdown, false, false);

        let expected = vec![
            RawUri {
                text: "target1.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(2, 14),
            },
            RawUri {
                text: "target2.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(4, 17),
            },
            RawUri {
                text: "target3.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(5, 17),
            },
            RawUri {
                text: "target4.md".to_string(),
                element: Some("a".to_string()),
                attribute: Some("href".to_string()),
                span: span(6, 16),
            },
        ];

        assert_eq!(uris.len(), 4, "Should extract all four link types");

        // Order is not asserted, only membership.
        for expected_uri in expected {
            assert!(
                uris.contains(&expected_uri),
                "Missing expected URI: {expected_uri:?}. Found: {uris:?}"
            );
        }
    }

    /// Display text after `|` and fragments after `#` are stripped from
    /// wikilink targets.
    #[test]
    fn test_clean_wikilink() {
        let markdown = r"
[[foo|bar]]
[[foo#bar]]
[[foo#bar|baz]]
";
        let uris = extract_markdown(markdown, true, true);
        let expected = vec![
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(2, 3),
            },
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(3, 3),
            },
            RawUri {
                text: "foo".to_string(),
                element: Some("a".to_string()),
                attribute: Some("wikilink".to_string()),
                span: span(4, 3),
            },
        ];
        assert_eq!(uris, expected);
    }

    /// Links in nested, indented HTML are found, with the element name
    /// lowercased.
    #[test]
    fn test_nested_html() {
        // The indentation inside this raw string is significant: ten spaces
        // before `<Bar` put the start of the `href` value at column 22.
        // NOTE(review): assumes the span points at the attribute value, not
        // the attribute name - confirm against `extract_html_with_span`.
        let input = r#"<Foo>
          <Bar href="https://example.com" >
            Some text
          </Bar>
        </Foo>"#;

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("bar".to_string()),
            attribute: Some("href".to_string()),
            span: span(2, 22),
        }];

        let uris = extract_markdown(input, false, false);

        assert_eq!(uris, expected);
    }

    /// Wikilinks whose target is empty once cleaned produce no URIs.
    #[test]
    fn test_wikilink_extraction_returns_none_on_empty_links() {
        let markdown = r"
[[|bar]]
[[#bar]]
[[#bar|baz]]
";

        let uris = extract_markdown(markdown, true, true);
        assert!(uris.is_empty());
    }

    /// A JSX-style tag whose attributes span several lines is still parsed
    /// as one element.
    #[test]
    fn test_mdx_multiline_jsx() {
        // The indentation inside this raw string is significant: four spaces
        // before `href` put the start of its value at column 11.
        // NOTE(review): assumes the span points at the attribute value, not
        // the attribute name - confirm against `extract_html_with_span`.
        let input = r#"<CardGroup cols={1}>
  <Card
    title="Example"
    href="https://example.com"
  >
    Some text
  </Card>
</CardGroup>"#;

        let expected = vec![RawUri {
            text: "https://example.com".to_string(),
            element: Some("card".to_string()),
            attribute: Some("href".to_string()),
            span: span(4, 11),
        }];

        let uris = extract_markdown(input, false, false);

        assert_eq!(uris, expected);
    }

    /// Markdown links surrounded by HTML are still extracted as Markdown
    /// links.
    #[test]
    fn test_markdown_inside_html_block() {
        let input = r"<div>

[markdown link](https://example.com/markdown)

</div>

<span>[another link](https://example.com/another)</span>";

        let uris = extract_markdown(input, false, false);

        let expected_urls = vec![
            "https://example.com/markdown",
            "https://example.com/another",
        ];

        assert_eq!(uris.len(), 2, "Should extract both Markdown links");

        for expected_url in expected_urls {
            assert!(
                uris.iter().any(|u| u.text == expected_url),
                "Should find URL: {expected_url}"
            );
        }

        // Both must have come from the Markdown link handling.
        for uri in &uris {
            assert_eq!(uri.element, Some("a".to_string()));
            assert_eq!(uri.attribute, Some("href".to_string()));
        }
    }

    #[test]
    fn test_remove_wikilink_potholes_and_fragments() {
        let markdown = r"[[foo#bar|baz]]";
        let uris = extract_markdown(markdown, true, true);
        let expected = vec![RawUri {
            text: "foo".to_string(),
            element: Some("a".to_string()),
            attribute: Some("wikilink".to_string()),
            span: span(1, 3),
        }];
        assert_eq!(uris, expected);
    }
}