Skip to main content

lychee_lib/utils/
fragment_checker.rs

1use log::info;
2use std::{
3    borrow::Cow,
4    collections::{HashMap, HashSet, hash_map::Entry},
5    path::Path,
6    sync::Arc,
7};
8
9use crate::{
10    Result,
11    extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments},
12    types::{ErrorKind, FileType},
13};
14use percent_encoding::percent_decode_str;
15use tokio::{fs, sync::Mutex};
16use url::Url;
17
18/// Holds the content and file type of the fragment input.
19pub(crate) struct FragmentInput<'a> {
20    pub content: Cow<'a, str>,
21    pub file_type: FileType,
22}
23
24impl FragmentInput<'_> {
25    pub(crate) async fn from_path(path: &Path) -> Result<Self> {
26        let content = fs::read_to_string(path)
27            .await
28            .map_err(|err| ErrorKind::ReadFileInput(err, path.to_path_buf()))?;
29        let file_type = FileType::from(path);
30        Ok(Self {
31            content: Cow::Owned(content),
32            file_type,
33        })
34    }
35}
36
/// Expands a single link fragment into every candidate spelling that should
/// be looked up among a document's fragments.
struct FragmentBuilder {
    /// Candidates exactly as derived from the link: the raw fragment and,
    /// where applicable, a prefixed form of it.
    variants: Vec<String>,
    /// Normalized forms of `variants`; only stored when normalization
    /// actually changed the text.
    decoded: Vec<String>,
}
42
43impl FragmentBuilder {
44    fn new(fragment: &str, url: &Url, file_type: FileType) -> Result<Self> {
45        let mut variants = vec![fragment.into()];
46        // For GitHub links, add "user-content-" prefix to the fragments.
47        // The following cases cannot be handled unless we simulate with a headless browser:
48        // - markdown files from any specific path (includes "blob/master/README.md")
49        // - "issuecomment" fragments from the GitHub issue pages
50        if url
51            .host_str()
52            .is_some_and(|host| host.ends_with("github.com"))
53        {
54            variants.push(format!("user-content-{fragment}"));
55        }
56
57        // Only store the percent-decoded variants if it's different from the original
58        // fragment. This avoids storing and comparing the same fragment twice.
59        let mut decoded = Vec::new();
60        for frag in &variants {
61            let mut require_alloc = false;
62            let mut fragment_decoded: Cow<'_, str> = match percent_decode_str(frag).decode_utf8()? {
63                Cow::Borrowed(s) => s.into(),
64                Cow::Owned(s) => {
65                    require_alloc = true;
66                    s.into()
67                }
68            };
69            if file_type == FileType::Markdown {
70                let lowercase = fragment_decoded.to_lowercase();
71                if lowercase != fragment_decoded {
72                    fragment_decoded = lowercase.into();
73                    require_alloc = true;
74                }
75            }
76            if require_alloc {
77                decoded.push(fragment_decoded.into());
78            }
79        }
80
81        Ok(Self { variants, decoded })
82    }
83
84    fn any_matches(&self, fragments: &HashSet<String>) -> bool {
85        self.variants
86            .iter()
87            .chain(self.decoded.iter())
88            .any(|frag| fragments.contains(frag))
89    }
90}
91
92/// Holds a cache of fragments for a given URL.
93///
94/// Fragments, also known as anchors, are used to link to a specific
95/// part of a page. For example, the URL `https://example.com#foo`
96/// will link to the element with the `id` of `foo`.
97///
98/// This cache is used to avoid having to re-parse the same file
99/// multiple times when checking if a given URL contains a fragment.
100///
101/// The cache is stored in a `HashMap` with the URL as the key and
102/// a `HashSet` of fragments as the value.
103#[derive(Default, Clone, Debug)]
104pub(crate) struct FragmentChecker {
105    cache: Arc<Mutex<HashMap<String, HashSet<String>>>>,
106}
107
108impl FragmentChecker {
109    /// Creates a new `FragmentChecker`.
110    pub(crate) fn new() -> Self {
111        Self {
112            cache: Arc::default(),
113        }
114    }
115
116    /// Checks if the given [`FragmentInput`] contains the given fragment.
117    ///
118    /// Returns false, if there is a fragment in the link which is not empty or "top"
119    /// and the path is to a Markdown file, which doesn't contain the given fragment.
120    /// (Empty # and #top fragments are always valid, triggering the browser to scroll to top.)
121    ///
122    /// In all other cases, returns true.
123    pub(crate) async fn check(&self, input: FragmentInput<'_>, url: &Url) -> Result<bool> {
124        let Some(fragment) = url.fragment() else {
125            return Ok(true);
126        };
127        if fragment.is_empty() || fragment.eq_ignore_ascii_case("top") {
128            return Ok(true);
129        }
130
131        let url_without_frag = Self::remove_fragment(url.clone());
132
133        let FragmentInput { content, file_type } = input;
134        let extractor = match file_type {
135            FileType::Markdown => extract_markdown_fragments,
136            FileType::Html => extract_html_fragments,
137            FileType::Css | FileType::Plaintext => {
138                info!("Skipping fragment check for {url} within a {file_type} file");
139                return Ok(true);
140            }
141        };
142
143        let fragment_candidates = FragmentBuilder::new(fragment, url, file_type)?;
144        match self.cache.lock().await.entry(url_without_frag) {
145            Entry::Vacant(entry) => {
146                let file_frags = extractor(&content);
147                let contains_fragment = fragment_candidates.any_matches(&file_frags);
148                entry.insert(file_frags);
149                Ok(contains_fragment)
150            }
151            Entry::Occupied(entry) => {
152                let file_frags = entry.get();
153                Ok(fragment_candidates.any_matches(file_frags))
154            }
155        }
156    }
157
158    fn remove_fragment(mut url: Url) -> String {
159        url.set_fragment(None);
160        url.into()
161    }
162}