lychee_lib/utils/
fragment_checker.rs1use log::info;
2use std::{
3 borrow::Cow,
4 collections::{HashMap, HashSet, hash_map::Entry},
5 path::Path,
6 sync::Arc,
7};
8
9use crate::{
10 Result,
11 extract::{html::html5gum::extract_html_fragments, markdown::extract_markdown_fragments},
12 types::{ErrorKind, FileType},
13};
14use percent_encoding::percent_decode_str;
15use tokio::{fs, sync::Mutex};
16use url::Url;
17
18pub(crate) struct FragmentInput<'a> {
20 pub content: Cow<'a, str>,
21 pub file_type: FileType,
22}
23
24impl FragmentInput<'_> {
25 pub(crate) async fn from_path(path: &Path) -> Result<Self> {
26 let content = fs::read_to_string(path)
27 .await
28 .map_err(|err| ErrorKind::ReadFileInput(err, path.to_path_buf()))?;
29 let file_type = FileType::from(path);
30 Ok(Self {
31 content: Cow::Owned(content),
32 file_type,
33 })
34 }
35}
36
/// The set of spellings under which a requested fragment may appear in a
/// document.
struct FragmentBuilder {
    /// Candidate fragments exactly as derived from the URL (no decoding).
    variants: Vec<String>,
    /// Normalised forms of `variants`; only entries that differ from their
    /// source variant are stored here.
    decoded: Vec<String>,
}
42
43impl FragmentBuilder {
44 fn new(fragment: &str, url: &Url, file_type: FileType) -> Result<Self> {
45 let mut variants = vec![fragment.into()];
46 if url
51 .host_str()
52 .is_some_and(|host| host.ends_with("github.com"))
53 {
54 variants.push(format!("user-content-{fragment}"));
55 }
56
57 let mut decoded = Vec::new();
60 for frag in &variants {
61 let mut require_alloc = false;
62 let mut fragment_decoded: Cow<'_, str> = match percent_decode_str(frag).decode_utf8()? {
63 Cow::Borrowed(s) => s.into(),
64 Cow::Owned(s) => {
65 require_alloc = true;
66 s.into()
67 }
68 };
69 if file_type == FileType::Markdown {
70 let lowercase = fragment_decoded.to_lowercase();
71 if lowercase != fragment_decoded {
72 fragment_decoded = lowercase.into();
73 require_alloc = true;
74 }
75 }
76 if require_alloc {
77 decoded.push(fragment_decoded.into());
78 }
79 }
80
81 Ok(Self { variants, decoded })
82 }
83
84 fn any_matches(&self, fragments: &HashSet<String>) -> bool {
85 self.variants
86 .iter()
87 .chain(self.decoded.iter())
88 .any(|frag| fragments.contains(frag))
89 }
90}
91
92#[derive(Default, Clone, Debug)]
104pub(crate) struct FragmentChecker {
105 cache: Arc<Mutex<HashMap<String, HashSet<String>>>>,
106}
107
108impl FragmentChecker {
109 pub(crate) fn new() -> Self {
111 Self {
112 cache: Arc::default(),
113 }
114 }
115
116 pub(crate) async fn check(&self, input: FragmentInput<'_>, url: &Url) -> Result<bool> {
124 let Some(fragment) = url.fragment() else {
125 return Ok(true);
126 };
127 if fragment.is_empty() || fragment.eq_ignore_ascii_case("top") {
128 return Ok(true);
129 }
130
131 let url_without_frag = Self::remove_fragment(url.clone());
132
133 let FragmentInput { content, file_type } = input;
134 let extractor = match file_type {
135 FileType::Markdown => extract_markdown_fragments,
136 FileType::Html => extract_html_fragments,
137 FileType::Css | FileType::Plaintext => {
138 info!("Skipping fragment check for {url} within a {file_type} file");
139 return Ok(true);
140 }
141 };
142
143 let fragment_candidates = FragmentBuilder::new(fragment, url, file_type)?;
144 match self.cache.lock().await.entry(url_without_frag) {
145 Entry::Vacant(entry) => {
146 let file_frags = extractor(&content);
147 let contains_fragment = fragment_candidates.any_matches(&file_frags);
148 entry.insert(file_frags);
149 Ok(contains_fragment)
150 }
151 Entry::Occupied(entry) => {
152 let file_frags = entry.get();
153 Ok(fragment_candidates.any_matches(file_frags))
154 }
155 }
156 }
157
158 fn remove_fragment(mut url: Url) -> String {
159 url.set_fragment(None);
160 url.into()
161 }
162}